ae7d072bf7f1a2df3cc68ac2c9947b07f0cf05a2
[rhynodge.git] / src / main / java / net / pterodactylus / rhynodge / filters / webpages / savoy / MovieExtractor.java
1 package net.pterodactylus.rhynodge.filters.webpages.savoy;
2
3 import static java.time.format.DateTimeFormatter.ofPattern;
4 import static java.util.Optional.empty;
5 import static java.util.Optional.of;
6
7 import java.time.LocalDate;
8 import java.time.LocalDateTime;
9 import java.time.LocalTime;
10 import java.time.format.DateTimeFormatter;
11 import java.util.HashSet;
12 import java.util.Optional;
13 import java.util.Set;
14 import java.util.regex.Matcher;
15 import java.util.regex.Pattern;
16
17 import org.jsoup.nodes.Document;
18 import org.jsoup.nodes.Element;
19
20 /**
21  * Extracts {@link Movie} information from an HTML document.
22  *
23  * @author <a href="mailto:bombe@pterodactylus.net">David ‘Bombe’ Roden</a>
24  */
25 public class MovieExtractor {
26
27         private static final Pattern datePattern = Pattern.compile(".*([0-9]{2}\\.[0-9]{2}\\.[0-9]{2}).*");
28         private static final Pattern timePattern = Pattern.compile(".*([0-9]{2}:[0-9]{2}).*");
29         private static final DateTimeFormatter dateFormatter = ofPattern("dd.MM.uu");
30         private static final DateTimeFormatter timeFormatter = ofPattern("HH:mm");
31
32         public Set<Movie> getMovies(Document document) {
33                 Set<Movie> movies = new HashSet<>();
34                 for (Element movieElement : document.select(".tx-spmovies-pi1-listrow")) {
35                         String name = movieElement.select(".tx-spmovies-pi1-header h1").text();
36                         Movie movie = new Movie(name);
37                         for (TicketLink ticketLink : extractTicketLinks(movieElement)) {
38                                 movie.addTicketLink(ticketLink);
39                         }
40                         movies.add(movie);
41                 }
42                 return movies;
43         }
44
45         private Iterable<? extends TicketLink> extractTicketLinks(Element movieElement) {
46                 Set<TicketLink> ticketLinks = new HashSet<>();
47                 int dateCellIndex = 1;
48                 for (Element dateCell : movieElement.select(".tx-spmovies-pi1-date-column")) {
49                         Optional<String> dateString = extractDateString(dateCell);
50                         if (!dateString.isPresent()) {
51                                 continue;
52                         }
53                         for (Element timeCell : getTimeCells(movieElement, dateCellIndex++)) {
54                                 Optional<String> timeString = extractTimeString(timeCell.select("a").text());
55                                 if (!timeString.isPresent()) {
56                                         continue;
57                                 }
58                                 LocalDateTime localDateTime = getPresentationTime(dateString, timeString);
59                                 String link = timeCell.select("a").attr("href");
60                                 TicketLink ticketLink = new TicketLink(localDateTime, link);
61                                 ticketLinks.add(ticketLink);
62                         }
63                 }
64                 return ticketLinks;
65         }
66
67         private LocalDateTime getPresentationTime(Optional<String> dateString, Optional<String> timeString) {
68                 LocalDate date = LocalDate.parse(dateString.get(), dateFormatter);
69                 LocalTime localTime = LocalTime.parse(timeString.get(), timeFormatter);
70                 return date.atTime(localTime);
71         }
72
73         private Optional<String> extractTimeString(String cellContent) {
74                 Matcher timeMatcher = timePattern.matcher(cellContent);
75                 if (!timeMatcher.matches() || timeMatcher.groupCount() < 1) {
76                         return empty();
77                 }
78                 return of(timeMatcher.group(1));
79         }
80
81         private Iterable<? extends Element> getTimeCells(Element movieElement, int dateCellIndex) {
82                 return movieElement.select(".tx-spmovies-pi1-shows:eq(" + dateCellIndex + ") div.time");
83         }
84
85         private Optional<String> extractDateString(Element dateCell) {
86                 Matcher matcher = datePattern.matcher(dateCell.text());
87                 if (!matcher.matches() || matcher.groupCount() < 1) {
88                         return empty();
89                 }
90                 return of(matcher.group(1));
91         }
92
93 }