d2ce831681cf60089865490ebf309b4614fafc35
[arachne.git] / src / net / pterodactylus / arachne / core / URLFetcher.java
1 package net.pterodactylus.arachne.core;
2
3 import java.io.IOException;
4 import java.io.InputStream;
5 import java.net.MalformedURLException;
6 import java.net.URL;
7 import java.net.URLConnection;
8 import java.security.MessageDigest;
9 import java.security.NoSuchAlgorithmException;
10 import java.util.ArrayList;
11 import java.util.List;
12 import java.util.logging.Level;
13 import java.util.logging.Logger;
14
15 import net.pterodactylus.arachne.parser.HtmlEditorKitParser;
16 import net.pterodactylus.arachne.parser.ParserListener;
17 import de.ina.util.io.MessageDigestInputStream;
18 import de.ina.util.validation.Validation;
19
20 /**
21  * Fetches URLs, parses the received content (if it is HTML) and adds all
22  * resulting links to the queue in the core.
23  *
24  * @author David ‘Bombe’ Roden <bombe@pterodactylus.net>
25  */
26 class URLFetcher implements Runnable, ParserListener {
27
28         /** The logger. */
29         private static final Logger logger = Logger.getLogger(URLFetcher.class.getName());
30
31         /** The URL to fetch. */
32         private final URL url;
33
34         /** The message digest. */
35         private final MessageDigest messageDigest;
36
37         /** The hash of the fetched URL. */
38         private byte[] hash;
39
40         /** The collected URLs. */
41         private final List<Page> collectedPages = new ArrayList<Page>();
42
43         /** The title of the URL. */
44         private String title;
45
46         /**
47          * Creates a new fetcher for the given URL.
48          *
49          * @param url
50          *            The URL to fetch
51          * @throws NoSuchAlgorithmException
52          *             if no {@link MessageDigest} instance with an
53          *             <code>SHA-256</code> algorithm can be created
54          */
55         public URLFetcher(URL url) throws NoSuchAlgorithmException {
56                 Validation.begin().isNotNull("url", url).check();
57                 this.url = url;
58                 messageDigest = MessageDigest.getInstance("SHA-256");
59         }
60
61         //
62         // ACCESSORS
63         //
64
65         /**
66          * Returns the title of the fetched URL.
67          *
68          * @return The fetched URL’s title
69          */
70         public String getTitle() {
71                 return title;
72         }
73
74         /**
75          * Returns the pages collected while parsing this URL.
76          *
77          * @return The collected pages
78          */
79         public List<Page> getCollectedPages() {
80                 return collectedPages;
81         }
82
83         /**
84          * Returns the hash of the content of the fetched URL. The returned value is
85          * only valid after {@link #run()} has been called.
86          *
87          * @return The hash of the fetched content
88          */
89         public byte[] getHash() {
90                 byte[] hashCopy = new byte[hash.length];
91                 System.arraycopy(hash, 0, hashCopy, 0, hash.length);
92                 return hashCopy;
93         }
94
95         //
96         // INTERFACE Runnable
97         //
98
99         /**
100          * {@inheritdoc}
101          *
102          * @see java.lang.Runnable#run()
103          */
104         public void run() {
105                 logger.log(Level.INFO, "Starting URL Fetcher for “" + url + "”.");
106                 InputStream urlInputStream = null;
107                 MessageDigestInputStream hashInputStream = null;
108                 try {
109                         URLConnection urlConnection = url.openConnection();
110                         long contentLength = urlConnection.getContentLength();
111                         String contentType = urlConnection.getContentType();
112                         logger.log(Level.INFO, "Type is “" + contentType + "”, length is " + contentLength + ".");
113                         urlInputStream = urlConnection.getInputStream();
114                         hashInputStream = new MessageDigestInputStream(urlInputStream, messageDigest);
115                         HtmlEditorKitParser htmlEditorKitParser = new HtmlEditorKitParser();
116                         htmlEditorKitParser.parse(this, hashInputStream, "UTF-8");
117                         hash = messageDigest.digest();
118                 } catch (IOException ioe1) {
119                         logger.log(Level.WARNING, "Could not fetch “" + url + "”.", ioe1);
120                 }
121         }
122
123         //
124         // INTERFACE ParserListener
125         //
126
127         /**
128          * {@inheritDoc}
129          */
130         public void parsedLink(InputStream inputStream, String linkTarget, String linkTitle, String linkText) {
131                 URL newLink = null;
132                 try {
133                         newLink = new URL(url, linkTarget);
134                         Page newPage = Page.fromURL(newLink);
135                         if (newPage != null) {
136                                 collectedPages.add(newPage);
137                         }
138                 } catch (MalformedURLException mue1) {
139                         logger.log(Level.WARNING, "Could not create URL from “" + url + "” and “" + linkTarget + "”.", mue1);
140                 }
141         }
142
143         /**
144          * {@inheritDoc}
145          */
146         public void parsedTitle(InputStream inputStream, String title) {
147                 this.title = title;
148         }
149
150 }