Create message digest in core and hash fetched URLs.
[arachne.git] / src / net / pterodactylus / arachne / core / URLFetcher.java
1 package net.pterodactylus.arachne.core;
2
3 import java.io.IOException;
4 import java.io.InputStream;
5 import java.net.MalformedURLException;
6 import java.net.URL;
7 import java.net.URLConnection;
8 import java.security.MessageDigest;
9 import java.security.NoSuchAlgorithmException;
10 import java.util.logging.Level;
11 import java.util.logging.Logger;
12
13 import net.pterodactylus.arachne.parser.HtmlEditorKitParser;
14 import net.pterodactylus.arachne.parser.ParserListener;
15 import de.ina.util.io.MessageDigestInputStream;
16 import de.ina.util.validation.Validation;
17
18 /**
19  * Fetches URLs, parses the received content (if it is HTML) and adds all
20  * resulting links to the queue in the core.
21  *
22  * @author David ‘Bombe’ Roden <bombe@pterodactylus.net>
23  */
24 class URLFetcher implements Runnable, ParserListener {
25
26         /** The logger. */
27         private static final Logger logger = Logger.getLogger(URLFetcher.class.getName());
28
29         /** The core. */
30         private final Core core;
31
32         /** The URL to fetch. */
33         private final URL url;
34
35         /** The message digest. */
36         private final MessageDigest messageDigest;
37
38         /** The hash of the fetched URL. */
39         private byte[] hash;
40
41         /**
42          * Creates a new fetcher for the given URL.
43          *
44          * @param core
45          *            The core new pages are queued in
46          * @param url
47          *            The URL to fetch
48          * @throws NoSuchAlgorithmException
49          *             if no {@link MessageDigest} instance with an
50          *             <code>SHA-256</code> algorithm can be created
51          */
52         public URLFetcher(Core core, URL url) throws NoSuchAlgorithmException {
53                 Validation.begin().isNotNull("core", core).isNotNull("url", url).check();
54                 this.core = core;
55                 this.url = url;
56                 messageDigest = MessageDigest.getInstance("SHA-256");
57         }
58
59         //
60         // ACCESSORS
61         //
62
63         /**
64          * Returns the hash of the content of the fetched URL. The returned value is
65          * only valid after {@link #run()} has been called.
66          *
67          * @return The hash of the fetched content
68          */
69         public byte[] getHash() {
70                 byte[] hashCopy = new byte[hash.length];
71                 System.arraycopy(hash, 0, hashCopy, 0, hash.length);
72                 return hashCopy;
73         }
74
75         //
76         // INTERFACE Runnable
77         //
78
79         /**
80          * {@inheritdoc}
81          *
82          * @see java.lang.Runnable#run()
83          */
84         public void run() {
85                 logger.log(Level.INFO, "Starting URL Fetcher for “" + url + "”.");
86                 InputStream urlInputStream = null;
87                 MessageDigestInputStream hashInputStream = null;
88                 try {
89                         URLConnection urlConnection = url.openConnection();
90                         long contentLength = urlConnection.getContentLength();
91                         String contentType = urlConnection.getContentType();
92                         logger.log(Level.INFO, "Type is “" + contentType + "”, length is " + contentLength + ".");
93                         urlInputStream = urlConnection.getInputStream();
94                         hashInputStream = new MessageDigestInputStream(urlInputStream, messageDigest);
95                         HtmlEditorKitParser htmlEditorKitParser = new HtmlEditorKitParser();
96                         htmlEditorKitParser.parse(this, hashInputStream, "UTF-8");
97                         hash = messageDigest.digest();
98                 } catch (IOException ioe1) {
99                         logger.log(Level.WARNING, "Could not fetch “" + url + "”.", ioe1);
100                 }
101         }
102
103         //
104         // INTERFACE ParserListener
105         //
106
107         /**
108          * {@inheritDoc}
109          */
110         public void parsedLink(InputStream inputStream, String linkTarget, String linkTitle, String linkText) {
111                 URL newLink = null;
112                 try {
113                         newLink = new URL(url, linkTarget);
114                         core.addPage(newLink);
115                 } catch (MalformedURLException mue1) {
116                         logger.log(Level.WARNING, "Could not create URL from “" + url + "” and “" + linkTarget + "”.", mue1);
117                 } catch (IllegalArgumentException iae1) {
118                         logger.log(Level.WARNING, "Could not add “" + newLink + "” to core queue.", iae1);
119                 }
120         }
121
122         /**
123          * {@inheritDoc}
124          */
125         public void parsedTitle(InputStream inputStream, String title) {
126         }
127
128 }