Remove core dependency.
[arachne.git] / src / net / pterodactylus / arachne / core / URLFetcher.java
1 package net.pterodactylus.arachne.core;
2
3 import java.io.IOException;
4 import java.io.InputStream;
5 import java.net.MalformedURLException;
6 import java.net.URL;
7 import java.net.URLConnection;
8 import java.security.MessageDigest;
9 import java.security.NoSuchAlgorithmException;
10 import java.util.ArrayList;
11 import java.util.List;
12 import java.util.logging.Level;
13 import java.util.logging.Logger;
14
15 import net.pterodactylus.arachne.parser.HtmlEditorKitParser;
16 import net.pterodactylus.arachne.parser.ParserListener;
17 import de.ina.util.io.MessageDigestInputStream;
18 import de.ina.util.validation.Validation;
19
20 /**
21  * Fetches URLs, parses the received content (if it is HTML) and adds all
22  * resulting links to the queue in the core.
23  *
24  * @author David ‘Bombe’ Roden <bombe@pterodactylus.net>
25  */
26 class URLFetcher implements Runnable, ParserListener {
27
28         /** The logger. */
29         private static final Logger logger = Logger.getLogger(URLFetcher.class.getName());
30
31         /** The URL to fetch. */
32         private final URL url;
33
34         /** The message digest. */
35         private final MessageDigest messageDigest;
36
37         /** The hash of the fetched URL. */
38         private byte[] hash;
39
40         /** The collected URLs. */
41         private final List<Page> collectedPages = new ArrayList<Page>();
42
43         /**
44          * Creates a new fetcher for the given URL.
45          *
46          * @param url
47          *            The URL to fetch
48          * @throws NoSuchAlgorithmException
49          *             if no {@link MessageDigest} instance with an
50          *             <code>SHA-256</code> algorithm can be created
51          */
52         public URLFetcher(URL url) throws NoSuchAlgorithmException {
53                 Validation.begin().isNotNull("url", url).check();
54                 this.url = url;
55                 messageDigest = MessageDigest.getInstance("SHA-256");
56         }
57
58         //
59         // ACCESSORS
60         //
61
62         /**
63          * Returns the pages collected while parsing this URL.
64          *
65          * @return The collected pages
66          */
67         public List<Page> getCollectedPages() {
68                 return collectedPages;
69         }
70
71         /**
72          * Returns the hash of the content of the fetched URL. The returned value is
73          * only valid after {@link #run()} has been called.
74          *
75          * @return The hash of the fetched content
76          */
77         public byte[] getHash() {
78                 byte[] hashCopy = new byte[hash.length];
79                 System.arraycopy(hash, 0, hashCopy, 0, hash.length);
80                 return hashCopy;
81         }
82
83         //
84         // INTERFACE Runnable
85         //
86
87         /**
88          * {@inheritdoc}
89          *
90          * @see java.lang.Runnable#run()
91          */
92         public void run() {
93                 logger.log(Level.INFO, "Starting URL Fetcher for “" + url + "”.");
94                 InputStream urlInputStream = null;
95                 MessageDigestInputStream hashInputStream = null;
96                 try {
97                         URLConnection urlConnection = url.openConnection();
98                         long contentLength = urlConnection.getContentLength();
99                         String contentType = urlConnection.getContentType();
100                         logger.log(Level.INFO, "Type is “" + contentType + "”, length is " + contentLength + ".");
101                         urlInputStream = urlConnection.getInputStream();
102                         hashInputStream = new MessageDigestInputStream(urlInputStream, messageDigest);
103                         HtmlEditorKitParser htmlEditorKitParser = new HtmlEditorKitParser();
104                         htmlEditorKitParser.parse(this, hashInputStream, "UTF-8");
105                         hash = messageDigest.digest();
106                 } catch (IOException ioe1) {
107                         logger.log(Level.WARNING, "Could not fetch “" + url + "”.", ioe1);
108                 }
109         }
110
111         //
112         // INTERFACE ParserListener
113         //
114
115         /**
116          * {@inheritDoc}
117          */
118         public void parsedLink(InputStream inputStream, String linkTarget, String linkTitle, String linkText) {
119                 URL newLink = null;
120                 try {
121                         newLink = new URL(url, linkTarget);
122                         Page newPage = Page.fromURL(newLink);
123                         if (newPage != null) {
124                                 collectedPages.add(newPage);
125                         }
126                 } catch (MalformedURLException mue1) {
127                         logger.log(Level.WARNING, "Could not create URL from “" + url + "” and “" + linkTarget + "”.", mue1);
128                 }
129         }
130
131         /**
132          * {@inheritDoc}
133          */
134         public void parsedTitle(InputStream inputStream, String title) {
135         }
136
137 }