Log hash after fetching.
[arachne.git] / src / net / pterodactylus / arachne / core / URLFetcher.java
1 package net.pterodactylus.arachne.core;
2
3 import java.io.IOException;
4 import java.io.InputStream;
5 import java.net.MalformedURLException;
6 import java.net.URL;
7 import java.net.URLConnection;
8 import java.security.MessageDigest;
9 import java.security.NoSuchAlgorithmException;
10 import java.util.ArrayList;
11 import java.util.List;
12 import java.util.logging.Level;
13 import java.util.logging.Logger;
14
15 import net.pterodactylus.arachne.parser.Parser;
16 import net.pterodactylus.arachne.parser.ParserFactory;
17 import net.pterodactylus.arachne.parser.ParserListener;
18 import de.ina.util.io.MessageDigestInputStream;
19 import de.ina.util.number.Hex;
20 import de.ina.util.validation.Validation;
21
22 /**
23  * Fetches URLs, parses the received content (if it is HTML) and adds all
24  * resulting links to the queue in the core.
25  *
26  * @author David ‘Bombe’ Roden <bombe@pterodactylus.net>
27  */
28 class URLFetcher implements Runnable, ParserListener {
29
30         /** The logger. */
31         private static final Logger logger = Logger.getLogger(URLFetcher.class.getName());
32
33         /** The parser factory. */
34         private final ParserFactory parserFactory;
35
36         /** The URL to fetch. */
37         private final URL url;
38
39         /** The message digest. */
40         private final MessageDigest messageDigest;
41
42         /** The hash of the fetched URL. */
43         private byte[] hash;
44
45         /** The collected URLs. */
46         private final List<Page> collectedPages = new ArrayList<Page>();
47
48         /** The title of the URL. */
49         private String title;
50
51         /**
52          * Creates a new fetcher for the given URL.
53          *
54          * @param parserFactory
55          *            The parser factory that is used to create content-type
56          *            specific parsers
57          * @param url
58          *            The URL to fetch
59          * @throws NoSuchAlgorithmException
60          *             if no {@link MessageDigest} instance with an
61          *             <code>SHA-256</code> algorithm can be created
62          */
63         public URLFetcher(ParserFactory parserFactory, URL url) throws NoSuchAlgorithmException {
64                 Validation.begin().isNotNull("parserFactory", parserFactory).isNotNull("url", url).check();
65                 this.parserFactory = parserFactory;
66                 this.url = url;
67                 messageDigest = MessageDigest.getInstance("SHA-256");
68         }
69
70         //
71         // ACCESSORS
72         //
73
74         /**
75          * Returns the title of the fetched URL.
76          *
77          * @return The fetched URL’s title
78          */
79         public String getTitle() {
80                 return title;
81         }
82
83         /**
84          * Returns the pages collected while parsing this URL.
85          *
86          * @return The collected pages
87          */
88         public List<Page> getCollectedPages() {
89                 return collectedPages;
90         }
91
92         /**
93          * Returns the hash of the content of the fetched URL. The returned value is
94          * only valid after {@link #run()} has been called.
95          *
96          * @return The hash of the fetched content
97          */
98         public byte[] getHash() {
99                 byte[] hashCopy = new byte[hash.length];
100                 System.arraycopy(hash, 0, hashCopy, 0, hash.length);
101                 return hashCopy;
102         }
103
104         //
105         // INTERFACE Runnable
106         //
107
108         /**
109          * {@inheritDoc}
110          *
111          * @see java.lang.Runnable#run()
112          */
113         public void run() {
114                 logger.log(Level.INFO, "Starting URL Fetcher for “" + url + "”.");
115                 InputStream urlInputStream = null;
116                 MessageDigestInputStream hashInputStream = null;
117                 try {
118                         URLConnection urlConnection = url.openConnection();
119                         long contentLength = urlConnection.getContentLength();
120                         String contentType = urlConnection.getContentType();
121                         logger.log(Level.INFO, "Type is “" + contentType + "”, length is " + contentLength + ".");
122                         urlInputStream = urlConnection.getInputStream();
123                         hashInputStream = new MessageDigestInputStream(urlInputStream, messageDigest);
124                         Parser parser = parserFactory.getParser(contentType);
125                         if (parser == null) {
126                                 logger.log(Level.INFO, "No parser found for “" + contentType + "”.");
127                                 return;
128                         }
129                         parser.parse(this, hashInputStream, "UTF-8");
130                         hash = messageDigest.digest();
131                         logger.log(Level.FINE, "Hashed “" + url + "” to “" + Hex.toHex(hash) + "”.");
132                 } catch (IOException ioe1) {
133                         logger.log(Level.WARNING, "Could not fetch “" + url + "”.", ioe1);
134                 }
135         }
136
137         //
138         // INTERFACE ParserListener
139         //
140
141         /**
142          * {@inheritDoc}
143          */
144         public void parsedLink(InputStream inputStream, String linkTarget, String linkTitle, String linkText) {
145                 URL newLink = null;
146                 try {
147                         newLink = new URL(url, linkTarget);
148                         try {
149                                 Page newPage = Page.fromURL(newLink);
150                                 if (newPage != null) {
151                                         collectedPages.add(newPage);
152                                 }
153                         } catch (IllegalArgumentException iae1) {
154                                 /* ignore. */
155                         }
156                 } catch (MalformedURLException mue1) {
157                         logger.log(Level.WARNING, "Could not create URL from “" + url + "” and “" + linkTarget + "”.", mue1);
158                 }
159         }
160
161         /**
162          * {@inheritDoc}
163          */
164         public void parsedTitle(InputStream inputStream, String title) {
165                 this.title = title;
166         }
167
168 }