Use a default parser factory in the core and hand it in to every URL fetcher.
[arachne.git] / src / net / pterodactylus / arachne / core / URLFetcher.java
1 package net.pterodactylus.arachne.core;
2
3 import java.io.IOException;
4 import java.io.InputStream;
5 import java.net.MalformedURLException;
6 import java.net.URL;
7 import java.net.URLConnection;
8 import java.security.MessageDigest;
9 import java.security.NoSuchAlgorithmException;
10 import java.util.ArrayList;
11 import java.util.List;
12 import java.util.logging.Level;
13 import java.util.logging.Logger;
14
15 import net.pterodactylus.arachne.parser.Parser;
16 import net.pterodactylus.arachne.parser.ParserFactory;
17 import net.pterodactylus.arachne.parser.ParserListener;
18 import de.ina.util.io.MessageDigestInputStream;
19 import de.ina.util.validation.Validation;
20
21 /**
22  * Fetches URLs, parses the received content (if it is HTML) and adds all
23  * resulting links to the queue in the core.
24  *
25  * @author David ‘Bombe’ Roden <bombe@pterodactylus.net>
26  */
27 class URLFetcher implements Runnable, ParserListener {
28
29         /** The logger. */
30         private static final Logger logger = Logger.getLogger(URLFetcher.class.getName());
31
32         /** The parser factory. */
33         private final ParserFactory parserFactory;
34
35         /** The URL to fetch. */
36         private final URL url;
37
38         /** The message digest. */
39         private final MessageDigest messageDigest;
40
41         /** The hash of the fetched URL. */
42         private byte[] hash;
43
44         /** The collected URLs. */
45         private final List<Page> collectedPages = new ArrayList<Page>();
46
47         /** The title of the URL. */
48         private String title;
49
50         /**
51          * Creates a new fetcher for the given URL.
52          *
53          * @param parserFactory
54          *            The parser factory that is used to create content-type
55          *            specific parsers
56          * @param url
57          *            The URL to fetch
58          * @throws NoSuchAlgorithmException
59          *             if no {@link MessageDigest} instance with an
60          *             <code>SHA-256</code> algorithm can be created
61          */
62         public URLFetcher(ParserFactory parserFactory, URL url) throws NoSuchAlgorithmException {
63                 Validation.begin().isNotNull("parserFactory", parserFactory).isNotNull("url", url).check();
64                 this.parserFactory = parserFactory;
65                 this.url = url;
66                 messageDigest = MessageDigest.getInstance("SHA-256");
67         }
68
69         //
70         // ACCESSORS
71         //
72
73         /**
74          * Returns the title of the fetched URL.
75          *
76          * @return The fetched URL’s title
77          */
78         public String getTitle() {
79                 return title;
80         }
81
82         /**
83          * Returns the pages collected while parsing this URL.
84          *
85          * @return The collected pages
86          */
87         public List<Page> getCollectedPages() {
88                 return collectedPages;
89         }
90
91         /**
92          * Returns the hash of the content of the fetched URL. The returned value is
93          * only valid after {@link #run()} has been called.
94          *
95          * @return The hash of the fetched content
96          */
97         public byte[] getHash() {
98                 byte[] hashCopy = new byte[hash.length];
99                 System.arraycopy(hash, 0, hashCopy, 0, hash.length);
100                 return hashCopy;
101         }
102
103         //
104         // INTERFACE Runnable
105         //
106
107         /**
108          * {@inheritdoc}
109          *
110          * @see java.lang.Runnable#run()
111          */
112         public void run() {
113                 logger.log(Level.INFO, "Starting URL Fetcher for “" + url + "”.");
114                 InputStream urlInputStream = null;
115                 MessageDigestInputStream hashInputStream = null;
116                 try {
117                         URLConnection urlConnection = url.openConnection();
118                         long contentLength = urlConnection.getContentLength();
119                         String contentType = urlConnection.getContentType();
120                         logger.log(Level.INFO, "Type is “" + contentType + "”, length is " + contentLength + ".");
121                         urlInputStream = urlConnection.getInputStream();
122                         hashInputStream = new MessageDigestInputStream(urlInputStream, messageDigest);
123                         Parser parser = parserFactory.getParser(contentType);
124                         parser.parse(this, hashInputStream, "UTF-8");
125                         hash = messageDigest.digest();
126                 } catch (IOException ioe1) {
127                         logger.log(Level.WARNING, "Could not fetch “" + url + "”.", ioe1);
128                 }
129         }
130
131         //
132         // INTERFACE ParserListener
133         //
134
135         /**
136          * {@inheritDoc}
137          */
138         public void parsedLink(InputStream inputStream, String linkTarget, String linkTitle, String linkText) {
139                 URL newLink = null;
140                 try {
141                         newLink = new URL(url, linkTarget);
142                         Page newPage = Page.fromURL(newLink);
143                         if (newPage != null) {
144                                 collectedPages.add(newPage);
145                         }
146                 } catch (MalformedURLException mue1) {
147                         logger.log(Level.WARNING, "Could not create URL from “" + url + "” and “" + linkTarget + "”.", mue1);
148                 }
149         }
150
151         /**
152          * {@inheritDoc}
153          */
154         public void parsedTitle(InputStream inputStream, String title) {
155                 this.title = title;
156         }
157
158 }