1 package net.pterodactylus.arachne.core;
3 import java.io.IOException;
4 import java.io.InputStream;
5 import java.net.MalformedURLException;
7 import java.net.URLConnection;
8 import java.security.MessageDigest;
9 import java.security.NoSuchAlgorithmException;
10 import java.util.ArrayList;
11 import java.util.List;
12 import java.util.logging.Level;
13 import java.util.logging.Logger;
15 import net.pterodactylus.arachne.parser.Parser;
16 import net.pterodactylus.arachne.parser.ParserFactory;
17 import net.pterodactylus.arachne.parser.ParserListener;
18 import de.ina.util.io.MessageDigestInputStream;
19 import de.ina.util.number.Hex;
20 import de.ina.util.validation.Validation;
23 * Fetches URLs, parses the received content (if it is HTML) and adds all
24 * resulting links to the queue in the core.
26 * @author David ‘Bombe’ Roden &lt;bombe@pterodactylus.net&gt;
28 class URLFetcher implements Runnable, ParserListener {
// Class-wide logger, named after this class.
31 private static final Logger logger = Logger.getLogger(URLFetcher.class.getName());
33 /** The parser factory that is asked for a parser matching the fetched content type. */
34 private final ParserFactory parserFactory;
36 /** The URL to fetch; also the base against which parsed link targets are resolved. */
37 private final URL url;
39 /** The SHA-256 message digest that is handed to the hashing input stream. */
40 private final MessageDigest messageDigest;
42 /** The hash of the fetched URL. */
45 /** The pages created from the links reported by the parser. */
46 private final List<Page> collectedPages = new ArrayList<Page>();
48 /** The title of the URL. */
52 * Creates a new fetcher for the given URL.
54 * @param parserFactory
55 * The parser factory that is used to create content-type
59 * @throws NoSuchAlgorithmException
60 * if no {@link MessageDigest} instance with an
61 * <code>SHA-256</code> algorithm can be created
63 public URLFetcher(ParserFactory parserFactory, URL url) throws NoSuchAlgorithmException {
// Null-check both arguments through the project's Validation helper.
// NOTE(review): presumably check() throws when a check failed -- confirm
// against de.ina.util.validation.Validation.
64 Validation.begin().isNotNull("parserFactory", parserFactory).isNotNull("url", url).check();
65 this.parserFactory = parserFactory;
// The digest is obtained here, so an unavailable SHA-256 implementation
// surfaces at construction time as the declared NoSuchAlgorithmException.
67 messageDigest = MessageDigest.getInstance("SHA-256");
75 * Returns the title of the fetched URL.
77 * @return The fetched URL’s title
79 public String getTitle() {
84 * Returns the pages collected while parsing this URL.
86 * @return The collected pages
88 public List<Page> getCollectedPages() {
// Returns the internal list itself, not a copy: callers observe pages that
// parsedLink() adds later, and any change they make affects this fetcher.
89 return collectedPages;
93 * Returns the hash of the content of the fetched URL. The returned value is
94 * only valid after {@link #run()} has been called.
96 * @return The hash of the fetched content
98 public byte[] getHash() {
// Work on a copy so that callers cannot modify the stored hash.
// NOTE(review): hash is assigned in run() from messageDigest.digest(); if it
// is still null when this method is called, hash.length throws a
// NullPointerException -- confirm callers only ask after a completed run().
99 byte[] hashCopy = new byte[hash.length];
100 System.arraycopy(hash, 0, hashCopy, 0, hash.length);
105 // INTERFACE Runnable
111 * @see java.lang.Runnable#run()
114 logger.log(Level.INFO, "Starting URL Fetcher for “" + url + "”.");
// Both streams start out as null and are assigned once the connection is open.
115 InputStream urlInputStream = null;
116 MessageDigestInputStream hashInputStream = null;
118 URLConnection urlConnection = url.openConnection();
// getContentLength() returns an int (-1 when the length is unknown); the value
// is widened to long and, in the code visible here, only used for logging.
119 long contentLength = urlConnection.getContentLength();
// getContentType() returns null when the content type is not known.
120 String contentType = urlConnection.getContentType();
121 logger.log(Level.INFO, "Type is “" + contentType + "”, length is " + contentLength + ".");
122 urlInputStream = urlConnection.getInputStream();
// NOTE(review): MessageDigestInputStream is a project class; presumably it
// feeds every byte read through it into messageDigest -- confirm.
123 hashInputStream = new MessageDigestInputStream(urlInputStream, messageDigest);
// A null parser means the factory has nothing for this content type.
124 Parser parser = parserFactory.getParser(contentType);
125 if (parser == null) {
126 logger.log(Level.INFO, "No parser found for “" + contentType + "”.");
// This fetcher is passed as the ParserListener, so parsedLink() and
// parsedTitle() are the callbacks the parser has available. The third
// argument is always the literal "UTF-8"; contentType is not consulted for it.
129 parser.parse(this, hashInputStream, "UTF-8");
// digest() completes the hash over the data fed to messageDigest so far and
// resets the digest afterwards.
130 hash = messageDigest.digest();
131 logger.log(Level.FINE, "Hashed “" + url + "” to “" + Hex.toHex(hash) + "”.");
132 } catch (IOException ioe1) {
// The failure is logged together with the exception.
133 logger.log(Level.WARNING, "Could not fetch “" + url + "”.", ioe1);
138 // INTERFACE ParserListener
144 public void parsedLink(InputStream inputStream, String linkTarget, String linkTitle, String linkText) {
// Of the parameters, only linkTarget is used in the code visible here.
// The two-argument URL constructor resolves linkTarget against the fetched
// URL, so a relative target becomes an absolute URL.
147 newLink = new URL(url, linkTarget);
// Page.fromURL() may return null; such links are not collected.
149 Page newPage = Page.fromURL(newLink);
150 if (newPage != null) {
151 collectedPages.add(newPage);
153 } catch (IllegalArgumentException iae1) {
156 } catch (MalformedURLException mue1) {
// A target that cannot be turned into a URL is logged with the exception.
157 logger.log(Level.WARNING, "Could not create URL from “" + url + "” and “" + linkTarget + "”.", mue1);
164 public void parsedTitle(InputStream inputStream, String title) {