1 package net.pterodactylus.arachne.core;
3 import java.io.IOException;
4 import java.io.InputStream;
5 import java.net.MalformedURLException;
7 import java.net.URLConnection;
8 import java.security.MessageDigest;
9 import java.security.NoSuchAlgorithmException;
10 import java.util.logging.Level;
11 import java.util.logging.Logger;
13 import net.pterodactylus.arachne.parser.HtmlEditorKitParser;
14 import net.pterodactylus.arachne.parser.ParserListener;
15 import de.ina.util.io.MessageDigestInputStream;
16 import de.ina.util.validation.Validation;
/**
 * Fetches URLs, parses the received content (if it is HTML) and adds all
 * resulting links to the queue in the core.
 *
 * @author David ‘Bombe’ Roden <bombe@pterodactylus.net>
 */
24 class URLFetcher implements Runnable, ParserListener {
/** The logger, registered under this class's name. */
private static final Logger logger = Logger.getLogger(URLFetcher.class.getName());

/** The core that links found while parsing are added to. */
private final Core core;

/** The URL to fetch. */
private final URL url;

/** The message digest used to hash the fetched content. */
private final MessageDigest messageDigest;
38 /** The hash of the fetched URL. */
/**
 * Creates a new fetcher for the given URL.
 *
 * @param core
 *            The core new pages are queued in
 * @param url
 *            The URL to fetch
 * @throws NoSuchAlgorithmException
 *             if no {@link MessageDigest} instance with an
 *             <code>SHA-256</code> algorithm can be created
 */
public URLFetcher(Core core, URL url) throws NoSuchAlgorithmException {
    // NOTE(review): Validation is a project helper; presumably check()
    // rejects null arguments -- confirm against de.ina.util.validation.
    Validation.begin().isNotNull("core", core).isNotNull("url", url).check();
    // Both fields are final and therefore have to be assigned here.
    this.core = core;
    this.url = url;
    messageDigest = MessageDigest.getInstance("SHA-256");
}
64 * Returns the hash of the content of the fetched URL. The returned value is
65 * only valid after {@link #run()} has been called.
67 * @return The hash of the fetched content
69 public byte[] getHash() {
70 byte[] hashCopy = new byte[hash.length];
71 System.arraycopy(hash, 0, hashCopy, 0, hash.length);
82 * @see java.lang.Runnable#run()
85 logger.log(Level.INFO, "Starting URL Fetcher for “" + url + "”.");
86 InputStream urlInputStream = null;
87 MessageDigestInputStream hashInputStream = null;
89 URLConnection urlConnection = url.openConnection();
90 long contentLength = urlConnection.getContentLength();
91 String contentType = urlConnection.getContentType();
92 logger.log(Level.INFO, "Type is “" + contentType + "”, length is " + contentLength + ".");
93 urlInputStream = urlConnection.getInputStream();
94 hashInputStream = new MessageDigestInputStream(urlInputStream, messageDigest);
95 HtmlEditorKitParser htmlEditorKitParser = new HtmlEditorKitParser();
96 htmlEditorKitParser.parse(this, hashInputStream, "UTF-8");
97 hash = messageDigest.digest();
98 } catch (IOException ioe1) {
99 logger.log(Level.WARNING, "Could not fetch “" + url + "”.", ioe1);
104 // INTERFACE ParserListener
110 public void parsedLink(InputStream inputStream, String linkTarget, String linkTitle, String linkText) {
113 newLink = new URL(url, linkTarget);
114 core.addPage(newLink);
115 } catch (MalformedURLException mue1) {
116 logger.log(Level.WARNING, "Could not create URL from “" + url + "” and “" + linkTarget + "”.", mue1);
117 } catch (IllegalArgumentException iae1) {
118 logger.log(Level.WARNING, "Could not add “" + newLink + "” to core queue.", iae1);
125 public void parsedTitle(InputStream inputStream, String title) {