1 package net.pterodactylus.arachne.core;
3 import java.io.IOException;
4 import java.io.InputStream;
5 import java.net.MalformedURLException;
7 import java.net.URLConnection;
8 import java.security.MessageDigest;
9 import java.security.NoSuchAlgorithmException;
10 import java.util.ArrayList;
11 import java.util.List;
12 import java.util.logging.Level;
13 import java.util.logging.Logger;
15 import net.pterodactylus.arachne.parser.HtmlEditorKitParser;
16 import net.pterodactylus.arachne.parser.ParserListener;
17 import de.ina.util.io.MessageDigestInputStream;
18 import de.ina.util.validation.Validation;
21 * Fetches URLs, parses the received content (if it is HTML) and adds all
22 * resulting links to the queue in the core.
24 * @author David ‘Bombe’ Roden <bombe@pterodactylus.net>
class URLFetcher implements Runnable, ParserListener {
	private static final Logger logger = Logger.getLogger(URLFetcher.class.getName());
	/** The URL to fetch. */
	private final URL url;
	/** The message digest used to hash the fetched content (SHA-256, see constructor). */
	private final MessageDigest messageDigest;
	/** The hash of the fetched content (valid only after {@link #run()} has completed). */
	/** The pages collected from links found while parsing the fetched content. */
	private final List<Page> collectedPages = new ArrayList<Page>();
	 * Creates a new fetcher for the given URL.
	 * @throws NoSuchAlgorithmException
	 *             if no {@link MessageDigest} instance with an
	 *             <code>SHA-256</code> algorithm can be created
	public URLFetcher(URL url) throws NoSuchAlgorithmException {
		Validation.begin().isNotNull("url", url).check(); // fail fast on a null URL
		messageDigest = MessageDigest.getInstance("SHA-256"); // used in run() to hash the fetched content
	 * Returns the pages collected while parsing this URL.
	 * @return The collected pages
	public List<Page> getCollectedPages() {
		return collectedPages; // NOTE(review): exposes the internal mutable list — callers can modify it; consider returning an unmodifiable view
	 * Returns the hash of the content of the fetched URL. The returned value is
	 * only valid after {@link #run()} has been called.
	 * @return The hash of the fetched content
	public byte[] getHash() {
		// NOTE(review): hash is presumably null until run() completes — calling this earlier would NPE; verify callers.
		byte[] hashCopy = new byte[hash.length];
		System.arraycopy(hash, 0, hashCopy, 0, hash.length); // defensive copy so callers cannot alter the stored hash
	 * @see java.lang.Runnable#run()
		logger.log(Level.INFO, "Starting URL Fetcher for “" + url + "”.");
		InputStream urlInputStream = null;
		MessageDigestInputStream hashInputStream = null;
			URLConnection urlConnection = url.openConnection();
			long contentLength = urlConnection.getContentLength(); // -1 when the server does not report a length
			String contentType = urlConnection.getContentType();
			logger.log(Level.INFO, "Type is “" + contentType + "”, length is " + contentLength + ".");
			urlInputStream = urlConnection.getInputStream();
			hashInputStream = new MessageDigestInputStream(urlInputStream, messageDigest); // hashes the bytes transparently as the parser reads them
			HtmlEditorKitParser htmlEditorKitParser = new HtmlEditorKitParser();
			htmlEditorKitParser.parse(this, hashInputStream, "UTF-8"); // NOTE(review): charset hard-coded to UTF-8; any charset in contentType is ignored — confirm intended
			hash = messageDigest.digest(); // finalize the content hash once parsing has consumed the stream
		} catch (IOException ioe1) {
			logger.log(Level.WARNING, "Could not fetch “" + url + "”.", ioe1);
112 // INTERFACE ParserListener
	public void parsedLink(InputStream inputStream, String linkTarget, String linkTitle, String linkText) {
			newLink = new URL(url, linkTarget); // resolve linkTarget relative to the page being fetched
			Page newPage = Page.fromURL(newLink); // Page.fromURL presumably returns null for links the crawler does not handle — TODO confirm
			if (newPage != null) {
				collectedPages.add(newPage);
		} catch (MalformedURLException mue1) {
			logger.log(Level.WARNING, "Could not create URL from “" + url + "” and “" + linkTarget + "”.", mue1);
134 public void parsedTitle(InputStream inputStream, String title) {