package net.pterodactylus.arachne.core;

import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

import net.pterodactylus.arachne.parser.HtmlEditorKitParser;
import net.pterodactylus.arachne.parser.ParserListener;
import de.ina.util.io.MessageDigestInputStream;
import de.ina.util.validation.Validation;
/**
 * Fetches URLs, parses the received content (if it is HTML) and adds all
 * resulting links to the queue in the core.
 *
 * @author David ‘Bombe’ Roden &lt;bombe@pterodactylus.net&gt;
 */
26 class URLFetcher implements Runnable, ParserListener {
29 private static final Logger logger = Logger.getLogger(URLFetcher.class.getName());
31 /** The URL to fetch. */
32 private final URL url;
34 /** The message digest. */
35 private final MessageDigest messageDigest;
37 /** The hash of the fetched URL. */
40 /** The collected URLs. */
41 private final List<Page> collectedPages = new ArrayList<Page>();
43 /** The title of the URL. */
47 * Creates a new fetcher for the given URL.
51 * @throws NoSuchAlgorithmException
52 * if no {@link MessageDigest} instance with an
53 * <code>SHA-256</code> algorithm can be created
55 public URLFetcher(URL url) throws NoSuchAlgorithmException {
56 Validation.begin().isNotNull("url", url).check();
58 messageDigest = MessageDigest.getInstance("SHA-256");
66 * Returns the title of the fetched URL.
68 * @return The fetched URL’s title
70 public String getTitle() {
75 * Returns the pages collected while parsing this URL.
77 * @return The collected pages
79 public List<Page> getCollectedPages() {
80 return collectedPages;
84 * Returns the hash of the content of the fetched URL. The returned value is
85 * only valid after {@link #run()} has been called.
87 * @return The hash of the fetched content
89 public byte[] getHash() {
90 byte[] hashCopy = new byte[hash.length];
91 System.arraycopy(hash, 0, hashCopy, 0, hash.length);
102 * @see java.lang.Runnable#run()
105 logger.log(Level.INFO, "Starting URL Fetcher for “" + url + "”.");
106 InputStream urlInputStream = null;
107 MessageDigestInputStream hashInputStream = null;
109 URLConnection urlConnection = url.openConnection();
110 long contentLength = urlConnection.getContentLength();
111 String contentType = urlConnection.getContentType();
112 logger.log(Level.INFO, "Type is “" + contentType + "”, length is " + contentLength + ".");
113 urlInputStream = urlConnection.getInputStream();
114 hashInputStream = new MessageDigestInputStream(urlInputStream, messageDigest);
115 HtmlEditorKitParser htmlEditorKitParser = new HtmlEditorKitParser();
116 htmlEditorKitParser.parse(this, hashInputStream, "UTF-8");
117 hash = messageDigest.digest();
118 } catch (IOException ioe1) {
119 logger.log(Level.WARNING, "Could not fetch “" + url + "”.", ioe1);
124 // INTERFACE ParserListener
130 public void parsedLink(InputStream inputStream, String linkTarget, String linkTitle, String linkText) {
133 newLink = new URL(url, linkTarget);
134 Page newPage = Page.fromURL(newLink);
135 if (newPage != null) {
136 collectedPages.add(newPage);
138 } catch (MalformedURLException mue1) {
139 logger.log(Level.WARNING, "Could not create URL from “" + url + "” and “" + linkTarget + "”.", mue1);
146 public void parsedTitle(InputStream inputStream, String title) {