1 package net.pterodactylus.arachne.core;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

import net.pterodactylus.arachne.parser.Parser;
import net.pterodactylus.arachne.parser.ParserFactory;
import net.pterodactylus.arachne.parser.ParserListener;
import de.ina.util.io.MessageDigestInputStream;
import de.ina.util.validation.Validation;
22 * Fetches URLs, parses the received content (if it is HTML) and adds all
23 * resulting links to the queue in the core.
25 * @author David ‘Bombe’ Roden <bombe@pterodactylus.net>
27 class URLFetcher implements Runnable, ParserListener {
30 private static final Logger logger = Logger.getLogger(URLFetcher.class.getName());
32 /** The parser factory. */
33 private final ParserFactory parserFactory;
35 /** The URL to fetch. */
36 private final URL url;
38 /** The message digest. */
39 private final MessageDigest messageDigest;
41 /** The hash of the fetched URL. */
44 /** The collected URLs. */
45 private final List<Page> collectedPages = new ArrayList<Page>();
47 /** The title of the URL. */
51 * Creates a new fetcher for the given URL.
53 * @param parserFactory
54 * The parser factory that is used to create content-type
58 * @throws NoSuchAlgorithmException
59 * if no {@link MessageDigest} instance with an
60 * <code>SHA-256</code> algorithm can be created
62 public URLFetcher(ParserFactory parserFactory, URL url) throws NoSuchAlgorithmException {
63 Validation.begin().isNotNull("parserFactory", parserFactory).isNotNull("url", url).check();
64 this.parserFactory = parserFactory;
66 messageDigest = MessageDigest.getInstance("SHA-256");
74 * Returns the title of the fetched URL.
76 * @return The fetched URL’s title
78 public String getTitle() {
83 * Returns the pages collected while parsing this URL.
85 * @return The collected pages
87 public List<Page> getCollectedPages() {
88 return collectedPages;
92 * Returns the hash of the content of the fetched URL. The returned value is
93 * only valid after {@link #run()} has been called.
95 * @return The hash of the fetched content
97 public byte[] getHash() {
98 byte[] hashCopy = new byte[hash.length];
99 System.arraycopy(hash, 0, hashCopy, 0, hash.length);
104 // INTERFACE Runnable
110 * @see java.lang.Runnable#run()
113 logger.log(Level.INFO, "Starting URL Fetcher for “" + url + "”.");
114 InputStream urlInputStream = null;
115 MessageDigestInputStream hashInputStream = null;
117 URLConnection urlConnection = url.openConnection();
118 long contentLength = urlConnection.getContentLength();
119 String contentType = urlConnection.getContentType();
120 logger.log(Level.INFO, "Type is “" + contentType + "”, length is " + contentLength + ".");
121 urlInputStream = urlConnection.getInputStream();
122 hashInputStream = new MessageDigestInputStream(urlInputStream, messageDigest);
123 Parser parser = parserFactory.getParser(contentType);
124 parser.parse(this, hashInputStream, "UTF-8");
125 hash = messageDigest.digest();
126 } catch (IOException ioe1) {
127 logger.log(Level.WARNING, "Could not fetch “" + url + "”.", ioe1);
132 // INTERFACE ParserListener
138 public void parsedLink(InputStream inputStream, String linkTarget, String linkTitle, String linkText) {
141 newLink = new URL(url, linkTarget);
142 Page newPage = Page.fromURL(newLink);
143 if (newPage != null) {
144 collectedPages.add(newPage);
146 } catch (MalformedURLException mue1) {
147 logger.log(Level.WARNING, "Could not create URL from “" + url + "” and “" + linkTarget + "”.", mue1);
154 public void parsedTitle(InputStream inputStream, String title) {