X-Git-Url: https://git.pterodactylus.net/?a=blobdiff_plain;f=src%2Fnet%2Fpterodactylus%2Farachne%2Fcore%2FURLFetcher.java;h=cdccd2bcd433d1e62da7e0c2f80752dcc5308c02;hb=a637923e6ea2bd6d53180b7e843b02a191b6d6e5;hp=8f8465dbbde9f49d15df0feaa13739e490a56530;hpb=8102ef3404b97d753d8166bd0f995745907d43a5;p=arachne.git diff --git a/src/net/pterodactylus/arachne/core/URLFetcher.java b/src/net/pterodactylus/arachne/core/URLFetcher.java index 8f8465d..cdccd2b 100644 --- a/src/net/pterodactylus/arachne/core/URLFetcher.java +++ b/src/net/pterodactylus/arachne/core/URLFetcher.java @@ -1,11 +1,21 @@ package net.pterodactylus.arachne.core; import java.io.IOException; +import java.io.InputStream; +import java.net.MalformedURLException; import java.net.URL; import java.net.URLConnection; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.ArrayList; +import java.util.List; import java.util.logging.Level; import java.util.logging.Logger; +import net.pterodactylus.arachne.parser.Parser; +import net.pterodactylus.arachne.parser.ParserFactory; +import net.pterodactylus.arachne.parser.ParserListener; +import de.ina.util.io.MessageDigestInputStream; import de.ina.util.validation.Validation; /** @@ -14,31 +24,86 @@ import de.ina.util.validation.Validation; * * @author David ‘Bombe’ Roden */ -class URLFetcher implements Runnable { +class URLFetcher implements Runnable, ParserListener { /** The logger. */ private static final Logger logger = Logger.getLogger(URLFetcher.class.getName()); - /** The core. */ - private final Core core; + /** The parser factory. */ + private final ParserFactory parserFactory; /** The URL to fetch. */ private final URL url; + /** The message digest. */ + private final MessageDigest messageDigest; + + /** The hash of the fetched URL. */ + private byte[] hash; + + /** The collected URLs. */ + private final List collectedPages = new ArrayList(); + + /** The title of the URL. */ + private String title; + /** * Creates a new fetcher for the given URL. * + * @param parserFactory + * The parser factory that is used to create content-type + * specific parsers * @param url * The URL to fetch - * @param core - * TODO + * @throws NoSuchAlgorithmException + * if no {@link MessageDigest} instance with an + * SHA-256 algorithm can be created */ - public URLFetcher(Core core, URL url) { - Validation.begin().isNotNull("core", core).isNotNull("url", url).check(); - this.core = core; + public URLFetcher(ParserFactory parserFactory, URL url) throws NoSuchAlgorithmException { + Validation.begin().isNotNull("parserFactory", parserFactory).isNotNull("url", url).check(); + this.parserFactory = parserFactory; this.url = url; + messageDigest = MessageDigest.getInstance("SHA-256"); + } + + // + // ACCESSORS + // + + /** + * Returns the title of the fetched URL. + * + * @return The fetched URL’s title + */ + public String getTitle() { + return title; + } + + /** + * Returns the pages collected while parsing this URL. + * + * @return The collected pages + */ + public List getCollectedPages() { + return collectedPages; + } + + /** + * Returns the hash of the content of the fetched URL. The returned value is + * only valid after {@link #run()} has been called. + * + * @return The hash of the fetched content + */ + public byte[] getHash() { + byte[] hashCopy = new byte[hash.length]; + System.arraycopy(hash, 0, hashCopy, 0, hash.length); + return hashCopy; } + // + // INTERFACE Runnable + // + /** * {@inheritdoc} * @@ -46,14 +111,48 @@ class URLFetcher implements Runnable { */ public void run() { logger.log(Level.INFO, "Starting URL Fetcher for “" + url + "”."); + InputStream urlInputStream = null; + MessageDigestInputStream hashInputStream = null; try { URLConnection urlConnection = url.openConnection(); long contentLength = urlConnection.getContentLength(); String contentType = urlConnection.getContentType(); logger.log(Level.INFO, "Type is “" + contentType + "”, length is " + contentLength + "."); + urlInputStream = urlConnection.getInputStream(); + hashInputStream = new MessageDigestInputStream(urlInputStream, messageDigest); + Parser parser = parserFactory.getParser(contentType); + parser.parse(this, hashInputStream, "UTF-8"); + hash = messageDigest.digest(); } catch (IOException ioe1) { logger.log(Level.WARNING, "Could not fetch “" + url + "”.", ioe1); } } + // + // INTERFACE ParserListener + // + + /** + * {@inheritDoc} + */ + public void parsedLink(InputStream inputStream, String linkTarget, String linkTitle, String linkText) { + URL newLink = null; + try { + newLink = new URL(url, linkTarget); + Page newPage = Page.fromURL(newLink); + if (newPage != null) { + collectedPages.add(newPage); + } + } catch (MalformedURLException mue1) { + logger.log(Level.WARNING, "Could not create URL from “" + url + "” and “" + linkTarget + "”.", mue1); + } + } + + /** + * {@inheritDoc} + */ + public void parsedTitle(InputStream inputStream, String title) { + this.title = title; + } + }