X-Git-Url: https://git.pterodactylus.net/?a=blobdiff_plain;f=src%2Fnet%2Fpterodactylus%2Farachne%2Fcore%2FURLFetcher.java;h=59ad6230a034dcc971cdd1baf477f3f89ad3037d;hb=2257173f299e8d0af6e28a19f87ab6036b75ef80;hp=d57e15f69960a3073daa2fe7332df1e2aafc79cb;hpb=cb88a7c9368c4bc32ba5c6caf32c5c51110fde35;p=arachne.git diff --git a/src/net/pterodactylus/arachne/core/URLFetcher.java b/src/net/pterodactylus/arachne/core/URLFetcher.java index d57e15f..59ad623 100644 --- a/src/net/pterodactylus/arachne/core/URLFetcher.java +++ b/src/net/pterodactylus/arachne/core/URLFetcher.java @@ -7,12 +7,16 @@ import java.net.URL; import java.net.URLConnection; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; +import java.util.ArrayList; +import java.util.List; import java.util.logging.Level; import java.util.logging.Logger; -import net.pterodactylus.arachne.parser.HtmlEditorKitParser; +import net.pterodactylus.arachne.parser.Parser; +import net.pterodactylus.arachne.parser.ParserFactory; import net.pterodactylus.arachne.parser.ParserListener; import de.ina.util.io.MessageDigestInputStream; +import de.ina.util.number.Hex; import de.ina.util.validation.Validation; /** @@ -26,8 +30,8 @@ class URLFetcher implements Runnable, ParserListener { /** The logger. */ private static final Logger logger = Logger.getLogger(URLFetcher.class.getName()); - /** The core. */ - private final Core core; + /** The parser factory. */ + private final ParserFactory parserFactory; /** The URL to fetch. */ private final URL url; @@ -38,20 +42,27 @@ class URLFetcher implements Runnable, ParserListener { /** The hash of the fetched URL. */ private byte[] hash; + /** The collected URLs. */ + private final List collectedPages = new ArrayList(); + + /** The title of the URL. */ + private String title; + /** * Creates a new fetcher for the given URL. * - * @param core - * The core new pages are queued in + * @param parserFactory + * The parser factory that is used to create content-type + * specific parsers * @param url * The URL to fetch * @throws NoSuchAlgorithmException * if no {@link MessageDigest} instance with an * SHA-256 algorithm can be created */ - public URLFetcher(Core core, URL url) throws NoSuchAlgorithmException { - Validation.begin().isNotNull("core", core).isNotNull("url", url).check(); - this.core = core; + public URLFetcher(ParserFactory parserFactory, URL url) throws NoSuchAlgorithmException { + Validation.begin().isNotNull("parserFactory", parserFactory).isNotNull("url", url).check(); + this.parserFactory = parserFactory; this.url = url; messageDigest = MessageDigest.getInstance("SHA-256"); } @@ -61,6 +72,24 @@ class URLFetcher implements Runnable, ParserListener { // /** + * Returns the title of the fetched URL. + * + * @return The fetched URL’s title + */ + public String getTitle() { + return title; + } + + /** + * Returns the pages collected while parsing this URL. + * + * @return The collected pages + */ + public List getCollectedPages() { + return collectedPages; + } + + /** * Returns the hash of the content of the fetched URL. The returned value is * only valid after {@link #run()} has been called. * @@ -77,7 +106,7 @@ class URLFetcher implements Runnable, ParserListener { // /** - * {@inheritdoc} + * {@inheritDoc} * * @see java.lang.Runnable#run() */ @@ -92,9 +121,14 @@ class URLFetcher implements Runnable, ParserListener { logger.log(Level.INFO, "Type is “" + contentType + "”, length is " + contentLength + "."); urlInputStream = urlConnection.getInputStream(); hashInputStream = new MessageDigestInputStream(urlInputStream, messageDigest); - HtmlEditorKitParser htmlEditorKitParser = new HtmlEditorKitParser(); - htmlEditorKitParser.parse(this, hashInputStream, "UTF-8"); + Parser parser = parserFactory.getParser(contentType); + if (parser == null) { + logger.log(Level.INFO, "No parser found for “" + contentType + "”."); + return; + } + parser.parse(this, hashInputStream, "UTF-8"); hash = messageDigest.digest(); + logger.log(Level.FINE, "Hashed “" + url + "” to “" + Hex.toHex(hash) + "”."); } catch (IOException ioe1) { logger.log(Level.WARNING, "Could not fetch “" + url + "”.", ioe1); } @@ -111,11 +145,16 @@ class URLFetcher implements Runnable, ParserListener { URL newLink = null; try { newLink = new URL(url, linkTarget); - core.addPage(newLink); + try { + Page newPage = Page.fromURL(newLink); + if (newPage != null) { + collectedPages.add(newPage); + } + } catch (IllegalArgumentException iae1) { + /* ignore. */ + } } catch (MalformedURLException mue1) { logger.log(Level.WARNING, "Could not create URL from “" + url + "” and “" + linkTarget + "”.", mue1); - } catch (IllegalArgumentException iae1) { - logger.log(Level.WARNING, "Could not add “" + newLink + "” to core queue.", iae1); } } @@ -123,6 +162,7 @@ class URLFetcher implements Runnable, ParserListener { * {@inheritDoc} */ public void parsedTitle(InputStream inputStream, String title) { + this.title = title; } }