X-Git-Url: https://git.pterodactylus.net/?a=blobdiff_plain;f=src%2Fnet%2Fpterodactylus%2Farachne%2Fcore%2FURLFetcher.java;h=ea109d0c82c49f3f6ebabe4ee5d95b0a5bd40dea;hb=bc2a6cc23e93037cae7f52815f20bdf24504ce5d;hp=d2ce831681cf60089865490ebf309b4614fafc35;hpb=60ed0322361a6f8110e1c8191138039adaa0f6b0;p=arachne.git diff --git a/src/net/pterodactylus/arachne/core/URLFetcher.java b/src/net/pterodactylus/arachne/core/URLFetcher.java index d2ce831..ea109d0 100644 --- a/src/net/pterodactylus/arachne/core/URLFetcher.java +++ b/src/net/pterodactylus/arachne/core/URLFetcher.java @@ -12,7 +12,8 @@ import java.util.List; import java.util.logging.Level; import java.util.logging.Logger; -import net.pterodactylus.arachne.parser.HtmlEditorKitParser; +import net.pterodactylus.arachne.parser.Parser; +import net.pterodactylus.arachne.parser.ParserFactory; import net.pterodactylus.arachne.parser.ParserListener; import de.ina.util.io.MessageDigestInputStream; import de.ina.util.validation.Validation; @@ -28,6 +29,9 @@ class URLFetcher implements Runnable, ParserListener { /** The logger. */ private static final Logger logger = Logger.getLogger(URLFetcher.class.getName()); + /** The parser factory. */ + private final ParserFactory parserFactory; + /** The URL to fetch. */ private final URL url; @@ -46,14 +50,18 @@ class URLFetcher implements Runnable, ParserListener { /** * Creates a new fetcher for the given URL. * + * @param parserFactory + * The parser factory that is used to create content-type + * specific parsers * @param url * The URL to fetch * @throws NoSuchAlgorithmException * if no {@link MessageDigest} instance with an * SHA-256 algorithm can be created */ - public URLFetcher(URL url) throws NoSuchAlgorithmException { - Validation.begin().isNotNull("url", url).check(); + public URLFetcher(ParserFactory parserFactory, URL url) throws NoSuchAlgorithmException { + Validation.begin().isNotNull("parserFactory", parserFactory).isNotNull("url", url).check(); + this.parserFactory = parserFactory; this.url = url; messageDigest = MessageDigest.getInstance("SHA-256"); } @@ -97,7 +105,7 @@ class URLFetcher implements Runnable, ParserListener { // /** - * {@inheritdoc} + * {@inheritDoc} * * @see java.lang.Runnable#run() */ @@ -112,8 +120,12 @@ class URLFetcher implements Runnable, ParserListener { logger.log(Level.INFO, "Type is “" + contentType + "”, length is " + contentLength + "."); urlInputStream = urlConnection.getInputStream(); hashInputStream = new MessageDigestInputStream(urlInputStream, messageDigest); - HtmlEditorKitParser htmlEditorKitParser = new HtmlEditorKitParser(); - htmlEditorKitParser.parse(this, hashInputStream, "UTF-8"); + Parser parser = parserFactory.getParser(contentType); + if (parser == null) { + logger.log(Level.INFO, "No parser found for “" + contentType + "”."); + return; + } + parser.parse(this, hashInputStream, "UTF-8"); hash = messageDigest.digest(); } catch (IOException ioe1) { logger.log(Level.WARNING, "Could not fetch “" + url + "”.", ioe1); @@ -131,9 +143,13 @@ class URLFetcher implements Runnable, ParserListener { URL newLink = null; try { newLink = new URL(url, linkTarget); - Page newPage = Page.fromURL(newLink); - if (newPage != null) { - collectedPages.add(newPage); + try { + Page newPage = Page.fromURL(newLink); + if (newPage != null) { + collectedPages.add(newPage); + } + } catch (IllegalArgumentException iae1) { + /* ignore. */ } } catch (MalformedURLException mue1) { logger.log(Level.WARNING, "Could not create URL from “" + url + "” and “" + linkTarget + "”.", mue1);