Don’t try to parse when no parser was found.
[arachne.git] / src / net / pterodactylus / arachne / core / URLFetcher.java
index ddfbddd..ea109d0 100644 (file)
@@ -12,7 +12,8 @@ import java.util.List;
 import java.util.logging.Level;
 import java.util.logging.Logger;
 
-import net.pterodactylus.arachne.parser.HtmlEditorKitParser;
+import net.pterodactylus.arachne.parser.Parser;
+import net.pterodactylus.arachne.parser.ParserFactory;
 import net.pterodactylus.arachne.parser.ParserListener;
 import de.ina.util.io.MessageDigestInputStream;
 import de.ina.util.validation.Validation;
@@ -28,6 +29,9 @@ class URLFetcher implements Runnable, ParserListener {
        /** The logger. */
        private static final Logger logger = Logger.getLogger(URLFetcher.class.getName());
 
+       /** The parser factory. */
+       private final ParserFactory parserFactory;
+
        /** The URL to fetch. */
        private final URL url;
 
@@ -40,17 +44,24 @@ class URLFetcher implements Runnable, ParserListener {
        /** The collected URLs. */
        private final List<Page> collectedPages = new ArrayList<Page>();
 
+       /** The title of the URL. */
+       private String title;
+
        /**
         * Creates a new fetcher for the given URL.
         *
+        * @param parserFactory
+        *            The parser factory that is used to create content-type
+        *            specific parsers
         * @param url
         *            The URL to fetch
         * @throws NoSuchAlgorithmException
         *             if no {@link MessageDigest} instance with an
         *             <code>SHA-256</code> algorithm can be created
         */
-       public URLFetcher(URL url) throws NoSuchAlgorithmException {
-               Validation.begin().isNotNull("url", url).check();
+       public URLFetcher(ParserFactory parserFactory, URL url) throws NoSuchAlgorithmException {
+               Validation.begin().isNotNull("parserFactory", parserFactory).isNotNull("url", url).check();
+               this.parserFactory = parserFactory;
                this.url = url;
                messageDigest = MessageDigest.getInstance("SHA-256");
        }
@@ -60,6 +71,15 @@ class URLFetcher implements Runnable, ParserListener {
        //
 
        /**
+        * Returns the title of the fetched URL.
+        * <p>
+        * This is the value of the <code>title</code> field, which has no
+        * initializer and is assigned in
+        * {@link #parsedTitle(InputStream, String)}.
+        *
+        * @return The fetched URL’s title, or <code>null</code> if no title has
+        *         been stored yet
+        */
+       public String getTitle() {
+               return title;
+       }
+
+       /**
         * Returns the pages collected while parsing this URL.
         *
         * @return The collected pages
@@ -85,7 +105,7 @@ class URLFetcher implements Runnable, ParserListener {
        //
 
        /**
-        * {@inheritdoc}
+        * {@inheritDoc}
         *
         * @see java.lang.Runnable#run()
         */
@@ -100,8 +120,12 @@ class URLFetcher implements Runnable, ParserListener {
                        logger.log(Level.INFO, "Type is “" + contentType + "”, length is " + contentLength + ".");
                        urlInputStream = urlConnection.getInputStream();
                        hashInputStream = new MessageDigestInputStream(urlInputStream, messageDigest);
-                       HtmlEditorKitParser htmlEditorKitParser = new HtmlEditorKitParser();
-                       htmlEditorKitParser.parse(this, hashInputStream, "UTF-8");
+                       Parser parser = parserFactory.getParser(contentType);
+                       if (parser == null) {
+                               logger.log(Level.INFO, "No parser found for “" + contentType + "”.");
+                               return;
+                       }
+                       parser.parse(this, hashInputStream, "UTF-8");
                        hash = messageDigest.digest();
                } catch (IOException ioe1) {
                        logger.log(Level.WARNING, "Could not fetch “" + url + "”.", ioe1);
@@ -119,9 +143,13 @@ class URLFetcher implements Runnable, ParserListener {
                URL newLink = null;
                try {
                        newLink = new URL(url, linkTarget);
-                       Page newPage = Page.fromURL(newLink);
-                       if (newPage != null) {
-                               collectedPages.add(newPage);
+                       try {
+                               Page newPage = Page.fromURL(newLink);
+                               if (newPage != null) {
+                                       collectedPages.add(newPage);
+                               }
+                       } catch (IllegalArgumentException iae1) {
+                               /* ignore. */
                        }
                } catch (MalformedURLException mue1) {
                        logger.log(Level.WARNING, "Could not create URL from “" + url + "” and “" + linkTarget + "”.", mue1);
@@ -132,6 +160,7 @@ class URLFetcher implements Runnable, ParserListener {
         * {@inheritDoc}
         */
        public void parsedTitle(InputStream inputStream, String title) {
+               this.title = title; /* kept for getTitle(); inputStream is not used here */
        }
 
 }