Use a default parser factory in the core and hand it in to every URL fetcher.

[arachne.git] / src / net / pterodactylus / arachne / core / URLFetcher.java
diff --git a/src/net/pterodactylus/arachne/core/URLFetcher.java b/src/net/pterodactylus/arachne/core/URLFetcher.java

index df8180f..cdccd2b 100644 (file)
--- a/src/net/pterodactylus/arachne/core/URLFetcher.java
+++ b/src/net/pterodactylus/arachne/core/URLFetcher.java
@@ -2,13 +2,20 @@ package net.pterodactylus.arachne.core;
  
  import java.io.IOException;
  import java.io.InputStream;
+import java.net.MalformedURLException;
  import java.net.URL;
  import java.net.URLConnection;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.util.ArrayList;
+import java.util.List;
  import java.util.logging.Level;
  import java.util.logging.Logger;
  
-import net.pterodactylus.arachne.parser.HtmlEditorKitParser;
+import net.pterodactylus.arachne.parser.Parser;
+import net.pterodactylus.arachne.parser.ParserFactory;
  import net.pterodactylus.arachne.parser.ParserListener;
+import de.ina.util.io.MessageDigestInputStream;
  import de.ina.util.validation.Validation;
  
  /**
@@ -22,40 +29,100 @@ class URLFetcher implements Runnable, ParserListener {
         /** The logger. */
         private static final Logger logger = Logger.getLogger(URLFetcher.class.getName());
  
-       /** The core. */
-       private final Core core;
+       /** The parser factory. */
+       private final ParserFactory parserFactory;
  
         /** The URL to fetch. */
         private final URL url;
  
+       /** The message digest. */
+       private final MessageDigest messageDigest;
+
+       /** The hash of the fetched URL. */
+       private byte[] hash;
+
+       /** The pages collected from the links found while parsing. */
+       private final List<Page> collectedPages = new ArrayList<Page>();
+
+       /** The title of the URL. */
+       private String title;
+
         /**
          * Creates a new fetcher for the given URL.
          *
+        * @param parserFactory
+        *            The parser factory that is used to create content-type
+        *            specific parsers
          * @param url
          *            The URL to fetch
-        * @param core
-        *            TODO
+        * @throws NoSuchAlgorithmException
+        *             if no {@link MessageDigest} instance with an
+        *             <code>SHA-256</code> algorithm can be created
          */
-       public URLFetcher(Core core, URL url) {
-               Validation.begin().isNotNull("core", core).isNotNull("url", url).check();
-               this.core = core;
+       public URLFetcher(ParserFactory parserFactory, URL url) throws NoSuchAlgorithmException {
+               Validation.begin().isNotNull("parserFactory", parserFactory).isNotNull("url", url).check();
+               this.parserFactory = parserFactory;
                 this.url = url;
+               messageDigest = MessageDigest.getInstance("SHA-256");
+       }
+
+       //
+       // ACCESSORS
+       //
+
+       /**
+        * Returns the title of the fetched URL.
+        *
+        * @return The fetched URL’s title
+        */
+       public String getTitle() {
+               return title;
         }
  
         /**
+        * Returns the pages collected while parsing this URL.
+        *
+        * @return The collected pages
+        */
+       public List<Page> getCollectedPages() {
+               return new ArrayList<Page>(collectedPages); // copy, so callers cannot alter our state
+       }
+
+       /**
+        * Returns a copy of the hash of the content of the fetched URL. A hash is
+        * only available after {@link #run()} has fetched the URL successfully.
+        *
+        * @return The hash of the fetched content, or <code>null</code> if none
+        */
+       public byte[] getHash() {
+               // hash is only assigned by run(), so it may still be null here.
+               byte[] currentHash = hash;
+               return (currentHash != null) ? currentHash.clone() : null;
+       }
+
+       //
+       // INTERFACE Runnable
+       //
+
+       /**
          * {@inheritdoc}
          *
          * @see java.lang.Runnable#run()
          */
         public void run() {
                 logger.log(Level.INFO, "Starting URL Fetcher for “" + url + "”.");
+               InputStream urlInputStream = null;
+               MessageDigestInputStream hashInputStream = null;
                 try {
                         URLConnection urlConnection = url.openConnection();
                         long contentLength = urlConnection.getContentLength();
                         String contentType = urlConnection.getContentType();
                         logger.log(Level.INFO, "Type is “" + contentType + "”, length is " + contentLength + ".");
-                       HtmlEditorKitParser htmlEditorKitParser = new HtmlEditorKitParser();
-                       htmlEditorKitParser.parse(this, urlConnection.getInputStream(), "UTF-8");
+                       urlInputStream = urlConnection.getInputStream();
+                       hashInputStream = new MessageDigestInputStream(urlInputStream, messageDigest);
+                       Parser parser = parserFactory.getParser(contentType);
+                       parser.parse(this, hashInputStream, "UTF-8");
+                       hash = messageDigest.digest();
                 } catch (IOException ioe1) {
                         logger.log(Level.WARNING, "Could not fetch “" + url + "”.", ioe1);
                 }
@@ -69,14 +136,23 @@ class URLFetcher implements Runnable, ParserListener {
          * {@inheritDoc}
          */
         public void parsedLink(InputStream inputStream, String linkTarget, String linkTitle, String linkText) {
-               System.out.println("Found link to “" + linkTarget + "” named “" + linkText + "” or “" + linkTitle + "”.");
+               URL newLink = null;
+               try {
+                       newLink = new URL(url, linkTarget);
+                       Page newPage = Page.fromURL(newLink);
+                       if (newPage != null) {
+                               collectedPages.add(newPage);
+                       }
+               } catch (MalformedURLException mue1) {
+                       logger.log(Level.WARNING, "Could not create URL from “" + url + "” and “" + linkTarget + "”.", mue1);
+               }
         }
  
         /**
          * {@inheritDoc}
          */
         public void parsedTitle(InputStream inputStream, String title) {
-               System.out.println("Found title “" + title + "”.");
+               this.title = title;
         }
  
  }