Implemented adding a page by URL.
[arachne.git] / src / net / pterodactylus / arachne / core / Core.java
index f3fd5de..a2c50cd 100644 (file)
@@ -5,11 +5,17 @@ package net.pterodactylus.arachne.core;
 
 import java.net.MalformedURLException;
 import java.net.URL;
+import java.security.NoSuchAlgorithmException;
 import java.util.ArrayList;
-import java.util.Collections;
 import java.util.List;
+import java.util.concurrent.Executor;
+import java.util.concurrent.Executors;
+import java.util.logging.Level;
+import java.util.logging.Logger;
 
+import net.pterodactylus.arachne.parser.ParserFactory;
 import de.ina.util.service.AbstractService;
+import de.ina.util.thread.DumpingThreadFactory;
 import de.ina.util.validation.Validation;
 
 /**
@@ -19,6 +25,9 @@ import de.ina.util.validation.Validation;
  */
 public class Core extends AbstractService {
 
+       /** The logger. */
+       private static final Logger logger = Logger.getLogger(Core.class.getName());
+
        //
        // PROPERTIES
        //
@@ -33,8 +42,28 @@ public class Core extends AbstractService {
        // INTERNAL MEMBERS
        //
 
+       /** The parser factory. */
+       private ParserFactory parserFactory = new ParserFactory();
+
+       /** Thread pool for the URL fetches. */
+       private Executor urlFetcherExecutor = Executors.newFixedThreadPool(1, new DumpingThreadFactory("URLFetcher-"));
+
        /** The current list of URLs to crawl. */
-       private final List<Page> pages = Collections.synchronizedList(new ArrayList<Page>());
+       private final List<Page> pages = new ArrayList<Page>();
+
+       //
+       // ACCESSORS
+       //
+
+       /**
+        * Sets the host name of the node.
+        *
+        * @param nodeHost
+        *            The node’s host name
+        */
+       public void setNodeHost(String nodeHost) {
+               this.nodeHost = nodeHost;
+       }
 
        //
        // ACTIONS
@@ -48,38 +77,7 @@ public class Core extends AbstractService {
         */
        public void addPage(URL url) {
                Validation.begin().isNotNull("url", url).check().isEqual("url.getHost()", url.getHost(), (Object) nodeHost).isEqual("url.getPort()", url.getPort(), nodePort).check();
-               String path = url.getPath();
-               if (path.length() == 0) {
-                       path = "/";
-               }
-               String[] pathComponents = path.split("/");
-               if (pathComponents.length < 2) {
-                       throw new IllegalArgumentException("URL “" + url + "” is not a valid freenet page.");
-               }
-               String siteName = pathComponents[1];
-               String[] siteComponents = siteName.split("@");
-               if (siteComponents.length != 2) {
-                       throw new IllegalArgumentException("siteName “" + siteName + "” is not a valid freenet page.");
-               }
-               if (!"USK".equals(siteComponents[0]) && !"SSK".equals(siteComponents[0]) && !"CHK".equals(siteComponents[0])) {
-                       throw new IllegalArgumentException("siteName “" + siteName + "” is not a valid freenet page.");
-               }
-               if ("USK".equals(siteComponents[0])) {
-                       Site site = new Site(siteComponents[1], pathComponents[2]);
-                       Edition edition = new Edition(site, Integer.parseInt(pathComponents[3]));
-                       Page page = new Page(edition, createPath(pathComponents, 4));
-                       addPage(page);
-               }
-               if ("SSK".equals(siteComponents[0])) {
-                       int lastDash = pathComponents[2].lastIndexOf('-');
-                       String basename = pathComponents[2].substring(0, lastDash);
-                       int editionNumber = Integer.parseInt(pathComponents[2].substring(lastDash + 1));
-                       Site site = new Site(siteComponents[1], basename);
-                       Edition edition = new Edition(site, editionNumber);
-                       Page page = new Page(edition, createPath(pathComponents, 3));
-                       addPage(page);
-               }
-               /* TODO: handle CHK */
+               addPage(Page.fromURL(url));
        }
 
        /**
@@ -108,30 +106,48 @@ public class Core extends AbstractService {
        }
 
        //
-       // PRIVATE METHODS
+       // SERVICE METHODS
        //
 
        /**
-        * Creates a path from the given String array, starting at the given index.
-        * The path is created by joining all Strings from the array, separating
-        * them with a slash (‘/’).
+        * {@inheritDoc}
         *
-        * @param pathComponents
-        *            The array of path components
-        * @param index
-        *            The index of the first path components
-        * @return The joined path
+        * @see de.ina.util.service.AbstractService#serviceRun()
         */
-       private String createPath(String[] pathComponents, int index) {
-               Validation.begin().isNotNull("pathComponents", pathComponents).check().isLess("index", pathComponents.length, index).check();
-               StringBuilder path = new StringBuilder();
-               for (int pathComponentIndex = index; pathComponentIndex < pathComponents.length; pathComponentIndex++) {
-                       if (path.length() > 0) {
-                               path.append('/');
+       @Override
+       @SuppressWarnings("null")
+       protected void serviceRun() {
+               while (!shouldStop()) {
+                       Page nextPage = null;
+                       synchronized (syncObject) {
+                               while (!shouldStop() && pages.isEmpty()) {
+                                       try {
+                                               syncObject.wait();
+                                       } catch (InterruptedException ie1) {
+                                               /* ignore. */
+                                       }
+                               }
+                               if (!shouldStop()) {
+                                       nextPage = pages.remove(0);
+                               }
+                       }
+                       if (shouldStop()) {
+                               break;
+                       }
+                       URL nextURL = nextPage.toURL(nodeHost, nodePort);
+                       if (nextURL == null) {
+                               logger.log(Level.INFO, "Skipping “" + nextPage + "”.");
+                               continue;
+                       }
+                       URLFetcher urlFetcher;
+                       try {
+                               logger.log(Level.INFO, "Fetching “" + nextURL + "”...");
+                               urlFetcher = new URLFetcher(parserFactory, nextURL);
+                               urlFetcherExecutor.execute(urlFetcher);
+                       } catch (NoSuchAlgorithmException nsae1) {
+                               logger.log(Level.SEVERE, "Could not get “SHA-256” message digest!", nsae1);
                        }
-                       path.append(pathComponents[pathComponentIndex]);
                }
-               return path.toString();
        }
 
 }