/*
 * © 2009 David ‘Bombe’ Roden
 */

package net.pterodactylus.arachne.core;

import java.net.MalformedURLException;
import java.net.URL;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Executor;
import java.util.concurrent.Executors;
import java.util.logging.Level;
import java.util.logging.Logger;

import net.pterodactylus.arachne.parser.ParserFactory;
import de.ina.util.service.AbstractService;
import de.ina.util.thread.DumpingThreadFactory;
import de.ina.util.validation.Validation;
/**
 * The Arachne core: it keeps the queue of pages that should be crawled and
 * hands each page to a URL fetcher.
 *
 * @author David ‘Bombe’ Roden <bombe@pterodactylus.net>
 */
public class Core extends AbstractService {
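
	/*
	 * A minimal usage sketch, not part of the original source: it assumes
	 * that start() is inherited from AbstractService, and the USK below is
	 * made up.
	 *
	 *   Core core = new Core();
	 *   core.setNodeHost("localhost");
	 *   core.start();
	 *   core.addPage("http://localhost:8888/USK@.../index/0/");
	 */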

	/** The logger. */
	private static final Logger logger = Logger.getLogger(Core.class.getName());

	/** The host of the Freenet node. */
	private String nodeHost = "localhost";

	/** The port of the Freenet node. */
	private int nodePort = 8888;

	/** The parser factory. */
	private ParserFactory parserFactory = new ParserFactory();

	/** Thread pool for the URL fetches; a single thread keeps fetches sequential. */
	private Executor urlFetcherExecutor = Executors.newFixedThreadPool(1, new DumpingThreadFactory("URLFetcher-"));

	/** The current list of URLs to crawl. */
	private final List<Page> pages = new ArrayList<Page>();

	/** Synchronization object guarding {@link #pages}. */
	private final Object syncObject = new Object();
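
	/*
	 * Note (added): pages acts as a FIFO work queue. addPage() appends a page
	 * and notifies on syncObject; serviceRun() waits on syncObject and
	 * removes pages from the front of the list.
	 */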

	/**
	 * Sets the host name of the node.
	 *
	 * @param nodeHost
	 *            The node’s host name
	 */
	public void setNodeHost(String nodeHost) {
		this.nodeHost = nodeHost;
	}

	/**
	 * Adds the given URL to the list of pages to crawl. The URL must point at
	 * the configured node.
	 *
	 * @param url
	 *            The URL of the page to crawl
	 */
	public void addPage(URL url) {
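		/*
		 * The chain below runs two checks: the first rejects a null URL, the
		 * second verifies that the URL points at the configured node (same
		 * host and port).
		 */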
		Validation.begin().isNotNull("url", url).check().isEqual("url.getHost()", url.getHost(), (Object) nodeHost).isEqual("url.getPort()", url.getPort(), nodePort).check();
		addPage(Page.fromURL(url));
	}

	/**
	 * Adds the given URL to the list of pages to crawl.
	 *
	 * @param url
	 *            The URL of the page to crawl
	 * @throws MalformedURLException
	 *             if the URL is not a valid URL
	 */
	public void addPage(String url) throws MalformedURLException {
		Validation.begin().isNotNull("url", (Object) url).check();
		addPage(new URL(url));
	}

	/**
	 * Adds the given page to the list of pages to crawl.
	 *
	 * @param page
	 *            The page to crawl
	 */
	public void addPage(Page page) {
		Validation.begin().isNotNull("page", page).check();
		synchronized (syncObject) {
			pages.add(page);
			/* Wake up serviceRun(), which may be waiting for pages. */
			syncObject.notifyAll();
		}
	}

	/**
	 * {@inheritDoc}
	 *
	 * @see de.ina.util.service.AbstractService#serviceRun()
	 */
	@Override
	@SuppressWarnings("null")
	protected void serviceRun() {
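		/*
		 * Main crawl loop: wait until a page is queued (or the service is
		 * told to stop), resolve the page to a URL, and hand the URL to the
		 * fetcher thread pool.
		 */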
		while (!shouldStop()) {
			Page nextPage = null;
			synchronized (syncObject) {
				while (!shouldStop() && pages.isEmpty()) {
					try {
						syncObject.wait();
					} catch (InterruptedException ie1) {
						/* Ignore; the loop re-checks the condition. */
					}
				}
				if (!shouldStop()) {
					nextPage = pages.remove(0);
				}
			}
			if (shouldStop()) {
				break;
			}
			/* nextPage is non-null past this point, hence the @SuppressWarnings. */
			URL nextURL = nextPage.toURL(nodeHost, nodePort);
			if (nextURL == null) {
				logger.log(Level.INFO, "Skipping “" + nextPage + "”.");
				continue;
			}
			final URLFetcher urlFetcher;
			try {
				logger.log(Level.INFO, "Fetching “" + nextURL + "”...");
				urlFetcher = new URLFetcher(parserFactory, nextURL);
				urlFetcherExecutor.execute(new Runnable() {

					public void run() {
						/* Assumption: URLFetcher is a Runnable whose run() performs the fetch. */
						urlFetcher.run();
						for (Page page : urlFetcher.getCollectedPages()) {
							addPage(page);
						}
					}
				});
			} catch (NoSuchAlgorithmException nsae1) {
				logger.log(Level.SEVERE, "Could not get “SHA-256” message digest!", nsae1);
			}
		}
	}

}