/*
 * © 2009 David ‘Bombe’ Roden
 */

package net.pterodactylus.arachne.core;
import java.net.MalformedURLException;
import java.net.URL;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import java.util.concurrent.Executor;
import java.util.concurrent.Executors;
import java.util.logging.Level;
import java.util.logging.Logger;

import net.pterodactylus.arachne.parser.ParserFactory;
import de.ina.util.service.AbstractService;
import de.ina.util.thread.DumpingThreadFactory;
import de.ina.util.validation.Validation;
/**
 * Arachne’s core: maintains the queue of pages to crawl and dispatches each
 * queued page to a background fetcher.
 *
 * @author David ‘Bombe’ Roden <bombe@pterodactylus.net>
 */
public class Core extends AbstractService {
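
    /*
     * Illustrative usage sketch, not part of the original sources: it
     * assumes the AbstractService lifecycle is started via start(), and
     * the freesite URL is a placeholder.
     *
     *   Core core = new Core();
     *   core.setNodeHost("localhost");
     *   core.start();
     *   core.addPage("http://localhost:8888/USK@.../example-site/0/");
     */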

    /** The logger. */
    private static final Logger logger = Logger.getLogger(Core.class.getName());

    /** The host of the Freenet node. */
    private String nodeHost = "localhost";

    /** The port of the Freenet node (8888 is fproxy’s default). */
    private int nodePort = 8888;

    /** The parser factory. */
    private ParserFactory parserFactory = new ParserFactory();

    /** Thread pool for the URL fetches; a single thread, so fetches run one at a time. */
    private Executor urlFetcherExecutor = Executors.newSingleThreadExecutor(new DumpingThreadFactory("URLFetcher-"));

    /** The current queue of pages to crawl. */
    private final List<Page> pages = new ArrayList<Page>();

    /** Already crawled pages. */
    private final Set<Page> crawledPages = new HashSet<Page>();

    /** Object used to synchronize access to {@link #pages}. */
    private final Object syncObject = new Object();

    /**
     * Sets the host name of the node.
     *
     * @param nodeHost
     *            The node’s host name
     */
    public void setNodeHost(String nodeHost) {
        this.nodeHost = nodeHost;
    }

    /**
     * Adds the given URL to the list of pages to crawl. Only URLs that point
     * at the configured node are accepted.
     *
     * @param url
     *            The URL of the page to crawl
     */
    public void addPage(URL url) {
        Validation.begin().isNotNull("url", url).check().isEqual("url.getHost()", url.getHost(), (Object) nodeHost).isEqual("url.getPort()", url.getPort(), nodePort).check();
        addPage(Page.fromURL(url));
    }

    /**
     * Adds the given URL to the list of pages to crawl.
     *
     * @param url
     *            The URL of the page to crawl
     * @throws MalformedURLException
     *             if the URL is not a valid URL
     */
    public void addPage(String url) throws MalformedURLException {
        Validation.begin().isNotNull("url", (Object) url).check();
        addPage(new URL(url));
    }

    /**
     * Adds the given page to the list of pages to crawl.
     *
     * @param page
     *            The page to crawl
     */
    public void addPage(Page page) {
        Validation.begin().isNotNull("page", page).check();
        synchronized (syncObject) {
            if (!crawledPages.contains(page) && !pages.contains(page)) {
                pages.add(page);
                /* Wake the crawler thread waiting in serviceRun(). */
                syncObject.notifyAll();
            }
        }
    }
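
    /*
     * Hand-off between producer and consumer: addPage(Page) above queues a
     * page and notifies syncObject; serviceRun() below waits on syncObject,
     * drains the queue, and schedules a fetch for each page.
     */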

    /**
     * {@inheritDoc}
     *
     * @see de.ina.util.service.AbstractService#serviceRun()
     */
    @Override
    protected void serviceRun() {
        while (!shouldStop()) {
            Page page = null;
            synchronized (syncObject) {
                /* Block until a page is queued or the service stops. */
                while (!shouldStop() && pages.isEmpty()) {
                    try {
                        syncObject.wait();
                    } catch (InterruptedException ie1) {
                        /* Ignore, re-check the loop condition. */
                    }
                }
                if (!pages.isEmpty()) {
                    page = pages.remove(0);
                }
            }
            if (page == null) {
                continue;
            }
            final Page nextPage = page;
            URL nextURL = nextPage.toURL(nodeHost, nodePort);
            if (nextURL == null) {
                logger.log(Level.INFO, "Skipping “" + nextPage + "”.");
                continue;
            }
            final URLFetcher urlFetcher;
            try {
                logger.log(Level.INFO, "Fetching “" + nextURL + "”...");
                urlFetcher = new URLFetcher(parserFactory, nextURL);
                urlFetcherExecutor.execute(new Runnable() {

                    @SuppressWarnings("synthetic-access")
                    public void run() {
                        /* Fetch the page, remember it as crawled, and queue
                         * every page collected from it. */
                        urlFetcher.run();
                        synchronized (syncObject) {
                            crawledPages.add(nextPage);
                        }
                        for (Page page : urlFetcher.getCollectedPages()) {
                            addPage(page);
                        }
                    }
                });
            } catch (NoSuchAlgorithmException nsae1) {
                logger.log(Level.SEVERE, "Could not get “SHA-256” message digest!", nsae1);
            }
        }
    }

}