/*
 * © 2009 David ‘Bombe’ Roden
 */

package net.pterodactylus.arachne.core;
import java.net.MalformedURLException;
import java.net.URL;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import java.util.concurrent.Executor;
import java.util.concurrent.Executors;
import java.util.logging.Level;
import java.util.logging.Logger;

import net.pterodactylus.arachne.parser.ParserFactory;
import de.ina.util.service.AbstractService;
import de.ina.util.thread.DumpingThreadFactory;
import de.ina.util.validation.Validation;
/**
 * Arachne’s core: maintains the queue of pages to crawl and dispatches each
 * queued page to a background fetcher.
 *
 * @author David ‘Bombe’ Roden <bombe@pterodactylus.net>
 */
public class Core extends AbstractService {
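
    /*
     * Illustrative usage sketch, not part of the original sources: it
     * assumes the AbstractService lifecycle is started via start(), and
     * the freesite URL is a placeholder.
     *
     *   Core core = new Core();
     *   core.setNodeHost("localhost");
     *   core.start();
     *   core.addPage("http://localhost:8888/USK@.../example-site/0/");
     */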

    /** The logger. */
    private static final Logger logger = Logger.getLogger(Core.class.getName());

    /** The host of the Freenet node. */
    private String nodeHost = "localhost";

    /** The port of the Freenet node (8888 is fproxy’s default). */
    private int nodePort = 8888;

    /** The parser factory. */
    private ParserFactory parserFactory = new ParserFactory();

    /** Thread pool for the URL fetches; a single thread, so fetches run one at a time. */
    private Executor urlFetcherExecutor = Executors.newSingleThreadExecutor(new DumpingThreadFactory("URLFetcher-"));

    /** The current queue of pages to crawl. */
    private final List<Page> pages = new ArrayList<Page>();

    /** Already crawled pages. */
    private final Set<Page> crawledPages = new HashSet<Page>();

    /** Object used to synchronize access to {@link #pages}. */
    private final Object syncObject = new Object();

    /**
     * Sets the host name of the node.
     *
     * @param nodeHost
     *            The node’s host name
     */
    public void setNodeHost(String nodeHost) {
        this.nodeHost = nodeHost;
    }

    /**
     * Adds the given URL to the list of pages to crawl. Only URLs that point
     * at the configured node are accepted.
     *
     * @param url
     *            The URL of the page to crawl
     */
    public void addPage(URL url) {
        Validation.begin().isNotNull("url", url).check().isEqual("url.getHost()", url.getHost(), (Object) nodeHost).isEqual("url.getPort()", url.getPort(), nodePort).check();
        addPage(Page.fromURL(url));
    }

    /**
     * Adds the given URL to the list of pages to crawl.
     *
     * @param url
     *            The URL of the page to crawl
     * @throws MalformedURLException
     *             if the URL is not a valid URL
     */
    public void addPage(String url) throws MalformedURLException {
        Validation.begin().isNotNull("url", (Object) url).check();
        addPage(new URL(url));
    }

    /**
     * Adds the given page to the list of pages to crawl.
     *
     * @param page
     *            The page to crawl
     */
    public void addPage(Page page) {
        Validation.begin().isNotNull("page", page).check();
        synchronized (syncObject) {
            if (!crawledPages.contains(page) && !pages.contains(page)) {
                pages.add(page);
                /* Wake the crawler thread waiting in serviceRun(). */
                syncObject.notifyAll();
            }
        }
    }
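
    /*
     * Hand-off between producer and consumer: addPage(Page) above queues a
     * page and notifies syncObject; serviceRun() below waits on syncObject,
     * drains the queue, and schedules a fetch for each page.
     */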

    /**
     * {@inheritDoc}
     *
     * @see de.ina.util.service.AbstractService#serviceRun()
     */
    @Override
    protected void serviceRun() {
        while (!shouldStop()) {
            Page page = null;
            synchronized (syncObject) {
                /* Block until a page is queued or the service stops. */
                while (!shouldStop() && pages.isEmpty()) {
                    try {
                        syncObject.wait();
                    } catch (InterruptedException ie1) {
                        /* Ignore, re-check the loop condition. */
                    }
                }
                if (!pages.isEmpty()) {
                    page = pages.remove(0);
                }
            }
            if (page == null) {
                continue;
            }
            final Page nextPage = page;
            URL nextURL = nextPage.toURL(nodeHost, nodePort);
            if (nextURL == null) {
                logger.log(Level.INFO, "Skipping “" + nextPage + "”.");
                continue;
            }
            final URLFetcher urlFetcher;
            try {
                logger.log(Level.INFO, "Fetching “" + nextURL + "”...");
                urlFetcher = new URLFetcher(parserFactory, nextURL);
                urlFetcherExecutor.execute(new Runnable() {

                    @SuppressWarnings("synthetic-access")
                    public void run() {
                        /* Fetch the page, remember it as crawled, and queue
                         * every page collected from it. */
                        urlFetcher.run();
                        synchronized (syncObject) {
                            crawledPages.add(nextPage);
                        }
                        for (Page page : urlFetcher.getCollectedPages()) {
                            addPage(page);
                        }
                    }
                });
            } catch (NoSuchAlgorithmException nsae1) {
                logger.log(Level.SEVERE, "Could not get “SHA-256” message digest!", nsae1);
            }
        }
    }

}