Implemented adding a page by URL.
[arachne.git] / src / net / pterodactylus / arachne / core / Core.java
/*
 * © 2009 David ‘Bombe’ Roden
 */
package net.pterodactylus.arachne.core;

import java.net.MalformedURLException;
import java.net.URL;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Executor;
import java.util.concurrent.Executors;
import java.util.logging.Level;
import java.util.logging.Logger;

import net.pterodactylus.arachne.parser.ParserFactory;
import de.ina.util.service.AbstractService;
import de.ina.util.thread.DumpingThreadFactory;
import de.ina.util.validation.Validation;
/**
 * Arachne’s core.
 *
 * @author David ‘Bombe’ Roden <bombe@pterodactylus.net>
 */
public class Core extends AbstractService {

    /** The logger. */
    private static final Logger logger = Logger.getLogger(Core.class.getName());

    //
    // PROPERTIES
    //

    /** The host of the Freenet node. */
    private String nodeHost = "localhost";

    /** The port of the Freenet node. */
    private int nodePort = 8888;

    //
    // INTERNAL MEMBERS
    //

    /** The parser factory. */
    private ParserFactory parserFactory = new ParserFactory();

    /** Thread pool for the URL fetches. */
    private Executor urlFetcherExecutor = Executors.newFixedThreadPool(1, new DumpingThreadFactory("URLFetcher-"));
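    /* The fixed pool size of 1 means queued pages are fetched one at a time. */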

    /** The current list of pages to crawl. */
    private final List<Page> pages = new ArrayList<Page>();

    //
    // ACCESSORS
    //

    /**
     * Sets the host name of the node.
     *
     * @param nodeHost
     *            The node’s host name
     */
    public void setNodeHost(String nodeHost) {
        this.nodeHost = nodeHost;
    }

    //
    // ACTIONS
    //

    /**
     * Adds the given URL to the list of pages to crawl.
     *
     * @param url
     *            The URL to add
     */
    public void addPage(URL url) {
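        /* Only URLs that point at the configured node are accepted. */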
        Validation.begin().isNotNull("url", url).check()
                .isEqual("url.getHost()", url.getHost(), (Object) nodeHost)
                .isEqual("url.getPort()", url.getPort(), nodePort).check();
        addPage(Page.fromURL(url));
    }

    /**
     * Adds the given URL to the list of pages to crawl.
     *
     * @param url
     *            The URL of the page to crawl
     * @throws MalformedURLException
     *             if the URL is not a valid URL
     */
    public void addPage(String url) throws MalformedURLException {
        Validation.begin().isNotNull("url", (Object) url).check();
        addPage(new URL(url));
    }

    /**
     * Adds the given page to the list of pages to crawl.
     *
     * @param page
     *            The page to add
     */
    public void addPage(Page page) {
        Validation.begin().isNotNull("page", page).check();
        pages.add(page);
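        /* Wake up serviceRun(), which may be waiting for new pages. */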
        notifySyncObject();
    }

    //
    // SERVICE METHODS
    //

    /**
     * {@inheritDoc}
     *
     * @see de.ina.util.service.AbstractService#serviceRun()
     */
    @Override
    @SuppressWarnings("null")
    protected void serviceRun() {
        while (!shouldStop()) {
            Page nextPage = null;
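            /* Block until a page is queued or the service is told to stop. */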
            synchronized (syncObject) {
                while (!shouldStop() && pages.isEmpty()) {
                    try {
                        syncObject.wait();
                    } catch (InterruptedException ie1) {
                        /* ignore. */
                    }
                }
                if (!shouldStop()) {
                    nextPage = pages.remove(0);
                }
            }
            if (shouldStop()) {
                break;
            }
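            /* A page that can not be turned into a URL is skipped. */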
            URL nextURL = nextPage.toURL(nodeHost, nodePort);
            if (nextURL == null) {
                logger.log(Level.INFO, "Skipping “" + nextPage + "”.");
                continue;
            }
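            /* Hand the URL over to the fetcher thread pool; constructing the
             * URLFetcher may fail if the SHA-256 digest is not available. */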
            URLFetcher urlFetcher;
            try {
                logger.log(Level.INFO, "Fetching “" + nextURL + "”...");
                urlFetcher = new URLFetcher(parserFactory, nextURL);
                urlFetcherExecutor.execute(urlFetcher);
            } catch (NoSuchAlgorithmException nsae1) {
                logger.log(Level.SEVERE, "Could not get “SHA-256” message digest!", nsae1);
            }
        }
    }

}
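
For illustration, a minimal sketch of how a caller might use this class. The
start() call is an assumption about the AbstractService life cycle, which is
not part of this file, and the URL is only a placeholder:

    import java.net.MalformedURLException;

    import net.pterodactylus.arachne.core.Core;

    public class CoreExample {

        public static void main(String[] arguments) throws MalformedURLException {
            Core core = new Core();
            core.setNodeHost("localhost");
            /* Assumption: AbstractService exposes a start() method. */
            core.start();
            /* Placeholder URL; host and port have to match the configured node
             * (here the defaults, "localhost" and 8888), or validation fails. */
            core.addPage("http://localhost:8888/USK@example/site/0/");
        }

    }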