Use a default parser factory in the core and hand it in to every URL fetcher.
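The fetcher side of the hand-off does not appear in this view. Below is a minimal sketch of what URLFetcher might look like, inferred from its call sites in Core.java (it is handed to an Executor, its constructor takes a ParserFactory and a URL and throws NoSuchAlgorithmException, and the catch block mentions a SHA-256 message digest); the field names and the empty run() body are assumptions, not the actual implementation.

package net.pterodactylus.arachne.core;

import java.net.URL;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

import net.pterodactylus.arachne.parser.ParserFactory;

/* Sketch only; the shapes below are inferred from the call sites in Core. */
public class URLFetcher implements Runnable {

        /** The parser factory handed in by the core. */
        private final ParserFactory parserFactory;

        /** The URL to fetch. */
        private final URL url;

        /** Digest for page checksums; explains the NoSuchAlgorithmException. */
        private final MessageDigest messageDigest;

        public URLFetcher(ParserFactory parserFactory, URL url) throws NoSuchAlgorithmException {
                this.parserFactory = parserFactory;
                this.url = url;
                messageDigest = MessageDigest.getInstance("SHA-256");
        }

        public void run() {
                /* Fetch the URL and feed the content to a parser obtained from
                 * parserFactory; the actual fetch and parse logic is not shown
                 * in this commit view. */
        }
}

Constructor injection keeps the fetchers free of shared global state and lets a test hand in a different factory.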
arachne.git: src/net/pterodactylus/arachne/core/Core.java
/*
 * © 2009 David ‘Bombe’ Roden
 */
package net.pterodactylus.arachne.core;

import java.net.MalformedURLException;
import java.net.URL;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Executor;
import java.util.concurrent.Executors;
import java.util.logging.Level;
import java.util.logging.Logger;

import net.pterodactylus.arachne.parser.ParserFactory;
import de.ina.util.service.AbstractService;
import de.ina.util.thread.DumpingThreadFactory;
import de.ina.util.validation.Validation;

/**
 * Arachne’s core.
 *
 * @author David ‘Bombe’ Roden <bombe@pterodactylus.net>
 */
public class Core extends AbstractService {

        /** The logger. */
        private static final Logger logger = Logger.getLogger(Core.class.getName());

        //
        // PROPERTIES
        //

        /** The host of the Freenet node. */
        private String nodeHost = "localhost";

        /** The port of the Freenet node. */
        private int nodePort = 8888;

        //
        // INTERNAL MEMBERS
        //

        /** The parser factory. */
        private ParserFactory parserFactory = new ParserFactory();

        /** Thread pool for the URL fetches. */
        private Executor urlFetcherExecutor = Executors.newFixedThreadPool(1, new DumpingThreadFactory("URLFetcher-"));

        /** The current list of URLs to crawl. */
        private final List<Page> pages = new ArrayList<Page>();

        //
        // ACCESSORS
        //

        /**
         * Sets the host name of the node.
         *
         * @param nodeHost
         *            The node’s host name
         */
        public void setNodeHost(String nodeHost) {
                this.nodeHost = nodeHost;
        }

        //
        // ACTIONS
        //

        /**
         * Adds the given URL to the list of pages to crawl.
         *
         * @param url
         *            The URL to add
         */
        public void addPage(URL url) {
                Validation.begin().isNotNull("url", url).check().isEqual("url.getHost()", url.getHost(), (Object) nodeHost).isEqual("url.getPort()", url.getPort(), nodePort).check();
                /*
                 * The original body stopped after validating and never queued the
                 * page; delegating to addPage(Page) supplies the missing step.
                 * Page.fromURL is an assumed factory method, not shown in this
                 * commit.
                 */
                addPage(Page.fromURL(url));
        }

        /**
         * Adds the given URL to the list of pages to crawl.
         *
         * @param url
         *            The URL of the page to crawl
         * @throws MalformedURLException
         *             if the URL is not a valid URL
         */
        public void addPage(String url) throws MalformedURLException {
                Validation.begin().isNotNull("url", (Object) url).check();
                addPage(new URL(url));
        }

        /**
         * Adds the given page to the list of pages to crawl.
         *
         * @param page
         *            The page to add
         */
        public void addPage(Page page) {
                Validation.begin().isNotNull("page", page).check();
                pages.add(page);
                notifySyncObject();
        }

        //
        // SERVICE METHODS
        //

        /**
         * {@inheritDoc}
         *
         * @see de.ina.util.service.AbstractService#serviceRun()
         */
        @Override
        @SuppressWarnings("null")
        protected void serviceRun() {
                while (!shouldStop()) {
                        Page nextPage = null;
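                        /* Block until a page is queued or the service is told to stop. */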
                        synchronized (syncObject) {
                                while (!shouldStop() && pages.isEmpty()) {
                                        try {
                                                syncObject.wait();
                                        } catch (InterruptedException ie1) {
                                                /* ignore. */
                                        }
                                }
                                if (!shouldStop()) {
                                        nextPage = pages.remove(0);
                                }
                        }
                        if (shouldStop()) {
                                break;
                        }
                        URL nextURL = nextPage.toURL(nodeHost, nodePort);
                        if (nextURL == null) {
                                logger.log(Level.INFO, "Skipping “" + nextPage + "”.");
                                continue;
                        }
                        URLFetcher urlFetcher;
                        try {
                                logger.log(Level.INFO, "Fetching “" + nextURL + "”...");
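                                /* Hand the core’s shared parser factory to every new fetcher. */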
                                urlFetcher = new URLFetcher(parserFactory, nextURL);
                                urlFetcherExecutor.execute(urlFetcher);
                        } catch (NoSuchAlgorithmException nsae1) {
                                logger.log(Level.SEVERE, "Could not get “SHA-256” message digest!", nsae1);
                        }
                }
        }

}
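
For context, a short sketch of how the core might be driven, assuming AbstractService supplies a start() method (it is not shown in this file); the USK link is a made-up placeholder.

public class CoreExample {

        public static void main(String... arguments) throws Exception {
                Core core = new Core();
                core.setNodeHost("localhost");
                core.start(); /* assumed to be inherited from AbstractService */
                /* Host and port must match the configured node (localhost:8888). */
                core.addPage("http://localhost:8888/USK@example/site/0/");
        }
}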