Remember crawled pages and don’t crawl them again.
[arachne.git] src/net/pterodactylus/arachne/core/Core.java
/*
 * © 2009 David ‘Bombe’ Roden
 */
package net.pterodactylus.arachne.core;

import java.net.MalformedURLException;
import java.net.URL;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.Executor;
import java.util.concurrent.Executors;
import java.util.logging.Level;
import java.util.logging.Logger;

import net.pterodactylus.arachne.parser.ParserFactory;
import de.ina.util.service.AbstractService;
import de.ina.util.thread.DumpingThreadFactory;
import de.ina.util.validation.Validation;

/**
 * Arachne’s core.
 *
 * @author David ‘Bombe’ Roden <bombe@pterodactylus.net>
 */
public class Core extends AbstractService {

        /** The logger. */
        private static final Logger logger = Logger.getLogger(Core.class.getName());

        //
        // PROPERTIES
        //

        /** The host of the Freenet node. */
        private String nodeHost = "localhost";

        /** The port of the Freenet node. */
        private int nodePort = 8888;

        //
        // INTERNAL MEMBERS
        //

        /** The parser factory. */
        private ParserFactory parserFactory = new ParserFactory();

        /** Thread pool for the URL fetches. */
        private Executor urlFetcherExecutor = Executors.newFixedThreadPool(1, new DumpingThreadFactory("URLFetcher-"));

        /** The current list of pages to crawl. */
        private final List<Page> pages = new ArrayList<Page>();

        /** Already crawled pages. */
        private final Set<Page> crawledPages = new HashSet<Page>();

        //
        // ACCESSORS
        //

        /**
         * Sets the host name of the node.
         *
         * @param nodeHost
         *            The node’s host name
         */
        public void setNodeHost(String nodeHost) {
                this.nodeHost = nodeHost;
        }

        //
        // ACTIONS
        //

        /**
         * Adds the given URL to the list of pages to crawl. The URL’s host and
         * port have to match the configured Freenet node.
         *
         * @param url
         *            The URL to add
         */
        public void addPage(URL url) {
                Validation.begin().isNotNull("url", url).check().isEqual("url.getHost()", url.getHost(), (Object) nodeHost).isEqual("url.getPort()", url.getPort(), nodePort).check();
                addPage(Page.fromURL(url));
        }

        /**
         * Adds the given URL to the list of pages to crawl.
         *
         * @param url
         *            The URL of the page to crawl
         * @throws MalformedURLException
         *             if the URL is not a valid URL
         */
        public void addPage(String url) throws MalformedURLException {
                Validation.begin().isNotNull("url", (Object) url).check();
                addPage(new URL(url));
        }

        /**
         * Adds the given page to the list of pages to crawl.
         *
         * @param page
         *            The page to add
         */
        public void addPage(Page page) {
                Validation.begin().isNotNull("page", page).check();
                /* also called from the URL fetcher thread: guard both collections. */
                synchronized (syncObject) {
                        if (!crawledPages.contains(page) && !pages.contains(page)) {
                                pages.add(page);
                                notifySyncObject();
                        }
                }
        }

        //
        // SERVICE METHODS
        //

        /**
         * {@inheritDoc}
         *
         * @see de.ina.util.service.AbstractService#serviceRun()
         */
        @Override
        protected void serviceRun() {
                while (!shouldStop()) {
                        Page page = null;
                        synchronized (syncObject) {
                                /* wait until a page is queued or the service is told to stop. */
                                while (!shouldStop() && pages.isEmpty()) {
                                        try {
                                                syncObject.wait();
                                        } catch (InterruptedException ie1) {
                                                /* ignore. */
                                        }
                                }
                                if (!shouldStop()) {
                                        page = pages.remove(0);
                                }
                        }
                        if (shouldStop()) {
                                break;
                        }
                        /* final reference for the anonymous Runnable below. */
                        final Page nextPage = page;
                        URL nextURL = nextPage.toURL(nodeHost, nodePort);
                        if (nextURL == null) {
                                logger.log(Level.INFO, "Skipping “" + nextPage + "”.");
                                continue;
                        }
                        final URLFetcher urlFetcher;
                        try {
                                logger.log(Level.INFO, "Fetching “" + nextURL + "”...");
                                urlFetcher = new URLFetcher(parserFactory, nextURL);
                                urlFetcherExecutor.execute(new Runnable() {

                                        @SuppressWarnings("synthetic-access")
                                        public void run() {
                                                urlFetcher.run();
                                                /* remember the page so it is not crawled again. */
                                                synchronized (syncObject) {
                                                        crawledPages.add(nextPage);
                                                }
                                                /* queue every page the fetcher has collected. */
                                                for (Page collectedPage : urlFetcher.getCollectedPages()) {
                                                        addPage(collectedPage);
                                                }
                                        }
                                });
                        } catch (NoSuchAlgorithmException nsae1) {
                                logger.log(Level.SEVERE, "Could not get “SHA-256” message digest!", nsae1);
                        }
                }
        }

}
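
For context, a minimal sketch of how this core might be driven. It assumes that de.ina.util.service.AbstractService (not shown here) provides a start() method that launches serviceRun(); the class name, host, and example URI below are purely illustrative:

public class CoreExample {

        public static void main(String... arguments) throws Exception {
                Core core = new Core();
                /* the host (and the node port, 8888) must match the Freenet node. */
                core.setNodeHost("localhost");
                /* assumption: start() comes from AbstractService and runs serviceRun(). */
                core.start();
                /* seed the crawl; addPage(URL) validates host and port against the node. */
                core.addPage("http://localhost:8888/USK@example-key/example-site/0/");
        }
}

Once seeded, the service thread pulls pages off the queue, hands them to the URL-fetcher executor, and every page collected from a fetch is fed back through addPage(Page), where the crawledPages set keeps already-visited pages from being queued again.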