Create message digest in core and hash fetched URLs.
[arachne.git] / src / net / pterodactylus / arachne / core / Core.java
1 /*
2  * © 2009 David ‘Bombe’ Roden
3  */
4 package net.pterodactylus.arachne.core;
5
6 import java.net.MalformedURLException;
7 import java.net.URL;
8 import java.security.NoSuchAlgorithmException;
9 import java.util.ArrayList;
10 import java.util.List;
11 import java.util.concurrent.Executor;
12 import java.util.concurrent.Executors;
13 import java.util.logging.Level;
14 import java.util.logging.Logger;
15
16 import de.ina.util.service.AbstractService;
17 import de.ina.util.thread.DumpingThreadFactory;
18 import de.ina.util.validation.Validation;
19
20 /**
21  * Arachne’s core.
22  *
23  * @author David ‘Bombe’ Roden <bombe@pterodactylus.net>
24  */
25 public class Core extends AbstractService {
26
27         /** The logger. */
28         private static final Logger logger = Logger.getLogger(Core.class.getName());
29
30         //
31         // PROPERTIES
32         //
33
34         /** The host of the freenet node. */
35         private String nodeHost = "localhost";
36
37         /** The port of the freenet node. */
38         private int nodePort = 8888;
39
40         //
41         // INTERNAL MEMBERS
42         //
43
44         /** Thread pool for the URL fetches. */
45         private Executor urlFetcherExecutor = Executors.newFixedThreadPool(1, new DumpingThreadFactory("URLFetcher-"));
46
47         /** The current list of URLs to crawl. */
48         private final List<Page> pages = new ArrayList<Page>();
49
50         //
51         // ACCESSORS
52         //
53
54         /**
55          * Sets the host name of the node.
56          *
57          * @param nodeHost
58          *            The node’s host name
59          */
60         public void setNodeHost(String nodeHost) {
61                 this.nodeHost = nodeHost;
62         }
63
64         //
65         // ACTIONS
66         //
67
68         /**
69          * Adds the given URL to the list of pages to crawl.
70          *
71          * @param url
72          *            The URL to add
73          */
74         public void addPage(URL url) {
75                 Validation.begin().isNotNull("url", url).check().isEqual("url.getHost()", url.getHost(), (Object) nodeHost).isEqual("url.getPort()", url.getPort(), nodePort).check();
76                 String path = url.getPath();
77                 if (path.length() == 0) {
78                         path = "/";
79                 }
80                 String[] pathComponents = path.split("/");
81                 if (pathComponents.length < 2) {
82                         throw new IllegalArgumentException("URL “" + url + "” is not a valid freenet page.");
83                 }
84                 String siteName = pathComponents[1];
85                 String[] siteComponents = siteName.split("@");
86                 if (siteComponents.length != 2) {
87                         throw new IllegalArgumentException("siteName “" + siteName + "” is not a valid freenet page.");
88                 }
89                 if (!"USK".equals(siteComponents[0]) && !"SSK".equals(siteComponents[0]) && !"CHK".equals(siteComponents[0])) {
90                         throw new IllegalArgumentException("siteName “" + siteName + "” is not a valid freenet page.");
91                 }
92                 if ("USK".equals(siteComponents[0])) {
93                         Site site = new Site(siteComponents[1], pathComponents[2]);
94                         Edition edition = new Edition(site, Integer.parseInt(pathComponents[3]));
95                         Page page = new Page(edition, createPath(pathComponents, 4));
96                         addPage(page);
97                 }
98                 if ("SSK".equals(siteComponents[0])) {
99                         int lastDash = pathComponents[2].lastIndexOf('-');
100                         String basename = pathComponents[2].substring(0, lastDash);
101                         int editionNumber = Integer.parseInt(pathComponents[2].substring(lastDash + 1));
102                         Site site = new Site(siteComponents[1], basename);
103                         Edition edition = new Edition(site, editionNumber);
104                         Page page = new Page(edition, createPath(pathComponents, 3));
105                         addPage(page);
106                 }
107                 /* TODO: handle CHK */
108         }
109
110         /**
111          * Adds the given URL to the list of pages to crawl.
112          *
113          * @param url
114          *            The URL of the page to crawl
115          * @throws MalformedURLException
116          *             if the URL is not a valid URL
117          */
118         public void addPage(String url) throws MalformedURLException {
119                 Validation.begin().isNotNull("url", (Object) url).check();
120                 addPage(new URL(url));
121         }
122
123         /**
124          * Adds the given page to the list of pages to crawl.
125          *
126          * @param page
127          *            The page to add
128          */
129         public void addPage(Page page) {
130                 Validation.begin().isNotNull("page", page).check();
131                 pages.add(page);
132                 notifySyncObject();
133         }
134
135         //
136         // SERVICE METHODS
137         //
138
139         /**
140          * {@inheritdoc}
141          *
142          * @see de.ina.util.service.AbstractService#serviceRun()
143          */
144         @Override
145         protected void serviceRun() {
146                 while (!shouldStop()) {
147                         Page nextPage = null;
148                         synchronized (syncObject) {
149                                 while (!shouldStop() && pages.isEmpty()) {
150                                         try {
151                                                 syncObject.wait();
152                                         } catch (InterruptedException ie1) {
153                                                 /* ignore. */
154                                         }
155                                 }
156                                 if (!shouldStop()) {
157                                         nextPage = pages.remove(0);
158                                 }
159                         }
160                         if (shouldStop()) {
161                                 break;
162                         }
163                         URL nextURL = createURL(nextPage);
164                         if (nextURL == null) {
165                                 logger.log(Level.INFO, "Skipping “" + nextPage + "”.");
166                                 continue;
167                         }
168                         URLFetcher urlFetcher;
169                         try {
170                                 logger.log(Level.INFO, "Fetching “" + nextURL + "”...");
171                                 urlFetcher = new URLFetcher(this, nextURL);
172                                 urlFetcherExecutor.execute(urlFetcher);
173                         } catch (NoSuchAlgorithmException nsae1) {
174                                 logger.log(Level.SEVERE, "Could not get “SHA-256” message digest!", nsae1);
175                         }
176                 }
177         }
178
179         //
180         // PRIVATE METHODS
181         //
182
183         /**
184          * Creates a path from the given String array, starting at the given index.
185          * The path is created by joining all Strings from the array, separating
186          * them with a slash (‘/’).
187          *
188          * @param pathComponents
189          *            The array of path components
190          * @param index
191          *            The index of the first path components
192          * @return The joined path
193          */
194         private String createPath(String[] pathComponents, int index) {
195                 Validation.begin().isNotNull("pathComponents", pathComponents).check().isLessOrEqual("index", index, pathComponents.length).check();
196                 StringBuilder path = new StringBuilder();
197                 for (int pathComponentIndex = index; pathComponentIndex < pathComponents.length; pathComponentIndex++) {
198                         if (path.length() > 0) {
199                                 path.append('/');
200                         }
201                         path.append(pathComponents[pathComponentIndex]);
202                 }
203                 return path.toString();
204         }
205
206         /**
207          * Creates a URL from the given page.
208          *
209          * @param page
210          *            The page to create a URL from
211          * @return The created URL, or <code>null</code> if the URL could not be
212          *         created
213          */
214         private URL createURL(Page page) {
215                 try {
216                         return new URL("http://" + nodeHost + ":" + nodePort + "/SSK@" + page.getEdition().getSite().getKey() + "/" + page.getEdition().getSite().getBasename() + "-" + page.getEdition().getNumber() + "/" + page.getPath());
217                 } catch (MalformedURLException mue1) {
218                         /* nearly impossible. */
219                 }
220                 return null;
221         }
222
223 }