2 * © 2009 David ‘Bombe’ Roden
4 package net.pterodactylus.arachne.core;
6 import java.net.MalformedURLException;
8 import java.security.NoSuchAlgorithmException;
9 import java.util.ArrayList;
10 import java.util.List;
11 import java.util.concurrent.Executor;
12 import java.util.concurrent.Executors;
13 import java.util.logging.Level;
14 import java.util.logging.Logger;
16 import de.ina.util.service.AbstractService;
17 import de.ina.util.thread.DumpingThreadFactory;
18 import de.ina.util.validation.Validation;
23 * @author David ‘Bombe’ Roden <bombe@pterodactylus.net>
25 public class Core extends AbstractService {
/* Logger for this class, named after the fully qualified class name. */
28 private static final Logger logger = Logger.getLogger(Core.class.getName());
/* Defaults to the local machine; replaced through setNodeHost(String). */
34 /** The host of the freenet node. */
35 private String nodeHost = "localhost";
37 /** The port of the freenet node. */
38 private int nodePort = 8888;
/*
 * A fixed pool with a single thread. The “URLFetcher-” argument is presumably
 * a thread name prefix — confirm against DumpingThreadFactory.
 */
44 /** Thread pool for the URL fetches. */
45 private Executor urlFetcherExecutor = Executors.newFixedThreadPool(1, new DumpingThreadFactory("URLFetcher-"));
47 /** The current list of URLs to crawl. */
48 private final List<Page> pages = new ArrayList<Page>();
55 * Sets the host name of the node.
58 * The node’s host name
60 public void setNodeHost(String nodeHost) {
61 this.nodeHost = nodeHost;
69 * Adds the given URL to the list of pages to crawl.
74 public void addPage(URL url) {
75 Validation.begin().isNotNull("url", url).check().isEqual("url.getHost()", url.getHost(), (Object) nodeHost).isEqual("url.getPort()", url.getPort(), nodePort).check();
76 String path = url.getPath();
77 if (path.length() == 0) {
80 String[] pathComponents = path.split("/");
81 if (pathComponents.length < 2) {
82 throw new IllegalArgumentException("URL “" + url + "” is not a valid freenet page.");
84 String siteName = pathComponents[1];
85 String[] siteComponents = siteName.split("@");
86 if (siteComponents.length != 2) {
87 throw new IllegalArgumentException("siteName “" + siteName + "” is not a valid freenet page.");
89 if (!"USK".equals(siteComponents[0]) && !"SSK".equals(siteComponents[0]) && !"CHK".equals(siteComponents[0])) {
90 throw new IllegalArgumentException("siteName “" + siteName + "” is not a valid freenet page.");
92 if ("USK".equals(siteComponents[0])) {
93 Site site = new Site(siteComponents[1], pathComponents[2]);
94 Edition edition = new Edition(site, Integer.parseInt(pathComponents[3]));
95 Page page = new Page(edition, createPath(pathComponents, 4));
98 if ("SSK".equals(siteComponents[0])) {
99 int lastDash = pathComponents[2].lastIndexOf('-');
100 String basename = pathComponents[2].substring(0, lastDash);
101 int editionNumber = Integer.parseInt(pathComponents[2].substring(lastDash + 1));
102 Site site = new Site(siteComponents[1], basename);
103 Edition edition = new Edition(site, editionNumber);
104 Page page = new Page(edition, createPath(pathComponents, 3));
107 /* TODO: handle CHK */
111 * Adds the given URL to the list of pages to crawl.
114 * The URL of the page to crawl
115 * @throws MalformedURLException
116 * if the URL is not a valid URL
118 public void addPage(String url) throws MalformedURLException {
119 Validation.begin().isNotNull("url", (Object) url).check();
120 addPage(new URL(url));
124 * Adds the given page to the list of pages to crawl.
/*
 * Adds an already constructed Page to the list of pages to crawl.
 * NOTE(review): only the argument check is visible in the reviewed excerpt;
 * the statements that store the page are not shown here.
 */
129 public void addPage(Page page) {
/* Validates that a page was given (presumably fails on null — confirm against Validation). */
130 Validation.begin().isNotNull("page", page).check();
142 * @see de.ina.util.service.AbstractService#serviceRun()
/*
 * Main loop of the service: as long as shouldStop() is false it takes the
 * first page from the list, builds the node URL for it and passes a
 * URLFetcher for that URL to the fetcher executor.
 * NOTE(review): several statements of this method are not visible in the
 * reviewed excerpt (the blocking call guarded by the InterruptedException
 * handler, the stop checks, closing braces); the comments below describe
 * only the visible lines.
 */
145 protected void serviceRun() {
146 while (!shouldStop()) {
147 Page nextPage = null;
/* The page list is inspected and drained while holding the syncObject monitor. */
148 synchronized (syncObject) {
/* Stay in this loop while there is nothing to crawl and no stop was requested. */
149 while (!shouldStop() && pages.isEmpty()) {
/* NOTE(review): the handler body is not visible here — confirm whether the interrupt is ignored on purpose. */
152 } catch (InterruptedException ie1) {
/* Take the head of the list. */
157 nextPage = pages.remove(0);
163 URL nextURL = createURL(nextPage);
/* A null URL means no URL could be created for the page; this is only logged. */
164 if (nextURL == null) {
165 logger.log(Level.INFO, "Skipping “" + nextPage + "”.");
168 URLFetcher urlFetcher;
170 logger.log(Level.INFO, "Fetching “" + nextURL + "”...");
171 urlFetcher = new URLFetcher(this, nextURL);
/* The fetch itself runs on the executor, not on the service thread. */
172 urlFetcherExecutor.execute(urlFetcher);
/* Going by the log message, this signals that no “SHA-256” message digest is available. */
173 } catch (NoSuchAlgorithmException nsae1) {
174 logger.log(Level.SEVERE, "Could not get “SHA-256” message digest!", nsae1);
184 * Creates a path from the given String array, starting at the given index.
185 * The path is created by joining all Strings from the array, separating
186 * them with a slash (‘/’).
188 * @param pathComponents
189 * The array of path components
191 * The index of the first path components
192 * @return The joined path
194 private String createPath(String[] pathComponents, int index) {
195 Validation.begin().isNotNull("pathComponents", pathComponents).check().isLessOrEqual("index", index, pathComponents.length).check();
196 StringBuilder path = new StringBuilder();
197 for (int pathComponentIndex = index; pathComponentIndex < pathComponents.length; pathComponentIndex++) {
198 if (path.length() > 0) {
201 path.append(pathComponents[pathComponentIndex]);
203 return path.toString();
207 * Creates a URL from the given page.
210 * The page to create a URL from
211 * @return The created URL, or <code>null</code> if the URL could not be
214 private URL createURL(Page page) {
216 return new URL("http://" + nodeHost + ":" + nodePort + "/SSK@" + page.getEdition().getSite().getKey() + "/" + page.getEdition().getSite().getBasename() + "-" + page.getEdition().getNumber() + "/" + page.getPath());
217 } catch (MalformedURLException mue1) {
218 /* nearly impossible. */