X-Git-Url: https://git.pterodactylus.net/?a=blobdiff_plain;f=src%2Fnet%2Fpterodactylus%2Farachne%2Fcore%2FCore.java;h=9bcddb0aaf661119c59a8597d91a60bec8b5c61c;hb=83a84def68ebc8d094bc9837ec72b5fc5cfdd782;hp=e7fc8efa3378b370dbb4b786da83ba20cfe6c7c9;hpb=383df78e9f3307dd489fd07e341a9900ebdb4590;p=arachne.git

diff --git a/src/net/pterodactylus/arachne/core/Core.java b/src/net/pterodactylus/arachne/core/Core.java
index e7fc8ef..9bcddb0 100644
--- a/src/net/pterodactylus/arachne/core/Core.java
+++ b/src/net/pterodactylus/arachne/core/Core.java
@@ -5,13 +5,17 @@ package net.pterodactylus.arachne.core;
 
 import java.net.MalformedURLException;
 import java.net.URL;
+import java.security.NoSuchAlgorithmException;
 import java.util.ArrayList;
+import java.util.HashSet;
 import java.util.List;
+import java.util.Set;
 import java.util.concurrent.Executor;
 import java.util.concurrent.Executors;
 import java.util.logging.Level;
 import java.util.logging.Logger;
 
+import net.pterodactylus.arachne.parser.ParserFactory;
 import de.ina.util.service.AbstractService;
 import de.ina.util.thread.DumpingThreadFactory;
 import de.ina.util.validation.Validation;
@@ -40,12 +44,18 @@ public class Core extends AbstractService {
 	// INTERNAL MEMBERS
 	//
 
+	/** The parser factory. */
+	private ParserFactory parserFactory = new ParserFactory();
+
 	/** Thread pool for the URL fetches. */
 	private Executor urlFetcherExecutor = Executors.newFixedThreadPool(1, new DumpingThreadFactory("URLFetcher-"));
 
 	/** The current list of URLs to crawl. */
 	private final List<Page> pages = new ArrayList<Page>();
 
+	/** Already crawled pages. */
+	private final Set<Page> crawledPages = new HashSet<Page>();
+
 	//
 	// ACCESSORS
 	//
@@ -72,38 +82,7 @@ public class Core extends AbstractService {
 	 */
 	public void addPage(URL url) {
 		Validation.begin().isNotNull("url", url).check().isEqual("url.getHost()", url.getHost(), (Object) nodeHost).isEqual("url.getPort()", url.getPort(), nodePort).check();
-		String path = url.getPath();
-		if (path.length() == 0) {
-			path = "/";
-		}
-		String[] pathComponents = path.split("/");
-		if (pathComponents.length < 2) {
-			throw new IllegalArgumentException("URL “" + url + "” is not a valid freenet page.");
-		}
-		String siteName = pathComponents[1];
-		String[] siteComponents = siteName.split("@");
-		if (siteComponents.length != 2) {
-			throw new IllegalArgumentException("siteName “" + siteName + "” is not a valid freenet page.");
-		}
-		if (!"USK".equals(siteComponents[0]) && !"SSK".equals(siteComponents[0]) && !"CHK".equals(siteComponents[0])) {
-			throw new IllegalArgumentException("siteName “" + siteName + "” is not a valid freenet page.");
-		}
-		if ("USK".equals(siteComponents[0])) {
-			Site site = new Site(siteComponents[1], pathComponents[2]);
-			Edition edition = new Edition(site, Integer.parseInt(pathComponents[3]));
-			Page page = new Page(edition, createPath(pathComponents, 4));
-			addPage(page);
-		}
-		if ("SSK".equals(siteComponents[0])) {
-			int lastDash = pathComponents[2].lastIndexOf('-');
-			String basename = pathComponents[2].substring(0, lastDash);
-			int editionNumber = Integer.parseInt(pathComponents[2].substring(lastDash + 1));
-			Site site = new Site(siteComponents[1], basename);
-			Edition edition = new Edition(site, editionNumber);
-			Page page = new Page(edition, createPath(pathComponents, 3));
-			addPage(page);
-		}
-		/* TODO: handle CHK */
+		addPage(Page.fromURL(url));
 	}
 
 	/**
@@ -127,8 +106,10 @@
 	 */
 	public void addPage(Page page) {
		Validation.begin().isNotNull("page", page).check();
-		pages.add(page);
-		notifySyncObject();
+		if (!crawledPages.contains(page) && !pages.contains(page)) {
+			pages.add(page);
+			notifySyncObject();
+		}
 	}
 
 	//
@@ -136,14 +117,15 @@
 	//
 
 	/**
-	 * {@inheritdoc}
+	 * {@inheritDoc}
 	 *
 	 * @see de.ina.util.service.AbstractService#serviceRun()
 	 */
 	@Override
 	protected void serviceRun() {
 		while (!shouldStop()) {
-			Page nextPage = null;
+			final Page nextPage;
+			Page page = null;
 			synchronized (syncObject) {
 				while (!shouldStop() && pages.isEmpty()) {
 					try {
@@ -153,64 +135,37 @@
 					}
 				}
 				if (!shouldStop()) {
-					nextPage = pages.remove(0);
+					page = pages.remove(0);
 				}
 			}
 			if (shouldStop()) {
 				break;
 			}
-			URL nextURL = createURL(nextPage);
+			nextPage = page;
+			URL nextURL = nextPage.toURL(nodeHost, nodePort);
 			if (nextURL == null) {
 				logger.log(Level.INFO, "Skipping “" + nextPage + "”.");
 				continue;
 			}
-			URLFetcher urlFetcher = new URLFetcher(this, nextURL);
-			urlFetcherExecutor.execute(urlFetcher);
-		}
-	}
-
-	//
-	// PRIVATE METHODS
-	//
-
-	/**
-	 * Creates a path from the given String array, starting at the given index.
-	 * The path is created by joining all Strings from the array, separating
-	 * them with a slash (‘/’).
-	 *
-	 * @param pathComponents
-	 *            The array of path components
-	 * @param index
-	 *            The index of the first path components
-	 * @return The joined path
-	 */
-	private String createPath(String[] pathComponents, int index) {
-		Validation.begin().isNotNull("pathComponents", pathComponents).check().isLess("index", index, pathComponents.length).check();
-		StringBuilder path = new StringBuilder();
-		for (int pathComponentIndex = index; pathComponentIndex < pathComponents.length; pathComponentIndex++) {
-			if (path.length() > 0) {
-				path.append('/');
+			final URLFetcher urlFetcher;
+			try {
+				logger.log(Level.INFO, "Fetching “" + nextURL + "”...");
+				urlFetcher = new URLFetcher(parserFactory, nextURL);
+				urlFetcherExecutor.execute(new Runnable() {
+
+					@SuppressWarnings("synthetic-access")
+					public void run() {
+						urlFetcher.run();
+						crawledPages.add(nextPage);
+						for (Page page : urlFetcher.getCollectedPages()) {
+							addPage(page);
+						}
+					}
+				});
+			} catch (NoSuchAlgorithmException nsae1) {
+				logger.log(Level.SEVERE, "Could not get “SHA-256” message digest!", nsae1);
 			}
-			path.append(pathComponents[pathComponentIndex]);
-		}
-		return path.toString();
-	}
-
-	/**
-	 * Creates a URL from the given page.
-	 *
-	 * @param page
-	 *            The page to create a URL from
-	 * @return The created URL, or null if the URL could not be
-	 *         created
-	 */
-	private URL createURL(Page page) {
-		try {
-			return new URL("http://" + nodeHost + ":" + nodePort + "/");
-		} catch (MalformedURLException mue1) {
-			/* nearly impossible. */
 		}
-		return null;
-	}
+	}
 
 }
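
Note on the addPage(URL) hunk: the freenet-key parsing that used to live in Core is replaced by a single call to Page.fromURL(url), whose implementation is not part of this diff. A minimal sketch of what such a factory presumably looks like, reconstructed from the removed code (Site, Edition and Page are the project's own classes; createPath mirrors the helper removed from Core further down), is:

	/** Hypothetical reconstruction of Page.fromURL(URL); not the committed code. */
	public static Page fromURL(URL url) {
		String path = url.getPath();
		if (path.length() == 0) {
			path = "/";
		}
		String[] pathComponents = path.split("/");
		if (pathComponents.length < 2) {
			throw new IllegalArgumentException("URL “" + url + "” is not a valid freenet page.");
		}
		String[] siteComponents = pathComponents[1].split("@");
		if (siteComponents.length != 2) {
			throw new IllegalArgumentException("siteName “" + pathComponents[1] + "” is not a valid freenet page.");
		}
		if ("USK".equals(siteComponents[0])) {
			/* USK@<key>/<docname>/<edition>/<path…> */
			Site site = new Site(siteComponents[1], pathComponents[2]);
			Edition edition = new Edition(site, Integer.parseInt(pathComponents[3]));
			return new Page(edition, createPath(pathComponents, 4));
		}
		if ("SSK".equals(siteComponents[0])) {
			/* SSK@<key>/<docname>-<edition>/<path…> */
			int lastDash = pathComponents[2].lastIndexOf('-');
			Site site = new Site(siteComponents[1], pathComponents[2].substring(0, lastDash));
			Edition edition = new Edition(site, Integer.parseInt(pathComponents[2].substring(lastDash + 1)));
			return new Page(edition, createPath(pathComponents, 3));
		}
		/* TODO: handle CHK, as in the removed code. */
		throw new IllegalArgumentException("siteName “" + pathComponents[1] + "” is not a valid freenet page.");
	}

	/** Joins pathComponents[index…] with ‘/’, as the removed Core.createPath did. */
	private static String createPath(String[] pathComponents, int index) {
		StringBuilder path = new StringBuilder();
		for (int i = index; i < pathComponents.length; i++) {
			if (path.length() > 0) {
				path.append('/');
			}
			path.append(pathComponents[i]);
		}
		return path.toString();
	}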
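
The new duplicate check in addPage(Page) — if (!crawledPages.contains(page) && !pages.contains(page)) — only filters repeats if Page has value-based equality, which this diff does not show. A hypothetical sketch, assuming a Page is identified by its edition and path fields:

	@Override
	public boolean equals(Object object) {
		if (!(object instanceof Page)) {
			return false;
		}
		Page page = (Page) object;
		/* assumed fields: edition (Edition) and path (String) */
		return edition.equals(page.edition) && path.equals(page.path);
	}

	@Override
	public int hashCode() {
		return edition.hashCode() ^ path.hashCode();
	}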
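
Similarly, the removed createURL(Page) always returned "http://" + nodeHost + ":" + nodePort + "/" and dropped the page itself; its replacement, Page.toURL(nodeHost, nodePort), is also outside this diff. A sketch under the assumption that Page can render itself as a complete freenet URI (for example via toString()):

	/** Hypothetical Page.toURL(…): builds the fetch URL against the local node. */
	public URL toURL(String nodeHost, int nodePort) {
		try {
			/* e.g. http://<node>:<port>/USK@<key>/<docname>/<edition>/<path> */
			return new URL("http://" + nodeHost + ":" + nodePort + "/" + toString());
		} catch (MalformedURLException mue1) {
			/* the caller in serviceRun() treats null as “skip this page”. */
			return null;
		}
	}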