X-Git-Url: https://git.pterodactylus.net/?a=blobdiff_plain;f=src%2Fnet%2Fpterodactylus%2Farachne%2Fcore%2FCore.java;h=9bcddb0aaf661119c59a8597d91a60bec8b5c61c;hb=83a84def68ebc8d094bc9837ec72b5fc5cfdd782;hp=9efc9607008621ecd4ae2400eca1e7bed31037ee;hpb=a637923e6ea2bd6d53180b7e843b02a191b6d6e5;p=arachne.git diff --git a/src/net/pterodactylus/arachne/core/Core.java b/src/net/pterodactylus/arachne/core/Core.java index 9efc960..9bcddb0 100644 --- a/src/net/pterodactylus/arachne/core/Core.java +++ b/src/net/pterodactylus/arachne/core/Core.java @@ -7,7 +7,9 @@ import java.net.MalformedURLException; import java.net.URL; import java.security.NoSuchAlgorithmException; import java.util.ArrayList; +import java.util.HashSet; import java.util.List; +import java.util.Set; import java.util.concurrent.Executor; import java.util.concurrent.Executors; import java.util.logging.Level; @@ -51,6 +53,9 @@ public class Core extends AbstractService { /** The current list of URLs to crawl. */ private final List pages = new ArrayList(); + /** Already crawled pages. */ + private final Set crawledPages = new HashSet(); + // // ACCESSORS // @@ -77,6 +82,7 @@ public class Core extends AbstractService { */ public void addPage(URL url) { Validation.begin().isNotNull("url", url).check().isEqual("url.getHost()", url.getHost(), (Object) nodeHost).isEqual("url.getPort()", url.getPort(), nodePort).check(); + addPage(Page.fromURL(url)); } /** @@ -100,8 +106,10 @@ public class Core extends AbstractService { */ public void addPage(Page page) { Validation.begin().isNotNull("page", page).check(); - pages.add(page); - notifySyncObject(); + if (!crawledPages.contains(page) && !pages.contains(page)) { + pages.add(page); + notifySyncObject(); + } } // @@ -109,15 +117,15 @@ public class Core extends AbstractService { // /** - * {@inheritdoc} + * {@inheritDoc} * * @see de.ina.util.service.AbstractService#serviceRun() */ @Override - @SuppressWarnings("null") protected void serviceRun() { while (!shouldStop()) { - Page nextPage = null; + final Page nextPage; + Page page = null; synchronized (syncObject) { while (!shouldStop() && pages.isEmpty()) { try { @@ -127,22 +135,33 @@ public class Core extends AbstractService { } } if (!shouldStop()) { - nextPage = pages.remove(0); + page = pages.remove(0); } } if (shouldStop()) { break; } + nextPage = page; URL nextURL = nextPage.toURL(nodeHost, nodePort); if (nextURL == null) { logger.log(Level.INFO, "Skipping “" + nextPage + "”."); continue; } - URLFetcher urlFetcher; + final URLFetcher urlFetcher; try { logger.log(Level.INFO, "Fetching “" + nextURL + "”..."); urlFetcher = new URLFetcher(parserFactory, nextURL); - urlFetcherExecutor.execute(urlFetcher); + urlFetcherExecutor.execute(new Runnable() { + + @SuppressWarnings("synthetic-access") + public void run() { + urlFetcher.run(); + crawledPages.add(nextPage); + for (Page page : urlFetcher.getCollectedPages()) { + addPage(page); + } + } + }); } catch (NoSuchAlgorithmException nsae1) { logger.log(Level.SEVERE, "Could not get “SHA-256” message digest!", nsae1); }