diff --git a/src/net/pterodactylus/arachne/core/Core.java b/src/net/pterodactylus/arachne/core/Core.java
index a2c50cd..9bcddb0 100644
--- a/src/net/pterodactylus/arachne/core/Core.java
+++ b/src/net/pterodactylus/arachne/core/Core.java
@@ -7,7 +7,9 @@ import java.net.MalformedURLException;
 import java.net.URL;
 import java.security.NoSuchAlgorithmException;
 import java.util.ArrayList;
+import java.util.HashSet;
 import java.util.List;
+import java.util.Set;
 import java.util.concurrent.Executor;
 import java.util.concurrent.Executors;
 import java.util.logging.Level;
@@ -51,6 +53,9 @@ public class Core extends AbstractService {
 	/** The current list of URLs to crawl. */
 	private final List<Page> pages = new ArrayList<Page>();
 
+	/** Already crawled pages. */
+	private final Set<Page> crawledPages = new HashSet<Page>();
+
 	//
 	// ACCESSORS
 	//
@@ -101,8 +106,10 @@
 	 */
 	public void addPage(Page page) {
 		Validation.begin().isNotNull("page", page).check();
-		pages.add(page);
-		notifySyncObject();
+		if (!crawledPages.contains(page) && !pages.contains(page)) {
+			pages.add(page);
+			notifySyncObject();
+		}
 	}
 
 	//
@@ -115,10 +122,10 @@
 	 * @see de.ina.util.service.AbstractService#serviceRun()
 	 */
 	@Override
-	@SuppressWarnings("null")
 	protected void serviceRun() {
 		while (!shouldStop()) {
-			Page nextPage = null;
+			final Page nextPage;
+			Page page = null;
 			synchronized (syncObject) {
 				while (!shouldStop() && pages.isEmpty()) {
 					try {
@@ -128,22 +135,33 @@
 					}
 				}
 				if (!shouldStop()) {
-					nextPage = pages.remove(0);
+					page = pages.remove(0);
 				}
 			}
 			if (shouldStop()) {
 				break;
 			}
+			nextPage = page;
 			URL nextURL = nextPage.toURL(nodeHost, nodePort);
 			if (nextURL == null) {
 				logger.log(Level.INFO, "Skipping “" + nextPage + "”.");
 				continue;
 			}
-			URLFetcher urlFetcher;
+			final URLFetcher urlFetcher;
 			try {
 				logger.log(Level.INFO, "Fetching “" + nextURL + "”...");
 				urlFetcher = new URLFetcher(parserFactory, nextURL);
-				urlFetcherExecutor.execute(urlFetcher);
+				urlFetcherExecutor.execute(new Runnable() {
+
+					@SuppressWarnings("synthetic-access")
+					public void run() {
+						urlFetcher.run();
+						crawledPages.add(nextPage);
+						for (Page page : urlFetcher.getCollectedPages()) {
+							addPage(page);
+						}
+					}
+				});
 			} catch (NoSuchAlgorithmException nsae1) {
 				logger.log(Level.SEVERE, "Could not get “SHA-256” message digest!", nsae1);
 			}
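
Notes on this change (editor's sketch, not part of the patch): the commit does two things. First, addPage() now drops any page that has already been crawled or is already queued, using the new crawledPages set. Second, serviceRun() no longer hands the URLFetcher directly to the executor; it wraps it in an anonymous Runnable that runs the fetch, marks the page as crawled, and feeds the fetcher's collected pages back into addPage(). The old nextPage local is split into a mutable page (assigned inside the synchronized block) and a blank final nextPage (assigned exactly once, after the shutdown check), because anonymous classes in pre-Java-8 code can only capture locals declared final.

The sketch below shows the same queue-plus-seen-set pattern in isolation. It is not Arachne code: Page is replaced by String, the fetch is a stub, and, unlike the patch (where crawledPages.add() runs on the executor thread without synchronization), the sketch guards both collections with a lock.

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

/** Minimal sketch of the queue-plus-seen-set crawl pattern; not Arachne's API. */
public class DedupQueueSketch {

	/** Pages waiting to be fetched, in FIFO order. */
	private final List<String> pages = new ArrayList<String>();

	/** Pages that have already been fetched. */
	private final Set<String> crawledPages = new HashSet<String>();

	private final ExecutorService executor = Executors.newFixedThreadPool(2);

	/** Queues a page unless it was already crawled or is already waiting. */
	public synchronized void addPage(String page) {
		if (!crawledPages.contains(page) && !pages.contains(page)) {
			pages.add(page);
		}
	}

	/** Dispatches the next queued page, if any, to the executor. */
	public void fetchNext() {
		// Blank final, so the anonymous class below can capture it; this mirrors
		// the nextPage/page split in the patch above.
		final String nextPage;
		synchronized (this) {
			if (pages.isEmpty()) {
				return;
			}
			nextPage = pages.remove(0);
		}
		executor.execute(new Runnable() {
			public void run() {
				// Stand-in for URLFetcher.run().
				System.out.println("Fetching " + nextPage);
				synchronized (DedupQueueSketch.this) {
					crawledPages.add(nextPage);
				}
			}
		});
	}

	public static void main(String[] args) {
		DedupQueueSketch sketch = new DedupQueueSketch();
		sketch.addPage("page-one");
		sketch.addPage("page-one"); // duplicate, silently dropped
		sketch.fetchNext();
		sketch.executor.shutdown();
	}
}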