From: David ‘Bombe’ Roden
Date: Mon, 16 Mar 2009 23:09:47 +0000 (+0100)
Subject: Remember crawled pages and don’t crawl them again.
X-Git-Url: https://git.pterodactylus.net/?p=arachne.git;a=commitdiff_plain;h=83a84def68ebc8d094bc9837ec72b5fc5cfdd782

Remember crawled pages and don’t crawl them again.

Don’t add pages that are already scheduled for crawling.
---

diff --git a/src/net/pterodactylus/arachne/core/Core.java b/src/net/pterodactylus/arachne/core/Core.java
index 8afe014..9bcddb0 100644
--- a/src/net/pterodactylus/arachne/core/Core.java
+++ b/src/net/pterodactylus/arachne/core/Core.java
@@ -7,7 +7,9 @@ import java.net.MalformedURLException;
 import java.net.URL;
 import java.security.NoSuchAlgorithmException;
 import java.util.ArrayList;
+import java.util.HashSet;
 import java.util.List;
+import java.util.Set;
 import java.util.concurrent.Executor;
 import java.util.concurrent.Executors;
 import java.util.logging.Level;
@@ -51,6 +53,9 @@ public class Core extends AbstractService {
 	/** The current list of URLs to crawl. */
 	private final List<Page> pages = new ArrayList<Page>();
 
+	/** Already crawled pages. */
+	private final Set<Page> crawledPages = new HashSet<Page>();
+
 	//
 	// ACCESSORS
 	//
@@ -101,8 +106,10 @@ public class Core extends AbstractService {
 	 */
 	public void addPage(Page page) {
 		Validation.begin().isNotNull("page", page).check();
-		pages.add(page);
-		notifySyncObject();
+		if (!crawledPages.contains(page) && !pages.contains(page)) {
+			pages.add(page);
+			notifySyncObject();
+		}
 	}
 
 	//
@@ -115,10 +122,10 @@ public class Core extends AbstractService {
 	 * @see de.ina.util.service.AbstractService#serviceRun()
 	 */
 	@Override
-	@SuppressWarnings("null")
 	protected void serviceRun() {
 		while (!shouldStop()) {
-			Page nextPage = null;
+			final Page nextPage;
+			Page page = null;
 			synchronized (syncObject) {
 				while (!shouldStop() && pages.isEmpty()) {
 					try {
@@ -128,12 +135,13 @@
 					}
 				}
 				if (!shouldStop()) {
-					nextPage = pages.remove(0);
+					page = pages.remove(0);
 				}
 			}
 			if (shouldStop()) {
 				break;
 			}
+			nextPage = page;
 			URL nextURL = nextPage.toURL(nodeHost, nodePort);
 			if (nextURL == null) {
 				logger.log(Level.INFO, "Skipping “" + nextPage + "”.");
@@ -145,8 +153,10 @@
 			urlFetcher = new URLFetcher(parserFactory, nextURL);
 			urlFetcherExecutor.execute(new Runnable() {
 
+				@SuppressWarnings("synthetic-access")
 				public void run() {
 					urlFetcher.run();
+					crawledPages.add(nextPage);
 					for (Page page : urlFetcher.getCollectedPages()) {
 						addPage(page);
 					}
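
For readers skimming the diff, here is a minimal, self-contained sketch of the pattern this commit introduces: a Set of finished pages guarding a FIFO list of pending pages, so nothing is crawled twice and nothing is queued twice. The class and member names below (SimpleCrawlQueue, pending, crawled, take) are illustrative only and are not part of Arachne; the sketch also marks a page as crawled when it is taken, whereas Core adds it to crawledPages only after the URLFetcher has finished.

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class SimpleCrawlQueue {

	/** Pages waiting to be crawled, in FIFO order. */
	private final List<String> pending = new ArrayList<String>();

	/** Pages that have already been crawled. */
	private final Set<String> crawled = new HashSet<String>();

	/**
	 * Queues a page unless it was already crawled or is already
	 * pending, mirroring the guard added to Core.addPage().
	 */
	public synchronized void add(String page) {
		if (!crawled.contains(page) && !pending.contains(page)) {
			pending.add(page);
			notifyAll();
		}
	}

	/**
	 * Returns the next page to crawl, blocking until one is queued,
	 * and remembers it so it can never be queued again.
	 */
	public synchronized String take() throws InterruptedException {
		while (pending.isEmpty()) {
			wait();
		}
		String page = pending.remove(0);
		crawled.add(page);
		return page;
	}

}

Both contains() checks rely on equals() and hashCode(), so a queued type like Page must implement them consistently or the HashSet lookup will never match. Note also that contains() on an ArrayList is a linear scan; for large crawls, a second Set mirroring the pending list would make that check O(1). Finally, because Core marks a page as crawled only after the fetch completes, a page currently being fetched sits in neither collection and could briefly be re-queued; marking at removal, as in the sketch, closes that window.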