From 83a84def68ebc8d094bc9837ec72b5fc5cfdd782 Mon Sep 17 00:00:00 2001
From: =?utf8?q?David=20=E2=80=98Bombe=E2=80=99=20Roden?=
Date: Tue, 17 Mar 2009 00:09:47 +0100
Subject: [PATCH] =?utf8?q?Remember=20crawled=20pages=20and=20don=E2=80=99t?=
 =?utf8?q?=20crawl=20them=20again.=20Don=E2=80=99t=20add=20pages=20that=20?=
 =?utf8?q?are=20already=20scheduled=20for=20crawling.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

---
 src/net/pterodactylus/arachne/core/Core.java | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/src/net/pterodactylus/arachne/core/Core.java b/src/net/pterodactylus/arachne/core/Core.java
index 8afe014..9bcddb0 100644
--- a/src/net/pterodactylus/arachne/core/Core.java
+++ b/src/net/pterodactylus/arachne/core/Core.java
@@ -7,7 +7,9 @@ import java.net.MalformedURLException;
 import java.net.URL;
 import java.security.NoSuchAlgorithmException;
 import java.util.ArrayList;
+import java.util.HashSet;
 import java.util.List;
+import java.util.Set;
 import java.util.concurrent.Executor;
 import java.util.concurrent.Executors;
 import java.util.logging.Level;
@@ -51,6 +53,9 @@ public class Core extends AbstractService {
 	/** The current list of URLs to crawl. */
 	private final List pages = new ArrayList();
 
+	/** Already crawled pages. */
+	private final Set crawledPages = new HashSet();
+
 	//
 	// ACCESSORS
 	//
@@ -101,8 +106,10 @@ public class Core extends AbstractService {
 	 */
 	public void addPage(Page page) {
 		Validation.begin().isNotNull("page", page).check();
-		pages.add(page);
-		notifySyncObject();
+		if (!crawledPages.contains(page) && !pages.contains(page)) {
+			pages.add(page);
+			notifySyncObject();
+		}
 	}
 
 	//
@@ -115,10 +122,10 @@ public class Core extends AbstractService {
 	 * @see de.ina.util.service.AbstractService#serviceRun()
 	 */
 	@Override
-	@SuppressWarnings("null")
 	protected void serviceRun() {
 		while (!shouldStop()) {
-			Page nextPage = null;
+			final Page nextPage;
+			Page page = null;
 			synchronized (syncObject) {
 				while (!shouldStop() && pages.isEmpty()) {
 					try {
@@ -128,12 +135,13 @@ public class Core extends AbstractService {
 					}
 				}
 				if (!shouldStop()) {
-					nextPage = pages.remove(0);
+					page = pages.remove(0);
 				}
 			}
 			if (shouldStop()) {
 				break;
 			}
+			nextPage = page;
 			URL nextURL = nextPage.toURL(nodeHost, nodePort);
 			if (nextURL == null) {
 				logger.log(Level.INFO, "Skipping “" + nextPage + "”.");
@@ -145,8 +153,10 @@ public class Core extends AbstractService {
 				urlFetcher = new URLFetcher(parserFactory, nextURL);
 				urlFetcherExecutor.execute(new Runnable() {
 
+					@SuppressWarnings("synthetic-access")
 					public void run() {
 						urlFetcher.run();
+						crawledPages.add(nextPage);
 						for (Page page : urlFetcher.getCollectedPages()) {
 							addPage(page);
 						}
-- 
2.7.4