Remember crawled pages and don’t crawl them again.
author     David ‘Bombe’ Roden <bombe@pterodactylus.net>
           Mon, 16 Mar 2009 23:09:47 +0000 (00:09 +0100)
committer  David ‘Bombe’ Roden <bombe@pterodactylus.net>
           Mon, 16 Mar 2009 23:09:47 +0000 (00:09 +0100)
Don’t add pages that are already scheduled for crawling.
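
In outline, the commit pairs the existing crawl queue with a set of already-crawled pages, and addPage() now refuses anything found in either collection. Below is a minimal, self-contained sketch of that idea with placeholder names: CrawlQueue is not an Arachne class, and unlike the commit, which marks a page as crawled only after the fetch finishes, this sketch marks it as soon as it is taken from the queue.

    import java.util.ArrayList;
    import java.util.HashSet;
    import java.util.List;
    import java.util.Set;

    /** Sketch only: stands in for the queue/set pairing inside Core. */
    public class CrawlQueue<T> {

        /** Items still scheduled for crawling, in FIFO order. */
        private final List<T> scheduled = new ArrayList<T>();

        /** Items that have already been crawled. */
        private final Set<T> crawled = new HashSet<T>();

        /** Schedules an item unless it was already crawled or is already queued. */
        public synchronized void add(T item) {
            if (!crawled.contains(item) && !scheduled.contains(item)) {
                scheduled.add(item);
                notifyAll();
            }
        }

        /** Blocks until an item is available, then takes it and marks it crawled. */
        public synchronized T next() throws InterruptedException {
            while (scheduled.isEmpty()) {
                wait();
            }
            T item = scheduled.remove(0);
            crawled.add(item);
            return item;
        }
    }

Note that the HashSet lookups only work if the element type (Page, in the real code) implements equals() and hashCode() consistently; otherwise every instance looks new and duplicates slip back in. Also, List.contains() is a linear scan, so a very long queue makes each addPage() call proportionally slower.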

src/net/pterodactylus/arachne/core/Core.java

index 8afe014..9bcddb0 100644
@@ -7,7 +7,9 @@ import java.net.MalformedURLException;
 import java.net.URL;
 import java.security.NoSuchAlgorithmException;
 import java.util.ArrayList;
+import java.util.HashSet;
 import java.util.List;
+import java.util.Set;
 import java.util.concurrent.Executor;
 import java.util.concurrent.Executors;
 import java.util.logging.Level;
@@ -51,6 +53,9 @@ public class Core extends AbstractService {
        /** The current list of URLs to crawl. */
        private final List<Page> pages = new ArrayList<Page>();
 
+       /** Already crawled pages. */
+       private final Set<Page> crawledPages = new HashSet<Page>();
+
        //
        // ACCESSORS
        //
@@ -101,8 +106,10 @@ public class Core extends AbstractService {
         */
        public void addPage(Page page) {
                Validation.begin().isNotNull("page", page).check();
-               pages.add(page);
-               notifySyncObject();
+               if (!crawledPages.contains(page) && !pages.contains(page)) {
+                       pages.add(page);
+                       notifySyncObject();
+               }
        }
 
        //
@@ -115,10 +122,10 @@ public class Core extends AbstractService {
         * @see de.ina.util.service.AbstractService#serviceRun()
         */
        @Override
-       @SuppressWarnings("null")
        protected void serviceRun() {
                while (!shouldStop()) {
-                       Page nextPage = null;
+                       final Page nextPage;
+                       Page page = null;
                        synchronized (syncObject) {
                                while (!shouldStop() && pages.isEmpty()) {
                                        try {
@@ -128,12 +135,13 @@ public class Core extends AbstractService {
                                        }
                                }
                                if (!shouldStop()) {
-                                       nextPage = pages.remove(0);
+                                       page = pages.remove(0);
                                }
                        }
                        if (shouldStop()) {
                                break;
                        }
+                       nextPage = page;
                        URL nextURL = nextPage.toURL(nodeHost, nodePort);
                        if (nextURL == null) {
                                logger.log(Level.INFO, "Skipping “" + nextPage + "”.");
@@ -145,8 +153,10 @@ public class Core extends AbstractService {
                                urlFetcher = new URLFetcher(parserFactory, nextURL);
                                urlFetcherExecutor.execute(new Runnable() {
 
+                                       @SuppressWarnings("synthetic-access")
                                        public void run() {
                                                urlFetcher.run();
+                                               crawledPages.add(nextPage);
                                                for (Page page : urlFetcher.getCollectedPages()) {
                                                        addPage(page);
                                                }