X-Git-Url: https://git.pterodactylus.net/?a=blobdiff_plain;f=src%2Fnet%2Fpterodactylus%2Farachne%2Fcore%2FCore.java;h=53dedb80c0e87e3d2c24454227e5b279a1ea571e;hb=b3f6d8f1de06593b1266d0432f77f684dd79bb4f;hp=9e3f1a55a1d0d1143b792474429eef63013b7a3e;hpb=c85307a32e0957c15233b4d9ead5f58d8d167244;p=arachne.git diff --git a/src/net/pterodactylus/arachne/core/Core.java b/src/net/pterodactylus/arachne/core/Core.java index 9e3f1a5..53dedb8 100644 --- a/src/net/pterodactylus/arachne/core/Core.java +++ b/src/net/pterodactylus/arachne/core/Core.java @@ -7,12 +7,15 @@ import java.net.MalformedURLException; import java.net.URL; import java.security.NoSuchAlgorithmException; import java.util.ArrayList; +import java.util.HashSet; import java.util.List; +import java.util.Set; import java.util.concurrent.Executor; import java.util.concurrent.Executors; import java.util.logging.Level; import java.util.logging.Logger; +import net.pterodactylus.arachne.parser.ParserFactory; import de.ina.util.service.AbstractService; import de.ina.util.thread.DumpingThreadFactory; import de.ina.util.validation.Validation; @@ -41,12 +44,18 @@ public class Core extends AbstractService { // INTERNAL MEMBERS // + /** The parser factory. */ + private ParserFactory parserFactory = new ParserFactory(); + /** Thread pool for the URL fetches. */ - private Executor urlFetcherExecutor = Executors.newFixedThreadPool(1, new DumpingThreadFactory("URLFetcher-")); + private Executor urlFetcherExecutor = Executors.newSingleThreadExecutor(new DumpingThreadFactory("URLFetcher-")); /** The current list of URLs to crawl. */ private final List pages = new ArrayList(); + /** Already crawled pages. */ + private final Set crawledPages = new HashSet(); + // // ACCESSORS // @@ -73,6 +82,7 @@ public class Core extends AbstractService { */ public void addPage(URL url) { Validation.begin().isNotNull("url", url).check().isEqual("url.getHost()", url.getHost(), (Object) nodeHost).isEqual("url.getPort()", url.getPort(), nodePort).check(); + addPage(Page.fromURL(url)); } /** @@ -96,8 +106,10 @@ public class Core extends AbstractService { */ public void addPage(Page page) { Validation.begin().isNotNull("page", page).check(); - pages.add(page); - notifySyncObject(); + if (!crawledPages.contains(page) && !pages.contains(page)) { + pages.add(page); + notifySyncObject(); + } } // @@ -105,15 +117,15 @@ public class Core extends AbstractService { // /** - * {@inheritdoc} + * {@inheritDoc} * * @see de.ina.util.service.AbstractService#serviceRun() */ @Override - @SuppressWarnings("null") protected void serviceRun() { while (!shouldStop()) { - Page nextPage = null; + final Page nextPage; + Page page = null; synchronized (syncObject) { while (!shouldStop() && pages.isEmpty()) { try { @@ -123,22 +135,33 @@ public class Core extends AbstractService { } } if (!shouldStop()) { - nextPage = pages.remove(0); + page = pages.remove(0); } } if (shouldStop()) { break; } + nextPage = page; URL nextURL = nextPage.toURL(nodeHost, nodePort); if (nextURL == null) { logger.log(Level.INFO, "Skipping “" + nextPage + "”."); continue; } - URLFetcher urlFetcher; + final URLFetcher urlFetcher; try { logger.log(Level.INFO, "Fetching “" + nextURL + "”..."); - urlFetcher = new URLFetcher(nextURL); - urlFetcherExecutor.execute(urlFetcher); + urlFetcher = new URLFetcher(parserFactory, nextURL); + urlFetcherExecutor.execute(new Runnable() { + + @SuppressWarnings("synthetic-access") + public void run() { + urlFetcher.run(); + crawledPages.add(nextPage); + for (Page page : urlFetcher.getCollectedPages()) { + addPage(page); + } + } + }); } catch (NoSuchAlgorithmException nsae1) { logger.log(Level.SEVERE, "Could not get “SHA-256” message digest!", nsae1); }