import java.net.URL;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.Executor;
import java.util.concurrent.Executors;
import java.util.logging.Level;
import java.util.logging.Logger;

import net.pterodactylus.arachne.parser.ParserFactory;

import de.ina.util.service.AbstractService;
import de.ina.util.thread.DumpingThreadFactory;
import de.ina.util.validation.Validation;
// INTERNAL MEMBERS
//
+ /** The parser factory. */
+ private ParserFactory parserFactory = new ParserFactory();
+
/** Thread pool for the URL fetches. */
private Executor urlFetcherExecutor = Executors.newFixedThreadPool(1, new DumpingThreadFactory("URLFetcher-"));
/** The current list of URLs to crawl. */
private final List<Page> pages = new ArrayList<Page>();
+ /** Already crawled pages. */
+ private final Set<Page> crawledPages = new HashSet<Page>();
+
//
// ACCESSORS
//
*/
public void addPage(URL url) {
Validation.begin().isNotNull("url", url).check().isEqual("url.getHost()", url.getHost(), (Object) nodeHost).isEqual("url.getPort()", url.getPort(), nodePort).check();
+ addPage(Page.fromURL(url));
}
	/**
	 * Adds the given page to the list of pages to crawl, unless it has
	 * already been crawled or is already queued.
	 *
	 * @param page
	 *            The page to crawl
	 */
public void addPage(Page page) {
Validation.begin().isNotNull("page", page).check();
- pages.add(page);
- notifySyncObject();
+ if (!crawledPages.contains(page) && !pages.contains(page)) {
+ pages.add(page);
+ notifySyncObject();
+ }
}
//
//
	/**
	 * {@inheritDoc}
	 *
	 * @see de.ina.util.service.AbstractService#serviceRun()
	 */
@Override
- @SuppressWarnings("null")
protected void serviceRun() {
while (!shouldStop()) {
- Page nextPage = null;
+ final Page nextPage;
+ Page page = null;
synchronized (syncObject) {
while (!shouldStop() && pages.isEmpty()) {
try {
}
}
if (!shouldStop()) {
- nextPage = pages.remove(0);
+ page = pages.remove(0);
}
}
if (shouldStop()) {
break;
}
+ nextPage = page;
URL nextURL = nextPage.toURL(nodeHost, nodePort);
if (nextURL == null) {
logger.log(Level.INFO, "Skipping “" + nextPage + "”.");
continue;
}
- URLFetcher urlFetcher;
+ final URLFetcher urlFetcher;
try {
logger.log(Level.INFO, "Fetching “" + nextURL + "”...");
- urlFetcher = new URLFetcher(nextURL);
- urlFetcherExecutor.execute(urlFetcher);
+ urlFetcher = new URLFetcher(parserFactory, nextURL);
+ urlFetcherExecutor.execute(new Runnable() {
+
+ @SuppressWarnings("synthetic-access")
+ public void run() {
+ urlFetcher.run();
+ crawledPages.add(nextPage);
+ for (Page page : urlFetcher.getCollectedPages()) {
+ addPage(page);
+ }
+ }
+ });
} catch (NoSuchAlgorithmException nsae1) {
logger.log(Level.SEVERE, "Could not get “SHA-256” message digest!", nsae1);
}