Remember crawled pages and don’t crawl them again.
diff --git a/src/net/pterodactylus/arachne/core/Core.java b/src/net/pterodactylus/arachne/core/Core.java
index 8afe014..9bcddb0 100644
--- a/src/net/pterodactylus/arachne/core/Core.java
+++ b/src/net/pterodactylus/arachne/core/Core.java
@@ -7,7 +7,9 @@
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.security.NoSuchAlgorithmException;
 import java.util.ArrayList;
+import java.util.HashSet;
 import java.util.List;
+import java.util.Set;
 import java.util.concurrent.Executor;
 import java.util.concurrent.Executors;
 import java.util.logging.Level;
@@ -51,6 +53,9 @@ public class Core extends AbstractService {
 	/** The current list of URLs to crawl. */
 	private final List<Page> pages = new ArrayList<Page>();
 
+	/** Already crawled pages. */
+	private final Set<Page> crawledPages = new HashSet<Page>();
+
 	//
 	// ACCESSORS
 	//
@@ -101,8 +106,10 @@ public class Core extends AbstractService {
 	 */
 	public void addPage(Page page) {
 		Validation.begin().isNotNull("page", page).check();
-		pages.add(page);
-		notifySyncObject();
+		if (!crawledPages.contains(page) && !pages.contains(page)) {
+			pages.add(page);
+			notifySyncObject();
+		}
 	}
 
 	//
@@ -115,10 +122,10 @@ public class Core extends AbstractService {
 	 * @see de.ina.util.service.AbstractService#serviceRun()
 	 */
 	@Override
-	@SuppressWarnings("null")
 	protected void serviceRun() {
 		while (!shouldStop()) {
-			Page nextPage = null;
+			final Page nextPage;
+			Page page = null;
 			synchronized (syncObject) {
 				while (!shouldStop() && pages.isEmpty()) {
 					try {
@@ -128,12 +135,13 @@ public class Core extends AbstractService {
 					}
 				}
 				if (!shouldStop()) {
-					nextPage = pages.remove(0);
+					page = pages.remove(0);
 				}
 			}
 			if (shouldStop()) {
 				break;
 			}
+			nextPage = page;
 			URL nextURL = nextPage.toURL(nodeHost, nodePort);
 			if (nextURL == null) {
 				logger.log(Level.INFO, "Skipping “" + nextPage + "”.");
@@ -145,8 +153,10 @@ public class Core extends AbstractService {
 			urlFetcher = new URLFetcher(parserFactory, nextURL);
 			urlFetcherExecutor.execute(new Runnable() {
 
+				@SuppressWarnings("synthetic-access")
 				public void run() {
 					urlFetcher.run();
+					crawledPages.add(nextPage);
 					for (Page page : urlFetcher.getCollectedPages()) {
 						addPage(page);
 					}
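
As a usage note: the patch above makes the crawler remember every page it has fetched, so addPage() now silently drops pages that are already queued or were crawled before. Below is a minimal, self-contained sketch of that bookkeeping, with a plain String key standing in for Arachne's Page class (which, for the HashSet lookups in the real code to behave correctly, must implement equals() and hashCode() consistently). The class and method names in the sketch are hypothetical and not Arachne API; only the pages/crawledPages checks mirror the patch.

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * Sketch of the crawl-once bookkeeping introduced by this commit.
 * "CrawlQueueSketch" and "pageCrawled" are illustrative names only.
 */
public class CrawlQueueSketch {

	/** Pages waiting to be crawled (mirrors Core.pages). */
	private final List<String> pages = new ArrayList<String>();

	/** Pages that have already been crawled (mirrors Core.crawledPages). */
	private final Set<String> crawledPages = new HashSet<String>();

	/** Queues a page unless it was already crawled or is already queued. */
	public synchronized void addPage(String page) {
		if (!crawledPages.contains(page) && !pages.contains(page)) {
			pages.add(page);
		}
	}

	/** Records a fetched page so that addPage() ignores it from now on. */
	public synchronized void pageCrawled(String page) {
		crawledPages.add(page);
	}

	public static void main(String[] args) {
		CrawlQueueSketch queue = new CrawlQueueSketch();
		queue.addPage("page-1");
		queue.pageCrawled("page-1");
		queue.addPage("page-1"); // dropped: already crawled
		queue.addPage("page-2");
		queue.addPage("page-2"); // dropped: already queued
	}
}

The serviceRun() rewrite in the patch follows a common Java idiom: the page taken from the queue inside the synchronized block is copied into the final local nextPage because an anonymous inner class can only capture final local variables, and the Runnable now needs nextPage to record the crawl in crawledPages after the fetch. Presumably the @SuppressWarnings("null") annotation could be dropped because nextPage is no longer initialized to null before being dereferenced.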