import java.net.MalformedURLException;
import java.net.URL;
+import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Executor;
logger.log(Level.INFO, "Skipping “" + nextPage + "”.");
continue;
}
- URLFetcher urlFetcher = new URLFetcher(this, nextURL);
- urlFetcherExecutor.execute(urlFetcher);
+ URLFetcher urlFetcher;
+ try {
+ logger.log(Level.INFO, "Fetching “" + nextURL + "”...");
+ urlFetcher = new URLFetcher(this, nextURL);
+ urlFetcherExecutor.execute(urlFetcher);
+ } catch (NoSuchAlgorithmException nsae1) {
+ logger.log(Level.SEVERE, "Could not get “SHA-256” message digest!", nsae1);
+ }
}
}
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
import java.util.logging.Level;
import java.util.logging.Logger;
import net.pterodactylus.arachne.parser.HtmlEditorKitParser;
import net.pterodactylus.arachne.parser.ParserListener;
+import de.ina.util.io.MessageDigestInputStream;
import de.ina.util.validation.Validation;
/**
/** The URL to fetch. */
private final URL url;
+ /** The message digest. */
+ private final MessageDigest messageDigest;
+
+ /** The hash of the fetched URL. */
+ private byte[] hash;
+
/**
* Creates a new fetcher for the given URL.
*
+ * @param core
+ * The core that new pages are queued in
* @param url
* The URL to fetch
- * @param core
- * TODO
+ * @throws NoSuchAlgorithmException
+ * if no {@link MessageDigest} instance with an
+ * <code>SHA-256</code> algorithm can be created
*/
- public URLFetcher(Core core, URL url) {
+ public URLFetcher(Core core, URL url) throws NoSuchAlgorithmException {
Validation.begin().isNotNull("core", core).isNotNull("url", url).check();
this.core = core;
this.url = url;
+ messageDigest = MessageDigest.getInstance("SHA-256");
}
+ //
+ // ACCESSORS
+ //
+
+ /**
+ * Returns the hash of the content of the fetched URL. The returned value is
+ * only valid after {@link #run()} has been called.
+ *
+ * @return The hash of the fetched content
+ */
+ public byte[] getHash() {
+ byte[] hashCopy = new byte[hash.length];
+ System.arraycopy(hash, 0, hashCopy, 0, hash.length);
+ return hashCopy;
+ }
+
+ //
+ // INTERFACE Runnable
+ //
+
/**
* {@inheritDoc}
*
*/
public void run() {
logger.log(Level.INFO, "Starting URL Fetcher for “" + url + "”.");
+ InputStream urlInputStream = null;
+ MessageDigestInputStream hashInputStream = null;
try {
URLConnection urlConnection = url.openConnection();
long contentLength = urlConnection.getContentLength();
String contentType = urlConnection.getContentType();
logger.log(Level.INFO, "Type is “" + contentType + "”, length is " + contentLength + ".");
+ urlInputStream = urlConnection.getInputStream();
+ hashInputStream = new MessageDigestInputStream(urlInputStream, messageDigest);
HtmlEditorKitParser htmlEditorKitParser = new HtmlEditorKitParser();
- htmlEditorKitParser.parse(this, urlConnection.getInputStream(), "UTF-8");
+ htmlEditorKitParser.parse(this, hashInputStream, "UTF-8");
+ hash = messageDigest.digest();
} catch (IOException ioe1) {
logger.log(Level.WARNING, "Could not fetch “" + url + "”.", ioe1);
}