package net.pterodactylus.arachne.core;
import java.io.IOException;
+import java.io.InputStream;
+import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.util.ArrayList;
+import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
+import net.pterodactylus.arachne.parser.Parser;
+import net.pterodactylus.arachne.parser.ParserFactory;
+import net.pterodactylus.arachne.parser.ParserListener;
+import de.ina.util.io.MessageDigestInputStream;
import de.ina.util.validation.Validation;
/**
*
* @author David ‘Bombe’ Roden <bombe@pterodactylus.net>
*/
-class URLFetcher implements Runnable {
+class URLFetcher implements Runnable, ParserListener {
/** The logger. */
private static final Logger logger = Logger.getLogger(URLFetcher.class.getName());
- /** The core. */
- private final Core core;
+ /** The parser factory. */
+ private final ParserFactory parserFactory;
/** The URL to fetch. */
private final URL url;
+ /** The message digest. */
+ private final MessageDigest messageDigest;
+
+ /** The hash of the fetched URL. */
+ private byte[] hash;
+
+ /** The pages collected from the links that were found while parsing the URL. */
+ private final List<Page> collectedPages = new ArrayList<Page>();
+
+ /** The title of the URL. */
+ private String title;
+
/**
* Creates a new fetcher for the given URL.
*
+ * @param parserFactory
+ * The parser factory that is used to create content-type
+ * specific parsers
* @param url
* The URL to fetch
- * @param core
- * TODO
+ * @throws NoSuchAlgorithmException
+ * if no {@link MessageDigest} instance with an
+ * <code>SHA-256</code> algorithm can be created
*/
- public URLFetcher(Core core, URL url) {
- Validation.begin().isNotNull("core", core).isNotNull("url", url).check();
- this.core = core;
+ public URLFetcher(ParserFactory parserFactory, URL url) throws NoSuchAlgorithmException {
+ /* Both arguments are required. NOTE(review): check() presumably throws when a check failed -- confirm against Validation. */
+ Validation.begin().isNotNull("parserFactory", parserFactory).isNotNull("url", url).check();
+ this.parserFactory = parserFactory;
this.url = url;
+ /* The digest is obtained here, so a missing SHA-256 implementation is reported by the constructor. */
+ messageDigest = MessageDigest.getInstance("SHA-256");
+ }
+
+ //
+ // ACCESSORS
+ //
+
+ /**
+ * Returns the title that was most recently stored for the fetched URL.
+ *
+ * @return The fetched URL’s title, or <code>null</code> if no title has
+ * been set
+ */
+ public String getTitle() {
+ return this.title;
+ }
+
+ /**
+ * Returns the pages collected while parsing this URL.
+ * <p>
+ * The returned list is a snapshot: it is a copy of the internal list, so
+ * modifying it does not affect this fetcher, and pages that are collected
+ * afterwards do not show up in it.
+ *
+ * @return A new list containing the pages collected so far, never
+ * <code>null</code>
+ */
+ public List<Page> getCollectedPages() {
+ return new ArrayList<Page>(collectedPages);
+ }
+
+ /**
+ * Returns the hash of the content of the fetched URL. The hash is only
+ * available after {@link #run()} has fetched and parsed the URL without
+ * an I/O error.
+ *
+ * @return A copy of the hash of the fetched content, or <code>null</code>
+ * if no hash has been computed
+ */
+ public byte[] getHash() {
+ /* read the field once so that the null check and the copy use the same array. */
+ byte[] currentHash = hash;
+ if (currentHash == null) {
+ return null;
+ }
+ byte[] hashCopy = new byte[currentHash.length];
+ System.arraycopy(currentHash, 0, hashCopy, 0, currentHash.length);
+ return hashCopy;
}
+ //
+ // INTERFACE Runnable
+ //
+
/**
- * {@inheritdoc}
+ * {@inheritDoc}
+ * <p>
+ * Opens a connection to the URL, passes the content through the message
+ * digest into the parser for the reported content type, and stores the
+ * resulting hash. Any stream that was opened is closed before this method
+ * returns. If fetching fails with an I/O error, or no parser is available,
+ * the problem is logged and no hash is stored.
*
* @see java.lang.Runnable#run()
*/
public void run() {
logger.log(Level.INFO, "Starting URL Fetcher for “" + url + "”.");
+ InputStream urlInputStream = null;
+ MessageDigestInputStream hashInputStream = null;
try {
URLConnection urlConnection = url.openConnection();
long contentLength = urlConnection.getContentLength();
String contentType = urlConnection.getContentType();
logger.log(Level.INFO, "Type is “" + contentType + "”, length is " + contentLength + ".");
+ urlInputStream = urlConnection.getInputStream();
+ hashInputStream = new MessageDigestInputStream(urlInputStream, messageDigest);
+ Parser parser = parserFactory.getParser(contentType);
+ /* NOTE(review): guards against the factory returning null for an unsupported or unknown (null) content type -- confirm against ParserFactory. */
+ if (parser == null) {
+ logger.log(Level.WARNING, "No parser found for type “" + contentType + "”, skipping “" + url + "”.");
+ return;
+ }
+ parser.parse(this, hashInputStream, "UTF-8");
+ hash = messageDigest.digest();
} catch (IOException ioe1) {
logger.log(Level.WARNING, "Could not fetch “" + url + "”.", ioe1);
+ } finally {
+ /* close the digest wrapper first, then the raw stream in case the wrapper was never created or does not close it. */
+ closeQuietly(hashInputStream);
+ closeQuietly(urlInputStream);
}
}
+
+ /**
+ * Closes the given input stream. A failure to close is only logged because
+ * the caller has nothing left to do with the stream.
+ *
+ * @param inputStream
+ * The input stream to close, may be <code>null</code>
+ */
+ private static void closeQuietly(InputStream inputStream) {
+ if (inputStream == null) {
+ return;
+ }
+ try {
+ inputStream.close();
+ } catch (IOException ioe1) {
+ logger.log(Level.FINE, "Could not close input stream.", ioe1);
+ }
+ }
+ //
+ // INTERFACE ParserListener
+ //
+
+ /**
+ * {@inheritDoc}
+ * <p>
+ * Resolves the link target against the fetched URL and, if a page can be
+ * created from the resulting URL, adds that page to the collected pages.
+ * A link target that does not yield a valid URL is logged and skipped. The
+ * input stream, link title and link text are not used by this
+ * implementation.
+ */
+ public void parsedLink(InputStream inputStream, String linkTarget, String linkTitle, String linkText) {
+ try {
+ URL resolvedLink = new URL(url, linkTarget);
+ try {
+ Page page = Page.fromURL(resolvedLink);
+ if (page == null) {
+ return;
+ }
+ collectedPages.add(page);
+ } catch (IllegalArgumentException ignored) {
+ /* no page can be created from this link, skip it. */
+ }
+ } catch (MalformedURLException malformedUrlException) {
+ logger.log(Level.WARNING, "Could not create URL from “" + url + "” and “" + linkTarget + "”.", malformedUrlException);
+ }
+ }
+
+ /**
+ * {@inheritDoc}
+ * <p>
+ * Stores the given title so that it can be retrieved with
+ * {@link #getTitle()}. A later call replaces a title stored earlier. The
+ * input stream is not used by this implementation.
+ */
+ public void parsedTitle(InputStream inputStream, String title) {
+ this.title = title;
+ }
+
}