src/net/pterodactylus/arachne/core/URLFetcher.java

   1 package net.pterodactylus.arachne.core;
   2
   3 import java.io.IOException;
   4 import java.io.InputStream;
   5 import java.net.MalformedURLException;
   6 import java.net.URL;
   7 import java.net.URLConnection;
   8 import java.security.MessageDigest;
   9 import java.security.NoSuchAlgorithmException;
  10 import java.util.ArrayList;
  11 import java.util.List;
  12 import java.util.logging.Level;
  13 import java.util.logging.Logger;
  14
  15 import net.pterodactylus.arachne.parser.Parser;
  16 import net.pterodactylus.arachne.parser.ParserFactory;
  17 import net.pterodactylus.arachne.parser.ParserListener;
  18 import de.ina.util.io.MessageDigestInputStream;
  19 import de.ina.util.number.Hex;
  20 import de.ina.util.validation.Validation;
  21
  22 /**
  23  * Fetches URLs, parses the received content (if it is HTML) and adds all
  24  * resulting links to the queue in the core.
  25  *
  26  * @author David ‘Bombe’ Roden <bombe@pterodactylus.net>
  27  */
  28 class URLFetcher implements Runnable, ParserListener {
  29
  30         /** The logger. */
  31         private static final Logger logger = Logger.getLogger(URLFetcher.class.getName());
  32
  33         /** The parser factory. */
  34         private final ParserFactory parserFactory;
  35
  36         /** The URL to fetch. */
  37         private final URL url;
  38
  39         /** The message digest. */
  40         private final MessageDigest messageDigest;
  41
  42         /** The hash of the fetched URL. */
  43         private byte[] hash;
  44
  45         /** The collected URLs. */
  46         private final List<Page> collectedPages = new ArrayList<Page>();
  47
  48         /** The title of the URL. */
  49         private String title;
  50
  51         /**
  52          * Creates a new fetcher for the given URL.
  53          *
  54          * @param parserFactory
  55          *            The parser factory that is used to create content-type
  56          *            specific parsers
  57          * @param url
  58          *            The URL to fetch
  59          * @throws NoSuchAlgorithmException
  60          *             if no {@link MessageDigest} instance with an
  61          *             <code>SHA-256</code> algorithm can be created
  62          */
  63         public URLFetcher(ParserFactory parserFactory, URL url) throws NoSuchAlgorithmException {
  64                 Validation.begin().isNotNull("parserFactory", parserFactory).isNotNull("url", url).check();
  65                 this.parserFactory = parserFactory;
  66                 this.url = url;
  67                 messageDigest = MessageDigest.getInstance("SHA-256");
  68         }
  69
  70         //
  71         // ACCESSORS
  72         //
  73
  74         /**
  75          * Returns the title of the fetched URL.
  76          *
  77          * @return The fetched URL’s title
  78          */
  79         public String getTitle() {
  80                 return title;
  81         }
  82
  83         /**
  84          * Returns the pages collected while parsing this URL.
  85          *
  86          * @return The collected pages
  87          */
  88         public List<Page> getCollectedPages() {
  89                 return collectedPages;
  90         }
  91
  92         /**
  93          * Returns the hash of the content of the fetched URL. The returned value is
  94          * only valid after {@link #run()} has been called.
  95          *
  96          * @return The hash of the fetched content
  97          */
  98         public byte[] getHash() {
  99                 byte[] hashCopy = new byte[hash.length];
 100                 System.arraycopy(hash, 0, hashCopy, 0, hash.length);
 101                 return hashCopy;
 102         }
 103
 104         //
 105         // INTERFACE Runnable
 106         //
 107
 108         /**
 109          * {@inheritDoc}
 110          *
 111          * @see java.lang.Runnable#run()
 112          */
 113         public void run() {
 114                 logger.log(Level.INFO, "Starting URL Fetcher for “" + url + "”.");
 115                 InputStream urlInputStream = null;
 116                 MessageDigestInputStream hashInputStream = null;
 117                 try {
 118                         URLConnection urlConnection = url.openConnection();
 119                         long contentLength = urlConnection.getContentLength();
 120                         String contentType = urlConnection.getContentType();
 121                         logger.log(Level.INFO, "Type is “" + contentType + "”, length is " + contentLength + ".");
 122                         urlInputStream = urlConnection.getInputStream();
 123                         hashInputStream = new MessageDigestInputStream(urlInputStream, messageDigest);
 124                         Parser parser = parserFactory.getParser(contentType);
 125                         if (parser == null) {
 126                                 logger.log(Level.INFO, "No parser found for “" + contentType + "”.");
 127                                 return;
 128                         }
 129                         parser.parse(this, hashInputStream, "UTF-8");
 130                         hash = messageDigest.digest();
 131                         logger.log(Level.FINE, "Hashed “" + url + "” to “" + Hex.toHex(hash) + "”.");
 132                 } catch (IOException ioe1) {
 133                         logger.log(Level.WARNING, "Could not fetch “" + url + "”.", ioe1);
 134                 }
 135         }
 136
 137         //
 138         // INTERFACE ParserListener
 139         //
 140
 141         /**
 142          * {@inheritDoc}
 143          */
 144         public void parsedLink(InputStream inputStream, String linkTarget, String linkTitle, String linkText) {
 145                 URL newLink = null;
 146                 try {
 147                         newLink = new URL(url, linkTarget);
 148                         try {
 149                                 Page newPage = Page.fromURL(newLink);
 150                                 if (newPage != null) {
 151                                         collectedPages.add(newPage);
 152                                 }
 153                         } catch (IllegalArgumentException iae1) {
 154                                 /* ignore. */
 155                         }
 156                 } catch (MalformedURLException mue1) {
 157                         logger.log(Level.WARNING, "Could not create URL from “" + url + "” and “" + linkTarget + "”.", mue1);
 158                 }
 159         }
 160
 161         /**
 162          * {@inheritDoc}
 163          */
 164         public void parsedTitle(InputStream inputStream, String title) {
 165                 this.title = title;
 166         }
 167
 168 }