src/net/pterodactylus/arachne/core/URLFetcher.java

   1 package net.pterodactylus.arachne.core;
   2
   3 import java.io.IOException;
   4 import java.io.InputStream;
   5 import java.net.MalformedURLException;
   6 import java.net.URL;
   7 import java.net.URLConnection;
   8 import java.security.MessageDigest;
   9 import java.security.NoSuchAlgorithmException;
  10 import java.util.ArrayList;
  11 import java.util.List;
  12 import java.util.logging.Level;
  13 import java.util.logging.Logger;
  14
  15 import net.pterodactylus.arachne.parser.Parser;
  16 import net.pterodactylus.arachne.parser.ParserFactory;
  17 import net.pterodactylus.arachne.parser.ParserListener;
  18 import de.ina.util.io.MessageDigestInputStream;
  19 import de.ina.util.validation.Validation;
  20
  21 /**
  22  * Fetches URLs, parses the received content (if it is HTML) and adds all
  23  * resulting links to the queue in the core.
  24  *
  25  * @author David ‘Bombe’ Roden <bombe@pterodactylus.net>
  26  */
  27 class URLFetcher implements Runnable, ParserListener {
  28
  29         /** The logger. */
  30         private static final Logger logger = Logger.getLogger(URLFetcher.class.getName());
  31
  32         /** The parser factory. */
  33         private final ParserFactory parserFactory;
  34
  35         /** The URL to fetch. */
  36         private final URL url;
  37
  38         /** The message digest. */
  39         private final MessageDigest messageDigest;
  40
  41         /** The hash of the fetched URL. */
  42         private byte[] hash;
  43
  44         /** The collected URLs. */
  45         private final List<Page> collectedPages = new ArrayList<Page>();
  46
  47         /** The title of the URL. */
  48         private String title;
  49
  50         /**
  51          * Creates a new fetcher for the given URL.
  52          *
  53          * @param parserFactory
  54          *            The parser factory that is used to create content-type
  55          *            specific parsers
  56          * @param url
  57          *            The URL to fetch
  58          * @throws NoSuchAlgorithmException
  59          *             if no {@link MessageDigest} instance with an
  60          *             <code>SHA-256</code> algorithm can be created
  61          */
  62         public URLFetcher(ParserFactory parserFactory, URL url) throws NoSuchAlgorithmException {
  63                 Validation.begin().isNotNull("parserFactory", parserFactory).isNotNull("url", url).check();
  64                 this.parserFactory = parserFactory;
  65                 this.url = url;
  66                 messageDigest = MessageDigest.getInstance("SHA-256");
  67         }
  68
  69         //
  70         // ACCESSORS
  71         //
  72
  73         /**
  74          * Returns the title of the fetched URL.
  75          *
  76          * @return The fetched URL’s title
  77          */
  78         public String getTitle() {
  79                 return title;
  80         }
  81
  82         /**
  83          * Returns the pages collected while parsing this URL.
  84          *
  85          * @return The collected pages
  86          */
  87         public List<Page> getCollectedPages() {
  88                 return collectedPages;
  89         }
  90
  91         /**
  92          * Returns the hash of the content of the fetched URL. The returned value is
  93          * only valid after {@link #run()} has been called.
  94          *
  95          * @return The hash of the fetched content
  96          */
  97         public byte[] getHash() {
  98                 byte[] hashCopy = new byte[hash.length];
  99                 System.arraycopy(hash, 0, hashCopy, 0, hash.length);
 100                 return hashCopy;
 101         }
 102
 103         //
 104         // INTERFACE Runnable
 105         //
 106
 107         /**
 108          * {@inheritdoc}
 109          *
 110          * @see java.lang.Runnable#run()
 111          */
 112         public void run() {
 113                 logger.log(Level.INFO, "Starting URL Fetcher for “" + url + "”.");
 114                 InputStream urlInputStream = null;
 115                 MessageDigestInputStream hashInputStream = null;
 116                 try {
 117                         URLConnection urlConnection = url.openConnection();
 118                         long contentLength = urlConnection.getContentLength();
 119                         String contentType = urlConnection.getContentType();
 120                         logger.log(Level.INFO, "Type is “" + contentType + "”, length is " + contentLength + ".");
 121                         urlInputStream = urlConnection.getInputStream();
 122                         hashInputStream = new MessageDigestInputStream(urlInputStream, messageDigest);
 123                         Parser parser = parserFactory.getParser(contentType);
 124                         parser.parse(this, hashInputStream, "UTF-8");
 125                         hash = messageDigest.digest();
 126                 } catch (IOException ioe1) {
 127                         logger.log(Level.WARNING, "Could not fetch “" + url + "”.", ioe1);
 128                 }
 129         }
 130
 131         //
 132         // INTERFACE ParserListener
 133         //
 134
 135         /**
 136          * {@inheritDoc}
 137          */
 138         public void parsedLink(InputStream inputStream, String linkTarget, String linkTitle, String linkText) {
 139                 URL newLink = null;
 140                 try {
 141                         newLink = new URL(url, linkTarget);
 142                         Page newPage = Page.fromURL(newLink);
 143                         if (newPage != null) {
 144                                 collectedPages.add(newPage);
 145                         }
 146                 } catch (MalformedURLException mue1) {
 147                         logger.log(Level.WARNING, "Could not create URL from “" + url + "” and “" + linkTarget + "”.", mue1);
 148                 }
 149         }
 150
 151         /**
 152          * {@inheritDoc}
 153          */
 154         public void parsedTitle(InputStream inputStream, String title) {
 155                 this.title = title;
 156         }
 157
 158 }