Basic implementation of HTML parser.
author David ‘Bombe’ Roden <bombe@pterodactylus.net>
Mon, 9 Mar 2009 20:24:39 +0000 (21:24 +0100)
committer David ‘Bombe’ Roden <bombe@pterodactylus.net>
Mon, 9 Mar 2009 20:24:39 +0000 (21:24 +0100)
src/net/pterodactylus/arachne/parser/HtmlEditorKitParser.java

index 45ba774..47e2d52 100644 (file)
@@ -3,16 +3,51 @@
  */
 package net.pterodactylus.arachne.parser;
 
  */
 package net.pterodactylus.arachne.parser;
 
+import java.io.BufferedReader;
+import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStream;
+import java.io.InputStreamReader;
 
 
+import javax.swing.text.MutableAttributeSet;
+import javax.swing.text.html.HTML;
 import javax.swing.text.html.HTMLEditorKit;
 import javax.swing.text.html.HTMLEditorKit;
+import javax.swing.text.html.HTML.Tag;
+import javax.swing.text.html.HTMLEditorKit.ParserCallback;
+import javax.swing.text.html.parser.ParserDelegator;
+
+import de.ina.util.io.Closer;
 
 /**
 
 /**
- * {@link Parser} implementation based on Swing’s {@link HTMLEditorKit}.
+ * {@link Parser} implementation based on Swing’s {@link HTMLEditorKit}. This
+ * parser can be re-used but is not thread-safe!
  *
  * @author David ‘Bombe’ Roden <bombe@pterodactylus.net>
  */
  *
  * @author David ‘Bombe’ Roden <bombe@pterodactylus.net>
  */
-public class HtmlEditorKitParser implements Parser {
+public class HtmlEditorKitParser extends HTMLEditorKit.ParserCallback implements Parser {
+
+       /**
+        * The parser listener. Assigned at the start of every
+        * {@code parse()} call and notified about each title and link.
+        */
+       private ParserListener parserListener;
+
+       /**
+        * The current input stream. Assigned at the start of every
+        * {@code parse()} call and handed to the listener with each
+        * notification.
+        */
+       private InputStream inputStream;
+
+       /**
+        * Whether we’re currently parsing the title. Set by a non-implied
+        * TITLE start tag, cleared by the TITLE end tag.
+        */
+       private boolean inTitle;
+
+       /**
+        * The title text. Holds the most recent chunk of text that was seen
+        * while {@link #inTitle} was set.
+        */
+       private String titleText;
+
+       /**
+        * Whether we’re currently parsing a link. Set by an A start tag that
+        * carries an HREF attribute, cleared by the A end tag.
+        */
+       private boolean inLink;
+
+       /** The target of the link, i.e. the value of its HREF attribute. */
+       private String linkTarget;
+
+       /**
+        * The title attribute of the link, or {@code null} if the link has no
+        * TITLE attribute.
+        */
+       private String linkTitle;
+
+       /**
+        * The text of the link. A fresh builder is created for every link with
+        * an HREF attribute, so this is {@code null} until the first such link
+        * has been seen.
+        */
+       private StringBuilder linkText;
 
        /**
         * {@inheritDoc}
 
        /**
         * {@inheritDoc}
@@ -21,6 +56,77 @@ public class HtmlEditorKitParser implements Parser {
         *      InputStream, String)
         */
        public void parse(ParserListener parserListener, InputStream inputStream, String charset) throws IOException {
         *      InputStream, String)
         */
        public void parse(ParserListener parserListener, InputStream inputStream, String charset) throws IOException {
+               this.parserListener = parserListener;
+               this.inputStream = inputStream;
+               InputStreamReader inputStreamReader = null;
+               BufferedReader htmlReader = null;
+               try {
+                       inputStreamReader = new InputStreamReader(inputStream, charset);
+                       htmlReader = new BufferedReader(inputStreamReader);
+                       new ParserDelegator().parse(htmlReader, this, true);
+               } finally {
+                       Closer.close(htmlReader);
+                       Closer.close(inputStreamReader);
+               }
+       }
+
+       //
+       // METHODS FROM ParserCallback
+       //
+
+       /**
+        * Tracks the start of title and link elements.
+        * <p>
+        * A TITLE start tag that is not marked as implied switches title
+        * collection on. An A start tag with an HREF attribute begins a new
+        * link: its target and optional TITLE attribute are stored and a fresh
+        * text buffer is created. While a link is open, every start tag that
+        * breaks the flow adds a single space to the link text.
+        *
+        * @param tag
+        *            the tag that was opened
+        * @param attributeSet
+        *            the attributes of the tag
+        * @param position
+        *            the position as supplied by the parser
+        */
+       @Override
+       public void handleStartTag(Tag tag, MutableAttributeSet attributeSet, int position) {
+               if (tag == Tag.TITLE) {
+                       boolean implied = attributeSet.containsAttribute(ParserCallback.IMPLIED, Boolean.TRUE);
+                       if (!implied) {
+                               inTitle = true;
+                       }
+               } else if (tag == Tag.A) {
+                       String target = (String) attributeSet.getAttribute(HTML.Attribute.HREF);
+                       if (target != null) {
+                               linkTarget = target;
+                               inLink = true;
+                               linkText = new StringBuilder();
+                               /* stays null when the link has no title attribute. */
+                               linkTitle = (String) attributeSet.getAttribute(HTML.Attribute.TITLE);
+                       }
+               }
+               if (inLink && tag.breaksFlow()) {
+                       linkText.append(' ');
+               }
+       }
+
+       /**
+        * Routes a chunk of text to the element that is currently open.
+        * <p>
+        * Inside a title the chunk replaces the stored title text; otherwise,
+        * inside a link, it is appended to the link text. Text outside of both
+        * is dropped.
+        *
+        * @param data
+        *            the characters of the text chunk
+        * @param pos
+        *            the position as supplied by the parser
+        */
+       @Override
+       public void handleText(char[] data, int pos) {
+               if (inTitle) {
+                       titleText = String.valueOf(data);
+                       return;
+               }
+               if (inLink) {
+                       linkText.append(data);
+               }
+       }
+
+       /**
+        * Reports a finished title or link to the parser listener.
+        * <p>
+        * Only elements whose start tag was actually recorded are reported. The
+        * end tag of an anchor without HREF attribute (e.g. a named anchor) and
+        * the end tag of a title whose start tag was ignored are skipped;
+        * otherwise they would report the data of a previous element again, or
+        * fail because no link text buffer exists yet.
+        *
+        * @param tag
+        *            the tag that was closed
+        * @param position
+        *            the position as supplied by the parser
+        */
+       @Override
+       public void handleEndTag(Tag tag, int position) {
+               if (tag == Tag.TITLE) {
+                       if (inTitle) {
+                               inTitle = false;
+                               parserListener.parsedTitle(inputStream, titleText);
+                               /* don’t let this title show up again for a later, empty title. */
+                               titleText = null;
+                       }
+               } else if (tag == Tag.A) {
+                       if (inLink) {
+                               inLink = false;
+                               parserListener.parsedLink(inputStream, linkTarget, linkTitle, linkText.toString());
+                       }
+               }
        }
 
 }
        }
 
 }