From: David ‘Bombe’ Roden Date: Mon, 9 Mar 2009 20:24:39 +0000 (+0100) Subject: Basic implementation of HTML parser. X-Git-Url: https://git.pterodactylus.net/?a=commitdiff_plain;h=80488ec32fe623903f8717a969af2c2089974c57;p=arachne.git Basic implementation of HTML parser. --- diff --git a/src/net/pterodactylus/arachne/parser/HtmlEditorKitParser.java b/src/net/pterodactylus/arachne/parser/HtmlEditorKitParser.java index 45ba774..47e2d52 100644 --- a/src/net/pterodactylus/arachne/parser/HtmlEditorKitParser.java +++ b/src/net/pterodactylus/arachne/parser/HtmlEditorKitParser.java @@ -3,16 +3,51 @@ */ package net.pterodactylus.arachne.parser; +import java.io.BufferedReader; +import java.io.IOException; import java.io.InputStream; +import java.io.InputStreamReader; +import javax.swing.text.MutableAttributeSet; +import javax.swing.text.html.HTML; import javax.swing.text.html.HTMLEditorKit; +import javax.swing.text.html.HTML.Tag; +import javax.swing.text.html.HTMLEditorKit.ParserCallback; +import javax.swing.text.html.parser.ParserDelegator; + +import de.ina.util.io.Closer; /** - * {@link Parser} implementation based on Swing’s {@link HTMLEditorKit}. + * {@link Parser} implementation based on Swing’s {@link HTMLEditorKit}. This + * parser can be re-used but is not thread-safe! * * @author David ‘Bombe’ Roden */ -public class HtmlEditorKitParser implements Parser { +public class HtmlEditorKitParser extends HTMLEditorKit.ParserCallback implements Parser { + + /** The parser listener. */ + private ParserListener parserListener; + + /** The current input stream. */ + private InputStream inputStream; + + /** Whether we’re currently parsing the title. */ + private boolean inTitle; + + /** The title text. */ + private String titleText; + + /** Whether we’re currently parsing a link. */ + private boolean inLink; + + /** The target of the link. */ + private String linkTarget; + + /** The title attribute of the link. */ + private String linkTitle; + + /** The text of the link. */ + private StringBuilder linkText; /** * {@inheritdoc} @@ -21,6 +56,77 @@ public class HtmlEditorKitParser implements Parser { * InputStream, String) */ public void parse(ParserListener parserListener, InputStream inputStream, String charset) throws IOException { + this.parserListener = parserListener; + this.inputStream = inputStream; + InputStreamReader inputStreamReader = null; + BufferedReader htmlReader = null; + try { + inputStreamReader = new InputStreamReader(inputStream, charset); + htmlReader = new BufferedReader(inputStreamReader); + new ParserDelegator().parse(htmlReader, this, true); + } finally { + Closer.close(htmlReader); + Closer.close(inputStreamReader); + } + } + + // + // METHODS FROM ParserCallback + // + + /** + * {@inheritDoc} + */ + @Override + public void handleStartTag(Tag tag, MutableAttributeSet attributeSet, int position) { + if (tag == Tag.TITLE) { + if (!attributeSet.containsAttribute(ParserCallback.IMPLIED, Boolean.TRUE)) { + inTitle = true; + } + } else if (tag == Tag.A) { + String href = (String) attributeSet.getAttribute(HTML.Attribute.HREF); + if (href != null) { + linkTarget = href; + inLink = true; + linkText = new StringBuilder(); + linkTitle = null; + String title = (String) attributeSet.getAttribute(HTML.Attribute.TITLE); + if (title != null) { + linkTitle = title; + } + } + } + if (inLink) { + if (tag.breaksFlow()) { + linkText.append(' '); + } + } + } + + /** + * {@inheritDoc} + */ + @Override + public void handleText(char[] data, int pos) { + if (inTitle) { + titleText = new String(data); + } else if (inLink) { + linkText.append(data); + } + } + + /** + * {@inheritDoc} + */ + @Override + public void handleEndTag(Tag tag, int position) { + if (tag == Tag.TITLE) { + inTitle = false; + parserListener.parsedTitle(inputStream, titleText); + } else if (tag == Tag.A) { + inLink = false; + parserListener.parsedLink(inputStream, linkTarget, linkTitle, linkText.toString()); + } } }