Basic implementation of HTML parser.
[arachne.git] / src / net / pterodactylus / arachne / parser / HtmlEditorKitParser.java
1 /*
2  * © 2009 David ‘Bombe’ Roden
3  */
4 package net.pterodactylus.arachne.parser;
5
6 import java.io.BufferedReader;
7 import java.io.IOException;
8 import java.io.InputStream;
9 import java.io.InputStreamReader;
10
11 import javax.swing.text.MutableAttributeSet;
12 import javax.swing.text.html.HTML;
13 import javax.swing.text.html.HTMLEditorKit;
14 import javax.swing.text.html.HTML.Tag;
15 import javax.swing.text.html.HTMLEditorKit.ParserCallback;
16 import javax.swing.text.html.parser.ParserDelegator;
17
18 import de.ina.util.io.Closer;
19
20 /**
21  * {@link Parser} implementation based on Swing’s {@link HTMLEditorKit}. This
22  * parser can be re-used but is not thread-safe!
23  *
24  * @author David ‘Bombe’ Roden <bombe@pterodactylus.net>
25  */
26 public class HtmlEditorKitParser extends HTMLEditorKit.ParserCallback implements Parser {
27
28         /** The parser listener. */
29         private ParserListener parserListener;
30
31         /** The current input stream. */
32         private InputStream inputStream;
33
34         /** Whether we’re currently parsing the title. */
35         private boolean inTitle;
36
37         /** The title text. */
38         private String titleText;
39
40         /** Whether we’re currently parsing a link. */
41         private boolean inLink;
42
43         /** The target of the link. */
44         private String linkTarget;
45
46         /** The title attribute of the link. */
47         private String linkTitle;
48
49         /** The text of the link. */
50         private StringBuilder linkText;
51
52         /**
53          * {@inheritdoc}
54          *
55          * @see net.pterodactylus.arachne.parser.Parser#parse(ParserListener,
56          *      InputStream, String)
57          */
58         public void parse(ParserListener parserListener, InputStream inputStream, String charset) throws IOException {
59                 this.parserListener = parserListener;
60                 this.inputStream = inputStream;
61                 InputStreamReader inputStreamReader = null;
62                 BufferedReader htmlReader = null;
63                 try {
64                         inputStreamReader = new InputStreamReader(inputStream, charset);
65                         htmlReader = new BufferedReader(inputStreamReader);
66                         new ParserDelegator().parse(htmlReader, this, true);
67                 } finally {
68                         Closer.close(htmlReader);
69                         Closer.close(inputStreamReader);
70                 }
71         }
72
73         //
74         // METHODS FROM ParserCallback
75         //
76
77         /**
78          * {@inheritDoc}
79          */
80         @Override
81         public void handleStartTag(Tag tag, MutableAttributeSet attributeSet, int position) {
82                 if (tag == Tag.TITLE) {
83                         if (!attributeSet.containsAttribute(ParserCallback.IMPLIED, Boolean.TRUE)) {
84                                 inTitle = true;
85                         }
86                 } else if (tag == Tag.A) {
87                         String href = (String) attributeSet.getAttribute(HTML.Attribute.HREF);
88                         if (href != null) {
89                                 linkTarget = href;
90                                 inLink = true;
91                                 linkText = new StringBuilder();
92                                 linkTitle = null;
93                                 String title = (String) attributeSet.getAttribute(HTML.Attribute.TITLE);
94                                 if (title != null) {
95                                         linkTitle = title;
96                                 }
97                         }
98                 }
99                 if (inLink) {
100                         if (tag.breaksFlow()) {
101                                 linkText.append(' ');
102                         }
103                 }
104         }
105
106         /**
107          * {@inheritDoc}
108          */
109         @Override
110         public void handleText(char[] data, int pos) {
111                 if (inTitle) {
112                         titleText = new String(data);
113                 } else if (inLink) {
114                         linkText.append(data);
115                 }
116         }
117
118         /**
119          * {@inheritDoc}
120          */
121         @Override
122         public void handleEndTag(Tag tag, int position) {
123                 if (tag == Tag.TITLE) {
124                         inTitle = false;
125                         parserListener.parsedTitle(inputStream, titleText);
126                 } else if (tag == Tag.A) {
127                         inLink = false;
128                         parserListener.parsedLink(inputStream, linkTarget, linkTitle, linkText.toString());
129                 }
130         }
131
132 }