Rework the text parser.
[Sone.git] / src / main / java / net / pterodactylus / sone / text / SoneTextParser.java
1 /*
2  * Sone - SoneTextParser.java - Copyright © 2010–2013 David Roden
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
16  */
17
18 package net.pterodactylus.sone.text;
19
20 import static com.google.common.base.Optional.absent;
21 import static com.google.common.base.Optional.of;
22 import static com.google.common.collect.FluentIterable.from;
23 import static com.google.common.collect.Lists.newArrayList;
24
25 import java.io.BufferedReader;
26 import java.io.IOException;
27 import java.io.Reader;
28 import java.net.MalformedURLException;
29 import java.util.Comparator;
30 import java.util.EnumMap;
31 import java.util.List;
32 import java.util.Map;
33 import java.util.Map.Entry;
34 import java.util.logging.Logger;
35 import java.util.regex.Matcher;
36 import java.util.regex.Pattern;
37
38 import net.pterodactylus.sone.data.Post;
39 import net.pterodactylus.sone.data.Sone;
40 import net.pterodactylus.sone.database.Database;
41 import net.pterodactylus.util.io.Closer;
42 import net.pterodactylus.util.logging.Logging;
43
44 import com.google.common.base.Optional;
45 import com.google.common.collect.ImmutableMap;
46 import freenet.keys.FreenetURI;
47
48 /**
49  * {@link Parser} implementation that can recognize Freenet URIs.
50  *
51  * @author <a href="mailto:bombe@pterodactylus.net">David ‘Bombe’ Roden</a>
52  */
53 public class SoneTextParser implements Parser<SoneTextParserContext> {
54
55         /** The logger. */
56         private static final Logger logger = Logging.getLogger(SoneTextParser.class);
57
58         /** Pattern to detect whitespace. */
59         private static final Pattern whitespacePattern = Pattern.compile("[\\u000a\u0020\u00a0\u1680\u180e\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u200c\u200d\u202f\u205f\u2060\u2800\u3000]");
60
61         /**
62          * Enumeration for all recognized link types.
63          *
64          * @author <a href="mailto:bombe@pterodactylus.net">David ‘Bombe’ Roden</a>
65          */
66         private enum LinkType {
67
68                 KSK("KSK@"),
69                 CHK("CHK@"),
70                 SSK("SSK@") {
71                         @Override
72                         public boolean isSigned() {
73                                 return true;
74                         }
75                 },
76                 USK("USK@") {
77                         @Override
78                         public boolean isSigned() {
79                                 return true;
80                         }
81                 },
82                 HTTP("http://"),
83                 HTTPS("https://"),
84                 SONE("sone://"),
85                 POST("post://");
86
87                 private final String scheme;
88
89                 private LinkType(String scheme) {
90                         this.scheme = scheme;
91                 }
92
93                 public String getScheme() {
94                         return scheme;
95                 }
96
97                 public boolean isSigned() {
98                         return false;
99                 }
100
101         }
102
103         private final PartCreators partCreators = new PartCreators();
104         private final Database database;
105
106         /**
107          * Creates a new freenet link parser.
108          *
109          * @param database
110          */
111         public SoneTextParser(Database database) {
112                 this.database = database;
113         }
114
115         //
116         // PART METHODS
117         //
118
119         @Override
120         public Iterable<Part> parse(SoneTextParserContext context, Reader source) throws IOException {
121                 PartContainer parts = new PartContainer();
122                 BufferedReader bufferedReader = (source instanceof BufferedReader) ? (BufferedReader) source : new BufferedReader(source);
123                 try {
124                         String line;
125                         while ((line = bufferedReader.readLine()) != null) {
126                                 for (String pieceOfLine : splitLine(line)) {
127                                         parts.add(createPart(pieceOfLine, context));
128                                 }
129                         }
130                 } finally {
131                         if (bufferedReader != source) {
132                                 Closer.close(bufferedReader);
133                         }
134                 }
135                 removeTrailingWhitespaceParts(parts);
136                 return optimizeParts(parts);
137         }
138
139         private Iterable<Part> optimizeParts(PartContainer partContainer) {
140                 PartContainer parts = new PartContainer();
141                 boolean firstPart = true;
142                 Part lastPart = null;
143                 int emptyLines = 0;
144                 for (Part part : partContainer) {
145                         if (firstPart) {
146                                 if ("\n".equals(part.getText())) {
147                                         continue;
148                                 }
149                                 firstPart = false;
150                         }
151                         if ("\n".equals(part.getText())) {
152                                 emptyLines++;
153                                 if (emptyLines > 2) {
154                                         continue;
155                                 }
156                         } else {
157                                 emptyLines = 0;
158                         }
159                         if ((lastPart != null) && lastPart.isPlainText() && part.isPlainText()) {
160                                 parts.removePart(parts.size() - 1);
161                                 PlainTextPart combinedPart = new PlainTextPart(lastPart.getText() + part.getText());
162                                 parts.add(combinedPart);
163                                 lastPart = combinedPart;
164                         } else if ((lastPart != null) && part.isFreenetLink() && lastPart.isPlainText() && lastPart.getText().endsWith("freenet:")) {
165                                 parts.removePart(parts.size() - 1);
166                                 String lastPartText = lastPart.getText();
167                                 lastPartText = lastPartText.substring(0, lastPartText.length() - "freenet:".length());
168                                 if (lastPartText.length() > 0) {
169                                         parts.add(new PlainTextPart(lastPartText));
170                                 }
171                                 lastPart = part;
172                                 parts.add(part);
173                         } else {
174                                 lastPart = part;
175                                 parts.add(part);
176                         }
177                 }
178                 return parts;
179         }
180
181         private Part createPart(String line, SoneTextParserContext context) {
182                 Optional<Part> linkPart = createLinkPart(line, context);
183                 return linkPart.or(new PlainTextPart(line));
184         }
185
186         private Optional<Part> createLinkPart(String line, SoneTextParserContext context) {
187                 Optional<NextLink> nextLink = findNextLink(line);
188                 if (!nextLink.isPresent()) {
189                         return absent();
190                 }
191                 return partCreators.createPart(nextLink.get().getLinkType(), line, context);
192         }
193
194         private List<String> splitLine(String line) {
195                 List<String> linePieces = newArrayList();
196                 int currentIndex = 0;
197                 while (currentIndex < line.length()) {
198                         Optional<NextLink> nextLink = findNextLink(line.substring(currentIndex));
199                         if (!nextLink.isPresent()) {
200                                 linePieces.add(line.substring(currentIndex));
201                                 break;
202                         }
203                         int nextIndex = currentIndex + nextLink.get().getNextIndex();
204                         if (nextIndex > currentIndex) {
205                                 linePieces.add(line.substring(currentIndex, nextIndex));
206                         }
207                         int nextWhitespace = nextIndex + findNextWhitespaceOrEndOfLine(line.substring(nextIndex));
208                         linePieces.add(line.substring(nextIndex, nextWhitespace));
209                         currentIndex = nextWhitespace;
210                 }
211                 linePieces.add("\n");
212                 return linePieces;
213         }
214
215         private void removeTrailingWhitespaceParts(PartContainer parts) {
216                 for (int partIndex = parts.size() - 1; partIndex >= 0; --partIndex) {
217                         Part part = parts.getPart(partIndex);
218                         if (!(part instanceof PlainTextPart) || !"\n".equals(part.getText())) {
219                                 break;
220                         }
221                         parts.removePart(partIndex);
222                 }
223         }
224
225         private boolean linkMatchesPostingSone(SoneTextParserContext context, String link) {
226                 return (context != null) && (context.getPostingSone() != null) && link.substring(4, Math.min(link.length(), 47)).equals(context.getPostingSone().getId());
227         }
228
229         private boolean lineIsLongEnoughToContainAPostLink(String line) {
230                 return line.length() >= (7 + 36);
231         }
232
233         private static boolean lineIsLongEnoughToContainASoneLink(String line) {
234                 return line.length() >= (7 + 43);
235         }
236
237         private int findNextWhitespaceOrEndOfLine(String line) {
238                 Matcher matcher = whitespacePattern.matcher(line);
239                 return matcher.find(0) ? matcher.start() : line.length();
240         }
241
242         private Optional<NextLink> findNextLink(String line) {
243                 EnumMap<LinkType, Integer> linkTypeIndexes = new EnumMap<LinkType, Integer>(LinkType.class);
244                 for (LinkType linkType : LinkType.values()) {
245                         int index = line.indexOf(linkType.getScheme());
246                         if (index != -1) {
247                                 linkTypeIndexes.put(linkType, index);
248                         }
249                 }
250                 if (linkTypeIndexes.isEmpty()) {
251                         return absent();
252                 }
253                 Entry<LinkType, Integer> smallestEntry = from(linkTypeIndexes.entrySet()).toSortedList(locateSmallestIndex()).get(0);
254                 return of(new NextLink(smallestEntry.getValue(), smallestEntry.getKey()));
255         }
256
257         private Comparator<Entry<LinkType, Integer>> locateSmallestIndex() {
258                 return new Comparator<Entry<LinkType, Integer>>() {
259                         @Override
260                         public int compare(Entry<LinkType, Integer> leftEntry, Entry<LinkType, Integer> rightEntry) {
261                                 return leftEntry.getValue() - rightEntry.getValue();
262                         }
263                 };
264         }
265
266         private class PartCreators {
267
268                 private final Map<LinkType, PartCreator> partCreators = ImmutableMap.<LinkType, PartCreator>builder()
269                                 .put(LinkType.SONE, new SonePartCreator())
270                                 .put(LinkType.POST, new PostPartCreator())
271                                 .put(LinkType.KSK, new FreenetLinkPartCreator(LinkType.KSK))
272                                 .put(LinkType.CHK, new FreenetLinkPartCreator(LinkType.CHK))
273                                 .put(LinkType.SSK, new FreenetLinkPartCreator(LinkType.SSK))
274                                 .put(LinkType.USK, new FreenetLinkPartCreator(LinkType.USK))
275                                 .put(LinkType.HTTP, new InternetLinkPartCreator(LinkType.HTTP))
276                                 .put(LinkType.HTTPS, new InternetLinkPartCreator(LinkType.HTTPS))
277                                 .build();
278
279                 public Optional<Part> createPart(LinkType linkType, String line, SoneTextParserContext context) {
280                         if (line.equals(linkType.getScheme())) {
281                                 return of((Part) new PlainTextPart(line));
282                         }
283                         return partCreators.get(linkType).createPart(line, context);
284                 }
285
286         }
287
288         private class SonePartCreator implements PartCreator {
289
290                 @Override
291                 public Optional<Part> createPart(String line, SoneTextParserContext context) {
292                         if (!lineIsLongEnoughToContainASoneLink(line)) {
293                                 return absent();
294                         }
295                         String soneId = line.substring(7, 50);
296                         Optional<Sone> sone = database.getSone(soneId);
297                         if (!sone.isPresent()) {
298                                 return absent();
299                         }
300                         return Optional.<Part>of(new SonePart(sone.get()));
301                 }
302
303         }
304
305         private class PostPartCreator implements PartCreator {
306
307                 @Override
308                 public Optional<Part> createPart(String line, SoneTextParserContext context) {
309                         if (!lineIsLongEnoughToContainAPostLink(line)) {
310                                 return absent();
311                         }
312                         String postId = line.substring(7, 43);
313                         Optional<Post> post = database.getPost(postId);
314                         if (!post.isPresent()) {
315                                 return absent();
316                         }
317                         return Optional.<Part>of(new PostPart(post.get()));
318                 }
319
320         }
321
322         private class FreenetLinkPartCreator implements PartCreator {
323
324                 private final LinkType linkType;
325
326                 protected FreenetLinkPartCreator(LinkType linkType) {
327                         this.linkType = linkType;
328                 }
329
330                 @Override
331                 public Optional<Part> createPart(String link, SoneTextParserContext context) {
332                         String name = link;
333                         if (name.indexOf('?') > -1) {
334                                 name = name.substring(0, name.indexOf('?'));
335                         }
336                         if (name.endsWith("/")) {
337                                 name = name.substring(0, name.length() - 1);
338                         }
339                         try {
340                                 FreenetURI uri = new FreenetURI(name);
341                                 name = uri.lastMetaString();
342                                 if (name == null) {
343                                         name = uri.getDocName();
344                                 }
345                                 if (name == null) {
346                                         name = link.substring(0, Math.min(9, link.length()));
347                                 }
348                                 boolean fromPostingSone = linkType.isSigned() && linkMatchesPostingSone(context, link);
349                                 return Optional.<Part>of(new FreenetLinkPart(link, name, fromPostingSone));
350                         } catch (MalformedURLException mue1) {
351                                 /* ignore. */
352                         } catch (NullPointerException npe1) {
353                                 /* ignore. */
354                         } catch (ArrayIndexOutOfBoundsException aioobe1) {
355                                 /* ignore. */
356                         }
357                         return absent();
358                 }
359
360         }
361
362         private class InternetLinkPartCreator implements PartCreator {
363
364                 private final LinkType linkType;
365
366                 private InternetLinkPartCreator(LinkType linkType) {
367                         this.linkType = linkType;
368                 }
369
370                 @Override
371                 public Optional<Part> createPart(String link, SoneTextParserContext context) {
372                         String name = link;
373                         name = link.substring(linkType.getScheme().length());
374                         int firstSlash = name.indexOf('/');
375                         int lastSlash = name.lastIndexOf('/');
376                         if ((lastSlash - firstSlash) > 3) {
377                                 name = name.substring(0, firstSlash + 1) + "…" + name.substring(lastSlash);
378                         }
379                         if (name.endsWith("/")) {
380                                 name = name.substring(0, name.length() - 1);
381                         }
382                         if (((name.indexOf('/') > -1) && (name.indexOf('.') < name.lastIndexOf('.', name.indexOf('/'))) || ((name.indexOf('/') == -1) && (name.indexOf('.') < name.lastIndexOf('.')))) && name.startsWith("www.")) {
383                                 name = name.substring(4);
384                         }
385                         if (name.indexOf('?') > -1) {
386                                 name = name.substring(0, name.indexOf('?'));
387                         }
388                         return Optional.<Part>of(new LinkPart(link, name));
389                 }
390
391         }
392
393         private interface PartCreator {
394
395                 Optional<Part> createPart(String line, SoneTextParserContext context);
396
397         }
398
399         /**
400          * Container for position and type of the next link in a line.
401          *
402          * @author <a href="mailto:bombe@pterodactylus.net">David ‘Bombe’ Roden</a>
403          */
404         private static class NextLink {
405
406                 private final int nextIndex;
407                 private final LinkType linkType;
408
409                 private NextLink(int nextIndex, LinkType linkType) {
410                         this.nextIndex = nextIndex;
411                         this.linkType = linkType;
412                 }
413
414                 private int getNextIndex() {
415                         return nextIndex;
416                 }
417
418                 private LinkType getLinkType() {
419                         return linkType;
420                 }
421
422         }
423
424 }