From: David ‘Bombe’ Roden Date: Fri, 13 Oct 2017 06:00:21 +0000 (+0200) Subject: Replace parser with Kotlin version X-Git-Tag: 0.9.8^2~31 X-Git-Url: https://git.pterodactylus.net/?a=commitdiff_plain;h=a45d1b5d50019622c4c6fce6ffa46b4e640b8e7f;p=Sone.git Replace parser with Kotlin version --- diff --git a/src/main/java/net/pterodactylus/sone/text/SoneTextParser.java b/src/main/java/net/pterodactylus/sone/text/SoneTextParser.java deleted file mode 100644 index 39d26db..0000000 --- a/src/main/java/net/pterodactylus/sone/text/SoneTextParser.java +++ /dev/null @@ -1,431 +0,0 @@ -/* - * Sone - SoneTextParser.java - Copyright © 2010–2016 David Roden - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */ - -package net.pterodactylus.sone.text; - -import static com.google.common.base.Optional.absent; -import static com.google.common.base.Optional.of; -import static java.util.logging.Logger.getLogger; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.Reader; -import java.io.StringReader; -import java.net.MalformedURLException; -import java.util.ArrayList; -import java.util.List; -import java.util.logging.Level; -import java.util.logging.Logger; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import javax.annotation.Nonnull; -import javax.annotation.Nullable; - -import net.pterodactylus.sone.data.Post; -import net.pterodactylus.sone.data.Sone; -import net.pterodactylus.sone.data.impl.IdOnlySone; -import net.pterodactylus.sone.database.PostProvider; -import net.pterodactylus.sone.database.SoneProvider; - -import com.google.common.base.Optional; -import org.bitpedia.util.Base32; - -import freenet.keys.FreenetURI; -import freenet.support.Base64; - -/** - * {@link Parser} implementation that can recognize Freenet URIs. - * - * @author David ‘Bombe’ Roden - */ -public class SoneTextParser implements Parser { - - /** The logger. */ - private static final Logger logger = getLogger(SoneTextParser.class.getName()); - - /** Pattern to detect whitespace. 
*/ - private static final Pattern whitespacePattern = Pattern.compile("[\\u000a\u0020\u00a0\u1680\u180e\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u200c\u200d\u202f\u205f\u2060\u2800\u3000]"); - - private static class NextLink { - - private final int position; - private final String link; - private final String remainder; - private final LinkType linkType; - - private NextLink(int position, String link, String remainder, LinkType linkType) { - this.position = position; - this.link = link; - this.remainder = remainder; - this.linkType = linkType; - } - - public int getPosition() { - return position; - } - - public String getLink() { - return link; - } - - public String getRemainder() { - return remainder; - } - - public LinkType getLinkType() { - return linkType; - } - - } - - /** - * Enumeration for all recognized link types. - * - * @author David ‘Bombe’ Roden - */ - private enum LinkType { - - KSK("KSK@", true), - CHK("CHK@", true), - SSK("SSK@", true), - USK("USK@", true), - HTTP("http://", false), - HTTPS("https://", false), - SONE("sone://", false), - POST("post://", false), - - FREEMAIL("", true) { - @Override - public Optional findNext(String line) { - int nextFreemailSuffix = line.indexOf(".freemail"); - if (nextFreemailSuffix < 54) { - /* 52 chars for the id, 1 on @, at least 1 for the local part. 
*/ - return absent(); - } - if (line.charAt(nextFreemailSuffix - 53) != '@') { - return absent(); - } - if (!line.substring(nextFreemailSuffix - 52, nextFreemailSuffix).matches("^[a-z2-7]*$")) { - return absent(); - } - int startOfLocalPart = nextFreemailSuffix - 54; - if (!isAllowedInLocalPart(line.charAt(startOfLocalPart))) { - return absent(); - } - while ((startOfLocalPart > 0) && isAllowedInLocalPart(line.charAt(startOfLocalPart - 1))) { - startOfLocalPart--; - } - return of(new NextLink(startOfLocalPart, line.substring(startOfLocalPart, nextFreemailSuffix + 9), line.substring(nextFreemailSuffix + 9), this)); - } - - private boolean isAllowedInLocalPart(char character) { - return ((character >= 'A') && (character <= 'Z')) - || ((character >= 'a') && (character <= 'z')) - || ((character >= '0') && (character <= '9')) - || (character == '.') || (character == '-') || (character == '_'); - } - }; - - private final String scheme; - private final boolean freenetLink; - - LinkType(String scheme, boolean freenetLink) { - this.scheme = scheme; - this.freenetLink = freenetLink; - } - - /** - * Returns the scheme of this link type. - * - * @return The scheme of this link type - */ - public String getScheme() { - return scheme; - } - - public boolean isFreenetLink() { - return freenetLink; - } - - public Optional findNext(String line) { - int nextLinkPosition = line.indexOf(getScheme()); - if (nextLinkPosition == -1) { - return absent(); - } - int endOfLink = findEndOfLink(line.substring(nextLinkPosition)); - return of(new NextLink(nextLinkPosition, line.substring(nextLinkPosition, nextLinkPosition + endOfLink), line.substring(nextLinkPosition + endOfLink), this)); - } - - private static int findEndOfLink(String line) { - Matcher matcher = whitespacePattern.matcher(line); - int endOfLink = matcher.find() ? 
matcher.start() : line.length(); - while (isPunctuation(line.charAt(endOfLink - 1))) { - endOfLink--; - } - int openParens = 0; - for (int i = 0; i < endOfLink; i++) { - switch (line.charAt(i)) { - case '(': - openParens++; - break; - case ')': - openParens--; - if (openParens < 0) { - return i; - } - default: - } - } - return endOfLink; - } - - } - - /** The Sone provider. */ - private final SoneProvider soneProvider; - - /** The post provider. */ - private final PostProvider postProvider; - - /** - * Creates a new freenet link parser. - * - * @param soneProvider - * The Sone provider - * @param postProvider - * The post provider - */ - public SoneTextParser(SoneProvider soneProvider, PostProvider postProvider) { - this.soneProvider = soneProvider; - this.postProvider = postProvider; - } - - // - // PART METHODS - // - - /** - * {@inheritDoc} - */ - @Nonnull - @Override - public Iterable parse(@Nonnull String source, @Nullable SoneTextParserContext context) { - List parts = new ArrayList<>(); - try (Reader sourceReader = new StringReader(source); - BufferedReader bufferedReader = new BufferedReader(sourceReader)) { - String line; - boolean lastLineEmpty = true; - int emptyLines = 0; - while ((line = bufferedReader.readLine()) != null) { - if (line.trim().length() == 0) { - if (lastLineEmpty) { - continue; - } - parts.add(new PlainTextPart("\n")); - ++emptyLines; - lastLineEmpty = emptyLines == 2; - continue; - } - emptyLines = 0; - /* - * lineComplete tracks whether the block you are parsing is the - * first block of the line. this is important because sometimes - * you have to add an additional line break. 
- */ - boolean lineComplete = true; - while (line.length() > 0) { - Optional nextLink = findNextLink(line); - if (!nextLink.isPresent()) { - if (lineComplete && !lastLineEmpty) { - parts.add(new PlainTextPart("\n" + line)); - } else { - parts.add(new PlainTextPart(line)); - } - break; - } - LinkType linkType = nextLink.get().getLinkType(); - int next = nextLink.get().getPosition(); - - /* cut off “freenet:” from before keys. */ - if (linkType.isFreenetLink() && (next >= 8) && (line.substring(next - 8, next).equals("freenet:"))) { - next -= 8; - line = line.substring(0, next) + line.substring(next + 8); - } - - /* if there is text before the next item, write it out. */ - if (lineComplete && !lastLineEmpty) { - parts.add(new PlainTextPart("\n")); - } - if (next > 0) { - parts.add(new PlainTextPart(line.substring(0, next))); - line = line.substring(next); - } - lineComplete = false; - - String link = nextLink.get().getLink(); - logger.log(Level.FINER, String.format("Found link: %s", link)); - - /* if there is no text after the scheme, it’s not a link! */ - if (link.equals(linkType.getScheme())) { - parts.add(new PlainTextPart(linkType.getScheme())); - line = line.substring(linkType.getScheme().length()); - continue; - } - - switch (linkType) { - case SONE: - renderSoneLink(parts, link); - break; - case POST: - renderPostLink(parts, link); - break; - case KSK: - case CHK: - case SSK: - case USK: - renderFreenetLink(parts, link, linkType, context); - break; - case HTTP: - case HTTPS: - renderHttpLink(parts, link, linkType); - break; - case FREEMAIL: - renderFreemailLink(parts, link); - } - - line = nextLink.get().getRemainder(); - } - lastLineEmpty = false; - } - } catch (IOException ioe1) { - // a buffered reader around a string reader should never throw. 
- throw new RuntimeException(ioe1); - } - for (int partIndex = parts.size() - 1; partIndex >= 0; --partIndex) { - Part part = parts.get(partIndex); - if (!(part instanceof PlainTextPart) || !"\n".equals(part.getText())) { - break; - } - parts.remove(partIndex); - } - return parts; - } - - public static Optional findNextLink(String line) { - int earliestLinkPosition = Integer.MAX_VALUE; - NextLink earliestNextLink = null; - for (LinkType possibleLinkType : LinkType.values()) { - Optional nextLink = possibleLinkType.findNext(line); - if (nextLink.isPresent()) { - if (nextLink.get().getPosition() < earliestLinkPosition) { - earliestNextLink = nextLink.get(); - earliestLinkPosition = earliestNextLink.getPosition(); - } - } - } - return Optional.fromNullable(earliestNextLink); - } - - private void renderSoneLink(List parts, String line) { - if (line.length() >= (7 + 43)) { - String soneId = line.substring(7, 50); - Optional sone = soneProvider.getSone(soneId); - parts.add(new SonePart(sone.or(new IdOnlySone(soneId)))); - } else { - parts.add(new PlainTextPart(line)); - } - } - - private void renderPostLink(List parts, String line) { - if (line.length() >= (7 + 36)) { - String postId = line.substring(7, 43); - Optional post = postProvider.getPost(postId); - if (post.isPresent()) { - parts.add(new PostPart(post.get())); - } else { - parts.add(new PlainTextPart(line.substring(0, 43))); - } - } else { - parts.add(new PlainTextPart(line)); - } - } - - private void renderFreenetLink(List parts, String link, LinkType linkType, @Nullable SoneTextParserContext context) { - String name = link; - String linkWithoutParameters = link; - if (name.indexOf('?') > -1) { - linkWithoutParameters = name = name.substring(0, name.indexOf('?')); - } - if (name.endsWith("/")) { - name = name.substring(0, name.length() - 1); - } - try { - FreenetURI uri = new FreenetURI(name); - name = uri.lastMetaString(); - if (name == null) { - name = uri.getDocName(); - } - if (name == null) { - name = 
link.substring(0, Math.min(9, link.length())); - } - boolean fromPostingSone = ((linkType == LinkType.SSK) || (linkType == LinkType.USK)) && (context != null) && (context.getPostingSone() != null) && link.substring(4, Math.min(link.length(), 47)).equals(context.getPostingSone().getId()); - parts.add(new FreenetLinkPart(link, name, linkWithoutParameters, fromPostingSone)); - } catch (MalformedURLException mue1) { - /* not a valid link, insert as plain text. */ - parts.add(new PlainTextPart(link)); - } catch (NullPointerException npe1) { - /* FreenetURI sometimes throws these, too. */ - parts.add(new PlainTextPart(link)); - } catch (ArrayIndexOutOfBoundsException aioobe1) { - /* oh, and these, too. */ - parts.add(new PlainTextPart(link)); - } - } - - private void renderHttpLink(List parts, String link, LinkType linkType) { - String name = link.substring(linkType == LinkType.HTTP ? 7 : 8); - int firstSlash = name.indexOf('/'); - int lastSlash = name.lastIndexOf('/'); - if ((lastSlash - firstSlash) > 3) { - name = name.substring(0, firstSlash + 1) + "…" + name.substring(lastSlash); - } - if (name.endsWith("/")) { - name = name.substring(0, name.length() - 1); - } - if (((name.indexOf('/') > -1) && (name.indexOf('.') < name.lastIndexOf('.', name.indexOf('/'))) || ((name.indexOf('/') == -1) && (name.indexOf('.') < name.lastIndexOf('.')))) && name.startsWith("www.")) { - name = name.substring(4); - } - if (name.indexOf('?') > -1) { - name = name.substring(0, name.indexOf('?')); - } - parts.add(new LinkPart(link, name)); - } - - private void renderFreemailLink(List parts, String line) { - int separator = line.indexOf('@'); - String freemailId = line.substring(separator + 1, separator + 53); - String identityId = Base64.encode(Base32.decode(freemailId)); - String emailLocalPart = line.substring(0, separator); - parts.add(new FreemailPart(emailLocalPart, freemailId, identityId)); - } - - private static boolean isPunctuation(char character) { - return (character == '.') || 
(character == ',') || (character == '!') || (character == '?'); - } - -} diff --git a/src/main/kotlin/net/pterodactylus/sone/text/SoneTextParser.kt b/src/main/kotlin/net/pterodactylus/sone/text/SoneTextParser.kt new file mode 100644 index 0000000..68a38ef --- /dev/null +++ b/src/main/kotlin/net/pterodactylus/sone/text/SoneTextParser.kt @@ -0,0 +1,197 @@ +package net.pterodactylus.sone.text + +import freenet.keys.FreenetURI +import freenet.support.Base64 +import net.pterodactylus.sone.data.Sone +import net.pterodactylus.sone.data.impl.IdOnlySone +import net.pterodactylus.sone.database.PostProvider +import net.pterodactylus.sone.database.SoneProvider +import net.pterodactylus.sone.text.LinkType.CHK +import net.pterodactylus.sone.text.LinkType.FREEMAIL +import net.pterodactylus.sone.text.LinkType.HTTP +import net.pterodactylus.sone.text.LinkType.HTTPS +import net.pterodactylus.sone.text.LinkType.KSK +import net.pterodactylus.sone.text.LinkType.POST +import net.pterodactylus.sone.text.LinkType.SONE +import net.pterodactylus.sone.text.LinkType.SSK +import net.pterodactylus.sone.text.LinkType.USK +import net.pterodactylus.sone.utils.let +import org.bitpedia.util.Base32 +import java.net.MalformedURLException + +/** + * [Parser] implementation that can recognize Freenet URIs. + */ +class SoneTextParser(private val soneProvider: SoneProvider?, private val postProvider: PostProvider?) { + + fun parse(source: String, context: SoneTextParserContext?) = + source.split("\n") + .dropWhile { it.trim() == "" } + .dropLastWhile { it.trim() == "" } + .mergeMultipleEmptyLines() + .flatMap { splitLineIntoParts(it, context) } + .removeEmptyPlainTextParts() + .mergeAdjacentPlainTextParts() + + private fun splitLineIntoParts(line: String, context: SoneTextParserContext?) 
= + generateSequence(PlainTextPart("") as Part to line) { remainder -> + if (remainder.second == "") + null + else + LinkType.values() + .mapNotNull { it.findNext(remainder.second) } + .minBy { it.position } + .let { + when { + it == null -> PlainTextPart(remainder.second) to "" + it.position == 0 -> it.toPart(context) to it.remainder + else -> PlainTextPart(remainder.second.substring(0, it.position)) to (it.link + it.remainder) + } + } + }.map { it.first }.toList() + + private fun NextLink.toPart(context: SoneTextParserContext?) = when (linkType) { + KSK, CHK -> try { + FreenetURI(link).let { freenetUri -> + FreenetLinkPart( + link, + if (freenetUri.isKSK) { + freenetUri.guessableKey + } else { + freenetUri.metaString ?: freenetUri.docName ?: link.substring(0, 9) + }, + link.split('?').first() + ) + } + } catch (e: MalformedURLException) { + PlainTextPart(link) + } + SSK, USK -> + try { + FreenetLinkPart(link, FreenetURI(link).docName, trusted = context?.routingKey?.contentEquals(FreenetURI(link).routingKey) == true) + } catch (e: MalformedURLException) { + PlainTextPart(link) + } + SONE -> link.substring(7).let { SonePart(soneProvider?.getSone(it)?.orNull() ?: IdOnlySone(it)) } + POST -> postProvider?.getPost(link.substring(7))?.let { PostPart(it) } ?: PlainTextPart(link) + FREEMAIL -> link.indexOf('@').let { atSign -> + link.substring(atSign + 1, link.length - 9).let { freemailId -> + FreemailPart(link.substring(0, atSign), freemailId, freemailId.decodedId) + } + } + HTTP, HTTPS -> LinkPart(link, link + .withoutProtocol + .withoutWwwPrefix + .withoutUrlParameters + .withoutMiddlePathComponents + .withoutTrailingSlash) + } + +} + +private fun List.mergeMultipleEmptyLines() = fold(emptyList()) { previous, current -> + if (previous.isEmpty()) { + previous + current + } else { + if ((previous.last() == "\n") && (current == "")) { + previous + } else { + previous + ("\n" + current) + } + } +} + +private fun List.mergeAdjacentPlainTextParts() = fold(emptyList()) { 
parts, part -> + if ((parts.lastOrNull() is PlainTextPart) && (part is PlainTextPart)) { + parts.dropLast(1) + PlainTextPart(parts.last().text + part.text) + } else { + parts + part + } +} + +private fun List.removeEmptyPlainTextParts() = filterNot { it == PlainTextPart("") } + +private val String.decodedId: String get() = Base64.encode(Base32.decode(this)) +private val String.withoutProtocol get() = substring(indexOf("//") + 2) +private val String.withoutUrlParameters get() = split('?').first() + +private val String.withoutWwwPrefix + get() = split("/") + .replaceFirst { it.split(".").dropWhile { it == "www" }.joinToString(".") } + .joinToString("/") + +private fun List.replaceFirst(replacement: (T) -> T) = mapIndexed { index, element -> + if (index == 0) replacement(element) else element +} + +private val String.withoutMiddlePathComponents + get() = split("/").let { + if (it.size > 2) { + "${it.first()}/…/${it.last()}" + } else { + it.joinToString("/") + } + } +private val String.withoutTrailingSlash get() = if (endsWith("/")) substring(0, length - 1) else this +private val SoneTextParserContext.routingKey: ByteArray? get() = postingSone?.routingKey +private val Sone.routingKey: ByteArray get() = Base64.decode(id) + +private enum class LinkType(private val scheme: String, private val freenetLink: Boolean) { + + KSK("KSK@", true), + CHK("CHK@", true), + SSK("SSK@", true), + USK("USK@", true), + HTTP("http://", false), + HTTPS("https://", false), + SONE("sone://", false) { + override fun validateLinkLength(length: Int) = length.takeIf { it == 50 } + }, + POST("post://", false), + FREEMAIL("", true) { + override fun findNext(line: String): NextLink? 
{ + val nextFreemailSuffix = line.indexOf(".freemail").takeIf { it >= 54 } ?: return null + if (line[nextFreemailSuffix - 53] != '@') return null + if (!line.substring(nextFreemailSuffix - 52, nextFreemailSuffix).matches(Regex("^[a-z2-7]*\$"))) return null + val firstCharacterIndex = generateSequence(nextFreemailSuffix - 53) { + it.minus(1).takeIf { (it >= 0) && line[it].validLocalPart } + }.lastOrNull() ?: return null + return NextLink(firstCharacterIndex, this, line.substring(firstCharacterIndex, nextFreemailSuffix + 9), line.substring(nextFreemailSuffix + 9)) + } + + private val Char.validLocalPart get() = (this in ('A'..'Z')) || (this in ('a'..'z')) || (this in ('0'..'9')) || (this == '-') || (this == '_') || (this == '.') + }; + + open fun findNext(line: String): NextLink? { + val nextLinkPosition = line.indexOf(scheme).takeIf { it != -1 } ?: return null + val endOfLink = line.substring(nextLinkPosition).findEndOfLink().validate() ?: return null + val link = line.substring(nextLinkPosition, nextLinkPosition + endOfLink) + val realNextLinkPosition = if (freenetLink && line.substring(0, nextLinkPosition).endsWith("freenet:")) nextLinkPosition - 8 else nextLinkPosition + return NextLink(realNextLinkPosition, this, link, line.substring(nextLinkPosition + endOfLink)) + } + + private fun String.findEndOfLink() = + substring(0, whitespace.find(this)?.range?.start ?: length) + .dropLastWhile(::isPunctuation) + .upToFirstUnmatchedParen() + + private fun Int.validate() = validateLinkLength(this) + protected open fun validateLinkLength(length: Int) = length.takeIf { it > scheme.length } + + private fun String.upToFirstUnmatchedParen() = + foldIndexed(Pair(0, null)) { index, (openParens, firstUnmatchedParen), currentChar -> + when (currentChar) { + '(' -> (openParens + 1) to firstUnmatchedParen + ')' -> ((openParens - 1) to (if (openParens == 0) (firstUnmatchedParen ?: index) else firstUnmatchedParen)) + else -> openParens to firstUnmatchedParen + } + }.second ?: length + 
+} + +private val punctuationChars = listOf('.', ',', '?', '!') +private fun isPunctuation(char: Char) = char in punctuationChars + +private val whitespace = Regex("[\\u000a\u0020\u00a0\u1680\u180e\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u200c\u200d\u202f\u205f\u2060\u2800\u3000]") + +private data class NextLink(val position: Int, val linkType: LinkType, val link: String, val remainder: String)