// Reconstructed from the git blobdiff of
// src/main/kotlin/net/pterodactylus/sone/text/SoneTextParser.kt (new file, blob 68a38ef).
package net.pterodactylus.sone.text

import freenet.keys.FreenetURI
import freenet.support.Base64
import net.pterodactylus.sone.data.Sone
import net.pterodactylus.sone.data.impl.IdOnlySone
import net.pterodactylus.sone.database.PostProvider
import net.pterodactylus.sone.database.SoneProvider
import net.pterodactylus.sone.text.LinkType.CHK
import net.pterodactylus.sone.text.LinkType.FREEMAIL
import net.pterodactylus.sone.text.LinkType.HTTP
import net.pterodactylus.sone.text.LinkType.HTTPS
import net.pterodactylus.sone.text.LinkType.KSK
import net.pterodactylus.sone.text.LinkType.POST
import net.pterodactylus.sone.text.LinkType.SONE
import net.pterodactylus.sone.text.LinkType.SSK
import net.pterodactylus.sone.text.LinkType.USK
import net.pterodactylus.sone.utils.let
import org.bitpedia.util.Base32
import java.net.MalformedURLException

/**
 * Parser that splits text into [Part]s. It recognizes Freenet URIs (KSK, CHK, SSK, USK),
 * HTTP(S) links, `sone://` and `post://` references, and Freemail addresses; everything
 * else is returned as plain text.
 *
 * @param soneProvider used to resolve `sone://` links; if `null`, an [IdOnlySone] is used instead
 * @param postProvider used to resolve `post://` links; if `null`, such links stay plain text
 */
class SoneTextParser(private val soneProvider: SoneProvider?, private val postProvider: PostProvider?) {

    /**
     * Parses [source] into a list of parts.
     *
     * Blank lines at the start and end are dropped, runs of empty lines are collapsed into one,
     * each remaining line is split into link and plain-text parts, and finally empty plain-text
     * parts are removed and adjacent plain-text parts are merged.
     *
     * @param source the text to parse
     * @param context parser context; its posting Sone decides whether SSK/USK links are trusted
     * @return the parsed parts, in source order
     */
    fun parse(source: String, context: SoneTextParserContext?): List<Part> =
        source.split("\n")
            .dropWhile { it.trim() == "" }
            .dropLastWhile { it.trim() == "" }
            .mergeMultipleEmptyLines()
            .flatMap { splitLineIntoParts(it, context) }
            .removeEmptyPlainTextParts()
            .mergeAdjacentPlainTextParts()

    /**
     * Splits a single line into parts by repeatedly locating the earliest link of any [LinkType].
     *
     * The sequence state is a pair of "part produced by this step" and "text still to process".
     * The seed contributes an empty plain-text part which [parse] filters out again.
     */
    private fun splitLineIntoParts(line: String, context: SoneTextParserContext?): List<Part> =
        generateSequence(PlainTextPart("") as Part to line) { state ->
            val text = state.second
            if (text == "") {
                null
            } else {
                // Stable sort + first element picks the first link with the smallest position,
                // without depending on the null-on-empty contract of minBy.
                val nextLink = LinkType.values()
                    .mapNotNull { it.findNext(text) }
                    .sortedBy { it.position }
                    .firstOrNull()
                when {
                    nextLink == null -> PlainTextPart(text) to ""
                    nextLink.position == 0 -> nextLink.toPart(context) to nextLink.remainder
                    // Emit the text in front of the link; the link itself (without any "freenet:"
                    // prefix) is handled at position 0 in the next step.
                    else -> PlainTextPart(text.substring(0, nextLink.position)) to (nextLink.link + nextLink.remainder)
                }
            }
        }.map { it.first }.toList()

    /**
     * Converts a located link into the matching [Part]. Freenet links that [FreenetURI] rejects
     * with a [MalformedURLException] are returned as plain text.
     */
    private fun NextLink.toPart(context: SoneTextParserContext?): Part = when (linkType) {
        KSK, CHK -> try {
            val freenetUri = FreenetURI(link)
            val text = if (freenetUri.isKSK) {
                freenetUri.guessableKey
            } else {
                freenetUri.metaString ?: freenetUri.docName ?: link.substring(0, 9)
            }
            FreenetLinkPart(link, text, link.split('?').first())
        } catch (e: MalformedURLException) {
            PlainTextPart(link)
        }
        SSK, USK -> try {
            // Parse once and reuse the result for both the document name and the routing key.
            val freenetUri = FreenetURI(link)
            val trusted = context?.routingKey?.contentEquals(freenetUri.routingKey) == true
            FreenetLinkPart(link, freenetUri.docName, trusted = trusted)
        } catch (e: MalformedURLException) {
            PlainTextPart(link)
        }
        SONE -> {
            // 7 == "sone://".length
            val soneId = link.substring(7)
            SonePart(soneProvider?.getSone(soneId)?.orNull() ?: IdOnlySone(soneId))
        }
        // 7 == "post://".length
        POST -> postProvider?.getPost(link.substring(7))?.let { PostPart(it) } ?: PlainTextPart(link)
        FREEMAIL -> {
            val atSign = link.indexOf('@')
            // 9 == ".freemail".length
            val freemailId = link.substring(atSign + 1, link.length - 9)
            FreemailPart(link.substring(0, atSign), freemailId, freemailId.decodedId)
        }
        HTTP, HTTPS -> LinkPart(link, link
            .withoutProtocol
            .withoutWwwPrefix
            .withoutUrlParameters
            .withoutMiddlePathComponents
            .withoutTrailingSlash)
    }

}

/**
 * Joins lines back into newline-prefixed chunks while collapsing consecutive empty lines:
 * the first line is kept as is, every following line is prefixed with `"\n"`, and an empty
 * line directly after an already emitted empty line (`"\n"`) is skipped.
 */
private fun List<String>.mergeMultipleEmptyLines(): List<String> = fold(emptyList<String>()) { previous, current ->
    when {
        previous.isEmpty() -> previous + current
        (previous.last() == "\n") && (current == "") -> previous
        else -> previous + ("\n" + current)
    }
}

/** Concatenates neighbouring [PlainTextPart]s into a single part; other parts are kept unchanged. */
private fun List<Part>.mergeAdjacentPlainTextParts(): List<Part> = fold(emptyList<Part>()) { parts, part ->
    if ((parts.lastOrNull() is PlainTextPart) && (part is PlainTextPart)) {
        parts.dropLast(1) + PlainTextPart(parts.last().text + part.text)
    } else {
        parts + part
    }
}

/** Removes all parts that are equal to an empty [PlainTextPart]. */
private fun List<Part>.removeEmptyPlainTextParts(): List<Part> = filterNot { it == PlainTextPart("") }

/** The receiver, decoded as Base32 and re-encoded as Base64. */
private val String.decodedId: String get() = Base64.encode(Base32.decode(this))

/** Everything after the first `//`. */
private val String.withoutProtocol get() = substring(indexOf("//") + 2)

/** Everything before the first `?`. */
private val String.withoutUrlParameters get() = split('?').first()

/** Strips leading `www` labels from the first `/`-separated component (the host). */
private val String.withoutWwwPrefix
    get() = split("/")
        .replaceFirst { host -> host.split(".").dropWhile { it == "www" }.joinToString(".") }
        .joinToString("/")

/** Returns a copy of the list in which only the first element was passed through [replacement]. */
private fun <T> List<T>.replaceFirst(replacement: (T) -> T): List<T> = mapIndexed { index, element ->
    if (index == 0) replacement(element) else element
}

/** Shortens `a/b/c/d` to `a/…/d`; strings with at most two components are returned unchanged. */
private val String.withoutMiddlePathComponents
    get() = split("/").let {
        if (it.size > 2) {
            "${it.first()}/…/${it.last()}"
        } else {
            it.joinToString("/")
        }
    }

private val String.withoutTrailingSlash get() = if (endsWith("/")) substring(0, length - 1) else this

/** Routing key of the posting Sone, or `null` if the context has no posting Sone. */
private val SoneTextParserContext.routingKey: ByteArray? get() = postingSone?.routingKey

/** The Sone's ID, Base64-decoded. */
private val Sone.routingKey: ByteArray get() = Base64.decode(id)

/**
 * The kinds of links the parser recognizes.
 *
 * @param scheme literal text a link of this type starts with
 * @param freenetLink whether a directly preceding `freenet:` prefix is treated as part of the link
 */
private enum class LinkType(private val scheme: String, private val freenetLink: Boolean) {

    KSK("KSK@", true),
    CHK("CHK@", true),
    SSK("SSK@", true),
    USK("USK@", true),
    HTTP("http://", false),
    HTTPS("https://", false),
    SONE("sone://", false) {
        // A Sone link must be exactly 50 characters long, scheme included
        // (presumably "sone://" plus a 43-character Sone ID).
        override fun validateLinkLength(length: Int) = length.takeIf { it == 50 }
    },
    POST("post://", false),
    FREEMAIL("", true) {
        /**
         * Finds the first valid Freemail address (`<local part>@<52 chars of a-z2-7>.freemail`).
         * All occurrences of `.freemail` are tried in order, so unrelated earlier occurrences
         * do not hide a later valid address.
         */
        override fun findNext(line: String): NextLink? =
            generateSequence(line.indexOf(".freemail").takeIf { it != -1 }) { previous ->
                line.indexOf(".freemail", previous + 1).takeIf { it != -1 }
            }.mapNotNull { freemailLinkBefore(line, it) }.firstOrNull()

        /** Builds the link ending with the `.freemail` suffix at [suffixPosition], or `null` if invalid. */
        private fun freemailLinkBefore(line: String, suffixPosition: Int): NextLink? {
            // '@' sits 53 characters before the suffix; 54 leaves room for one more character before it.
            if (suffixPosition < 54) return null
            if (line[suffixPosition - 53] != '@') return null
            if (!line.substring(suffixPosition - 52, suffixPosition).matches(freemailIdPattern)) return null
            // Walk left from the '@' for as long as the characters are valid local-part characters.
            val firstCharacterIndex = generateSequence(suffixPosition - 53) { index ->
                (index - 1).takeIf { (it >= 0) && line[it].validLocalPart }
            }.lastOrNull() ?: return null
            // 9 == ".freemail".length
            return NextLink(firstCharacterIndex, this, line.substring(firstCharacterIndex, suffixPosition + 9), line.substring(suffixPosition + 9))
        }

        private val Char.validLocalPart get() = (this in ('A'..'Z')) || (this in ('a'..'z')) || (this in ('0'..'9')) || (this == '-') || (this == '_') || (this == '.')
    };

    /**
     * Finds the first valid link of this type in [line].
     *
     * Occurrences of the scheme are tried in order, so an occurrence that fails validation
     * (e.g. a bare `http://`) no longer hides a later valid link of the same type.
     *
     * @return the link, or `null` if the line contains no valid link of this type
     */
    open fun findNext(line: String): NextLink? =
        generateSequence(line.indexOf(scheme).takeIf { it != -1 }) { previous ->
            line.indexOf(scheme, previous + 1).takeIf { it != -1 }
        }.mapNotNull { linkAt(line, it) }.firstOrNull()

    /** Builds the link whose scheme starts at [schemePosition], or `null` if its length is invalid. */
    private fun linkAt(line: String, schemePosition: Int): NextLink? {
        val endOfLink = line.substring(schemePosition).findEndOfLink().validate() ?: return null
        val link = line.substring(schemePosition, schemePosition + endOfLink)
        // 8 == "freenet:".length; the prefix counts towards the position but is not part of the link text.
        val realNextLinkPosition = if (freenetLink && line.substring(0, schemePosition).endsWith("freenet:")) schemePosition - 8 else schemePosition
        return NextLink(realNextLinkPosition, this, link, line.substring(schemePosition + endOfLink))
    }

    /**
     * Length of the link at the start of the receiver: the text up to the first whitespace,
     * minus trailing punctuation, cut off at the first closing parenthesis without an opening one.
     */
    private fun String.findEndOfLink() =
        substring(0, whitespace.find(this)?.range?.start ?: length)
            .dropLastWhile(::isPunctuation)
            .upToFirstUnmatchedParen()

    private fun Int.validate() = validateLinkLength(this)

    /** Returns [length] if it is acceptable for this link type, `null` otherwise. */
    protected open fun validateLinkLength(length: Int) = length.takeIf { it > scheme.length }

    /**
     * Index of the first `)` that is encountered while no `(` is open,
     * or the length of the receiver if there is none.
     */
    private fun String.upToFirstUnmatchedParen() =
        foldIndexed(Pair<Int, Int?>(0, null)) { index, (openParens, firstUnmatchedParen), currentChar ->
            when (currentChar) {
                '(' -> (openParens + 1) to firstUnmatchedParen
                ')' -> ((openParens - 1) to (if (openParens == 0) (firstUnmatchedParen ?: index) else firstUnmatchedParen))
                else -> openParens to firstUnmatchedParen
            }
        }.second ?: length

}

/** Characters that are stripped from the end of a link. */
private val punctuationChars = listOf('.', ',', '?', '!')
private fun isPunctuation(char: Char) = char in punctuationChars

/** Characters that terminate a link. */
private val whitespace = Regex("[\\u000a\u0020\u00a0\u1680\u180e\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u200c\u200d\u202f\u205f\u2060\u2800\u3000]")

/** Pattern for the part of a Freemail address between `@` and `.freemail`; compiled once. */
private val freemailIdPattern = Regex("^[a-z2-7]*\$")

/**
 * A link located in a line.
 *
 * @param position index in the searched text at which the link (including a `freenet:` prefix) starts
 * @param linkType the type of the link
 * @param link the link text, without any `freenet:` prefix
 * @param remainder the text following the link
 */
private data class NextLink(val position: Int, val linkType: LinkType, val link: String, val remainder: String)