From: David ‘Bombe’ Roden Date: Fri, 2 Sep 2022 15:18:21 +0000 (+0200) Subject: 🚸 Improve text extraction from freesites X-Git-Url: https://git.pterodactylus.net/?p=Sone.git;a=commitdiff_plain;h=17a659821355e6396f464e50a9b4048c0ea01ff7 🚸 Improve text extraction from freesites --- diff --git a/src/main/kotlin/net/pterodactylus/sone/core/DefaultElementLoader.kt b/src/main/kotlin/net/pterodactylus/sone/core/DefaultElementLoader.kt index 2849029..88cb1f5 100644 --- a/src/main/kotlin/net/pterodactylus/sone/core/DefaultElementLoader.kt +++ b/src/main/kotlin/net/pterodactylus/sone/core/DefaultElementLoader.kt @@ -6,7 +6,6 @@ import com.google.common.cache.CacheBuilder import freenet.keys.FreenetURI import org.jsoup.Jsoup import org.jsoup.nodes.Document -import org.jsoup.nodes.TextNode import java.io.ByteArrayInputStream import java.net.URLDecoder import java.nio.charset.Charset @@ -101,7 +100,6 @@ private val Document.metaDescription: String? private val Document.firstNonHeadingParagraph: String? get() = body().children() - .filter { it.children().all { it is TextNode } } .map { it to it.text() } .filterNot { it.second == "" } .firstOrNull { !it.first.tagName().startsWith("h", ignoreCase = true) } diff --git a/src/test/kotlin/net/pterodactylus/sone/core/DefaultElementLoaderTest.kt b/src/test/kotlin/net/pterodactylus/sone/core/DefaultElementLoaderTest.kt index 1b03d7a..8a3d40f 100644 --- a/src/test/kotlin/net/pterodactylus/sone/core/DefaultElementLoaderTest.kt +++ b/src/test/kotlin/net/pterodactylus/sone/core/DefaultElementLoaderTest.kt @@ -126,14 +126,14 @@ class DefaultElementLoaderTest { } @Test - fun `element loader can not extract description if html is more complicated`() { + fun `element loader can extract description if html is more complicated`() { runWithCallback(textKey) { elementLoader, _, callback, _ -> callback.loaded(FreenetURI(textKey), "text/html; charset=UTF-8", read("element-loader3.html")) val linkedElement = elementLoader.loadElement(textKey) assertThat(linkedElement, isLinkedElement(equalTo(textKey), allOf( hasEntry("type", "html"), hasEntry("size", 204), hasEntry("sizeHuman", "204 B"), hasEntry("title", "Some Nice Page Title"), - hasEntry("description", null) + hasEntry("description", "This is the first paragraph of the very nice freesite.") ))) } }