From 17a659821355e6396f464e50a9b4048c0ea01ff7 Mon Sep 17 00:00:00 2001 From: =?utf8?q?David=20=E2=80=98Bombe=E2=80=99=20Roden?= Date: Fri, 2 Sep 2022 17:18:21 +0200 Subject: [PATCH] =?utf8?q?=F0=9F=9A=B8=20Improve=20text=20extraction=20fro?= =?utf8?q?m=20freesites?= MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit --- src/main/kotlin/net/pterodactylus/sone/core/DefaultElementLoader.kt | 2 -- .../kotlin/net/pterodactylus/sone/core/DefaultElementLoaderTest.kt | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/main/kotlin/net/pterodactylus/sone/core/DefaultElementLoader.kt b/src/main/kotlin/net/pterodactylus/sone/core/DefaultElementLoader.kt index 2849029..88cb1f5 100644 --- a/src/main/kotlin/net/pterodactylus/sone/core/DefaultElementLoader.kt +++ b/src/main/kotlin/net/pterodactylus/sone/core/DefaultElementLoader.kt @@ -6,7 +6,6 @@ import com.google.common.cache.CacheBuilder import freenet.keys.FreenetURI import org.jsoup.Jsoup import org.jsoup.nodes.Document -import org.jsoup.nodes.TextNode import java.io.ByteArrayInputStream import java.net.URLDecoder import java.nio.charset.Charset @@ -101,7 +100,6 @@ private val Document.metaDescription: String? private val Document.firstNonHeadingParagraph: String? get() = body().children() - .filter { it.children().all { it is TextNode } } .map { it to it.text() } .filterNot { it.second == "" } .firstOrNull { !it.first.tagName().startsWith("h", ignoreCase = true) } diff --git a/src/test/kotlin/net/pterodactylus/sone/core/DefaultElementLoaderTest.kt b/src/test/kotlin/net/pterodactylus/sone/core/DefaultElementLoaderTest.kt index 1b03d7a..8a3d40f 100644 --- a/src/test/kotlin/net/pterodactylus/sone/core/DefaultElementLoaderTest.kt +++ b/src/test/kotlin/net/pterodactylus/sone/core/DefaultElementLoaderTest.kt @@ -126,14 +126,14 @@ class DefaultElementLoaderTest { } @Test - fun `element loader can not extract description if html is more complicated`() { + fun `element loader can extract description if html is more complicated`() { runWithCallback(textKey) { elementLoader, _, callback, _ -> callback.loaded(FreenetURI(textKey), "text/html; charset=UTF-8", read("element-loader3.html")) val linkedElement = elementLoader.loadElement(textKey) assertThat(linkedElement, isLinkedElement(equalTo(textKey), allOf( hasEntry("type", "html"), hasEntry("size", 204), hasEntry("sizeHuman", "204 B"), hasEntry("title", "Some Nice Page Title"), - hasEntry("description", null) + hasEntry("description", "This is the first paragraph of the very nice freesite.") ))) } } -- 2.7.4