From ca05f37d6d77ebad800b252719b0ff03877fc968 Mon Sep 17 00:00:00 2001 From: =?utf8?q?David=20=E2=80=98Bombe=E2=80=99=20Roden?= Date: Fri, 2 Sep 2022 17:27:11 +0200 Subject: [PATCH] =?utf8?q?=F0=9F=9A=B8=20Improve=20text=20extraction=20eve?= =?utf8?q?n=20further?= MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This actually revamps the way the first paragraph is extracted from freesites and could cause a lot more descriptions to show up in Sone — which was the goal! Previously I tried to locate all top-level nodes (under <body>) that themselves had text nodes below them and whose name did not start with an “h” (to exclude the header tags) but it turns out this can be easily defeated by wrapping all of the site in e.g. a
<div> tag. And I’m sure that a
<p> tag would do exactly the same… So now I use a CSS selector query to get all

<div> and <p>

nodes, get those with text nodes below them and then get their text (which flattens them for me and removes embedded tags like or ). --- .../net/pterodactylus/sone/core/DefaultElementLoader.kt | 6 +++--- .../net/pterodactylus/sone/core/DefaultElementLoaderTest.kt | 12 ++++++++++++ .../net/pterodactylus/sone/core/element-loader5.html | 12 ++++++++++++ 3 files changed, 27 insertions(+), 3 deletions(-) create mode 100644 src/test/resources/net/pterodactylus/sone/core/element-loader5.html diff --git a/src/main/kotlin/net/pterodactylus/sone/core/DefaultElementLoader.kt b/src/main/kotlin/net/pterodactylus/sone/core/DefaultElementLoader.kt index 88cb1f5..69872c5 100644 --- a/src/main/kotlin/net/pterodactylus/sone/core/DefaultElementLoader.kt +++ b/src/main/kotlin/net/pterodactylus/sone/core/DefaultElementLoader.kt @@ -99,10 +99,10 @@ private val Document.metaDescription: String? ?.second private val Document.firstNonHeadingParagraph: String? - get() = body().children() + get() = body().select("div, p") + .filter { it.textNodes().isNotEmpty() } .map { it to it.text() } - .filterNot { it.second == "" } - .firstOrNull { !it.first.tagName().startsWith("h", ignoreCase = true) } + .firstOrNull { it.second != "" } ?.second private val Int.human get() = when (this) { diff --git a/src/test/kotlin/net/pterodactylus/sone/core/DefaultElementLoaderTest.kt b/src/test/kotlin/net/pterodactylus/sone/core/DefaultElementLoaderTest.kt index 8a3d40f..64456bc 100644 --- a/src/test/kotlin/net/pterodactylus/sone/core/DefaultElementLoaderTest.kt +++ b/src/test/kotlin/net/pterodactylus/sone/core/DefaultElementLoaderTest.kt @@ -151,6 +151,18 @@ class DefaultElementLoaderTest { } @Test + fun `element loader can extract first paragraph from real-world example`() { + runWithCallback(textKey) { elementLoader, _, callback, _ -> + callback.loaded(FreenetURI(textKey), "text/html; charset=UTF-8", read("element-loader5.html")) + val linkedElement = elementLoader.loadElement(textKey) + 
assertThat(linkedElement, isLinkedElement(equalTo(textKey), allOf( + hasEntry("type", "html"), hasEntry("title", "Some Nice Page Title"), + hasEntry("description", "This is the first paragraph of the very nice freesite.") + ))) + } + } + + @Test fun `image is not loaded again after it failed`() { runWithCallback(IMAGE_ID) { elementLoader, _, callback, _ -> elementLoader.loadElement(IMAGE_ID) diff --git a/src/test/resources/net/pterodactylus/sone/core/element-loader5.html b/src/test/resources/net/pterodactylus/sone/core/element-loader5.html new file mode 100644 index 0000000..94fe1b0 --- /dev/null +++ b/src/test/resources/net/pterodactylus/sone/core/element-loader5.html @@ -0,0 +1,12 @@ + + + + Some Nice Page Title + + +
+

First Paragraph

+

This is the first paragraph of the very nice freesite.

+
+ + -- 2.7.4