From: David ‘Bombe’ Roden Date: Sun, 23 Apr 2017 14:08:43 +0000 (+0200) Subject: Add HTML parsing to element loader X-Git-Tag: 0.9.7^2~245 X-Git-Url: https://git.pterodactylus.net/?p=Sone.git;a=commitdiff_plain;h=71d9b57ce88771258d2d3678b90bb5a55c6e4d9e;hp=5a1d1f95e7d122752a405bf1730e67cafb0df177 Add HTML parsing to element loader --- diff --git a/build.gradle b/build.gradle index ce95eb9..3322f5a 100644 --- a/build.gradle +++ b/build.gradle @@ -50,11 +50,11 @@ dependencies { compile group: 'com.google.guava', name: 'guava', version: '14.0.1' compile group: 'com.fasterxml.jackson.core', name: 'jackson-databind', version: '2.1.2' compile group: 'com.google.code.findbugs', name: 'jsr305', version: '2.0.1' + compile group: 'org.jsoup', name: 'jsoup', version: '1.10.2' testCompile group: 'org.jetbrains.kotlin', name: 'kotlin-test', version: kotlinVersion testCompile group: 'junit', name: 'junit', version: '4.11' testCompile group: 'org.mockito', name: 'mockito-core', version: '2.1.0' - testCompile group: 'org.jsoup', name: 'jsoup', version: '1.7.1' testCompile group: 'org.hamcrest', name: 'hamcrest-all', version: '1.3' } diff --git a/src/main/kotlin/net/pterodactylus/sone/core/DefaultElementLoader.kt b/src/main/kotlin/net/pterodactylus/sone/core/DefaultElementLoader.kt index 67d5538..f553ae8 100644 --- a/src/main/kotlin/net/pterodactylus/sone/core/DefaultElementLoader.kt +++ b/src/main/kotlin/net/pterodactylus/sone/core/DefaultElementLoader.kt @@ -4,10 +4,15 @@ import com.google.common.base.Ticker import com.google.common.cache.Cache import com.google.common.cache.CacheBuilder import freenet.keys.FreenetURI +import org.jsoup.Jsoup +import org.jsoup.nodes.Document +import org.jsoup.nodes.TextNode import java.io.ByteArrayInputStream import java.net.URLDecoder +import java.nio.charset.Charset import java.text.Normalizer import java.util.concurrent.TimeUnit.MINUTES +import javax.activation.MimeType import javax.imageio.ImageIO import javax.inject.Inject @@ -20,26 +25,55 @@ class DefaultElementLoader(private val freenetInterface: FreenetInterface, ticke private val loadingLinks: Cache = CacheBuilder.newBuilder().build() private val failureCache: Cache = CacheBuilder.newBuilder().ticker(ticker).expireAfterWrite(30, MINUTES).build() - private val imageCache: Cache = CacheBuilder.newBuilder().build() + private val elementCache: Cache = CacheBuilder.newBuilder().build() private val callback = object : FreenetInterface.BackgroundFetchCallback { override fun shouldCancel(uri: FreenetURI, mimeType: String, size: Long): Boolean { - return !mimeType.startsWith("image/") || (size > 2097152) + return (size > 2097152) || (!mimeType.startsWith("image/") && !mimeType.startsWith("text/html")) } override fun loaded(uri: FreenetURI, mimeType: String, data: ByteArray) { - if (!mimeType.startsWith("image/")) { - return - } - ByteArrayInputStream(data).use { - ImageIO.read(it) - }?.let { - imageCache.get(uri.toString().decode().normalize()) { - LinkedElement(uri.toString(), properties = mapOf("size" to data.size, "sizeHuman" to data.size.human)) + MimeType(mimeType).also { mimeType -> + when { + mimeType.primaryType == "image" -> { + ByteArrayInputStream(data).use { + ImageIO.read(it) + }?.let { + elementCache.get(uri.toString().decode().normalize()) { + LinkedElement(uri.toString(), properties = mapOf("size" to data.size, "sizeHuman" to data.size.human)) + } + } + } + mimeType.baseType == "text/html" -> { + val document = Jsoup.parse(data.toString(Charset.forName(mimeType.getParameter("charset") ?: "UTF-8"))) + elementCache.get(uri.toString().decode().normalize()) { + LinkedElement(uri.toString(), properties = mapOf( + "size" to data.size, "sizeHuman" to data.size.human, + "title" to document.title().emptyToNull, + "description" to (document.metaDescription ?: document.firstNonHeadingParagraph) + )) + } + } } + removeLoadingLink(uri) } - removeLoadingLink(uri) } + private val String?.emptyToNull get() = if (this == "") null else this + + private val Document.metaDescription: String? + get() = head().getElementsByTag("meta") + .map { it.attr("name") to it.attr("content") } + .firstOrNull { it.first == "description" } + ?.second + + private val Document.firstNonHeadingParagraph: String? + get() = body().children() + .filter { it.children().all { it is TextNode } } + .map { it to it.text() } + .filterNot { it.second == "" } + .firstOrNull { !it.first.tagName().startsWith("h", ignoreCase = true) } + ?.second + private val Int.human get() = when (this) { in 0..1023 -> "$this B" in 1024..1048575 -> "${this / 1024} KiB" @@ -62,7 +96,7 @@ class DefaultElementLoader(private val freenetInterface: FreenetInterface, ticke override fun loadElement(link: String): LinkedElement { val normalizedLink = link.decode().normalize() synchronized(loadingLinks) { - imageCache.getIfPresent(normalizedLink)?.run { + elementCache.getIfPresent(normalizedLink)?.run { return this } failureCache.getIfPresent(normalizedLink)?.run { diff --git a/src/test/kotlin/net/pterodactylus/sone/core/DefaultElementLoaderTest.kt b/src/test/kotlin/net/pterodactylus/sone/core/DefaultElementLoaderTest.kt index 03b42c3..ed66dd9 100644 --- a/src/test/kotlin/net/pterodactylus/sone/core/DefaultElementLoaderTest.kt +++ b/src/test/kotlin/net/pterodactylus/sone/core/DefaultElementLoaderTest.kt @@ -8,6 +8,7 @@ import net.pterodactylus.sone.test.capture import net.pterodactylus.sone.test.mock import org.hamcrest.MatcherAssert.assertThat import org.hamcrest.Matchers.`is` +import org.hamcrest.Matchers.equalTo import org.junit.Test import org.mockito.ArgumentMatchers.any import org.mockito.ArgumentMatchers.eq @@ -27,6 +28,7 @@ class DefaultElementLoaderTest { private val freenetURI = FreenetURI(IMAGE_ID) private const val decomposedKey = "CHK@DCiVgTWW9nnWHJc9EVwtFJ6jAfBSVyy~rgiPvhUKbS4,mNY85V0x7dYcv7SnEYo1PCC6y2wNWMDNt-y9UWQx9fI,AAMC--8/fru%CC%88hstu%CC%88ck.jpg" private const val normalizedKey = "CHK@DCiVgTWW9nnWHJc9EVwtFJ6jAfBSVyy~rgiPvhUKbS4,mNY85V0x7dYcv7SnEYo1PCC6y2wNWMDNt-y9UWQx9fI,AAMC--8/frühstück.jpg" + private const val textKey = "KSK@gpl.html" private val sizeOkay = 2097152L private val sizeNotOkay = sizeOkay + 1 } @@ -99,6 +101,62 @@ class DefaultElementLoaderTest { } @Test + fun `element loader can extract description from description header`() { + elementLoader.loadElement(textKey) + verify(freenetInterface).startFetch(eq(FreenetURI(textKey)), callback.capture()) + callback.value.loaded(FreenetURI(textKey), "text/html; charset=UTF-8", read("element-loader.html")) + val linkedElement = elementLoader.loadElement(textKey) + assertThat(linkedElement, equalTo(LinkedElement(textKey, properties = mapOf( + "size" to 266, + "sizeHuman" to "266 B", + "title" to "Some Nice Page Title", + "description" to "This is an example of a very nice freesite." + )))) + } + + @Test + fun `element loader can extract description from first non-heading paragraph`() { + elementLoader.loadElement(textKey) + verify(freenetInterface).startFetch(eq(FreenetURI(textKey)), callback.capture()) + callback.value.loaded(FreenetURI(textKey), "text/html; charset=UTF-8", read("element-loader2.html")) + val linkedElement = elementLoader.loadElement(textKey) + assertThat(linkedElement, equalTo(LinkedElement(textKey, properties = mapOf( + "size" to 185, + "sizeHuman" to "185 B", + "title" to "Some Nice Page Title", + "description" to "This is the first paragraph of the very nice freesite." + )))) + } + + @Test + fun `element loader can not extract description if html is more complicated`() { + elementLoader.loadElement(textKey) + verify(freenetInterface).startFetch(eq(FreenetURI(textKey)), callback.capture()) + callback.value.loaded(FreenetURI(textKey), "text/html; charset=UTF-8", read("element-loader3.html")) + val linkedElement = elementLoader.loadElement(textKey) + assertThat(linkedElement, equalTo(LinkedElement(textKey, properties = mapOf( + "size" to 204, + "sizeHuman" to "204 B", + "title" to "Some Nice Page Title", + "description" to null + )))) + } + + @Test + fun `element loader can not extract title if it is missing`() { + elementLoader.loadElement(textKey) + verify(freenetInterface).startFetch(eq(FreenetURI(textKey)), callback.capture()) + callback.value.loaded(FreenetURI(textKey), "text/html; charset=UTF-8", read("element-loader4.html")) + val linkedElement = elementLoader.loadElement(textKey) + assertThat(linkedElement, equalTo(LinkedElement(textKey, properties = mapOf( + "size" to 229, + "sizeHuman" to "229 B", + "title" to null, + "description" to "This is an example of a very nice freesite." + )))) + } + + @Test fun `image is not loaded again after it failed`() { elementLoader.loadElement(IMAGE_ID) verify(freenetInterface).startFetch(eq(freenetURI), callback.capture()) diff --git a/src/test/resources/net/pterodactylus/sone/core/element-loader.html b/src/test/resources/net/pterodactylus/sone/core/element-loader.html new file mode 100644 index 0000000..c803ff3 --- /dev/null +++ b/src/test/resources/net/pterodactylus/sone/core/element-loader.html @@ -0,0 +1,11 @@ + + + + Some Nice Page Title + + + +

First Paragraph

+

This is the first paragraph of the very nice freesite.

+ + diff --git a/src/test/resources/net/pterodactylus/sone/core/element-loader2.html b/src/test/resources/net/pterodactylus/sone/core/element-loader2.html new file mode 100644 index 0000000..7bff482 --- /dev/null +++ b/src/test/resources/net/pterodactylus/sone/core/element-loader2.html @@ -0,0 +1,10 @@ + + + + Some Nice Page Title + + +

First Paragraph

+

This is the first paragraph of the very nice freesite.

+ + diff --git a/src/test/resources/net/pterodactylus/sone/core/element-loader3.html b/src/test/resources/net/pterodactylus/sone/core/element-loader3.html new file mode 100644 index 0000000..d8fce37 --- /dev/null +++ b/src/test/resources/net/pterodactylus/sone/core/element-loader3.html @@ -0,0 +1,10 @@ + + + + Some Nice Page Title + + +

First Paragraph

+

This is the first paragraph of the very nice freesite.

+ + diff --git a/src/test/resources/net/pterodactylus/sone/core/element-loader4.html b/src/test/resources/net/pterodactylus/sone/core/element-loader4.html new file mode 100644 index 0000000..b660b6e --- /dev/null +++ b/src/test/resources/net/pterodactylus/sone/core/element-loader4.html @@ -0,0 +1,10 @@ + + + + + + +

First Paragraph

+

This is the first paragraph of the very nice freesite.

+ +