Add HTML parsing to element loader
author David ‘Bombe’ Roden <bombe@pterodactylus.net>
Sun, 23 Apr 2017 14:08:43 +0000 (16:08 +0200)
committer David ‘Bombe’ Roden <bombe@pterodactylus.net>
Sun, 23 Apr 2017 14:08:43 +0000 (16:08 +0200)
build.gradle
src/main/kotlin/net/pterodactylus/sone/core/DefaultElementLoader.kt
src/test/kotlin/net/pterodactylus/sone/core/DefaultElementLoaderTest.kt
src/test/resources/net/pterodactylus/sone/core/element-loader.html [new file with mode: 0644]
src/test/resources/net/pterodactylus/sone/core/element-loader2.html [new file with mode: 0644]
src/test/resources/net/pterodactylus/sone/core/element-loader3.html [new file with mode: 0644]
src/test/resources/net/pterodactylus/sone/core/element-loader4.html [new file with mode: 0644]

index ce95eb9..3322f5a 100644 (file)
@@ -50,11 +50,11 @@ dependencies {
     compile group: 'com.google.guava', name: 'guava', version: '14.0.1'
     compile group: 'com.fasterxml.jackson.core', name: 'jackson-databind', version: '2.1.2'
     compile group: 'com.google.code.findbugs', name: 'jsr305', version: '2.0.1'
+    compile group: 'org.jsoup', name: 'jsoup', version: '1.10.2'
 
     testCompile group: 'org.jetbrains.kotlin', name: 'kotlin-test', version: kotlinVersion
     testCompile group: 'junit', name: 'junit', version: '4.11'
     testCompile group: 'org.mockito', name: 'mockito-core', version: '2.1.0'
-    testCompile group: 'org.jsoup', name: 'jsoup', version: '1.7.1'
     testCompile group: 'org.hamcrest', name: 'hamcrest-all', version: '1.3'
 }
 
index 67d5538..f553ae8 100644 (file)
@@ -4,10 +4,15 @@ import com.google.common.base.Ticker
 import com.google.common.cache.Cache
 import com.google.common.cache.CacheBuilder
 import freenet.keys.FreenetURI
+import org.jsoup.Jsoup
+import org.jsoup.nodes.Document
+import org.jsoup.nodes.TextNode
 import java.io.ByteArrayInputStream
 import java.net.URLDecoder
+import java.nio.charset.Charset
 import java.text.Normalizer
 import java.util.concurrent.TimeUnit.MINUTES
+import javax.activation.MimeType
 import javax.imageio.ImageIO
 import javax.inject.Inject
 
@@ -20,26 +25,55 @@ class DefaultElementLoader(private val freenetInterface: FreenetInterface, ticke
 
        private val loadingLinks: Cache<String, Boolean> = CacheBuilder.newBuilder().build<String, Boolean>()
        private val failureCache: Cache<String, Boolean> = CacheBuilder.newBuilder().ticker(ticker).expireAfterWrite(30, MINUTES).build<String, Boolean>()
-       private val imageCache: Cache<String, LinkedElement> = CacheBuilder.newBuilder().build<String, LinkedElement>()
+       private val elementCache: Cache<String, LinkedElement> = CacheBuilder.newBuilder().build<String, LinkedElement>()
        private val callback = object : FreenetInterface.BackgroundFetchCallback {
                override fun shouldCancel(uri: FreenetURI, mimeType: String, size: Long): Boolean {
-                       return !mimeType.startsWith("image/") || (size > 2097152)
+                       return (size > 2097152) || (!mimeType.startsWith("image/") && !mimeType.startsWith("text/html"))
                }
 
                override fun loaded(uri: FreenetURI, mimeType: String, data: ByteArray) {
-                       if (!mimeType.startsWith("image/")) {
-                               return
-                       }
-                       ByteArrayInputStream(data).use {
-                               ImageIO.read(it)
-                       }?.let {
-                               imageCache.get(uri.toString().decode().normalize()) {
-                                       LinkedElement(uri.toString(), properties = mapOf("size" to data.size, "sizeHuman" to data.size.human))
+                       MimeType(mimeType).also { mimeType ->
+                               when {
+                                       mimeType.primaryType == "image" -> {
+                                               ByteArrayInputStream(data).use {
+                                                       ImageIO.read(it)
+                                               }?.let {
+                                                       elementCache.get(uri.toString().decode().normalize()) {
+                                                               LinkedElement(uri.toString(), properties = mapOf("size" to data.size, "sizeHuman" to data.size.human))
+                                                       }
+                                               }
+                                       }
+                                       mimeType.baseType == "text/html" -> {
+                                               val document = Jsoup.parse(data.toString(Charset.forName(mimeType.getParameter("charset") ?: "UTF-8")))
+                                               elementCache.get(uri.toString().decode().normalize()) {
+                                                       LinkedElement(uri.toString(), properties = mapOf(
+                                                                       "size" to data.size, "sizeHuman" to data.size.human,
+                                                                       "title" to document.title().emptyToNull,
+                                                                       "description" to (document.metaDescription ?: document.firstNonHeadingParagraph)
+                                                       ))
+                                               }
+                                       }
                                }
+                               removeLoadingLink(uri)
                        }
-                       removeLoadingLink(uri)
                }
 
+               private val String?.emptyToNull get() = if (this == "") null else this
+
+               private val Document.metaDescription: String?
+                       get() = head().getElementsByTag("meta")
+                                       .map { it.attr("name") to it.attr("content") }
+                                       .firstOrNull { it.first == "description" }
+                                       ?.second
+
+               private val Document.firstNonHeadingParagraph: String?
+                       get() = body().children()
+                                       .filter { it.children().all { it is TextNode } }
+                                       .map { it to it.text() }
+                                       .filterNot { it.second == "" }
+                                       .firstOrNull { !it.first.tagName().startsWith("h", ignoreCase = true) }
+                                       ?.second
+
                private val Int.human get() = when (this) {
                        in 0..1023 -> "$this B"
                        in 1024..1048575 -> "${this / 1024} KiB"
@@ -62,7 +96,7 @@ class DefaultElementLoader(private val freenetInterface: FreenetInterface, ticke
        override fun loadElement(link: String): LinkedElement {
                val normalizedLink = link.decode().normalize()
                synchronized(loadingLinks) {
-                       imageCache.getIfPresent(normalizedLink)?.run {
+                       elementCache.getIfPresent(normalizedLink)?.run {
                                return this
                        }
                        failureCache.getIfPresent(normalizedLink)?.run {
index 03b42c3..ed66dd9 100644 (file)
@@ -8,6 +8,7 @@ import net.pterodactylus.sone.test.capture
 import net.pterodactylus.sone.test.mock
 import org.hamcrest.MatcherAssert.assertThat
 import org.hamcrest.Matchers.`is`
+import org.hamcrest.Matchers.equalTo
 import org.junit.Test
 import org.mockito.ArgumentMatchers.any
 import org.mockito.ArgumentMatchers.eq
@@ -27,6 +28,7 @@ class DefaultElementLoaderTest {
                private val freenetURI = FreenetURI(IMAGE_ID)
                private const val decomposedKey = "CHK@DCiVgTWW9nnWHJc9EVwtFJ6jAfBSVyy~rgiPvhUKbS4,mNY85V0x7dYcv7SnEYo1PCC6y2wNWMDNt-y9UWQx9fI,AAMC--8/fru%CC%88hstu%CC%88ck.jpg"
                private const val normalizedKey = "CHK@DCiVgTWW9nnWHJc9EVwtFJ6jAfBSVyy~rgiPvhUKbS4,mNY85V0x7dYcv7SnEYo1PCC6y2wNWMDNt-y9UWQx9fI,AAMC--8/frühstück.jpg"
+               private const val textKey = "KSK@gpl.html"
                private val sizeOkay = 2097152L
                private val sizeNotOkay = sizeOkay + 1
        }
@@ -99,6 +101,62 @@ class DefaultElementLoaderTest {
        }
 
        @Test
+       fun `element loader can extract description from description header`() {
+           elementLoader.loadElement(textKey)
+               verify(freenetInterface).startFetch(eq(FreenetURI(textKey)), callback.capture())
+               callback.value.loaded(FreenetURI(textKey), "text/html; charset=UTF-8", read("element-loader.html"))
+               val linkedElement = elementLoader.loadElement(textKey)
+               assertThat(linkedElement, equalTo(LinkedElement(textKey, properties = mapOf(
+                               "size" to 266,
+                               "sizeHuman" to "266 B",
+                               "title" to "Some Nice Page Title",
+                               "description" to "This is an example of a very nice freesite."
+               ))))
+       }
+
+       @Test
+       fun `element loader can extract description from first non-heading paragraph`() {
+           elementLoader.loadElement(textKey)
+               verify(freenetInterface).startFetch(eq(FreenetURI(textKey)), callback.capture())
+               callback.value.loaded(FreenetURI(textKey), "text/html; charset=UTF-8", read("element-loader2.html"))
+               val linkedElement = elementLoader.loadElement(textKey)
+               assertThat(linkedElement, equalTo(LinkedElement(textKey, properties = mapOf(
+                               "size" to 185,
+                               "sizeHuman" to "185 B",
+                               "title" to "Some Nice Page Title",
+                               "description" to "This is the first paragraph of the very nice freesite."
+               ))))
+       }
+
+       @Test
+       fun `element loader can not extract description if html is more complicated`() {
+           elementLoader.loadElement(textKey)
+               verify(freenetInterface).startFetch(eq(FreenetURI(textKey)), callback.capture())
+               callback.value.loaded(FreenetURI(textKey), "text/html; charset=UTF-8", read("element-loader3.html"))
+               val linkedElement = elementLoader.loadElement(textKey)
+               assertThat(linkedElement, equalTo(LinkedElement(textKey, properties = mapOf(
+                               "size" to 204,
+                               "sizeHuman" to "204 B",
+                               "title" to "Some Nice Page Title",
+                               "description" to null
+               ))))
+       }
+
+       @Test
+       fun `element loader can not extract title if it is missing`() {
+           elementLoader.loadElement(textKey)
+               verify(freenetInterface).startFetch(eq(FreenetURI(textKey)), callback.capture())
+               callback.value.loaded(FreenetURI(textKey), "text/html; charset=UTF-8", read("element-loader4.html"))
+               val linkedElement = elementLoader.loadElement(textKey)
+               assertThat(linkedElement, equalTo(LinkedElement(textKey, properties = mapOf(
+                               "size" to 229,
+                               "sizeHuman" to "229 B",
+                               "title" to null,
+                               "description" to "This is an example of a very nice freesite."
+               ))))
+       }
+
+       @Test
        fun `image is not loaded again after it failed`() {
                elementLoader.loadElement(IMAGE_ID)
                verify(freenetInterface).startFetch(eq(freenetURI), callback.capture())
diff --git a/src/test/resources/net/pterodactylus/sone/core/element-loader.html b/src/test/resources/net/pterodactylus/sone/core/element-loader.html
new file mode 100644 (file)
index 0000000..c803ff3
--- /dev/null
@@ -0,0 +1,11 @@
+<!DOCTYPE html>
+<html>
+<head>
+       <title>Some Nice Page Title</title>
+       <meta name="description" content="This is an example of a very nice freesite.">
+</head>
+<body>
+<h1>First Paragraph</h1>
+<p>This is the first paragraph of the very nice freesite.</p>
+</body>
+</html>
diff --git a/src/test/resources/net/pterodactylus/sone/core/element-loader2.html b/src/test/resources/net/pterodactylus/sone/core/element-loader2.html
new file mode 100644 (file)
index 0000000..7bff482
--- /dev/null
@@ -0,0 +1,10 @@
+<!DOCTYPE html>
+<html>
+<head>
+       <title>Some Nice Page Title</title>
+</head>
+<body>
+<h1>First Paragraph</h1>
+<p>This is the first paragraph of the very nice freesite.</p>
+</body>
+</html>
diff --git a/src/test/resources/net/pterodactylus/sone/core/element-loader3.html b/src/test/resources/net/pterodactylus/sone/core/element-loader3.html
new file mode 100644 (file)
index 0000000..d8fce37
--- /dev/null
@@ -0,0 +1,10 @@
+<!DOCTYPE html>
+<html>
+<head>
+       <title>Some Nice Page Title</title>
+</head>
+<body>
+<h1>First Paragraph</h1>
+<p>This is the <a href="#foo">first paragraph</a> of the very nice freesite.</p>
+</body>
+</html>
diff --git a/src/test/resources/net/pterodactylus/sone/core/element-loader4.html b/src/test/resources/net/pterodactylus/sone/core/element-loader4.html
new file mode 100644 (file)
index 0000000..b660b6e
--- /dev/null
@@ -0,0 +1,10 @@
+<!DOCTYPE html>
+<html>
+<head>
+       <meta name="description" content="This is an example of a very nice freesite.">
+</head>
+<body>
+<h1>First Paragraph</h1>
+<p>This is the first paragraph of the very nice freesite.</p>
+</body>
+</html>