compile group: 'com.google.guava', name: 'guava', version: '14.0.1'
compile group: 'com.fasterxml.jackson.core', name: 'jackson-databind', version: '2.1.2'
compile group: 'com.google.code.findbugs', name: 'jsr305', version: '2.0.1'
+ compile group: 'org.jsoup', name: 'jsoup', version: '1.10.2'
testCompile group: 'org.jetbrains.kotlin', name: 'kotlin-test', version: kotlinVersion
testCompile group: 'junit', name: 'junit', version: '4.11'
testCompile group: 'org.mockito', name: 'mockito-core', version: '2.1.0'
- testCompile group: 'org.jsoup', name: 'jsoup', version: '1.7.1'
testCompile group: 'org.hamcrest', name: 'hamcrest-all', version: '1.3'
}
import com.google.common.cache.Cache
import com.google.common.cache.CacheBuilder
import freenet.keys.FreenetURI
+import org.jsoup.Jsoup
+import org.jsoup.nodes.Document
+import org.jsoup.nodes.TextNode
import java.io.ByteArrayInputStream
import java.net.URLDecoder
+import java.nio.charset.Charset
import java.text.Normalizer
import java.util.concurrent.TimeUnit.MINUTES
+import javax.activation.MimeType
import javax.imageio.ImageIO
import javax.inject.Inject
private val loadingLinks: Cache<String, Boolean> = CacheBuilder.newBuilder().build<String, Boolean>()
private val failureCache: Cache<String, Boolean> = CacheBuilder.newBuilder().ticker(ticker).expireAfterWrite(30, MINUTES).build<String, Boolean>()
- private val imageCache: Cache<String, LinkedElement> = CacheBuilder.newBuilder().build<String, LinkedElement>()
+ private val elementCache: Cache<String, LinkedElement> = CacheBuilder.newBuilder().build<String, LinkedElement>()
private val callback = object : FreenetInterface.BackgroundFetchCallback {
override fun shouldCancel(uri: FreenetURI, mimeType: String, size: Long): Boolean {
- return !mimeType.startsWith("image/") || (size > 2097152)
+ return (size > 2097152) || (!mimeType.startsWith("image/") && !mimeType.startsWith("text/html"))
}
override fun loaded(uri: FreenetURI, mimeType: String, data: ByteArray) {
- if (!mimeType.startsWith("image/")) {
- return
- }
- ByteArrayInputStream(data).use {
- ImageIO.read(it)
- }?.let {
- imageCache.get(uri.toString().decode().normalize()) {
- LinkedElement(uri.toString(), properties = mapOf("size" to data.size, "sizeHuman" to data.size.human))
- }
- }
- removeLoadingLink(uri)
+ // Turns the fetched data into a LinkedElement and stores it in elementCache
+ // under the decoded, normalized URI. Images are only cached if ImageIO can
+ // decode them; HTML documents additionally get "title" and "description"
+ // properties. Other MIME types are ignored.
+ //
+ // The loading link is removed in a finally block: parsing the MIME type,
+ // decoding the image or running the cache loader can all throw, and the
+ // link must not stay marked as loading when that happens.
+ try {
+     val parsedMimeType = MimeType(mimeType)
+     when {
+         parsedMimeType.primaryType == "image" -> {
+             val image = ByteArrayInputStream(data).use { ImageIO.read(it) }
+             if (image != null) {
+                 elementCache.get(uri.toString().decode().normalize()) {
+                     LinkedElement(uri.toString(), properties = mapOf("size" to data.size, "sizeHuman" to data.size.human))
+                 }
+             }
+         }
+         parsedMimeType.baseType == "text/html" -> {
+             // An illegal or unsupported charset name must not abort parsing; use UTF-8 instead.
+             val charset = parsedMimeType.getParameter("charset")
+                 ?.let { name -> try { Charset.forName(name) } catch (e: IllegalArgumentException) { null } }
+                 ?: Charsets.UTF_8
+             val document = Jsoup.parse(data.toString(charset))
+             elementCache.get(uri.toString().decode().normalize()) {
+                 LinkedElement(uri.toString(), properties = mapOf(
+                     "size" to data.size, "sizeHuman" to data.size.human,
+                     "title" to document.title().emptyToNull,
+                     "description" to (document.metaDescription ?: document.firstNonHeadingParagraph)
+                 ))
+             }
+         }
+     }
+ } finally {
+     removeLoadingLink(uri)
+ }
}
+ /** Maps the empty string to `null`; every other value (including `null`) is returned unchanged. */
+ private val String?.emptyToNull: String?
+     get() = when (this) {
+         "" -> null
+         else -> this
+     }
+
+ /**
+  * Content of the first `<meta>` element in the document head whose `name`
+  * is "description" and whose `content` is not blank, or `null` if there is
+  * no such element.
+  *
+  * The name is compared case-insensitively, so `name="Description"` matches.
+  * Blank content is treated as absent so that callers can fall back to
+  * another source with `?:`.
+  */
+ private val Document.metaDescription: String?
+     get() = head().getElementsByTag("meta")
+         .filter { it.attr("name").equals("description", ignoreCase = true) }
+         .map { it.attr("content") }
+         .firstOrNull { it.isNotBlank() }
+
+ /**
+  * Text of the first direct child of `<body>` that contains no child
+  * elements, has non-empty text and is not a heading (`h1` to `h6`), or
+  * `null` if no such element exists.
+  */
+ private val Document.firstNonHeadingParagraph: String?
+     get() {
+         // Only real headings are skipped; a plain "starts with h" test would
+         // also reject tags such as <header> or <hgroup>.
+         val headingTag = Regex("h[1-6]", RegexOption.IGNORE_CASE)
+         return body().children()
+             .asSequence()
+             // children() contains elements only, so an empty result means the
+             // element holds nothing but non-element nodes such as text.
+             .filter { it.children().isEmpty() }
+             .filterNot { headingTag.matches(it.tagName()) }
+             .map { it.text() }
+             .firstOrNull { it != "" }
+     }
+
private val Int.human get() = when (this) {
in 0..1023 -> "$this B"
in 1024..1048575 -> "${this / 1024} KiB"
override fun loadElement(link: String): LinkedElement {
val normalizedLink = link.decode().normalize()
synchronized(loadingLinks) {
- imageCache.getIfPresent(normalizedLink)?.run {
+ elementCache.getIfPresent(normalizedLink)?.run {
return this
}
failureCache.getIfPresent(normalizedLink)?.run {
import net.pterodactylus.sone.test.mock
import org.hamcrest.MatcherAssert.assertThat
import org.hamcrest.Matchers.`is`
+import org.hamcrest.Matchers.equalTo
import org.junit.Test
import org.mockito.ArgumentMatchers.any
import org.mockito.ArgumentMatchers.eq
private val freenetURI = FreenetURI(IMAGE_ID)
private const val decomposedKey = "CHK@DCiVgTWW9nnWHJc9EVwtFJ6jAfBSVyy~rgiPvhUKbS4,mNY85V0x7dYcv7SnEYo1PCC6y2wNWMDNt-y9UWQx9fI,AAMC--8/fru%CC%88hstu%CC%88ck.jpg"
private const val normalizedKey = "CHK@DCiVgTWW9nnWHJc9EVwtFJ6jAfBSVyy~rgiPvhUKbS4,mNY85V0x7dYcv7SnEYo1PCC6y2wNWMDNt-y9UWQx9fI,AAMC--8/frühstück.jpg"
+ private const val textKey = "KSK@gpl.html"
private val sizeOkay = 2097152L
private val sizeNotOkay = sizeOkay + 1
}
}
@Test
+ fun `element loader can extract description from description header`() {
+     // The first load starts the fetch; capture the callback handed to startFetch.
+     elementLoader.loadElement(textKey)
+     verify(freenetInterface).startFetch(eq(FreenetURI(textKey)), callback.capture())
+     callback.value.loaded(FreenetURI(textKey), "text/html; charset=UTF-8", read("element-loader.html"))
+     val expectedElement = LinkedElement(textKey, properties = mapOf(
+         "size" to 266,
+         "sizeHuman" to "266 B",
+         "title" to "Some Nice Page Title",
+         "description" to "This is an example of a very nice freesite."
+     ))
+     assertThat(elementLoader.loadElement(textKey), equalTo(expectedElement))
+ }
+
+ @Test
+ fun `element loader can extract description from first non-heading paragraph`() {
+     // The first load starts the fetch; capture the callback handed to startFetch.
+     elementLoader.loadElement(textKey)
+     verify(freenetInterface).startFetch(eq(FreenetURI(textKey)), callback.capture())
+     callback.value.loaded(FreenetURI(textKey), "text/html; charset=UTF-8", read("element-loader2.html"))
+     val expectedElement = LinkedElement(textKey, properties = mapOf(
+         "size" to 185,
+         "sizeHuman" to "185 B",
+         "title" to "Some Nice Page Title",
+         "description" to "This is the first paragraph of the very nice freesite."
+     ))
+     assertThat(elementLoader.loadElement(textKey), equalTo(expectedElement))
+ }
+
+ @Test
+ fun `element loader can not extract description if html is more complicated`() {
+     // The first load starts the fetch; capture the callback handed to startFetch.
+     elementLoader.loadElement(textKey)
+     verify(freenetInterface).startFetch(eq(FreenetURI(textKey)), callback.capture())
+     callback.value.loaded(FreenetURI(textKey), "text/html; charset=UTF-8", read("element-loader3.html"))
+     val expectedElement = LinkedElement(textKey, properties = mapOf(
+         "size" to 204,
+         "sizeHuman" to "204 B",
+         "title" to "Some Nice Page Title",
+         "description" to null
+     ))
+     assertThat(elementLoader.loadElement(textKey), equalTo(expectedElement))
+ }
+
+ @Test
+ fun `element loader can not extract title if it is missing`() {
+     // The first load starts the fetch; capture the callback handed to startFetch.
+     elementLoader.loadElement(textKey)
+     verify(freenetInterface).startFetch(eq(FreenetURI(textKey)), callback.capture())
+     callback.value.loaded(FreenetURI(textKey), "text/html; charset=UTF-8", read("element-loader4.html"))
+     val expectedElement = LinkedElement(textKey, properties = mapOf(
+         "size" to 229,
+         "sizeHuman" to "229 B",
+         "title" to null,
+         "description" to "This is an example of a very nice freesite."
+     ))
+     assertThat(elementLoader.loadElement(textKey), equalTo(expectedElement))
+ }
+
+ @Test
fun `image is not loaded again after it failed`() {
elementLoader.loadElement(IMAGE_ID)
verify(freenetInterface).startFetch(eq(freenetURI), callback.capture())
--- /dev/null
+<!DOCTYPE html>
+<html>
+<head>
+ <title>Some Nice Page Title</title>
+ <meta name="description" content="This is an example of a very nice freesite.">
+</head>
+<body>
+<h1>First Paragraph</h1>
+<p>This is the first paragraph of the very nice freesite.</p>
+</body>
+</html>
--- /dev/null
+<!DOCTYPE html>
+<html>
+<head>
+ <title>Some Nice Page Title</title>
+</head>
+<body>
+<h1>First Paragraph</h1>
+<p>This is the first paragraph of the very nice freesite.</p>
+</body>
+</html>
--- /dev/null
+<!DOCTYPE html>
+<html>
+<head>
+ <title>Some Nice Page Title</title>
+</head>
+<body>
+<h1>First Paragraph</h1>
+<p>This is the <a href="#foo">first paragraph</a> of the very nice freesite.</p>
+</body>
+</html>
--- /dev/null
+<!DOCTYPE html>
+<html>
+<head>
+ <meta name="description" content="This is an example of a very nice freesite.">
+</head>
+<body>
+<h1>First Paragraph</h1>
+<p>This is the first paragraph of the very nice freesite.</p>
+</body>
+</html>