🚸 Improve text extraction even further
[Sone.git] / src / main / kotlin / net / pterodactylus / sone / core / DefaultElementLoader.kt
1 package net.pterodactylus.sone.core
2
3 import com.google.common.base.Ticker
4 import com.google.common.cache.Cache
5 import com.google.common.cache.CacheBuilder
6 import freenet.keys.FreenetURI
7 import org.jsoup.Jsoup
8 import org.jsoup.nodes.Document
9 import java.io.ByteArrayInputStream
10 import java.net.URLDecoder
11 import java.nio.charset.Charset
12 import java.text.Normalizer
13 import java.util.concurrent.TimeUnit.MINUTES
14 import javax.activation.MimeType
15 import javax.imageio.ImageIO
16 import javax.inject.Inject
17
/**
 * [ElementLoader] implementation that uses a simple Guava [com.google.common.cache.Cache].
 *
 * Three caches are used, all keyed by the URL-decoded, NFC-normalized link:
 * links that are currently being fetched, links whose fetch failed (entries
 * expire 30 minutes after being written), and successfully loaded elements.
 */
class DefaultElementLoader(private val freenetInterface: FreenetInterface, ticker: Ticker): ElementLoader {

	@Inject constructor(freenetInterface: FreenetInterface): this(freenetInterface, Ticker.systemTicker())

	private val loadingLinks: Cache<String, Boolean> = CacheBuilder.newBuilder().build()
	private val failureCache: Cache<String, Boolean> = CacheBuilder.newBuilder().ticker(ticker).expireAfterWrite(30, MINUTES).build()
	private val elementCache: Cache<String, LinkedElement> = CacheBuilder.newBuilder().build()
	private val callback = object: FreenetInterface.BackgroundFetchCallback {
		/**
		 * Requests cancellation for anything larger than 2 MiB (2097152 bytes)
		 * and for anything that is neither an image nor an HTML document.
		 */
		override fun shouldCancel(uri: FreenetURI, mimeType: String, size: Long): Boolean {
			return (size > 2097152) || (!mimeType.startsWith("image/") && !mimeType.startsWith("text/html"))
		}

		/**
		 * Stores an element for the fetched data (if it can be processed) and
		 * removes the link from the set of loading links.
		 */
		override fun loaded(uri: FreenetURI, mimeTypeText: String, data: ByteArray) {
			try {
				storeElement(uri, MimeType(mimeTypeText), data)
			} finally {
				// Parsing the MIME type, resolving the charset, or decoding the
				// image may throw; the link must not stay marked as loading
				// in that case because nothing else would ever remove it.
				removeLoadingLink(uri)
			}
		}

		/** Remembers the failure and removes the link from the set of loading links. */
		override fun failed(uri: FreenetURI) {
			failureCache.put(uri.toString().decode().normalize(), true)
			removeLoadingLink(uri)
		}

		/** Creates and caches the [LinkedElement] for image and HTML data; other types are ignored. */
		private fun storeElement(uri: FreenetURI, mimeType: MimeType, data: ByteArray) {
			when {
				mimeType.primaryType == "image" -> {
					// Only data that ImageIO can actually decode is cached as an image.
					ByteArrayInputStream(data).use {
						ImageIO.read(it)
					}?.let {
						elementCache.get(uri.toString().decode().normalize()) {
							LinkedElement(uri.toString(), properties = mapOf("type" to "image", "size" to data.size, "sizeHuman" to data.size.human))
						}
					}
				}
				mimeType.baseType == "text/html" -> {
					// Use the charset from the MIME type, defaulting to UTF-8.
					val document = Jsoup.parse(data.toString(Charset.forName(mimeType.getParameter("charset") ?: "UTF-8")))
					elementCache.get(uri.toString().decode().normalize()) {
						LinkedElement(uri.toString(), properties = mapOf(
								"type" to "html", "size" to data.size, "sizeHuman" to data.size.human,
								"title" to document.title().emptyToNull,
								"description" to (document.metaDescription ?: document.firstNonHeadingParagraph)
						))
					}
				}
			}
		}

		private fun removeLoadingLink(uri: FreenetURI) {
			synchronized(loadingLinks) {
				loadingLinks.invalidate(uri.toString().decode().normalize())
			}
		}
	}

	/**
	 * Returns the cached element for the given link, a failed element if a
	 * failure is currently cached for it, or a loading element otherwise. In
	 * the last case a background fetch is started unless one is already
	 * registered for the link.
	 *
	 * @param link The link to load
	 * @return The element for the link in its current state
	 */
	override fun loadElement(link: String): LinkedElement {
		val normalizedLink = link.decode().normalize()
		synchronized(loadingLinks) {
			elementCache.getIfPresent(normalizedLink)?.let { return it }
			failureCache.getIfPresent(normalizedLink)?.let { return LinkedElement(link, failed = true) }
			if (loadingLinks.getIfPresent(normalizedLink) == null) {
				// Build the URI before registering the link: if the constructor
				// throws, the link must not remain marked as loading.
				val uri = FreenetURI(link)
				loadingLinks.put(normalizedLink, true)
				freenetInterface.startFetch(uri, callback)
			}
		}
		return LinkedElement(link, loading = true)
	}

}
90
/** URL-decodes this string, interpreting percent-escapes as UTF-8. */
private fun String.decode(): String = URLDecoder.decode(this, Charsets.UTF_8.name())
/** Converts this string to Unicode normalization form C (NFC). */
private fun String.normalize(): String = Normalizer.normalize(this, Normalizer.Form.NFC)
/** This string, or `null` if it is `null` or the empty string. */
private val String?.emptyToNull: String?
	get() = this?.takeUnless { it == "" }
94
/**
 * The content of the first `<meta>` element in the document head that is
 * named "description" and has non-empty content, or `null` if there is none.
 *
 * The name is compared case-insensitively. An empty description is treated
 * as absent so that callers can fall back to another source using `?:`.
 */
private val Document.metaDescription: String?
	get() = head().getElementsByTag("meta")
		.firstOrNull { it.attr("name").equals("description", ignoreCase = true) && it.attr("content") != "" }
		?.attr("content")
100
/**
 * The text of the first `div` or `p` element in the body that has at least
 * one text node of its own and whose text is not empty, or `null` if no
 * such element exists.
 */
private val Document.firstNonHeadingParagraph: String?
	get() = body().select("div, p")
		.asSequence()
		.filter { element -> element.textNodes().isNotEmpty() }
		.map { element -> element.text() }
		.firstOrNull { text -> text != "" }
107
/**
 * A human-readable rendering of this number as a byte count, using binary
 * units (B, KiB, MiB, GiB) and integer division, so fractions are dropped.
 */
private val Int.human: String
	get() {
		val kibi = 1 shl 10
		val mebi = 1 shl 20
		val gibi = 1 shl 30
		return when {
			this in 0 until kibi -> "$this B"
			this in kibi until mebi -> "${this / kibi} KiB"
			this in mebi until gibi -> "${this / mebi} MiB"
			else -> "${this / gibi} GiB"
		}
	}