Add HTML parsing to element loader
[Sone.git] / src / main / kotlin / net / pterodactylus / sone / core / DefaultElementLoader.kt
1 package net.pterodactylus.sone.core
2
3 import com.google.common.base.Ticker
4 import com.google.common.cache.Cache
5 import com.google.common.cache.CacheBuilder
6 import freenet.keys.FreenetURI
7 import org.jsoup.Jsoup
8 import org.jsoup.nodes.Document
9 import org.jsoup.nodes.TextNode
10 import java.io.ByteArrayInputStream
11 import java.net.URLDecoder
12 import java.nio.charset.Charset
13 import java.text.Normalizer
14 import java.util.concurrent.TimeUnit.MINUTES
15 import javax.activation.MimeType
16 import javax.imageio.ImageIO
17 import javax.inject.Inject
18
/**
 * [ElementLoader] implementation that uses a simple Guava [com.google.common.cache.Cache].
 *
 * Three caches, all keyed by the URL-decoded and NFC-normalized link, track the state of a link:
 * links that are currently being fetched, links whose fetch has failed (these entries expire
 * 30 minutes after they were written), and links that have been loaded successfully.
 *
 * @param freenetInterface the interface used to start background fetches
 * @param ticker the time source used for expiring failed links
 */
class DefaultElementLoader(private val freenetInterface: FreenetInterface, ticker: Ticker) : ElementLoader {

	/** Creates an element loader that uses the system ticker. */
	@Inject constructor(freenetInterface: FreenetInterface) : this(freenetInterface, Ticker.systemTicker())

	/** Links for which a fetch has been started but has not yet finished. */
	private val loadingLinks: Cache<String, Boolean> = CacheBuilder.newBuilder().build<String, Boolean>()

	/** Links whose fetch failed; entries expire 30 minutes after being written. */
	private val failureCache: Cache<String, Boolean> = CacheBuilder.newBuilder().ticker(ticker).expireAfterWrite(30, MINUTES).build<String, Boolean>()

	/** Successfully loaded elements. */
	private val elementCache: Cache<String, LinkedElement> = CacheBuilder.newBuilder().build<String, LinkedElement>()

	/** Callback handed to every background fetch; it fills the caches above. */
	private val callback = object : FreenetInterface.BackgroundFetchCallback {

		/**
		 * Cancels fetches of data larger than [MAXIMUM_ELEMENT_SIZE] bytes and of data that is
		 * neither an image nor an HTML document.
		 */
		override fun shouldCancel(uri: FreenetURI, mimeType: String, size: Long): Boolean {
			val isSupportedType = mimeType.startsWith("image/") || mimeType.startsWith("text/html")
			return (size > MAXIMUM_ELEMENT_SIZE) || !isSupportedType
		}

		/**
		 * Stores a [LinkedElement] for decodable images and for HTML documents. Data of any
		 * other type, and images that cannot be decoded, are not stored.
		 */
		override fun loaded(uri: FreenetURI, mimeType: String, data: ByteArray) {
			try {
				val parsedMimeType = MimeType(mimeType)
				val properties = when {
					parsedMimeType.primaryType == "image" -> imageProperties(data)
					parsedMimeType.baseType == "text/html" -> htmlProperties(data, charsetOf(parsedMimeType))
					else -> null
				}
				if (properties != null) {
					elementCache.put(uri.toString().decode().normalize(), LinkedElement(uri.toString(), properties = properties))
				}
			} finally {
				// always clear the loading marker, even if parsing threw; the marker never
				// expires, so the link would otherwise be reported as loading forever
				removeLoadingLink(uri)
			}
		}

		/** Returns the properties for image data, or `null` if the data can not be decoded as an image. */
		private fun imageProperties(data: ByteArray): Map<String, Any?>? =
				ByteArrayInputStream(data).use { ImageIO.read(it) }
						?.let { mapOf("size" to data.size, "sizeHuman" to data.size.human) }

		/** Parses the data as HTML and returns size, title and description of the document. */
		private fun htmlProperties(data: ByteArray, charset: Charset): Map<String, Any?> {
			val document = Jsoup.parse(data.toString(charset))
			return mapOf(
					"size" to data.size,
					"sizeHuman" to data.size.human,
					"title" to document.title().emptyToNull,
					"description" to (document.metaDescription ?: document.firstNonHeadingParagraph)
			)
		}

		/**
		 * Returns the charset named by the MIME type’s `charset` parameter, or UTF-8 if the
		 * parameter is missing or names a charset that is illegal or not supported.
		 */
		private fun charsetOf(mimeType: MimeType): Charset =
				try {
					mimeType.getParameter("charset")?.let { Charset.forName(it) } ?: Charsets.UTF_8
				} catch (e: IllegalArgumentException) {
					// IllegalCharsetNameException and UnsupportedCharsetException both end up here
					Charsets.UTF_8
				}

		private val String?.emptyToNull get() = if (this == "") null else this

		/** Content of the first `<meta name="description">` element in the head that has non-empty content. */
		private val Document.metaDescription: String?
			get() = head().getElementsByTag("meta")
					.filter { it.attr("name") == "description" }
					.map { it.attr("content") }
					.firstOrNull { it.isNotEmpty() }

		/**
		 * Text of the first direct child of the body that has no child elements, has non-empty
		 * text, and whose tag name does not start with “h”.
		 */
		private val Document.firstNonHeadingParagraph: String?
			get() = body().children()
					.filter { it.children().isEmpty() }
					.filterNot { it.tagName().startsWith("h", ignoreCase = true) }
					.map { it.text() }
					.firstOrNull { it != "" }

		/** Size with a binary unit, rounded down to a whole number of that unit. */
		private val Int.human get() = when (this) {
			in 0..1023 -> "$this B"
			in 1024..1048575 -> "${this / 1024} KiB"
			in 1048576..1073741823 -> "${this / 1048576} MiB"
			else -> "${this / 1073741824} GiB"
		}

		/** Remembers the link as failed and clears its loading marker. */
		override fun failed(uri: FreenetURI) {
			failureCache.put(uri.toString().decode().normalize(), true)
			removeLoadingLink(uri)
		}

		private fun removeLoadingLink(uri: FreenetURI) {
			synchronized(loadingLinks) {
				loadingLinks.invalidate(uri.toString().decode().normalize())
			}
		}

	}

	/**
	 * Returns the cached element for the link if there is one, a failed element if a fetch of
	 * the link has failed and that failure has not yet expired, and a loading element otherwise.
	 * In the last case a background fetch is started unless one is already running.
	 */
	override fun loadElement(link: String): LinkedElement {
		val normalizedLink = link.decode().normalize()
		synchronized(loadingLinks) {
			elementCache.getIfPresent(normalizedLink)?.let { return it }
			if (failureCache.getIfPresent(normalizedLink) != null) {
				return LinkedElement(link, failed = true)
			}
			if (loadingLinks.getIfPresent(normalizedLink) == null) {
				// parse the link before registering it so that a malformed link does not
				// leave a loading marker behind that would never be removed
				val uri = FreenetURI(link)
				loadingLinks.put(normalizedLink, true)
				freenetInterface.startFetch(uri, callback)
			}
		}
		return LinkedElement(link, loading = true)
	}

	private fun String.decode(): String = URLDecoder.decode(this, "UTF-8")
	private fun String.normalize(): String = Normalizer.normalize(this, Normalizer.Form.NFC)

	private companion object {
		/** Largest size, in bytes, up to which a fetch is not cancelled (2 MiB). */
		const val MAXIMUM_ELEMENT_SIZE = 2L * 1024 * 1024
	}

}