projects
/
Sone.git
/ blobdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
🚸 Improve text extraction even further
[Sone.git]
/
src
/
main
/
kotlin
/
net
/
pterodactylus
/
sone
/
core
/
DefaultElementLoader.kt
diff --git
a/src/main/kotlin/net/pterodactylus/sone/core/DefaultElementLoader.kt
b/src/main/kotlin/net/pterodactylus/sone/core/DefaultElementLoader.kt
index
2849029
..
69872c5
100644
(file)
--- a/
src/main/kotlin/net/pterodactylus/sone/core/DefaultElementLoader.kt
+++ b/
src/main/kotlin/net/pterodactylus/sone/core/DefaultElementLoader.kt
@@
-6,7
+6,6
@@
import com.google.common.cache.CacheBuilder
import freenet.keys.FreenetURI
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import freenet.keys.FreenetURI
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
-import org.jsoup.nodes.TextNode
import java.io.ByteArrayInputStream
import java.net.URLDecoder
import java.nio.charset.Charset
import java.io.ByteArrayInputStream
import java.net.URLDecoder
import java.nio.charset.Charset
@@
-100,11
+99,10
@@
private val Document.metaDescription: String?
?.second
private val Document.firstNonHeadingParagraph: String?
?.second
private val Document.firstNonHeadingParagraph: String?
- get() = body().
children(
)
- .filter { it.
children().all { it is TextNode }
}
+ get() = body().
select("div, p"
)
+ .filter { it.
textNodes().isNotEmpty()
}
.map { it to it.text() }
.map { it to it.text() }
- .filterNot { it.second == "" }
- .firstOrNull { !it.first.tagName().startsWith("h", ignoreCase = true) }
+ .firstOrNull { it.second != "" }
?.second
private val Int.human get() = when (this) {
?.second
private val Int.human get() = when (this) {