projects
/
Sone.git
/ blobdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
🚸 Improve text extraction even further
[Sone.git]
/
src
/
main
/
kotlin
/
net
/
pterodactylus
/
sone
/
core
/
DefaultElementLoader.kt
diff --git a/src/main/kotlin/net/pterodactylus/sone/core/DefaultElementLoader.kt b/src/main/kotlin/net/pterodactylus/sone/core/DefaultElementLoader.kt
index 88cb1f5..69872c5 100644
(file)
--- a/src/main/kotlin/net/pterodactylus/sone/core/DefaultElementLoader.kt
+++ b/src/main/kotlin/net/pterodactylus/sone/core/DefaultElementLoader.kt
@@ -99,10 +99,10 @@ private val Document.metaDescription: String?
         ?.second
 
 private val Document.firstNonHeadingParagraph: String?
-    get() = body().children()
+    get() = body().select("div, p")
+        .filter { it.textNodes().isNotEmpty() }
         .map { it to it.text() }
-        .filterNot { it.second == "" }
-        .firstOrNull { !it.first.tagName().startsWith("h", ignoreCase = true) }
+        .firstOrNull { it.second != "" }
         ?.second
 
 private val Int.human get() = when (this) {