From 81bdef0713afd1f2f0ea389b31a49b46ba7c4157 Mon Sep 17 00:00:00 2001
From: jvoisin
Date: Thu, 26 Dec 2024 16:37:44 +0100
Subject: [PATCH] Replace whatlanggo with an ad-hoc implementation

The package github.com/abadojack/whatlanggo has been unmaintained for 5
years, is overkill for simply detecting CJK, and is quite slow.

CJK detection now samples the first 50 non-whitespace runes of the
sanitized content and treats it as CJK when at least half of them
belong to a CJK script.
---
 go.mod                                     |  1 -
 go.sum                                     |  2 --
 internal/reader/readingtime/readingtime.go | 45 ++++++++++++++--------
 3 files changed, 28 insertions(+), 20 deletions(-)

diff --git a/go.mod b/go.mod
index 38341a407d4..50d4cfa6d66 100644
--- a/go.mod
+++ b/go.mod
@@ -4,7 +4,6 @@ module miniflux.app/v2
 
 require (
 	github.com/PuerkitoBio/goquery v1.10.0
-	github.com/abadojack/whatlanggo v1.0.1
 	github.com/andybalholm/brotli v1.1.1
 	github.com/coreos/go-oidc/v3 v3.11.0
 	github.com/go-webauthn/webauthn v0.11.2
diff --git a/go.sum b/go.sum
index 68a5ed1c2f8..28407d82c23 100644
--- a/go.sum
+++ b/go.sum
@@ -1,7 +1,5 @@
 github.com/PuerkitoBio/goquery v1.10.0 h1:6fiXdLuUvYs2OJSvNRqlNPoBm6YABE226xrbavY5Wv4=
 github.com/PuerkitoBio/goquery v1.10.0/go.mod h1:TjZZl68Q3eGHNBA8CWaxAN7rOU1EbDz3CWuolcO5Yu4=
-github.com/abadojack/whatlanggo v1.0.1 h1:19N6YogDnf71CTHm3Mp2qhYfkRdyvbgwWdd2EPxJRG4=
-github.com/abadojack/whatlanggo v1.0.1/go.mod h1:66WiQbSbJBIlOZMsvbKe5m6pzQovxCH9B/K8tQB2uoc=
 github.com/andybalholm/brotli v1.1.1 h1:PR2pgnyFznKEugtsUo0xLdDop5SKXd5Qf5ysW+7XdTA=
 github.com/andybalholm/brotli v1.1.1/go.mod h1:05ib4cKhjx3OQYUY22hTVd34Bc8upXjOLL2rKwwZBoA=
 github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
diff --git a/internal/reader/readingtime/readingtime.go b/internal/reader/readingtime/readingtime.go
index 9159ee710c8..6175718c420 100644
--- a/internal/reader/readingtime/readingtime.go
+++ b/internal/reader/readingtime/readingtime.go
@@ -7,33 +7,44 @@ package readingtime
 import (
 	"math"
 	"strings"
+	"unicode"
 	"unicode/utf8"
 
 	"miniflux.app/v2/internal/reader/sanitizer"
-
-	"github.com/abadojack/whatlanggo"
 )
 
 // EstimateReadingTime returns the estimated reading time of an article in minute.
 func EstimateReadingTime(content string, defaultReadingSpeed, cjkReadingSpeed int) int {
 	sanitizedContent := sanitizer.StripTags(content)
 
-	// Litterature on language detection says that around 100 signes is enough, we're safe here.
-	truncationPoint := min(len(sanitizedContent), 250)
-
-	// We're only interested in identifying Japanse/Chinese/Korean
-	options := whatlanggo.Options{
-		Whitelist: map[whatlanggo.Lang]bool{
-			whatlanggo.Jpn: true,
-			whatlanggo.Cmn: true,
-			whatlanggo.Kor: true,
-		},
+	if isCJK(sanitizedContent) {
+		return int(math.Ceil(float64(utf8.RuneCountInString(sanitizedContent)) / float64(cjkReadingSpeed)))
 	}
-	langInfo := whatlanggo.DetectWithOptions(sanitizedContent[:truncationPoint], options)
+	return int(math.Ceil(float64(len(strings.Fields(sanitizedContent))) / float64(defaultReadingSpeed)))
+}
 
-	if langInfo.IsReliable() {
-		return int(math.Ceil(float64(utf8.RuneCountInString(sanitizedContent)) / float64(cjkReadingSpeed)))
+// cjkSampleSize is the number of leading non-whitespace runes inspected by isCJK.
+const cjkSampleSize = 50
+
+// isCJK reports whether at least half of the first cjkSampleSize non-whitespace
+// runes of text belong to a Chinese, Japanese or Korean script.
+func isCJK(text string) bool {
+	totalCJK, sampled := 0, 0
+
+	// Range over runes rather than slicing bytes, so multi-byte characters are never split.
+	for _, r := range text {
+		if unicode.IsSpace(r) {
+			continue
+		}
+		sampled++
+		if unicode.In(r, unicode.Han, unicode.Hangul, unicode.Hiragana, unicode.Katakana, unicode.Yi, unicode.Bopomofo) {
+			totalCJK++
+		}
+		if sampled == cjkSampleSize {
+			break
+		}
 	}
-	nbOfWords := len(strings.Fields(sanitizedContent))
-	return int(math.Ceil(float64(nbOfWords) / float64(defaultReadingSpeed)))
+
+	// If at least 50% of the sampled runes are CJK, odds are that the text is in CJK.
+	return sampled > 0 && 2*totalCJK >= sampled
 }