From e8ec125278f09c463ed1ea849b0bf03335639f8a Mon Sep 17 00:00:00 2001 From: quambene Date: Thu, 19 Dec 2024 02:56:12 +0100 Subject: [PATCH] Fix line breaks for paragraphs (#95) * Add patch * Update dependencies * Migrate to readability-rs * Update toolchain * Allow dead code * Update readability * Initialize regex once * Remove clone * Elide lifetimes --- Cargo.lock | 133 ++++++++++++++++----------- Cargo.toml | 2 +- rust-toolchain.toml | 2 +- src/bookmark_reader/chromium.rs | 2 +- src/bookmark_reader/firefox.rs | 2 +- src/bookmark_reader/safari.rs | 2 +- src/bookmark_reader/target_writer.rs | 1 + src/errors.rs | 3 +- src/html.rs | 27 +++++- tests/common.rs | 1 + 10 files changed, 109 insertions(+), 66 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9cd05c7..f726d7c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 3 +version = 4 [[package]] name = "addr2line" @@ -111,9 +111,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.93" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c95c10ba0b00a02636238b814946408b1322d5ac4760326e6fb8ec956d85775" +checksum = "c1fd03a028ef38ba2276dce7e33fcd6369c158a1bca17946c4b1b701891c1ff7" [[package]] name = "assert-json-diff" @@ -252,7 +252,7 @@ dependencies = [ "parking_lot", "plist", "predicates", - "readability", + "readability-rs", "regex", "reqwest", "rlimit", @@ -261,7 +261,7 @@ dependencies = [ "serde_json", "similar", "tempfile", - "thiserror", + "thiserror 1.0.69", "tokio", "url", "uuid", @@ -305,9 +305,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.2.2" +version = "1.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f34d93e62b03caf570cccc334cbc6c2fceca82f39211051345108adcba3eebdc" +checksum = "27f657647bcff5394bf56c7317665bbf790a137a50eaaa5c6bfbb9e27a518f2d" dependencies = [ "shlex", ] @@ -378,9 +378,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.21" +version = "4.5.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb3b4b9e5a7c7514dfa52869339ee98b3156b0bfb4e8a77c4ff4babb64b1604f" +checksum = "3135e7ec2ef7b10c6ed8950f0f792ed96ee093fa088608f1c76e569722700c84" dependencies = [ "clap_builder", "clap_derive", @@ -388,9 +388,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.21" +version = "4.5.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b17a95aa67cc7b5ebd32aa5370189aa0d79069ef1c64ce893bd30fb24bff20ec" +checksum = "30582fc632330df2bd26877bde0c1f4470d57c582bbc070376afcd04d8cb4838" dependencies = [ "anstream", "anstyle", @@ -412,9 +412,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.7.3" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7" +checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" [[package]] name = "cloudabi" @@ -751,9 +751,9 @@ dependencies = [ [[package]] name = "fastrand" -version = "2.2.0" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "486f806e73c5707928240ddc295403b1b93c96a02038563881c4a2fd84b81ac4" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "float-cmp" @@ -1036,7 +1036,7 @@ dependencies = [ "ipnet", "once_cell", "rand 0.8.5", - "thiserror", + "thiserror 1.0.69", "tinyvec", "tokio", "tracing", @@ -1059,7 +1059,7 @@ dependencies = [ "rand 0.8.5", "resolv-conf", "smallvec", - "thiserror", + "thiserror 1.0.69", "tokio", "tracing", ] @@ -1389,9 +1389,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.6.0" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da" +checksum = "62f822373a4fe84d4bb149bf54e584a7f4abec90e072ed49cda0edea5b95471f" dependencies = [ "equivalent", "hashbrown", @@ -1481,7 +1481,7 @@ dependencies = [ "combine", "jni-sys", "log", - "thiserror", + "thiserror 1.0.69", "walkdir", ] @@ -1493,10 +1493,11 @@ checksum = "8eaf4bc02d17cbdd7ff4c7438cafcdf7fb9a4613313ad11b4f8fefe7d3fa0130" [[package]] name = "js-sys" -version = "0.3.73" +version = "0.3.76" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb15147158e79fd8b8afd0252522769c4f48725460b37338544d8379d94fc8f9" +checksum = "6717b6b5b077764fb5966237269cb3c64edddde4b14ce42647430a78ced9e7b7" dependencies = [ + "once_cell", "wasm-bindgen", ] @@ -2299,16 +2300,17 @@ dependencies = [ ] [[package]] -name = "readability" -version = "0.3.0" +name = "readability-rs" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e56596e20a6d3cf715182d9b6829220621e6e985cec04d00410cee29821b4220" +checksum = "5a17841ca2fc1c3e2aed7c44b29121ab099176923c0ac55d9906edea8ab025bc" dependencies = [ "html5ever 0.26.0", "lazy_static", + "log", "markup5ever_rcdom", "regex", - "reqwest", + "thiserror 2.0.7", "url", ] @@ -2329,7 +2331,7 @@ checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43" dependencies = [ "getrandom 0.2.15", "libredox", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -2435,15 +2437,15 @@ checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" [[package]] name = "rustix" -version = "0.38.41" +version = "0.38.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7f649912bc1495e167a6edee79151c84b1bad49748cb4f1f1167f459f6224f6" +checksum = "f93dc38ecbab2eb790ff964bb77fa94faf256fd3e73285fd7ba0903b76bedb85" dependencies = [ "bitflags 2.6.0", "errno", "libc", "linux-raw-sys", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -2583,7 +2585,7 @@ checksum = "c7715380eec75f029a4ef7de39a9200e0a63823176b759d055b613f5a87df6a6" dependencies = [ "percent-encoding", "serde", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -2804,7 +2806,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28cce251fcbc87fac86a866eeb0d6c2d536fc16d06f184bb61aeae11aa4cee0c" dependencies = [ "cfg-if", - "fastrand 2.2.0", + "fastrand 2.3.0", "once_cell", "rustix", "windows-sys 0.59.0", @@ -2842,7 +2844,16 @@ version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" dependencies = [ - "thiserror-impl", + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93605438cbd668185516ab499d589afb7ee1859ea3d5fc8f6b0755e1c7443767" +dependencies = [ + "thiserror-impl 2.0.7", ] [[package]] @@ -2856,11 +2867,22 @@ dependencies = [ "syn 2.0.90", ] +[[package]] +name = "thiserror-impl" +version = "2.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1d8749b4531af2117677a5fcd12b1348a3fe2b81e36e61ffeac5c4aa3273e36" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.90", +] + [[package]] name = "time" -version = "0.3.36" +version = "0.3.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885" +checksum = "35e7868883861bd0e56d9ac6efcaaca0d6d5d82a2a7ec8209ff492c07cf37b21" dependencies = [ "deranged", "itoa", @@ -2879,9 +2901,9 @@ checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" [[package]] name = "time-macros" -version = "0.2.18" +version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf" +checksum = "2834e6017e3e5e4b9834939793b282bc03b37a3336245fa820e35e233e2a85de" dependencies = [ "num-conv", "time-core", @@ -2924,9 +2946,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.41.1" +version = "1.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22cfb5bee7a6a52939ca9224d6ac897bb669134078daa8735560897f69de4d33" +checksum = "5cec9b21b0450273377fc97bd4c33a8acffc8c996c987a7c5b319a0083707551" dependencies = [ "backtrace", "bytes", @@ -2962,9 +2984,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.12" +version = "0.7.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61e7c3654c13bcd040d4a03abee2c75b1d14a37b423cf5a813ceae1cc903ec6a" +checksum = "d7fcaa8d55a2bdd6b83ace262b016eca0d79ee02818c5c1bcdf0305114081078" dependencies = [ "bytes", "futures-core", @@ -3148,9 +3170,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.96" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21d3b25c3ea1126a2ad5f4f9068483c2af1e64168f847abe863a526b8dbfe00b" +checksum = "a474f6281d1d70c17ae7aa6a613c87fce69a127e2624002df63dcb39d6cf6396" dependencies = [ "cfg-if", "once_cell", @@ -3159,13 +3181,12 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.96" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52857d4c32e496dc6537646b5b117081e71fd2ff06de792e3577a150627db283" +checksum = "5f89bb38646b4f81674e8f5c3fb81b562be1fd936d84320f3264486418519c79" dependencies = [ "bumpalo", "log", - "once_cell", "proc-macro2", "quote", "syn 2.0.90", @@ -3174,9 +3195,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.46" +version = "0.4.49" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "951fe82312ed48443ac78b66fa43eded9999f738f6022e67aead7b708659e49a" +checksum = "38176d9b44ea84e9184eff0bc34cc167ed044f816accfe5922e54d84cf48eca2" dependencies = [ "cfg-if", "js-sys", @@ -3187,9 +3208,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.96" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "920b0ffe069571ebbfc9ddc0b36ba305ef65577c94b06262ed793716a1afd981" +checksum = "2cc6181fd9a7492eef6fef1f33961e3695e4579b9872a6f7c83aee556666d4fe" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -3197,9 +3218,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.96" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf59002391099644be3524e23b781fa43d2be0c5aa0719a18c0731b9d195cab6" +checksum = "30d7a95b763d3c45903ed6c81f156801839e5ee968bb07e534c44df0fcd330c2" dependencies = [ "proc-macro2", "quote", @@ -3210,15 +3231,15 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.96" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5047c5392700766601942795a436d7d2599af60dcc3cc1248c9120bfb0827b0" +checksum = "943aab3fdaaa029a6e0271b35ea10b72b943135afe9bffca82384098ad0e06a6" [[package]] name = "web-sys" -version = "0.3.73" +version = "0.3.76" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "476364ff87d0ae6bfb661053a9104ab312542658c3d8f963b7ace80b6f9b26b9" +checksum = "04dd7223427d52553d3702c004d3b2fe07c148165faa56313cb00211e31c12bc" dependencies = [ "js-sys", "wasm-bindgen", diff --git a/Cargo.toml b/Cargo.toml index fe66a55..d3cd6f9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,7 +35,7 @@ serde = { version = "1.0.175", features = ["derive"] } serde_json = "1.0.103" html5ever = "0.24.1" html2md = "0.2.14" -readability = "0.3.0" +readability-rs = "0.5.0" scraper = "0.18.1" regex = "1.9.1" colored = "2.0.4" diff --git a/rust-toolchain.toml b/rust-toolchain.toml index af663bd..2a53366 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,3 +1,3 @@ [toolchain] -channel = "1.76" +channel = "1.83" profile = "default" diff --git a/src/bookmark_reader/chromium.rs b/src/bookmark_reader/chromium.rs index 54ad81e..386e27e 100644 --- a/src/bookmark_reader/chromium.rs +++ b/src/bookmark_reader/chromium.rs @@ -129,7 +129,7 @@ impl ChromiumReader { } } -impl<'a> ReadBookmark<'a> for ChromiumReader { +impl ReadBookmark<'_> for ChromiumReader { type ParsedValue = serde_json::Value; fn name(&self) -> SourceType { diff --git a/src/bookmark_reader/firefox.rs b/src/bookmark_reader/firefox.rs index a05e2ca..02f130d 100644 --- a/src/bookmark_reader/firefox.rs +++ b/src/bookmark_reader/firefox.rs @@ -192,7 +192,7 @@ impl FirefoxReader { } } -impl<'a> ReadBookmark<'a> for FirefoxReader { +impl ReadBookmark<'_> for FirefoxReader { type ParsedValue = serde_json::Value; fn name(&self) -> SourceType { diff --git a/src/bookmark_reader/safari.rs b/src/bookmark_reader/safari.rs index 0bf282f..b5773da 100644 --- a/src/bookmark_reader/safari.rs +++ b/src/bookmark_reader/safari.rs @@ -100,7 +100,7 @@ impl SafariReader { } } -impl<'a> ReadBookmark<'a> for SafariReader { +impl ReadBookmark<'_> for SafariReader { type ParsedValue = plist::Value; fn name(&self) -> SourceType { diff --git a/src/bookmark_reader/target_writer.rs b/src/bookmark_reader/target_writer.rs index 769fa0f..757e93b 100644 --- a/src/bookmark_reader/target_writer.rs +++ b/src/bookmark_reader/target_writer.rs @@ -3,6 +3,7 @@ use std::io::{Seek, Write}; /// Extension trait for [`Write`] and [`Seek`] to read target bookmarks. pub trait WriteTarget { + #[allow(dead_code)] fn write(&mut self, target_bookmarks: &TargetBookmarks) -> Result<(), BogrepError>; } diff --git a/src/errors.rs b/src/errors.rs index 4da75a9..90e854d 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -1,3 +1,4 @@ +use readability::ReadabilityError; use reqwest::header::ToStrError; use std::{io, string::FromUtf8Error}; use thiserror::Error; @@ -28,7 +29,7 @@ pub enum BogrepError { #[error("Can't parse html")] ParseHtml(String), #[error("Can't convert html: {0}")] - ConvertHtml(readability::error::Error), + ConvertHtml(ReadabilityError), #[error("Invalid utf8: {0}")] ConvertUtf8(#[from] FromUtf8Error), #[error("Can't read from HTML: {0}")] diff --git a/src/html.rs b/src/html.rs index b1635bc..6188410 100644 --- a/src/html.rs +++ b/src/html.rs @@ -7,10 +7,15 @@ use html5ever::{ ParseOpts, QualName, }; use log::{debug, trace}; -use readability::extractor; +use readability::{extract, ExtractOptions, ScorerOptions}; +use regex::Regex; use reqwest::Url; use scraper::{Html, Selector}; -use std::{borrow::BorrowMut, io::Cursor, rc::Rc}; +use std::{borrow::BorrowMut, io::Cursor, rc::Rc, sync::OnceLock}; + +static UNLIKELY_CANDIDATES: OnceLock = OnceLock::new(); +static NEGATIVE_CANDIDATES: OnceLock = OnceLock::new(); +static POSITIVE_CANDIDATES: OnceLock = OnceLock::new(); pub fn filter_html(html: &str) -> Result { let dom = parse_document(RcDom::default(), ParseOpts::default()) @@ -97,8 +102,22 @@ fn is_filtered_tag(tag_name: &QualName) -> bool { pub fn convert_to_text(html: &str, bookmark_url: &Url) -> Result { let mut cursor = Cursor::new(html); - let product = - extractor::extract(&mut cursor, bookmark_url).map_err(BogrepError::ConvertHtml)?; + let options = ExtractOptions { parse_options: Default::default(), scorer_options: ScorerOptions { + unlikely_candidates: UNLIKELY_CANDIDATES.get_or_init(|| { + Regex::new( + "combx|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter|ssba", + ) + .unwrap() + }), + negative_candidates: NEGATIVE_CANDIDATES.get_or_init(|| { + Regex::new("combx|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget|form|textfield|uiScale|hidden").unwrap() + }), + positive_candidates: POSITIVE_CANDIDATES.get_or_init(|| { + Regex::new("article|body|content|entry|hentry|main|page|pagination|post|blog|story").unwrap() + }), + ..Default::default() + }}; + let product = extract(&mut cursor, bookmark_url, options).map_err(BogrepError::ConvertHtml)?; Ok(product.text) } diff --git a/tests/common.rs b/tests/common.rs index 0b807cb..11c4cf6 100644 --- a/tests/common.rs +++ b/tests/common.rs @@ -72,6 +72,7 @@ pub async fn mount_mocks(mock_server: &MockServer, num_mocks: u32) -> HashMap