diff --git a/Cargo.lock b/Cargo.lock index e48f153..c3942f9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -381,9 +381,10 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "fast_html2md" -version = "0.0.43" +version = "0.0.44" dependencies = [ "auto_encoder", + "futures-util", "html5ever", "indoc", "lazy_static", @@ -394,7 +395,6 @@ dependencies = [ "regex", "spectral", "tokio", - "tokio-stream", "url", ] @@ -1727,17 +1727,6 @@ dependencies = [ "syn 2.0.96", ] -[[package]] -name = "tokio-stream" -version = "0.1.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eca58d7bba4a75707817a2c44174253f9236b2d5fbd055602e9d5c07c139a047" -dependencies = [ - "futures-core", - "pin-project-lite", - "tokio", -] - [[package]] name = "unicode-ident" version = "1.0.15" diff --git a/README.md b/README.md index d3f3f5c..13ada0a 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ assert_eq!(md, "JAMES"); ### With Async Streaming -For handling large or concurrent workloads, use async streaming. Ensure you have a tokio async runtime: +For handling large or concurrent workloads, use async streaming with the `stream` and `rewriter` features. Ensure you have a tokio async runtime: ```rust let md = html2md::rewrite_html_streaming("

JAMES

", false).await; @@ -43,8 +43,9 @@ assert_eq!(md, "JAMES"); ## Features -- **Rewriter:** High performance transformation using the `rewriter` feature (default). -- **Scraper:** Alternative approach for HTML parsing with the `scraper` feature. +- **rewriter:** High performance transformation using the `rewriter` feature (default). +- **scraper:** Alternative approach for HTML parsing with the `scraper` feature. +- **stream:** enables streaming chunks for rewriter. ### About diff --git a/benches/Cargo.toml b/benches/Cargo.toml index 9b3bfb1..a6605d7 100644 --- a/benches/Cargo.toml +++ b/benches/Cargo.toml @@ -6,7 +6,7 @@ edition = "2021" [dependencies] criterion = { version = "0.5", features = ["html_reports", "async_tokio"] } -fast_html2md = { path = "../fast_html2md", version = "0", features = ["tokio", "scraper"] } +fast_html2md = { path = "../fast_html2md", version = "0", features = ["stream", "scraper"] } tokio = { version = "1", features = [ "full" ] } [[bench]] diff --git a/fast_html2md/Cargo.toml b/fast_html2md/Cargo.toml index d0b2f61..87051d2 100644 --- a/fast_html2md/Cargo.toml +++ b/fast_html2md/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "fast_html2md" -version = "0.0.43" +version = "0.0.44" edition = "2021" description = "A fast html2md crate for rust" categories = ["development-tools", "parsing", "parser-implementations"] @@ -28,8 +28,7 @@ url = "2" markup5ever_rcdom = { version = "0.3.0", optional = true } html5ever = { version = "0.27", optional = true } lol_html = { version = "2", optional = true } -tokio = { version = "1", features = ["sync"], optional = true } -tokio-stream = { version = "0.1", optional = true } +futures-util = { version = "0.3", optional = true, default-features = false } [dev-dependencies] spectral = "0.6.0" @@ -41,4 +40,4 @@ tokio = { version = "1", features = ["full"] } default = ["rewriter"] rewriter = ["dep:lol_html"] scraper = ["dep:html5ever", "dep:markup5ever_rcdom"] -tokio = ["dep:tokio", "dep:tokio-stream"] +stream = 
["dep:futures-util"] diff --git a/fast_html2md/src/lib.rs b/fast_html2md/src/lib.rs index 97c0b53..ddbf2b5 100644 --- a/fast_html2md/src/lib.rs +++ b/fast_html2md/src/lib.rs @@ -1,7 +1,6 @@ use extended::sifter::{WhitespaceSifter, WhitespaceSifterBytes}; use lazy_static::lazy_static; use regex::Regex; -use url::Url; // we want to just use the rewriter instead for v0.1. pub mod extended; @@ -40,7 +39,7 @@ pub fn rewrite_html(html: &str, commonmark: bool) -> String { /// and returns converted string. Incomplete work in progress for major performance increases. /// # Arguments /// `html` is source HTML as `String` -#[cfg(all(feature = "tokio", feature = "rewriter"))] +#[cfg(all(feature = "stream", feature = "rewriter"))] pub async fn rewrite_html_streaming(html: &str, commonmark: bool) -> String { rewriter::writer::convert_html_to_markdown_send(html, &None, commonmark, &None) .await @@ -55,12 +54,12 @@ pub async fn rewrite_html_streaming(html: &str, commonmark: bool) -> String { /// `custom` is custom tag hadler producers for tags you want, can be empty /// `commonmark` is for adjusting markdown output to commonmark /// `url` is used to provide absolute url handling -#[cfg(all(feature = "tokio", feature = "rewriter"))] +#[cfg(all(feature = "stream", feature = "rewriter"))] pub fn rewrite_html_custom_with_url( html: &str, custom: &Option>, commonmark: bool, - url: &Option, + url: &Option, ) -> String { rewriter::writer::convert_html_to_markdown(html, &custom, commonmark, url).unwrap_or_default() } @@ -74,12 +73,12 @@ pub fn rewrite_html_custom_with_url( /// `commonmark` is for adjusting markdown output to commonmark /// `url` is used to provide absolute url handling /// `chunk_size` the chunk size to use. 
-#[cfg(all(feature = "tokio", feature = "rewriter"))] +#[cfg(all(feature = "stream", feature = "rewriter"))] pub async fn rewrite_html_custom_with_url_and_chunk( html: &str, custom: &Option>, commonmark: bool, - url: &Option, + url: &Option, chunk_size: usize, ) -> String { rewriter::writer::convert_html_to_markdown_send_with_size( @@ -97,12 +96,12 @@ pub async fn rewrite_html_custom_with_url_and_chunk( /// `custom` is custom tag hadler producers for tags you want, can be empty /// `commonmark` is for adjusting markdown output to commonmark /// `url` is used to provide absolute url handling -#[cfg(all(feature = "tokio", feature = "rewriter"))] +#[cfg(all(feature = "stream", feature = "rewriter"))] pub async fn rewrite_html_custom_with_url_streaming( html: &str, custom: &Option>, commonmark: bool, - url: &Option, + url: &Option, ) -> String { rewriter::writer::convert_html_to_markdown_send(html, &custom, commonmark, url) .await diff --git a/fast_html2md/src/rewriter/writer.rs b/fast_html2md/src/rewriter/writer.rs index b1a0428..e24aa28 100644 --- a/fast_html2md/src/rewriter/writer.rs +++ b/fast_html2md/src/rewriter/writer.rs @@ -198,7 +198,7 @@ pub(crate) fn convert_html_to_markdown( } /// Convert to markdown streaming re-writer with chunk size. 
-#[cfg(feature = "tokio")] +#[cfg(feature = "stream")] pub async fn convert_html_to_markdown_send_with_size( html: &str, custom: &Option>, @@ -206,7 +206,7 @@ pub async fn convert_html_to_markdown_send_with_size( url: &Option, chunk_size: usize, ) -> Result> { - use tokio_stream::StreamExt; + use futures_util::stream::{self, StreamExt}; let settings = get_rewriter_settings_send(commonmark, custom, url.clone()); let mut rewrited_bytes: Vec = Vec::new(); @@ -215,12 +215,8 @@ pub async fn convert_html_to_markdown_send_with_size( rewrited_bytes.extend_from_slice(&c); }); - let html_bytes = html.as_bytes(); - let chunks = html_bytes.chunks(chunk_size); - - let mut stream = tokio_stream::iter(chunks); - let mut wrote_error = false; + let mut stream = stream::iter(html.as_bytes().chunks(chunk_size)); while let Some(chunk) = stream.next().await { if rewriter.write(chunk).is_err() { @@ -237,7 +233,7 @@ pub async fn convert_html_to_markdown_send_with_size( } /// Convert to markdown streaming re-writer -#[cfg(feature = "tokio")] +#[cfg(feature = "stream")] pub async fn convert_html_to_markdown_send( html: &str, custom: &Option>, diff --git a/fast_html2md/tests/integration.rs b/fast_html2md/tests/integration.rs index 299e863..76b5733 100644 --- a/fast_html2md/tests/integration.rs +++ b/fast_html2md/tests/integration.rs @@ -102,7 +102,7 @@ fn test_real_world_wiki_rewriter() -> Result<(), Box> { } #[tokio::test] -#[cfg(all(feature = "tokio", feature = "rewriter"))] +#[cfg(all(feature = "stream", feature = "rewriter"))] async fn test_real_world_wiki_async() -> Result<(), Box> { use std::error::Error; use std::fs::{self, File}; @@ -279,7 +279,7 @@ fn test_html_from_text() { } #[test] -#[cfg(all(feature = "tokio", feature = "rewriter"))] +#[cfg(all(feature = "stream", feature = "rewriter"))] fn test_html_from_text_rewrite() { let mut html = Box::new(String::new()); let mut html_file = File::open("../test-samples/real-world-1.html").unwrap(); @@ -512,7 +512,7 @@ fn 
test_real_spider() { #[tokio::test] #[ignore] -#[cfg(all(feature = "tokio", feature = "rewriter"))] +#[cfg(all(feature = "stream", feature = "rewriter"))] async fn test_real_spider_async() { let mut html = String::new(); let mut html_file: File = File::open("../test-samples/spider-cloud.html").unwrap();