Skip to content

Commit

Permalink
perf(sift): add one pass convert
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Nov 15, 2024
1 parent 81ef4ed commit 5164df3
Show file tree
Hide file tree
Showing 5 changed files with 67 additions and 6 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion fast_html2md/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "fast_html2md"
version = "0.0.35"
version = "0.0.37"
edition = "2021"
description = "A fast html2md crate for rust"
categories = ["development-tools", "parsing", "parser-implementations"]
Expand Down
36 changes: 36 additions & 0 deletions fast_html2md/src/extended/sifter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,43 @@ pub trait WhitespaceSifter: AsRef<str> {
}
}

/// Whitespace-sifting helpers for any value that can be viewed as a byte slice.
pub trait WhitespaceSifterBytes: AsRef<[u8]> {
    /// Removes duplicate whitespaces from the byte input and returns the result as a `String`.
    /// It supports the same whitespace definition as [char::is_ascii_whitespace].
    #[must_use]
    fn sift_bytes(&self) -> String {
        let source = self.as_ref();
        // Reserve the input's byte length up front for the output buffer.
        let mut sifted = String::with_capacity(source.len());
        sift_preallocated(source, &mut sifted);
        sifted
    }

    /// Removes duplicate whitespaces from the byte input while keeping
    /// deduplicated newlines, and returns the result as a `String`.
    ///
    /// A single trailing line break (`"\r\n"` or `"\n"`) is removed from the
    /// end of the output.
    #[must_use]
    fn sift_bytes_preserve_newlines(&self) -> String {
        let source = self.as_ref();
        let mut sifted = String::with_capacity(source.len());
        let mut cursor = 0_usize;

        // The helper receives `cursor` mutably and is expected to advance it;
        // the loop stops once the cursor reaches the end of the input.
        while cursor < source.len() {
            sift_preallocated_until_newline(source, &mut cursor, &mut sifted);
        }

        // Drop one trailing line break, preferring the two-byte CRLF form.
        let trimmed_len = sifted
            .strip_suffix("\r\n")
            .or_else(|| sifted.strip_suffix('\n'))
            .map(str::len);
        if let Some(len) = trimmed_len {
            sifted.truncate(len);
        }

        sifted
    }
}

// Blanket impl: every type that can be viewed as `&str` gets the
// `WhitespaceSifter` methods through the trait's default implementations.
impl<T: AsRef<str>> WhitespaceSifter for T {}
// Blanket impl: every type that can be viewed as `&[u8]` gets the
// `WhitespaceSifterBytes` methods through the trait's default implementations.
impl<T: AsRef<[u8]>> WhitespaceSifterBytes for T {}

/// A custom implementation of `str::trim_start`.
fn sift_trim_start(bytes: &[u8], ind: &mut usize, out: &mut String) {
Expand Down
10 changes: 9 additions & 1 deletion fast_html2md/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use extended::sifter::WhitespaceSifterBytes;
use html5ever::driver::ParseOpts;
use html5ever::parse_document;
use html5ever::tendril::TendrilSink;
Expand Down Expand Up @@ -404,7 +405,14 @@ fn escape_markdown(result: &StructuredPrinter, text: &str) -> String {
///
/// Clears excessive punctuation that would be trimmed by renderer anyway
pub fn clean_markdown(input: &str) -> String {
input.sift().into()
input.sift()
}

/// Called after all processing has been finished
///
/// Clears excessive punctuation that would be trimmed by renderer anyway
pub fn clean_markdown_bytes(input: &Vec<u8>) -> String {
input.sift_bytes()
}

/// Intermediate result of HTML -> Markdown conversion.
Expand Down
23 changes: 20 additions & 3 deletions fast_html2md/src/rewriter/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@ use super::images::rewrite_image_element;
use super::lists::handle_list_or_item;
use super::quotes::{rewrite_blockquote_element, rewrite_blockquote_text};
use super::styles::rewrite_style_element;
use crate::{clean_markdown, escape_markdown_base};
use crate::clean_markdown_bytes;
use lol_html::html_content::ContentType::{Html, Text};
use lol_html::html_content::Element;
use lol_html::{doc_comments, text};
use lol_html::{element, rewrite_str, RewriteStrSettings};
use lol_html::{element, RewriteStrSettings};
use std::cell::RefCell;
use std::rc::Rc;
use url::Url;
Expand Down Expand Up @@ -270,7 +270,24 @@ pub(crate) fn convert_html_to_markdown(
let settings = get_rewriter_settings(commonmark, custom, url.clone());

match rewrite_str(&Box::new(html), settings) {
Ok(markdown) => Ok(clean_markdown(&markdown)),
Ok(markdown) => Ok(clean_markdown_bytes(&markdown)),
Err(e) => Err(e.into()),
}
}

/// Shortcut that runs `html` through an [`lol_html::HtmlRewriter`] built from
/// `settings` and returns the rewritten output as raw bytes.
///
/// The bytes emitted by the rewriter are collected as-is; no conversion to a
/// `String` happens here, so callers decide how to interpret the output.
///
/// # Errors
///
/// Returns the [`lol_html::errors::RewritingError`] produced by the rewriter
/// if writing the input or finalizing the rewrite fails.
pub fn rewrite_str<'h, 's, H: lol_html::HandlerTypes>(
    html: &str,
    settings: impl Into<lol_html::Settings<'h, 's, H>>,
) -> Result<Vec<u8>, lol_html::errors::RewritingError> {
    // The final size is not known ahead of time; the input length is used as a
    // capacity hint so the buffer does not have to regrow chunk by chunk.
    let mut output: Vec<u8> = Vec::with_capacity(html.len());

    // The output sink appends every chunk handed over by the rewriter.
    let mut rewriter = lol_html::HtmlRewriter::new(settings.into(), |chunk: &[u8]| {
        output.extend_from_slice(chunk);
    });

    rewriter.write(html.as_bytes())?;
    // The rewriter is not used after `end`, so `output` can be returned below.
    rewriter.end()?;

    Ok(output)
}

0 comments on commit 5164df3

Please sign in to comment.