Skip to content

Commit

Permalink
chore(tables): revert onepass table write
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Nov 11, 2024
1 parent 5b08a87 commit 2014ea5
Show file tree
Hide file tree
Showing 9 changed files with 73,438 additions and 7,445 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "fast_html2md"
version = "0.0.22"
version = "0.0.24"
edition = "2021"
description = "A fast html2md crate for rust"
categories = ["development-tools", "parsing", "parser-implementations"]
Expand Down
126 changes: 83 additions & 43 deletions src/tables.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ pub struct TableHandler {
const TD: LocalName = html5ever::local_name!("td");
const TH: LocalName = html5ever::local_name!("th");

const TABLE_LIMIT: usize = 1000;

impl TableHandler {
/// A new table handler.
pub fn new(commonmark: bool, url: Option<std::sync::Arc<Url>>) -> Self {
Expand Down Expand Up @@ -52,33 +54,47 @@ impl TagHandler for TableHandler {

column_widths = vec![3; column_count];

// header row must always be present
// detect max column width
for (idx, row) in rows.iter().enumerate() {
table_markup.push('|');
let cells = collect_children(row, any_matcher);
if idx >= TABLE_LIMIT {
break;
}

let mut inner_table_markup = String::new();
let cells = collect_children(row, any_matcher);

for index in 0..column_count {
if index >= 10000 {
break;
}

// from regular rows
if let Some(cell) = cells.get(index) {
let mut text = to_text(cell, self.commonmark, &self.url);
let text = to_text(cell, self.commonmark, &self.url);

column_widths[index] = cmp::max(column_widths[index], text.chars().count());
}
}
}

// header row must always be present
for (idx, row) in rows.iter().enumerate() {
if idx >= TABLE_LIMIT {
break;
}

// we need to fill all cells in a column, even if some rows don't have enough
pad_cell_text(&mut text, column_widths[index]);
table_markup.push('|');

inner_table_markup.push_str(&text);
}
let cells = collect_children(row, any_matcher);

inner_table_markup.push('|');
for index in 0..column_count {
// we need to fill all cells in a column, even if some rows don't have enough
let padded_cell_text = pad_cell_text(
&cells.get(index),
column_widths[index],
self.commonmark,
&self.url,
);

table_markup.push_str(&padded_cell_text);
table_markup.push('|');
}

table_markup.push_str(&inner_table_markup);
table_markup.push('\n');

if idx == 0 {
Expand Down Expand Up @@ -137,20 +153,15 @@ impl TagHandler for TableHandler {

table_markup.push('\n');
}

if idx >= 100 {
break;
}
}

printer.insert_newline();
printer.insert_newline();
printer.append_str(&table_markup);
}
}

fn after_handle(&mut self, printer: &mut StructuredPrinter) {
printer.insert_newline();
}
fn after_handle(&mut self, _printer: &mut StructuredPrinter) {}

fn skip_descendants(&self) -> bool {
true
Expand All @@ -162,19 +173,52 @@ impl TagHandler for TableHandler {
/// `tag` - optional reference to currently processed handle, text is extracted from here
///
/// `column_width` - precomputed column width to compute padding length from
fn pad_cell_text(text: &mut String, column_width: usize) {
// Compute difference between column width and text length
let len_diff = column_width
.checked_sub(text.chars().count())
.unwrap_or_default();

if len_diff > 0 {
if len_diff > 1 {
text.insert(0, ' ');
text.push(' ');
fn pad_cell_text(
tag: &Option<&Handle>,
column_width: usize,
commonmark: bool,
url: &Option<Arc<Url>>,
) -> String {
let mut result = String::new();

if let Some(cell) = tag {
// have header at specified position
let text = to_text(cell, commonmark, url);

// compute difference between width and text length
let len_diff = column_width
.checked_sub(text.chars().count())
.unwrap_or_default();

if len_diff > 0 {
// should pad
if len_diff > 1 {
result.push(' ');
result.push_str(&text);
result.push(' ');
} else {
// it's just one space, add at the end
result.push_str(&text);
result.push(' ');
}
} else {
text.push(' ');
// shouldn't pad, text fills whole cell
result.push_str(&text);
}
} else {
// no text in this cell, fill cell with spaces
result.push(' ');
}

result
}

/// Returns the local name of the given node as an owned `String`.
///
/// `tag` - handle of the node to inspect
///
/// Only element nodes carry a name; for every other kind of node the
/// result is an empty string.
fn tag_name(tag: &Handle) -> String {
    // Guard clause: element nodes are the only variant with a qualified name.
    if let NodeData::Element { ref name, .. } = tag.data {
        return name.local.to_string();
    }

    String::new()
}

Expand All @@ -183,16 +227,12 @@ fn pad_cell_text(text: &mut String, column_width: usize) {
fn find_children(tag: &Handle, name: &str) -> Vec<Handle> {
let mut result: Vec<Handle> = vec![];
let children = tag.children.borrow();

for child in children.iter() {
if let NodeData::Element { ref name, .. } = tag.data {
if name.local == name.local {
result.push(child.clone());
}
if tag_name(child) == name {
result.push(child.clone());
}

let mut descendants = find_children(child, name);

result.append(&mut descendants);
}

Expand All @@ -205,8 +245,8 @@ fn collect_children<P>(tag: &Handle, predicate: P) -> Vec<Handle>
where
P: Fn(&Handle) -> bool,
{
let mut result: Vec<Handle> = vec![];
let children = tag.children.borrow();
let mut result: Vec<Handle> = Vec::with_capacity(children.len());

for child in children.iter() {
if predicate(child) {
Expand All @@ -219,15 +259,15 @@ where

/// Convert html tag to text. This collects all tag children in correct order where they're observed
/// and concatenates their text, recursively.
fn to_text(tag: &Handle, commonmark: bool, url: &Option<std::sync::Arc<Url>>) -> String {
fn to_text(tag: &Handle, commonmark: bool, url: &Option<Arc<Url>>) -> String {
let mut printer = StructuredPrinter::default();
walk(
tag,
&mut printer,
&HashMap::default(),
commonmark,
&url,
false,
url,
true,
);
clean_markdown(&printer.data)
}
2 changes: 1 addition & 1 deletion test-samples/wiki/en-wikipedia-org_wiki_Aspartic_acid.html
Original file line number Diff line number Diff line change
Expand Up @@ -8071,7 +8071,7 @@ <h3 id="Interactive_pathway_map">
word-wrap: break-word;
}
.mw-parser-output .citation q {
quotes: '"''"'"'" "'";
quotes: '"' '"' "'" "'";
}
.mw-parser-output .citation:target {
background-color: rgba(0, 127, 255, 0.133);
Expand Down
Loading

0 comments on commit 2014ea5

Please sign in to comment.