Skip to content

Commit

Permalink
massive update, errors handling, remove unwraps
Browse files Browse the repository at this point in the history
oiwn committed Nov 13, 2024
1 parent e8a1ef5 commit dbb412e
Showing 11 changed files with 284 additions and 69 deletions.
23 changes: 23 additions & 0 deletions .github/workflows/coverage.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
name: coverage

on: [push]
jobs:
test:
name: coverage
runs-on: ubuntu-latest
container:
image: xd009642/tarpaulin:develop-nightly
options: --security-opt seccomp=unconfined
steps:
- name: Checkout repository
uses: actions/checkout@v2

- name: Generate code coverage
run: |
cargo +nightly tarpaulin --verbose --all-features --workspace --timeout 120 --out xml
- name: Upload to codecov.io
uses: codecov/codecov-action@v2
with:
# token: ${{secrets.CODECOV_TOKEN}} # not required for public repos
fail_ci_if_error: true
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -8,3 +8,4 @@
/tmp
/data
*.profraw
all_code.txt
3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "dom-content-extraction"
version = "0.3.3"
version = "0.3.4"

description = "Rust implementation of Content extraction via text density paper"
license = "MPL-2.0"
@@ -32,6 +32,7 @@ panic = "abort"
[dependencies]
ego-tree = "0.9"
scraper = "0.21"
thiserror = "2.0.3"

[dev-dependencies]
criterion = "0.5"
2 changes: 2 additions & 0 deletions Justfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
coverage:
cargo tarpaulin
7 changes: 0 additions & 7 deletions Makefile

This file was deleted.

58 changes: 49 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
@@ -4,27 +4,61 @@
<a href="https://crates.io/crates/dom-content-extraction">
<img src="https://img.shields.io/crates/dr/dom-content-extraction" alt="Crates.io">
</a>
<a href="https://codecov.io/github/oiwn/dom-content-extraction" >
<img src="https://codecov.io/github/oiwn/dom-content-extraction/graph/badge.svg?token=6Y7IYX29OP"/>
</a>
</div>

Rust implementation of Fei Sun, Dandan Song and Lejian Liao paper:

[Content Extraction via Text Density (CETD)](http://ofey.me/papers/cetd-sigir11.pdf)
A Rust library for extracting main content from web pages using text
density analysis. This is an implementation of the Content Extraction
via Text Density (CETD) algorithm described in the paper by Fei Sun,
Dandan Song and Lejian Liao:

[Content Extraction via Text Density](http://ofey.me/papers/cetd-sigir11.pdf).

## What Problem Does This Solve?

Web pages often contain a lot of peripheral content like navigation menus, advertisements, footers, and sidebars. This makes it challenging to extract just the main content programmatically. This library helps solve this problem by:

- Analyzing the text density patterns in HTML documents
- Identifying content-rich sections versus navigational/peripheral elements
- Extracting the main content while filtering out noise
- Handling various HTML layouts and structures

## Key Features

- Build a density tree representing text distribution in the HTML document
- Calculate composite text density using multiple metrics
- Extract main content blocks based on density patterns
- Support for nested HTML structures
- Efficient processing of large documents
- Error handling for malformed HTML

## Usage

Basic usage example:

```rust
use dom_content_extraction::{DensityTree, get_node_text};

let dtree = DensityTree::from_document(&document); // &scraper::Html
let sorted_nodes = dtree.sorted_nodes();
let node_id = sorted_nodes.last().unwrap().node_id;
let dtree = DensityTree::from_document(&document)?; // Takes a scraper::Html document

println!("{}", get_node_text(node_id, &document));
// Get nodes sorted by text density
let sorted_nodes = dtree.sorted_nodes();
let densest_node = sorted_nodes.last().unwrap();

dtree.calculate_density_sum();
let extracted_content = dtree.extract_content(&document);
// Extract text from the node with highest density
println!("{}", get_node_text(densest_node.node_id, &document)?);

println!("{}", extracted_content);
// For more accurate content extraction:
dtree.calculate_density_sum()?;
let main_content = dtree.extract_content(&document)?;
println!("{}", main_content);
```

## Installation

Add it with:

```bash
@@ -37,6 +37,12 @@ or add to your `Cargo.toml`
dom-content-extraction = "0.3"
```

## Documentation

Read the docs!

[dom-content-extraction documentation](https://docs.rs/dom-content-extraction/latest/dom_content_extraction/)

## Run examples

Check examples.
17 changes: 9 additions & 8 deletions benches/simple.rs
Original file line number Diff line number Diff line change
@@ -37,10 +37,10 @@ fn benchmark_test_1_html_dom_content_extaction(c: &mut Criterion) {
b.iter(|| {
let document = build_dom(black_box(content.as_str()));

let dtree = DensityTree::from_document(&document);
let dtree = DensityTree::from_document(&document).unwrap();
let sorted_nodes = dtree.sorted_nodes();
let node_id = sorted_nodes.last().unwrap().node_id;
assert_eq!(get_node_text(node_id, &document).len(), 200);
assert_eq!(get_node_text(node_id, &document).unwrap().len(), 200);
})
});
}
@@ -56,10 +56,10 @@ fn benchmark_real_file_dom_content_extraction(c: &mut Criterion) {
b.iter(|| {
let document = build_dom(black_box(content.as_str()));

let dtree = DensityTree::from_document(&document);
let dtree = DensityTree::from_document(&document).unwrap();
let sorted_nodes = dtree.sorted_nodes();
let node_id = sorted_nodes.last().unwrap().node_id;
assert_eq!(get_node_text(node_id, &document).len() > 0, true);
assert!(!get_node_text(node_id, &document).unwrap().is_empty());
})
});
}
@@ -74,7 +74,7 @@ fn benchmark_real_file_density_tree_calculation(c: &mut Criterion) {

c.bench_function("real_file_density_tree_calculation", |b| {
b.iter(|| {
let dtree = DensityTree::from_document(black_box(&document));
let dtree = DensityTree::from_document(black_box(&document)).unwrap();
assert_eq!(dtree.tree.values().len(), 893);
})
});
@@ -90,7 +90,7 @@ fn benchmark_real_file_density_tree_calculation_and_sort(c: &mut Criterion) {

c.bench_function("real_file_density_tree_sort_nodes", |b| {
b.iter(|| {
let dtree = DensityTree::from_document(black_box(&document));
let dtree = DensityTree::from_document(black_box(&document)).unwrap();
let sorted_nodes = dtree.sorted_nodes();
let last_node = sorted_nodes.last().unwrap();
assert_eq!(last_node.density, 104.79147);
@@ -106,14 +106,15 @@ fn benchmark_node_text_extraction(c: &mut Criterion) {
.unwrap();
let document = build_dom(content.as_str());

let dtree = DensityTree::from_document(&document);
let dtree = DensityTree::from_document(&document).unwrap();
let sorted_nodes = dtree.sorted_nodes();
let last_node_id = sorted_nodes.last().unwrap().node_id;

c.bench_function("real_file_density_tree_sort_and_text_extraction", |b| {
b.iter(|| {
let node_text =
get_node_text(black_box(last_node_id), black_box(&document));
get_node_text(black_box(last_node_id), black_box(&document))
.unwrap();
assert_eq!(node_text.len(), 3065);
})
});
6 changes: 3 additions & 3 deletions examples/ce_score.rs
Original file line number Diff line number Diff line change
@@ -30,9 +30,9 @@ fn extract_content_from_html(file_path: &Path) -> Result<String> {
.with_context(|| format!("Failed to read file: {:?}", file_path))?;

let document = Html::parse_document(&content);
let mut dtree = DensityTree::from_document(&document);
dtree.calculate_density_sum();
let extracted_content = dtree.extract_content(&document);
let mut dtree = DensityTree::from_document(&document).unwrap();
let _ = dtree.calculate_density_sum();
let extracted_content = dtree.extract_content(&document).unwrap();

Ok(normalize_text(&extracted_content))
}
6 changes: 3 additions & 3 deletions examples/check.rs
Original file line number Diff line number Diff line change
@@ -29,8 +29,8 @@ fn process_lorem_ipsum() {
let html_content =
fs::read_to_string("html/lorem_ipsum.html").expect("Unable to read file");
let document = Html::parse_document(&html_content);
let mut dtree = DensityTree::from_document(&document);
dtree.calculate_density_sum();
let extracted_content = dtree.extract_content(&document);
let mut dtree = DensityTree::from_document(&document).unwrap();
let _ = dtree.calculate_density_sum();
let extracted_content = dtree.extract_content(&document).unwrap();
println!("Extracted content:\n{}", extracted_content);
}
2 changes: 2 additions & 0 deletions notes.org
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#+title: Notes

* Microtasks
** add custom errors, remove unwraps
** forbid unwrap in linting rules
** TODO coverage should be 100%
** TODO integrate tarpaulin or gcov into the github ci pipeline
** TODO cargo publish workflow
228 changes: 190 additions & 38 deletions src/lib.rs

Large diffs are not rendered by default.

0 comments on commit dbb412e

Please sign in to comment.