Skip to content

Commit

Permalink
massive update, errors handling, remove unwraps
Browse files Browse the repository at this point in the history
oiwn committed Nov 13, 2024
1 parent e8a1ef5 commit dbb412e
Showing 11 changed files with 284 additions and 69 deletions.
23 changes: 23 additions & 0 deletions .github/workflows/coverage.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
name: coverage

on: [push]
jobs:
test:
name: coverage
runs-on: ubuntu-latest
container:
image: xd009642/tarpaulin:develop-nightly
options: --security-opt seccomp=unconfined
steps:
- name: Checkout repository
uses: actions/checkout@v2

- name: Generate code coverage
run: |
cargo +nightly tarpaulin --verbose --all-features --workspace --timeout 120 --out xml
- name: Upload to codecov.io
uses: codecov/codecov-action@v2
with:
# token: ${{secrets.CODECOV_TOKEN}} # not required for public repos
fail_ci_if_error: true
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -8,3 +8,4 @@
/tmp
/data
*.profraw
all_code.txt
3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "dom-content-extraction"
version = "0.3.3"
version = "0.3.4"

description = "Rust implementation of Content extraction via text density paper"
license = "MPL-2.0"
@@ -32,6 +32,7 @@ panic = "abort"
[dependencies]
ego-tree = "0.9"
scraper = "0.21"
thiserror = "2.0.3"

[dev-dependencies]
criterion = "0.5"
2 changes: 2 additions & 0 deletions Justfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
coverage:
cargo tarpaulin
7 changes: 0 additions & 7 deletions Makefile

This file was deleted.

58 changes: 49 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
@@ -4,27 +4,61 @@
<a href="https://crates.io/crates/dom-content-extraction">
<img src="https://img.shields.io/crates/dr/dom-content-extraction" alt="Crates.io">
</a>
<a href="https://codecov.io/github/oiwn/dom-content-extraction" >
<img src="https://codecov.io/github/oiwn/dom-content-extraction/graph/badge.svg?token=6Y7IYX29OP"/>
</a>
</div>

Rust implementation of Fei Sun, Dandan Song and Lejian Liao paper:

[Content Extraction via Text Density (CETD)](http://ofey.me/papers/cetd-sigir11.pdf)
A Rust library for extracting main content from web pages using text
density analysis. This is an implementation of the Content Extraction
via Text Density (CETD) algorithm described in the paper by Fei Sun,
Dandan Song and Lejian Liao:

[Content Extraction via Text Density](http://ofey.me/papers/cetd-sigir11.pdf).

## What Problem Does This Solve?

Web pages often contain a lot of peripheral content like navigation menus, advertisements, footers, and sidebars. This makes it challenging to extract just the main content programmatically. This library helps solve this problem by:

- Analyzing the text density patterns in HTML documents
- Identifying content-rich sections versus navigational/peripheral elements
- Extracting the main content while filtering out noise
- Handling various HTML layouts and structures

## Key Features

- Build a density tree representing text distribution in the HTML document
- Calculate composite text density using multiple metrics
- Extract main content blocks based on density patterns
- Support for nested HTML structures
- Efficient processing of large documents
- Error handling for malformed HTML

## Usage

Basic usage example:

```rust
use dom_content_extraction::{DensityTree, get_node_text};

let dtree = DensityTree::from_document(&document); // &scraper::Html
let sorted_nodes = dtree.sorted_nodes();
let node_id = sorted_nodes.last().unwrap().node_id;
let dtree = DensityTree::from_document(&document)?; // Takes a scraper::Html document

println!("{}", get_node_text(node_id, &document));
// Get nodes sorted by text density
let sorted_nodes = dtree.sorted_nodes();
let densest_node = sorted_nodes.last().unwrap();

dtree.calculate_density_sum();
let extracted_content = dtree.extract_content(&document);
// Extract text from the node with highest density
println!("{}", get_node_text(densest_node.node_id, &document)?);

println!("{}", extracted_content);
// For more accurate content extraction:
dtree.calculate_density_sum()?;
let main_content = dtree.extract_content(&document)?;
println!("{}", main_content);
```

## Installation

Add it with:

```bash
@@ -37,6 +37,12 @@ or add to your `Cargo.toml`
dom-content-extraction = "0.3"
```

## Documentation

Read the docs!

[dom-content-extraction documentation](https://docs.rs/dom-content-extraction/latest/dom_content_extraction/)

## Run examples

Check examples.
17 changes: 9 additions & 8 deletions benches/simple.rs
Original file line number Diff line number Diff line change
@@ -37,10 +37,10 @@ fn benchmark_test_1_html_dom_content_extaction(c: &mut Criterion) {
b.iter(|| {
let document = build_dom(black_box(content.as_str()));

let dtree = DensityTree::from_document(&document);
let dtree = DensityTree::from_document(&document).unwrap();
let sorted_nodes = dtree.sorted_nodes();
let node_id = sorted_nodes.last().unwrap().node_id;
assert_eq!(get_node_text(node_id, &document).len(), 200);
assert_eq!(get_node_text(node_id, &document).unwrap().len(), 200);
})
});
}
@@ -56,10 +56,10 @@ fn benchmark_real_file_dom_content_extraction(c: &mut Criterion) {
b.iter(|| {
let document = build_dom(black_box(content.as_str()));

let dtree = DensityTree::from_document(&document);
let dtree = DensityTree::from_document(&document).unwrap();
let sorted_nodes = dtree.sorted_nodes();
let node_id = sorted_nodes.last().unwrap().node_id;
assert_eq!(get_node_text(node_id, &document).len() > 0, true);
assert!(!get_node_text(node_id, &document).unwrap().is_empty());
})
});
}
@@ -74,7 +74,7 @@ fn benchmark_real_file_density_tree_calculation(c: &mut Criterion) {

c.bench_function("real_file_density_tree_calculation", |b| {
b.iter(|| {
let dtree = DensityTree::from_document(black_box(&document));
let dtree = DensityTree::from_document(black_box(&document)).unwrap();
assert_eq!(dtree.tree.values().len(), 893);
})
});
@@ -90,7 +90,7 @@ fn benchmark_real_file_density_tree_calculation_and_sort(c: &mut Criterion) {

c.bench_function("real_file_density_tree_sort_nodes", |b| {
b.iter(|| {
let dtree = DensityTree::from_document(black_box(&document));
let dtree = DensityTree::from_document(black_box(&document)).unwrap();
let sorted_nodes = dtree.sorted_nodes();
let last_node = sorted_nodes.last().unwrap();
assert_eq!(last_node.density, 104.79147);
@@ -106,14 +106,15 @@ fn benchmark_node_text_extraction(c: &mut Criterion) {
.unwrap();
let document = build_dom(content.as_str());

let dtree = DensityTree::from_document(&document);
let dtree = DensityTree::from_document(&document).unwrap();
let sorted_nodes = dtree.sorted_nodes();
let last_node_id = sorted_nodes.last().unwrap().node_id;

c.bench_function("real_file_density_tree_sort_and_text_extraction", |b| {
b.iter(|| {
let node_text =
get_node_text(black_box(last_node_id), black_box(&document));
get_node_text(black_box(last_node_id), black_box(&document))
.unwrap();
assert_eq!(node_text.len(), 3065);
})
});
6 changes: 3 additions & 3 deletions examples/ce_score.rs
Original file line number Diff line number Diff line change
@@ -30,9 +30,9 @@ fn extract_content_from_html(file_path: &Path) -> Result<String> {
.with_context(|| format!("Failed to read file: {:?}", file_path))?;

let document = Html::parse_document(&content);
let mut dtree = DensityTree::from_document(&document);
dtree.calculate_density_sum();
let extracted_content = dtree.extract_content(&document);
let mut dtree = DensityTree::from_document(&document).unwrap();
let _ = dtree.calculate_density_sum();
let extracted_content = dtree.extract_content(&document).unwrap();

Ok(normalize_text(&extracted_content))
}
6 changes: 3 additions & 3 deletions examples/check.rs
Original file line number Diff line number Diff line change
@@ -29,8 +29,8 @@ fn process_lorem_ipsum() {
let html_content =
fs::read_to_string("html/lorem_ipsum.html").expect("Unable to read file");
let document = Html::parse_document(&html_content);
let mut dtree = DensityTree::from_document(&document);
dtree.calculate_density_sum();
let extracted_content = dtree.extract_content(&document);
let mut dtree = DensityTree::from_document(&document).unwrap();
let _ = dtree.calculate_density_sum();
let extracted_content = dtree.extract_content(&document).unwrap();
println!("Extracted content:\n{}", extracted_content);
}
2 changes: 2 additions & 0 deletions notes.org
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#+title: Notes

* Microtasks
** add custom errors, remove unwraps
** forbid unwrap in linting rules
** TODO coverage should be 100%
** TODO integrate tarpaulin or gcov into the github ci pipeline
** TODO cargo publish workflow
228 changes: 190 additions & 38 deletions src/lib.rs

Large diffs are not rendered by default.

0 comments on commit dbb412e

Please sign in to comment.