diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
new file mode 100644
index 000000000..2dd608b35
--- /dev/null
+++ b/.github/workflows/bench.yml
@@ -0,0 +1,39 @@
+name: Benches
+on:
+  push:
+    branches: [master]
+  pull_request:
+    branches: [master]
+
+env:
+  CARGO_TERM_COLOR: always
+  RUST_LOG: "off"
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v3
+    - uses: actions/cache@v2
+      id: cache
+      with:
+        path: |
+          ~/.cargo/bin/
+          ~/.cargo/registry/index/
+          ~/.cargo/registry/cache/
+          ~/.cargo/git/db/
+          target/
+        key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
+    - name: Install Benchmark Dependencies
+      run: |
+        # install node-crawler
+        npm install -g crawler
+        # install go and deps
+        go mod init example.com/spider
+        go get github.com/gocolly/colly/v2
+        cat go.mod
+        go mod tidy
+        # install the local cli latest
+        cd ./spider_cli && cargo install --path . && cd ../
+    - name: Run Benchmarks
+      run: cargo bench
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index b16f40241..000ca0cbb 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -13,21 +13,17 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v3
+    - uses: actions/cache@v2
+      id: cache
+      with:
+        path: |
+          ~/.cargo/bin/
+          ~/.cargo/registry/index/
+          ~/.cargo/registry/cache/
+          ~/.cargo/git/db/
+          target/
+        key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
     - name: Build
       run: cargo build --verbose
     - name: Run tests
       run: cargo test --verbose --all-features
-    - name: Install Benchmark Dependencies
-      run: |
-        sudo apt update && sudo apt-get install build-essential
-        # install node-crawler
-        npm install crawler -g
-        # install the local cli latest
-        cd ./spider_cli && cargo install --path . && cd ../
-        # install go and deps
-        echo "module github.com/x/y" >> go.mod
-        echo "go 1.14" >> go.mod
-        echo "require github.com/gocolly/colly/v2 v2.1.0" >> go.mod
-        go mod tidy
-    - name: Run Benchmarks
-      run: cargo bench
diff --git a/benches/.gitignore b/benches/.gitignore
index 2ebab1541..ba21c5416 100644
--- a/benches/.gitignore
+++ b/benches/.gitignore
@@ -4,4 +4,5 @@ node_modules
 node-crawler.js
 go-crolly.go
 go.mod
-go.sum
\ No newline at end of file
+go.sum
+output.txt
\ No newline at end of file
diff --git a/benches/Cargo.toml b/benches/Cargo.toml
index 46cf8c968..8a1e2d5c3 100644
--- a/benches/Cargo.toml
+++ b/benches/Cargo.toml
@@ -5,7 +5,7 @@ publish = false
 edition = "2021"
 
 [dependencies]
-spider = { version = "1.5.5", path = "../spider" }
+spider = { version = "1.6.0", path = "../spider" }
 criterion = "0.3"
 
 [[bench]]
diff --git a/benches/crawl.rs b/benches/crawl.rs
index 6d172cb58..adc725fec 100644
--- a/benches/crawl.rs
+++ b/benches/crawl.rs
@@ -10,7 +10,7 @@ pub fn bench_speed(c: &mut Criterion) {
     let go_crawl_script = go_crolly::gen_crawl();
     let mut group = c.benchmark_group("crawl-speed");
 
-    group.sample_size(10).measurement_time(Duration::new(85, 0) + Duration::from_millis(500));
+    group.sample_size(10).measurement_time(Duration::new(180, 0) + Duration::from_millis(500));
     group.bench_function("Rust[spider]: with crawl 10 times", |b| b.iter(||Command::new("spider")
         .args(["--delay", "0", "--domain", "https://rsseau.fr", "crawl"])
         .output()
diff --git a/benches/go_crolly.rs b/benches/go_crolly.rs
index f6d592af9..42f1b0694 100644
--- a/benches/go_crolly.rs
+++ b/benches/go_crolly.rs
@@ -3,7 +3,7 @@ use std::io::{BufWriter, Write};
 
 pub fn crawl_stub() -> String {
     r#"
-	package main
+	package spider
 
 	import (
 		"fmt"
diff --git a/examples/Cargo.toml b/examples/Cargo.toml
index 230a9dded..5a4bdc417 100644
--- a/examples/Cargo.toml
+++ b/examples/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_examples"
-version = "1.5.5"
+version = "1.6.0"
 authors = ["madeindjs ", "j-mendez "]
 description = "Multithreaded web crawler written in Rust."
 repository = "https://github.com/madeindjs/spider"
@@ -15,7 +15,7 @@ publish = false
 maintenance = { status = "as-is" }
 
 [dependencies.spider]
-version = "1.5.5"
+version = "1.6.0"
 path = "../spider"
 default-features = false
diff --git a/spider/Cargo.toml b/spider/Cargo.toml
index 06163b763..78d6469ae 100644
--- a/spider/Cargo.toml
+++ b/spider/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider"
-version = "1.5.5"
+version = "1.6.0"
 authors = ["madeindjs ", "j-mendez "]
 description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/madeindjs/spider" @@ -15,7 +15,7 @@ edition = "2018" maintenance = { status = "as-is" } [dependencies] -reqwest = { version = "0.11.10" } +reqwest = { version = "0.11.10", features = ["blocking"] } scraper = "0.12" robotparser-fork = "0.10.5" url = "2.2" diff --git a/spider/src/page.rs b/spider/src/page.rs index b33790b57..7e5b59712 100644 --- a/spider/src/page.rs +++ b/spider/src/page.rs @@ -91,7 +91,7 @@ fn parse_links() { .unwrap(); let link_result = "https://choosealicense.com/"; - let html = fetch_page_html(&link_result, &client).unwrap(); + let html = fetch_page_html(&link_result, &client); let page: Page = Page::new(&link_result, &html); assert!( @@ -111,7 +111,7 @@ fn test_abs_path() { .build() .unwrap(); let link_result = "https://choosealicense.com/"; - let html = fetch_page_html(&link_result, &client).unwrap(); + let html = fetch_page_html(&link_result, &client); let page: Page = Page::new(&link_result, &html); assert_eq!( diff --git a/spider/src/utils.rs b/spider/src/utils.rs index 893f2bdf1..e0f322431 100644 --- a/spider/src/utils.rs +++ b/spider/src/utils.rs @@ -1,8 +1,18 @@ -pub use crate::reqwest::{Client, Error}; +pub use crate::reqwest::blocking::{Client}; +use reqwest::StatusCode; -#[tokio::main] -pub async fn fetch_page_html(url: &str, client: &Client) -> Result { - let body = client.get(url).send().await?.text().await?; +pub fn fetch_page_html(url: &str, client: &Client) -> String { + let mut body = String::new(); - Ok(body) + // silence errors for top level logging + match client.get(url).send() { + Ok(res) if res.status() == StatusCode::OK => match res.text() { + Ok(text) => body = text, + Err(_) => {}, + }, + Ok(_) => (), + Err(_) => {} + } + + body } diff --git a/spider/src/website.rs b/spider/src/website.rs index 0fe5d3fb8..6a0087f95 100644 --- a/spider/src/website.rs +++ b/spider/src/website.rs @@ -39,8 +39,6 @@ pub struct Website<'a> { pub on_link_find_callback: fn(String) -> String, /// Robot.txt parser holder robot_file_parser: RobotFileParser<'a>, - // fetch client - client: Client, // ignore holding page in memory, pages will always be empty pub page_store_ignore: bool, } @@ -50,14 +48,13 @@ impl<'a> Website<'a> { pub fn new(domain: &str) -> Self { Self { configuration: Configuration::new(), - domain: domain.to_string(), - links: HashSet::from([format!("{}/", domain)]), links_visited: HashSet::new(), pages: Vec::new(), robot_file_parser: RobotFileParser::new(&format!("{}/robots.txt", domain)), // TODO: lazy establish + links: HashSet::from([format!("{}/", domain)]), on_link_find_callback: |s| s, - client: Client::new(), - page_store_ignore: false + page_store_ignore: false, + domain: domain.to_owned(), } } @@ -85,14 +82,13 @@ impl<'a> Website<'a> { } /// configure http client - pub fn configure_http_client(&mut self, user_agent: Option) { + fn configure_http_client(&mut self, user_agent: Option) -> Client { let mut headers = header::HeaderMap::new(); headers.insert(CONNECTION, header::HeaderValue::from_static("keep-alive")); - self.client = Client::builder() + Client::builder() .default_headers(headers) .user_agent(user_agent.unwrap_or(self.configuration.user_agent.to_string())) - .pool_max_idle_per_host(0) .build() .expect("Failed building client.") } @@ -108,7 +104,7 @@ impl<'a> Website<'a> { /// Start to crawl website pub fn crawl(&mut self) { self.configure_robots_parser(); - self.configure_http_client(None); + let client = self.configure_http_client(None); let delay = self.get_delay(); let on_link_find_callback = 
self.on_link_find_callback; let pool = self.create_thread_pool(); @@ -122,16 +118,15 @@ impl<'a> Website<'a> { continue; } self.log(&format!("- fetch {}", &link)); - self.links_visited.insert(link.to_string()); - - let thread_link = link.to_string(); + self.links_visited.insert(String::from(link)); + let link = link.clone(); let tx = tx.clone(); - let cx = self.client.clone(); + let cx = client.clone(); pool.spawn(move || { - let link_result = on_link_find_callback(thread_link); - let html = fetch_page_html(&link_result, &cx).unwrap_or_default(); + let link_result = on_link_find_callback(link); + let html = fetch_page_html(&link_result, &cx); let page = Page::new(&link_result, &html); let links = page.links(); @@ -156,11 +151,9 @@ impl<'a> Website<'a> { if self.configuration.delay > 0 { thread::sleep(delay); } - }); self.links = new_links; - } } diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml index a0241a858..ac547ace7 100644 --- a/spider_cli/Cargo.toml +++ b/spider_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_cli" -version = "1.5.5" +version = "1.6.0" authors = ["madeindjs ", "j-mendez "] description = "Multithreaded web crawler written in Rust." repository = "https://github.com/madeindjs/spider" @@ -23,7 +23,7 @@ quote = "1.0.18" failure_derive = "0.1.8" [dependencies.spider] -version = "1.5.5" +version = "1.6.0" path = "../spider" default-features = false
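For context, a minimal sketch of how a consumer drives the 1.6.0 crate after this refactor, using only the public items visible in the diff (`Website::new`, the public `on_link_find_callback` field, and `crawl`); the `spider::website` module path is assumed, and the target domain is just the one reused from the benchmark setup.

```rust
use spider::website::Website;

fn main() {
    // Website::new seeds `links` with "{domain}/" (see the constructor hunk above).
    let mut website = Website::new("https://rsseau.fr");

    // `on_link_find_callback` is a public fn(String) -> String hook; a
    // non-capturing closure coerces to the fn pointer and can log each link.
    website.on_link_find_callback = |link| {
        println!("found {}", link);
        link
    };

    // crawl() now builds its blocking reqwest Client internally via the
    // private configure_http_client and hands a clone to each worker thread.
    website.crawl();
}
```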