From 4eadd287bf353c311cf21f0222e3e07e0d212280 Mon Sep 17 00:00:00 2001
From: Jade Wibbels
Date: Fri, 3 Jan 2025 15:25:29 -0700
Subject: [PATCH] webcrawler

---
 module2/webcrawl-rayon/Cargo.toml  |  8 +++
 module2/webcrawl-rayon/Makefile    | 38 +++++++++++++++
 module2/webcrawl-rayon/src/main.rs | 78 ++++++++++++++++++++++++++++++
 3 files changed, 124 insertions(+)
 create mode 100644 module2/webcrawl-rayon/Cargo.toml
 create mode 100644 module2/webcrawl-rayon/Makefile
 create mode 100644 module2/webcrawl-rayon/src/main.rs

diff --git a/module2/webcrawl-rayon/Cargo.toml b/module2/webcrawl-rayon/Cargo.toml
new file mode 100644
index 0000000..7052d3d
--- /dev/null
+++ b/module2/webcrawl-rayon/Cargo.toml
@@ -0,0 +1,8 @@
+[package]
+name = "webcrawl-rayon"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+wikipedia = "0.3.4"
+rayon = "1.7.0"
diff --git a/module2/webcrawl-rayon/Makefile b/module2/webcrawl-rayon/Makefile
new file mode 100644
index 0000000..4daa6f8
--- /dev/null
+++ b/module2/webcrawl-rayon/Makefile
@@ -0,0 +1,38 @@
+SHELL := /bin/bash
+.PHONY: help
+
+help:
+	@grep -E '^[a-zA-Z0-9_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}'
+
+clean: ## Clean the project using cargo
+	cargo clean
+
+build: ## Build the project using cargo
+	cargo build
+
+run: ## Run the project using cargo
+	cargo run
+
+test: ## Run the tests using cargo
+	cargo test
+
+lint: ## Run the linter using cargo
+	@rustup component add clippy 2> /dev/null
+	cargo clippy
+
+format: ## Format the code using cargo
+	@rustup component add rustfmt 2> /dev/null
+	cargo fmt
+
+release:
+	cargo build --release
+
+all: format lint test run
+
+bump: ## Bump the version of the project
+	@echo "Current version is $(shell cargo pkgid | cut -d# -f2)"
+	@read -p "Enter the new version: " version; \
+	updated_version=$$(cargo pkgid | cut -d# -f2 | sed "s/$(shell cargo pkgid | cut -d# -f2)/$$version/"); \
+	sed -i -E "s/^version = .*/version = \"$$updated_version\"/" Cargo.toml
+	@echo "Version bumped to $$(cargo pkgid | cut -d# -f2)"
+	rm Cargo.toml-e
\ No newline at end of file
diff --git a/module2/webcrawl-rayon/src/main.rs b/module2/webcrawl-rayon/src/main.rs
new file mode 100644
index 0000000..a7cf1a1
--- /dev/null
+++ b/module2/webcrawl-rayon/src/main.rs
@@ -0,0 +1,78 @@
+/*
+
+* Uses wikipedia crate to fetch pages
+
+* Processes page content
+
+* Collects timing metrics
+
+* Concurrent page processing
+
+* Shows crate usage and concurrency in Rust
+*/
+
+use rayon::prelude::*;
+use wikipedia::http::default::Client;
+use wikipedia::Page;
+use wikipedia::Wikipedia;
+
+struct ProcessedPage {
+    title: String,
+    data: String,
+}
+
+const PAGES: [&str; 9] = [
+    "Giannis Antetokounmpo",
+    "James Harden",
+    "Russell Westbrook",
+    "Stephen Curry",
+    "Kevin Durant",
+    "LeBron James",
+    "Kobe Bryant",
+    "Michael Jordan",
+    "Shaquille O'Neal",
+];
+
+fn process_page(page: &Page<Client>) -> ProcessedPage {
+    let title = page.get_title().unwrap();
+    let content = page.get_content().unwrap();
+    ProcessedPage {
+        title,
+        data: content,
+    }
+}
+
+//times how long it takes to process the pages and total time
+fn main() {
+    //start timer
+    let start = std::time::Instant::now();
+    let wikipedia = Wikipedia::<Client>::default();
+    let pages: Vec<_> = PAGES
+        .par_iter() //parallel iterator
+        .map(|&p| wikipedia.page_from_title(p.to_string()))
+        .collect();
+
+    let processed_pages: Vec<ProcessedPage> = pages.par_iter().map(process_page).collect();
+    for page in processed_pages {
+        //time how long it takes to process each page
+        let start_page = std::time::Instant::now();
+
+        println!("Title: {}", page.title.as_str());
+        //grab first sentence of the page
+        let first_sentence = page.data.split('.').next().unwrap();
+        println!("First sentence: {}", first_sentence);
+        //count the number of words in the page
+        let word_count = page.data.split_whitespace().count();
+        println!("Word count: {}", word_count);
+        //prints time it took to process each page
+        println!("Page time: {:?}", start_page.elapsed());
+    }
+    //descriptive statistics of: total time, average time per page, and total number of pages, as well as the number of threads used
+    println!("Total time: {:?}", start.elapsed());
+    println!(
+        "Average time per page: {:?}",
+        start.elapsed() / PAGES.len() as u32
+    );
+    println!("Total number of pages: {}", PAGES.len());
+    println!("Number of threads: {}", rayon::current_num_threads());
+}