Skip to content

Commit

Permalink
Merge pull request #10 from JadeCara/jade/webcrawl_wikipedia
Browse files Browse the repository at this point in the history
webcrawler
  • Loading branch information
JadeCara authored Jan 3, 2025
2 parents 1465985 + 4eadd28 commit 21d5c46
Show file tree
Hide file tree
Showing 3 changed files with 124 additions and 0 deletions.
8 changes: 8 additions & 0 deletions module2/webcrawl-rayon/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Manifest for the webcrawl-rayon example binary.
[package]
name = "webcrawl-rayon"
version = "0.1.0"
edition = "2021"

[dependencies]
# Client used by src/main.rs to look up page titles and content.
wikipedia = "0.3.4"
# Supplies the parallel iterators (`par_iter`) used by src/main.rs.
rayon = "1.7.0"
38 changes: 38 additions & 0 deletions module2/webcrawl-rayon/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# bash is required: the `bump` recipe uses `read -p`, which is a bash builtin option.
SHELL := /bin/bash
# None of these targets produce a file of the same name. Declaring them phony
# keeps make from skipping one when a file or directory called e.g. `build`,
# `test` or `release` happens to exist in the project directory.
.PHONY: help clean build run test lint format release all bump

# List every target that carries a trailing `## description` comment.
# grep selects the `target: ... ## text` lines from the makefile(s), sort orders
# them, and awk prints the target name in cyan (padded to 20 columns) followed
# by the description text.
help:
@grep -E '^[a-zA-Z0-9_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}'

# Thin wrappers around the corresponding cargo subcommands.
# The text after `##` on each target line is what `make help` displays.
clean: ## Clean the project using cargo
cargo clean

build: ## Build the project using cargo
cargo build

run: ## Run the project using cargo
cargo run

test: ## Run the tests using cargo
cargo test

# Both targets first ask rustup to add the component they need (clippy /
# rustfmt). The leading `@` keeps make from echoing that command and
# `2> /dev/null` discards whatever rustup writes to stderr.
lint: ## Run the linter using cargo
@rustup component add clippy 2> /dev/null
cargo clippy

format: ## Format the code using cargo
@rustup component add rustfmt 2> /dev/null
cargo fmt

# `release` and `all` carry a `## description` like every other target so that
# they show up in the `make help` listing.
release: ## Build the project in release mode using cargo
	cargo build --release

all: format lint test run ## Format, lint, test and run the project

# Prompt for a new version and write it into the `version = ...` line of Cargo.toml.
#
# - An empty answer aborts instead of writing `version = ""` into the manifest.
# - `sed -E -i.bak` (suffix attached directly to `-i`) is accepted by both GNU
#   and BSD/macOS sed; a detached `-i -E` makes BSD sed treat `-E` as the backup
#   suffix, while GNU sed creates no backup file at all. The `.bak` copy is
#   removed only after sed succeeded, and `rm -f` never fails on a missing file.
# - The entered version is used directly as the replacement text, so it must
#   not contain `/` or `&` (both are special in a sed replacement).
bump: ## Bump the version of the project
	@echo "Current version is $(shell cargo pkgid | cut -d# -f2)"
	@read -r -p "Enter the new version: " version; \
	if [ -z "$$version" ]; then \
		echo "No version entered; Cargo.toml left unchanged." >&2; \
		exit 1; \
	fi; \
	sed -E -i.bak "s/^version = .*/version = \"$$version\"/" Cargo.toml && rm -f Cargo.toml.bak
	@echo "Version bumped to $$(cargo pkgid | cut -d# -f2)"
78 changes: 78 additions & 0 deletions module2/webcrawl-rayon/src/main.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
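/*
 * Fetches a fixed list of Wikipedia pages with the `wikipedia` crate,
 * using rayon parallel iterators so the pages are handled concurrently.
 * For each page it prints the title, the first sentence and the word count.
 * It then reports timing metrics (total time, average time per page),
 * the number of pages and the number of rayon threads.
 * Serves as an example of third-party crate usage and concurrency in Rust.
 */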

use rayon::prelude::*;
use wikipedia::http::default::Client;
use wikipedia::Page;
use wikipedia::Wikipedia;

/// Owned title and text content extracted from one Wikipedia page.
///
/// Derives `Debug`, `Clone`, `PartialEq` and `Eq` so values can be logged,
/// copied and compared.
#[derive(Debug, Clone, PartialEq, Eq)]
struct ProcessedPage {
    /// Page title, as returned by `Page::get_title`.
    title: String,
    /// Page content, as returned by `Page::get_content`.
    data: String,
}

/// Titles of the Wikipedia pages that `main` looks up.
///
/// The array length is part of the type, so it has to be updated whenever an
/// entry is added or removed.
const PAGES: [&str; 9] = [
"Giannis Antetokounmpo",
"James Harden",
"Russell Westbrook",
"Stephen Curry",
"Kevin Durant",
"LeBron James",
"Kobe Bryant",
"Michael Jordan",
"Shaquille O'Neal",
];

/// Reads the title and content of `page` into an owned [`ProcessedPage`].
///
/// The title is requested first, then the content.
///
/// # Panics
///
/// Panics if either lookup returns an error. The panic message names the
/// lookup that failed and includes the underlying error's `Debug` output.
fn process_page(page: &Page<Client>) -> ProcessedPage {
    let title = page
        .get_title()
        .expect("failed to get the Wikipedia page title");
    let data = page
        .get_content()
        .expect("failed to get the Wikipedia page content");
    ProcessedPage { title, data }
}

//times how long it takes to process the pages and total time
fn main() {
//start timer
let start = std::time::Instant::now();
let wikipedia = Wikipedia::<Client>::default();
let pages: Vec<_> = PAGES
.par_iter() //parallel iterator
.map(|&p| wikipedia.page_from_title(p.to_string()))
.collect();

let processed_pages: Vec<ProcessedPage> = pages.par_iter().map(process_page).collect();
for page in processed_pages {
//time how long it takes to process each page
let start_page = std::time::Instant::now();

println!("Title: {}", page.title.as_str());
//grab first sentence of the page
let first_sentence = page.data.split('.').next().unwrap();
println!("First sentence: {}", first_sentence);
//count the number of words in the page
let word_count = page.data.split_whitespace().count();
println!("Word count: {}", word_count);
//prints time it took to process each page
println!("Page time: {:?}", start_page.elapsed());
}
//descriptive statistics of: total time, average time per page, and total number of pages, as well as the number of threads used
println!("Total time: {:?}", start.elapsed());
println!(
"Average time per page: {:?}",
start.elapsed() / PAGES.len() as u32
);
println!("Total number of pages: {}", PAGES.len());
println!("Number of threads: {}", rayon::current_num_threads());
}

0 comments on commit 21d5c46

Please sign in to comment.