Skip to content

Commit

Permalink
chore(page): add initial start domain tracking
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Sep 23, 2024
1 parent a231d6c commit 35e3208
Show file tree
Hide file tree
Showing 12 changed files with 132 additions and 59 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@

## Unreleased

## v2

1. feat(transform): HTML transformation crate with spider_transformations
1. feat(css_scraping): CSS scraping with the spider_utils
1. chore(chrome): stabilize concurrent screenshot handling

## v1.98.0

1. feat(whitelist): whitelist routes to only crawl.
Expand Down
12 changes: 6 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion examples/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_examples"
version = "2.6.12"
version = "2.6.14"
authors = [
"j-mendez <[email protected]>",
]
Expand Down
2 changes: 1 addition & 1 deletion spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.6.12"
version = "2.6.14"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
5 changes: 4 additions & 1 deletion spider/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,10 @@ pub type Client = reqwest_middleware::ClientWithMiddleware;
pub type ClientBuilder = reqwest_middleware::ClientBuilder;

/// The selectors type. The values are held to make sure the relative domain can be crawled upon base redirects.
pub(crate) type RelativeSelectors = (
pub type RelativeSelectors = (
// base domain
compact_str::CompactString,
smallvec::SmallVec<[compact_str::CompactString; 2]>,
// redirected domain
compact_str::CompactString,
);
90 changes: 53 additions & 37 deletions spider/src/page.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ use crate::RelativeSelectors;
use bytes::Bytes;
use hashbrown::HashSet;
use reqwest::StatusCode;
use smallvec::SmallVec;

#[cfg(all(feature = "time", not(feature = "decentralized")))]
use std::time::Duration;
Expand Down Expand Up @@ -209,55 +208,55 @@ pub fn convert_abs_path(base: &Url, href: &str) -> Url {
}
}

/// Validation to match a domain to the parent host and the top level redirect for the crawl.
/// `parent_host` is the current (possibly redirected) crawl host and `base_host` is the
/// input start domain, so links remain crawlable across a base redirect.
///
/// Returns `false` when no host name is present.
pub fn parent_host_match(
    host_name: Option<&str>,
    base_domain: &str,
    parent_host: &CompactString,
    base_host: &CompactString,
) -> bool {
    match host_name {
        Some(host) => {
            if base_domain.is_empty() {
                // exact-host crawl: accept either the redirected host or the start host.
                parent_host.eq(&host) || base_host.eq(&host)
            } else {
                // tld/subdomain crawl: suffix match admits subdomains of either host.
                host.ends_with(parent_host.as_str()) || host.ends_with(base_host.as_str())
            }
        }
        _ => false,
    }
}

/// html selector for valid web pages for domain.
pub fn get_page_selectors(
url: &str,
subdomains: bool,
tld: bool,
) -> Option<(CompactString, SmallVec<[CompactString; 2]>)> {
match Url::parse(url) {
Ok(host) => {
let host_name = CompactString::from(
match convert_abs_path(&host, Default::default()).host_str() {
Some(host) => host.to_ascii_lowercase(),
_ => Default::default(),
},
);
let scheme = host.scheme();
pub fn get_page_selectors_base(u: &Url, subdomains: bool, tld: bool) -> Option<RelativeSelectors> {
let host_name =
CompactString::from(match convert_abs_path(&u, Default::default()).host_str() {
Some(host) => host.to_ascii_lowercase(),
_ => Default::default(),
});
let scheme = u.scheme();

Some(if tld || subdomains {
let dname = domain_name(&host);
let scheme = host.scheme();
Some(if tld || subdomains {
let dname = domain_name(&u);

(
dname.into(),
smallvec::SmallVec::from([host_name, CompactString::from(scheme)]),
)
} else {
(
CompactString::default(),
smallvec::SmallVec::from([host_name, CompactString::from(scheme)]),
)
})
}
(
dname.into(),
smallvec::SmallVec::from([host_name, CompactString::from(scheme)]),
CompactString::default(),
)
} else {
(
CompactString::default(),
smallvec::SmallVec::from([host_name, CompactString::from(scheme)]),
CompactString::default(),
)
})
}

/// Html selectors for valid web pages for a domain, parsed from a string url.
///
/// Returns `None` when `url` fails to parse.
pub fn get_page_selectors(url: &str, subdomains: bool, tld: bool) -> Option<RelativeSelectors> {
    // Result -> Option conversion expressed with combinators instead of a match.
    Url::parse(url)
        .ok()
        .and_then(|host| get_page_selectors_base(&host, subdomains, tld))
}
Expand Down Expand Up @@ -741,11 +740,13 @@ impl Page {
base_domain: &CompactString,
parent_host: &CompactString,
parent_host_scheme: &CompactString,
base_input_domain: &CompactString,
) {
match self.abs_path(href) {
Some(mut abs) => {
let host_name = abs.host_str();
let mut can_process = parent_host_match(host_name, base_domain, parent_host);
let mut can_process =
parent_host_match(host_name, base_domain, parent_host, base_input_domain);
let mut external_domain = false;

if !can_process && host_name.is_some() && !self.external_domains_caseless.is_empty()
Expand Down Expand Up @@ -807,6 +808,7 @@ impl Page {

let parent_host = &selectors.1[0];
let parent_host_scheme = &selectors.1[1];
let base_input_domain = &selectors.2;

let mut is_link_tag = false;

Expand All @@ -832,6 +834,7 @@ impl Page {
&selectors.0,
parent_host,
parent_host_scheme,
base_input_domain,
);
}
_ => (),
Expand Down Expand Up @@ -877,6 +880,7 @@ impl Page {

let parent_host = &selectors.1[0];
let parent_host_scheme = &selectors.1[1];
let base_input_domain = &selectors.2;

while let Some(node) = stream.next().await {
if let Some(element) = node.as_element() {
Expand All @@ -891,6 +895,7 @@ impl Page {
&selectors.0,
parent_host,
parent_host_scheme,
base_input_domain,
);
}
_ => (),
Expand Down Expand Up @@ -939,6 +944,8 @@ impl Page {
} else {
let base_domain = &selectors.0;
let parent_frags = &selectors.1; // todo: allow mix match tpt
let base_input_domain = &selectors.2;

let parent_host = &parent_frags[0];
let parent_host_scheme = &parent_frags[1];

Expand Down Expand Up @@ -1152,8 +1159,12 @@ impl Page {
Some(href) => match self.abs_path(href) {
Some(mut abs) => {
let host_name = abs.host_str();
let mut can_process =
parent_host_match(host_name, &base_domain, parent_host);
let mut can_process = parent_host_match(
host_name,
&base_domain,
parent_host,
base_input_domain,
);

if can_process {
if abs.scheme() != parent_host_scheme.as_str() {
Expand Down Expand Up @@ -1229,6 +1240,7 @@ impl Page {
let mut stream = tokio_stream::iter(html.tree);

let base_domain = &selectors.0;
let base_input_domain = &selectors.2;
let parent_frags = &selectors.1; // todo: allow mix match tpt
let parent_host = &parent_frags[0];
let parent_host_scheme = &parent_frags[1];
Expand All @@ -1249,8 +1261,12 @@ impl Page {
Some(href) => match self.abs_path(href) {
Some(mut abs) => {
let host_name = abs.host_str();
let mut can_process =
parent_host_match(host_name, base_domain, parent_host);
let mut can_process = parent_host_match(
host_name,
base_domain,
parent_host,
base_input_domain,
);

let mut external_domain = false;

Expand Down
Loading

0 comments on commit 35e3208

Please sign in to comment.