Skip to content

Commit 2d19755

Browse files
authored
Refactoring of robotparser-rs (#20)
* Migrated sites into robotsparser file. * Robots.txt refactoring. * Migrated to new version of url and reqwest.
1 parent cb7df85 commit 2d19755

33 files changed

+1789
-511
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
target
22
Cargo.lock
33
.vscode/
4+
.idea/

Cargo.toml

+14-5
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,25 @@ license = "MIT"
88
name = "robotparser"
99
readme = "README.md"
1010
repository = "https://github.com/messense/robotparser-rs"
11-
version = "0.10.2"
11+
version = "0.11.0"
12+
edition = "2018"
1213

1314
[dependencies]
14-
url = "1"
15+
url = "2"
16+
percent-encoding = "2.1"
1517

1618
[dependencies.reqwest]
17-
version = "0.9"
19+
version = "0.10.1"
20+
optional = true
21+
features = ["blocking"]
22+
23+
[dependencies.futures]
24+
version = "0.3"
1825
optional = true
1926

2027
[features]
21-
default = ["http"]
22-
http = ["reqwest"]
28+
default = ["reqwest", "futures"]
2329
unstable = []
30+
31+
[dev-dependencies]
32+
tokio = "0.2.11"

README.md

+10-7
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ Add it to your ``Cargo.toml``:
1515

1616
```toml
1717
[dependencies]
18-
robotparser = "0.10"
18+
robotparser = "0.11"
1919
```
2020

2121
Add ``extern crate robotparser`` to your crate root and you're good to go!
@@ -24,14 +24,17 @@ Add ``extern crate robotparser`` to your crate root and your're good to go!
2424
## Examples
2525

2626
```rust
27-
extern crate robotparser;
28-
29-
use robotparser::RobotFileParser;
27+
use robotparser::http::RobotsTxtClient;
28+
use robotparser::service::RobotsTxtService;
29+
use reqwest::Client;
30+
use url::Url;
3031

3132
fn main() {
32-
let parser = RobotFileParser::new("http://www.python.org/robots.txt");
33-
parser.read();
34-
assert!(parser.can_fetch("*", "http://www.python.org/robots.txt"));
33+
let client = Client::new();
34+
let robots_txt_url = Url::parse("http://www.python.org/robots.txt").unwrap();
35+
let robots_txt = client.fetch_robots_txt(robots_txt_url.origin()).unwrap().get_result();
36+
let fetch_url = Url::parse("http://www.python.org/robots.txt").unwrap();
37+
assert!(robots_txt.can_fetch("*", &fetch_url));
3538
}
3639
```
3740

src/http.rs

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
//! # Supported libraries
//! To enable support for the required library, you need to add this feature to your `Cargo.toml`.
//! Now only one library is supported - `reqwest`.
//! But you can also add support for other libraries.

use url::Origin;
#[cfg(feature = "reqwest")]
/// Support for reqwest library.
pub mod reqwest;

/// User agent of this crate.
pub const DEFAULT_USER_AGENT: &str = "robotparser-rs (https://crates.io/crates/robotparser)";

/// Trait to fetch and parse the robots.txt file.
/// Must be implemented on http-client.
pub trait RobotsTxtClient {
    /// What a fetch produces; clients choose this (e.g. a `Result` for a
    /// blocking client, or a future for an async one).
    type Result;
    /// Fetches and parses `robots.txt` for the given origin
    /// (i.e. `<origin>/robots.txt`).
    fn fetch_robots_txt(&self, origin: Origin) -> Self::Result;
}

src/http/reqwest.rs

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
// Blocking (synchronous) reqwest client implementation.
mod sync_reqwest;
pub use self::sync_reqwest::*;
// Asynchronous (future-based) reqwest client implementation.
mod async_reqwest;
pub use self::async_reqwest::*;

src/http/reqwest/async_reqwest.rs

+76
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
use reqwest::{Client, Request};
2+
use reqwest::{Method, Error};
3+
use reqwest::header::HeaderValue;
4+
use url::{Origin, Url};
5+
use reqwest::header::USER_AGENT;
6+
use crate::http::{RobotsTxtClient, DEFAULT_USER_AGENT};
7+
use crate::parser::{ParseResult, parse_fetched_robots_txt};
8+
use crate::model::FetchedRobotsTxt;
9+
use std::pin::Pin;
10+
use futures::task::{Context, Poll};
11+
use futures::Future;
12+
use futures::future::TryFutureExt;
13+
use futures::future::ok as future_ok;
14+
15+
/// Boxed future yielding the response metadata plus the robots.txt body text.
type FetchFuture = Box<dyn Future<Output=Result<(ResponseInfo, String), Error>>>;
16+
17+
impl RobotsTxtClient for Client {
    /// Resolves to the parse result once the HTTP exchange completes.
    type Result = RobotsTxtResponse;
    /// Starts an asynchronous GET of `<origin>/robots.txt` and returns a
    /// future wrapping the request.
    ///
    /// Panics if the origin cannot be turned into a valid URL (e.g. an
    /// opaque origin).
    fn fetch_robots_txt(&self, origin: Origin) -> Self::Result {
        let url = format!("{}/robots.txt", origin.unicode_serialization());
        let url = Url::parse(&url).expect("Unable to parse robots.txt url");
        let mut request = Request::new(Method::GET, url);
        // Identify this crate; `insert` replaces any previous value, which we discard.
        let _ = request.headers_mut().insert(USER_AGENT, HeaderValue::from_static(DEFAULT_USER_AGENT));
        // Capture the status code before consuming the response for its body text.
        let response = self
            .execute(request)
            .and_then(|response| {
                let response_info = ResponseInfo {status_code: response.status().as_u16()};
                return response.text().and_then(|response_text| {
                    return future_ok((response_info, response_text));
                });
            });
        // Box + pin so the combinator chain can be stored behind a stable type.
        let response: Pin<Box<dyn Future<Output=Result<(ResponseInfo, String), Error>>>> = Box::pin(response);
        return RobotsTxtResponse {
            origin,
            response,
        }
    }
}
39+
40+
/// Minimal response metadata carried to the parser (only the status is needed).
struct ResponseInfo {
    // HTTP status code of the robots.txt response.
    status_code: u16,
}
43+
44+
/// Future for fetching robots.txt result.
pub struct RobotsTxtResponse {
    // Origin the robots.txt was requested for; handed to the parser on completion.
    origin: Origin,
    // The in-flight fetch of the robots.txt status and body.
    response: Pin<FetchFuture>,
}
49+
50+
impl RobotsTxtResponse {
51+
/// Returns origin of robots.txt
52+
pub fn get_origin(&self) -> &Origin {
53+
return &self.origin;
54+
}
55+
}
56+
57+
impl Future for RobotsTxtResponse {
58+
type Output = Result<ParseResult<FetchedRobotsTxt>, Error>;
59+
60+
fn poll(self: Pin<&mut Self>, cx: &mut Context) -> Poll<Self::Output> {
61+
let self_mut = self.get_mut();
62+
let response_pin = self_mut.response.as_mut();
63+
match response_pin.poll(cx) {
64+
Poll::Ready(Ok((response_info, text))) => {
65+
let robots_txt = parse_fetched_robots_txt(self_mut.origin.clone(), response_info.status_code, &text);
66+
return Poll::Ready(Ok(robots_txt));
67+
},
68+
Poll::Ready(Err(error)) => {
69+
return Poll::Ready(Err(error));
70+
},
71+
Poll::Pending => {
72+
return Poll::Pending;
73+
},
74+
}
75+
}
76+
}

src/http/reqwest/sync_reqwest.rs

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
use reqwest::blocking::{Client, Request};
2+
use reqwest::{Method, Error};
3+
use reqwest::header::HeaderValue;
4+
use url::{Origin, Url};
5+
use reqwest::header::USER_AGENT;
6+
use crate::http::{RobotsTxtClient, DEFAULT_USER_AGENT};
7+
use crate::parser::{ParseResult, parse_fetched_robots_txt};
8+
use crate::model::FetchedRobotsTxt;
9+
10+
impl RobotsTxtClient for Client {
11+
type Result = Result<ParseResult<FetchedRobotsTxt>, Error>;
12+
fn fetch_robots_txt(&self, origin: Origin) -> Self::Result {
13+
let url = format!("{}/robots.txt", origin.unicode_serialization());
14+
let url = Url::parse(&url).expect("Unable to parse robots.txt url");
15+
let mut request = Request::new(Method::GET, url);
16+
let _ = request.headers_mut().insert(USER_AGENT, HeaderValue::from_static(DEFAULT_USER_AGENT));
17+
let response = self.execute(request)?;
18+
let status_code = response.status().as_u16();
19+
let text = response.text()?;
20+
let robots_txt = parse_fetched_robots_txt(origin, status_code, &text);
21+
return Ok(robots_txt);
22+
}
23+
}

0 commit comments

Comments
 (0)