Initial github-actions (#25)

* Initial github-actions

mostly taken from the starship project
ref #21

* rustfmt config remove unknown configuration options

* Run rustfmt

* clippy: use any instead of find..is_some

* clippy: Remove the `clone` call: `self.crawl_delay`

* Clippy fixes

* Rustfmt fixes

* clippy: fix dont need to add `&` to all patterns

* clippy: fix needless `fn main` in doctest

* clippy: fix if-then-else expression returns a bool literal

* clippy: fix very complex type BoxFuture response

* clippy: fix variable `line_no` is used as a loop counter

* clippy: dereference the expression on tests

* clippy: fix assert(true) will be optimized out by the compiler

* github: name workflow
This commit is contained in:
Laurent Arnoud 2020-03-30 01:32:32 +00:00 committed by GitHub
parent df49f6bcf0
commit 6ba403aab9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
31 changed files with 532 additions and 458 deletions

127
.github/workflows/workflow.yml vendored Normal file
View file

@ -0,0 +1,127 @@
---
# CI pipeline: formatting, linting, dependency audit, compile check and the
# cross-platform test matrix. Runs on every push and pull request except
# those that only touch Markdown files.
name: Main workflow
on:
  push:
    paths-ignore:
      - "**.md"
  pull_request:
    paths-ignore:
      - "**.md"

jobs:
  # Run the `rustfmt` code formatter
  rustfmt:
    name: Rustfmt [Formatter]
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - uses: actions-rs/toolchain@v1
        with:
          profile: minimal
          toolchain: stable
          components: rustfmt
          override: true
      - run: rustup component add rustfmt
      - uses: actions-rs/cargo@v1
        with:
          command: fmt
          args: --all -- --check

  # Run the `clippy` linting tool
  clippy:
    name: Clippy [Linter]
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - uses: actions-rs/toolchain@v1
        with:
          profile: minimal
          toolchain: stable
          components: clippy
          override: true
      - uses: actions-rs/clippy-check@v1
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          args: --all-targets --all-features -- -D clippy::all

  # Run a security audit on dependencies
  cargo_audit:
    name: Cargo Audit [Security]
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - uses: actions-rs/toolchain@v1
        with:
          toolchain: stable
          override: true
      - run: cargo install --force cargo-audit
      # `cargo audit` works from Cargo.lock, so generate one before auditing
      - run: cargo generate-lockfile
      - uses: actions-rs/cargo@v1
        with:
          command: audit

  # Ensure that the project could be successfully compiled
  cargo_check:
    name: Compile
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - uses: actions-rs/toolchain@v1
        with:
          profile: minimal
          toolchain: stable
          override: true
      - uses: actions-rs/cargo@v1
        with:
          command: check
          args: --all

  # Run tests on Linux, macOS, and Windows
  # On both Rust stable and Rust nightly
  test:
    name: Test Suite
    needs: [cargo_check]
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, macOS-latest, windows-latest]
        rust: [stable, nightly]
    steps:
      # Checkout the branch being tested
      - uses: actions/checkout@v2
      # Cache files between builds
      - name: Cache cargo registry
        uses: actions/cache@v1
        with:
          path: ~/.cargo/registry
          key: ${{ runner.os }}-cargo-registry-${{ hashFiles('**/Cargo.lock') }}
      - name: Cache cargo index
        uses: actions/cache@v1
        with:
          path: ~/.cargo/git
          key: ${{ runner.os }}-cargo-index-${{ hashFiles('**/Cargo.lock') }}
      - name: Cache cargo build
        uses: actions/cache@v1
        with:
          path: target
          # Build artifacts are toolchain-specific, so stable and nightly
          # must not share a cache entry.
          key: ${{ runner.os }}-${{ matrix.rust }}-cargo-build-target-${{ hashFiles('**/Cargo.lock') }}
      # Install all the required dependencies for testing
      - uses: actions-rs/toolchain@v1
        with:
          profile: minimal
          # Use the toolchain selected by the matrix; a hard-coded `stable`
          # here would make the nightly legs re-run the stable tests.
          toolchain: ${{ matrix.rust }}
          override: true
      - name: Run all tests
        uses: actions-rs/cargo@v1
        with:
          command: test

View file

@ -1,3 +1 @@
max_width = 120 max_width = 120
ideal_width = 100
write_mode = "Overwrite"

View file

@ -1,41 +1,43 @@
use reqwest::{Client, Request};
use reqwest::Method;
use reqwest::Error as ReqwestError;
use reqwest::header::HeaderValue;
use url::{Origin, Url};
use reqwest::header::USER_AGENT;
use crate::http::{RobotsTxtClient, DEFAULT_USER_AGENT}; use crate::http::{RobotsTxtClient, DEFAULT_USER_AGENT};
use crate::parser::{ParseResult, parse_fetched_robots_txt};
use crate::model::FetchedRobotsTxt; use crate::model::FetchedRobotsTxt;
use crate::model::{Error, ErrorKind}; use crate::model::{Error, ErrorKind};
use std::pin::Pin; use crate::parser::{parse_fetched_robots_txt, ParseResult};
use futures::future::ok as future_ok;
use futures::future::TryFutureExt;
use futures::task::{Context, Poll}; use futures::task::{Context, Poll};
use futures::Future; use futures::Future;
use futures::future::TryFutureExt; use reqwest::header::HeaderValue;
use futures::future::ok as future_ok; use reqwest::header::USER_AGENT;
use reqwest::Error as ReqwestError;
use reqwest::Method;
use reqwest::{Client, Request};
use std::pin::Pin;
use url::{Origin, Url};
type FetchFuture = Box<dyn Future<Output=Result<(ResponseInfo, String), ReqwestError>>>; type FetchFuture = Box<dyn Future<Output = Result<(ResponseInfo, String), ReqwestError>>>;
type BoxFuture = Pin<FetchFuture>;
impl RobotsTxtClient for Client { impl RobotsTxtClient for Client {
type Result = Result<RobotsTxtResponse, Error>; type Result = Result<RobotsTxtResponse, Error>;
fn fetch_robots_txt(&self, origin: Origin) -> Self::Result { fn fetch_robots_txt(&self, origin: Origin) -> Self::Result {
let url = format!("{}/robots.txt", origin.unicode_serialization()); let url = format!("{}/robots.txt", origin.unicode_serialization());
let url = Url::parse(&url).map_err(|err| Error {kind: ErrorKind::Url(err)})?; let url = Url::parse(&url).map_err(|err| Error {
kind: ErrorKind::Url(err),
})?;
let mut request = Request::new(Method::GET, url); let mut request = Request::new(Method::GET, url);
let _ = request.headers_mut().insert(USER_AGENT, HeaderValue::from_static(DEFAULT_USER_AGENT)); let _ = request
let response = self .headers_mut()
.execute(request) .insert(USER_AGENT, HeaderValue::from_static(DEFAULT_USER_AGENT));
.and_then(|response| { let response = self.execute(request).and_then(|response| {
let response_info = ResponseInfo {status_code: response.status().as_u16()}; let response_info = ResponseInfo {
return response.text().and_then(|response_text| { status_code: response.status().as_u16(),
return future_ok((response_info, response_text)); };
}); response
}); .text()
let response: Pin<Box<dyn Future<Output=Result<(ResponseInfo, String), ReqwestError>>>> = Box::pin(response); .and_then(|response_text| future_ok((response_info, response_text)))
Ok(RobotsTxtResponse { });
origin, let response: BoxFuture = Box::pin(response);
response, Ok(RobotsTxtResponse { origin, response })
})
} }
} }
@ -52,7 +54,7 @@ pub struct RobotsTxtResponse {
impl RobotsTxtResponse { impl RobotsTxtResponse {
/// Returns origin of robots.txt /// Returns origin of robots.txt
pub fn get_origin(&self) -> &Origin { pub fn get_origin(&self) -> &Origin {
return &self.origin; &self.origin
} }
} }
@ -65,14 +67,10 @@ impl Future for RobotsTxtResponse {
match response_pin.poll(cx) { match response_pin.poll(cx) {
Poll::Ready(Ok((response_info, text))) => { Poll::Ready(Ok((response_info, text))) => {
let robots_txt = parse_fetched_robots_txt(self_mut.origin.clone(), response_info.status_code, &text); let robots_txt = parse_fetched_robots_txt(self_mut.origin.clone(), response_info.status_code, &text);
return Poll::Ready(Ok(robots_txt)); Poll::Ready(Ok(robots_txt))
}, }
Poll::Ready(Err(error)) => { Poll::Ready(Err(error)) => Poll::Ready(Err(error)),
return Poll::Ready(Err(error)); Poll::Pending => Poll::Pending,
},
Poll::Pending => {
return Poll::Pending;
},
} }
} }
} }

View file

@ -1,24 +1,32 @@
use reqwest::blocking::{Client, Request};
use reqwest::Method;
use reqwest::header::HeaderValue;
use url::{Origin, Url};
use reqwest::header::USER_AGENT;
use crate::http::{RobotsTxtClient, DEFAULT_USER_AGENT}; use crate::http::{RobotsTxtClient, DEFAULT_USER_AGENT};
use crate::parser::{ParseResult, parse_fetched_robots_txt};
use crate::model::FetchedRobotsTxt; use crate::model::FetchedRobotsTxt;
use crate::model::{Error, ErrorKind}; use crate::model::{Error, ErrorKind};
use crate::parser::{parse_fetched_robots_txt, ParseResult};
use reqwest::blocking::{Client, Request};
use reqwest::header::HeaderValue;
use reqwest::header::USER_AGENT;
use reqwest::Method;
use url::{Origin, Url};
impl RobotsTxtClient for Client { impl RobotsTxtClient for Client {
type Result = Result<ParseResult<FetchedRobotsTxt>, Error>; type Result = Result<ParseResult<FetchedRobotsTxt>, Error>;
fn fetch_robots_txt(&self, origin: Origin) -> Self::Result { fn fetch_robots_txt(&self, origin: Origin) -> Self::Result {
let url = format!("{}/robots.txt", origin.unicode_serialization()); let url = format!("{}/robots.txt", origin.unicode_serialization());
let url = Url::parse(&url).map_err(|err| Error {kind: ErrorKind::Url(err)})?; let url = Url::parse(&url).map_err(|err| Error {
kind: ErrorKind::Url(err),
})?;
let mut request = Request::new(Method::GET, url); let mut request = Request::new(Method::GET, url);
let _ = request.headers_mut().insert(USER_AGENT, HeaderValue::from_static(DEFAULT_USER_AGENT)); let _ = request
let response = self.execute(request).map_err(|err| Error {kind: ErrorKind::Http(err)})?; .headers_mut()
.insert(USER_AGENT, HeaderValue::from_static(DEFAULT_USER_AGENT));
let response = self.execute(request).map_err(|err| Error {
kind: ErrorKind::Http(err),
})?;
let status_code = response.status().as_u16(); let status_code = response.status().as_u16();
let text = response.text().map_err(|err| Error {kind: ErrorKind::Http(err)})?; let text = response.text().map_err(|err| Error {
kind: ErrorKind::Http(err),
})?;
let robots_txt = parse_fetched_robots_txt(origin, status_code, &text); let robots_txt = parse_fetched_robots_txt(origin, status_code, &text);
return Ok(robots_txt); Ok(robots_txt)
} }
} }

View file

@ -21,20 +21,18 @@
//! use reqwest::blocking::Client; //! use reqwest::blocking::Client;
//! use url::Url; //! use url::Url;
//! //!
//! fn main() { //! let client = Client::new();
//! let client = Client::new(); //! let robots_txt_url = Url::parse("https://www.python.org/robots.txt").unwrap();
//! let robots_txt_url = Url::parse("https://www.python.org/robots.txt").unwrap(); //! let robots_txt = client.fetch_robots_txt(robots_txt_url.origin()).unwrap().get_result();
//! let robots_txt = client.fetch_robots_txt(robots_txt_url.origin()).unwrap().get_result(); //! let fetch_url = Url::parse("https://www.python.org/robots.txt").unwrap();
//! let fetch_url = Url::parse("https://www.python.org/robots.txt").unwrap(); //! assert!(robots_txt.can_fetch("*", &fetch_url));
//! assert!(robots_txt.can_fetch("*", &fetch_url));
//! }
//! ``` //! ```
/// Request builder & response parsers for other http libraries.
pub mod http;
/// Contains models of robots.txt file. /// Contains models of robots.txt file.
pub mod model; pub mod model;
/// Contains robots.txt parsers. /// Contains robots.txt parsers.
pub mod parser; pub mod parser;
/// Contains robots.txt services. /// Contains robots.txt services.
pub mod service; pub mod service;
/// Request builder & response parsers for other http libraries.
pub mod http;

View file

@ -1,19 +1,19 @@
mod path_pattern; mod path_pattern;
pub (crate) use self::path_pattern::PathPattern; pub(crate) use self::path_pattern::PathPattern;
mod group; mod group;
pub (crate) use self::group::Group; pub(crate) use self::group::Group;
mod rule; mod rule;
pub (crate) use self::rule::Rule; pub(crate) use self::rule::Rule;
mod clean_params; mod clean_params;
pub (crate) use self::clean_params::CleanParams; pub(crate) use self::clean_params::CleanParams;
mod request_rate; mod request_rate;
pub use self::request_rate::RequestRate; pub use self::request_rate::RequestRate;
mod robots_txt; mod robots_txt;
pub use self::fetched_robots_txt::FetchedRobotsTxt; pub use self::fetched_robots_txt::FetchedRobotsTxt;
pub (crate) use self::fetched_robots_txt::FetchedRobotsTxtContainer; pub(crate) use self::fetched_robots_txt::FetchedRobotsTxtContainer;
mod fetched_robots_txt; mod fetched_robots_txt;
pub use self::robots_txt::RobotsTxt; pub use self::robots_txt::RobotsTxt;
mod path; mod path;
pub (crate) use self::path::Path; pub(crate) use self::path::Path;
mod errors; mod errors;
pub use self::errors::{Error, ErrorKind}; pub use self::errors::{Error, ErrorKind};

View file

@ -8,17 +8,14 @@ pub struct CleanParams {
impl CleanParams { impl CleanParams {
pub fn new(path_pattern: PathPattern, params: Vec<String>) -> CleanParams { pub fn new(path_pattern: PathPattern, params: Vec<String>) -> CleanParams {
return CleanParams { CleanParams { path_pattern, params }
path_pattern,
params,
}
} }
pub fn get_path_pattern(&self) -> &PathPattern { pub fn get_path_pattern(&self) -> &PathPattern {
return &self.path_pattern; &self.path_pattern
} }
pub fn get_params(&self) -> &Vec<String> { pub fn get_params(&self) -> &Vec<String> {
return &self.params; &self.params
} }
} }

View file

@ -2,7 +2,7 @@ use crate::model::robots_txt::RobotsTxt;
use std::time::SystemTime; use std::time::SystemTime;
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub (crate) enum FetchedRobotsTxtContainer { pub(crate) enum FetchedRobotsTxtContainer {
FetchDenied, FetchDenied,
FetchFailed, FetchFailed,
Fetched(RobotsTxt), Fetched(RobotsTxt),
@ -19,18 +19,18 @@ pub struct FetchedRobotsTxt {
} }
impl FetchedRobotsTxt { impl FetchedRobotsTxt {
pub (crate) fn new(container: FetchedRobotsTxtContainer) -> FetchedRobotsTxt { pub(crate) fn new(container: FetchedRobotsTxtContainer) -> FetchedRobotsTxt {
FetchedRobotsTxt { FetchedRobotsTxt {
fetched_at: SystemTime::now(), fetched_at: SystemTime::now(),
container, container,
} }
} }
pub (crate) fn get_container(&self) -> &FetchedRobotsTxtContainer { pub(crate) fn get_container(&self) -> &FetchedRobotsTxtContainer {
return &self.container; &self.container
} }
/// Returns the system time when the robots.txt file was downloaded over the network. /// Returns the system time when the robots.txt file was downloaded over the network.
pub fn get_fetched_at(&self) -> &SystemTime { pub fn get_fetched_at(&self) -> &SystemTime {
return &self.fetched_at; &self.fetched_at
} }
} }

View file

@ -1,6 +1,6 @@
use std::time::Duration;
use crate::model::request_rate::RequestRate; use crate::model::request_rate::RequestRate;
use crate::model::rule::Rule; use crate::model::rule::Rule;
use std::time::Duration;
/// A group has one or more user-agents and zero or more rules /// A group has one or more user-agents and zero or more rules
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
@ -12,7 +12,7 @@ pub struct Group {
} }
impl Group { impl Group {
pub (crate) fn new() -> Group { pub(crate) fn new() -> Group {
Group { Group {
user_agents: vec![], user_agents: vec![],
rules: vec![], rules: vec![],
@ -22,8 +22,8 @@ impl Group {
} }
/// check if this group applies to the specified agent /// check if this group applies to the specified agent
pub (crate) fn applies_to(&self, useragent: &str) -> bool { pub(crate) fn applies_to(&self, useragent: &str) -> bool {
let ua = useragent.split('/').nth(0).unwrap_or("").to_lowercase(); let ua = useragent.split('/').next().unwrap_or("").to_lowercase();
for agent in self.user_agents.iter() { for agent in self.user_agents.iter() {
if ua.contains(agent) { if ua.contains(agent) {
return true; return true;
@ -32,60 +32,54 @@ impl Group {
false false
} }
pub (crate) fn push_useragent(&mut self, useragent: &str) { pub(crate) fn push_useragent(&mut self, useragent: &str) {
self.user_agents.push(useragent.to_lowercase().to_owned()); self.user_agents.push(useragent.to_lowercase());
} }
pub (crate) fn push_rule(&mut self, rule: Rule) { pub(crate) fn push_rule(&mut self, rule: Rule) {
self.rules.push(rule); self.rules.push(rule);
} }
pub (crate) fn get_rules_sorted_by_path_len_desc(&self) -> Vec<&Rule> { pub(crate) fn get_rules_sorted_by_path_len_desc(&self) -> Vec<&Rule> {
let mut rules: Vec<&Rule> = self.rules.iter().collect(); let mut rules: Vec<&Rule> = self.rules.iter().collect();
rules.sort_by(|a, b| { rules.sort_by(|a, b| {
let a = a.get_path_pattern().len(); let a = a.get_path_pattern().len();
let b = b.get_path_pattern().len(); let b = b.get_path_pattern().len();
return b.cmp(&a); b.cmp(&a)
}); });
return rules; rules
} }
pub (crate) fn contains_user_agent(&self, user_agent: &str) -> bool { pub(crate) fn contains_user_agent(&self, user_agent: &str) -> bool {
return self self.user_agents.iter().any(|item| *item == user_agent)
.user_agents
.iter()
.find(|item| {
return *item == user_agent;
}).is_some();
} }
pub (crate) fn set_crawl_delay(&mut self, delay: Duration) { pub(crate) fn set_crawl_delay(&mut self, delay: Duration) {
self.crawl_delay = Some(delay); self.crawl_delay = Some(delay);
} }
pub (crate) fn get_crawl_delay(&self) -> Option<Duration> { pub(crate) fn get_crawl_delay(&self) -> Option<Duration> {
return self.crawl_delay.clone(); self.crawl_delay
} }
pub (crate) fn set_req_rate(&mut self, req_rate: RequestRate) { pub(crate) fn set_req_rate(&mut self, req_rate: RequestRate) {
self.req_rate = Some(req_rate); self.req_rate = Some(req_rate);
} }
pub (crate) fn get_req_rate(&self) -> Option<RequestRate> { pub(crate) fn get_req_rate(&self) -> Option<RequestRate> {
return self.req_rate.clone(); self.req_rate.clone()
} }
pub (crate) fn is_default(&self) -> bool { pub(crate) fn is_default(&self) -> bool {
for user_agent in self.user_agents.iter() { for user_agent in self.user_agents.iter() {
if user_agent == "*" { if user_agent == "*" {
return true; return true;
} }
} }
return false; false
} }
} }
impl Default for Group { impl Default for Group {
fn default() -> Group { fn default() -> Group {
Group::new() Group::new()

View file

@ -1,5 +1,5 @@
use url::Url;
use percent_encoding::percent_decode; use percent_encoding::percent_decode;
use url::Url;
#[derive(Debug)] #[derive(Debug)]
pub struct Path(String); pub struct Path(String);
@ -9,13 +9,13 @@ impl Path {
let path = get_url_without_origin(&url); let path = get_url_without_origin(&url);
let path = percent_decode(path.as_bytes()).decode_utf8_lossy(); let path = percent_decode(path.as_bytes()).decode_utf8_lossy();
if path.is_empty() { if path.is_empty() {
return Path("/".into()); Path("/".into())
} else { } else {
return Path(path.into()); Path(path.into())
} }
} }
pub fn as_str(&self) -> &str { pub fn as_str(&self) -> &str {
return &self.0; &self.0
} }
} }
@ -24,10 +24,10 @@ fn get_url_without_origin(url: &Url) -> &str {
let url = url.as_str(); let url = url.as_str();
let unicode_origin = origin.unicode_serialization(); let unicode_origin = origin.unicode_serialization();
let ascii_origin = origin.ascii_serialization(); let ascii_origin = origin.ascii_serialization();
if url.starts_with(&unicode_origin) && unicode_origin.len() >= 1 { if url.starts_with(&unicode_origin) && !unicode_origin.is_empty() {
return &url[unicode_origin.len()..]; return &url[unicode_origin.len()..];
} }
if url.starts_with(&ascii_origin) && ascii_origin.len() >= 1 { if url.starts_with(&ascii_origin) && !ascii_origin.is_empty() {
return &url[ascii_origin.len()..]; return &url[ascii_origin.len()..];
} }
// Must never be executed. // Must never be executed.

View file

@ -1,7 +1,7 @@
use crate::model::path::Path;
use percent_encoding::percent_decode;
use std::convert::From; use std::convert::From;
use std::mem::replace; use std::mem::replace;
use percent_encoding::percent_decode;
use crate::model::path::Path;
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub struct PathPattern(Vec<PathPatternToken>); pub struct PathPattern(Vec<PathPatternToken>);
@ -16,22 +16,16 @@ enum PathPatternToken {
impl PathPatternToken { impl PathPatternToken {
fn from_path_pattern(path: String) -> PathPatternToken { fn from_path_pattern(path: String) -> PathPatternToken {
let path = percent_decode(path.as_bytes()).decode_utf8_lossy(); let path = percent_decode(path.as_bytes()).decode_utf8_lossy();
return PathPatternToken::Text(path.to_string()); PathPatternToken::Text(path.to_string())
} }
} }
impl PathPatternToken { impl PathPatternToken {
fn len(&self) -> usize { fn len(&self) -> usize {
return match self { match *self {
&PathPatternToken::Text(ref text) => { PathPatternToken::Text(ref text) => text.len(),
text.len() PathPatternToken::AnyString => 1,
}, PathPatternToken::TerminateString => 1,
&PathPatternToken::AnyString => {
1
},
&PathPatternToken::TerminateString => {
1
},
} }
} }
} }
@ -42,16 +36,12 @@ impl PathPattern {
let mut tokens = Vec::new(); let mut tokens = Vec::new();
for c in path.chars() { for c in path.chars() {
let prepared_token = match c { let prepared_token = match c {
'*' => { '*' => Some(PathPatternToken::AnyString),
Some(PathPatternToken::AnyString) '$' => Some(PathPatternToken::TerminateString),
},
'$' => {
Some(PathPatternToken::TerminateString)
},
_ => { _ => {
text.push(c); text.push(c);
None None
}, }
}; };
if let Some(prepared_token) = prepared_token { if let Some(prepared_token) = prepared_token {
if !text.is_empty() { if !text.is_empty() {
@ -67,26 +57,26 @@ impl PathPattern {
tokens.push(PathPatternToken::AnyString); tokens.push(PathPatternToken::AnyString);
} }
tokens.dedup(); tokens.dedup();
return PathPattern(tokens); PathPattern(tokens)
} }
pub fn all() -> PathPattern { pub fn all() -> PathPattern {
return PathPattern(vec![PathPatternToken::AnyString]); PathPattern(vec![PathPatternToken::AnyString])
} }
pub fn applies_to(&self, path: &Path) -> bool { pub fn applies_to(&self, path: &Path) -> bool {
let mut filename = path.as_str(); let mut filename = path.as_str();
for (index, token) in self.0.iter().enumerate() { for (index, token) in self.0.iter().enumerate() {
match token { match *token {
&PathPatternToken::Text(ref text) => { PathPatternToken::Text(ref text) => {
if !filename.starts_with(text) { if !filename.starts_with(text) {
return false; return false;
} }
filename = &filename[text.len() ..]; filename = &filename[text.len()..];
}, }
&PathPatternToken::AnyString => { PathPatternToken::AnyString => {
if let Some(&PathPatternToken::Text(ref text)) = self.0.get(index + 1) { if let Some(PathPatternToken::Text(ref text)) = self.0.get(index + 1) {
while filename.len() >= 1 { while !filename.is_empty() {
if filename.starts_with(text) { if filename.starts_with(text) {
break; break;
} }
@ -100,15 +90,15 @@ impl PathPattern {
} else { } else {
filename = &filename[filename.len()..]; filename = &filename[filename.len()..];
} }
}, }
&PathPatternToken::TerminateString => { PathPatternToken::TerminateString => {
if filename.len() != 0 { if !filename.is_empty() {
return false; return false;
} }
}, }
} }
} }
return true; true
} }
pub fn len(&self) -> usize { pub fn len(&self) -> usize {
@ -116,12 +106,12 @@ impl PathPattern {
for path_token in self.0.iter() { for path_token in self.0.iter() {
length += path_token.len(); length += path_token.len();
} }
return length; length
} }
} }
impl From<&str> for PathPattern { impl From<&str> for PathPattern {
fn from(path: &str) -> Self { fn from(path: &str) -> Self {
return PathPattern::new(path); PathPattern::new(path)
} }
} }

View file

@ -1,6 +1,6 @@
use crate::model::group::Group;
use crate::model::clean_params::CleanParams; use crate::model::clean_params::CleanParams;
use url::{Url, Origin}; use crate::model::group::Group;
use url::{Origin, Url};
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
/// The robots.txt model that was obtained after parsing the text of the robots.txt file. /// The robots.txt model that was obtained after parsing the text of the robots.txt file.
@ -14,8 +14,8 @@ pub struct RobotsTxt {
} }
impl RobotsTxt { impl RobotsTxt {
pub (crate) fn new(origin: Origin) -> RobotsTxt { pub(crate) fn new(origin: Origin) -> RobotsTxt {
return RobotsTxt { RobotsTxt {
origin, origin,
groups: Vec::new(), groups: Vec::new(),
sitemaps: Vec::new(), sitemaps: Vec::new(),
@ -23,31 +23,35 @@ impl RobotsTxt {
} }
} }
pub (crate) fn add_sitemap(&mut self, url: Url) { pub(crate) fn add_sitemap(&mut self, url: Url) {
self.sitemaps.push(url); self.sitemaps.push(url);
} }
pub (crate) fn get_sitemaps_slice(&self) -> &[Url] { pub(crate) fn get_sitemaps_slice(&self) -> &[Url] {
return self.sitemaps.as_slice(); self.sitemaps.as_slice()
} }
pub (crate) fn add_clean_params(&mut self, clean_params: CleanParams) { pub(crate) fn add_clean_params(&mut self, clean_params: CleanParams) {
self.clean_params.push(clean_params); self.clean_params.push(clean_params);
} }
pub (crate) fn get_clean_params(&self) -> &[CleanParams] { pub(crate) fn get_clean_params(&self) -> &[CleanParams] {
return self.clean_params.as_slice(); self.clean_params.as_slice()
} }
pub (crate) fn add_group(&mut self, group: Group) { pub(crate) fn add_group(&mut self, group: Group) {
self.groups.push(group); self.groups.push(group);
} }
pub (crate) fn get_origin(&self) -> &Origin { pub(crate) fn get_origin(&self) -> &Origin {
return &self.origin; &self.origin
} }
pub (crate) fn find_in_group<'a, T>(&'a self, user_agent: &str, callback: impl Fn(&'a Group) -> Option<T>) -> Option<T> { pub(crate) fn find_in_group<'a, T>(
&'a self,
user_agent: &str,
callback: impl Fn(&'a Group) -> Option<T>,
) -> Option<T> {
// Search by user agents // Search by user agents
for group in self.groups.iter() { for group in self.groups.iter() {
if group.applies_to(user_agent) { if group.applies_to(user_agent) {
@ -61,15 +65,15 @@ impl RobotsTxt {
return Some(output); return Some(output);
} }
} }
return None; None
} }
pub (crate) fn get_default_group(&self) -> Option<&Group> { pub(crate) fn get_default_group(&self) -> Option<&Group> {
for group in self.groups.iter() { for group in self.groups.iter() {
if group.is_default() { if group.is_default() {
return Some(group); return Some(group);
} }
} }
return None; None
} }
} }

View file

@ -1,5 +1,5 @@
use crate::model::path_pattern::PathPattern;
use crate::model::path::Path; use crate::model::path::Path;
use crate::model::path_pattern::PathPattern;
/// A rule line is a single "Allow:" (allowance==True) or "Disallow:" /// A rule line is a single "Allow:" (allowance==True) or "Disallow:"
/// (allowance==False) followed by a path. /// (allowance==False) followed by a path.
@ -17,15 +17,15 @@ impl Rule {
} }
} }
pub (crate) fn applies_to(&self, path: &Path) -> bool { pub(crate) fn applies_to(&self, path: &Path) -> bool {
return self.path_pattern.applies_to(path); self.path_pattern.applies_to(path)
} }
pub (crate) fn get_allowance(&self) -> bool { pub(crate) fn get_allowance(&self) -> bool {
return self.allowance; self.allowance
} }
pub (crate) fn get_path_pattern(&self) -> &PathPattern { pub(crate) fn get_path_pattern(&self) -> &PathPattern {
return &self.path_pattern; &self.path_pattern
} }
} }

View file

@ -15,17 +15,15 @@
//! use robotparser::service::RobotsTxtService; //! use robotparser::service::RobotsTxtService;
//! use url::Url; //! use url::Url;
//! //!
//! fn main() { //! let robots_txt_url = Url::parse("https://google.com/robots.txt").unwrap();
//! let robots_txt_url = Url::parse("https://google.com/robots.txt").unwrap(); //! let robots_txt = "User-agent: *\nDisallow: /search";
//! let robots_txt = "User-agent: *\nDisallow: /search"; //! let robots_txt = parse_robots_txt(robots_txt_url.origin(), robots_txt);
//! let robots_txt = parse_robots_txt(robots_txt_url.origin(), robots_txt); //! assert_eq!(robots_txt.get_warnings().len(), 0);
//! assert_eq!(robots_txt.get_warnings().len(), 0); //! let robots_txt = robots_txt.get_result();
//! let robots_txt = robots_txt.get_result(); //! let good_url = Url::parse("https://google.com/test").unwrap();
//! let good_url = Url::parse("https://google.com/test").unwrap(); //! let bad_url = Url::parse("https://google.com/search/vvv").unwrap();
//! let bad_url = Url::parse("https://google.com/search/vvv").unwrap(); //! assert_eq!(robots_txt.can_fetch("*", &bad_url), false);
//! assert_eq!(robots_txt.can_fetch("*", &bad_url), false); //! assert_eq!(robots_txt.can_fetch("*", &good_url), true);
//! assert_eq!(robots_txt.can_fetch("*", &good_url), true);
//! }
//! ``` //! ```
mod robots_txt_parser; mod robots_txt_parser;
pub use self::robots_txt_parser::parse as parse_robots_txt; pub use self::robots_txt_parser::parse as parse_robots_txt;

View file

@ -1,6 +1,6 @@
use crate::model::{FetchedRobotsTxt, FetchedRobotsTxtContainer}; use crate::model::{FetchedRobotsTxt, FetchedRobotsTxtContainer};
use crate::parser::ParseResult;
use crate::parser::parse_robots_txt; use crate::parser::parse_robots_txt;
use crate::parser::ParseResult;
use url::Origin; use url::Origin;
const UNAUTHORIZED: u16 = 401; const UNAUTHORIZED: u16 = 401;
@ -12,17 +12,9 @@ const OK: u16 = 200;
/// **IMPORTANT NOTE**: origin must point to robots.txt url **before redirects**. /// **IMPORTANT NOTE**: origin must point to robots.txt url **before redirects**.
pub fn parse(origin: Origin, status_code: u16, input: &str) -> ParseResult<FetchedRobotsTxt> { pub fn parse(origin: Origin, status_code: u16, input: &str) -> ParseResult<FetchedRobotsTxt> {
match status_code { match status_code {
UNAUTHORIZED | FORBIDDEN => { UNAUTHORIZED | FORBIDDEN => ParseResult::new(FetchedRobotsTxt::new(FetchedRobotsTxtContainer::FetchDenied)),
return ParseResult::new(FetchedRobotsTxt::new(FetchedRobotsTxtContainer::FetchDenied)); OK => parse_robots_txt(origin, input)
} .map(|result| FetchedRobotsTxt::new(FetchedRobotsTxtContainer::Fetched(result))),
OK => { _ => ParseResult::new(FetchedRobotsTxt::new(FetchedRobotsTxtContainer::FetchFailed)),
return parse_robots_txt(origin, input)
.map(|result| {
return FetchedRobotsTxt::new(FetchedRobotsTxtContainer::Fetched(result));
});
},
_ => {
return ParseResult::new(FetchedRobotsTxt::new(FetchedRobotsTxtContainer::FetchFailed));
}
} }
} }

View file

@ -3,19 +3,16 @@ pub struct Line<'a> {
position: usize, position: usize,
} }
impl <'a>Line<'a> { impl<'a> Line<'a> {
pub fn new(line: &'a str, position: usize) -> Line<'a> { pub fn new(line: &'a str, position: usize) -> Line<'a> {
return Line { Line { line, position }
line,
position,
}
} }
pub fn get_line_text(&self) -> &str { pub fn get_line_text(&self) -> &str {
return self.line; self.line
} }
pub fn get_line_number(&self) -> usize { pub fn get_line_number(&self) -> usize {
return self.position; self.position
} }
} }

View file

@ -3,36 +3,39 @@ use std::fmt::Debug;
#[derive(Debug)] #[derive(Debug)]
/// The result of the robots.txt parser. /// The result of the robots.txt parser.
pub struct ParseResult<R> where R: Debug { pub struct ParseResult<R>
where
R: Debug,
{
result: R, result: R,
warnings: Vec<ParseWarning>, warnings: Vec<ParseWarning>,
} }
impl <R>ParseResult<R> where R: Debug { impl<R> ParseResult<R>
where
R: Debug,
{
/// Creates a new structure for parser results. /// Creates a new structure for parser results.
pub (crate) fn new(result: R) -> ParseResult<R>{ pub(crate) fn new(result: R) -> ParseResult<R> {
return ParseResult { ParseResult {
result, result,
warnings: Vec::new(), warnings: Vec::new(),
} }
} }
/// Creates a new structure for parser results with warnings. /// Creates a new structure for parser results with warnings.
pub (crate) fn new_with_warnings(result: R, warnings: Vec<ParseWarning>) -> ParseResult<R>{ pub(crate) fn new_with_warnings(result: R, warnings: Vec<ParseWarning>) -> ParseResult<R> {
return ParseResult { ParseResult { result, warnings }
result,
warnings,
}
} }
/// Returns the result of the robots.txt parser. /// Returns the result of the robots.txt parser.
pub fn get_result(self) -> R { pub fn get_result(self) -> R {
return self.result; self.result
} }
/// Returns the robots.txt parser warning array. /// Returns the robots.txt parser warning array.
pub fn get_warnings(&self) -> &[ParseWarning] { pub fn get_warnings(&self) -> &[ParseWarning] {
return self.warnings.as_slice(); self.warnings.as_slice()
} }
/// Returns reference to result of the robots.txt parser or first warning. /// Returns reference to result of the robots.txt parser or first warning.
@ -40,7 +43,7 @@ impl <R>ParseResult<R> where R: Debug {
if let Some(warning) = self.warnings.first() { if let Some(warning) = self.warnings.first() {
return Err(warning); return Err(warning);
} }
return Ok(&self.result); Ok(&self.result)
} }
/// Returns the result of the robots.txt parser or first warning. /// Returns the result of the robots.txt parser or first warning.
@ -49,12 +52,15 @@ impl <R>ParseResult<R> where R: Debug {
return Ok(self.result); return Ok(self.result);
} }
let first_warning = self.warnings.remove(0); let first_warning = self.warnings.remove(0);
return Err(first_warning); Err(first_warning)
} }
/// Converts this structure into another type of structure. /// Converts this structure into another type of structure.
pub (crate) fn map<T>(self, callback: impl Fn(R) -> T) -> ParseResult<T> where T: Debug { pub(crate) fn map<T>(self, callback: impl Fn(R) -> T) -> ParseResult<T>
return ParseResult { where
T: Debug,
{
ParseResult {
result: (callback)(self.result), result: (callback)(self.result),
warnings: self.warnings, warnings: self.warnings,
} }

View file

@ -1,21 +1,21 @@
use url::{Origin, Url}; use crate::model::{CleanParams, PathPattern, RequestRate, RobotsTxt, Rule};
use std::time::Duration;
use crate::parser::parse_result::ParseResult;
use crate::model::{RobotsTxt, Rule, PathPattern, CleanParams, RequestRate};
use crate::parser::line::Line; use crate::parser::line::Line;
use crate::parser::parse_result::ParseResult;
use crate::parser::warning::ParseWarning; use crate::parser::warning::ParseWarning;
use std::time::Duration;
use url::{Origin, Url};
mod directive; mod directive;
use self::directive::Directive; use self::directive::Directive;
mod group_builder; mod group_builder;
pub use self::group_builder::GroupBuilder; pub use self::group_builder::GroupBuilder;
const COMMENT_BEGIN_CHAR: char = '#'; const COMMENT_BEGIN_CHAR: char = '#';
const KV_SEPARATOR: &'static str = ":"; const KV_SEPARATOR: &str = ":";
/// Parses the text of the robots.txt file located in the specified origin. /// Parses the text of the robots.txt file located in the specified origin.
pub fn parse(origin: Origin, input: &str) -> ParseResult<RobotsTxt> { pub fn parse(origin: Origin, input: &str) -> ParseResult<RobotsTxt> {
let parser = Parser::new(origin); let parser = Parser::new(origin);
return parser.parse(input); parser.parse(input)
} }
struct Parser { struct Parser {
@ -26,7 +26,7 @@ struct Parser {
impl Parser { impl Parser {
pub fn new(origin: Origin) -> Parser { pub fn new(origin: Origin) -> Parser {
return Parser { Parser {
result: RobotsTxt::new(origin), result: RobotsTxt::new(origin),
group_builder: GroupBuilder::new(), group_builder: GroupBuilder::new(),
warnings: Vec::new(), warnings: Vec::new(),
@ -35,22 +35,20 @@ impl Parser {
pub fn parse(mut self, input: &str) -> ParseResult<RobotsTxt> { pub fn parse(mut self, input: &str) -> ParseResult<RobotsTxt> {
let input = ignore_bom(input); let input = ignore_bom(input);
let mut line_no = 0; for (line_no, line) in input.lines().enumerate() {
for line in input.lines() { let line = Line::new(line, line_no + 1);
line_no += 1;
let line = Line::new(line, line_no);
match Self::parse_line(&line) { match Self::parse_line(&line) {
Ok(Some(line_value)) => { Ok(Some(line_value)) => {
self.process_line_value(&line, &line_value); self.process_line_value(&line, &line_value);
}, }
Err(warning) => { Err(warning) => {
self.warnings.push(warning); self.warnings.push(warning);
}, }
_ => {}, _ => {}
} }
} }
self.group_builder.fill_entries(&mut self.result); self.group_builder.fill_entries(&mut self.result);
return ParseResult::new_with_warnings(self.result, self.warnings); ParseResult::new_with_warnings(self.result, self.warnings)
} }
fn parse_line<'a>(line: &'a Line) -> Result<Option<Directive<'a>>, ParseWarning> { fn parse_line<'a>(line: &'a Line) -> Result<Option<Directive<'a>>, ParseWarning> {
@ -61,9 +59,9 @@ impl Parser {
if kv_part.is_empty() { if kv_part.is_empty() {
return Ok(None); return Ok(None);
} }
let separator_index = kv_part.find(KV_SEPARATOR).ok_or_else(|| { let separator_index = kv_part
return ParseWarning::invalid_directive_format(line); .find(KV_SEPARATOR)
})?; .ok_or_else(|| ParseWarning::invalid_directive_format(line))?;
if separator_index >= kv_part.len() { if separator_index >= kv_part.len() {
return Err(ParseWarning::invalid_directive_format(line)); return Err(ParseWarning::invalid_directive_format(line));
} }
@ -75,7 +73,7 @@ impl Parser {
let value = &kv_part[separator_index + 1..]; let value = &kv_part[separator_index + 1..];
let value = value.trim(); let value = value.trim();
let result = Directive::new(key, value); let result = Directive::new(key, value);
return Ok(Some(result)); Ok(Some(result))
} }
fn process_line_value(&mut self, line: &Line, directive: &Directive) { fn process_line_value(&mut self, line: &Line, directive: &Directive) {
@ -84,29 +82,29 @@ impl Parser {
// Group specific directives // Group specific directives
"user-agent" => { "user-agent" => {
self.process_directive_user_agent(line, directive); self.process_directive_user_agent(line, directive);
}, }
"allow" => { "allow" => {
self.process_directive_allow(line, directive); self.process_directive_allow(line, directive);
}, }
"disallow" => { "disallow" => {
self.process_directive_disallow(line, directive); self.process_directive_disallow(line, directive);
}, }
"crawl-delay" => { "crawl-delay" => {
self.process_directive_crawl_delay(line, directive); self.process_directive_crawl_delay(line, directive);
}, }
"request-rate" => { "request-rate" => {
self.process_directive_request_rate(line, directive); self.process_directive_request_rate(line, directive);
}, }
// Non-group directives // Non-group directives
"sitemap" => { "sitemap" => {
self.process_directive_sitemap(line, directive); self.process_directive_sitemap(line, directive);
}, }
"clean-param" => { "clean-param" => {
self.process_directive_clean_param(line, directive); self.process_directive_clean_param(line, directive);
}, }
_ => { _ => {
self.warnings.push(ParseWarning::unsupported_directive_key(line, key)); self.warnings.push(ParseWarning::unsupported_directive_key(line, key));
}, }
} }
} }
@ -123,7 +121,7 @@ impl Parser {
if let Some(group) = self.group_builder.get_mut_active_group() { if let Some(group) = self.group_builder.get_mut_active_group() {
if directive.get_value() == "" { if directive.get_value() == "" {
// Nothing to do. Ignoring. // Nothing to do. Ignoring.
} else if directive.get_value().starts_with("*") || directive.get_value().starts_with("/") { } else if directive.get_value().starts_with('*') || directive.get_value().starts_with('/') {
group.push_rule(Rule::new(directive.get_value(), true)); group.push_rule(Rule::new(directive.get_value(), true));
} else { } else {
self.warnings.push(ParseWarning::wrong_path_format(line)); self.warnings.push(ParseWarning::wrong_path_format(line));
@ -138,7 +136,7 @@ impl Parser {
if directive.get_value() == "" { if directive.get_value() == "" {
// Allow all. // Allow all.
group.push_rule(Rule::new(PathPattern::all(), true)); group.push_rule(Rule::new(PathPattern::all(), true));
} else if directive.get_value().starts_with("*") || directive.get_value().starts_with("/") { } else if directive.get_value().starts_with('*') || directive.get_value().starts_with('/') {
group.push_rule(Rule::new(directive.get_value(), false)); group.push_rule(Rule::new(directive.get_value(), false));
} else { } else {
self.warnings.push(ParseWarning::wrong_path_format(line)); self.warnings.push(ParseWarning::wrong_path_format(line));
@ -156,10 +154,10 @@ impl Parser {
let delay_nanoseconds = delay.fract() * 10f64.powi(9); let delay_nanoseconds = delay.fract() * 10f64.powi(9);
let delay = Duration::new(delay_seconds as u64, delay_nanoseconds as u32); let delay = Duration::new(delay_seconds as u64, delay_nanoseconds as u32);
group.set_crawl_delay(delay); group.set_crawl_delay(delay);
}, }
Err(error) => { Err(error) => {
self.warnings.push(ParseWarning::parse_crawl_delay_error(line, error)); self.warnings.push(ParseWarning::parse_crawl_delay_error(line, error));
}, }
} }
} else { } else {
self.warnings.push(ParseWarning::directive_without_user_agent(line)); self.warnings.push(ParseWarning::directive_without_user_agent(line));
@ -174,20 +172,20 @@ impl Parser {
return; return;
} }
let requests = match numbers[0].parse::<usize>() { let requests = match numbers[0].parse::<usize>() {
Ok(requests) => {requests}, Ok(requests) => requests,
Err(error) => { Err(error) => {
self.warnings.push(ParseWarning::parse_request_rate(line, error)); self.warnings.push(ParseWarning::parse_request_rate(line, error));
return; return;
}, }
}; };
let seconds = match numbers[1].parse::<usize>() { let seconds = match numbers[1].parse::<usize>() {
Ok(seconds) => {seconds}, Ok(seconds) => seconds,
Err(error) => { Err(error) => {
self.warnings.push(ParseWarning::parse_request_rate(line, error)); self.warnings.push(ParseWarning::parse_request_rate(line, error));
return; return;
}, }
}; };
group.set_req_rate(RequestRate{requests, seconds}); group.set_req_rate(RequestRate { requests, seconds });
} else { } else {
self.warnings.push(ParseWarning::directive_without_user_agent(line)); self.warnings.push(ParseWarning::directive_without_user_agent(line));
} }
@ -197,27 +195,27 @@ impl Parser {
match Url::parse(directive.get_value()) { match Url::parse(directive.get_value()) {
Ok(sitemap_url) => { Ok(sitemap_url) => {
self.result.add_sitemap(sitemap_url); self.result.add_sitemap(sitemap_url);
}, }
Err(error) => { Err(error) => {
self.warnings.push(ParseWarning::parse_url(line, error)); self.warnings.push(ParseWarning::parse_url(line, error));
}, }
} }
} }
fn process_directive_clean_param(&mut self, line: &Line, directive: &Directive) { fn process_directive_clean_param(&mut self, line: &Line, directive: &Directive) {
let parts: Vec<&str> = directive.get_value().split_whitespace().collect(); let parts: Vec<&str> = directive.get_value().split_whitespace().collect();
if parts.len() >= 3 || parts.len() == 0 { if parts.len() >= 3 || parts.is_empty() {
self.warnings.push(ParseWarning::wrong_clean_param_format(line)); self.warnings.push(ParseWarning::wrong_clean_param_format(line));
return; return;
} }
if parts[0].len() == 0 { if parts[0].is_empty() {
self.warnings.push(ParseWarning::wrong_clean_param_format(line)); self.warnings.push(ParseWarning::wrong_clean_param_format(line));
return; return;
} }
let clean_params_path_pattern; let clean_params_path_pattern;
let clean_params; let clean_params;
if let Some(second_param) = parts.get(1) { if let Some(second_param) = parts.get(1) {
if second_param.len() == 0 { if second_param.is_empty() {
self.warnings.push(ParseWarning::wrong_clean_param_format(line)); self.warnings.push(ParseWarning::wrong_clean_param_format(line));
return; return;
} }
@ -229,9 +227,11 @@ impl Parser {
} }
let (valid_clean_params, invalid_clean_params) = Self::parse_clean_params(clean_params); let (valid_clean_params, invalid_clean_params) = Self::parse_clean_params(clean_params);
if !invalid_clean_params.is_empty() { if !invalid_clean_params.is_empty() {
self.warnings.push(ParseWarning::ignored_clean_params(line, invalid_clean_params)); self.warnings
.push(ParseWarning::ignored_clean_params(line, invalid_clean_params));
} }
self.result.add_clean_params(CleanParams::new(clean_params_path_pattern, valid_clean_params)); self.result
.add_clean_params(CleanParams::new(clean_params_path_pattern, valid_clean_params));
} }
fn parse_clean_params(clean_params: &str) -> (Vec<String>, Vec<String>) { fn parse_clean_params(clean_params: &str) -> (Vec<String>, Vec<String>) {
@ -246,36 +246,29 @@ impl Parser {
} }
} }
} }
return (valid, invalid); (valid, invalid)
} }
/// Checks whether a `Clean-Param` parameter name consists only of allowed characters.
///
/// Allowed characters are ASCII letters (`A`-`Z`, `a`-`z`), ASCII digits (`0`-`9`) and
/// the characters `.`, `-` and `_`. An empty string contains no forbidden character and
/// is therefore reported as valid.
fn is_valid_clean_param(clean_param: &str) -> bool {
    // `is_ascii_alphanumeric` covers the full inclusive ranges. The half-open ranges
    // used before (`'A'..'Z'`, `'a'..'z'`, `'0'..'9'`) wrongly rejected `Z`, `z` and `9`.
    clean_param
        .chars()
        .all(|c| c.is_ascii_alphanumeric() || c == '.' || c == '-' || c == '_')
}
} }
/// Returns `input` without a leading U+FEFF byte order mark.
///
/// At most one mark is removed, and only when it is the very first character; any other
/// input is returned unchanged.
fn ignore_bom(input: &str) -> &str {
    const BOM: &str = "\u{feff}";
    // Skip by the mark's UTF-8 byte length so the slice starts on a char boundary.
    let skip = if input.starts_with(BOM) { BOM.len() } else { 0 };
    &input[skip..]
}

View file

@ -3,19 +3,16 @@ pub struct Directive<'a> {
value: &'a str, value: &'a str,
} }
impl <'a> Directive<'a> { impl<'a> Directive<'a> {
pub fn new(key: &'a str, value: &'a str) -> Directive<'a> { pub fn new(key: &'a str, value: &'a str) -> Directive<'a> {
return Directive { Directive { key, value }
key,
value,
}
} }
pub fn get_key_lowercase(&self) -> String { pub fn get_key_lowercase(&self) -> String {
return self.key.to_lowercase(); self.key.to_lowercase()
} }
pub fn get_value(&self) -> &str { pub fn get_value(&self) -> &str {
return self.value; self.value
} }
} }

View file

@ -12,7 +12,7 @@ pub struct GroupBuilder {
impl GroupBuilder { impl GroupBuilder {
pub fn new() -> GroupBuilder { pub fn new() -> GroupBuilder {
return GroupBuilder { GroupBuilder {
state: State::WaitingForNewGroup, state: State::WaitingForNewGroup,
active_group: None, active_group: None,
groups: Vec::new(), groups: Vec::new(),
@ -27,14 +27,14 @@ impl GroupBuilder {
self.groups.push(group); self.groups.push(group);
self.active_group = Some(self.groups.len() - 1); self.active_group = Some(self.groups.len() - 1);
self.state = State::WaitingForAdditionalUserAgent; self.state = State::WaitingForAdditionalUserAgent;
}, }
State::WaitingForAdditionalUserAgent => { State::WaitingForAdditionalUserAgent => {
let active_group = self.active_group.expect("Unable to get active group"); let active_group = self.active_group.expect("Unable to get active group");
let group = self.groups.get_mut(active_group).expect("Unable to get group index"); let group = self.groups.get_mut(active_group).expect("Unable to get group index");
if !group.contains_user_agent(user_agent) { if !group.contains_user_agent(user_agent) {
group.push_useragent(user_agent); group.push_useragent(user_agent);
} }
}, }
} }
} }
@ -43,7 +43,7 @@ impl GroupBuilder {
if let Some(active_group) = self.active_group { if let Some(active_group) = self.active_group {
return self.groups.get_mut(active_group); return self.groups.get_mut(active_group);
} }
return None; None
} }
pub fn fill_entries(mut self, robots_txt: &mut RobotsTxt) { pub fn fill_entries(mut self, robots_txt: &mut RobotsTxt) {

View file

@ -1,9 +1,9 @@
use super::line::Line; use super::line::Line;
use super::warning_reason::WarningReason; use super::warning_reason::WarningReason;
use url::ParseError as ParseUrlError;
use std::num::{ParseFloatError, ParseIntError};
use std::fmt;
use std::error::Error; use std::error::Error;
use std::fmt;
use std::num::{ParseFloatError, ParseIntError};
use url::ParseError as ParseUrlError;
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
/// Warning of robots.txt parser about problems when parsing robots.txt file. /// Warning of robots.txt parser about problems when parsing robots.txt file.
@ -18,109 +18,109 @@ impl Error for ParseWarning {}
impl ParseWarning { impl ParseWarning {
/// Returns the line number in the text of the robots.txt file. /// Returns the line number in the text of the robots.txt file.
pub fn get_line_no(&self) -> usize { pub fn get_line_no(&self) -> usize {
return self.line_no; self.line_no
} }
/// Returns the text of the robots.txt file string. /// Returns the text of the robots.txt file string.
pub fn get_line_text(&self) -> &String { pub fn get_line_text(&self) -> &String {
return &self.line; &self.line
} }
/// Returns the reason of warning. /// Returns the reason of warning.
pub fn get_reason(&self) -> &WarningReason { pub fn get_reason(&self) -> &WarningReason {
return &self.reason; &self.reason
} }
pub (crate) fn invalid_directive_format(line: &Line) -> ParseWarning { pub(crate) fn invalid_directive_format(line: &Line) -> ParseWarning {
return ParseWarning { ParseWarning {
line_no: line.get_line_number(), line_no: line.get_line_number(),
line: line.get_line_text().into(), line: line.get_line_text().into(),
reason: WarningReason::InvalidDirectiveFormat, reason: WarningReason::InvalidDirectiveFormat,
} }
} }
pub (crate) fn directive_key_is_empty(line: &Line) -> ParseWarning { pub(crate) fn directive_key_is_empty(line: &Line) -> ParseWarning {
return ParseWarning { ParseWarning {
line_no: line.get_line_number(), line_no: line.get_line_number(),
line: line.get_line_text().into(), line: line.get_line_text().into(),
reason: WarningReason::DirectiveKeyIsEmpty, reason: WarningReason::DirectiveKeyIsEmpty,
} }
} }
pub (crate) fn unsupported_directive_key(line: &Line, key: String) -> ParseWarning { pub(crate) fn unsupported_directive_key(line: &Line, key: String) -> ParseWarning {
return ParseWarning { ParseWarning {
line_no: line.get_line_number(), line_no: line.get_line_number(),
line: line.get_line_text().into(), line: line.get_line_text().into(),
reason: WarningReason::UnsupportedDirectiveKey(key), reason: WarningReason::UnsupportedDirectiveKey(key),
} }
} }
pub (crate) fn user_agent_cannot_be_empty(line: &Line) -> ParseWarning { pub(crate) fn user_agent_cannot_be_empty(line: &Line) -> ParseWarning {
return ParseWarning { ParseWarning {
line_no: line.get_line_number(), line_no: line.get_line_number(),
line: line.get_line_text().into(), line: line.get_line_text().into(),
reason: WarningReason::UserAgentCannotBeEmpty, reason: WarningReason::UserAgentCannotBeEmpty,
} }
} }
pub (crate) fn wrong_path_format(line: &Line) -> ParseWarning { pub(crate) fn wrong_path_format(line: &Line) -> ParseWarning {
return ParseWarning { ParseWarning {
line_no: line.get_line_number(), line_no: line.get_line_number(),
line: line.get_line_text().into(), line: line.get_line_text().into(),
reason: WarningReason::WrongPathFormat, reason: WarningReason::WrongPathFormat,
} }
} }
pub (crate) fn directive_without_user_agent(line: &Line) -> ParseWarning { pub(crate) fn directive_without_user_agent(line: &Line) -> ParseWarning {
return ParseWarning { ParseWarning {
line_no: line.get_line_number(), line_no: line.get_line_number(),
line: line.get_line_text().into(), line: line.get_line_text().into(),
reason: WarningReason::DirectiveWithoutUserAgent, reason: WarningReason::DirectiveWithoutUserAgent,
} }
} }
pub (crate) fn parse_crawl_delay_error(line: &Line, error: ParseFloatError) -> ParseWarning { pub(crate) fn parse_crawl_delay_error(line: &Line, error: ParseFloatError) -> ParseWarning {
return ParseWarning { ParseWarning {
line_no: line.get_line_number(), line_no: line.get_line_number(),
line: line.get_line_text().into(), line: line.get_line_text().into(),
reason: WarningReason::ParseCrawlDelayError(error), reason: WarningReason::ParseCrawlDelayError(error),
} }
} }
pub (crate) fn wrong_request_rate_format(line: &Line) -> ParseWarning { pub(crate) fn wrong_request_rate_format(line: &Line) -> ParseWarning {
return ParseWarning { ParseWarning {
line_no: line.get_line_number(), line_no: line.get_line_number(),
line: line.get_line_text().into(), line: line.get_line_text().into(),
reason: WarningReason::WrongRequestRateFormat, reason: WarningReason::WrongRequestRateFormat,
} }
} }
pub (crate) fn parse_request_rate(line: &Line, error: ParseIntError) -> ParseWarning { pub(crate) fn parse_request_rate(line: &Line, error: ParseIntError) -> ParseWarning {
return ParseWarning { ParseWarning {
line_no: line.get_line_number(), line_no: line.get_line_number(),
line: line.get_line_text().into(), line: line.get_line_text().into(),
reason: WarningReason::ParseRequestRate(error), reason: WarningReason::ParseRequestRate(error),
} }
} }
pub (crate) fn parse_url(line: &Line, error: ParseUrlError) -> ParseWarning { pub(crate) fn parse_url(line: &Line, error: ParseUrlError) -> ParseWarning {
return ParseWarning { ParseWarning {
line_no: line.get_line_number(), line_no: line.get_line_number(),
line: line.get_line_text().into(), line: line.get_line_text().into(),
reason: WarningReason::ParseUrl(error), reason: WarningReason::ParseUrl(error),
} }
} }
pub (crate) fn wrong_clean_param_format(line: &Line) -> ParseWarning { pub(crate) fn wrong_clean_param_format(line: &Line) -> ParseWarning {
return ParseWarning { ParseWarning {
line_no: line.get_line_number(), line_no: line.get_line_number(),
line: line.get_line_text().into(), line: line.get_line_text().into(),
reason: WarningReason::WrongCleanParamFormat, reason: WarningReason::WrongCleanParamFormat,
} }
} }
pub (crate) fn ignored_clean_params(line: &Line, ignored_clean_params: Vec<String>) -> ParseWarning { pub(crate) fn ignored_clean_params(line: &Line, ignored_clean_params: Vec<String>) -> ParseWarning {
return ParseWarning { ParseWarning {
line_no: line.get_line_number(), line_no: line.get_line_number(),
line: line.get_line_text().into(), line: line.get_line_text().into(),
reason: WarningReason::IgnoredCleanParams(ignored_clean_params), reason: WarningReason::IgnoredCleanParams(ignored_clean_params),

View file

@ -1,6 +1,6 @@
use url::ParseError as ParseUrlError;
use std::num::{ParseFloatError, ParseIntError};
use std::fmt; use std::fmt;
use std::num::{ParseFloatError, ParseIntError};
use url::ParseError as ParseUrlError;
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
/// Warning reason of robots.txt parser about problems when parsing robots.txt file. /// Warning reason of robots.txt parser about problems when parsing robots.txt file.
@ -38,41 +38,41 @@ pub enum WarningReason {
/// Displays text of warning reason. /// Displays text of warning reason.
impl fmt::Display for WarningReason { impl fmt::Display for WarningReason {
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
match &self { match self {
&Self::InvalidDirectiveFormat => { Self::InvalidDirectiveFormat => {
write!(f, "Invalid directive format.") write!(f, "Invalid directive format.")
}, },
&Self::DirectiveKeyIsEmpty => { Self::DirectiveKeyIsEmpty => {
write!(f, "Directive key is empty.") write!(f, "Directive key is empty.")
}, },
&Self::UnsupportedDirectiveKey(key) => { Self::UnsupportedDirectiveKey(key) => {
write!(f, "Directive key `{}` is not suppored by this parser.", key) write!(f, "Directive key `{}` is not suppored by this parser.", key)
}, },
&Self::UserAgentCannotBeEmpty => { Self::UserAgentCannotBeEmpty => {
write!(f, "Passed directive key is `User-Agent` and passed value is empty.") write!(f, "Passed directive key is `User-Agent` and passed value is empty.")
}, },
&Self::DirectiveWithoutUserAgent => { Self::DirectiveWithoutUserAgent => {
write!(f, "It is impossible to process this directive before `User-Agent` directive has not been processed.") write!(f, "It is impossible to process this directive before `User-Agent` directive has not been processed.")
}, },
&Self::ParseCrawlDelayError(err) => { Self::ParseCrawlDelayError(err) => {
write!(f, "It is impossible to process the `Crawl-Delay` directive because of an error when parsing a floating point number: {}", err) write!(f, "It is impossible to process the `Crawl-Delay` directive because of an error when parsing a floating point number: {}", err)
}, },
&Self::WrongRequestRateFormat => { Self::WrongRequestRateFormat => {
write!(f, "Incorrect format of the `Request-Rate` directive") write!(f, "Incorrect format of the `Request-Rate` directive")
}, },
&Self::ParseRequestRate(err) => { Self::ParseRequestRate(err) => {
write!(f, "Incorrect format of the `Request-Rate` directive: {}", err) write!(f, "Incorrect format of the `Request-Rate` directive: {}", err)
}, },
&Self::ParseUrl(err) => { Self::ParseUrl(err) => {
write!(f, "Parsing URL error: {}", err) write!(f, "Parsing URL error: {}", err)
}, },
&Self::WrongCleanParamFormat => { Self::WrongCleanParamFormat => {
write!(f, "Incorrect format of the `Clean-Param` directive.") write!(f, "Incorrect format of the `Clean-Param` directive.")
}, },
&Self::IgnoredCleanParams(ref params) => { Self::IgnoredCleanParams(ref params) => {
write!(f, "Directive `Clean-Param` directive has incorrect parameters: {:?}", params) write!(f, "Directive `Clean-Param` directive has incorrect parameters: {:?}", params)
}, },
&Self::WrongPathFormat => { Self::WrongPathFormat => {
write!(f, "Error in URL path format.") write!(f, "Error in URL path format.")
}, },
} }

View file

@ -1,8 +1,8 @@
mod robots_txt;
mod fetched_robots_txt; mod fetched_robots_txt;
use url::Url; mod robots_txt;
use std::time::Duration;
use crate::model::RequestRate; use crate::model::RequestRate;
use std::time::Duration;
use url::Url;
/// Trait that implements robots txt service. /// Trait that implements robots txt service.
pub trait RobotsTxtService { pub trait RobotsTxtService {

View file

@ -1,51 +1,49 @@
use url::Url;
use std::time::Duration;
use crate::service::RobotsTxtService;
use crate::model::{FetchedRobotsTxt, FetchedRobotsTxtContainer};
use crate::model::RequestRate; use crate::model::RequestRate;
use crate::model::{FetchedRobotsTxt, FetchedRobotsTxtContainer};
use crate::service::RobotsTxtService;
use std::time::Duration;
use url::Url;
impl RobotsTxtService for FetchedRobotsTxt { impl RobotsTxtService for FetchedRobotsTxt {
fn can_fetch(&self, user_agent: &str, url: &Url) -> bool { fn can_fetch(&self, user_agent: &str, url: &Url) -> bool {
match self.get_container() { match *self.get_container() {
&FetchedRobotsTxtContainer::FetchDenied => false, FetchedRobotsTxtContainer::FetchDenied => false,
&FetchedRobotsTxtContainer::FetchFailed => true, FetchedRobotsTxtContainer::FetchFailed => true,
&FetchedRobotsTxtContainer::Fetched(ref robots_txt) => { FetchedRobotsTxtContainer::Fetched(ref robots_txt) => robots_txt.can_fetch(user_agent, url),
robots_txt.can_fetch(user_agent, url)
}
} }
} }
fn get_crawl_delay(&self, user_agent: &str) -> Option<Duration> { fn get_crawl_delay(&self, user_agent: &str) -> Option<Duration> {
if let &FetchedRobotsTxtContainer::Fetched(ref robots_txt) = self.get_container() { if let FetchedRobotsTxtContainer::Fetched(ref robots_txt) = *self.get_container() {
return robots_txt.get_crawl_delay(user_agent); return robots_txt.get_crawl_delay(user_agent);
} }
return None; None
} }
fn normalize_url(&self, url: &mut Url) -> bool { fn normalize_url(&self, url: &mut Url) -> bool {
if let &FetchedRobotsTxtContainer::Fetched(ref robots_txt) = self.get_container() { if let FetchedRobotsTxtContainer::Fetched(ref robots_txt) = *self.get_container() {
return robots_txt.normalize_url(url); return robots_txt.normalize_url(url);
} }
return true; true
} }
fn normalize_url_ignore_origin(&self, url: &mut Url) { fn normalize_url_ignore_origin(&self, url: &mut Url) {
if let &FetchedRobotsTxtContainer::Fetched(ref robots_txt) = self.get_container() { if let FetchedRobotsTxtContainer::Fetched(ref robots_txt) = *self.get_container() {
robots_txt.normalize_url_ignore_origin(url); robots_txt.normalize_url_ignore_origin(url);
} }
} }
fn get_sitemaps(&self) -> &[Url] { fn get_sitemaps(&self) -> &[Url] {
if let &FetchedRobotsTxtContainer::Fetched(ref robots_txt) = self.get_container() { if let FetchedRobotsTxtContainer::Fetched(ref robots_txt) = *self.get_container() {
return robots_txt.get_sitemaps(); return robots_txt.get_sitemaps();
} }
return &[]; &[]
} }
fn get_req_rate(&self, user_agent: &str) -> Option<RequestRate> { fn get_req_rate(&self, user_agent: &str) -> Option<RequestRate> {
if let &FetchedRobotsTxtContainer::Fetched(ref robots_txt) = self.get_container() { if let FetchedRobotsTxtContainer::Fetched(ref robots_txt) = *self.get_container() {
return robots_txt.get_req_rate(user_agent); return robots_txt.get_req_rate(user_agent);
} }
return None; None
} }
} }

View file

@ -1,9 +1,9 @@
use url::Url;
use std::time::Duration;
use crate::service::RobotsTxtService;
use crate::model::RobotsTxt;
use crate::model::RequestRate;
use crate::model::Path; use crate::model::Path;
use crate::model::RequestRate;
use crate::model::RobotsTxt;
use crate::service::RobotsTxtService;
use std::time::Duration;
use url::Url;
impl RobotsTxtService for RobotsTxt { impl RobotsTxtService for RobotsTxt {
fn can_fetch(&self, user_agent: &str, url: &Url) -> bool { fn can_fetch(&self, user_agent: &str, url: &Url) -> bool {
@ -18,19 +18,17 @@ impl RobotsTxtService for RobotsTxt {
return Some(rule.get_allowance()); return Some(rule.get_allowance());
} }
} }
return None; None
}); });
if let Some(rule_decision) = rule_decision { if let Some(rule_decision) = rule_decision {
return rule_decision; return rule_decision;
} }
// Empty robots.txt allows crawling. Everything that was not denied must be allowed. // Empty robots.txt allows crawling. Everything that was not denied must be allowed.
return true; true
} }
fn get_crawl_delay(&self, user_agent: &str) -> Option<Duration> { fn get_crawl_delay(&self, user_agent: &str) -> Option<Duration> {
return self.find_in_group(user_agent, |group| { self.find_in_group(user_agent, |group| group.get_crawl_delay())
return group.get_crawl_delay();
});
} }
fn normalize_url(&self, url: &mut Url) -> bool { fn normalize_url(&self, url: &mut Url) -> bool {
@ -38,7 +36,7 @@ impl RobotsTxtService for RobotsTxt {
return false; return false;
} }
self.normalize_url_ignore_origin(url); self.normalize_url_ignore_origin(url);
return true; true
} }
fn normalize_url_ignore_origin(&self, url: &mut Url) { fn normalize_url_ignore_origin(&self, url: &mut Url) {
@ -54,9 +52,7 @@ impl RobotsTxtService for RobotsTxt {
} }
let mut pairs: Vec<(String, String)> = url let mut pairs: Vec<(String, String)> = url
.query_pairs() .query_pairs()
.map(|(key, value)|{ .map(|(key, value)| (key.into(), value.into()))
return (key.into(), value.into());
})
.collect(); .collect();
{ {
let mut query_pairs_mut = url.query_pairs_mut(); let mut query_pairs_mut = url.query_pairs_mut();
@ -73,12 +69,10 @@ impl RobotsTxtService for RobotsTxt {
} }
fn get_sitemaps(&self) -> &[Url] { fn get_sitemaps(&self) -> &[Url] {
return self.get_sitemaps_slice(); self.get_sitemaps_slice()
} }
fn get_req_rate(&self, user_agent: &str) -> Option<RequestRate> { fn get_req_rate(&self, user_agent: &str) -> Option<RequestRate> {
return self.find_in_group(user_agent, |group| { self.find_in_group(user_agent, |group| group.get_req_rate())
return group.get_req_rate();
});
} }
} }

View file

@ -3,7 +3,7 @@ use robotparser::service::RobotsTxtService;
use std::time::Duration; use std::time::Duration;
use url::Url; use url::Url;
const AGENT: &'static str = "test_robotparser"; const AGENT: &str = "test_robotparser";
fn robot_test(doc: &str, good_urls: Vec<&str>, bad_urls: Vec<&str>, agent: &str) { fn robot_test(doc: &str, good_urls: Vec<&str>, bad_urls: Vec<&str>, agent: &str) {
let url = Url::parse("https://www.baidu.com/robots.txt").unwrap(); let url = Url::parse("https://www.baidu.com/robots.txt").unwrap();
@ -20,7 +20,6 @@ fn robot_test(doc: &str, good_urls: Vec<&str>, bad_urls: Vec<&str>, agent: &str)
} }
} }
fn robot_test_simple(doc: &str, good_urls: Vec<&str>, bad_urls: Vec<&str>) { fn robot_test_simple(doc: &str, good_urls: Vec<&str>, bad_urls: Vec<&str>) {
robot_test(doc, good_urls, bad_urls, AGENT); robot_test(doc, good_urls, bad_urls, AGENT);
} }
@ -33,12 +32,11 @@ fn test_robots_txt_rn_bom() {
Disallow: /tmp/ # these will soon disappear\r\n\ Disallow: /tmp/ # these will soon disappear\r\n\
Disallow: /foo.html\r\n\ Disallow: /foo.html\r\n\
"; ";
let good = vec!["/","/test.html"]; let good = vec!["/", "/test.html"];
let bad = vec!["/cyberworld/map/index.html","/tmp/xxx","/foo.html"]; let bad = vec!["/cyberworld/map/index.html", "/tmp/xxx", "/foo.html"];
robot_test_simple(doc, good, bad); robot_test_simple(doc, good, bad);
} }
#[test] #[test]
fn test_robots_txt_1() { fn test_robots_txt_1() {
let doc = "\n\ let doc = "\n\
@ -47,12 +45,11 @@ fn test_robots_txt_1() {
Disallow: /tmp/ # these will soon disappear\n\ Disallow: /tmp/ # these will soon disappear\n\
Disallow: /foo.html\n\ Disallow: /foo.html\n\
"; ";
let good = vec!["/","/test.html"]; let good = vec!["/", "/test.html"];
let bad = vec!["/cyberworld/map/index.html","/tmp/xxx","/foo.html"]; let bad = vec!["/cyberworld/map/index.html", "/tmp/xxx", "/foo.html"];
robot_test_simple(doc, good, bad); robot_test_simple(doc, good, bad);
} }
#[test] #[test]
fn test_robots_txt_2() { fn test_robots_txt_2() {
let doc = "\n\ let doc = "\n\
@ -66,7 +63,7 @@ fn test_robots_txt_2() {
Disallow:\n\ Disallow:\n\
\n\ \n\
"; ";
let good = vec!["/","/test.html"]; let good = vec!["/", "/test.html"];
let bad = vec!["/cyberworld/map/index.html"]; let bad = vec!["/cyberworld/map/index.html"];
robot_test_simple(doc, good, bad); robot_test_simple(doc, good, bad);
@ -82,7 +79,7 @@ fn test_robots_txt_3() {
Disallow: /\n\ Disallow: /\n\
"; ";
let good = vec![]; let good = vec![];
let bad = vec!["/cyberworld/map/index.html","/","/tmp/"]; let bad = vec!["/cyberworld/map/index.html", "/", "/tmp/"];
robot_test_simple(doc, good, bad); robot_test_simple(doc, good, bad);
} }
@ -97,8 +94,13 @@ fn test_robots_txt_4() {
"; ";
let good = vec![]; let good = vec![];
let bad = vec![ let bad = vec![
"/tmp", "/tmp.html", "/tmp/a.html", "/a%3cd.html", "/a%3Cd.html", "/tmp",
"/a%2fb.html", "/~joe/index.html", "/tmp.html",
"/tmp/a.html",
"/a%3cd.html",
"/a%3Cd.html",
"/a%2fb.html",
"/~joe/index.html",
]; ];
robot_test(doc, good.clone(), bad.clone(), "figtree"); robot_test(doc, good.clone(), bad.clone(), "figtree");
robot_test(doc, good, bad, "FigTree Robot libwww-perl/5.04"); robot_test(doc, good, bad, "FigTree Robot libwww-perl/5.04");
@ -115,8 +117,12 @@ fn test_robots_txt_5() {
"; ";
let good = vec!["/tmp"]; let good = vec!["/tmp"];
let bad = vec![ let bad = vec![
"/tmp/", "/tmp/a.html", "/a%3cd.html", "/a%3Cd.html", "/tmp/",
"/a/b.html", "/%7Ejoe/index.html", "/tmp/a.html",
"/a%3cd.html",
"/a%3Cd.html",
"/a/b.html",
"/%7Ejoe/index.html",
]; ];
robot_test_simple(doc, good, bad); robot_test_simple(doc, good, bad);
} }
@ -246,8 +252,8 @@ fn test_robots_txt_14() {
#[cfg(feature = "http")] #[cfg(feature = "http")]
#[test] #[test]
fn test_robots_txt_read() { fn test_robots_txt_read() {
use robotparser::http::{CreateRobotsTxtRequest, ParseRobotsTxtResponse};
use reqwest::{Client, Request}; use reqwest::{Client, Request};
use robotparser::http::{CreateRobotsTxtRequest, ParseRobotsTxtResponse};
let http_client = Client::new(); let http_client = Client::new();
let url = Url::parse("https://www.python.org/robots.txt").unwrap(); let url = Url::parse("https://www.python.org/robots.txt").unwrap();
let request = Request::create_robots_txt_request(url.origin()); let request = Request::create_robots_txt_request(url.origin());
@ -263,7 +269,10 @@ fn test_robots_text_crawl_delay() {
Crawl-delay: 2.35\n\ Crawl-delay: 2.35\n\
Disallow: /search/\n"; Disallow: /search/\n";
let parser = parse_robots_txt(robots_txt_url.origin(), doc).get_result(); let parser = parse_robots_txt(robots_txt_url.origin(), doc).get_result();
assert_eq!(Duration::new(2,350 * 1000 * 1000), parser.get_crawl_delay("Yandex").unwrap()); assert_eq!(
Duration::new(2, 350 * 1000 * 1000),
parser.get_crawl_delay("Yandex").unwrap()
);
} }
#[test] #[test]
@ -288,8 +297,7 @@ fn test_robots_text_sitemaps() {
#[test] #[test]
fn test_robots_text_request_rate() { fn test_robots_text_request_rate() {
let robots_txt_url = Url::parse("https://www.python.org/robots.txt").unwrap(); let robots_txt_url = Url::parse("https://www.python.org/robots.txt").unwrap();
let doc = let doc = "User-agent: Yandex\n\
"User-agent: Yandex\n\
Request-rate: 3/15\n\ Request-rate: 3/15\n\
Disallow: /search/\n"; Disallow: /search/\n";
let parser = parse_robots_txt(robots_txt_url.origin(), doc).get_result(); let parser = parse_robots_txt(robots_txt_url.origin(), doc).get_result();
@ -301,7 +309,6 @@ fn test_robots_text_request_rate() {
assert!(req_rate.is_none()); assert!(req_rate.is_none());
} }
#[test] #[test]
fn test_robots_text_clean_params() { fn test_robots_text_clean_params() {
let doc = "\ let doc = "\
@ -315,13 +322,18 @@ Clean-param: amp\n\
"; ";
let url = Url::parse("https://www.baidu.com/robots.txt").unwrap(); let url = Url::parse("https://www.baidu.com/robots.txt").unwrap();
let parser = parse_robots_txt(url.origin(), doc).get_result(); let parser = parse_robots_txt(url.origin(), doc).get_result();
let mut site_url = Url::parse("https://www.baidu.com/test?post_id=7777&mode=99&from=google&pid=99&gid=88&tm=777&amp=1").unwrap(); let mut site_url =
Url::parse("https://www.baidu.com/test?post_id=7777&mode=99&from=google&pid=99&gid=88&tm=777&amp=1").unwrap();
let was_updated = parser.normalize_url(&mut site_url); let was_updated = parser.normalize_url(&mut site_url);
assert_eq!(was_updated, true); assert_eq!(was_updated, true);
assert_eq!(site_url.as_str(), "https://www.baidu.com/test?post_id=7777"); assert_eq!(site_url.as_str(), "https://www.baidu.com/test?post_id=7777");
let mut site_url = Url::parse("https://www.google.com/test?post_id=7777&mode=99&from=google&pid=99&gid=88&tm=777&amp=1").unwrap(); let mut site_url =
Url::parse("https://www.google.com/test?post_id=7777&mode=99&from=google&pid=99&gid=88&tm=777&amp=1").unwrap();
let was_updated = parser.normalize_url(&mut site_url); let was_updated = parser.normalize_url(&mut site_url);
assert_eq!(was_updated, false); assert_eq!(was_updated, false);
assert_eq!(site_url.as_str(), "https://www.google.com/test?post_id=7777&mode=99&from=google&pid=99&gid=88&tm=777&amp=1"); assert_eq!(
site_url.as_str(),
"https://www.google.com/test?post_id=7777&mode=99&from=google&pid=99&gid=88&tm=777&amp=1"
);
} }

View file

@ -1,8 +1,8 @@
use reqwest::Client;
use robotparser::http::RobotsTxtClient; use robotparser::http::RobotsTxtClient;
use robotparser::service::RobotsTxtService; use robotparser::service::RobotsTxtService;
use reqwest::Client;
use url::Url;
use tokio::runtime::Runtime; use tokio::runtime::Runtime;
use url::Url;
use url::{Host, Origin}; use url::{Host, Origin};
#[test] #[test]
@ -23,8 +23,7 @@ fn test_reqwest_blocking_panic_url() {
let client = Client::new(); let client = Client::new();
let host = Host::Domain("python.org::".into()); let host = Host::Domain("python.org::".into());
let origin = Origin::Tuple("https".into(), host, 80); let origin = Origin::Tuple("https".into(), host, 80);
match client.fetch_robots_txt(origin) { if client.fetch_robots_txt(origin).is_ok() {
Ok(_) => assert!(false), panic!()
Err(_) => assert!(true)
} }
} }

View file

@ -1,6 +1,6 @@
use reqwest::blocking::Client;
use robotparser::http::RobotsTxtClient; use robotparser::http::RobotsTxtClient;
use robotparser::service::RobotsTxtService; use robotparser::service::RobotsTxtService;
use reqwest::blocking::Client;
use url::Url; use url::Url;
use url::{Host, Origin}; use url::{Host, Origin};
@ -20,8 +20,7 @@ fn test_reqwest_blocking_panic_url() {
let client = Client::new(); let client = Client::new();
let host = Host::Domain("python.org::".into()); let host = Host::Domain("python.org::".into());
let origin = Origin::Tuple("https".into(), host, 80); let origin = Origin::Tuple("https".into(), host, 80);
match client.fetch_robots_txt(origin) { if client.fetch_robots_txt(origin).is_ok() {
Ok(_) => assert!(false), panic!()
Err(_) => assert!(true)
} }
} }

View file

@ -1,6 +1,6 @@
use robotparser::parser::{parse_robots_txt, WarningReason}; use robotparser::parser::{parse_robots_txt, WarningReason};
use url::{Host, Origin};
use std::convert::From; use std::convert::From;
use url::{Host, Origin};
#[derive(PartialEq, Eq, Debug, Clone)] #[derive(PartialEq, Eq, Debug, Clone)]
enum WarningReasonKind { enum WarningReasonKind {
@ -51,7 +51,6 @@ fn test_warning_supported_directive_key() {
validate_warnings(input, &[WarningReasonKind::UnsupportedDirectiveKey]); validate_warnings(input, &[WarningReasonKind::UnsupportedDirectiveKey]);
} }
#[test] #[test]
fn test_warning_user_agent_cannot_be_empty() { fn test_warning_user_agent_cannot_be_empty() {
let input = "User-Agent:"; let input = "User-Agent:";
@ -136,43 +135,19 @@ fn test_warning_wrong_path_format() {
impl From<&WarningReason> for WarningReasonKind { impl From<&WarningReason> for WarningReasonKind {
fn from(reason: &WarningReason) -> Self { fn from(reason: &WarningReason) -> Self {
match reason { match *reason {
&WarningReason::InvalidDirectiveFormat => { WarningReason::InvalidDirectiveFormat => WarningReasonKind::InvalidDirectiveFormat,
return WarningReasonKind::InvalidDirectiveFormat; WarningReason::DirectiveKeyIsEmpty => WarningReasonKind::DirectiveKeyIsEmpty,
}, WarningReason::UnsupportedDirectiveKey { .. } => WarningReasonKind::UnsupportedDirectiveKey,
&WarningReason::DirectiveKeyIsEmpty => { WarningReason::UserAgentCannotBeEmpty => WarningReasonKind::UserAgentCannotBeEmpty,
return WarningReasonKind::DirectiveKeyIsEmpty; WarningReason::DirectiveWithoutUserAgent => WarningReasonKind::DirectiveWithoutUserAgent,
}, WarningReason::ParseCrawlDelayError { .. } => WarningReasonKind::ParseCrawlDelayError,
&WarningReason::UnsupportedDirectiveKey {..} => { WarningReason::WrongRequestRateFormat => WarningReasonKind::WrongRequestRateFormat,
return WarningReasonKind::UnsupportedDirectiveKey; WarningReason::ParseRequestRate { .. } => WarningReasonKind::ParseRequestRate,
}, WarningReason::ParseUrl { .. } => WarningReasonKind::ParseUrl,
&WarningReason::UserAgentCannotBeEmpty => { WarningReason::WrongCleanParamFormat => WarningReasonKind::WrongCleanParamFormat,
return WarningReasonKind::UserAgentCannotBeEmpty; WarningReason::IgnoredCleanParams { .. } => WarningReasonKind::IgnoredCleanParams,
}, WarningReason::WrongPathFormat => WarningReasonKind::WrongPathFormat,
&WarningReason::DirectiveWithoutUserAgent => {
return WarningReasonKind::DirectiveWithoutUserAgent;
},
&WarningReason::ParseCrawlDelayError {..} => {
return WarningReasonKind::ParseCrawlDelayError;
},
&WarningReason::WrongRequestRateFormat => {
return WarningReasonKind::WrongRequestRateFormat;
},
&WarningReason::ParseRequestRate {..} => {
return WarningReasonKind::ParseRequestRate;
},
&WarningReason::ParseUrl {..} => {
return WarningReasonKind::ParseUrl;
},
&WarningReason::WrongCleanParamFormat => {
return WarningReasonKind::WrongCleanParamFormat;
},
&WarningReason::IgnoredCleanParams {..} => {
return WarningReasonKind::IgnoredCleanParams;
},
&WarningReason::WrongPathFormat => {
return WarningReasonKind::WrongPathFormat;
},
} }
} }
} }