From 6ba403aab9294d038b4ae79e3fdfab52e14b6664 Mon Sep 17 00:00:00 2001 From: Laurent Arnoud Date: Mon, 30 Mar 2020 01:32:32 +0000 Subject: [PATCH] Initial github-actions (#25) * Initial github-actions most taken from starship project ref #21 * rustfmt config remove unknown configuration options * Run rustfmt * clippy: use any instead of find..is_some * clippy: Remove the `clone` call: `self.crawl_delay` * Clippy fixes * Rustfmt fixes * clippy: fix don't need to add `&` to all patterns * clippy: fix needless `fn main` in doctest * clippy: fix if-then-else expression returns a bool literal * clippy: fix very complex type BoxFuture response * clippy: fix variable `line_no` is used as a loop counter * clippy: dereference the expression on tests * clippy: fix assert(true) will be optimized out by the compiler * github: name workflow --- .github/workflows/workflow.yml | 127 ++++++++++++++++++ rustfmt.toml | 2 - src/http.rs | 2 +- src/http/reqwest.rs | 2 +- src/http/reqwest/async_reqwest.rs | 68 +++++----- src/http/reqwest/sync_reqwest.rs | 30 +++-- src/lib.rs | 16 +-- src/model.rs | 12 +- src/model/clean_params.rs | 11 +- src/model/fetched_robots_txt.rs | 12 +- src/model/group.rs | 46 +++---- src/model/path.rs | 12 +- src/model/path_pattern.rs | 62 ++++----- src/model/robots_txt.rs | 40 +++--- src/model/rule.rs | 16 +-- src/parser.rs | 22 ++- src/parser/fetched_robots_txt_parser.rs | 20 +-- src/parser/line.rs | 13 +- src/parser/parse_result.rs | 38 +++--- src/parser/robots_txt_parser.rs | 115 ++++++++-------- src/parser/robots_txt_parser/directive.rs | 13 +- src/parser/robots_txt_parser/group_builder.rs | 10 +- src/parser/warning.rs | 62 ++++----- src/parser/warning_reason.rs | 32 ++--- src/service.rs | 8 +- src/service/fetched_robots_txt.rs | 38 +++--- src/service/robots_txt.rs | 32 ++--- tests/test_lib.rs | 58 ++++---- tests/test_reqwest_async.rs | 9 +- tests/test_reqwest_blocking.rs | 7 +- tests/test_warnings.rs | 55 +++----- 31 files changed, 532 insertions(+), 458 
deletions(-) create mode 100644 .github/workflows/workflow.yml diff --git a/.github/workflows/workflow.yml b/.github/workflows/workflow.yml new file mode 100644 index 0000000..2387c43 --- /dev/null +++ b/.github/workflows/workflow.yml @@ -0,0 +1,127 @@ +--- +name: Main workflow +on: + push: + paths-ignore: + - "**.md" + + pull_request: + paths-ignore: + - "**.md" + +jobs: + # Run the `rustfmt` code formatter + rustfmt: + name: Rustfmt [Formatter] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + components: rustfmt + override: true + - run: rustup component add rustfmt + - uses: actions-rs/cargo@v1 + with: + command: fmt + args: --all -- --check + + # Run the `clippy` linting tool + clippy: + name: Clippy [Linter] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + components: clippy + override: true + - uses: actions-rs/clippy-check@v1 + with: + token: ${{ secrets.GITHUB_TOKEN }} + args: --all-targets --all-features -- -D clippy::all + + # Run a security audit on dependencies + cargo_audit: + name: Cargo Audit [Security] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions-rs/toolchain@v1 + with: + toolchain: stable + override: true + - run: cargo install --force cargo-audit + - run: cargo generate-lockfile + - uses: actions-rs/cargo@v1 + with: + command: audit + + # Ensure that the project could be successfully compiled + cargo_check: + name: Compile + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + + - uses: actions-rs/cargo@v1 + with: + command: check + args: --all + + # Run tests on Linux, macOS, and Windows + # On both Rust stable and Rust nightly + test: + name: Test Suite + needs: [cargo_check] + runs-on: ${{ matrix.os }} + 
strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, macOS-latest, windows-latest] + rust: [stable, nightly] + + steps: + # Checkout the branch being tested + - uses: actions/checkout@v2 + + # Cache files between builds + - name: Cache cargo registry + uses: actions/cache@v1 + with: + path: ~/.cargo/registry + key: ${{ runner.os }}-cargo-registry-${{ hashFiles('**/Cargo.lock') }} + + - name: Cache cargo index + uses: actions/cache@v1 + with: + path: ~/.cargo/git + key: ${{ runner.os }}-cargo-index-${{ hashFiles('**/Cargo.lock') }} + + - name: Cache cargo build + uses: actions/cache@v1 + with: + path: target + key: ${{ runner.os }}-cargo-build-target-${{ hashFiles('**/Cargo.lock') }} + + # Install all the required dependencies for testing + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: ${{ matrix.rust }} + override: true + + - name: Run all tests + uses: actions-rs/cargo@v1 + with: + command: test diff --git a/rustfmt.toml b/rustfmt.toml index 04c6a82..7530651 100644 --- a/rustfmt.toml +++ b/rustfmt.toml @@ -1,3 +1 @@ max_width = 120 -ideal_width = 100 -write_mode = "Overwrite" diff --git a/src/http.rs b/src/http.rs index d50aab1..e186647 100644 --- a/src/http.rs +++ b/src/http.rs @@ -16,4 +16,4 @@ pub const DEFAULT_USER_AGENT: &str = "robotparser-rs (https://crates.io/crates/r pub trait RobotsTxtClient { type Result; fn fetch_robots_txt(&self, origin: Origin) -> Self::Result; -} \ No newline at end of file +} diff --git a/src/http/reqwest.rs b/src/http/reqwest.rs index 6c6dfff..02b1377 100644 --- a/src/http/reqwest.rs +++ b/src/http/reqwest.rs @@ -1,4 +1,4 @@ mod sync_reqwest; pub use self::sync_reqwest::*; mod async_reqwest; -pub use self::async_reqwest::*; \ No newline at end of file +pub use self::async_reqwest::*; diff --git a/src/http/reqwest/async_reqwest.rs b/src/http/reqwest/async_reqwest.rs index 288d119..5c01fdc 100644 --- a/src/http/reqwest/async_reqwest.rs +++ b/src/http/reqwest/async_reqwest.rs @@ -1,41 +1,43 @@ -use 
reqwest::{Client, Request}; -use reqwest::Method; -use reqwest::Error as ReqwestError; -use reqwest::header::HeaderValue; -use url::{Origin, Url}; -use reqwest::header::USER_AGENT; use crate::http::{RobotsTxtClient, DEFAULT_USER_AGENT}; -use crate::parser::{ParseResult, parse_fetched_robots_txt}; use crate::model::FetchedRobotsTxt; use crate::model::{Error, ErrorKind}; -use std::pin::Pin; +use crate::parser::{parse_fetched_robots_txt, ParseResult}; +use futures::future::ok as future_ok; +use futures::future::TryFutureExt; use futures::task::{Context, Poll}; use futures::Future; -use futures::future::TryFutureExt; -use futures::future::ok as future_ok; +use reqwest::header::HeaderValue; +use reqwest::header::USER_AGENT; +use reqwest::Error as ReqwestError; +use reqwest::Method; +use reqwest::{Client, Request}; +use std::pin::Pin; +use url::{Origin, Url}; -type FetchFuture = Box>>; +type FetchFuture = Box>>; +type BoxFuture = Pin; impl RobotsTxtClient for Client { type Result = Result; fn fetch_robots_txt(&self, origin: Origin) -> Self::Result { let url = format!("{}/robots.txt", origin.unicode_serialization()); - let url = Url::parse(&url).map_err(|err| Error {kind: ErrorKind::Url(err)})?; + let url = Url::parse(&url).map_err(|err| Error { + kind: ErrorKind::Url(err), + })?; let mut request = Request::new(Method::GET, url); - let _ = request.headers_mut().insert(USER_AGENT, HeaderValue::from_static(DEFAULT_USER_AGENT)); - let response = self - .execute(request) - .and_then(|response| { - let response_info = ResponseInfo {status_code: response.status().as_u16()}; - return response.text().and_then(|response_text| { - return future_ok((response_info, response_text)); - }); - }); - let response: Pin>>> = Box::pin(response); - Ok(RobotsTxtResponse { - origin, - response, - }) + let _ = request + .headers_mut() + .insert(USER_AGENT, HeaderValue::from_static(DEFAULT_USER_AGENT)); + let response = self.execute(request).and_then(|response| { + let response_info = 
ResponseInfo { + status_code: response.status().as_u16(), + }; + response + .text() + .and_then(|response_text| future_ok((response_info, response_text))) + }); + let response: BoxFuture = Box::pin(response); + Ok(RobotsTxtResponse { origin, response }) } } @@ -52,7 +54,7 @@ pub struct RobotsTxtResponse { impl RobotsTxtResponse { /// Returns origin of robots.txt pub fn get_origin(&self) -> &Origin { - return &self.origin; + &self.origin } } @@ -65,14 +67,10 @@ impl Future for RobotsTxtResponse { match response_pin.poll(cx) { Poll::Ready(Ok((response_info, text))) => { let robots_txt = parse_fetched_robots_txt(self_mut.origin.clone(), response_info.status_code, &text); - return Poll::Ready(Ok(robots_txt)); - }, - Poll::Ready(Err(error)) => { - return Poll::Ready(Err(error)); - }, - Poll::Pending => { - return Poll::Pending; - }, + Poll::Ready(Ok(robots_txt)) + } + Poll::Ready(Err(error)) => Poll::Ready(Err(error)), + Poll::Pending => Poll::Pending, } } } diff --git a/src/http/reqwest/sync_reqwest.rs b/src/http/reqwest/sync_reqwest.rs index 671cca4..7d58446 100644 --- a/src/http/reqwest/sync_reqwest.rs +++ b/src/http/reqwest/sync_reqwest.rs @@ -1,24 +1,32 @@ -use reqwest::blocking::{Client, Request}; -use reqwest::Method; -use reqwest::header::HeaderValue; -use url::{Origin, Url}; -use reqwest::header::USER_AGENT; use crate::http::{RobotsTxtClient, DEFAULT_USER_AGENT}; -use crate::parser::{ParseResult, parse_fetched_robots_txt}; use crate::model::FetchedRobotsTxt; use crate::model::{Error, ErrorKind}; +use crate::parser::{parse_fetched_robots_txt, ParseResult}; +use reqwest::blocking::{Client, Request}; +use reqwest::header::HeaderValue; +use reqwest::header::USER_AGENT; +use reqwest::Method; +use url::{Origin, Url}; impl RobotsTxtClient for Client { type Result = Result, Error>; fn fetch_robots_txt(&self, origin: Origin) -> Self::Result { let url = format!("{}/robots.txt", origin.unicode_serialization()); - let url = Url::parse(&url).map_err(|err| Error {kind: 
ErrorKind::Url(err)})?; + let url = Url::parse(&url).map_err(|err| Error { + kind: ErrorKind::Url(err), + })?; let mut request = Request::new(Method::GET, url); - let _ = request.headers_mut().insert(USER_AGENT, HeaderValue::from_static(DEFAULT_USER_AGENT)); - let response = self.execute(request).map_err(|err| Error {kind: ErrorKind::Http(err)})?; + let _ = request + .headers_mut() + .insert(USER_AGENT, HeaderValue::from_static(DEFAULT_USER_AGENT)); + let response = self.execute(request).map_err(|err| Error { + kind: ErrorKind::Http(err), + })?; let status_code = response.status().as_u16(); - let text = response.text().map_err(|err| Error {kind: ErrorKind::Http(err)})?; + let text = response.text().map_err(|err| Error { + kind: ErrorKind::Http(err), + })?; let robots_txt = parse_fetched_robots_txt(origin, status_code, &text); - return Ok(robots_txt); + Ok(robots_txt) } } diff --git a/src/lib.rs b/src/lib.rs index f5692e4..f255d45 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -21,20 +21,18 @@ //! use reqwest::blocking::Client; //! use url::Url; //! -//! fn main() { -//! let client = Client::new(); -//! let robots_txt_url = Url::parse("https://www.python.org/robots.txt").unwrap(); -//! let robots_txt = client.fetch_robots_txt(robots_txt_url.origin()).unwrap().get_result(); -//! let fetch_url = Url::parse("https://www.python.org/robots.txt").unwrap(); -//! assert!(robots_txt.can_fetch("*", &fetch_url)); -//! } +//! let client = Client::new(); +//! let robots_txt_url = Url::parse("https://www.python.org/robots.txt").unwrap(); +//! let robots_txt = client.fetch_robots_txt(robots_txt_url.origin()).unwrap().get_result(); +//! let fetch_url = Url::parse("https://www.python.org/robots.txt").unwrap(); +//! assert!(robots_txt.can_fetch("*", &fetch_url)); //! ``` +/// Request builder & response parsers for other http libraries. +pub mod http; /// Contains models of robots.txt file. pub mod model; /// Contains robots.txt parsers. pub mod parser; /// Contains robots.txt services. 
pub mod service; -/// Request builder & response parsers for other http libraries. -pub mod http; diff --git a/src/model.rs b/src/model.rs index d56bd4a..bd80ae6 100644 --- a/src/model.rs +++ b/src/model.rs @@ -1,19 +1,19 @@ mod path_pattern; -pub (crate) use self::path_pattern::PathPattern; +pub(crate) use self::path_pattern::PathPattern; mod group; -pub (crate) use self::group::Group; +pub(crate) use self::group::Group; mod rule; -pub (crate) use self::rule::Rule; +pub(crate) use self::rule::Rule; mod clean_params; -pub (crate) use self::clean_params::CleanParams; +pub(crate) use self::clean_params::CleanParams; mod request_rate; pub use self::request_rate::RequestRate; mod robots_txt; pub use self::fetched_robots_txt::FetchedRobotsTxt; -pub (crate) use self::fetched_robots_txt::FetchedRobotsTxtContainer; +pub(crate) use self::fetched_robots_txt::FetchedRobotsTxtContainer; mod fetched_robots_txt; pub use self::robots_txt::RobotsTxt; mod path; -pub (crate) use self::path::Path; +pub(crate) use self::path::Path; mod errors; pub use self::errors::{Error, ErrorKind}; diff --git a/src/model/clean_params.rs b/src/model/clean_params.rs index f64a4c1..c34de22 100644 --- a/src/model/clean_params.rs +++ b/src/model/clean_params.rs @@ -8,17 +8,14 @@ pub struct CleanParams { impl CleanParams { pub fn new(path_pattern: PathPattern, params: Vec) -> CleanParams { - return CleanParams { - path_pattern, - params, - } + CleanParams { path_pattern, params } } pub fn get_path_pattern(&self) -> &PathPattern { - return &self.path_pattern; + &self.path_pattern } pub fn get_params(&self) -> &Vec { - return &self.params; + &self.params } -} \ No newline at end of file +} diff --git a/src/model/fetched_robots_txt.rs b/src/model/fetched_robots_txt.rs index 6adc9e8..8eb8499 100644 --- a/src/model/fetched_robots_txt.rs +++ b/src/model/fetched_robots_txt.rs @@ -2,7 +2,7 @@ use crate::model::robots_txt::RobotsTxt; use std::time::SystemTime; #[derive(Debug, Clone)] -pub (crate) enum 
FetchedRobotsTxtContainer { +pub(crate) enum FetchedRobotsTxtContainer { FetchDenied, FetchFailed, Fetched(RobotsTxt), @@ -19,18 +19,18 @@ pub struct FetchedRobotsTxt { } impl FetchedRobotsTxt { - pub (crate) fn new(container: FetchedRobotsTxtContainer) -> FetchedRobotsTxt { + pub(crate) fn new(container: FetchedRobotsTxtContainer) -> FetchedRobotsTxt { FetchedRobotsTxt { fetched_at: SystemTime::now(), container, } } - pub (crate) fn get_container(&self) -> &FetchedRobotsTxtContainer { - return &self.container; + pub(crate) fn get_container(&self) -> &FetchedRobotsTxtContainer { + &self.container } /// Returns the system time when the robots.txt file was downloaded over the network. pub fn get_fetched_at(&self) -> &SystemTime { - return &self.fetched_at; + &self.fetched_at } -} \ No newline at end of file +} diff --git a/src/model/group.rs b/src/model/group.rs index d394f29..6d9b935 100644 --- a/src/model/group.rs +++ b/src/model/group.rs @@ -1,6 +1,6 @@ -use std::time::Duration; use crate::model::request_rate::RequestRate; use crate::model::rule::Rule; +use std::time::Duration; /// An group has one or more user-agents and zero or more rules #[derive(Debug, Clone)] @@ -12,7 +12,7 @@ pub struct Group { } impl Group { - pub (crate) fn new() -> Group { + pub(crate) fn new() -> Group { Group { user_agents: vec![], rules: vec![], @@ -22,8 +22,8 @@ impl Group { } /// check if this group applies to the specified agent - pub (crate) fn applies_to(&self, useragent: &str) -> bool { - let ua = useragent.split('/').nth(0).unwrap_or("").to_lowercase(); + pub(crate) fn applies_to(&self, useragent: &str) -> bool { + let ua = useragent.split('/').next().unwrap_or("").to_lowercase(); for agent in self.user_agents.iter() { if ua.contains(agent) { return true; @@ -32,60 +32,54 @@ impl Group { false } - pub (crate) fn push_useragent(&mut self, useragent: &str) { - self.user_agents.push(useragent.to_lowercase().to_owned()); + pub(crate) fn push_useragent(&mut self, useragent: &str) { + 
self.user_agents.push(useragent.to_lowercase()); } - pub (crate) fn push_rule(&mut self, rule: Rule) { + pub(crate) fn push_rule(&mut self, rule: Rule) { self.rules.push(rule); } - pub (crate) fn get_rules_sorted_by_path_len_desc(&self) -> Vec<&Rule> { + pub(crate) fn get_rules_sorted_by_path_len_desc(&self) -> Vec<&Rule> { let mut rules: Vec<&Rule> = self.rules.iter().collect(); rules.sort_by(|a, b| { let a = a.get_path_pattern().len(); let b = b.get_path_pattern().len(); - return b.cmp(&a); + b.cmp(&a) }); - return rules; + rules } - pub (crate) fn contains_user_agent(&self, user_agent: &str) -> bool { - return self - .user_agents - .iter() - .find(|item| { - return *item == user_agent; - }).is_some(); + pub(crate) fn contains_user_agent(&self, user_agent: &str) -> bool { + self.user_agents.iter().any(|item| *item == user_agent) } - pub (crate) fn set_crawl_delay(&mut self, delay: Duration) { + pub(crate) fn set_crawl_delay(&mut self, delay: Duration) { self.crawl_delay = Some(delay); } - pub (crate) fn get_crawl_delay(&self) -> Option { - return self.crawl_delay.clone(); + pub(crate) fn get_crawl_delay(&self) -> Option { + self.crawl_delay } - pub (crate) fn set_req_rate(&mut self, req_rate: RequestRate) { + pub(crate) fn set_req_rate(&mut self, req_rate: RequestRate) { self.req_rate = Some(req_rate); } - pub (crate) fn get_req_rate(&self) -> Option { - return self.req_rate.clone(); + pub(crate) fn get_req_rate(&self) -> Option { + self.req_rate.clone() } - pub (crate) fn is_default(&self) -> bool { + pub(crate) fn is_default(&self) -> bool { for user_agent in self.user_agents.iter() { if user_agent == "*" { return true; } } - return false; + false } } - impl Default for Group { fn default() -> Group { Group::new() diff --git a/src/model/path.rs b/src/model/path.rs index 6eb16bc..a742736 100644 --- a/src/model/path.rs +++ b/src/model/path.rs @@ -1,5 +1,5 @@ -use url::Url; use percent_encoding::percent_decode; +use url::Url; #[derive(Debug)] pub struct 
Path(String); @@ -9,13 +9,13 @@ impl Path { let path = get_url_without_origin(&url); let path = percent_decode(path.as_bytes()).decode_utf8_lossy(); if path.is_empty() { - return Path("/".into()); + Path("/".into()) } else { - return Path(path.into()); + Path(path.into()) } } pub fn as_str(&self) -> &str { - return &self.0; + &self.0 } } @@ -24,10 +24,10 @@ fn get_url_without_origin(url: &Url) -> &str { let url = url.as_str(); let unicode_origin = origin.unicode_serialization(); let ascii_origin = origin.ascii_serialization(); - if url.starts_with(&unicode_origin) && unicode_origin.len() >= 1 { + if url.starts_with(&unicode_origin) && !unicode_origin.is_empty() { return &url[unicode_origin.len()..]; } - if url.starts_with(&ascii_origin) && ascii_origin.len() >= 1 { + if url.starts_with(&ascii_origin) && !ascii_origin.is_empty() { return &url[ascii_origin.len()..]; } // Must never be executed. diff --git a/src/model/path_pattern.rs b/src/model/path_pattern.rs index ad5e856..e9514c0 100644 --- a/src/model/path_pattern.rs +++ b/src/model/path_pattern.rs @@ -1,7 +1,7 @@ +use crate::model::path::Path; +use percent_encoding::percent_decode; use std::convert::From; use std::mem::replace; -use percent_encoding::percent_decode; -use crate::model::path::Path; #[derive(Debug, Clone)] pub struct PathPattern(Vec); @@ -16,22 +16,16 @@ enum PathPatternToken { impl PathPatternToken { fn from_path_pattern(path: String) -> PathPatternToken { let path = percent_decode(path.as_bytes()).decode_utf8_lossy(); - return PathPatternToken::Text(path.to_string()); + PathPatternToken::Text(path.to_string()) } } impl PathPatternToken { fn len(&self) -> usize { - return match self { - &PathPatternToken::Text(ref text) => { - text.len() - }, - &PathPatternToken::AnyString => { - 1 - }, - &PathPatternToken::TerminateString => { - 1 - }, + match *self { + PathPatternToken::Text(ref text) => text.len(), + PathPatternToken::AnyString => 1, + PathPatternToken::TerminateString => 1, } } } @@ -42,16 
+36,12 @@ impl PathPattern { let mut tokens = Vec::new(); for c in path.chars() { let prepared_token = match c { - '*' => { - Some(PathPatternToken::AnyString) - }, - '$' => { - Some(PathPatternToken::TerminateString) - }, + '*' => Some(PathPatternToken::AnyString), + '$' => Some(PathPatternToken::TerminateString), _ => { text.push(c); None - }, + } }; if let Some(prepared_token) = prepared_token { if !text.is_empty() { @@ -67,26 +57,26 @@ impl PathPattern { tokens.push(PathPatternToken::AnyString); } tokens.dedup(); - return PathPattern(tokens); + PathPattern(tokens) } pub fn all() -> PathPattern { - return PathPattern(vec![PathPatternToken::AnyString]); + PathPattern(vec![PathPatternToken::AnyString]) } pub fn applies_to(&self, path: &Path) -> bool { let mut filename = path.as_str(); for (index, token) in self.0.iter().enumerate() { - match token { - &PathPatternToken::Text(ref text) => { + match *token { + PathPatternToken::Text(ref text) => { if !filename.starts_with(text) { return false; } - filename = &filename[text.len() ..]; - }, - &PathPatternToken::AnyString => { - if let Some(&PathPatternToken::Text(ref text)) = self.0.get(index + 1) { - while filename.len() >= 1 { + filename = &filename[text.len()..]; + } + PathPatternToken::AnyString => { + if let Some(PathPatternToken::Text(ref text)) = self.0.get(index + 1) { + while !filename.is_empty() { if filename.starts_with(text) { break; } @@ -100,15 +90,15 @@ impl PathPattern { } else { filename = &filename[filename.len()..]; } - }, - &PathPatternToken::TerminateString => { - if filename.len() != 0 { + } + PathPatternToken::TerminateString => { + if !filename.is_empty() { return false; } - }, + } } } - return true; + true } pub fn len(&self) -> usize { @@ -116,12 +106,12 @@ impl PathPattern { for path_token in self.0.iter() { length += path_token.len(); } - return length; + length } } impl From<&str> for PathPattern { fn from(path: &str) -> Self { - return PathPattern::new(path); + PathPattern::new(path) } } 
diff --git a/src/model/robots_txt.rs b/src/model/robots_txt.rs index 7952b2a..0f1320b 100644 --- a/src/model/robots_txt.rs +++ b/src/model/robots_txt.rs @@ -1,6 +1,6 @@ -use crate::model::group::Group; use crate::model::clean_params::CleanParams; -use url::{Url, Origin}; +use crate::model::group::Group; +use url::{Origin, Url}; #[derive(Debug, Clone)] /// The robots.txt model that was obtained after parsing the text of the robots.txt file. @@ -14,8 +14,8 @@ pub struct RobotsTxt { } impl RobotsTxt { - pub (crate) fn new(origin: Origin) -> RobotsTxt { - return RobotsTxt { + pub(crate) fn new(origin: Origin) -> RobotsTxt { + RobotsTxt { origin, groups: Vec::new(), sitemaps: Vec::new(), @@ -23,31 +23,35 @@ impl RobotsTxt { } } - pub (crate) fn add_sitemap(&mut self, url: Url) { + pub(crate) fn add_sitemap(&mut self, url: Url) { self.sitemaps.push(url); } - pub (crate) fn get_sitemaps_slice(&self) -> &[Url] { - return self.sitemaps.as_slice(); + pub(crate) fn get_sitemaps_slice(&self) -> &[Url] { + self.sitemaps.as_slice() } - pub (crate) fn add_clean_params(&mut self, clean_params: CleanParams) { + pub(crate) fn add_clean_params(&mut self, clean_params: CleanParams) { self.clean_params.push(clean_params); } - pub (crate) fn get_clean_params(&self) -> &[CleanParams] { - return self.clean_params.as_slice(); + pub(crate) fn get_clean_params(&self) -> &[CleanParams] { + self.clean_params.as_slice() } - pub (crate) fn add_group(&mut self, group: Group) { + pub(crate) fn add_group(&mut self, group: Group) { self.groups.push(group); } - pub (crate) fn get_origin(&self) -> &Origin { - return &self.origin; + pub(crate) fn get_origin(&self) -> &Origin { + &self.origin } - pub (crate) fn find_in_group<'a, T>(&'a self, user_agent: &str, callback: impl Fn(&'a Group) -> Option) -> Option { + pub(crate) fn find_in_group<'a, T>( + &'a self, + user_agent: &str, + callback: impl Fn(&'a Group) -> Option, + ) -> Option { // Search by user agents for group in self.groups.iter() { if 
group.applies_to(user_agent) { @@ -61,15 +65,15 @@ impl RobotsTxt { return Some(output); } } - return None; + None } - pub (crate) fn get_default_group(&self) -> Option<&Group> { + pub(crate) fn get_default_group(&self) -> Option<&Group> { for group in self.groups.iter() { if group.is_default() { return Some(group); } } - return None; + None } -} \ No newline at end of file +} diff --git a/src/model/rule.rs b/src/model/rule.rs index 748f713..888047e 100644 --- a/src/model/rule.rs +++ b/src/model/rule.rs @@ -1,5 +1,5 @@ -use crate::model::path_pattern::PathPattern; use crate::model::path::Path; +use crate::model::path_pattern::PathPattern; /// A rule line is a single "Allow:" (allowance==True) or "Disallow:" /// (allowance==False) followed by a path.""" @@ -17,15 +17,15 @@ impl Rule { } } - pub (crate) fn applies_to(&self, path: &Path) -> bool { - return self.path_pattern.applies_to(path); + pub(crate) fn applies_to(&self, path: &Path) -> bool { + self.path_pattern.applies_to(path) } - pub (crate) fn get_allowance(&self) -> bool { - return self.allowance; + pub(crate) fn get_allowance(&self) -> bool { + self.allowance } - pub (crate) fn get_path_pattern(&self) -> &PathPattern { - return &self.path_pattern; + pub(crate) fn get_path_pattern(&self) -> &PathPattern { + &self.path_pattern } -} \ No newline at end of file +} diff --git a/src/parser.rs b/src/parser.rs index ee4bbb7..96fc6c0 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -15,17 +15,15 @@ //! use robotparser::service::RobotsTxtService; //! use url::Url; //! -//! fn main() { -//! let robots_txt_url = Url::parse("https://google.com/robots.txt").unwrap(); -//! let robots_txt = "User-agent: *\nDisallow: /search"; -//! let robots_txt = parse_robots_txt(robots_txt_url.origin(), robots_txt); -//! assert_eq!(robots_txt.get_warnings().len(), 0); -//! let robots_txt = robots_txt.get_result(); -//! let good_url = Url::parse("https://google.com/test").unwrap(); -//! 
let bad_url = Url::parse("https://google.com/search/vvv").unwrap(); -//! assert_eq!(robots_txt.can_fetch("*", &bad_url), false); -//! assert_eq!(robots_txt.can_fetch("*", &good_url), true); -//! } +//! let robots_txt_url = Url::parse("https://google.com/robots.txt").unwrap(); +//! let robots_txt = "User-agent: *\nDisallow: /search"; +//! let robots_txt = parse_robots_txt(robots_txt_url.origin(), robots_txt); +//! assert_eq!(robots_txt.get_warnings().len(), 0); +//! let robots_txt = robots_txt.get_result(); +//! let good_url = Url::parse("https://google.com/test").unwrap(); +//! let bad_url = Url::parse("https://google.com/search/vvv").unwrap(); +//! assert_eq!(robots_txt.can_fetch("*", &bad_url), false); +//! assert_eq!(robots_txt.can_fetch("*", &good_url), true); //! ``` mod robots_txt_parser; pub use self::robots_txt_parser::parse as parse_robots_txt; @@ -37,4 +35,4 @@ mod parse_result; pub use self::parse_result::ParseResult; mod fetched_robots_txt_parser; pub use self::fetched_robots_txt_parser::parse as parse_fetched_robots_txt; -mod line; \ No newline at end of file +mod line; diff --git a/src/parser/fetched_robots_txt_parser.rs b/src/parser/fetched_robots_txt_parser.rs index 23cd100..0e7836f 100644 --- a/src/parser/fetched_robots_txt_parser.rs +++ b/src/parser/fetched_robots_txt_parser.rs @@ -1,6 +1,6 @@ use crate::model::{FetchedRobotsTxt, FetchedRobotsTxtContainer}; -use crate::parser::ParseResult; use crate::parser::parse_robots_txt; +use crate::parser::ParseResult; use url::Origin; const UNAUTHORIZED: u16 = 401; @@ -12,17 +12,9 @@ const OK: u16 = 200; /// **IMPORTANT NOTE**: origin must point to robots.txt url **before redirects**. 
pub fn parse(origin: Origin, status_code: u16, input: &str) -> ParseResult { match status_code { - UNAUTHORIZED | FORBIDDEN => { - return ParseResult::new(FetchedRobotsTxt::new(FetchedRobotsTxtContainer::FetchDenied)); - } - OK => { - return parse_robots_txt(origin, input) - .map(|result| { - return FetchedRobotsTxt::new(FetchedRobotsTxtContainer::Fetched(result)); - }); - }, - _ => { - return ParseResult::new(FetchedRobotsTxt::new(FetchedRobotsTxtContainer::FetchFailed)); - } + UNAUTHORIZED | FORBIDDEN => ParseResult::new(FetchedRobotsTxt::new(FetchedRobotsTxtContainer::FetchDenied)), + OK => parse_robots_txt(origin, input) + .map(|result| FetchedRobotsTxt::new(FetchedRobotsTxtContainer::Fetched(result))), + _ => ParseResult::new(FetchedRobotsTxt::new(FetchedRobotsTxtContainer::FetchFailed)), } -} \ No newline at end of file +} diff --git a/src/parser/line.rs b/src/parser/line.rs index 4db7fe6..9994135 100644 --- a/src/parser/line.rs +++ b/src/parser/line.rs @@ -3,19 +3,16 @@ pub struct Line<'a> { position: usize, } -impl <'a>Line<'a> { +impl<'a> Line<'a> { pub fn new(line: &'a str, position: usize) -> Line<'a> { - return Line { - line, - position, - } + Line { line, position } } pub fn get_line_text(&self) -> &str { - return self.line; + self.line } pub fn get_line_number(&self) -> usize { - return self.position; + self.position } -} \ No newline at end of file +} diff --git a/src/parser/parse_result.rs b/src/parser/parse_result.rs index 7e315c8..e469171 100644 --- a/src/parser/parse_result.rs +++ b/src/parser/parse_result.rs @@ -3,36 +3,39 @@ use std::fmt::Debug; #[derive(Debug)] /// The result of the robots.txt parser. -pub struct ParseResult where R: Debug { +pub struct ParseResult +where + R: Debug, +{ result: R, warnings: Vec, } -impl ParseResult where R: Debug { +impl ParseResult +where + R: Debug, +{ /// Creates a new structure for parser results. 
- pub (crate) fn new(result: R) -> ParseResult{ - return ParseResult { + pub(crate) fn new(result: R) -> ParseResult { + ParseResult { result, warnings: Vec::new(), } } /// Creates a new structure for parser results with warnings. - pub (crate) fn new_with_warnings(result: R, warnings: Vec) -> ParseResult{ - return ParseResult { - result, - warnings, - } + pub(crate) fn new_with_warnings(result: R, warnings: Vec) -> ParseResult { + ParseResult { result, warnings } } /// Returns the result of the robots.txt parser. pub fn get_result(self) -> R { - return self.result; + self.result } /// Returns the robots.txt parser warning array. pub fn get_warnings(&self) -> &[ParseWarning] { - return self.warnings.as_slice(); + self.warnings.as_slice() } /// Returns reference to result of the robots.txt parser or first warning. @@ -40,7 +43,7 @@ impl ParseResult where R: Debug { if let Some(warning) = self.warnings.first() { return Err(warning); } - return Ok(&self.result); + Ok(&self.result) } /// Returns the result of the robots.txt parser or first warning. @@ -49,14 +52,17 @@ impl ParseResult where R: Debug { return Ok(self.result); } let first_warning = self.warnings.remove(0); - return Err(first_warning); + Err(first_warning) } /// Converts this structure into another type of structure. 
- pub (crate) fn map(self, callback: impl Fn(R) -> T) -> ParseResult where T: Debug { - return ParseResult { + pub(crate) fn map(self, callback: impl Fn(R) -> T) -> ParseResult + where + T: Debug, + { + ParseResult { result: (callback)(self.result), warnings: self.warnings, } } -} \ No newline at end of file +} diff --git a/src/parser/robots_txt_parser.rs b/src/parser/robots_txt_parser.rs index 798fe89..75e660f 100644 --- a/src/parser/robots_txt_parser.rs +++ b/src/parser/robots_txt_parser.rs @@ -1,21 +1,21 @@ -use url::{Origin, Url}; -use std::time::Duration; -use crate::parser::parse_result::ParseResult; -use crate::model::{RobotsTxt, Rule, PathPattern, CleanParams, RequestRate}; +use crate::model::{CleanParams, PathPattern, RequestRate, RobotsTxt, Rule}; use crate::parser::line::Line; +use crate::parser::parse_result::ParseResult; use crate::parser::warning::ParseWarning; +use std::time::Duration; +use url::{Origin, Url}; mod directive; use self::directive::Directive; mod group_builder; pub use self::group_builder::GroupBuilder; const COMMENT_BEGIN_CHAR: char = '#'; -const KV_SEPARATOR: &'static str = ":"; +const KV_SEPARATOR: &str = ":"; /// Parses the text of the robots.txt file located in the specified origin. 
pub fn parse(origin: Origin, input: &str) -> ParseResult { let parser = Parser::new(origin); - return parser.parse(input); + parser.parse(input) } struct Parser { @@ -26,7 +26,7 @@ struct Parser { impl Parser { pub fn new(origin: Origin) -> Parser { - return Parser { + Parser { result: RobotsTxt::new(origin), group_builder: GroupBuilder::new(), warnings: Vec::new(), @@ -35,22 +35,20 @@ impl Parser { pub fn parse(mut self, input: &str) -> ParseResult { let input = ignore_bom(input); - let mut line_no = 0; - for line in input.lines() { - line_no += 1; - let line = Line::new(line, line_no); + for (line_no, line) in input.lines().enumerate() { + let line = Line::new(line, line_no + 1); match Self::parse_line(&line) { Ok(Some(line_value)) => { self.process_line_value(&line, &line_value); - }, + } Err(warning) => { self.warnings.push(warning); - }, - _ => {}, + } + _ => {} } } self.group_builder.fill_entries(&mut self.result); - return ParseResult::new_with_warnings(self.result, self.warnings); + ParseResult::new_with_warnings(self.result, self.warnings) } fn parse_line<'a>(line: &'a Line) -> Result>, ParseWarning> { @@ -61,9 +59,9 @@ impl Parser { if kv_part.is_empty() { return Ok(None); } - let separator_index = kv_part.find(KV_SEPARATOR).ok_or_else(|| { - return ParseWarning::invalid_directive_format(line); - })?; + let separator_index = kv_part + .find(KV_SEPARATOR) + .ok_or_else(|| ParseWarning::invalid_directive_format(line))?; if separator_index >= kv_part.len() { return Err(ParseWarning::invalid_directive_format(line)); } @@ -75,7 +73,7 @@ impl Parser { let value = &kv_part[separator_index + 1..]; let value = value.trim(); let result = Directive::new(key, value); - return Ok(Some(result)); + Ok(Some(result)) } fn process_line_value(&mut self, line: &Line, directive: &Directive) { @@ -84,29 +82,29 @@ impl Parser { // Group specific directives "user-agent" => { self.process_directive_user_agent(line, directive); - }, + } "allow" => { 
self.process_directive_allow(line, directive); - }, + } "disallow" => { self.process_directive_disallow(line, directive); - }, + } "crawl-delay" => { self.process_directive_crawl_delay(line, directive); - }, + } "request-rate" => { self.process_directive_request_rate(line, directive); - }, + } // Non-group directives "sitemap" => { self.process_directive_sitemap(line, directive); - }, + } "clean-param" => { self.process_directive_clean_param(line, directive); - }, + } _ => { self.warnings.push(ParseWarning::unsupported_directive_key(line, key)); - }, + } } } @@ -123,7 +121,7 @@ impl Parser { if let Some(group) = self.group_builder.get_mut_active_group() { if directive.get_value() == "" { // Nothing to do. Ignoring. - } else if directive.get_value().starts_with("*") || directive.get_value().starts_with("/") { + } else if directive.get_value().starts_with('*') || directive.get_value().starts_with('/') { group.push_rule(Rule::new(directive.get_value(), true)); } else { self.warnings.push(ParseWarning::wrong_path_format(line)); @@ -138,7 +136,7 @@ impl Parser { if directive.get_value() == "" { // Allow all. 
group.push_rule(Rule::new(PathPattern::all(), true)); - } else if directive.get_value().starts_with("*") || directive.get_value().starts_with("/") { + } else if directive.get_value().starts_with('*') || directive.get_value().starts_with('/') { group.push_rule(Rule::new(directive.get_value(), false)); } else { self.warnings.push(ParseWarning::wrong_path_format(line)); @@ -156,10 +154,10 @@ impl Parser { let delay_nanoseconds = delay.fract() * 10f64.powi(9); let delay = Duration::new(delay_seconds as u64, delay_nanoseconds as u32); group.set_crawl_delay(delay); - }, + } Err(error) => { self.warnings.push(ParseWarning::parse_crawl_delay_error(line, error)); - }, + } } } else { self.warnings.push(ParseWarning::directive_without_user_agent(line)); @@ -174,20 +172,20 @@ impl Parser { return; } let requests = match numbers[0].parse::() { - Ok(requests) => {requests}, + Ok(requests) => requests, Err(error) => { self.warnings.push(ParseWarning::parse_request_rate(line, error)); return; - }, + } }; let seconds = match numbers[1].parse::() { - Ok(seconds) => {seconds}, + Ok(seconds) => seconds, Err(error) => { self.warnings.push(ParseWarning::parse_request_rate(line, error)); return; - }, + } }; - group.set_req_rate(RequestRate{requests, seconds}); + group.set_req_rate(RequestRate { requests, seconds }); } else { self.warnings.push(ParseWarning::directive_without_user_agent(line)); } @@ -197,27 +195,27 @@ impl Parser { match Url::parse(directive.get_value()) { Ok(sitemap_url) => { self.result.add_sitemap(sitemap_url); - }, + } Err(error) => { self.warnings.push(ParseWarning::parse_url(line, error)); - }, + } } } fn process_directive_clean_param(&mut self, line: &Line, directive: &Directive) { let parts: Vec<&str> = directive.get_value().split_whitespace().collect(); - if parts.len() >= 3 || parts.len() == 0 { + if parts.len() >= 3 || parts.is_empty() { self.warnings.push(ParseWarning::wrong_clean_param_format(line)); return; } - if parts[0].len() == 0 { + if 
parts[0].is_empty() { self.warnings.push(ParseWarning::wrong_clean_param_format(line)); return; } let clean_params_path_pattern; let clean_params; if let Some(second_param) = parts.get(1) { - if second_param.len() == 0 { + if second_param.is_empty() { self.warnings.push(ParseWarning::wrong_clean_param_format(line)); return; } @@ -229,9 +227,11 @@ impl Parser { } let (valid_clean_params, invalid_clean_params) = Self::parse_clean_params(clean_params); if !invalid_clean_params.is_empty() { - self.warnings.push(ParseWarning::ignored_clean_params(line, invalid_clean_params)); + self.warnings + .push(ParseWarning::ignored_clean_params(line, invalid_clean_params)); } - self.result.add_clean_params(CleanParams::new(clean_params_path_pattern, valid_clean_params)); + self.result + .add_clean_params(CleanParams::new(clean_params_path_pattern, valid_clean_params)); } fn parse_clean_params(clean_params: &str) -> (Vec, Vec) { @@ -246,36 +246,29 @@ impl Parser { } } } - return (valid, invalid); + (valid, invalid) } fn is_valid_clean_param(clean_param: &str) -> bool { for c in clean_param.chars() { - let mut is_valid = false; - if ('A'..'Z').contains(&c) { - is_valid = true; - } - if ('a'..'z').contains(&c) { - is_valid = true; - } - if ('0'..'9').contains(&c) { - is_valid = true; - } - if c == '.' || c == '-' || c == '_' { - is_valid = true; - } + let is_valid = ('A'..'Z').contains(&c) + || ('a'..'z').contains(&c) + || ('0'..'9').contains(&c) + || c == '.' 
+ || c == '-' + || c == '_'; if !is_valid { return false; } } - return true; + true } } fn ignore_bom(input: &str) -> &str { - const BOM: &'static str = "\u{feff}"; + const BOM: &str = "\u{feff}"; if input.starts_with(BOM) { return &input[BOM.len()..]; } - return input; -} \ No newline at end of file + input +} diff --git a/src/parser/robots_txt_parser/directive.rs b/src/parser/robots_txt_parser/directive.rs index 2bafea1..d528c6b 100644 --- a/src/parser/robots_txt_parser/directive.rs +++ b/src/parser/robots_txt_parser/directive.rs @@ -3,19 +3,16 @@ pub struct Directive<'a> { value: &'a str, } -impl <'a> Directive<'a> { +impl<'a> Directive<'a> { pub fn new(key: &'a str, value: &'a str) -> Directive<'a> { - return Directive { - key, - value, - } + Directive { key, value } } pub fn get_key_lowercase(&self) -> String { - return self.key.to_lowercase(); + self.key.to_lowercase() } pub fn get_value(&self) -> &str { - return self.value; + self.value } -} \ No newline at end of file +} diff --git a/src/parser/robots_txt_parser/group_builder.rs b/src/parser/robots_txt_parser/group_builder.rs index 225c052..871f06b 100644 --- a/src/parser/robots_txt_parser/group_builder.rs +++ b/src/parser/robots_txt_parser/group_builder.rs @@ -12,7 +12,7 @@ pub struct GroupBuilder { impl GroupBuilder { pub fn new() -> GroupBuilder { - return GroupBuilder { + GroupBuilder { state: State::WaitingForNewGroup, active_group: None, groups: Vec::new(), @@ -27,14 +27,14 @@ impl GroupBuilder { self.groups.push(group); self.active_group = Some(self.groups.len() - 1); self.state = State::WaitingForAdditionalUserAgent; - }, + } State::WaitingForAdditionalUserAgent => { let active_group = self.active_group.expect("Unable to get active group"); let group = self.groups.get_mut(active_group).expect("Unable to get group index"); if !group.contains_user_agent(user_agent) { group.push_useragent(user_agent); } - }, + } } } @@ -43,7 +43,7 @@ impl GroupBuilder { if let Some(active_group) = self.active_group { 
return self.groups.get_mut(active_group); } - return None; + None } pub fn fill_entries(mut self, robots_txt: &mut RobotsTxt) { @@ -51,4 +51,4 @@ impl GroupBuilder { robots_txt.add_group(group); } } -} \ No newline at end of file +} diff --git a/src/parser/warning.rs b/src/parser/warning.rs index a6a73d8..cc39bb5 100644 --- a/src/parser/warning.rs +++ b/src/parser/warning.rs @@ -1,9 +1,9 @@ use super::line::Line; use super::warning_reason::WarningReason; -use url::ParseError as ParseUrlError; -use std::num::{ParseFloatError, ParseIntError}; -use std::fmt; use std::error::Error; +use std::fmt; +use std::num::{ParseFloatError, ParseIntError}; +use url::ParseError as ParseUrlError; #[derive(Clone, Debug)] /// Warning of robots.txt parser about problems when parsing robots.txt file. @@ -18,109 +18,109 @@ impl Error for ParseWarning {} impl ParseWarning { /// Returns the line number in the text of the robots.txt file. pub fn get_line_no(&self) -> usize { - return self.line_no; + self.line_no } /// Returns the text of the robots.txt file string. pub fn get_line_text(&self) -> &String { - return &self.line; + &self.line } /// Returns the reason of warning. 
pub fn get_reason(&self) -> &WarningReason { - return &self.reason; + &self.reason } - pub (crate) fn invalid_directive_format(line: &Line) -> ParseWarning { - return ParseWarning { + pub(crate) fn invalid_directive_format(line: &Line) -> ParseWarning { + ParseWarning { line_no: line.get_line_number(), line: line.get_line_text().into(), reason: WarningReason::InvalidDirectiveFormat, } } - pub (crate) fn directive_key_is_empty(line: &Line) -> ParseWarning { - return ParseWarning { + pub(crate) fn directive_key_is_empty(line: &Line) -> ParseWarning { + ParseWarning { line_no: line.get_line_number(), line: line.get_line_text().into(), reason: WarningReason::DirectiveKeyIsEmpty, } } - pub (crate) fn unsupported_directive_key(line: &Line, key: String) -> ParseWarning { - return ParseWarning { + pub(crate) fn unsupported_directive_key(line: &Line, key: String) -> ParseWarning { + ParseWarning { line_no: line.get_line_number(), line: line.get_line_text().into(), reason: WarningReason::UnsupportedDirectiveKey(key), } } - pub (crate) fn user_agent_cannot_be_empty(line: &Line) -> ParseWarning { - return ParseWarning { + pub(crate) fn user_agent_cannot_be_empty(line: &Line) -> ParseWarning { + ParseWarning { line_no: line.get_line_number(), line: line.get_line_text().into(), reason: WarningReason::UserAgentCannotBeEmpty, } } - pub (crate) fn wrong_path_format(line: &Line) -> ParseWarning { - return ParseWarning { + pub(crate) fn wrong_path_format(line: &Line) -> ParseWarning { + ParseWarning { line_no: line.get_line_number(), line: line.get_line_text().into(), reason: WarningReason::WrongPathFormat, } } - pub (crate) fn directive_without_user_agent(line: &Line) -> ParseWarning { - return ParseWarning { + pub(crate) fn directive_without_user_agent(line: &Line) -> ParseWarning { + ParseWarning { line_no: line.get_line_number(), line: line.get_line_text().into(), reason: WarningReason::DirectiveWithoutUserAgent, } } - pub (crate) fn parse_crawl_delay_error(line: &Line, error: 
ParseFloatError) -> ParseWarning { - return ParseWarning { + pub(crate) fn parse_crawl_delay_error(line: &Line, error: ParseFloatError) -> ParseWarning { + ParseWarning { line_no: line.get_line_number(), line: line.get_line_text().into(), reason: WarningReason::ParseCrawlDelayError(error), } } - pub (crate) fn wrong_request_rate_format(line: &Line) -> ParseWarning { - return ParseWarning { + pub(crate) fn wrong_request_rate_format(line: &Line) -> ParseWarning { + ParseWarning { line_no: line.get_line_number(), line: line.get_line_text().into(), reason: WarningReason::WrongRequestRateFormat, } } - pub (crate) fn parse_request_rate(line: &Line, error: ParseIntError) -> ParseWarning { - return ParseWarning { + pub(crate) fn parse_request_rate(line: &Line, error: ParseIntError) -> ParseWarning { + ParseWarning { line_no: line.get_line_number(), line: line.get_line_text().into(), reason: WarningReason::ParseRequestRate(error), } } - pub (crate) fn parse_url(line: &Line, error: ParseUrlError) -> ParseWarning { - return ParseWarning { + pub(crate) fn parse_url(line: &Line, error: ParseUrlError) -> ParseWarning { + ParseWarning { line_no: line.get_line_number(), line: line.get_line_text().into(), reason: WarningReason::ParseUrl(error), } } - pub (crate) fn wrong_clean_param_format(line: &Line) -> ParseWarning { - return ParseWarning { + pub(crate) fn wrong_clean_param_format(line: &Line) -> ParseWarning { + ParseWarning { line_no: line.get_line_number(), line: line.get_line_text().into(), reason: WarningReason::WrongCleanParamFormat, } } - pub (crate) fn ignored_clean_params(line: &Line, ignored_clean_params: Vec) -> ParseWarning { - return ParseWarning { + pub(crate) fn ignored_clean_params(line: &Line, ignored_clean_params: Vec) -> ParseWarning { + ParseWarning { line_no: line.get_line_number(), line: line.get_line_text().into(), reason: WarningReason::IgnoredCleanParams(ignored_clean_params), @@ -133,4 +133,4 @@ impl fmt::Display for ParseWarning { fn fmt(&self, f: &mut 
fmt::Formatter) -> Result<(), fmt::Error> { write!(f, "Line: {}. Text: `{}`. {}", self.line_no, self.line, self.reason) } -} \ No newline at end of file +} diff --git a/src/parser/warning_reason.rs b/src/parser/warning_reason.rs index 6e119c6..54e1acf 100644 --- a/src/parser/warning_reason.rs +++ b/src/parser/warning_reason.rs @@ -1,6 +1,6 @@ -use url::ParseError as ParseUrlError; -use std::num::{ParseFloatError, ParseIntError}; use std::fmt; +use std::num::{ParseFloatError, ParseIntError}; +use url::ParseError as ParseUrlError; #[derive(Clone, Debug)] /// Warning reason of robots.txt parser about problems when parsing robots.txt file. @@ -38,43 +38,43 @@ pub enum WarningReason { /// Displays text of warning reason. impl fmt::Display for WarningReason { fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { - match &self { - &Self::InvalidDirectiveFormat => { + match self { + Self::InvalidDirectiveFormat => { write!(f, "Invalid directive format.") }, - &Self::DirectiveKeyIsEmpty => { + Self::DirectiveKeyIsEmpty => { write!(f, "Directive key is empty.") }, - &Self::UnsupportedDirectiveKey(key) => { + Self::UnsupportedDirectiveKey(key) => { write!(f, "Directive key `{}` is not suppored by this parser.", key) }, - &Self::UserAgentCannotBeEmpty => { + Self::UserAgentCannotBeEmpty => { write!(f, "Passed directive key is `User-Agent` and passed value is empty.") }, - &Self::DirectiveWithoutUserAgent => { + Self::DirectiveWithoutUserAgent => { write!(f, "It is impossible to process this directive before `User-Agent` directive has not been processed.") }, - &Self::ParseCrawlDelayError(err) => { + Self::ParseCrawlDelayError(err) => { write!(f, "It is impossible to process the `Crawl-Delay` directive because of an error when parsing a floating point number: {}", err) }, - &Self::WrongRequestRateFormat => { + Self::WrongRequestRateFormat => { write!(f, "Incorrect format of the `Request-Rate` directive") }, - &Self::ParseRequestRate(err) => { + 
Self::ParseRequestRate(err) => { write!(f, "Incorrect format of the `Request-Rate` directive: {}", err) }, - &Self::ParseUrl(err) => { + Self::ParseUrl(err) => { write!(f, "Parsing URL error: {}", err) }, - &Self::WrongCleanParamFormat => { + Self::WrongCleanParamFormat => { write!(f, "Incorrect format of the `Clean-Param` directive.") }, - &Self::IgnoredCleanParams(ref params) => { + Self::IgnoredCleanParams(ref params) => { write!(f, "Directive `Clean-Param` directive has incorrect parameters: {:?}", params) }, - &Self::WrongPathFormat => { + Self::WrongPathFormat => { write!(f, "Error in URL path format.") }, } } -} \ No newline at end of file +} diff --git a/src/service.rs b/src/service.rs index d33592b..26a7273 100644 --- a/src/service.rs +++ b/src/service.rs @@ -1,8 +1,8 @@ -mod robots_txt; mod fetched_robots_txt; -use url::Url; -use std::time::Duration; +mod robots_txt; use crate::model::RequestRate; +use std::time::Duration; +use url::Url; /// Trait that implements robots txt service. pub trait RobotsTxtService { @@ -27,4 +27,4 @@ pub trait RobotsTxtService { /// Returns information about the restrictions set for sending HTTP requests to the server. 
fn get_req_rate(&self, user_agent: &str) -> Option; -} \ No newline at end of file +} diff --git a/src/service/fetched_robots_txt.rs b/src/service/fetched_robots_txt.rs index 74b0a13..91fd384 100644 --- a/src/service/fetched_robots_txt.rs +++ b/src/service/fetched_robots_txt.rs @@ -1,51 +1,49 @@ -use url::Url; -use std::time::Duration; -use crate::service::RobotsTxtService; -use crate::model::{FetchedRobotsTxt, FetchedRobotsTxtContainer}; use crate::model::RequestRate; +use crate::model::{FetchedRobotsTxt, FetchedRobotsTxtContainer}; +use crate::service::RobotsTxtService; +use std::time::Duration; +use url::Url; impl RobotsTxtService for FetchedRobotsTxt { fn can_fetch(&self, user_agent: &str, url: &Url) -> bool { - match self.get_container() { - &FetchedRobotsTxtContainer::FetchDenied => false, - &FetchedRobotsTxtContainer::FetchFailed => true, - &FetchedRobotsTxtContainer::Fetched(ref robots_txt) => { - robots_txt.can_fetch(user_agent, url) - } + match *self.get_container() { + FetchedRobotsTxtContainer::FetchDenied => false, + FetchedRobotsTxtContainer::FetchFailed => true, + FetchedRobotsTxtContainer::Fetched(ref robots_txt) => robots_txt.can_fetch(user_agent, url), } } fn get_crawl_delay(&self, user_agent: &str) -> Option { - if let &FetchedRobotsTxtContainer::Fetched(ref robots_txt) = self.get_container() { + if let FetchedRobotsTxtContainer::Fetched(ref robots_txt) = *self.get_container() { return robots_txt.get_crawl_delay(user_agent); } - return None; + None } fn normalize_url(&self, url: &mut Url) -> bool { - if let &FetchedRobotsTxtContainer::Fetched(ref robots_txt) = self.get_container() { + if let FetchedRobotsTxtContainer::Fetched(ref robots_txt) = *self.get_container() { return robots_txt.normalize_url(url); } - return true; + true } fn normalize_url_ignore_origin(&self, url: &mut Url) { - if let &FetchedRobotsTxtContainer::Fetched(ref robots_txt) = self.get_container() { + if let FetchedRobotsTxtContainer::Fetched(ref robots_txt) = 
*self.get_container() { robots_txt.normalize_url_ignore_origin(url); } } fn get_sitemaps(&self) -> &[Url] { - if let &FetchedRobotsTxtContainer::Fetched(ref robots_txt) = self.get_container() { + if let FetchedRobotsTxtContainer::Fetched(ref robots_txt) = *self.get_container() { return robots_txt.get_sitemaps(); } - return &[]; + &[] } fn get_req_rate(&self, user_agent: &str) -> Option { - if let &FetchedRobotsTxtContainer::Fetched(ref robots_txt) = self.get_container() { + if let FetchedRobotsTxtContainer::Fetched(ref robots_txt) = *self.get_container() { return robots_txt.get_req_rate(user_agent); } - return None; + None } -} \ No newline at end of file +} diff --git a/src/service/robots_txt.rs b/src/service/robots_txt.rs index 5bc34ff..dadda76 100644 --- a/src/service/robots_txt.rs +++ b/src/service/robots_txt.rs @@ -1,9 +1,9 @@ -use url::Url; -use std::time::Duration; -use crate::service::RobotsTxtService; -use crate::model::RobotsTxt; -use crate::model::RequestRate; use crate::model::Path; +use crate::model::RequestRate; +use crate::model::RobotsTxt; +use crate::service::RobotsTxtService; +use std::time::Duration; +use url::Url; impl RobotsTxtService for RobotsTxt { fn can_fetch(&self, user_agent: &str, url: &Url) -> bool { @@ -18,19 +18,17 @@ impl RobotsTxtService for RobotsTxt { return Some(rule.get_allowance()); } } - return None; + None }); if let Some(rule_decision) = rule_decision { return rule_decision; } // Empty robots.txt allows crawling. Everything that was not denied must be allowed. 
- return true; + true } fn get_crawl_delay(&self, user_agent: &str) -> Option { - return self.find_in_group(user_agent, |group| { - return group.get_crawl_delay(); - }); + self.find_in_group(user_agent, |group| group.get_crawl_delay()) } fn normalize_url(&self, url: &mut Url) -> bool { @@ -38,7 +36,7 @@ impl RobotsTxtService for RobotsTxt { return false; } self.normalize_url_ignore_origin(url); - return true; + true } fn normalize_url_ignore_origin(&self, url: &mut Url) { @@ -54,9 +52,7 @@ impl RobotsTxtService for RobotsTxt { } let mut pairs: Vec<(String, String)> = url .query_pairs() - .map(|(key, value)|{ - return (key.into(), value.into()); - }) + .map(|(key, value)| (key.into(), value.into())) .collect(); { let mut query_pairs_mut = url.query_pairs_mut(); @@ -73,12 +69,10 @@ impl RobotsTxtService for RobotsTxt { } fn get_sitemaps(&self) -> &[Url] { - return self.get_sitemaps_slice(); + self.get_sitemaps_slice() } fn get_req_rate(&self, user_agent: &str) -> Option { - return self.find_in_group(user_agent, |group| { - return group.get_req_rate(); - }); + self.find_in_group(user_agent, |group| group.get_req_rate()) } -} \ No newline at end of file +} diff --git a/tests/test_lib.rs b/tests/test_lib.rs index 1cfc1bf..f30cecd 100644 --- a/tests/test_lib.rs +++ b/tests/test_lib.rs @@ -3,7 +3,7 @@ use robotparser::service::RobotsTxtService; use std::time::Duration; use url::Url; -const AGENT: &'static str = "test_robotparser"; +const AGENT: &str = "test_robotparser"; fn robot_test(doc: &str, good_urls: Vec<&str>, bad_urls: Vec<&str>, agent: &str) { let url = Url::parse("https://www.baidu.com/robots.txt").unwrap(); @@ -20,7 +20,6 @@ fn robot_test(doc: &str, good_urls: Vec<&str>, bad_urls: Vec<&str>, agent: &str) } } - fn robot_test_simple(doc: &str, good_urls: Vec<&str>, bad_urls: Vec<&str>) { robot_test(doc, good_urls, bad_urls, AGENT); } @@ -33,12 +32,11 @@ fn test_robots_txt_rn_bom() { Disallow: /tmp/ # these will soon disappear\r\n\ Disallow: /foo.html\r\n\ "; - 
let good = vec!["/","/test.html"]; - let bad = vec!["/cyberworld/map/index.html","/tmp/xxx","/foo.html"]; + let good = vec!["/", "/test.html"]; + let bad = vec!["/cyberworld/map/index.html", "/tmp/xxx", "/foo.html"]; robot_test_simple(doc, good, bad); } - #[test] fn test_robots_txt_1() { let doc = "\n\ @@ -47,12 +45,11 @@ fn test_robots_txt_1() { Disallow: /tmp/ # these will soon disappear\n\ Disallow: /foo.html\n\ "; - let good = vec!["/","/test.html"]; - let bad = vec!["/cyberworld/map/index.html","/tmp/xxx","/foo.html"]; + let good = vec!["/", "/test.html"]; + let bad = vec!["/cyberworld/map/index.html", "/tmp/xxx", "/foo.html"]; robot_test_simple(doc, good, bad); } - #[test] fn test_robots_txt_2() { let doc = "\n\ @@ -66,7 +63,7 @@ fn test_robots_txt_2() { Disallow:\n\ \n\ "; - let good = vec!["/","/test.html"]; + let good = vec!["/", "/test.html"]; let bad = vec!["/cyberworld/map/index.html"]; robot_test_simple(doc, good, bad); @@ -82,7 +79,7 @@ fn test_robots_txt_3() { Disallow: /\n\ "; let good = vec![]; - let bad = vec!["/cyberworld/map/index.html","/","/tmp/"]; + let bad = vec!["/cyberworld/map/index.html", "/", "/tmp/"]; robot_test_simple(doc, good, bad); } @@ -97,8 +94,13 @@ fn test_robots_txt_4() { "; let good = vec![]; let bad = vec![ - "/tmp", "/tmp.html", "/tmp/a.html", "/a%3cd.html", "/a%3Cd.html", - "/a%2fb.html", "/~joe/index.html", + "/tmp", + "/tmp.html", + "/tmp/a.html", + "/a%3cd.html", + "/a%3Cd.html", + "/a%2fb.html", + "/~joe/index.html", ]; robot_test(doc, good.clone(), bad.clone(), "figtree"); robot_test(doc, good, bad, "FigTree Robot libwww-perl/5.04"); @@ -115,8 +117,12 @@ fn test_robots_txt_5() { "; let good = vec!["/tmp"]; let bad = vec![ - "/tmp/", "/tmp/a.html", "/a%3cd.html", "/a%3Cd.html", - "/a/b.html", "/%7Ejoe/index.html", + "/tmp/", + "/tmp/a.html", + "/a%3cd.html", + "/a%3Cd.html", + "/a/b.html", + "/%7Ejoe/index.html", ]; robot_test_simple(doc, good, bad); } @@ -246,8 +252,8 @@ fn test_robots_txt_14() { #[cfg(feature = 
"http")] #[test] fn test_robots_txt_read() { - use robotparser::http::{CreateRobotsTxtRequest, ParseRobotsTxtResponse}; use reqwest::{Client, Request}; + use robotparser::http::{CreateRobotsTxtRequest, ParseRobotsTxtResponse}; let http_client = Client::new(); let url = Url::parse("https://www.python.org/robots.txt").unwrap(); let request = Request::create_robots_txt_request(url.origin()); @@ -263,7 +269,10 @@ fn test_robots_text_crawl_delay() { Crawl-delay: 2.35\n\ Disallow: /search/\n"; let parser = parse_robots_txt(robots_txt_url.origin(), doc).get_result(); - assert_eq!(Duration::new(2,350 * 1000 * 1000), parser.get_crawl_delay("Yandex").unwrap()); + assert_eq!( + Duration::new(2, 350 * 1000 * 1000), + parser.get_crawl_delay("Yandex").unwrap() + ); } #[test] @@ -288,8 +297,7 @@ fn test_robots_text_sitemaps() { #[test] fn test_robots_text_request_rate() { let robots_txt_url = Url::parse("https://www.python.org/robots.txt").unwrap(); - let doc = - "User-agent: Yandex\n\ + let doc = "User-agent: Yandex\n\ Request-rate: 3/15\n\ Disallow: /search/\n"; let parser = parse_robots_txt(robots_txt_url.origin(), doc).get_result(); @@ -301,7 +309,6 @@ fn test_robots_text_request_rate() { assert!(req_rate.is_none()); } - #[test] fn test_robots_text_clean_params() { let doc = "\ @@ -315,13 +322,18 @@ Clean-param: amp\n\ "; let url = Url::parse("https://www.baidu.com/robots.txt").unwrap(); let parser = parse_robots_txt(url.origin(), doc).get_result(); - let mut site_url = Url::parse("https://www.baidu.com/test?post_id=7777&mode=99&from=google&pid=99&gid=88&tm=777&=1").unwrap(); + let mut site_url = + Url::parse("https://www.baidu.com/test?post_id=7777&mode=99&from=google&pid=99&gid=88&tm=777&=1").unwrap(); let was_updated = parser.normalize_url(&mut site_url); assert_eq!(was_updated, true); assert_eq!(site_url.as_str(), "https://www.baidu.com/test?post_id=7777"); - let mut site_url = 
Url::parse("https://www.google.com/test?post_id=7777&mode=99&from=google&pid=99&gid=88&tm=777&=1").unwrap(); + let mut site_url = + Url::parse("https://www.google.com/test?post_id=7777&mode=99&from=google&pid=99&gid=88&tm=777&=1").unwrap(); let was_updated = parser.normalize_url(&mut site_url); assert_eq!(was_updated, false); - assert_eq!(site_url.as_str(), "https://www.google.com/test?post_id=7777&mode=99&from=google&pid=99&gid=88&tm=777&=1"); -} \ No newline at end of file + assert_eq!( + site_url.as_str(), + "https://www.google.com/test?post_id=7777&mode=99&from=google&pid=99&gid=88&tm=777&=1" + ); +} diff --git a/tests/test_reqwest_async.rs b/tests/test_reqwest_async.rs index 3701b2b..ef0681a 100644 --- a/tests/test_reqwest_async.rs +++ b/tests/test_reqwest_async.rs @@ -1,8 +1,8 @@ +use reqwest::Client; use robotparser::http::RobotsTxtClient; use robotparser::service::RobotsTxtService; -use reqwest::Client; -use url::Url; use tokio::runtime::Runtime; +use url::Url; use url::{Host, Origin}; #[test] @@ -23,8 +23,7 @@ fn test_reqwest_blocking_panic_url() { let client = Client::new(); let host = Host::Domain("python.org::".into()); let origin = Origin::Tuple("https".into(), host, 80); - match client.fetch_robots_txt(origin) { - Ok(_) => assert!(false), - Err(_) => assert!(true) + if client.fetch_robots_txt(origin).is_ok() { + panic!() } } diff --git a/tests/test_reqwest_blocking.rs b/tests/test_reqwest_blocking.rs index b826811..9238d7c 100644 --- a/tests/test_reqwest_blocking.rs +++ b/tests/test_reqwest_blocking.rs @@ -1,6 +1,6 @@ +use reqwest::blocking::Client; use robotparser::http::RobotsTxtClient; use robotparser::service::RobotsTxtService; -use reqwest::blocking::Client; use url::Url; use url::{Host, Origin}; @@ -20,8 +20,7 @@ fn test_reqwest_blocking_panic_url() { let client = Client::new(); let host = Host::Domain("python.org::".into()); let origin = Origin::Tuple("https".into(), host, 80); - match client.fetch_robots_txt(origin) { - Ok(_) => 
assert!(false), - Err(_) => assert!(true) + if client.fetch_robots_txt(origin).is_ok() { + panic!() } } diff --git a/tests/test_warnings.rs b/tests/test_warnings.rs index b85620f..40372bc 100644 --- a/tests/test_warnings.rs +++ b/tests/test_warnings.rs @@ -1,6 +1,6 @@ use robotparser::parser::{parse_robots_txt, WarningReason}; -use url::{Host, Origin}; use std::convert::From; +use url::{Host, Origin}; #[derive(PartialEq, Eq, Debug, Clone)] enum WarningReasonKind { @@ -51,7 +51,6 @@ fn test_warning_supported_directive_key() { validate_warnings(input, &[WarningReasonKind::UnsupportedDirectiveKey]); } - #[test] fn test_warning_user_agent_cannot_be_empty() { let input = "User-Agent:"; @@ -136,43 +135,19 @@ fn test_warning_wrong_path_format() { impl From<&WarningReason> for WarningReasonKind { fn from(reason: &WarningReason) -> Self { - match reason { - &WarningReason::InvalidDirectiveFormat => { - return WarningReasonKind::InvalidDirectiveFormat; - }, - &WarningReason::DirectiveKeyIsEmpty => { - return WarningReasonKind::DirectiveKeyIsEmpty; - }, - &WarningReason::UnsupportedDirectiveKey {..} => { - return WarningReasonKind::UnsupportedDirectiveKey; - }, - &WarningReason::UserAgentCannotBeEmpty => { - return WarningReasonKind::UserAgentCannotBeEmpty; - }, - &WarningReason::DirectiveWithoutUserAgent => { - return WarningReasonKind::DirectiveWithoutUserAgent; - }, - &WarningReason::ParseCrawlDelayError {..} => { - return WarningReasonKind::ParseCrawlDelayError; - }, - &WarningReason::WrongRequestRateFormat => { - return WarningReasonKind::WrongRequestRateFormat; - }, - &WarningReason::ParseRequestRate {..} => { - return WarningReasonKind::ParseRequestRate; - }, - &WarningReason::ParseUrl {..} => { - return WarningReasonKind::ParseUrl; - }, - &WarningReason::WrongCleanParamFormat => { - return WarningReasonKind::WrongCleanParamFormat; - }, - &WarningReason::IgnoredCleanParams {..} => { - return WarningReasonKind::IgnoredCleanParams; - }, - &WarningReason::WrongPathFormat 
=> { - return WarningReasonKind::WrongPathFormat; - }, + match *reason { + WarningReason::InvalidDirectiveFormat => WarningReasonKind::InvalidDirectiveFormat, + WarningReason::DirectiveKeyIsEmpty => WarningReasonKind::DirectiveKeyIsEmpty, + WarningReason::UnsupportedDirectiveKey { .. } => WarningReasonKind::UnsupportedDirectiveKey, + WarningReason::UserAgentCannotBeEmpty => WarningReasonKind::UserAgentCannotBeEmpty, + WarningReason::DirectiveWithoutUserAgent => WarningReasonKind::DirectiveWithoutUserAgent, + WarningReason::ParseCrawlDelayError { .. } => WarningReasonKind::ParseCrawlDelayError, + WarningReason::WrongRequestRateFormat => WarningReasonKind::WrongRequestRateFormat, + WarningReason::ParseRequestRate { .. } => WarningReasonKind::ParseRequestRate, + WarningReason::ParseUrl { .. } => WarningReasonKind::ParseUrl, + WarningReason::WrongCleanParamFormat => WarningReasonKind::WrongCleanParamFormat, + WarningReason::IgnoredCleanParams { .. } => WarningReasonKind::IgnoredCleanParams, + WarningReason::WrongPathFormat => WarningReasonKind::WrongPathFormat, } } -} \ No newline at end of file +}