From 2d19755779816c07dbe99772c78c3fb6b1e5ffd0 Mon Sep 17 00:00:00 2001 From: svmk Date: Fri, 31 Jan 2020 16:00:58 +0700 Subject: [PATCH] Refactoring of robotparser-rs (#20) * Migrated sites into robotsparser file. * Robots.txt refactoring. * Migrated to new version of url and reqwest. --- .gitignore | 1 + Cargo.toml | 19 +- README.md | 17 +- src/http.rs | 19 + src/http/reqwest.rs | 4 + src/http/reqwest/async_reqwest.rs | 76 +++ src/http/reqwest/sync_reqwest.rs | 23 + src/lib.rs | 488 +----------------- src/model.rs | 17 + src/model/clean_params.rs | 24 + src/model/fetched_robots_txt.rs | 36 ++ src/model/group.rs | 93 ++++ src/model/path.rs | 35 ++ src/model/path_pattern.rs | 127 +++++ src/model/request_rate.rs | 9 + src/model/robots_txt.rs | 75 +++ src/model/rule.rs | 31 ++ src/parser.rs | 40 ++ src/parser/fetched_robots_txt_parser.rs | 28 + src/parser/line.rs | 21 + src/parser/parse_result.rs | 62 +++ src/parser/robots_txt_parser.rs | 281 ++++++++++ src/parser/robots_txt_parser/directive.rs | 21 + src/parser/robots_txt_parser/group_builder.rs | 54 ++ src/parser/warning.rs | 136 +++++ src/parser/warning_reason.rs | 80 +++ src/service.rs | 30 ++ src/service/fetched_robots_txt.rs | 51 ++ src/service/robots_txt.rs | 84 +++ tests/{lib.rs => test_lib.rs} | 111 ++-- tests/test_reqwest_async.rs | 16 + tests/test_reqwest_blocking.rs | 13 + tests/test_warnings.rs | 178 +++++++ 33 files changed, 1789 insertions(+), 511 deletions(-) create mode 100644 src/http.rs create mode 100644 src/http/reqwest.rs create mode 100644 src/http/reqwest/async_reqwest.rs create mode 100644 src/http/reqwest/sync_reqwest.rs create mode 100644 src/model.rs create mode 100644 src/model/clean_params.rs create mode 100644 src/model/fetched_robots_txt.rs create mode 100644 src/model/group.rs create mode 100644 src/model/path.rs create mode 100644 src/model/path_pattern.rs create mode 100644 src/model/request_rate.rs create mode 100644 src/model/robots_txt.rs create mode 100644 src/model/rule.rs 
create mode 100644 src/parser.rs create mode 100644 src/parser/fetched_robots_txt_parser.rs create mode 100644 src/parser/line.rs create mode 100644 src/parser/parse_result.rs create mode 100644 src/parser/robots_txt_parser.rs create mode 100644 src/parser/robots_txt_parser/directive.rs create mode 100644 src/parser/robots_txt_parser/group_builder.rs create mode 100644 src/parser/warning.rs create mode 100644 src/parser/warning_reason.rs create mode 100644 src/service.rs create mode 100644 src/service/fetched_robots_txt.rs create mode 100644 src/service/robots_txt.rs rename tests/{lib.rs => test_lib.rs} (62%) create mode 100644 tests/test_reqwest_async.rs create mode 100644 tests/test_reqwest_blocking.rs create mode 100644 tests/test_warnings.rs diff --git a/.gitignore b/.gitignore index 865d4a7..dea6a61 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ target Cargo.lock .vscode/ +.idea/ \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index 7acd126..241ba9d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,16 +8,25 @@ license = "MIT" name = "robotparser" readme = "README.md" repository = "https://github.com/messense/robotparser-rs" -version = "0.10.2" +version = "0.11.0" +edition = "2018" [dependencies] -url = "1" +url = "2" +percent-encoding = "2.1" [dependencies.reqwest] -version = "0.9" +version = "0.10.1" +optional = true +features = ["blocking"] + +[dependencies.futures] +version = "0.3" optional = true [features] -default = ["http"] -http = ["reqwest"] +default = ["reqwest", "futures"] unstable = [] + +[dev-dependencies] +tokio = "0.2.11" diff --git a/README.md b/README.md index 41b65b2..c7a6b95 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ Add it to your ``Cargo.toml``: ```toml [dependencies] -robotparser = "0.10" +robotparser = "0.11" ``` Add ``extern crate robotparser`` to your crate root and your're good to go! @@ -24,14 +24,17 @@ Add ``extern crate robotparser`` to your crate root and your're good to go! 
## Examples ```rust -extern crate robotparser; - -use robotparser::RobotFileParser; +use robotparser::http::RobotsTxtClient; +use robotparser::service::RobotsTxtService; +use reqwest::Client; +use url::Url; fn main() { - let parser = RobotFileParser::new("http://www.python.org/robots.txt"); - parser.read(); - assert!(parser.can_fetch("*", "http://www.python.org/robots.txt")); + let client = Client::new(); + let robots_txt_url = Url::parse("http://www.python.org/robots.txt").unwrap(); + let robots_txt = client.fetch_robots_txt(robots_txt_url.origin()).unwrap().get_result(); + let fetch_url = Url::parse("http://www.python.org/robots.txt").unwrap(); + assert!(robots_txt.can_fetch("*", &fetch_url)); } ``` diff --git a/src/http.rs b/src/http.rs new file mode 100644 index 0000000..d50aab1 --- /dev/null +++ b/src/http.rs @@ -0,0 +1,19 @@ +//! # Supported libraries +//! To enable support for the required library, you need to add this feature to your `Cargo.toml`. +//! Now only one library is supported - `reqwest`. +//! But you can also add support for other libraries. + +use url::Origin; +#[cfg(feature = "reqwest")] +/// Support for reqwest library. +pub mod reqwest; + +/// User agent of this crate. +pub const DEFAULT_USER_AGENT: &str = "robotparser-rs (https://crates.io/crates/robotparser)"; + +/// Trait to fetch and parse the robots.txt file. +/// Must be implemented on http-client. 
+pub trait RobotsTxtClient { + type Result; + fn fetch_robots_txt(&self, origin: Origin) -> Self::Result; +} \ No newline at end of file diff --git a/src/http/reqwest.rs b/src/http/reqwest.rs new file mode 100644 index 0000000..6c6dfff --- /dev/null +++ b/src/http/reqwest.rs @@ -0,0 +1,4 @@ +mod sync_reqwest; +pub use self::sync_reqwest::*; +mod async_reqwest; +pub use self::async_reqwest::*; \ No newline at end of file diff --git a/src/http/reqwest/async_reqwest.rs b/src/http/reqwest/async_reqwest.rs new file mode 100644 index 0000000..ea87d5f --- /dev/null +++ b/src/http/reqwest/async_reqwest.rs @@ -0,0 +1,76 @@ +use reqwest::{Client, Request}; +use reqwest::{Method, Error}; +use reqwest::header::HeaderValue; +use url::{Origin, Url}; +use reqwest::header::USER_AGENT; +use crate::http::{RobotsTxtClient, DEFAULT_USER_AGENT}; +use crate::parser::{ParseResult, parse_fetched_robots_txt}; +use crate::model::FetchedRobotsTxt; +use std::pin::Pin; +use futures::task::{Context, Poll}; +use futures::Future; +use futures::future::TryFutureExt; +use futures::future::ok as future_ok; + +type FetchFuture = Box>>; + +impl RobotsTxtClient for Client { + type Result = RobotsTxtResponse; + fn fetch_robots_txt(&self, origin: Origin) -> Self::Result { + let url = format!("{}/robots.txt", origin.unicode_serialization()); + let url = Url::parse(&url).expect("Unable to parse robots.txt url"); + let mut request = Request::new(Method::GET, url); + let _ = request.headers_mut().insert(USER_AGENT, HeaderValue::from_static(DEFAULT_USER_AGENT)); + let response = self + .execute(request) + .and_then(|response| { + let response_info = ResponseInfo {status_code: response.status().as_u16()}; + return response.text().and_then(|response_text| { + return future_ok((response_info, response_text)); + }); + }); + let response: Pin>>> = Box::pin(response); + return RobotsTxtResponse { + origin, + response, + } + } +} + +struct ResponseInfo { + status_code: u16, +} + +/// Future for fetching robots.txt 
result. +pub struct RobotsTxtResponse { + origin: Origin, + response: Pin, +} + +impl RobotsTxtResponse { + /// Returns origin of robots.txt + pub fn get_origin(&self) -> &Origin { + return &self.origin; + } +} + +impl Future for RobotsTxtResponse { + type Output = Result, Error>; + + fn poll(self: Pin<&mut Self>, cx: &mut Context) -> Poll { + let self_mut = self.get_mut(); + let response_pin = self_mut.response.as_mut(); + match response_pin.poll(cx) { + Poll::Ready(Ok((response_info, text))) => { + let robots_txt = parse_fetched_robots_txt(self_mut.origin.clone(), response_info.status_code, &text); + return Poll::Ready(Ok(robots_txt)); + }, + Poll::Ready(Err(error)) => { + return Poll::Ready(Err(error)); + }, + Poll::Pending => { + return Poll::Pending; + }, + } + } +} \ No newline at end of file diff --git a/src/http/reqwest/sync_reqwest.rs b/src/http/reqwest/sync_reqwest.rs new file mode 100644 index 0000000..0365d66 --- /dev/null +++ b/src/http/reqwest/sync_reqwest.rs @@ -0,0 +1,23 @@ +use reqwest::blocking::{Client, Request}; +use reqwest::{Method, Error}; +use reqwest::header::HeaderValue; +use url::{Origin, Url}; +use reqwest::header::USER_AGENT; +use crate::http::{RobotsTxtClient, DEFAULT_USER_AGENT}; +use crate::parser::{ParseResult, parse_fetched_robots_txt}; +use crate::model::FetchedRobotsTxt; + +impl RobotsTxtClient for Client { + type Result = Result, Error>; + fn fetch_robots_txt(&self, origin: Origin) -> Self::Result { + let url = format!("{}/robots.txt", origin.unicode_serialization()); + let url = Url::parse(&url).expect("Unable to parse robots.txt url"); + let mut request = Request::new(Method::GET, url); + let _ = request.headers_mut().insert(USER_AGENT, HeaderValue::from_static(DEFAULT_USER_AGENT)); + let response = self.execute(request)?; + let status_code = response.status().as_u16(); + let text = response.text()?; + let robots_txt = parse_fetched_robots_txt(origin, status_code, &text); + return Ok(robots_txt); + } +} \ No newline at end of 
file diff --git a/src/lib.rs b/src/lib.rs index 296144c..f22bb11 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,482 +9,32 @@ //! //! ```toml //! [dependencies] -//! robotparser = "0.10" +//! robotparser = "0.11" //! ``` //! -//! Add ``extern crate robotparser`` to your crate root and your're good to go! //! //! # Examples //! -//! ```rust,ignore -//! extern crate robotparser; -//! -//! use robotparser::RobotFileParser; +//! ```rust +//! use robotparser::http::RobotsTxtClient; +//! use robotparser::service::RobotsTxtService; +//! use reqwest::blocking::Client; +//! use url::Url; //! //! fn main() { -//! let parser = RobotFileParser::new("http://www.python.org/robots.txt"); -//! parser.read(); -//! assert!(parser.can_fetch("*", "http://www.python.org/robots.txt")); +//! let client = Client::new(); +//! let robots_txt_url = Url::parse("http://www.python.org/robots.txt").unwrap(); +//! let robots_txt = client.fetch_robots_txt(robots_txt_url.origin()).unwrap().get_result(); +//! let fetch_url = Url::parse("http://www.python.org/robots.txt").unwrap(); +//! assert!(robots_txt.can_fetch("*", &fetch_url)); //! } //! 
``` -extern crate url; -#[cfg(feature = "http")] -extern crate reqwest; - -#[cfg(feature = "http")] -use std::io::Read; -use std::cell::{Cell, RefCell}; -use std::borrow::Cow; -use std::time::{Duration, SystemTime, UNIX_EPOCH}; - -use url::Url; - -#[cfg(feature = "http")] -use reqwest::Client; -#[cfg(feature = "http")] -use reqwest::header::USER_AGENT; -#[cfg(feature = "http")] -use reqwest::StatusCode; -#[cfg(feature = "http")] -use reqwest::Response; - -#[cfg(feature = "http")] -const RP_USER_AGENT: &str = "robotparser-rs (https://crates.io/crates/robotparser)"; - -/// A rule line is a single "Allow:" (allowance==True) or "Disallow:" -/// (allowance==False) followed by a path.""" -#[derive(Debug, Eq, PartialEq, Clone)] -struct RuleLine<'a> { - path: Cow<'a, str>, - allowance: bool, -} - -#[derive(Debug, Eq, PartialEq, Clone)] -pub struct RequestRate { - pub requests: usize, - pub seconds: usize, -} - -/// An entry has one or more user-agents and zero or more rulelines -#[derive(Debug, Eq, PartialEq, Clone)] -struct Entry<'a> { - useragents: RefCell>, - rulelines: RefCell>>, - crawl_delay: Option, - sitemaps: Vec, - req_rate: Option, -} - -/// robots.txt file parser -#[derive(Debug, Eq, PartialEq, Clone)] -pub struct RobotFileParser<'a> { - entries: RefCell>>, - default_entry: RefCell>, - disallow_all: Cell, - allow_all: Cell, - url: Url, - host: String, - path: String, - last_checked: Cell, -} - - -impl<'a> RuleLine<'a> { - fn new(path: S, allowance: bool) -> RuleLine<'a> - where S: Into> - { - let path = path.into(); - let mut allow = allowance; - if path == "" && !allowance { - // an empty value means allow all - allow = true; - } - RuleLine { - path: path, - allowance: allow, - } - } - - fn applies_to(&self, filename: &str) -> bool { - self.path == "*" || filename.starts_with(&self.path[..]) - } -} - - -impl<'a> Entry<'a> { - fn new() -> Entry<'a> { - Entry { - useragents: RefCell::new(vec![]), - rulelines: RefCell::new(vec![]), - crawl_delay: None, - 
sitemaps: Vec::new(), - req_rate: None, - } - } - - /// check if this entry applies to the specified agent - fn applies_to(&self, useragent: &str) -> bool { - let ua = useragent.split('/').nth(0).unwrap_or("").to_lowercase(); - let useragents = self.useragents.borrow(); - for agent in &*useragents { - if agent == "*" { - return true; - } - if ua.contains(agent) { - return true; - } - } - false - } - - - /// Preconditions: - /// - our agent applies to this entry - /// - filename is URL decoded - fn allowance(&self, filename: &str) -> bool { - let rulelines = self.rulelines.borrow(); - for line in &*rulelines { - if line.applies_to(filename) { - return line.allowance; - } - } - true - } - - fn push_useragent(&self, useragent: &str) { - let mut useragents = self.useragents.borrow_mut(); - useragents.push(useragent.to_lowercase().to_owned()); - } - - fn push_ruleline(&self, ruleline: RuleLine<'a>) { - let mut rulelines = self.rulelines.borrow_mut(); - rulelines.push(ruleline); - } - - fn has_useragent(&self, useragent: &str) -> bool { - let useragents = self.useragents.borrow(); - useragents.contains(&useragent.to_owned()) - } - - fn is_empty(&self) -> bool { - let useragents = self.useragents.borrow(); - let rulelines = self.rulelines.borrow(); - useragents.is_empty() && rulelines.is_empty() - } - - fn set_crawl_delay(&mut self, delay: Duration) { - self.crawl_delay = Some(delay); - } - - fn get_crawl_delay(&self) -> Option { - self.crawl_delay - } - - fn add_sitemap(&mut self, url: &str) { - if let Ok(url) = Url::parse(url) { - self.sitemaps.push(url); - } - } - - fn get_sitemaps(&self) -> Vec { - self.sitemaps.clone() - } - - fn set_req_rate(&mut self, req_rate: RequestRate) { - self.req_rate = Some(req_rate); - } - - fn get_req_rate(&self) -> Option { - self.req_rate.clone() - } -} - - -impl<'a> Default for Entry<'a> { - fn default() -> Entry<'a> { - Entry::new() - } -} - - -impl<'a> RobotFileParser<'a> { - pub fn new>(url: T) -> RobotFileParser<'a> { - let 
parsed_url = Url::parse(url.as_ref()).unwrap(); - RobotFileParser { - entries: RefCell::new(vec![]), - default_entry: RefCell::new(Entry::new()), - disallow_all: Cell::new(false), - allow_all: Cell::new(false), - url: parsed_url.clone(), - host: parsed_url.host_str().unwrap().to_owned(), - path: parsed_url.path().to_owned(), - last_checked: Cell::new(0i64), - } - } - - /// Returns the time the robots.txt file was last fetched. - /// - /// This is useful for long-running web spiders that need to - /// check for new robots.txt files periodically. - pub fn mtime(&self) -> i64 { - self.last_checked.get() - } - - /// Sets the time the robots.txt file was last fetched to the - /// current time. - pub fn modified(&self) { - let now = SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_secs() as i64; - self.last_checked.set(now); - } - - /// Sets the URL referring to a robots.txt file. - pub fn set_url>(&mut self, url: T) { - let parsed_url = Url::parse(url.as_ref()).unwrap(); - self.url = parsed_url.clone(); - self.host = parsed_url.host_str().unwrap().to_owned(); - self.path = parsed_url.path().to_owned(); - self.last_checked.set(0i64); - } - - #[cfg(feature = "http")] - /// Reads the robots.txt URL and feeds it to the parser. - pub fn read(&self) { - let client = Client::new(); - let request = client.get(self.url.clone()); - let request = request.header(USER_AGENT, RP_USER_AGENT.to_owned()); - let mut res = match request.send() { - Ok(res) => res, - Err(_) => { - return; - } - }; - let status = res.status(); - match status { - StatusCode::UNAUTHORIZED | StatusCode::FORBIDDEN => { - self.disallow_all.set(true); - } - status if status >= StatusCode::BAD_REQUEST && status < StatusCode::INTERNAL_SERVER_ERROR => { - self.allow_all.set(true); - } - StatusCode::OK => self.from_response(&mut res), - _ => {} - } - } - - #[cfg(feature = "http")] - /// Reads the HTTP response and feeds it to the parser. 
- pub fn from_response(&self, response: &mut Response) { - let mut buf = String::new(); - response.read_to_string(&mut buf).unwrap(); - let lines: Vec<&str> = buf.split('\n').collect(); - self.parse(&lines); - } - - fn _add_entry(&self, entry: Entry<'a>) { - if entry.has_useragent("*") { - // the default entry is considered last - let mut default_entry = self.default_entry.borrow_mut(); - if default_entry.is_empty() { - // the first default entry wins - *default_entry = entry; - } - } else { - let mut entries = self.entries.borrow_mut(); - entries.push(entry); - } - } - - /// - /// Parse the input lines from a robots.txt file - /// - /// We allow that a user-agent: line is not preceded by - /// one or more blank lines. - /// - pub fn parse>(&self, lines: &[T]) { - use url::percent_encoding::percent_decode; - - // states: - // 0: start state - // 1: saw user-agent line - // 2: saw an allow or disallow line - let mut state = 0; - let mut entry = Entry::new(); - - self.modified(); - for line in lines { - let mut ln = line.as_ref(); - if ln.is_empty() { - match state { - 1 => { - entry = Entry::new(); - state = 0; - } - 2 => { - self._add_entry(entry); - entry = Entry::new(); - state = 0; - } - _ => {} - } - } - // remove optional comment and strip line - if let Some(i) = ln.find('#') { - ln = &ln[0..i]; - } - ln = ln.trim(); - if ln.is_empty() { - continue; - } - let parts: Vec<&str> = ln.splitn(2, ':').collect(); - if parts.len() == 2 { - let part0 = parts[0].trim().to_lowercase(); - let part1 = String::from_utf8(percent_decode(parts[1].trim().as_bytes()).collect()) - .unwrap_or("".to_owned()); - match part0 { - ref x if x == "user-agent" => { - if state == 2 { - self._add_entry(entry); - entry = Entry::new(); - } - entry.push_useragent(&part1); - state = 1; - } - ref x if x == "disallow" => { - if state != 0 { - entry.push_ruleline(RuleLine::new(part1, false)); - state = 2; - } - } - ref x if x == "allow" => { - if state != 0 { - 
entry.push_ruleline(RuleLine::new(part1, true)); - state = 2; - } - } - ref x if x == "crawl-delay" => { - if state != 0 { - if let Ok(delay) = part1.parse::() { - let delay_seconds = delay.trunc(); - let delay_nanoseconds = delay.fract() * 10f64.powi(9); - let delay = Duration::new(delay_seconds as u64, delay_nanoseconds as u32); - entry.set_crawl_delay(delay); - } - state = 2; - } - } - ref x if x == "sitemap" => { - if state != 0 { - entry.add_sitemap(&part1); - state = 2; - } - } - ref x if x == "request-rate" => { - if state != 0 { - let numbers: Vec> = part1.split('/').map(|x| x.parse::()).collect(); - if numbers.len() == 2 && numbers[0].is_ok() && numbers[1].is_ok() { - let req_rate = RequestRate { - requests: numbers[0].clone().unwrap(), - seconds: numbers[1].clone().unwrap(), - }; - entry.set_req_rate(req_rate); - } - state = 2; - } - } - _ => {} - } - } - } - if state == 2 { - self._add_entry(entry); - } - } - - /// Using the parsed robots.txt decide if useragent can fetch url - pub fn can_fetch>(&self, useragent: T, url: T) -> bool { - use url::percent_encoding::percent_decode; - - let useragent = useragent.as_ref(); - let url = url.as_ref(); - - if self.disallow_all.get() { - return false; - } - if self.allow_all.get() { - return true; - } - // Until the robots.txt file has been read or found not - // to exist, we must assume that no url is allowable. - // This prevents false positives when a user erronenously - // calls can_fetch() before calling read(). 
- if self.last_checked.get() == 0 { - return false; - } - // search for given user agent matches - // the first match counts - let decoded_url = String::from_utf8(percent_decode(url.trim().as_bytes()).collect()).unwrap_or("".to_owned()); - let url_str = match decoded_url { - ref u if !u.is_empty() => u.to_owned(), - _ => "/".to_owned(), - }; - let entries = self.entries.borrow(); - for entry in &*entries { - if entry.applies_to(useragent) { - return entry.allowance(&url_str); - } - } - // try the default entry last - let default_entry = self.default_entry.borrow(); - if !default_entry.is_empty() { - return default_entry.allowance(&url_str); - } - // agent not found ==> access granted - true - } - - /// Returns the crawl delay for this user agent as a `Duration`, or None if no crawl delay is defined. - pub fn get_crawl_delay>(&self, useragent: T) -> Option { - let useragent = useragent.as_ref(); - if self.last_checked.get() == 0 { - return None; - } - let entries = self.entries.borrow(); - for entry in &*entries { - if entry.applies_to(useragent) { - return entry.get_crawl_delay(); - } - } - None - } - - /// Returns the sitemaps for this user agent as a `Vec`. - pub fn get_sitemaps>(&self, useragent: T) -> Vec { - let useragent = useragent.as_ref(); - if self.last_checked.get() == 0 { - return Vec::new(); - } - let entries = self.entries.borrow(); - for entry in &*entries { - if entry.applies_to(useragent) { - return entry.get_sitemaps(); - } - } - vec![] - } - - /// Returns the request rate for this user agent as a `RequestRate`, or None if not request rate is defined - pub fn get_req_rate>(&self, useragent: T) -> Option { - let useragent = useragent.as_ref(); - if self.last_checked.get() == 0 { - return None; - } - let entries = self.entries.borrow(); - for entry in &*entries { - if entry.applies_to(useragent) { - return entry.get_req_rate(); - } - } - None - } -} +/// Contains models of robots.txt file. +pub mod model; +/// Contains robots.txt parsers. 
+pub mod parser; +/// Contains robots.txt services. +pub mod service; +/// Request builder & response parsers for other http libraries. +pub mod http; diff --git a/src/model.rs b/src/model.rs new file mode 100644 index 0000000..483385d --- /dev/null +++ b/src/model.rs @@ -0,0 +1,17 @@ +mod path_pattern; +pub (crate) use self::path_pattern::PathPattern; +mod group; +pub (crate) use self::group::Group; +mod rule; +pub (crate) use self::rule::Rule; +mod clean_params; +pub (crate) use self::clean_params::CleanParams; +mod request_rate; +pub use self::request_rate::RequestRate; +mod robots_txt; +pub use self::fetched_robots_txt::FetchedRobotsTxt; +pub (crate) use self::fetched_robots_txt::FetchedRobotsTxtContainer; +mod fetched_robots_txt; +pub use self::robots_txt::RobotsTxt; +mod path; +pub (crate) use self::path::Path; \ No newline at end of file diff --git a/src/model/clean_params.rs b/src/model/clean_params.rs new file mode 100644 index 0000000..f64a4c1 --- /dev/null +++ b/src/model/clean_params.rs @@ -0,0 +1,24 @@ +use crate::model::PathPattern; + +#[derive(Debug, Clone)] +pub struct CleanParams { + path_pattern: PathPattern, + params: Vec, +} + +impl CleanParams { + pub fn new(path_pattern: PathPattern, params: Vec) -> CleanParams { + return CleanParams { + path_pattern, + params, + } + } + + pub fn get_path_pattern(&self) -> &PathPattern { + return &self.path_pattern; + } + + pub fn get_params(&self) -> &Vec { + return &self.params; + } +} \ No newline at end of file diff --git a/src/model/fetched_robots_txt.rs b/src/model/fetched_robots_txt.rs new file mode 100644 index 0000000..6adc9e8 --- /dev/null +++ b/src/model/fetched_robots_txt.rs @@ -0,0 +1,36 @@ +use crate::model::robots_txt::RobotsTxt; +use std::time::SystemTime; + +#[derive(Debug, Clone)] +pub (crate) enum FetchedRobotsTxtContainer { + FetchDenied, + FetchFailed, + Fetched(RobotsTxt), +} + +#[derive(Debug, Clone)] +/// A model of the robots.txt file that was downloaded over the network. 
+/// This model takes into account HTTP response codes when loading the robots.txt file. +/// To work with this model you should use the trait `robotparser::service::RobotsTxtService`. +/// To create this structure you should use the `robotparser::parser::parse_fetched_robots_txt`. +pub struct FetchedRobotsTxt { + fetched_at: SystemTime, + container: FetchedRobotsTxtContainer, +} + +impl FetchedRobotsTxt { + pub (crate) fn new(container: FetchedRobotsTxtContainer) -> FetchedRobotsTxt { + FetchedRobotsTxt { + fetched_at: SystemTime::now(), + container, + } + } + pub (crate) fn get_container(&self) -> &FetchedRobotsTxtContainer { + return &self.container; + } + + /// Returns the system time when the robots.txt file was downloaded over the network. + pub fn get_fetched_at(&self) -> &SystemTime { + return &self.fetched_at; + } +} \ No newline at end of file diff --git a/src/model/group.rs b/src/model/group.rs new file mode 100644 index 0000000..d394f29 --- /dev/null +++ b/src/model/group.rs @@ -0,0 +1,93 @@ +use std::time::Duration; +use crate::model::request_rate::RequestRate; +use crate::model::rule::Rule; + +/// An group has one or more user-agents and zero or more rules +#[derive(Debug, Clone)] +pub struct Group { + user_agents: Vec, + rules: Vec, + crawl_delay: Option, + req_rate: Option, +} + +impl Group { + pub (crate) fn new() -> Group { + Group { + user_agents: vec![], + rules: vec![], + crawl_delay: None, + req_rate: None, + } + } + + /// check if this group applies to the specified agent + pub (crate) fn applies_to(&self, useragent: &str) -> bool { + let ua = useragent.split('/').nth(0).unwrap_or("").to_lowercase(); + for agent in self.user_agents.iter() { + if ua.contains(agent) { + return true; + } + } + false + } + + pub (crate) fn push_useragent(&mut self, useragent: &str) { + self.user_agents.push(useragent.to_lowercase().to_owned()); + } + + pub (crate) fn push_rule(&mut self, rule: Rule) { + self.rules.push(rule); + } + + pub (crate) fn 
get_rules_sorted_by_path_len_desc(&self) -> Vec<&Rule> { + let mut rules: Vec<&Rule> = self.rules.iter().collect(); + rules.sort_by(|a, b| { + let a = a.get_path_pattern().len(); + let b = b.get_path_pattern().len(); + return b.cmp(&a); + }); + return rules; + } + + pub (crate) fn contains_user_agent(&self, user_agent: &str) -> bool { + return self + .user_agents + .iter() + .find(|item| { + return *item == user_agent; + }).is_some(); + } + + pub (crate) fn set_crawl_delay(&mut self, delay: Duration) { + self.crawl_delay = Some(delay); + } + + pub (crate) fn get_crawl_delay(&self) -> Option { + return self.crawl_delay.clone(); + } + + pub (crate) fn set_req_rate(&mut self, req_rate: RequestRate) { + self.req_rate = Some(req_rate); + } + + pub (crate) fn get_req_rate(&self) -> Option { + return self.req_rate.clone(); + } + + pub (crate) fn is_default(&self) -> bool { + for user_agent in self.user_agents.iter() { + if user_agent == "*" { + return true; + } + } + return false; + } +} + + +impl Default for Group { + fn default() -> Group { + Group::new() + } +} diff --git a/src/model/path.rs b/src/model/path.rs new file mode 100644 index 0000000..6eb16bc --- /dev/null +++ b/src/model/path.rs @@ -0,0 +1,35 @@ +use url::Url; +use percent_encoding::percent_decode; + +#[derive(Debug)] +pub struct Path(String); + +impl Path { + pub fn from_url(url: &Url) -> Path { + let path = get_url_without_origin(&url); + let path = percent_decode(path.as_bytes()).decode_utf8_lossy(); + if path.is_empty() { + return Path("/".into()); + } else { + return Path(path.into()); + } + } + pub fn as_str(&self) -> &str { + return &self.0; + } +} + +fn get_url_without_origin(url: &Url) -> &str { + let origin = url.origin(); + let url = url.as_str(); + let unicode_origin = origin.unicode_serialization(); + let ascii_origin = origin.ascii_serialization(); + if url.starts_with(&unicode_origin) && unicode_origin.len() >= 1 { + return &url[unicode_origin.len()..]; + } + if 
url.starts_with(&ascii_origin) && ascii_origin.len() >= 1 { + return &url[ascii_origin.len()..]; + } + // Must never be executed. + panic!("Unable to get path from url"); +} diff --git a/src/model/path_pattern.rs b/src/model/path_pattern.rs new file mode 100644 index 0000000..ad5e856 --- /dev/null +++ b/src/model/path_pattern.rs @@ -0,0 +1,127 @@ +use std::convert::From; +use std::mem::replace; +use percent_encoding::percent_decode; +use crate::model::path::Path; + +#[derive(Debug, Clone)] +pub struct PathPattern(Vec); + +#[derive(Debug, Eq, PartialEq, Clone)] +enum PathPatternToken { + Text(String), + AnyString, + TerminateString, +} + +impl PathPatternToken { + fn from_path_pattern(path: String) -> PathPatternToken { + let path = percent_decode(path.as_bytes()).decode_utf8_lossy(); + return PathPatternToken::Text(path.to_string()); + } +} + +impl PathPatternToken { + fn len(&self) -> usize { + return match self { + &PathPatternToken::Text(ref text) => { + text.len() + }, + &PathPatternToken::AnyString => { + 1 + }, + &PathPatternToken::TerminateString => { + 1 + }, + } + } +} + +impl PathPattern { + pub fn new(path: &str) -> PathPattern { + let mut text = String::new(); + let mut tokens = Vec::new(); + for c in path.chars() { + let prepared_token = match c { + '*' => { + Some(PathPatternToken::AnyString) + }, + '$' => { + Some(PathPatternToken::TerminateString) + }, + _ => { + text.push(c); + None + }, + }; + if let Some(prepared_token) = prepared_token { + if !text.is_empty() { + tokens.push(PathPatternToken::from_path_pattern(replace(&mut text, "".into()))); + } + tokens.push(prepared_token); + } + } + if !text.is_empty() { + tokens.push(PathPatternToken::from_path_pattern(text)); + } + if let Some(&PathPatternToken::Text(..)) = tokens.last() { + tokens.push(PathPatternToken::AnyString); + } + tokens.dedup(); + return PathPattern(tokens); + } + + pub fn all() -> PathPattern { + return PathPattern(vec![PathPatternToken::AnyString]); + } + + pub fn 
applies_to(&self, path: &Path) -> bool { + let mut filename = path.as_str(); + for (index, token) in self.0.iter().enumerate() { + match token { + &PathPatternToken::Text(ref text) => { + if !filename.starts_with(text) { + return false; + } + filename = &filename[text.len() ..]; + }, + &PathPatternToken::AnyString => { + if let Some(&PathPatternToken::Text(ref text)) = self.0.get(index + 1) { + while filename.len() >= 1 { + if filename.starts_with(text) { + break; + } + // Search for next unicode char. + if let Some((next_char_index, _)) = filename.char_indices().nth(1) { + filename = &filename[next_char_index..]; + } else { + break; + } + } + } else { + filename = &filename[filename.len()..]; + } + }, + &PathPatternToken::TerminateString => { + if filename.len() != 0 { + return false; + } + }, + } + } + return true; + } + + pub fn len(&self) -> usize { + let mut length = 0; + for path_token in self.0.iter() { + length += path_token.len(); + } + return length; + } +} + +impl From<&str> for PathPattern { + fn from(path: &str) -> Self { + return PathPattern::new(path); + } +} diff --git a/src/model/request_rate.rs b/src/model/request_rate.rs new file mode 100644 index 0000000..f4b203a --- /dev/null +++ b/src/model/request_rate.rs @@ -0,0 +1,9 @@ +#[derive(Debug, Clone)] +/// The model of limiting the frequency of requests to the server. +/// It's set by the `Request-Rate` directive. +/// # Example +/// For the directive `Request-Rate: 1/5` is equivalent to the model `RequestRate {requests: 1, seconds: 5}` +pub struct RequestRate { + pub requests: usize, + pub seconds: usize, +} diff --git a/src/model/robots_txt.rs b/src/model/robots_txt.rs new file mode 100644 index 0000000..7952b2a --- /dev/null +++ b/src/model/robots_txt.rs @@ -0,0 +1,75 @@ +use crate::model::group::Group; +use crate::model::clean_params::CleanParams; +use url::{Url, Origin}; + +#[derive(Debug, Clone)] +/// The robots.txt model that was obtained after parsing the text of the robots.txt file. 
+/// To work with this model you should use the trait `robotparser::service::RobotsTxtService`. +/// To create this structure you should use the `robotparser::parser::parse_robots_txt`. +pub struct RobotsTxt { + origin: Origin, + groups: Vec, + sitemaps: Vec, + clean_params: Vec, +} + +impl RobotsTxt { + pub (crate) fn new(origin: Origin) -> RobotsTxt { + return RobotsTxt { + origin, + groups: Vec::new(), + sitemaps: Vec::new(), + clean_params: Vec::new(), + } + } + + pub (crate) fn add_sitemap(&mut self, url: Url) { + self.sitemaps.push(url); + } + + pub (crate) fn get_sitemaps_slice(&self) -> &[Url] { + return self.sitemaps.as_slice(); + } + + pub (crate) fn add_clean_params(&mut self, clean_params: CleanParams) { + self.clean_params.push(clean_params); + } + + pub (crate) fn get_clean_params(&self) -> &[CleanParams] { + return self.clean_params.as_slice(); + } + + pub (crate) fn add_group(&mut self, group: Group) { + self.groups.push(group); + } + + pub (crate) fn get_origin(&self) -> &Origin { + return &self.origin; + } + + pub (crate) fn find_in_group<'a, T>(&'a self, user_agent: &str, callback: impl Fn(&'a Group) -> Option) -> Option { + // Search by user agents + for group in self.groups.iter() { + if group.applies_to(user_agent) { + if let Some(output) = (callback)(group) { + return Some(output); + } + } + } + if let Some(group) = self.get_default_group() { + if let Some(output) = (callback)(group) { + return Some(output); + } + } + return None; + } + + pub (crate) fn get_default_group(&self) -> Option<&Group> { + for group in self.groups.iter() { + if group.is_default() { + return Some(group); + } + } + return None; + } +} \ No newline at end of file diff --git a/src/model/rule.rs b/src/model/rule.rs new file mode 100644 index 0000000..748f713 --- /dev/null +++ b/src/model/rule.rs @@ -0,0 +1,31 @@ +use crate::model::path_pattern::PathPattern; +use crate::model::path::Path; + +/// A rule line is a single "Allow:" (allowance==True) or "Disallow:" +/// 
(allowance==False) followed by a path. +#[derive(Debug, Clone)] +pub struct Rule { + path_pattern: PathPattern, + allowance: bool, +} + +impl Rule { + pub fn new(path_pattern: impl Into, allowance: bool) -> Rule { + Rule { + path_pattern: path_pattern.into(), + allowance, + } + } + + pub (crate) fn applies_to(&self, path: &Path) -> bool { + return self.path_pattern.applies_to(path); + } + + pub (crate) fn get_allowance(&self) -> bool { + return self.allowance; + } + + pub (crate) fn get_path_pattern(&self) -> &PathPattern { + return &self.path_pattern; + } +} \ No newline at end of file diff --git a/src/parser.rs b/src/parser.rs new file mode 100644 index 0000000..94dcba0 --- /dev/null +++ b/src/parser.rs @@ -0,0 +1,40 @@ +//! # Supported features and directives +//! +//! * Removes BOM unicode +//! * Directive `User-Agent` +//! * Directive `Allow` +//! * Directive `Disallow` +//! * Directive `Crawl-Delay` +//! * Directive `Request-Rate` +//! * Directive `Sitemap` +//! * Directive `Clean-Param` +//! +//! # Example +//! ```rust +//! use robotparser::parser::parse_robots_txt; +//! use robotparser::service::RobotsTxtService; +//! use url::Url; +//! +//! fn main() { +//! let robots_txt_url = Url::parse("http://google.com/robots.txt").unwrap(); +//! let robots_txt = "User-agent: *\nDisallow: /search"; +//! let robots_txt = parse_robots_txt(robots_txt_url.origin(), robots_txt); +//! assert_eq!(robots_txt.get_warnings().len(), 0); +//! let robots_txt = robots_txt.get_result(); +//! let good_url = Url::parse("http://google.com/test").unwrap(); +//! let bad_url = Url::parse("http://google.com/search/vvv").unwrap(); +//! assert_eq!(robots_txt.can_fetch("*", &bad_url), false); +//! assert_eq!(robots_txt.can_fetch("*", &good_url), true); +//! } +//! 
``` +mod robots_txt_parser; +pub use self::robots_txt_parser::parse as parse_robots_txt; +mod warning_reason; +pub use self::warning_reason::WarningReason; +mod warning; +pub use self::warning::ParseWarning; +mod parse_result; +pub use self::parse_result::ParseResult; +mod fetched_robots_txt_parser; +pub use self::fetched_robots_txt_parser::parse as parse_fetched_robots_txt; +mod line; \ No newline at end of file diff --git a/src/parser/fetched_robots_txt_parser.rs b/src/parser/fetched_robots_txt_parser.rs new file mode 100644 index 0000000..23cd100 --- /dev/null +++ b/src/parser/fetched_robots_txt_parser.rs @@ -0,0 +1,28 @@ +use crate::model::{FetchedRobotsTxt, FetchedRobotsTxtContainer}; +use crate::parser::ParseResult; +use crate::parser::parse_robots_txt; +use url::Origin; + +const UNAUTHORIZED: u16 = 401; +const FORBIDDEN: u16 = 403; +const OK: u16 = 200; + +/// Parses the text of the robots.txt file located in the specified place of origin, +/// taking into account the response status code of the HTTP-request. +/// **IMPORTANT NOTE**: origin must point to robots.txt url **before redirects**. 
+pub fn parse(origin: Origin, status_code: u16, input: &str) -> ParseResult { + match status_code { + UNAUTHORIZED | FORBIDDEN => { + return ParseResult::new(FetchedRobotsTxt::new(FetchedRobotsTxtContainer::FetchDenied)); + } + OK => { + return parse_robots_txt(origin, input) + .map(|result| { + return FetchedRobotsTxt::new(FetchedRobotsTxtContainer::Fetched(result)); + }); + }, + _ => { + return ParseResult::new(FetchedRobotsTxt::new(FetchedRobotsTxtContainer::FetchFailed)); + } + } +} \ No newline at end of file diff --git a/src/parser/line.rs b/src/parser/line.rs new file mode 100644 index 0000000..4db7fe6 --- /dev/null +++ b/src/parser/line.rs @@ -0,0 +1,21 @@ +pub struct Line<'a> { + line: &'a str, + position: usize, +} + +impl <'a>Line<'a> { + pub fn new(line: &'a str, position: usize) -> Line<'a> { + return Line { + line, + position, + } + } + + pub fn get_line_text(&self) -> &str { + return self.line; + } + + pub fn get_line_number(&self) -> usize { + return self.position; + } +} \ No newline at end of file diff --git a/src/parser/parse_result.rs b/src/parser/parse_result.rs new file mode 100644 index 0000000..7e315c8 --- /dev/null +++ b/src/parser/parse_result.rs @@ -0,0 +1,62 @@ +use crate::parser::warning::ParseWarning; +use std::fmt::Debug; + +#[derive(Debug)] +/// The result of the robots.txt parser. +pub struct ParseResult where R: Debug { + result: R, + warnings: Vec, +} + +impl ParseResult where R: Debug { + /// Creates a new structure for parser results. + pub (crate) fn new(result: R) -> ParseResult{ + return ParseResult { + result, + warnings: Vec::new(), + } + } + + /// Creates a new structure for parser results with warnings. + pub (crate) fn new_with_warnings(result: R, warnings: Vec) -> ParseResult{ + return ParseResult { + result, + warnings, + } + } + + /// Returns the result of the robots.txt parser. + pub fn get_result(self) -> R { + return self.result; + } + + /// Returns the robots.txt parser warning array. 
+ pub fn get_warnings(&self) -> &[ParseWarning] { + return self.warnings.as_slice(); + } + + /// Returns reference to result of the robots.txt parser or first warning. + pub fn ok_ref(&self) -> Result<&R, &ParseWarning> { + if let Some(warning) = self.warnings.first() { + return Err(warning); + } + return Ok(&self.result); + } + + /// Returns the result of the robots.txt parser or first warning. + pub fn ok(mut self) -> Result { + if self.warnings.is_empty() { + return Ok(self.result); + } + let first_warning = self.warnings.remove(0); + return Err(first_warning); + } + + /// Converts this structure into another type of structure. + pub (crate) fn map(self, callback: impl Fn(R) -> T) -> ParseResult where T: Debug { + return ParseResult { + result: (callback)(self.result), + warnings: self.warnings, + } + } +} \ No newline at end of file diff --git a/src/parser/robots_txt_parser.rs b/src/parser/robots_txt_parser.rs new file mode 100644 index 0000000..798fe89 --- /dev/null +++ b/src/parser/robots_txt_parser.rs @@ -0,0 +1,281 @@ +use url::{Origin, Url}; +use std::time::Duration; +use crate::parser::parse_result::ParseResult; +use crate::model::{RobotsTxt, Rule, PathPattern, CleanParams, RequestRate}; +use crate::parser::line::Line; +use crate::parser::warning::ParseWarning; +mod directive; +use self::directive::Directive; +mod group_builder; +pub use self::group_builder::GroupBuilder; + +const COMMENT_BEGIN_CHAR: char = '#'; +const KV_SEPARATOR: &'static str = ":"; + +/// Parses the text of the robots.txt file located in the specified origin. 
+pub fn parse(origin: Origin, input: &str) -> ParseResult { + let parser = Parser::new(origin); + return parser.parse(input); +} + +struct Parser { + result: RobotsTxt, + group_builder: GroupBuilder, + warnings: Vec, +} + +impl Parser { + pub fn new(origin: Origin) -> Parser { + return Parser { + result: RobotsTxt::new(origin), + group_builder: GroupBuilder::new(), + warnings: Vec::new(), + } + } + + pub fn parse(mut self, input: &str) -> ParseResult { + let input = ignore_bom(input); + let mut line_no = 0; + for line in input.lines() { + line_no += 1; + let line = Line::new(line, line_no); + match Self::parse_line(&line) { + Ok(Some(line_value)) => { + self.process_line_value(&line, &line_value); + }, + Err(warning) => { + self.warnings.push(warning); + }, + _ => {}, + } + } + self.group_builder.fill_entries(&mut self.result); + return ParseResult::new_with_warnings(self.result, self.warnings); + } + + fn parse_line<'a>(line: &'a Line) -> Result>, ParseWarning> { + let mut kv_part = line.get_line_text(); + if let Some(comment_separator_position) = line.get_line_text().find(COMMENT_BEGIN_CHAR) { + kv_part = &kv_part[0..comment_separator_position]; + } + if kv_part.is_empty() { + return Ok(None); + } + let separator_index = kv_part.find(KV_SEPARATOR).ok_or_else(|| { + return ParseWarning::invalid_directive_format(line); + })?; + if separator_index >= kv_part.len() { + return Err(ParseWarning::invalid_directive_format(line)); + } + let key = &kv_part[0..separator_index]; + let key = key.trim(); + if key.is_empty() { + return Err(ParseWarning::directive_key_is_empty(line)); + } + let value = &kv_part[separator_index + 1..]; + let value = value.trim(); + let result = Directive::new(key, value); + return Ok(Some(result)); + } + + fn process_line_value(&mut self, line: &Line, directive: &Directive) { + let key = directive.get_key_lowercase(); + match key.as_str() { + // Group specific directives + "user-agent" => { + self.process_directive_user_agent(line, directive); + 
}, + "allow" => { + self.process_directive_allow(line, directive); + }, + "disallow" => { + self.process_directive_disallow(line, directive); + }, + "crawl-delay" => { + self.process_directive_crawl_delay(line, directive); + }, + "request-rate" => { + self.process_directive_request_rate(line, directive); + }, + // Non-group directives + "sitemap" => { + self.process_directive_sitemap(line, directive); + }, + "clean-param" => { + self.process_directive_clean_param(line, directive); + }, + _ => { + self.warnings.push(ParseWarning::unsupported_directive_key(line, key)); + }, + } + } + + fn process_directive_user_agent(&mut self, line: &Line, directive: &Directive) { + let user_agent = directive.get_value(); + if user_agent.is_empty() { + self.warnings.push(ParseWarning::user_agent_cannot_be_empty(line)); + return; + } + self.group_builder.handle_user_agent(user_agent); + } + + fn process_directive_allow(&mut self, line: &Line, directive: &Directive) { + if let Some(group) = self.group_builder.get_mut_active_group() { + if directive.get_value() == "" { + // Nothing to do. Ignoring. + } else if directive.get_value().starts_with("*") || directive.get_value().starts_with("/") { + group.push_rule(Rule::new(directive.get_value(), true)); + } else { + self.warnings.push(ParseWarning::wrong_path_format(line)); + } + } else { + self.warnings.push(ParseWarning::directive_without_user_agent(line)); + } + } + + fn process_directive_disallow(&mut self, line: &Line, directive: &Directive) { + if let Some(group) = self.group_builder.get_mut_active_group() { + if directive.get_value() == "" { + // Allow all. 
+ group.push_rule(Rule::new(PathPattern::all(), true)); + } else if directive.get_value().starts_with("*") || directive.get_value().starts_with("/") { + group.push_rule(Rule::new(directive.get_value(), false)); + } else { + self.warnings.push(ParseWarning::wrong_path_format(line)); + } + } else { + self.warnings.push(ParseWarning::directive_without_user_agent(line)); + } + } + + fn process_directive_crawl_delay(&mut self, line: &Line, directive: &Directive) { + if let Some(group) = self.group_builder.get_mut_active_group() { + match directive.get_value().parse::() { + Ok(delay) => { + let delay_seconds = delay.trunc(); + let delay_nanoseconds = delay.fract() * 10f64.powi(9); + let delay = Duration::new(delay_seconds as u64, delay_nanoseconds as u32); + group.set_crawl_delay(delay); + }, + Err(error) => { + self.warnings.push(ParseWarning::parse_crawl_delay_error(line, error)); + }, + } + } else { + self.warnings.push(ParseWarning::directive_without_user_agent(line)); + } + } + + fn process_directive_request_rate(&mut self, line: &Line, directive: &Directive) { + if let Some(group) = self.group_builder.get_mut_active_group() { + let numbers: Vec<&str> = directive.get_value().split('/').collect(); + if numbers.len() != 2 { + self.warnings.push(ParseWarning::wrong_request_rate_format(line)); + return; + } + let requests = match numbers[0].parse::() { + Ok(requests) => {requests}, + Err(error) => { + self.warnings.push(ParseWarning::parse_request_rate(line, error)); + return; + }, + }; + let seconds = match numbers[1].parse::() { + Ok(seconds) => {seconds}, + Err(error) => { + self.warnings.push(ParseWarning::parse_request_rate(line, error)); + return; + }, + }; + group.set_req_rate(RequestRate{requests, seconds}); + } else { + self.warnings.push(ParseWarning::directive_without_user_agent(line)); + } + } + + fn process_directive_sitemap(&mut self, line: &Line, directive: &Directive) { + match Url::parse(directive.get_value()) { + Ok(sitemap_url) => { + 
self.result.add_sitemap(sitemap_url); + }, + Err(error) => { + self.warnings.push(ParseWarning::parse_url(line, error)); + }, + } + } + + fn process_directive_clean_param(&mut self, line: &Line, directive: &Directive) { + let parts: Vec<&str> = directive.get_value().split_whitespace().collect(); + if parts.len() >= 3 || parts.len() == 0 { + self.warnings.push(ParseWarning::wrong_clean_param_format(line)); + return; + } + if parts[0].len() == 0 { + self.warnings.push(ParseWarning::wrong_clean_param_format(line)); + return; + } + let clean_params_path_pattern; + let clean_params; + if let Some(second_param) = parts.get(1) { + if second_param.len() == 0 { + self.warnings.push(ParseWarning::wrong_clean_param_format(line)); + return; + } + clean_params_path_pattern = PathPattern::new(parts[0]); + clean_params = *second_param; + } else { + clean_params_path_pattern = PathPattern::all(); + clean_params = parts[0]; + } + let (valid_clean_params, invalid_clean_params) = Self::parse_clean_params(clean_params); + if !invalid_clean_params.is_empty() { + self.warnings.push(ParseWarning::ignored_clean_params(line, invalid_clean_params)); + } + self.result.add_clean_params(CleanParams::new(clean_params_path_pattern, valid_clean_params)); + } + + fn parse_clean_params(clean_params: &str) -> (Vec, Vec) { + let mut valid = Vec::new(); + let mut invalid = Vec::new(); + for clean_param in clean_params.split('&') { + if !clean_param.is_empty() { + if Self::is_valid_clean_param(clean_param) { + valid.push(clean_param.into()); + } else { + invalid.push(clean_param.into()); + } + } + } + return (valid, invalid); + } + + fn is_valid_clean_param(clean_param: &str) -> bool { + for c in clean_param.chars() { + let mut is_valid = false; + if ('A'..='Z').contains(&c) { + is_valid = true; + } + if ('a'..='z').contains(&c) { + is_valid = true; + } + if ('0'..='9').contains(&c) { // Inclusive bounds: 'Z', 'z' and '9' are valid characters too. + is_valid = true; + } + if c == '.' 
|| c == '-' || c == '_' { + is_valid = true; + } + if !is_valid { + return false; + } + } + return true; + } +} + +fn ignore_bom(input: &str) -> &str { + const BOM: &'static str = "\u{feff}"; + if input.starts_with(BOM) { + return &input[BOM.len()..]; + } + return input; +} \ No newline at end of file diff --git a/src/parser/robots_txt_parser/directive.rs b/src/parser/robots_txt_parser/directive.rs new file mode 100644 index 0000000..2bafea1 --- /dev/null +++ b/src/parser/robots_txt_parser/directive.rs @@ -0,0 +1,21 @@ +pub struct Directive<'a> { + key: &'a str, + value: &'a str, +} + +impl <'a> Directive<'a> { + pub fn new(key: &'a str, value: &'a str) -> Directive<'a> { + return Directive { + key, + value, + } + } + + pub fn get_key_lowercase(&self) -> String { + return self.key.to_lowercase(); + } + + pub fn get_value(&self) -> &str { + return self.value; + } +} \ No newline at end of file diff --git a/src/parser/robots_txt_parser/group_builder.rs b/src/parser/robots_txt_parser/group_builder.rs new file mode 100644 index 0000000..225c052 --- /dev/null +++ b/src/parser/robots_txt_parser/group_builder.rs @@ -0,0 +1,54 @@ +use crate::model::{Group, RobotsTxt}; +enum State { + WaitingForNewGroup, + WaitingForAdditionalUserAgent, +} + +pub struct GroupBuilder { + state: State, + active_group: Option, + groups: Vec, +} + +impl GroupBuilder { + pub fn new() -> GroupBuilder { + return GroupBuilder { + state: State::WaitingForNewGroup, + active_group: None, + groups: Vec::new(), + } + } + + pub fn handle_user_agent(&mut self, user_agent: &str) { + match self.state { + State::WaitingForNewGroup => { + let mut group = Group::new(); + group.push_useragent(user_agent); + self.groups.push(group); + self.active_group = Some(self.groups.len() - 1); + self.state = State::WaitingForAdditionalUserAgent; + }, + State::WaitingForAdditionalUserAgent => { + let active_group = self.active_group.expect("Unable to get active group"); + let group = 
self.groups.get_mut(active_group).expect("Unable to get group index"); + if !group.contains_user_agent(user_agent) { + group.push_useragent(user_agent); + } + }, + } + } + + pub fn get_mut_active_group(&mut self) -> Option<&mut Group> { + self.state = State::WaitingForNewGroup; + if let Some(active_group) = self.active_group { + return self.groups.get_mut(active_group); + } + return None; + } + + pub fn fill_entries(mut self, robots_txt: &mut RobotsTxt) { + for group in self.groups.drain(..) { + robots_txt.add_group(group); + } + } +} \ No newline at end of file diff --git a/src/parser/warning.rs b/src/parser/warning.rs new file mode 100644 index 0000000..a6a73d8 --- /dev/null +++ b/src/parser/warning.rs @@ -0,0 +1,136 @@ +use super::line::Line; +use super::warning_reason::WarningReason; +use url::ParseError as ParseUrlError; +use std::num::{ParseFloatError, ParseIntError}; +use std::fmt; +use std::error::Error; + +#[derive(Clone, Debug)] +/// Warning of robots.txt parser about problems when parsing robots.txt file. +pub struct ParseWarning { + line_no: usize, + line: String, + reason: WarningReason, +} + +impl Error for ParseWarning {} + +impl ParseWarning { + /// Returns the line number in the text of the robots.txt file. + pub fn get_line_no(&self) -> usize { + return self.line_no; + } + + /// Returns the text of the robots.txt file string. + pub fn get_line_text(&self) -> &String { + return &self.line; + } + + /// Returns the reason of warning. 
+ pub fn get_reason(&self) -> &WarningReason { + return &self.reason; + } + + pub (crate) fn invalid_directive_format(line: &Line) -> ParseWarning { + return ParseWarning { + line_no: line.get_line_number(), + line: line.get_line_text().into(), + reason: WarningReason::InvalidDirectiveFormat, + } + } + + pub (crate) fn directive_key_is_empty(line: &Line) -> ParseWarning { + return ParseWarning { + line_no: line.get_line_number(), + line: line.get_line_text().into(), + reason: WarningReason::DirectiveKeyIsEmpty, + } + } + + pub (crate) fn unsupported_directive_key(line: &Line, key: String) -> ParseWarning { + return ParseWarning { + line_no: line.get_line_number(), + line: line.get_line_text().into(), + reason: WarningReason::UnsupportedDirectiveKey(key), + } + } + + pub (crate) fn user_agent_cannot_be_empty(line: &Line) -> ParseWarning { + return ParseWarning { + line_no: line.get_line_number(), + line: line.get_line_text().into(), + reason: WarningReason::UserAgentCannotBeEmpty, + } + } + + pub (crate) fn wrong_path_format(line: &Line) -> ParseWarning { + return ParseWarning { + line_no: line.get_line_number(), + line: line.get_line_text().into(), + reason: WarningReason::WrongPathFormat, + } + } + + pub (crate) fn directive_without_user_agent(line: &Line) -> ParseWarning { + return ParseWarning { + line_no: line.get_line_number(), + line: line.get_line_text().into(), + reason: WarningReason::DirectiveWithoutUserAgent, + } + } + + pub (crate) fn parse_crawl_delay_error(line: &Line, error: ParseFloatError) -> ParseWarning { + return ParseWarning { + line_no: line.get_line_number(), + line: line.get_line_text().into(), + reason: WarningReason::ParseCrawlDelayError(error), + } + } + + pub (crate) fn wrong_request_rate_format(line: &Line) -> ParseWarning { + return ParseWarning { + line_no: line.get_line_number(), + line: line.get_line_text().into(), + reason: WarningReason::WrongRequestRateFormat, + } + } + + pub (crate) fn parse_request_rate(line: &Line, error: 
ParseIntError) -> ParseWarning { + return ParseWarning { + line_no: line.get_line_number(), + line: line.get_line_text().into(), + reason: WarningReason::ParseRequestRate(error), + } + } + + pub (crate) fn parse_url(line: &Line, error: ParseUrlError) -> ParseWarning { + return ParseWarning { + line_no: line.get_line_number(), + line: line.get_line_text().into(), + reason: WarningReason::ParseUrl(error), + } + } + + pub (crate) fn wrong_clean_param_format(line: &Line) -> ParseWarning { + return ParseWarning { + line_no: line.get_line_number(), + line: line.get_line_text().into(), + reason: WarningReason::WrongCleanParamFormat, + } + } + + pub (crate) fn ignored_clean_params(line: &Line, ignored_clean_params: Vec) -> ParseWarning { + return ParseWarning { + line_no: line.get_line_number(), + line: line.get_line_text().into(), + reason: WarningReason::IgnoredCleanParams(ignored_clean_params), + } + } +} + +/// Displays text of warning. +impl fmt::Display for ParseWarning { + fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { + write!(f, "Line: {}. Text: `{}`. {}", self.line_no, self.line, self.reason) + } +} \ No newline at end of file diff --git a/src/parser/warning_reason.rs b/src/parser/warning_reason.rs new file mode 100644 index 0000000..6e119c6 --- /dev/null +++ b/src/parser/warning_reason.rs @@ -0,0 +1,80 @@ +use url::ParseError as ParseUrlError; +use std::num::{ParseFloatError, ParseIntError}; +use std::fmt; + +#[derive(Clone, Debug)] +/// Warning reason of robots.txt parser about problems when parsing robots.txt file. +pub enum WarningReason { + /// Invalid directive format. Invalid directive example: `:` + InvalidDirectiveFormat, + /// Directive key is empty. Invalid directive example: `: ` + DirectiveKeyIsEmpty, + /// Directive key is not supported by this parser. + UnsupportedDirectiveKey(String), + /// Passed directive key is `User-Agent` and passed value is empty. 
+ UserAgentCannotBeEmpty, + /// It is impossible to process this directive before the `User-Agent` directive has not been processed. + DirectiveWithoutUserAgent, + /// It is impossible to process the `Crawl-Delay` directive because of an error when parsing a floating point number. + ParseCrawlDelayError(ParseFloatError), + /// Incorrect format of the `Request-Rate` directive. Example of the correct format: `Request-rate: 1/5` + WrongRequestRateFormat, + /// Incorrect format of the `Request-Rate` directive. Example of the correct format: `Request-rate: 1/5` + ParseRequestRate(ParseIntError), + /// Parsing URL error. + ParseUrl(ParseUrlError), + /// Incorrect format of the `Clean-Param` directive. + /// Parameters must be matched to regular expression: `A-Za-z0-9.-_`. + /// Example of the correct format: `Clean-param: ref1&ref2 /some_dir/get_book.pl` + WrongCleanParamFormat, + /// Some parameters of the `Clean-Param` directive has wrong symbols. + /// Parameters must be matched to regular expression: `A-Za-z0-9.-_`. + /// Example of the correct format: `Clean-param: ref1&ref2 /some_dir/get_book.pl` + IgnoredCleanParams(Vec), + /// Error in URL path format. + WrongPathFormat, +} + +/// Displays text of warning reason. 
+impl fmt::Display for WarningReason { + fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> { + match &self { + &Self::InvalidDirectiveFormat => { + write!(f, "Invalid directive format.") + }, + &Self::DirectiveKeyIsEmpty => { + write!(f, "Directive key is empty.") + }, + &Self::UnsupportedDirectiveKey(key) => { + write!(f, "Directive key `{}` is not suppored by this parser.", key) + }, + &Self::UserAgentCannotBeEmpty => { + write!(f, "Passed directive key is `User-Agent` and passed value is empty.") + }, + &Self::DirectiveWithoutUserAgent => { + write!(f, "It is impossible to process this directive before `User-Agent` directive has not been processed.") + }, + &Self::ParseCrawlDelayError(err) => { + write!(f, "It is impossible to process the `Crawl-Delay` directive because of an error when parsing a floating point number: {}", err) + }, + &Self::WrongRequestRateFormat => { + write!(f, "Incorrect format of the `Request-Rate` directive") + }, + &Self::ParseRequestRate(err) => { + write!(f, "Incorrect format of the `Request-Rate` directive: {}", err) + }, + &Self::ParseUrl(err) => { + write!(f, "Parsing URL error: {}", err) + }, + &Self::WrongCleanParamFormat => { + write!(f, "Incorrect format of the `Clean-Param` directive.") + }, + &Self::IgnoredCleanParams(ref params) => { + write!(f, "Directive `Clean-Param` directive has incorrect parameters: {:?}", params) + }, + &Self::WrongPathFormat => { + write!(f, "Error in URL path format.") + }, + } + } +} \ No newline at end of file diff --git a/src/service.rs b/src/service.rs new file mode 100644 index 0000000..d33592b --- /dev/null +++ b/src/service.rs @@ -0,0 +1,30 @@ +mod robots_txt; +mod fetched_robots_txt; +use url::Url; +use std::time::Duration; +use crate::model::RequestRate; + +/// Trait that implements robots txt service. +pub trait RobotsTxtService { + /// Using the parsed robots.txt decide if useragent can fetch url. 
+ fn can_fetch(&self, user_agent: &str, url: &Url) -> bool; + + /// Returns the crawl delay for this user agent as a Duration, or None if no crawl delay is defined. + fn get_crawl_delay(&self, user_agent: &str) -> Option; + + /// Removes the request parameters from the url that were listed in the `Clean-param` directive. + /// This method CHECKS that the origin of the transmitted url matches the origin of robots.txt. + /// Returns true if the operation was applied to the passed url. + /// In other cases it returns false. + fn normalize_url(&self, url: &mut Url) -> bool; + + /// Removes the request parameters from the url that were listed in the `Clean-param` directive. + /// This method DOES NOT CHECK that the origin of the transmitted url coincides with the origin of robots.txt. + fn normalize_url_ignore_origin(&self, url: &mut Url); + + /// Returns the list of URL sitemaps that have been listed in the robots.txt file. + fn get_sitemaps(&self) -> &[Url]; + + /// Returns information about the restrictions set for sending HTTP requests to the server. 
+ fn get_req_rate(&self, user_agent: &str) -> Option; +} \ No newline at end of file diff --git a/src/service/fetched_robots_txt.rs b/src/service/fetched_robots_txt.rs new file mode 100644 index 0000000..74b0a13 --- /dev/null +++ b/src/service/fetched_robots_txt.rs @@ -0,0 +1,51 @@ +use url::Url; +use std::time::Duration; +use crate::service::RobotsTxtService; +use crate::model::{FetchedRobotsTxt, FetchedRobotsTxtContainer}; +use crate::model::RequestRate; + +impl RobotsTxtService for FetchedRobotsTxt { + fn can_fetch(&self, user_agent: &str, url: &Url) -> bool { + match self.get_container() { + &FetchedRobotsTxtContainer::FetchDenied => false, + &FetchedRobotsTxtContainer::FetchFailed => true, + &FetchedRobotsTxtContainer::Fetched(ref robots_txt) => { + robots_txt.can_fetch(user_agent, url) + } + } + } + + fn get_crawl_delay(&self, user_agent: &str) -> Option { + if let &FetchedRobotsTxtContainer::Fetched(ref robots_txt) = self.get_container() { + return robots_txt.get_crawl_delay(user_agent); + } + return None; + } + + fn normalize_url(&self, url: &mut Url) -> bool { + if let &FetchedRobotsTxtContainer::Fetched(ref robots_txt) = self.get_container() { + return robots_txt.normalize_url(url); + } + return true; + } + + fn normalize_url_ignore_origin(&self, url: &mut Url) { + if let &FetchedRobotsTxtContainer::Fetched(ref robots_txt) = self.get_container() { + robots_txt.normalize_url_ignore_origin(url); + } + } + + fn get_sitemaps(&self) -> &[Url] { + if let &FetchedRobotsTxtContainer::Fetched(ref robots_txt) = self.get_container() { + return robots_txt.get_sitemaps(); + } + return &[]; + } + + fn get_req_rate(&self, user_agent: &str) -> Option { + if let &FetchedRobotsTxtContainer::Fetched(ref robots_txt) = self.get_container() { + return robots_txt.get_req_rate(user_agent); + } + return None; + } +} \ No newline at end of file diff --git a/src/service/robots_txt.rs b/src/service/robots_txt.rs new file mode 100644 index 0000000..5bc34ff --- /dev/null +++ 
b/src/service/robots_txt.rs @@ -0,0 +1,84 @@ +use url::Url; +use std::time::Duration; +use crate::service::RobotsTxtService; +use crate::model::RobotsTxt; +use crate::model::RequestRate; +use crate::model::Path; + +impl RobotsTxtService for RobotsTxt { + fn can_fetch(&self, user_agent: &str, url: &Url) -> bool { + if url.origin() != *self.get_origin() { + return false; + } + let path = Path::from_url(url); + let rule_decision = self.find_in_group(user_agent, |group| { + let rules = group.get_rules_sorted_by_path_len_desc(); + for rule in rules.iter() { + if rule.applies_to(&path) { + return Some(rule.get_allowance()); + } + } + return None; + }); + if let Some(rule_decision) = rule_decision { + return rule_decision; + } + // Empty robots.txt allows crawling. Everything that was not denied must be allowed. + return true; + } + + fn get_crawl_delay(&self, user_agent: &str) -> Option { + return self.find_in_group(user_agent, |group| { + return group.get_crawl_delay(); + }); + } + + fn normalize_url(&self, url: &mut Url) -> bool { + if url.origin() != *self.get_origin() { + return false; + } + self.normalize_url_ignore_origin(url); + return true; + } + + fn normalize_url_ignore_origin(&self, url: &mut Url) { + if url.query().is_none() { + return; + } + let mut query_params_to_filter = Vec::new(); + let path = Path::from_url(url); + for clean_params in self.get_clean_params().iter() { + if clean_params.get_path_pattern().applies_to(&path) { + query_params_to_filter.extend_from_slice(clean_params.get_params()) + } + } + let mut pairs: Vec<(String, String)> = url + .query_pairs() + .map(|(key, value)|{ + return (key.into(), value.into()); + }) + .collect(); + { + let mut query_pairs_mut = url.query_pairs_mut(); + query_pairs_mut.clear(); + for (key, value) in pairs.drain(..) 
{ + if !query_params_to_filter.contains(&key) { + query_pairs_mut.append_pair(&key, &value); + } + } + } + if url.query() == Some("") { + url.set_query(None); + } + } + + fn get_sitemaps(&self) -> &[Url] { + return self.get_sitemaps_slice(); + } + + fn get_req_rate(&self, user_agent: &str) -> Option { + return self.find_in_group(user_agent, |group| { + return group.get_req_rate(); + }); + } +} \ No newline at end of file diff --git a/tests/lib.rs b/tests/test_lib.rs similarity index 62% rename from tests/lib.rs rename to tests/test_lib.rs index 0c6a8b5..a50a8be 100644 --- a/tests/lib.rs +++ b/tests/test_lib.rs @@ -1,21 +1,22 @@ -extern crate robotparser; -extern crate url; - -use robotparser::RobotFileParser; +use robotparser::parser::parse_robots_txt; +use robotparser::service::RobotsTxtService; use std::time::Duration; use url::Url; const AGENT: &'static str = "test_robotparser"; fn robot_test(doc: &str, good_urls: Vec<&str>, bad_urls: Vec<&str>, agent: &str) { - let parser = RobotFileParser::new("http://www.baidu.com/robots.txt"); - let lines: Vec<&str> = doc.split("\n").collect(); - parser.parse(&lines); + let url = Url::parse("http://www.baidu.com/robots.txt").unwrap(); + let parser = parse_robots_txt(url.origin(), doc).get_result(); for url in &good_urls { - assert!(parser.can_fetch(agent, url)); + let url = format!("http://www.baidu.com{}", url); + let url = Url::parse(&url).unwrap(); + assert!(parser.can_fetch(agent, &url)); } for url in &bad_urls { - assert!(!parser.can_fetch(agent, url)); + let url = format!("http://www.baidu.com{}", url); + let url = Url::parse(&url).unwrap(); + assert!(!parser.can_fetch(agent, &url)); } } @@ -24,6 +25,19 @@ fn robot_test_simple(doc: &str, good_urls: Vec<&str>, bad_urls: Vec<&str>) { robot_test(doc, good_urls, bad_urls, AGENT); } +#[test] +fn test_robots_txt_rn_bom() { + let doc = "\u{feff}\r\n\ + User-agent: *\r\n\ + Disallow: /cyberworld/map/ # This is an infinite virtual URL space\r\n\ + Disallow: /tmp/ # these will 
soon disappear\r\n\ + Disallow: /foo.html\r\n\ + "; + let good = vec!["/","/test.html"]; + let bad = vec!["/cyberworld/map/index.html","/tmp/xxx","/foo.html"]; + robot_test_simple(doc, good, bad); +} + #[test] fn test_robots_txt_1() { @@ -213,54 +227,72 @@ fn test_robots_txt_13() { robot_test_simple(doc, good, bad); } +/// Using patterns with `*` and `$` symbols. +#[test] +fn test_robots_txt_14() { + let doc = "\n\ + User-agent: *\n + Allow: /*video.html\n + Allow: */?amp*\n + Disallow: */rss$\n + Disallow: */rss/$\n + Disallow: /rate/\n + "; + let good = vec!["/rss/test", "/sdfvsdvs-sdfvsdv-video.html", "/rate"]; + let bad = vec!["/rss", "/rss/", "/rate/", "/rate/0/9"]; + robot_test_simple(doc, good, bad); +} + #[cfg(feature = "http")] #[test] fn test_robots_txt_read() { - let parser = RobotFileParser::new("http://www.python.org/robots.txt"); - parser.read(); - assert!(parser.can_fetch("*", "http://www.python.org/robots.txt")); + use robotparser::http::{CreateRobotsTxtRequest, ParseRobotsTxtResponse}; + use reqwest::{Client, Request}; + let http_client = Client::new(); + let url = Url::parse("http://www.python.org/robots.txt").unwrap(); + let request = Request::create_robots_txt_request(url.origin()); + let mut response = http_client.execute(request).unwrap(); + let parser = response.parse_robots_txt_response().unwrap().get_result(); + assert!(parser.can_fetch("*", &url)); } #[test] fn test_robots_text_crawl_delay() { - let parser = RobotFileParser::new("http://www.python.org/robots.txt"); + let robots_txt_url = Url::parse("http://www.python.org/robots.txt").unwrap(); let doc = "User-agent: Yandex\n\ Crawl-delay: 2.35\n\ Disallow: /search/\n"; - let lines: Vec<&str> = doc.split("\n").collect(); - parser.parse(&lines); + let parser = parse_robots_txt(robots_txt_url.origin(), doc).get_result(); assert_eq!(Duration::new(2,350 * 1000 * 1000), parser.get_crawl_delay("Yandex").unwrap()); } #[test] fn test_robots_text_sitemaps() { - let parser = 
RobotFileParser::new("http://www.python.org/robots.txt"); + let robots_txt_url = Url::parse("http://www.python.org/robots.txt").unwrap(); let doc = "User-agent: Yandex\n\ - Sitemap: http://example.com/sitemap1.xml - Sitemap: http://example.com/sitemap2.xml - Sitemap: http://example.com/sitemap3.xml + Sitemap \t : http://example.com/sitemap1.xml\n + Sitemap: http://example.com/sitemap2.xml\n + Sitemap: http://example.com/sitemap3.xml\n Disallow: /search/\n"; - let lines: Vec<&str> = doc.split("\n").collect(); - parser.parse(&lines); + let parser = parse_robots_txt(robots_txt_url.origin(), doc).get_result(); assert_eq!( - vec![ + &[ Url::parse("http://example.com/sitemap1.xml").unwrap(), Url::parse("http://example.com/sitemap2.xml").unwrap(), Url::parse("http://example.com/sitemap3.xml").unwrap() ], - parser.get_sitemaps("Yandex") + parser.get_sitemaps() ); } #[test] fn test_robots_text_request_rate() { - let parser = RobotFileParser::new("http://www.python.org/robots.txt"); + let robots_txt_url = Url::parse("http://www.python.org/robots.txt").unwrap(); let doc = "User-agent: Yandex\n\ Request-rate: 3/15\n\ Disallow: /search/\n"; - let lines: Vec<&str> = doc.split("\n").collect(); - parser.parse(&lines); + let parser = parse_robots_txt(robots_txt_url.origin(), doc).get_result(); let req_rate = parser.get_req_rate("Yandex").unwrap(); assert_eq!(3, req_rate.requests); assert_eq!(15, req_rate.seconds); @@ -269,8 +301,27 @@ fn test_robots_text_request_rate() { assert!(req_rate.is_none()); } + #[test] -fn test_robots_127_0_0_1() { - // Ensure it does not panic - RobotFileParser::new("http://127.0.0.1:4000/robots.txt"); -} +fn test_robots_text_clean_params() { + let doc = "\ +User-Agent: *\n\ +Clean-param: mode\n\ +Clean-param: from\n\ +Clean-param: pid\n\ +Clean-param: gid\n\ +Clean-param: tm\n\ +Clean-param: amp\n\ + "; + let url = Url::parse("http://www.baidu.com/robots.txt").unwrap(); + let parser = parse_robots_txt(url.origin(), doc).get_result(); + let mut site_url = 
Url::parse("http://www.baidu.com/test?post_id=7777&mode=99&from=google&pid=99&gid=88&tm=777&=1").unwrap(); + let was_updated = parser.normalize_url(&mut site_url); + assert_eq!(was_updated, true); + assert_eq!(site_url.as_str(), "http://www.baidu.com/test?post_id=7777"); + + let mut site_url = Url::parse("http://www.google.com/test?post_id=7777&mode=99&from=google&pid=99&gid=88&tm=777&=1").unwrap(); + let was_updated = parser.normalize_url(&mut site_url); + assert_eq!(was_updated, false); + assert_eq!(site_url.as_str(), "http://www.google.com/test?post_id=7777&mode=99&from=google&pid=99&gid=88&tm=777&=1"); +} \ No newline at end of file diff --git a/tests/test_reqwest_async.rs b/tests/test_reqwest_async.rs new file mode 100644 index 0000000..e286b0b --- /dev/null +++ b/tests/test_reqwest_async.rs @@ -0,0 +1,16 @@ +use robotparser::http::RobotsTxtClient; +use robotparser::service::RobotsTxtService; +use reqwest::Client; +use url::Url; +use tokio::runtime::Runtime; + +#[test] +fn test_reqwest_async() { + let mut runtime = Runtime::new().unwrap(); + let client = Client::new(); + let robots_txt_url = Url::parse("http://www.python.org/robots.txt").unwrap(); + let robots_txt_response = runtime.block_on(client.fetch_robots_txt(robots_txt_url.origin())); + let robots_txt = robots_txt_response.unwrap().get_result(); + let fetch_url = Url::parse("http://www.python.org/robots.txt").unwrap(); + assert!(robots_txt.can_fetch("*", &fetch_url)); +} \ No newline at end of file diff --git a/tests/test_reqwest_blocking.rs b/tests/test_reqwest_blocking.rs new file mode 100644 index 0000000..129f14d --- /dev/null +++ b/tests/test_reqwest_blocking.rs @@ -0,0 +1,13 @@ +use robotparser::http::RobotsTxtClient; +use robotparser::service::RobotsTxtService; +use reqwest::blocking::Client; +use url::Url; + +#[test] +fn test_reqwest_blocking() { + let client = Client::new(); + let robots_txt_url = Url::parse("http://www.python.org/robots.txt").unwrap(); + let robots_txt = 
client.fetch_robots_txt(robots_txt_url.origin()).unwrap().get_result(); + let fetch_url = Url::parse("http://www.python.org/robots.txt").unwrap(); + assert!(robots_txt.can_fetch("*", &fetch_url)); +} diff --git a/tests/test_warnings.rs b/tests/test_warnings.rs new file mode 100644 index 0000000..30b2d8e --- /dev/null +++ b/tests/test_warnings.rs @@ -0,0 +1,178 @@ +use robotparser::parser::{parse_robots_txt, WarningReason}; +use url::{Host, Origin}; +use std::convert::From; + +#[derive(PartialEq, Eq, Debug, Clone)] +enum WarningReasonKind { + InvalidDirectiveFormat, + DirectiveKeyIsEmpty, + UnsupportedDirectiveKey, + UserAgentCannotBeEmpty, + DirectiveWithoutUserAgent, + ParseCrawlDelayError, + WrongRequestRateFormat, + ParseRequestRate, + ParseUrl, + WrongCleanParamFormat, + IgnoredCleanParams, + WrongPathFormat, +} + +fn validate_warnings(input: &str, expected_warnings: &[WarningReasonKind]) { + let host = Host::Domain("python.org".into()); + let origin = Origin::Tuple("http".into(), host, 80); + let warnings = parse_robots_txt(origin, &input).get_warnings().to_vec(); + assert_eq!(warnings.len(), expected_warnings.len()); + for (warning, expected_warning) in warnings.iter().zip(expected_warnings.iter()) { + let warning: WarningReasonKind = warning.get_reason().into(); + assert_eq!(expected_warning.clone(), warning); + } +} + +#[test] +fn test_warning_invalid_directive_format() { + let input = "`"; + validate_warnings(input, &[WarningReasonKind::InvalidDirectiveFormat]); + let input = " \t ` \t "; + validate_warnings(input, &[WarningReasonKind::InvalidDirectiveFormat]); +} + +#[test] +fn test_warning_directive_key_is_empty() { + let input = ":"; + validate_warnings(input, &[WarningReasonKind::DirectiveKeyIsEmpty]); +} + +#[test] +fn test_warning_supported_directive_key() { + let input = "X-Directive:"; + validate_warnings(input, &[WarningReasonKind::UnsupportedDirectiveKey]); + let input = "\t X-Directive\t :\t "; + validate_warnings(input, 
&[WarningReasonKind::UnsupportedDirectiveKey]); +} + + +#[test] +fn test_warning_user_agent_cannot_be_empty() { + let input = "User-Agent:"; + validate_warnings(input, &[WarningReasonKind::UserAgentCannotBeEmpty]); + let input = "\t User-Agent\t :\t "; + validate_warnings(input, &[WarningReasonKind::UserAgentCannotBeEmpty]); + let input = "\t User-Agent\t :\t *"; + validate_warnings(input, &[]); +} + +#[test] +fn test_warning_directive_without_user_agent() { + let input = "Crawl-Delay: 5s"; + validate_warnings(input, &[WarningReasonKind::DirectiveWithoutUserAgent]); + let input = "User-Agent: *\nCrawl-Delay: 5"; + validate_warnings(input, &[]); +} + +#[test] +fn test_warning_parse_crawl_delay_error() { + let input = "User-Agent: *\nCrawl-Delay: "; + validate_warnings(input, &[WarningReasonKind::ParseCrawlDelayError]); + let input = "User-Agent: *\nCrawl-Delay: -"; + validate_warnings(input, &[WarningReasonKind::ParseCrawlDelayError]); + let input = "User-Agent: *\nCrawl-Delay: 5h9"; + validate_warnings(input, &[WarningReasonKind::ParseCrawlDelayError]); + let input = "User-Agent: *\nCrawl-Delay: 5"; + validate_warnings(input, &[]); +} + +#[test] +fn test_warning_request_rate_format() { + let input = "User-Agent: *\nRequest-rate: 1/5"; + validate_warnings(input, &[]); + let input = "User-Agent: *\nRequest-rate: 1//5"; + validate_warnings(input, &[WarningReasonKind::WrongRequestRateFormat]); + let input = "User-Agent: *\nRequest-rate: 1"; + validate_warnings(input, &[WarningReasonKind::WrongRequestRateFormat]); +} + +#[test] +fn test_warning_request_rate() { + let input = "User-Agent: *\nRequest-rate: a/b"; + validate_warnings(input, &[WarningReasonKind::ParseRequestRate]); + let input = "User-Agent: *\nRequest-rate: a/5"; + validate_warnings(input, &[WarningReasonKind::ParseRequestRate]); + let input = "User-Agent: *\nRequest-rate: 5/b"; + validate_warnings(input, &[WarningReasonKind::ParseRequestRate]); + let input = "User-Agent: *\nRequest-rate: 1.0/5.0"; + 
validate_warnings(input, &[WarningReasonKind::ParseRequestRate]); +} + +#[test] +fn test_warning_parsing_url() { + let input = "User-Agent: *\nSitemap: http://python.org/sitemap.xml"; + validate_warnings(input, &[]); + let input = "User-Agent: *\nSitemap: http$$$://python.org/sitemap.xml"; + validate_warnings(input, &[WarningReasonKind::ParseUrl]); +} + +#[test] +fn test_wrong_clean_param() { + let input = "User-Agent: *\nClean-param: ref "; + validate_warnings(input, &[]); + let input = "User-Agent: *\nClean-param: "; + validate_warnings(input, &[WarningReasonKind::WrongCleanParamFormat]); + let input = "User-Agent: *\nClean-param: &"; + validate_warnings(input, &[]); + let input = "User-Agent: *\nClean-param: ?"; + validate_warnings(input, &[WarningReasonKind::IgnoredCleanParams]); + let input = "User-Agent: *\nClean-param: abc$"; + validate_warnings(input, &[WarningReasonKind::IgnoredCleanParams]); +} + +#[test] +fn test_warning_wrong_path_format() { + let input = "User-Agent: *\nAllow: \\"; + validate_warnings(input, &[WarningReasonKind::WrongPathFormat]); + let input = "User-Agent: *\nDisallow: \\"; + validate_warnings(input, &[WarningReasonKind::WrongPathFormat]); +} + +impl From<&WarningReason> for WarningReasonKind { + fn from(reason: &WarningReason) -> Self { + match reason { + &WarningReason::InvalidDirectiveFormat => { + return WarningReasonKind::InvalidDirectiveFormat; + }, + &WarningReason::DirectiveKeyIsEmpty => { + return WarningReasonKind::DirectiveKeyIsEmpty; + }, + &WarningReason::UnsupportedDirectiveKey {..} => { + return WarningReasonKind::UnsupportedDirectiveKey; + }, + &WarningReason::UserAgentCannotBeEmpty => { + return WarningReasonKind::UserAgentCannotBeEmpty; + }, + &WarningReason::DirectiveWithoutUserAgent => { + return WarningReasonKind::DirectiveWithoutUserAgent; + }, + &WarningReason::ParseCrawlDelayError {..} => { + return WarningReasonKind::ParseCrawlDelayError; + }, + &WarningReason::WrongRequestRateFormat => { + return 
WarningReasonKind::WrongRequestRateFormat; + }, + &WarningReason::ParseRequestRate {..} => { + return WarningReasonKind::ParseRequestRate; + }, + &WarningReason::ParseUrl {..} => { + return WarningReasonKind::ParseUrl; + }, + &WarningReason::WrongCleanParamFormat => { + return WarningReasonKind::WrongCleanParamFormat; + }, + &WarningReason::IgnoredCleanParams {..} => { + return WarningReasonKind::IgnoredCleanParams; + }, + &WarningReason::WrongPathFormat => { + return WarningReasonKind::WrongPathFormat; + }, + } + } +} \ No newline at end of file