Refactoring of robotparser-rs (#20)

* Migrated sites into robotsparser file.

* Robots.txt refactoring.

* Migrated to new version of url and reqwest.
This commit is contained in:
svmk 2020-01-31 16:00:58 +07:00 committed by GitHub
parent cb7df85b83
commit 2d19755779
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
33 changed files with 1789 additions and 511 deletions

1
.gitignore vendored
View file

@ -1,3 +1,4 @@
target
Cargo.lock
.vscode/
.idea/

View file

@ -8,16 +8,25 @@ license = "MIT"
name = "robotparser"
readme = "README.md"
repository = "https://github.com/messense/robotparser-rs"
version = "0.10.2"
version = "0.11.0"
edition = "2018"
[dependencies]
url = "1"
url = "2"
percent-encoding = "2.1"
[dependencies.reqwest]
version = "0.9"
version = "0.10.1"
optional = true
features = ["blocking"]
[dependencies.futures]
version = "0.3"
optional = true
[features]
default = ["http"]
http = ["reqwest"]
default = ["reqwest", "futures"]
unstable = []
[dev-dependencies]
tokio = "0.2.11"

View file

@ -15,7 +15,7 @@ Add it to your ``Cargo.toml``:
```toml
[dependencies]
robotparser = "0.10"
robotparser = "0.11"
```
Add ``extern crate robotparser`` to your crate root and you're good to go!
@ -24,14 +24,17 @@ Add ``extern crate robotparser`` to your crate root and your're good to go!
## Examples
```rust
extern crate robotparser;
use robotparser::RobotFileParser;
use robotparser::http::RobotsTxtClient;
use robotparser::service::RobotsTxtService;
use reqwest::blocking::Client;
use url::Url;
fn main() {
let parser = RobotFileParser::new("http://www.python.org/robots.txt");
parser.read();
assert!(parser.can_fetch("*", "http://www.python.org/robots.txt"));
let client = Client::new();
let robots_txt_url = Url::parse("http://www.python.org/robots.txt").unwrap();
let robots_txt = client.fetch_robots_txt(robots_txt_url.origin()).unwrap().get_result();
let fetch_url = Url::parse("http://www.python.org/robots.txt").unwrap();
assert!(robots_txt.can_fetch("*", &fetch_url));
}
```

19
src/http.rs Normal file
View file

@ -0,0 +1,19 @@
//! # Supported libraries
//! To enable support for the required library, you need to add this feature to your `Cargo.toml`.
//! Currently only one library is supported - `reqwest`.
//! But you can also add support for other libraries.
use url::Origin;
#[cfg(feature = "reqwest")]
/// Support for reqwest library.
pub mod reqwest;
/// User agent of this crate.
pub const DEFAULT_USER_AGENT: &str = "robotparser-rs (https://crates.io/crates/robotparser)";
/// Trait to fetch and parse the robots.txt file.
/// Must be implemented on http-client.
///
/// Implementors choose how the outcome is delivered through the associated
/// `Result` type, so the same trait can describe both blocking and
/// future-based clients.
pub trait RobotsTxtClient {
/// Value returned by `fetch_robots_txt`.
type Result;
/// Fetches and parses the robots.txt file that belongs to `origin`.
fn fetch_robots_txt(&self, origin: Origin) -> Self::Result;
}

4
src/http/reqwest.rs Normal file
View file

@ -0,0 +1,4 @@
mod sync_reqwest;
pub use self::sync_reqwest::*;
mod async_reqwest;
pub use self::async_reqwest::*;

View file

@ -0,0 +1,76 @@
use reqwest::{Client, Request};
use reqwest::{Method, Error};
use reqwest::header::HeaderValue;
use url::{Origin, Url};
use reqwest::header::USER_AGENT;
use crate::http::{RobotsTxtClient, DEFAULT_USER_AGENT};
use crate::parser::{ParseResult, parse_fetched_robots_txt};
use crate::model::FetchedRobotsTxt;
use std::pin::Pin;
use futures::task::{Context, Poll};
use futures::Future;
use futures::future::TryFutureExt;
use futures::future::ok as future_ok;
type FetchFuture = Box<dyn Future<Output=Result<(ResponseInfo, String), Error>>>;
impl RobotsTxtClient for Client {
type Result = RobotsTxtResponse;
fn fetch_robots_txt(&self, origin: Origin) -> Self::Result {
let url = format!("{}/robots.txt", origin.unicode_serialization());
let url = Url::parse(&url).expect("Unable to parse robots.txt url");
let mut request = Request::new(Method::GET, url);
let _ = request.headers_mut().insert(USER_AGENT, HeaderValue::from_static(DEFAULT_USER_AGENT));
let response = self
.execute(request)
.and_then(|response| {
let response_info = ResponseInfo {status_code: response.status().as_u16()};
return response.text().and_then(|response_text| {
return future_ok((response_info, response_text));
});
});
let response: Pin<Box<dyn Future<Output=Result<(ResponseInfo, String), Error>>>> = Box::pin(response);
return RobotsTxtResponse {
origin,
response,
}
}
}
/// Response metadata that is kept alongside the body text of a fetched robots.txt.
struct ResponseInfo {
/// Numeric HTTP status code of the response.
status_code: u16,
}
/// Future for fetching robots.txt result.
pub struct RobotsTxtResponse {
    /// Origin the robots.txt file was requested for.
    origin: Origin,
    /// Pending request that yields the response metadata and the body text.
    response: Pin<FetchFuture>,
}

impl RobotsTxtResponse {
    /// Returns origin of robots.txt
    pub fn get_origin(&self) -> &Origin {
        &self.origin
    }
}
impl Future for RobotsTxtResponse {
    type Output = Result<ParseResult<FetchedRobotsTxt>, Error>;

    /// Polls the underlying request. When the body text is ready it is parsed together
    /// with the response status code; request errors are passed through unchanged.
    fn poll(self: Pin<&mut Self>, cx: &mut Context) -> Poll<Self::Output> {
        let this = self.get_mut();
        let polled = this.response.as_mut().poll(cx);
        let origin = &this.origin;
        polled.map(|fetched| {
            fetched.map(|(response_info, text)| {
                parse_fetched_robots_txt(origin.clone(), response_info.status_code, &text)
            })
        })
    }
}

View file

@ -0,0 +1,23 @@
use reqwest::blocking::{Client, Request};
use reqwest::{Method, Error};
use reqwest::header::HeaderValue;
use url::{Origin, Url};
use reqwest::header::USER_AGENT;
use crate::http::{RobotsTxtClient, DEFAULT_USER_AGENT};
use crate::parser::{ParseResult, parse_fetched_robots_txt};
use crate::model::FetchedRobotsTxt;
impl RobotsTxtClient for Client {
    type Result = Result<ParseResult<FetchedRobotsTxt>, Error>;

    /// Downloads `<origin>/robots.txt` with a blocking request and parses the response.
    ///
    /// # Errors
    /// Returns the `reqwest` error if the request fails or the body cannot be read as text.
    ///
    /// # Panics
    /// Panics if the address built from `origin` cannot be parsed as a URL.
    fn fetch_robots_txt(&self, origin: Origin) -> Self::Result {
        let robots_txt_url = Url::parse(&format!("{}/robots.txt", origin.unicode_serialization()))
            .expect("Unable to parse robots.txt url");
        let mut request = Request::new(Method::GET, robots_txt_url);
        request
            .headers_mut()
            .insert(USER_AGENT, HeaderValue::from_static(DEFAULT_USER_AGENT));
        let response = self.execute(request)?;
        // The status has to be read first: `text()` consumes the response.
        let status_code = response.status().as_u16();
        let body = response.text()?;
        Ok(parse_fetched_robots_txt(origin, status_code, &body))
    }
}

View file

@ -9,482 +9,32 @@
//!
//! ```toml
//! [dependencies]
//! robotparser = "0.10"
//! robotparser = "0.11"
//! ```
//!
//! Add ``extern crate robotparser`` to your crate root and you're good to go!
//!
//! # Examples
//!
//! ```rust,ignore
//! extern crate robotparser;
//!
//! use robotparser::RobotFileParser;
//! ```rust
//! use robotparser::http::RobotsTxtClient;
//! use robotparser::service::RobotsTxtService;
//! use reqwest::blocking::Client;
//! use url::Url;
//!
//! fn main() {
//! let parser = RobotFileParser::new("http://www.python.org/robots.txt");
//! parser.read();
//! assert!(parser.can_fetch("*", "http://www.python.org/robots.txt"));
//! let client = Client::new();
//! let robots_txt_url = Url::parse("http://www.python.org/robots.txt").unwrap();
//! let robots_txt = client.fetch_robots_txt(robots_txt_url.origin()).unwrap().get_result();
//! let fetch_url = Url::parse("http://www.python.org/robots.txt").unwrap();
//! assert!(robots_txt.can_fetch("*", &fetch_url));
//! }
//! ```
extern crate url;
#[cfg(feature = "http")]
extern crate reqwest;
#[cfg(feature = "http")]
use std::io::Read;
use std::cell::{Cell, RefCell};
use std::borrow::Cow;
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use url::Url;
#[cfg(feature = "http")]
use reqwest::Client;
#[cfg(feature = "http")]
use reqwest::header::USER_AGENT;
#[cfg(feature = "http")]
use reqwest::StatusCode;
#[cfg(feature = "http")]
use reqwest::Response;
#[cfg(feature = "http")]
const RP_USER_AGENT: &str = "robotparser-rs (https://crates.io/crates/robotparser)";
/// A rule line is a single "Allow:" (allowance==True) or "Disallow:"
/// (allowance==False) followed by a path."""
#[derive(Debug, Eq, PartialEq, Clone)]
struct RuleLine<'a> {
path: Cow<'a, str>,
allowance: bool,
}
#[derive(Debug, Eq, PartialEq, Clone)]
pub struct RequestRate {
pub requests: usize,
pub seconds: usize,
}
/// An entry has one or more user-agents and zero or more rulelines
#[derive(Debug, Eq, PartialEq, Clone)]
struct Entry<'a> {
useragents: RefCell<Vec<String>>,
rulelines: RefCell<Vec<RuleLine<'a>>>,
crawl_delay: Option<Duration>,
sitemaps: Vec<Url>,
req_rate: Option<RequestRate>,
}
/// robots.txt file parser
#[derive(Debug, Eq, PartialEq, Clone)]
pub struct RobotFileParser<'a> {
entries: RefCell<Vec<Entry<'a>>>,
default_entry: RefCell<Entry<'a>>,
disallow_all: Cell<bool>,
allow_all: Cell<bool>,
url: Url,
host: String,
path: String,
last_checked: Cell<i64>,
}
impl<'a> RuleLine<'a> {
fn new<S>(path: S, allowance: bool) -> RuleLine<'a>
where S: Into<Cow<'a, str>>
{
let path = path.into();
let mut allow = allowance;
if path == "" && !allowance {
// an empty value means allow all
allow = true;
}
RuleLine {
path: path,
allowance: allow,
}
}
fn applies_to(&self, filename: &str) -> bool {
self.path == "*" || filename.starts_with(&self.path[..])
}
}
impl<'a> Entry<'a> {
fn new() -> Entry<'a> {
Entry {
useragents: RefCell::new(vec![]),
rulelines: RefCell::new(vec![]),
crawl_delay: None,
sitemaps: Vec::new(),
req_rate: None,
}
}
/// check if this entry applies to the specified agent
fn applies_to(&self, useragent: &str) -> bool {
let ua = useragent.split('/').nth(0).unwrap_or("").to_lowercase();
let useragents = self.useragents.borrow();
for agent in &*useragents {
if agent == "*" {
return true;
}
if ua.contains(agent) {
return true;
}
}
false
}
/// Preconditions:
/// - our agent applies to this entry
/// - filename is URL decoded
fn allowance(&self, filename: &str) -> bool {
let rulelines = self.rulelines.borrow();
for line in &*rulelines {
if line.applies_to(filename) {
return line.allowance;
}
}
true
}
fn push_useragent(&self, useragent: &str) {
let mut useragents = self.useragents.borrow_mut();
useragents.push(useragent.to_lowercase().to_owned());
}
fn push_ruleline(&self, ruleline: RuleLine<'a>) {
let mut rulelines = self.rulelines.borrow_mut();
rulelines.push(ruleline);
}
fn has_useragent(&self, useragent: &str) -> bool {
let useragents = self.useragents.borrow();
useragents.contains(&useragent.to_owned())
}
fn is_empty(&self) -> bool {
let useragents = self.useragents.borrow();
let rulelines = self.rulelines.borrow();
useragents.is_empty() && rulelines.is_empty()
}
fn set_crawl_delay(&mut self, delay: Duration) {
self.crawl_delay = Some(delay);
}
fn get_crawl_delay(&self) -> Option<Duration> {
self.crawl_delay
}
fn add_sitemap(&mut self, url: &str) {
if let Ok(url) = Url::parse(url) {
self.sitemaps.push(url);
}
}
fn get_sitemaps(&self) -> Vec<Url> {
self.sitemaps.clone()
}
fn set_req_rate(&mut self, req_rate: RequestRate) {
self.req_rate = Some(req_rate);
}
fn get_req_rate(&self) -> Option<RequestRate> {
self.req_rate.clone()
}
}
impl<'a> Default for Entry<'a> {
fn default() -> Entry<'a> {
Entry::new()
}
}
impl<'a> RobotFileParser<'a> {
pub fn new<T: AsRef<str>>(url: T) -> RobotFileParser<'a> {
let parsed_url = Url::parse(url.as_ref()).unwrap();
RobotFileParser {
entries: RefCell::new(vec![]),
default_entry: RefCell::new(Entry::new()),
disallow_all: Cell::new(false),
allow_all: Cell::new(false),
url: parsed_url.clone(),
host: parsed_url.host_str().unwrap().to_owned(),
path: parsed_url.path().to_owned(),
last_checked: Cell::new(0i64),
}
}
/// Returns the time the robots.txt file was last fetched.
///
/// This is useful for long-running web spiders that need to
/// check for new robots.txt files periodically.
pub fn mtime(&self) -> i64 {
self.last_checked.get()
}
/// Sets the time the robots.txt file was last fetched to the
/// current time.
pub fn modified(&self) {
let now = SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_secs() as i64;
self.last_checked.set(now);
}
/// Sets the URL referring to a robots.txt file.
pub fn set_url<T: AsRef<str>>(&mut self, url: T) {
let parsed_url = Url::parse(url.as_ref()).unwrap();
self.url = parsed_url.clone();
self.host = parsed_url.host_str().unwrap().to_owned();
self.path = parsed_url.path().to_owned();
self.last_checked.set(0i64);
}
#[cfg(feature = "http")]
/// Reads the robots.txt URL and feeds it to the parser.
pub fn read(&self) {
let client = Client::new();
let request = client.get(self.url.clone());
let request = request.header(USER_AGENT, RP_USER_AGENT.to_owned());
let mut res = match request.send() {
Ok(res) => res,
Err(_) => {
return;
}
};
let status = res.status();
match status {
StatusCode::UNAUTHORIZED | StatusCode::FORBIDDEN => {
self.disallow_all.set(true);
}
status if status >= StatusCode::BAD_REQUEST && status < StatusCode::INTERNAL_SERVER_ERROR => {
self.allow_all.set(true);
}
StatusCode::OK => self.from_response(&mut res),
_ => {}
}
}
#[cfg(feature = "http")]
/// Reads the HTTP response and feeds it to the parser.
pub fn from_response(&self, response: &mut Response) {
let mut buf = String::new();
response.read_to_string(&mut buf).unwrap();
let lines: Vec<&str> = buf.split('\n').collect();
self.parse(&lines);
}
fn _add_entry(&self, entry: Entry<'a>) {
if entry.has_useragent("*") {
// the default entry is considered last
let mut default_entry = self.default_entry.borrow_mut();
if default_entry.is_empty() {
// the first default entry wins
*default_entry = entry;
}
} else {
let mut entries = self.entries.borrow_mut();
entries.push(entry);
}
}
///
/// Parse the input lines from a robots.txt file
///
/// We allow that a user-agent: line is not preceded by
/// one or more blank lines.
///
pub fn parse<T: AsRef<str>>(&self, lines: &[T]) {
use url::percent_encoding::percent_decode;
// states:
// 0: start state
// 1: saw user-agent line
// 2: saw an allow or disallow line
let mut state = 0;
let mut entry = Entry::new();
self.modified();
for line in lines {
let mut ln = line.as_ref();
if ln.is_empty() {
match state {
1 => {
entry = Entry::new();
state = 0;
}
2 => {
self._add_entry(entry);
entry = Entry::new();
state = 0;
}
_ => {}
}
}
// remove optional comment and strip line
if let Some(i) = ln.find('#') {
ln = &ln[0..i];
}
ln = ln.trim();
if ln.is_empty() {
continue;
}
let parts: Vec<&str> = ln.splitn(2, ':').collect();
if parts.len() == 2 {
let part0 = parts[0].trim().to_lowercase();
let part1 = String::from_utf8(percent_decode(parts[1].trim().as_bytes()).collect())
.unwrap_or("".to_owned());
match part0 {
ref x if x == "user-agent" => {
if state == 2 {
self._add_entry(entry);
entry = Entry::new();
}
entry.push_useragent(&part1);
state = 1;
}
ref x if x == "disallow" => {
if state != 0 {
entry.push_ruleline(RuleLine::new(part1, false));
state = 2;
}
}
ref x if x == "allow" => {
if state != 0 {
entry.push_ruleline(RuleLine::new(part1, true));
state = 2;
}
}
ref x if x == "crawl-delay" => {
if state != 0 {
if let Ok(delay) = part1.parse::<f64>() {
let delay_seconds = delay.trunc();
let delay_nanoseconds = delay.fract() * 10f64.powi(9);
let delay = Duration::new(delay_seconds as u64, delay_nanoseconds as u32);
entry.set_crawl_delay(delay);
}
state = 2;
}
}
ref x if x == "sitemap" => {
if state != 0 {
entry.add_sitemap(&part1);
state = 2;
}
}
ref x if x == "request-rate" => {
if state != 0 {
let numbers: Vec<Result<usize, _>> = part1.split('/').map(|x| x.parse::<usize>()).collect();
if numbers.len() == 2 && numbers[0].is_ok() && numbers[1].is_ok() {
let req_rate = RequestRate {
requests: numbers[0].clone().unwrap(),
seconds: numbers[1].clone().unwrap(),
};
entry.set_req_rate(req_rate);
}
state = 2;
}
}
_ => {}
}
}
}
if state == 2 {
self._add_entry(entry);
}
}
/// Using the parsed robots.txt decide if useragent can fetch url
pub fn can_fetch<T: AsRef<str>>(&self, useragent: T, url: T) -> bool {
use url::percent_encoding::percent_decode;
let useragent = useragent.as_ref();
let url = url.as_ref();
if self.disallow_all.get() {
return false;
}
if self.allow_all.get() {
return true;
}
// Until the robots.txt file has been read or found not
// to exist, we must assume that no url is allowable.
// This prevents false positives when a user erronenously
// calls can_fetch() before calling read().
if self.last_checked.get() == 0 {
return false;
}
// search for given user agent matches
// the first match counts
let decoded_url = String::from_utf8(percent_decode(url.trim().as_bytes()).collect()).unwrap_or("".to_owned());
let url_str = match decoded_url {
ref u if !u.is_empty() => u.to_owned(),
_ => "/".to_owned(),
};
let entries = self.entries.borrow();
for entry in &*entries {
if entry.applies_to(useragent) {
return entry.allowance(&url_str);
}
}
// try the default entry last
let default_entry = self.default_entry.borrow();
if !default_entry.is_empty() {
return default_entry.allowance(&url_str);
}
// agent not found ==> access granted
true
}
/// Returns the crawl delay for this user agent as a `Duration`, or None if no crawl delay is defined.
pub fn get_crawl_delay<T: AsRef<str>>(&self, useragent: T) -> Option<Duration> {
let useragent = useragent.as_ref();
if self.last_checked.get() == 0 {
return None;
}
let entries = self.entries.borrow();
for entry in &*entries {
if entry.applies_to(useragent) {
return entry.get_crawl_delay();
}
}
None
}
/// Returns the sitemaps for this user agent as a `Vec<Url>`.
pub fn get_sitemaps<T: AsRef<str>>(&self, useragent: T) -> Vec<Url> {
let useragent = useragent.as_ref();
if self.last_checked.get() == 0 {
return Vec::new();
}
let entries = self.entries.borrow();
for entry in &*entries {
if entry.applies_to(useragent) {
return entry.get_sitemaps();
}
}
vec![]
}
/// Returns the request rate for this user agent as a `RequestRate`, or None if not request rate is defined
pub fn get_req_rate<T: AsRef<str>>(&self, useragent: T) -> Option<RequestRate> {
let useragent = useragent.as_ref();
if self.last_checked.get() == 0 {
return None;
}
let entries = self.entries.borrow();
for entry in &*entries {
if entry.applies_to(useragent) {
return entry.get_req_rate();
}
}
None
}
}
/// Contains models of robots.txt file.
pub mod model;
/// Contains robots.txt parsers.
pub mod parser;
/// Contains robots.txt services.
pub mod service;
/// Request builder & response parsers for other http libraries.
pub mod http;

17
src/model.rs Normal file
View file

@ -0,0 +1,17 @@
mod path_pattern;
pub (crate) use self::path_pattern::PathPattern;
mod group;
pub (crate) use self::group::Group;
mod rule;
pub (crate) use self::rule::Rule;
mod clean_params;
pub (crate) use self::clean_params::CleanParams;
mod request_rate;
pub use self::request_rate::RequestRate;
mod robots_txt;
pub use self::fetched_robots_txt::FetchedRobotsTxt;
pub (crate) use self::fetched_robots_txt::FetchedRobotsTxtContainer;
mod fetched_robots_txt;
pub use self::robots_txt::RobotsTxt;
mod path;
pub (crate) use self::path::Path;

24
src/model/clean_params.rs Normal file
View file

@ -0,0 +1,24 @@
use crate::model::PathPattern;
/// A path pattern paired with a list of parameter names.
#[derive(Debug, Clone)]
pub struct CleanParams {
    path_pattern: PathPattern,
    params: Vec<String>,
}

impl CleanParams {
    /// Creates a value from a path pattern and its parameter names.
    pub fn new(path_pattern: PathPattern, params: Vec<String>) -> CleanParams {
        CleanParams { path_pattern, params }
    }

    /// Returns the stored path pattern.
    pub fn get_path_pattern(&self) -> &PathPattern {
        &self.path_pattern
    }

    /// Returns the stored parameter names.
    pub fn get_params(&self) -> &Vec<String> {
        &self.params
    }
}

View file

@ -0,0 +1,36 @@
use crate::model::robots_txt::RobotsTxt;
use std::time::SystemTime;
/// Outcome of an attempt to download a robots.txt file.
#[derive(Debug, Clone)]
pub (crate) enum FetchedRobotsTxtContainer {
/// Produced by the fetched-robots.txt parser for the 401 and 403 status codes.
FetchDenied,
/// Produced by the fetched-robots.txt parser for any status code other than 200, 401 and 403.
FetchFailed,
/// The file was downloaded (status code 200) and parsed into a `RobotsTxt` model.
Fetched(RobotsTxt),
}
/// A model of the robots.txt file that was downloaded over the network.
/// This model takes into account HTTP response codes when loading the robots.txt file.
/// To work with this model you should use the trait `robotparser::service::RobotsTxtService`.
/// To create this structure you should use the `robotparser::parser::parse_fetched_robots_txt`.
#[derive(Debug, Clone)]
pub struct FetchedRobotsTxt {
    fetched_at: SystemTime,
    container: FetchedRobotsTxtContainer,
}

impl FetchedRobotsTxt {
    /// Wraps `container` and stamps it with the current system time.
    pub(crate) fn new(container: FetchedRobotsTxtContainer) -> FetchedRobotsTxt {
        let fetched_at = SystemTime::now();
        FetchedRobotsTxt { fetched_at, container }
    }

    /// Returns the wrapped fetch outcome.
    pub(crate) fn get_container(&self) -> &FetchedRobotsTxtContainer {
        &self.container
    }

    /// Returns the system time when the robots.txt file was downloaded over the network.
    pub fn get_fetched_at(&self) -> &SystemTime {
        &self.fetched_at
    }
}

93
src/model/group.rs Normal file
View file

@ -0,0 +1,93 @@
use std::time::Duration;
use crate::model::request_rate::RequestRate;
use crate::model::rule::Rule;
/// A group has one or more user-agents and zero or more rules.
#[derive(Debug, Clone)]
pub struct Group {
    /// User-agent names, stored lowercased.
    user_agents: Vec<String>,
    rules: Vec<Rule>,
    crawl_delay: Option<Duration>,
    req_rate: Option<RequestRate>,
}

impl Group {
    /// Creates a group without user-agents, rules, crawl delay or request rate.
    pub(crate) fn new() -> Group {
        Group {
            user_agents: Vec::new(),
            rules: Vec::new(),
            crawl_delay: None,
            req_rate: None,
        }
    }

    /// Checks whether this group applies to the specified agent.
    ///
    /// Only the part of `useragent` before the first `/` is considered; it is lowercased
    /// and must contain one of the group's user-agent names as a substring.
    pub(crate) fn applies_to(&self, useragent: &str) -> bool {
        let product = useragent.split('/').next().unwrap_or("").to_lowercase();
        self.user_agents
            .iter()
            .any(|agent| product.contains(agent.as_str()))
    }

    /// Registers a user-agent name (stored lowercased).
    pub(crate) fn push_useragent(&mut self, useragent: &str) {
        self.user_agents.push(useragent.to_lowercase());
    }

    /// Appends a rule to the group.
    pub(crate) fn push_rule(&mut self, rule: Rule) {
        self.rules.push(rule);
    }

    /// Returns the rules ordered from the longest path pattern to the shortest.
    /// Rules whose patterns have equal length keep their insertion order.
    pub(crate) fn get_rules_sorted_by_path_len_desc(&self) -> Vec<&Rule> {
        let mut sorted: Vec<&Rule> = self.rules.iter().collect();
        sorted.sort_by_key(|rule| std::cmp::Reverse(rule.get_path_pattern().len()));
        sorted
    }

    /// Exact (case-sensitive) lookup among the stored user-agent names.
    pub(crate) fn contains_user_agent(&self, user_agent: &str) -> bool {
        self.user_agents
            .iter()
            .any(|stored| stored.as_str() == user_agent)
    }

    /// Sets the crawl delay of the group.
    pub(crate) fn set_crawl_delay(&mut self, delay: Duration) {
        self.crawl_delay = Some(delay);
    }

    /// Returns the crawl delay, if one was set.
    pub(crate) fn get_crawl_delay(&self) -> Option<Duration> {
        self.crawl_delay
    }

    /// Sets the request rate of the group.
    pub(crate) fn set_req_rate(&mut self, req_rate: RequestRate) {
        self.req_rate = Some(req_rate);
    }

    /// Returns a copy of the request rate, if one was set.
    pub(crate) fn get_req_rate(&self) -> Option<RequestRate> {
        self.req_rate.clone()
    }

    /// Returns `true` when one of the user-agent names is the wildcard `*`.
    pub(crate) fn is_default(&self) -> bool {
        self.user_agents.iter().any(|stored| stored.as_str() == "*")
    }
}

impl Default for Group {
    fn default() -> Group {
        Group::new()
    }
}

35
src/model/path.rs Normal file
View file

@ -0,0 +1,35 @@
use url::Url;
use percent_encoding::percent_decode;
/// Percent-decoded text of a URL with its leading part removed by `get_url_without_origin`.
#[derive(Debug)]
pub struct Path(String);

impl Path {
    /// Builds a path from `url`; an empty result is replaced by `/`.
    /// Bytes that do not form valid UTF-8 after percent-decoding are decoded lossily.
    pub fn from_url(url: &Url) -> Path {
        let raw = get_url_without_origin(url);
        let decoded = percent_decode(raw.as_bytes()).decode_utf8_lossy();
        if decoded.is_empty() {
            Path(String::from("/"))
        } else {
            Path(decoded.into_owned())
        }
    }

    /// Returns the decoded path text.
    pub fn as_str(&self) -> &str {
        &self.0
    }
}
fn get_url_without_origin(url: &Url) -> &str {
let origin = url.origin();
let url = url.as_str();
let unicode_origin = origin.unicode_serialization();
let ascii_origin = origin.ascii_serialization();
if url.starts_with(&unicode_origin) && unicode_origin.len() >= 1 {
return &url[unicode_origin.len()..];
}
if url.starts_with(&ascii_origin) && ascii_origin.len() >= 1 {
return &url[ascii_origin.len()..];
}
// Must never be executed.
panic!("Unable to get path from url");
}

127
src/model/path_pattern.rs Normal file
View file

@ -0,0 +1,127 @@
use std::convert::From;
use std::mem::replace;
use percent_encoding::percent_decode;
use crate::model::path::Path;
/// Path pattern of a robots.txt rule, stored as a sequence of tokens.
#[derive(Debug, Clone)]
pub struct PathPattern(Vec<PathPatternToken>);
/// Single element of a parsed path pattern.
#[derive(Debug, Eq, PartialEq, Clone)]
enum PathPatternToken {
    /// Literal text (percent-decoded).
    Text(String),
    /// Token produced for the `*` character.
    AnyString,
    /// Token produced for the `$` character.
    TerminateString,
}

impl PathPatternToken {
    /// Builds a text token from a raw pattern fragment, percent-decoding it
    /// (invalid UTF-8 is decoded lossily).
    fn from_path_pattern(path: String) -> PathPatternToken {
        let decoded = percent_decode(path.as_bytes())
            .decode_utf8_lossy()
            .into_owned();
        PathPatternToken::Text(decoded)
    }

    /// Length of the token: the byte length of the text, or 1 for `*` and `$`.
    fn len(&self) -> usize {
        match self {
            PathPatternToken::Text(text) => text.len(),
            PathPatternToken::AnyString | PathPatternToken::TerminateString => 1,
        }
    }
}
impl PathPattern {
    /// Parses a textual path pattern.
    ///
    /// `*` becomes a wildcard token and `$` an end-of-path token; every other run of
    /// characters becomes a text token. When the pattern ends with text, a trailing
    /// wildcard is appended so that the pattern behaves as a prefix match. Consecutive
    /// identical tokens are collapsed.
    pub fn new(path: &str) -> PathPattern {
        let mut text = String::new();
        let mut tokens = Vec::new();
        for c in path.chars() {
            let special = match c {
                '*' => PathPatternToken::AnyString,
                '$' => PathPatternToken::TerminateString,
                _ => {
                    text.push(c);
                    continue;
                }
            };
            // Flush the text collected so far before the special token.
            if !text.is_empty() {
                tokens.push(PathPatternToken::from_path_pattern(replace(&mut text, String::new())));
            }
            tokens.push(special);
        }
        if !text.is_empty() {
            tokens.push(PathPatternToken::from_path_pattern(text));
        }
        if let Some(&PathPatternToken::Text(..)) = tokens.last() {
            tokens.push(PathPatternToken::AnyString);
        }
        tokens.dedup();
        PathPattern(tokens)
    }

    /// Returns a pattern consisting of a single wildcard.
    pub fn all() -> PathPattern {
        PathPattern(vec![PathPatternToken::AnyString])
    }

    /// Checks whether `path` matches this pattern.
    ///
    /// A wildcard may stand for any (possibly empty) sequence of characters. When the
    /// tokens after a wildcard fail to match, the wildcard is extended by one character
    /// and matching resumes, so a later occurrence of the following text is found too
    /// (e.g. `/*.php$` matches `/a.php.php`). Only the most recent wildcard has to be
    /// revisited, which keeps the work bounded by `path length * token count`.
    ///
    /// As before, the match succeeds as soon as all tokens are consumed, even if part
    /// of `path` is left over.
    pub fn applies_to(&self, path: &Path) -> bool {
        let tokens = &self.0;
        let mut token_index = 0;
        let mut remaining = path.as_str();
        // Resume point of the most recent wildcard: the index of the token that follows
        // it and the text the wildcard currently starts at.
        let mut resume: Option<(usize, &str)> = None;
        loop {
            let matched = match tokens.get(token_index) {
                None => return true,
                Some(PathPatternToken::Text(text)) => {
                    if remaining.starts_with(text.as_str()) {
                        remaining = &remaining[text.len()..];
                        true
                    } else {
                        false
                    }
                }
                Some(PathPatternToken::AnyString) => {
                    resume = Some((token_index + 1, remaining));
                    true
                }
                Some(PathPatternToken::TerminateString) => remaining.is_empty(),
            };
            if matched {
                token_index += 1;
                continue;
            }
            // Mismatch: let the last wildcard swallow one more character and retry.
            let (resume_index, resume_text) = match resume {
                Some(point) => point,
                None => return false,
            };
            let mut chars = resume_text.chars();
            if chars.next().is_none() {
                return false;
            }
            remaining = chars.as_str();
            resume = Some((resume_index, remaining));
            token_index = resume_index;
        }
    }

    /// Returns the sum of the lengths of all tokens.
    pub fn len(&self) -> usize {
        self.0.iter().map(PathPatternToken::len).sum()
    }
}
impl From<&str> for PathPattern {
fn from(path: &str) -> Self {
return PathPattern::new(path);
}
}

View file

@ -0,0 +1,9 @@
#[derive(Debug, Clone)]
/// The model of limiting the frequency of requests to the server.
/// It's set by the `Request-Rate` directive.
/// # Example
/// The directive `Request-Rate: 1/5` is equivalent to the model `RequestRate {requests: 1, seconds: 5}`.
pub struct RequestRate {
/// Number of requests (the part of the directive value before `/`).
pub requests: usize,
/// Number of seconds (the part of the directive value after `/`).
pub seconds: usize,
}

75
src/model/robots_txt.rs Normal file
View file

@ -0,0 +1,75 @@
use crate::model::group::Group;
use crate::model::clean_params::CleanParams;
use url::{Url, Origin};
/// The robots.txt model that was obtained after parsing the text of the robots.txt file.
/// To work with this model you should use the trait `robotparser::service::RobotsTxtService`.
/// To create this structure you should use the `robotparser::parser::parse_robots_txt`.
#[derive(Debug, Clone)]
pub struct RobotsTxt {
    origin: Origin,
    groups: Vec<Group>,
    sitemaps: Vec<Url>,
    clean_params: Vec<CleanParams>,
}

impl RobotsTxt {
    /// Creates an empty model for the given origin.
    pub(crate) fn new(origin: Origin) -> RobotsTxt {
        RobotsTxt {
            origin,
            groups: Vec::new(),
            sitemaps: Vec::new(),
            clean_params: Vec::new(),
        }
    }

    /// Appends a sitemap URL.
    pub(crate) fn add_sitemap(&mut self, url: Url) {
        self.sitemaps.push(url);
    }

    /// Returns the sitemap URLs in insertion order.
    pub(crate) fn get_sitemaps_slice(&self) -> &[Url] {
        &self.sitemaps
    }

    /// Appends a clean-params entry.
    pub(crate) fn add_clean_params(&mut self, clean_params: CleanParams) {
        self.clean_params.push(clean_params);
    }

    /// Returns the clean-params entries in insertion order.
    pub(crate) fn get_clean_params(&self) -> &[CleanParams] {
        &self.clean_params
    }

    /// Appends a group.
    pub(crate) fn add_group(&mut self, group: Group) {
        self.groups.push(group);
    }

    /// Returns the origin this model belongs to.
    pub(crate) fn get_origin(&self) -> &Origin {
        &self.origin
    }

    /// Runs `callback` on every group that applies to `user_agent`, in insertion order,
    /// and returns the first `Some` it produces. If no such group yields a value, the
    /// default group (if any) is tried last.
    pub(crate) fn find_in_group<'a, T>(&'a self, user_agent: &str, callback: impl Fn(&'a Group) -> Option<T>) -> Option<T> {
        self.groups
            .iter()
            .filter(|group| group.applies_to(user_agent))
            .find_map(|group| callback(group))
            .or_else(|| self.get_default_group().and_then(|group| callback(group)))
    }

    /// Returns the first group that is marked as default.
    pub(crate) fn get_default_group(&self) -> Option<&Group> {
        self.groups.iter().find(|group| group.is_default())
    }
}

31
src/model/rule.rs Normal file
View file

@ -0,0 +1,31 @@
use crate::model::path_pattern::PathPattern;
use crate::model::path::Path;
/// A rule is a single `Allow:` (`allowance == true`) or `Disallow:`
/// (`allowance == false`) line followed by a path pattern.
#[derive(Debug, Clone)]
pub struct Rule {
    path_pattern: PathPattern,
    allowance: bool,
}

impl Rule {
    /// Creates a rule from anything convertible into a `PathPattern`.
    pub fn new(path_pattern: impl Into<PathPattern>, allowance: bool) -> Rule {
        let path_pattern = path_pattern.into();
        Rule { path_pattern, allowance }
    }

    /// Checks whether the rule's pattern matches `path`.
    pub(crate) fn applies_to(&self, path: &Path) -> bool {
        self.path_pattern.applies_to(path)
    }

    /// Returns `true` for an allowing rule and `false` for a disallowing one.
    pub(crate) fn get_allowance(&self) -> bool {
        self.allowance
    }

    /// Returns the rule's path pattern.
    pub(crate) fn get_path_pattern(&self) -> &PathPattern {
        &self.path_pattern
    }
}

40
src/parser.rs Normal file
View file

@ -0,0 +1,40 @@
//! # Supported features and directives
//!
//! * Removes BOM unicode
//! * Directive `User-Agent`
//! * Directive `Allow`
//! * Directive `Disallow`
//! * Directive `Crawl-Delay`
//! * Directive `Request-Rate`
//! * Directive `Sitemap`
//! * Directive `Clean-Param`
//!
//! # Example
//! ```rust
//! use robotparser::parser::parse_robots_txt;
//! use robotparser::service::RobotsTxtService;
//! use url::Url;
//!
//! fn main() {
//! let robots_txt_url = Url::parse("http://google.com/robots.txt").unwrap();
//! let robots_txt = "User-agent: *\nDisallow: /search";
//! let robots_txt = parse_robots_txt(robots_txt_url.origin(), robots_txt);
//! assert_eq!(robots_txt.get_warnings().len(), 0);
//! let robots_txt = robots_txt.get_result();
//! let good_url = Url::parse("http://google.com/test").unwrap();
//! let bad_url = Url::parse("http://google.com/search/vvv").unwrap();
//! assert_eq!(robots_txt.can_fetch("*", &bad_url), false);
//! assert_eq!(robots_txt.can_fetch("*", &good_url), true);
//! }
//! ```
mod robots_txt_parser;
pub use self::robots_txt_parser::parse as parse_robots_txt;
mod warning_reason;
pub use self::warning_reason::WarningReason;
mod warning;
pub use self::warning::ParseWarning;
mod parse_result;
pub use self::parse_result::ParseResult;
mod fetched_robots_txt_parser;
pub use self::fetched_robots_txt_parser::parse as parse_fetched_robots_txt;
mod line;

View file

@ -0,0 +1,28 @@
use crate::model::{FetchedRobotsTxt, FetchedRobotsTxtContainer};
use crate::parser::ParseResult;
use crate::parser::parse_robots_txt;
use url::Origin;
const UNAUTHORIZED: u16 = 401;
const FORBIDDEN: u16 = 403;
const OK: u16 = 200;
/// Parses the text of the robots.txt file located in the specified place of origin,
/// taking into account the response status code of the HTTP-request.
///
/// * `200`: `input` is parsed and the result is wrapped as fetched.
/// * `401` / `403`: the result is marked as denied; `input` is ignored.
/// * any other code: the result is marked as failed; `input` is ignored.
///
/// **IMPORTANT NOTE**: origin must point to robots.txt url **before redirects**.
pub fn parse(origin: Origin, status_code: u16, input: &str) -> ParseResult<FetchedRobotsTxt> {
    let container = match status_code {
        OK => {
            // Only a successful response carries text worth parsing; parser warnings are kept.
            return parse_robots_txt(origin, input).map(|robots_txt| {
                FetchedRobotsTxt::new(FetchedRobotsTxtContainer::Fetched(robots_txt))
            });
        }
        UNAUTHORIZED | FORBIDDEN => FetchedRobotsTxtContainer::FetchDenied,
        _ => FetchedRobotsTxtContainer::FetchFailed,
    };
    ParseResult::new(FetchedRobotsTxt::new(container))
}

21
src/parser/line.rs Normal file
View file

@ -0,0 +1,21 @@
/// A borrowed line of text together with its position.
pub struct Line<'a> {
    line: &'a str,
    position: usize,
}

impl<'a> Line<'a> {
    /// Creates a line from its text and position.
    pub fn new(line: &'a str, position: usize) -> Line<'a> {
        Line { line, position }
    }

    /// Returns the text of the line.
    pub fn get_line_text(&self) -> &str {
        self.line
    }

    /// Returns the position that was passed to `new`.
    pub fn get_line_number(&self) -> usize {
        self.position
    }
}

View file

@ -0,0 +1,62 @@
use crate::parser::warning::ParseWarning;
use std::fmt::Debug;
/// The result of the robots.txt parser.
#[derive(Debug)]
pub struct ParseResult<R> where R: Debug {
    result: R,
    warnings: Vec<ParseWarning>,
}

impl<R> ParseResult<R> where R: Debug {
    /// Creates a new structure for parser results.
    pub(crate) fn new(result: R) -> ParseResult<R> {
        ParseResult {
            result,
            warnings: Vec::new(),
        }
    }

    /// Creates a new structure for parser results with warnings.
    pub(crate) fn new_with_warnings(result: R, warnings: Vec<ParseWarning>) -> ParseResult<R> {
        ParseResult { result, warnings }
    }

    /// Returns the result of the robots.txt parser.
    pub fn get_result(self) -> R {
        self.result
    }

    /// Returns the robots.txt parser warning array.
    pub fn get_warnings(&self) -> &[ParseWarning] {
        &self.warnings
    }

    /// Returns reference to result of the robots.txt parser or first warning.
    pub fn ok_ref(&self) -> Result<&R, &ParseWarning> {
        match self.warnings.first() {
            Some(warning) => Err(warning),
            None => Ok(&self.result),
        }
    }

    /// Returns the result of the robots.txt parser or first warning.
    pub fn ok(self) -> Result<R, ParseWarning> {
        match self.warnings.into_iter().next() {
            Some(first_warning) => Err(first_warning),
            None => Ok(self.result),
        }
    }

    /// Converts this structure into another type of structure, keeping the warnings.
    ///
    /// The callback is invoked exactly once, so any `FnOnce` closure is accepted
    /// (including closures that move captured values out).
    pub(crate) fn map<T>(self, callback: impl FnOnce(R) -> T) -> ParseResult<T> where T: Debug {
        ParseResult {
            result: callback(self.result),
            warnings: self.warnings,
        }
    }
}

View file

@ -0,0 +1,281 @@
use url::{Origin, Url};
use std::time::Duration;
use crate::parser::parse_result::ParseResult;
use crate::model::{RobotsTxt, Rule, PathPattern, CleanParams, RequestRate};
use crate::parser::line::Line;
use crate::parser::warning::ParseWarning;
mod directive;
use self::directive::Directive;
mod group_builder;
pub use self::group_builder::GroupBuilder;
const COMMENT_BEGIN_CHAR: char = '#';
const KV_SEPARATOR: &'static str = ":";
/// Parses `input` as the text of the robots.txt file located in the specified `origin`.
///
/// Problems found while parsing are collected as warnings in the returned `ParseResult`.
pub fn parse(origin: Origin, input: &str) -> ParseResult<RobotsTxt> {
    Parser::new(origin).parse(input)
}
/// State of a single robots.txt parsing run.
struct Parser {
// Model being filled in: receives sitemaps and clean-params directly and the groups at the end.
result: RobotsTxt,
// Collects `User-agent` groups and their rules while lines are processed.
group_builder: GroupBuilder,
// Warnings in the order in which they were produced.
warnings: Vec<ParseWarning>,
}
impl Parser {
/// Creates a parser holding a fresh `RobotsTxt` for `origin`, a fresh group builder
/// and an empty warning list.
pub fn new(origin: Origin) -> Parser {
    let result = RobotsTxt::new(origin);
    let group_builder = GroupBuilder::new();
    Parser {
        result,
        group_builder,
        warnings: Vec::new(),
    }
}
pub fn parse(mut self, input: &str) -> ParseResult<RobotsTxt> {
let input = ignore_bom(input);
let mut line_no = 0;
for line in input.lines() {
line_no += 1;
let line = Line::new(line, line_no);
match Self::parse_line(&line) {
Ok(Some(line_value)) => {
self.process_line_value(&line, &line_value);
},
Err(warning) => {
self.warnings.push(warning);
},
_ => {},
}
}
self.group_builder.fill_entries(&mut self.result);
return ParseResult::new_with_warnings(self.result, self.warnings);
}
/// Splits one line into a `key: value` directive.
///
/// Returns `Ok(None)` when the line carries no directive: it is empty, contains
/// only whitespace, or holds nothing but a (possibly indented) comment.
/// Returns a warning when the remaining text has no `:` separator or when the
/// key in front of the separator is empty.
/// Key and value are trimmed; only the first `:` is treated as the separator,
/// so values such as URLs may contain further colons.
fn parse_line<'a>(line: &'a Line) -> Result<Option<Directive<'a>>, ParseWarning> {
    let text = line.get_line_text();
    // Everything starting at the first comment character is ignored.
    let kv_part = match text.find(COMMENT_BEGIN_CHAR) {
        Some(comment_start) => &text[..comment_start],
        None => text,
    };
    // Trim before testing so that blank lines with stray whitespace and indented
    // comments are skipped instead of being reported as malformed directives.
    if kv_part.trim().is_empty() {
        return Ok(None);
    }
    let separator_index = match kv_part.find(KV_SEPARATOR) {
        Some(index) => index,
        None => return Err(ParseWarning::invalid_directive_format(line)),
    };
    let key = kv_part[..separator_index].trim();
    if key.is_empty() {
        return Err(ParseWarning::directive_key_is_empty(line));
    }
    let value = kv_part[separator_index + KV_SEPARATOR.len()..].trim();
    Ok(Some(Directive::new(key, value)))
}
/// Dispatches a directive to its handler, chosen by the lower-cased key.
///
/// Keys without a handler are reported as unsupported.
fn process_line_value(&mut self, line: &Line, directive: &Directive) {
    let key = directive.get_key_lowercase();
    match key.as_str() {
        // Directives that belong to a `User-agent` group.
        "user-agent" => self.process_directive_user_agent(line, directive),
        "allow" => self.process_directive_allow(line, directive),
        "disallow" => self.process_directive_disallow(line, directive),
        "crawl-delay" => self.process_directive_crawl_delay(line, directive),
        "request-rate" => self.process_directive_request_rate(line, directive),
        // Directives that apply to the whole file.
        "sitemap" => self.process_directive_sitemap(line, directive),
        "clean-param" => self.process_directive_clean_param(line, directive),
        _ => self.warnings.push(ParseWarning::unsupported_directive_key(line, key)),
    }
}
/// Handles `User-agent`: an empty value only produces a warning, any other
/// value is handed to the group builder.
fn process_directive_user_agent(&mut self, line: &Line, directive: &Directive) {
    match directive.get_value() {
        "" => self.warnings.push(ParseWarning::user_agent_cannot_be_empty(line)),
        user_agent => self.group_builder.handle_user_agent(user_agent),
    }
}
/// Handles `Allow`: adds an allowing rule to the active group.
///
/// Warns when no group is active or when the path starts with neither `*` nor `/`.
/// An empty value is ignored.
fn process_directive_allow(&mut self, line: &Line, directive: &Directive) {
    let group = match self.group_builder.get_mut_active_group() {
        Some(group) => group,
        None => {
            self.warnings.push(ParseWarning::directive_without_user_agent(line));
            return;
        }
    };
    let path = directive.get_value();
    if path.is_empty() {
        // Nothing to do. Ignoring.
        return;
    }
    if path.starts_with('*') || path.starts_with('/') {
        group.push_rule(Rule::new(path, true));
    } else {
        self.warnings.push(ParseWarning::wrong_path_format(line));
    }
}
/// Handles `Disallow`: adds a denying rule to the active group.
///
/// An empty value means "allow all" and is stored as an allowing rule for
/// `PathPattern::all()`. Warns when no group is active or when the path starts
/// with neither `*` nor `/`.
fn process_directive_disallow(&mut self, line: &Line, directive: &Directive) {
    let group = match self.group_builder.get_mut_active_group() {
        Some(group) => group,
        None => {
            self.warnings.push(ParseWarning::directive_without_user_agent(line));
            return;
        }
    };
    let path = directive.get_value();
    if path.is_empty() {
        // Allow all.
        group.push_rule(Rule::new(PathPattern::all(), true));
        return;
    }
    if path.starts_with('*') || path.starts_with('/') {
        group.push_rule(Rule::new(path, false));
    } else {
        self.warnings.push(ParseWarning::wrong_path_format(line));
    }
}
/// Handles `Crawl-delay`: parses the value as a floating point number of seconds
/// and stores it in the active group as a `Duration`.
///
/// Warns when no group is active or when the value is not a valid `f64`.
fn process_directive_crawl_delay(&mut self, line: &Line, directive: &Directive) {
    let group = match self.group_builder.get_mut_active_group() {
        Some(group) => group,
        None => {
            self.warnings.push(ParseWarning::directive_without_user_agent(line));
            return;
        }
    };
    let seconds = match directive.get_value().parse::<f64>() {
        Ok(seconds) => seconds,
        Err(error) => {
            self.warnings.push(ParseWarning::parse_crawl_delay_error(line, error));
            return;
        }
    };
    // The integral part becomes whole seconds, the fractional part nanoseconds.
    let whole_seconds = seconds.trunc() as u64;
    let nanoseconds = (seconds.fract() * 10f64.powi(9)) as u32;
    group.set_crawl_delay(Duration::new(whole_seconds, nanoseconds));
}
/// Handles `Request-rate`: expects `<requests>/<seconds>` with both parts being
/// unsigned integers and stores them in the active group.
///
/// Warns when no group is active, when the value does not consist of exactly
/// two `/`-separated parts, or when one of the parts is not a valid `usize`.
fn process_directive_request_rate(&mut self, line: &Line, directive: &Directive) {
    let group = match self.group_builder.get_mut_active_group() {
        Some(group) => group,
        None => {
            self.warnings.push(ParseWarning::directive_without_user_agent(line));
            return;
        }
    };
    let mut parts = directive.get_value().split('/');
    let (requests_text, seconds_text) = match (parts.next(), parts.next(), parts.next()) {
        (Some(requests_text), Some(seconds_text), None) => (requests_text, seconds_text),
        _ => {
            self.warnings.push(ParseWarning::wrong_request_rate_format(line));
            return;
        }
    };
    let requests = match requests_text.parse::<usize>() {
        Ok(requests) => requests,
        Err(error) => {
            self.warnings.push(ParseWarning::parse_request_rate(line, error));
            return;
        }
    };
    let seconds = match seconds_text.parse::<usize>() {
        Ok(seconds) => seconds,
        Err(error) => {
            self.warnings.push(ParseWarning::parse_request_rate(line, error));
            return;
        }
    };
    group.set_req_rate(RequestRate { requests, seconds });
}
/// Handles `Sitemap`: stores the value in the model when it parses as a URL,
/// otherwise records the parse error as a warning.
fn process_directive_sitemap(&mut self, line: &Line, directive: &Directive) {
    let sitemap_url = match Url::parse(directive.get_value()) {
        Ok(sitemap_url) => sitemap_url,
        Err(error) => {
            self.warnings.push(ParseWarning::parse_url(line, error));
            return;
        }
    };
    self.result.add_sitemap(sitemap_url);
}
/// Handles `Clean-param: <param>[&<param>...] [<path>]`.
///
/// The first whitespace-separated token is the `&`-separated parameter list,
/// the optional second token is the path pattern the parameters apply to
/// (for example `Clean-param: ref1&ref2 /some_dir/get_book.pl`). Without a
/// path the parameters apply to `PathPattern::all()`.
///
/// Warns with `WrongCleanParamFormat` when the value is empty or has more than
/// two tokens, and with `IgnoredCleanParams` when some parameter names contain
/// characters that are not allowed; the valid names are still stored.
fn process_directive_clean_param(&mut self, line: &Line, directive: &Directive) {
    // `split_whitespace` never yields empty tokens, so only the token count needs checking.
    let mut parts = directive.get_value().split_whitespace();
    let (clean_params, path_pattern) = match (parts.next(), parts.next(), parts.next()) {
        (Some(params), None, _) => (params, PathPattern::all()),
        (Some(params), Some(path), None) => (params, PathPattern::new(path)),
        _ => {
            self.warnings.push(ParseWarning::wrong_clean_param_format(line));
            return;
        }
    };
    let (valid_clean_params, invalid_clean_params) = Self::parse_clean_params(clean_params);
    if !invalid_clean_params.is_empty() {
        self.warnings.push(ParseWarning::ignored_clean_params(line, invalid_clean_params));
    }
    self.result.add_clean_params(CleanParams::new(path_pattern, valid_clean_params));
}
/// Splits an `&`-separated parameter list into `(valid, invalid)` names.
///
/// Empty segments are dropped; the relative order of the names is preserved
/// in both lists.
fn parse_clean_params(clean_params: &str) -> (Vec<String>, Vec<String>) {
    clean_params
        .split('&')
        .filter(|clean_param| !clean_param.is_empty())
        .map(String::from)
        .partition(|clean_param| Self::is_valid_clean_param(clean_param))
}
/// Tells whether `clean_param` consists only of ASCII letters, ASCII digits,
/// `.`, `-` and `_`.
///
/// The boundary characters `Z`, `z` and `9` are accepted as well.
/// An empty string is considered valid.
fn is_valid_clean_param(clean_param: &str) -> bool {
    clean_param
        .chars()
        .all(|c| c.is_ascii_alphanumeric() || c == '.' || c == '-' || c == '_')
}
}
/// Returns `input` without one leading byte order mark (`U+FEFF`), if it has one.
fn ignore_bom(input: &str) -> &str {
    const BOM: &'static str = "\u{feff}";
    if !input.starts_with(BOM) {
        return input;
    }
    let (_bom, rest) = input.split_at(BOM.len());
    rest
}

View file

@ -0,0 +1,21 @@
/// A `key: value` pair borrowed from a line of text.
pub struct Directive<'a> {
    key: &'a str,
    value: &'a str,
}

impl<'a> Directive<'a> {
    /// Stores `key` and `value` as they are; no trimming or case conversion happens here.
    pub fn new(key: &'a str, value: &'a str) -> Directive<'a> {
        Directive { key, value }
    }

    /// Returns a lower-cased copy of the key.
    pub fn get_key_lowercase(&self) -> String {
        str::to_lowercase(self.key)
    }

    /// Returns the value exactly as it was given to `new`.
    pub fn get_value(&self) -> &str {
        self.value
    }
}

View file

@ -0,0 +1,54 @@
use crate::model::{Group, RobotsTxt};
/// Tells how the next `User-agent` line has to be treated.
enum State {
    /// The next user agent opens a new group.
    WaitingForNewGroup,
    /// The next user agent is added to the active group.
    WaitingForAdditionalUserAgent,
}

/// Collects groups of user agents while directives are processed in file order.
pub struct GroupBuilder {
    state: State,
    active_group: Option<usize>,
    groups: Vec<Group>,
}

impl GroupBuilder {
    /// Creates a builder without groups that waits for the first user agent.
    pub fn new() -> GroupBuilder {
        GroupBuilder {
            state: State::WaitingForNewGroup,
            active_group: None,
            groups: Vec::new(),
        }
    }

    /// Registers a user agent.
    ///
    /// Consecutive user agents share one group; a user agent that follows a call to
    /// `get_mut_active_group` opens a new group, which then becomes the active one.
    /// A user agent already contained in the active group is not added twice.
    pub fn handle_user_agent(&mut self, user_agent: &str) {
        if let State::WaitingForNewGroup = self.state {
            let mut group = Group::new();
            group.push_useragent(user_agent);
            // The new group is appended, so its index equals the current length.
            self.active_group = Some(self.groups.len());
            self.groups.push(group);
            self.state = State::WaitingForAdditionalUserAgent;
            return;
        }
        let index = self.active_group.expect("Unable to get active group");
        let group = self.groups.get_mut(index).expect("Unable to get group index");
        if !group.contains_user_agent(user_agent) {
            group.push_useragent(user_agent);
        }
    }

    /// Returns the active group, or `None` when no user agent has been registered yet.
    ///
    /// Calling this method ends the run of consecutive user agents: the next
    /// user agent will open a new group.
    pub fn get_mut_active_group(&mut self) -> Option<&mut Group> {
        self.state = State::WaitingForNewGroup;
        let index = self.active_group?;
        self.groups.get_mut(index)
    }

    /// Consumes the builder and adds every collected group to `robots_txt`
    /// in the order in which the groups were created.
    pub fn fill_entries(self, robots_txt: &mut RobotsTxt) {
        for group in self.groups {
            robots_txt.add_group(group);
        }
    }
}

136
src/parser/warning.rs Normal file
View file

@ -0,0 +1,136 @@
use super::line::Line;
use super::warning_reason::WarningReason;
use url::ParseError as ParseUrlError;
use std::num::{ParseFloatError, ParseIntError};
use std::fmt;
use std::error::Error;
#[derive(Clone, Debug)]
/// Warning of robots.txt parser about problems when parsing robots.txt file.
pub struct ParseWarning {
line_no: usize,
line: String,
reason: WarningReason,
}
impl Error for ParseWarning {}
impl ParseWarning {
/// Returns the line number in the text of the robots.txt file.
pub fn get_line_no(&self) -> usize {
return self.line_no;
}
/// Returns the text of the robots.txt file string.
pub fn get_line_text(&self) -> &String {
return &self.line;
}
/// Returns the reason of warning.
pub fn get_reason(&self) -> &WarningReason {
return &self.reason;
}
pub (crate) fn invalid_directive_format(line: &Line) -> ParseWarning {
return ParseWarning {
line_no: line.get_line_number(),
line: line.get_line_text().into(),
reason: WarningReason::InvalidDirectiveFormat,
}
}
pub (crate) fn directive_key_is_empty(line: &Line) -> ParseWarning {
return ParseWarning {
line_no: line.get_line_number(),
line: line.get_line_text().into(),
reason: WarningReason::DirectiveKeyIsEmpty,
}
}
pub (crate) fn unsupported_directive_key(line: &Line, key: String) -> ParseWarning {
return ParseWarning {
line_no: line.get_line_number(),
line: line.get_line_text().into(),
reason: WarningReason::UnsupportedDirectiveKey(key),
}
}
pub (crate) fn user_agent_cannot_be_empty(line: &Line) -> ParseWarning {
return ParseWarning {
line_no: line.get_line_number(),
line: line.get_line_text().into(),
reason: WarningReason::UserAgentCannotBeEmpty,
}
}
pub (crate) fn wrong_path_format(line: &Line) -> ParseWarning {
return ParseWarning {
line_no: line.get_line_number(),
line: line.get_line_text().into(),
reason: WarningReason::WrongPathFormat,
}
}
pub (crate) fn directive_without_user_agent(line: &Line) -> ParseWarning {
return ParseWarning {
line_no: line.get_line_number(),
line: line.get_line_text().into(),
reason: WarningReason::DirectiveWithoutUserAgent,
}
}
pub (crate) fn parse_crawl_delay_error(line: &Line, error: ParseFloatError) -> ParseWarning {
return ParseWarning {
line_no: line.get_line_number(),
line: line.get_line_text().into(),
reason: WarningReason::ParseCrawlDelayError(error),
}
}
pub (crate) fn wrong_request_rate_format(line: &Line) -> ParseWarning {
return ParseWarning {
line_no: line.get_line_number(),
line: line.get_line_text().into(),
reason: WarningReason::WrongRequestRateFormat,
}
}
pub (crate) fn parse_request_rate(line: &Line, error: ParseIntError) -> ParseWarning {
return ParseWarning {
line_no: line.get_line_number(),
line: line.get_line_text().into(),
reason: WarningReason::ParseRequestRate(error),
}
}
pub (crate) fn parse_url(line: &Line, error: ParseUrlError) -> ParseWarning {
return ParseWarning {
line_no: line.get_line_number(),
line: line.get_line_text().into(),
reason: WarningReason::ParseUrl(error),
}
}
pub (crate) fn wrong_clean_param_format(line: &Line) -> ParseWarning {
return ParseWarning {
line_no: line.get_line_number(),
line: line.get_line_text().into(),
reason: WarningReason::WrongCleanParamFormat,
}
}
pub (crate) fn ignored_clean_params(line: &Line, ignored_clean_params: Vec<String>) -> ParseWarning {
return ParseWarning {
line_no: line.get_line_number(),
line: line.get_line_text().into(),
reason: WarningReason::IgnoredCleanParams(ignored_clean_params),
}
}
}
/// Displays text of warning.
impl fmt::Display for ParseWarning {
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
write!(f, "Line: {}. Text: `{}`. {}", self.line_no, self.line, self.reason)
}
}

View file

@ -0,0 +1,80 @@
use url::ParseError as ParseUrlError;
use std::num::{ParseFloatError, ParseIntError};
use std::fmt;
#[derive(Clone, Debug)]
/// Reason of a robots.txt parser warning: describes what was wrong with a line of the robots.txt file.
pub enum WarningReason {
/// Invalid directive format: the line has no `:` separator. Invalid directive example: `` ` ``
InvalidDirectiveFormat,
/// Directive key is empty. Invalid directive example: `: <Value>`
DirectiveKeyIsEmpty,
/// Directive key is not supported by this parser. Carries the lower-cased key.
UnsupportedDirectiveKey(String),
/// Passed directive key is `User-Agent` and passed value is empty.
UserAgentCannotBeEmpty,
/// This directive cannot be processed because no `User-Agent` directive has been processed before it.
DirectiveWithoutUserAgent,
/// It is impossible to process the `Crawl-Delay` directive because of an error when parsing a floating point number.
ParseCrawlDelayError(ParseFloatError),
/// Incorrect format of the `Request-Rate` directive: the value does not have exactly two `/`-separated parts.
/// Example of the correct format: `Request-rate: 1/5`
WrongRequestRateFormat,
/// One of the numbers of the `Request-Rate` directive could not be parsed as an unsigned integer.
/// Example of the correct format: `Request-rate: 1/5`
ParseRequestRate(ParseIntError),
/// Parsing URL error.
ParseUrl(ParseUrlError),
/// Incorrect format of the `Clean-Param` directive: the value is empty or has too many parts.
/// Parameters may only contain the characters `A-Za-z0-9.-_`.
/// Example of the correct format: `Clean-param: ref1&ref2 /some_dir/get_book.pl`
WrongCleanParamFormat,
/// Some parameters of the `Clean-Param` directive contain wrong symbols and were ignored.
/// Carries the ignored parameters.
/// Parameters may only contain the characters `A-Za-z0-9.-_`.
/// Example of the correct format: `Clean-param: ref1&ref2 /some_dir/get_book.pl`
IgnoredCleanParams(Vec<String>),
/// Error in URL path format: the path of an `Allow`/`Disallow` directive starts with neither `*` nor `/`.
WrongPathFormat,
}
/// Displays a human readable description of the warning reason.
impl fmt::Display for WarningReason {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Self::InvalidDirectiveFormat => f.write_str("Invalid directive format."),
            Self::DirectiveKeyIsEmpty => f.write_str("Directive key is empty."),
            Self::UnsupportedDirectiveKey(key) => {
                write!(f, "Directive key `{}` is not suppored by this parser.", key)
            }
            Self::UserAgentCannotBeEmpty => {
                f.write_str("Passed directive key is `User-Agent` and passed value is empty.")
            }
            Self::DirectiveWithoutUserAgent => {
                f.write_str("It is impossible to process this directive before `User-Agent` directive has not been processed.")
            }
            Self::ParseCrawlDelayError(err) => {
                write!(f, "It is impossible to process the `Crawl-Delay` directive because of an error when parsing a floating point number: {}", err)
            }
            Self::WrongRequestRateFormat => {
                f.write_str("Incorrect format of the `Request-Rate` directive")
            }
            Self::ParseRequestRate(err) => {
                write!(f, "Incorrect format of the `Request-Rate` directive: {}", err)
            }
            Self::ParseUrl(err) => write!(f, "Parsing URL error: {}", err),
            Self::WrongCleanParamFormat => {
                f.write_str("Incorrect format of the `Clean-Param` directive.")
            }
            Self::IgnoredCleanParams(params) => {
                write!(f, "Directive `Clean-Param` directive has incorrect parameters: {:?}", params)
            }
            Self::WrongPathFormat => f.write_str("Error in URL path format."),
        }
    }
}

30
src/service.rs Normal file
View file

@ -0,0 +1,30 @@
mod robots_txt;
mod fetched_robots_txt;
use url::Url;
use std::time::Duration;
use crate::model::RequestRate;
/// Operations that answer questions about a robots.txt file:
/// access rules, crawl delay, request rate, sitemaps and URL normalization.
pub trait RobotsTxtService {
/// Using the parsed robots.txt, decides whether `user_agent` can fetch `url`.
fn can_fetch(&self, user_agent: &str, url: &Url) -> bool;
/// Returns the crawl delay for this user agent as a `Duration`, or `None` if no crawl delay is defined.
fn get_crawl_delay(&self, user_agent: &str) -> Option<Duration>;
/// Removes the request parameters from the url that were listed in the `Clean-param` directive.
/// This method CHECKS that the origin of the passed url matches the origin of robots.txt.
/// Returns true if the operation was applied to the passed url.
/// In other cases it returns false.
fn normalize_url(&self, url: &mut Url) -> bool;
/// Removes the request parameters from the url that were listed in the `Clean-param` directive.
/// This method DOES NOT CHECK that the origin of the passed url matches the origin of robots.txt.
fn normalize_url_ignore_origin(&self, url: &mut Url);
/// Returns the list of sitemap URLs that have been listed in the robots.txt file.
fn get_sitemaps(&self) -> &[Url];
/// Returns the request rate restriction set for this user agent, or `None` if none is defined.
fn get_req_rate(&self, user_agent: &str) -> Option<RequestRate>;
}

View file

@ -0,0 +1,51 @@
use url::Url;
use std::time::Duration;
use crate::service::RobotsTxtService;
use crate::model::{FetchedRobotsTxt, FetchedRobotsTxtContainer};
use crate::model::RequestRate;
/// Delegates to the wrapped robots.txt when one was fetched and falls back to
/// fixed answers for the `FetchDenied` and `FetchFailed` containers.
impl RobotsTxtService for FetchedRobotsTxt {
    /// `FetchDenied` forbids every url, `FetchFailed` allows every url.
    fn can_fetch(&self, user_agent: &str, url: &Url) -> bool {
        match self.get_container() {
            FetchedRobotsTxtContainer::Fetched(robots_txt) => robots_txt.can_fetch(user_agent, url),
            FetchedRobotsTxtContainer::FetchDenied => false,
            FetchedRobotsTxtContainer::FetchFailed => true,
        }
    }

    /// Without a fetched robots.txt there is no crawl delay.
    fn get_crawl_delay(&self, user_agent: &str) -> Option<Duration> {
        match self.get_container() {
            FetchedRobotsTxtContainer::Fetched(robots_txt) => robots_txt.get_crawl_delay(user_agent),
            _ => None,
        }
    }

    /// Without a fetched robots.txt the url is left untouched and `true` is returned.
    fn normalize_url(&self, url: &mut Url) -> bool {
        match self.get_container() {
            FetchedRobotsTxtContainer::Fetched(robots_txt) => robots_txt.normalize_url(url),
            _ => true,
        }
    }

    /// Without a fetched robots.txt the url is left untouched.
    fn normalize_url_ignore_origin(&self, url: &mut Url) {
        match self.get_container() {
            FetchedRobotsTxtContainer::Fetched(robots_txt) => {
                robots_txt.normalize_url_ignore_origin(url);
            }
            _ => {}
        }
    }

    /// Without a fetched robots.txt the list of sitemaps is empty.
    fn get_sitemaps(&self) -> &[Url] {
        match self.get_container() {
            FetchedRobotsTxtContainer::Fetched(robots_txt) => robots_txt.get_sitemaps(),
            _ => &[],
        }
    }

    /// Without a fetched robots.txt there is no request rate.
    fn get_req_rate(&self, user_agent: &str) -> Option<RequestRate> {
        match self.get_container() {
            FetchedRobotsTxtContainer::Fetched(robots_txt) => robots_txt.get_req_rate(user_agent),
            _ => None,
        }
    }
}

84
src/service/robots_txt.rs Normal file
View file

@ -0,0 +1,84 @@
use url::Url;
use std::time::Duration;
use crate::service::RobotsTxtService;
use crate::model::RobotsTxt;
use crate::model::RequestRate;
use crate::model::Path;
impl RobotsTxtService for RobotsTxt {
    /// Urls of a foreign origin are never fetchable. Otherwise the first matching rule
    /// (rules are tried sorted by path length, longest first) of the group found for
    /// `user_agent` decides; without a matching rule fetching is allowed.
    fn can_fetch(&self, user_agent: &str, url: &Url) -> bool {
        if url.origin() != *self.get_origin() {
            return false;
        }
        let path = Path::from_url(url);
        let decision = self.find_in_group(user_agent, |group| {
            let rules = group.get_rules_sorted_by_path_len_desc();
            let allowance = rules
                .iter()
                .find(|rule| rule.applies_to(&path))
                .map(|rule| rule.get_allowance());
            allowance
        });
        // Empty robots.txt allows crawling. Everything that was not denied must be allowed.
        decision.unwrap_or(true)
    }

    /// Looks the crawl delay up in the group found for `user_agent`.
    fn get_crawl_delay(&self, user_agent: &str) -> Option<Duration> {
        self.find_in_group(user_agent, |group| group.get_crawl_delay())
    }

    /// Normalizes `url` only when it has the same origin as this robots.txt;
    /// the return value tells whether that was the case.
    fn normalize_url(&self, url: &mut Url) -> bool {
        let same_origin = url.origin() == *self.get_origin();
        if same_origin {
            self.normalize_url_ignore_origin(url);
        }
        same_origin
    }

    /// Rebuilds the query of `url` without the parameters named by every
    /// `Clean-param` entry whose path pattern applies to the url path.
    /// A url without a query is left untouched; a query that becomes empty is removed.
    fn normalize_url_ignore_origin(&self, url: &mut Url) {
        if url.query().is_none() {
            return;
        }
        let path = Path::from_url(url);
        let query_params_to_filter: Vec<String> = self
            .get_clean_params()
            .iter()
            .filter(|clean_params| clean_params.get_path_pattern().applies_to(&path))
            .flat_map(|clean_params| clean_params.get_params().iter().cloned())
            .collect();
        // The pairs are copied first because the query is rewritten in place below.
        let kept_pairs: Vec<(String, String)> = url
            .query_pairs()
            .map(|(key, value)| (key.into_owned(), value.into_owned()))
            .filter(|(key, _)| !query_params_to_filter.contains(key))
            .collect();
        {
            let mut query_pairs_mut = url.query_pairs_mut();
            query_pairs_mut.clear();
            for (key, value) in &kept_pairs {
                query_pairs_mut.append_pair(key, value);
            }
        }
        if url.query() == Some("") {
            url.set_query(None);
        }
    }

    /// Returns the sitemaps stored in this robots.txt.
    fn get_sitemaps(&self) -> &[Url] {
        self.get_sitemaps_slice()
    }

    /// Looks the request rate up in the group found for `user_agent`.
    fn get_req_rate(&self, user_agent: &str) -> Option<RequestRate> {
        self.find_in_group(user_agent, |group| group.get_req_rate())
    }
}

View file

@ -1,21 +1,22 @@
extern crate robotparser;
extern crate url;
use robotparser::RobotFileParser;
use robotparser::parser::parse_robots_txt;
use robotparser::service::RobotsTxtService;
use std::time::Duration;
use url::Url;
const AGENT: &'static str = "test_robotparser";
/// Parses `doc` as the robots.txt of `http://www.baidu.com` and asserts that `agent`
/// can fetch every entry of `good_urls` and none of the entries of `bad_urls`.
// NOTE(review): this body appears to show removed (`RobotFileParser`) and added
// (`parse_robots_txt`) diff lines together without +/- markers — confirm against the committed file.
fn robot_test(doc: &str, good_urls: Vec<&str>, bad_urls: Vec<&str>, agent: &str) {
let parser = RobotFileParser::new("http://www.baidu.com/robots.txt");
let lines: Vec<&str> = doc.split("\n").collect();
parser.parse(&lines);
let url = Url::parse("http://www.baidu.com/robots.txt").unwrap();
let parser = parse_robots_txt(url.origin(), doc).get_result();
for url in &good_urls {
assert!(parser.can_fetch(agent, url));
// Entries are paths; they are turned into absolute urls of the same host before the check.
let url = format!("http://www.baidu.com{}", url);
let url = Url::parse(&url).unwrap();
assert!(parser.can_fetch(agent, &url));
}
for url in &bad_urls {
assert!(!parser.can_fetch(agent, url));
let url = format!("http://www.baidu.com{}", url);
let url = Url::parse(&url).unwrap();
assert!(!parser.can_fetch(agent, &url));
}
}
@ -24,6 +25,19 @@ fn robot_test_simple(doc: &str, good_urls: Vec<&str>, bad_urls: Vec<&str>) {
robot_test(doc, good_urls, bad_urls, AGENT);
}
/// A robots.txt that starts with a byte order mark and uses `\r\n` line endings
/// must be handled like a plain one.
#[test]
fn test_robots_txt_rn_bom() {
    let robots_txt = concat!(
        "\u{feff}\r\n",
        "User-agent: *\r\n",
        "Disallow: /cyberworld/map/ # This is an infinite virtual URL space\r\n",
        "Disallow: /tmp/ # these will soon disappear\r\n",
        "Disallow: /foo.html\r\n",
    );
    let allowed = vec!["/", "/test.html"];
    let denied = vec!["/cyberworld/map/index.html", "/tmp/xxx", "/foo.html"];
    robot_test_simple(robots_txt, allowed, denied);
}
#[test]
fn test_robots_txt_1() {
@ -213,54 +227,72 @@ fn test_robots_txt_13() {
robot_test_simple(doc, good, bad);
}
/// Using patterns with `*` and `$` symbols.
///
/// Paths matched by an `Allow` pattern or by no pattern at all must be fetchable,
/// paths matched by a `Disallow` pattern must not be.
#[test]
fn test_robots_txt_14() {
// Most lines below end with `\n` but without a trailing `\`, so the source line break
// (and any indentation that follows it) is part of the string literal as well.
let doc = "\n\
User-agent: *\n
Allow: /*video.html\n
Allow: */?amp*\n
Disallow: */rss$\n
Disallow: */rss/$\n
Disallow: /rate/\n
";
let good = vec!["/rss/test", "/sdfvsdvs-sdfvsdv-video.html", "/rate"];
let bad = vec!["/rss", "/rss/", "/rate/", "/rate/0/9"];
robot_test_simple(doc, good, bad);
}
/// Fetches the robots.txt of `www.python.org` over HTTP and checks that the
/// robots.txt url itself can be fetched by any user agent.
// NOTE(review): this body appears to show removed (`RobotFileParser`) and added
// (`reqwest` request/response) diff lines together without +/- markers — confirm against the committed file.
#[cfg(feature = "http")]
#[test]
fn test_robots_txt_read() {
let parser = RobotFileParser::new("http://www.python.org/robots.txt");
parser.read();
assert!(parser.can_fetch("*", "http://www.python.org/robots.txt"));
use robotparser::http::{CreateRobotsTxtRequest, ParseRobotsTxtResponse};
use reqwest::{Client, Request};
let http_client = Client::new();
let url = Url::parse("http://www.python.org/robots.txt").unwrap();
let request = Request::create_robots_txt_request(url.origin());
let mut response = http_client.execute(request).unwrap();
let parser = response.parse_robots_txt_response().unwrap().get_result();
assert!(parser.can_fetch("*", &url));
}
/// A fractional `Crawl-delay: 2.35` must be reported for the `Yandex` user agent
/// as a `Duration` of 2 seconds and 350 milliseconds.
// NOTE(review): this body appears to show removed and added diff lines together
// without +/- markers — confirm against the committed file.
#[test]
fn test_robots_text_crawl_delay() {
let parser = RobotFileParser::new("http://www.python.org/robots.txt");
let robots_txt_url = Url::parse("http://www.python.org/robots.txt").unwrap();
let doc = "User-agent: Yandex\n\
Crawl-delay: 2.35\n\
Disallow: /search/\n";
let lines: Vec<&str> = doc.split("\n").collect();
parser.parse(&lines);
let parser = parse_robots_txt(robots_txt_url.origin(), doc).get_result();
assert_eq!(Duration::new(2,350 * 1000 * 1000), parser.get_crawl_delay("Yandex").unwrap());
}
/// Every `Sitemap` line must be collected in file order, including a line
/// with whitespace between the key and the `:` separator.
// NOTE(review): this body appears to show removed and added diff lines together
// without +/- markers (both variants of the `Sitemap` lines, of the expected
// collection and of the `get_sitemaps` call) — confirm against the committed file.
#[test]
fn test_robots_text_sitemaps() {
let parser = RobotFileParser::new("http://www.python.org/robots.txt");
let robots_txt_url = Url::parse("http://www.python.org/robots.txt").unwrap();
let doc = "User-agent: Yandex\n\
Sitemap: http://example.com/sitemap1.xml
Sitemap: http://example.com/sitemap2.xml
Sitemap: http://example.com/sitemap3.xml
Sitemap \t : http://example.com/sitemap1.xml\n
Sitemap: http://example.com/sitemap2.xml\n
Sitemap: http://example.com/sitemap3.xml\n
Disallow: /search/\n";
let lines: Vec<&str> = doc.split("\n").collect();
parser.parse(&lines);
let parser = parse_robots_txt(robots_txt_url.origin(), doc).get_result();
assert_eq!(
vec![
&[
Url::parse("http://example.com/sitemap1.xml").unwrap(),
Url::parse("http://example.com/sitemap2.xml").unwrap(),
Url::parse("http://example.com/sitemap3.xml").unwrap()
],
parser.get_sitemaps("Yandex")
parser.get_sitemaps()
);
}
#[test]
fn test_robots_text_request_rate() {
let parser = RobotFileParser::new("http://www.python.org/robots.txt");
let robots_txt_url = Url::parse("http://www.python.org/robots.txt").unwrap();
let doc =
"User-agent: Yandex\n\
Request-rate: 3/15\n\
Disallow: /search/\n";
let lines: Vec<&str> = doc.split("\n").collect();
parser.parse(&lines);
let parser = parse_robots_txt(robots_txt_url.origin(), doc).get_result();
let req_rate = parser.get_req_rate("Yandex").unwrap();
assert_eq!(3, req_rate.requests);
assert_eq!(15, req_rate.seconds);
@ -269,8 +301,27 @@ fn test_robots_text_request_rate() {
assert!(req_rate.is_none());
}
/// Constructing a parser for a robots.txt url with an IP address host and an explicit port.
// NOTE(review): this looks like the removed side of a diff hunk (it uses the old
// `RobotFileParser` API) — confirm against the committed file.
#[test]
fn test_robots_127_0_0_1() {
// Ensure it does not panic
RobotFileParser::new("http://127.0.0.1:4000/robots.txt");
}
/// `normalize_url` must strip the query parameters listed in `Clean-param` from a url
/// of the robots.txt origin and must leave a url of a foreign origin untouched.
fn test_robots_text_clean_params() {
    let robots_txt_body = concat!(
        "User-Agent: *\n",
        "Clean-param: mode\n",
        "Clean-param: from\n",
        "Clean-param: pid\n",
        "Clean-param: gid\n",
        "Clean-param: tm\n",
        "Clean-param: amp\n",
    );
    let origin = Url::parse("http://www.baidu.com/robots.txt").unwrap().origin();
    let robots_txt = parse_robots_txt(origin, robots_txt_body).get_result();

    let same_origin = "http://www.baidu.com/test?post_id=7777&mode=99&from=google&pid=99&gid=88&tm=777&amp=1";
    let mut site_url = Url::parse(same_origin).unwrap();
    assert_eq!(robots_txt.normalize_url(&mut site_url), true);
    assert_eq!(site_url.as_str(), "http://www.baidu.com/test?post_id=7777");

    let foreign_origin = "http://www.google.com/test?post_id=7777&mode=99&from=google&pid=99&gid=88&tm=777&amp=1";
    let mut site_url = Url::parse(foreign_origin).unwrap();
    assert_eq!(robots_txt.normalize_url(&mut site_url), false);
    assert_eq!(site_url.as_str(), foreign_origin);
}

View file

@ -0,0 +1,16 @@
use robotparser::http::RobotsTxtClient;
use robotparser::service::RobotsTxtService;
use reqwest::Client;
use url::Url;
use tokio::runtime::Runtime;
/// Fetches the robots.txt of `www.python.org` with the asynchronous `reqwest` client,
/// driving the future on a tokio runtime, and checks that the robots.txt url can be fetched.
#[test]
fn test_reqwest_async() {
    let mut runtime = Runtime::new().unwrap();
    let client = Client::new();
    let origin = Url::parse("http://www.python.org/robots.txt").unwrap().origin();
    let response = runtime.block_on(client.fetch_robots_txt(origin));
    let robots_txt = response.unwrap().get_result();
    let target = Url::parse("http://www.python.org/robots.txt").unwrap();
    assert!(robots_txt.can_fetch("*", &target));
}

View file

@ -0,0 +1,13 @@
use robotparser::http::RobotsTxtClient;
use robotparser::service::RobotsTxtService;
use reqwest::blocking::Client;
use url::Url;
/// Fetches the robots.txt of `www.python.org` with the blocking `reqwest` client
/// and checks that the robots.txt url can be fetched.
#[test]
fn test_reqwest_blocking() {
    let client = Client::new();
    let origin = Url::parse("http://www.python.org/robots.txt").unwrap().origin();
    let response = client.fetch_robots_txt(origin);
    let robots_txt = response.unwrap().get_result();
    let target = Url::parse("http://www.python.org/robots.txt").unwrap();
    assert!(robots_txt.can_fetch("*", &target));
}

178
tests/test_warnings.rs Normal file
View file

@ -0,0 +1,178 @@
use robotparser::parser::{parse_robots_txt, WarningReason};
use url::{Host, Origin};
use std::convert::From;
/// Payload-free counterpart of `WarningReason` used by these tests: it derives
/// `PartialEq`, so expected and actual warning kinds can be compared with `assert_eq!`.
#[derive(PartialEq, Eq, Debug, Clone)]
enum WarningReasonKind {
InvalidDirectiveFormat,
DirectiveKeyIsEmpty,
UnsupportedDirectiveKey,
UserAgentCannotBeEmpty,
DirectiveWithoutUserAgent,
ParseCrawlDelayError,
WrongRequestRateFormat,
ParseRequestRate,
ParseUrl,
WrongCleanParamFormat,
IgnoredCleanParams,
WrongPathFormat,
}
/// Parses `input` as the robots.txt of `http://python.org` and asserts that the parser
/// produced exactly the warning kinds of `expected_warnings`, in the same order.
fn validate_warnings(input: &str, expected_warnings: &[WarningReasonKind]) {
    let origin = Origin::Tuple("http".into(), Host::Domain("python.org".into()), 80);
    let parsed = parse_robots_txt(origin, input);
    let actual_warnings = parsed.get_warnings();
    assert_eq!(actual_warnings.len(), expected_warnings.len());
    for (index, expected_warning) in expected_warnings.iter().enumerate() {
        let actual_kind: WarningReasonKind = actual_warnings[index].get_reason().into();
        assert_eq!(expected_warning.clone(), actual_kind);
    }
}
/// A line without a `:` separator is reported as an invalid directive,
/// with or without surrounding whitespace.
#[test]
fn test_warning_invalid_directive_format() {
    for input in &["`", " \t ` \t "] {
        validate_warnings(input, &[WarningReasonKind::InvalidDirectiveFormat]);
    }
}
/// A separator without a key in front of it is reported as an empty directive key.
#[test]
fn test_warning_directive_key_is_empty() {
    validate_warnings(":", &[WarningReasonKind::DirectiveKeyIsEmpty]);
}
/// An unknown directive key is reported as unsupported, with or without surrounding whitespace.
#[test]
fn test_warning_supported_directive_key() {
    for input in &["X-Directive:", "\t X-Directive\t :\t "] {
        validate_warnings(input, &[WarningReasonKind::UnsupportedDirectiveKey]);
    }
}
/// A `User-Agent` directive with an empty value is reported; a non-empty value is accepted.
#[test]
fn test_warning_user_agent_cannot_be_empty() {
    validate_warnings("User-Agent:", &[WarningReasonKind::UserAgentCannotBeEmpty]);
    validate_warnings("\t User-Agent\t :\t ", &[WarningReasonKind::UserAgentCannotBeEmpty]);
    validate_warnings("\t User-Agent\t :\t *", &[]);
}
/// A group directive in front of any `User-Agent` line is reported;
/// the same kind of directive after a `User-Agent` line is accepted.
#[test]
fn test_warning_directive_without_user_agent() {
    validate_warnings("Crawl-Delay: 5s", &[WarningReasonKind::DirectiveWithoutUserAgent]);
    validate_warnings("User-Agent: *\nCrawl-Delay: 5", &[]);
}
/// `Crawl-Delay` values that are not numbers are reported; a plain number is accepted.
#[test]
fn test_warning_parse_crawl_delay_error() {
    let malformed = [
        "User-Agent: *\nCrawl-Delay: ",
        "User-Agent: *\nCrawl-Delay: -",
        "User-Agent: *\nCrawl-Delay: 5h9",
    ];
    for input in &malformed {
        validate_warnings(input, &[WarningReasonKind::ParseCrawlDelayError]);
    }
    validate_warnings("User-Agent: *\nCrawl-Delay: 5", &[]);
}
/// `Request-rate: 1/5` is expected to produce no warnings, while `1//5` and `1`
/// are each expected to produce a single `WrongRequestRateFormat` warning.
#[test]
fn test_warning_request_rate_format() {
    let accepted = "User-Agent: *\nRequest-rate: 1/5";
    validate_warnings(accepted, &[]);

    let rejected = [
        "User-Agent: *\nRequest-rate: 1//5",
        "User-Agent: *\nRequest-rate: 1",
    ];
    for &input in rejected.iter() {
        validate_warnings(input, &[WarningReasonKind::WrongRequestRateFormat]);
    }
}
/// Request-rate values `a/b`, `a/5`, `5/b` and `1.0/5.0` are each expected to
/// produce a single `ParseRequestRate` warning.
#[test]
fn test_warning_request_rate() {
    let expected = [WarningReasonKind::ParseRequestRate];
    let inputs = [
        "User-Agent: *\nRequest-rate: a/b",
        "User-Agent: *\nRequest-rate: a/5",
        "User-Agent: *\nRequest-rate: 5/b",
        "User-Agent: *\nRequest-rate: 1.0/5.0",
    ];
    for &input in inputs.iter() {
        validate_warnings(input, &expected);
    }
}
/// A `Sitemap` directive with the URL `http://python.org/sitemap.xml` is expected
/// to produce no warnings; one with the scheme `http$$$` is expected to produce a
/// single `ParseUrl` warning.
#[test]
fn test_warning_parsing_url() {
    let valid_sitemap = "User-Agent: *\nSitemap: http://python.org/sitemap.xml";
    let broken_sitemap = "User-Agent: *\nSitemap: http$$$://python.org/sitemap.xml";

    validate_warnings(valid_sitemap, &[]);
    validate_warnings(broken_sitemap, &[WarningReasonKind::ParseUrl]);
}
/// Checks the warnings expected for several `Clean-param` values: `ref ` and `&`
/// yield none, an empty value yields `WrongCleanParamFormat`, and `?` and `abc$`
/// each yield `IgnoredCleanParams`.
#[test]
fn test_wrong_clean_param() {
    // Each case pairs a robots.txt body with the warning kinds it should yield.
    let cases: [(&str, &[WarningReasonKind]); 5] = [
        ("User-Agent: *\nClean-param: ref ", &[]),
        (
            "User-Agent: *\nClean-param: ",
            &[WarningReasonKind::WrongCleanParamFormat],
        ),
        ("User-Agent: *\nClean-param: &", &[]),
        (
            "User-Agent: *\nClean-param: ?",
            &[WarningReasonKind::IgnoredCleanParams],
        ),
        (
            "User-Agent: *\nClean-param: abc$",
            &[WarningReasonKind::IgnoredCleanParams],
        ),
    ];
    for &(input, expected) in cases.iter() {
        validate_warnings(input, expected);
    }
}
/// A single backslash as the path of an `Allow` or `Disallow` directive is
/// expected to produce a single `WrongPathFormat` warning.
#[test]
fn test_warning_wrong_path_format() {
    let expected = [WarningReasonKind::WrongPathFormat];
    let inputs = ["User-Agent: *\nAllow: \\", "User-Agent: *\nDisallow: \\"];
    for &input in inputs.iter() {
        validate_warnings(input, &expected);
    }
}
/// Converts a borrowed `WarningReason` into the `WarningReasonKind` variant of the
/// same name, ignoring any data the reason variant carries.
impl From<&WarningReason> for WarningReasonKind {
    /// Returns the kind whose name matches the variant of `reason`.
    ///
    /// The match is deliberately exhaustive (no wildcard arm) so that adding a
    /// variant to `WarningReason` forces this mapping to be updated.
    fn from(reason: &WarningReason) -> Self {
        match reason {
            WarningReason::InvalidDirectiveFormat => Self::InvalidDirectiveFormat,
            WarningReason::DirectiveKeyIsEmpty => Self::DirectiveKeyIsEmpty,
            WarningReason::UnsupportedDirectiveKey { .. } => Self::UnsupportedDirectiveKey,
            WarningReason::UserAgentCannotBeEmpty => Self::UserAgentCannotBeEmpty,
            WarningReason::DirectiveWithoutUserAgent => Self::DirectiveWithoutUserAgent,
            WarningReason::ParseCrawlDelayError { .. } => Self::ParseCrawlDelayError,
            WarningReason::WrongRequestRateFormat => Self::WrongRequestRateFormat,
            WarningReason::ParseRequestRate { .. } => Self::ParseRequestRate,
            WarningReason::ParseUrl { .. } => Self::ParseUrl,
            WarningReason::WrongCleanParamFormat => Self::WrongCleanParamFormat,
            WarningReason::IgnoredCleanParams { .. } => Self::IgnoredCleanParams,
            WarningReason::WrongPathFormat => Self::WrongPathFormat,
        }
    }
}