diff --git a/.gitignore b/.gitignore index a9d37c5..865d4a7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ target Cargo.lock +.vscode/ diff --git a/rustfmt.toml b/rustfmt.toml new file mode 100644 index 0000000..04c6a82 --- /dev/null +++ b/rustfmt.toml @@ -0,0 +1,3 @@ +max_width = 120 +ideal_width = 100 +write_mode = "Overwrite" diff --git a/src/lib.rs b/src/lib.rs index cfcd4c4..9dd8174 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,3 @@ -//! //! robots.txt parser for Rust //! //! The robots.txt Exclusion Protocol is implemented as specified in @@ -39,12 +38,13 @@ extern crate hyper; use std::io::Read; use std::cell::{Cell, RefCell}; use std::borrow::Cow; -use url::Url; -use hyper::{Client}; -use hyper::header::{UserAgent}; -use hyper::status::StatusCode; use std::time::{Duration, SystemTime, UNIX_EPOCH}; +use url::Url; +use hyper::Client; +use hyper::header::UserAgent; +use hyper::status::StatusCode; + const USER_AGENT: &'static str = "robotparser-rs (https://crates.io/crates/robotparser)"; /// A rule line is a single "Allow:" (allowance==True) or "Disallow:" @@ -55,6 +55,12 @@ struct RuleLine<'a> { allowance: bool, } +#[derive(Debug, Eq, PartialEq, Clone)] +pub struct RequestRate { + pub requests: usize, + pub seconds: usize, +} + /// An entry has one or more user-agents and zero or more rulelines #[derive(Debug, Eq, PartialEq, Clone)] struct Entry<'a> { @@ -62,6 +68,7 @@ struct Entry<'a> { rulelines: RefCell>>, crawl_delay: Option, sitemaps: Vec, + req_rate: Option, } /// robots.txt file parser @@ -107,6 +114,7 @@ impl<'a> Entry<'a> { rulelines: RefCell::new(vec![]), crawl_delay: None, sitemaps: Vec::new(), + req_rate: None, } } @@ -125,7 +133,7 @@ impl<'a> Entry<'a> { false } - + /// Preconditions: /// - our agent applies to this entry /// - filename is URL decoded @@ -133,7 +141,7 @@ impl<'a> Entry<'a> { let rulelines = self.rulelines.borrow(); for line in &*rulelines { if line.applies_to(filename) { - return line.allowance + return 
line.allowance; } } true @@ -168,7 +176,7 @@ impl<'a> Entry<'a> { self.crawl_delay } - fn add_sitemap(&mut self,url:&str) { + fn add_sitemap(&mut self, url: &str) { if let Ok(url) = Url::parse(url) { self.sitemaps.push(url); } @@ -177,6 +185,14 @@ impl<'a> Entry<'a> { fn get_sitemaps(&self) -> Vec { self.sitemaps.clone() } + + fn set_req_rate(&mut self, req_rate: RequestRate) { + self.req_rate = Some(req_rate); + } + + fn get_req_rate(&self) -> Option { + self.req_rate.clone() + } } @@ -240,17 +256,17 @@ impl<'a> RobotFileParser<'a> { match res.status { StatusCode::Unauthorized | StatusCode::Forbidden => { self.disallow_all.set(true); - }, + } status if status >= StatusCode::BadRequest && status < StatusCode::InternalServerError => { self.allow_all.set(true); - }, + } StatusCode::Ok => { let mut buf = String::new(); res.read_to_string(&mut buf).unwrap(); let lines: Vec<&str> = buf.split('\n').collect(); self.parse(&lines); - }, - _ => {}, + } + _ => {} } } @@ -292,13 +308,13 @@ impl<'a> RobotFileParser<'a> { 1 => { entry = Entry::new(); state = 0; - }, + } 2 => { self._add_entry(entry); entry = Entry::new(); state = 0; - }, - _ => {}, + } + _ => {} } } // remove optional comment and strip line @@ -312,7 +328,8 @@ impl<'a> RobotFileParser<'a> { let parts: Vec<&str> = ln.splitn(2, ':').collect(); if parts.len() == 2 { let part0 = parts[0].trim().to_lowercase(); - let part1 = String::from_utf8(percent_decode(parts[1].trim().as_bytes()).collect()).unwrap_or("".to_owned()); + let part1 = String::from_utf8(percent_decode(parts[1].trim().as_bytes()).collect()) + .unwrap_or("".to_owned()); match part0 { ref x if x == "user-agent" => { if state == 2 { @@ -321,37 +338,50 @@ impl<'a> RobotFileParser<'a> { } entry.push_useragent(&part1); state = 1; - }, + } ref x if x == "disallow" => { if state != 0 { entry.push_ruleline(RuleLine::new(part1, false)); state = 2; } - }, + } ref x if x == "allow" => { if state != 0 { entry.push_ruleline(RuleLine::new(part1, true)); state = 2; } 
- }, + } ref x if x == "crawl-delay" => { if state != 0 { if let Ok(delay) = part1.parse::() { let delay_seconds = delay.trunc(); let delay_nanoseconds = delay.fract() * 10f64.powi(9); - let delay = Duration::new(delay_seconds as u64,delay_nanoseconds as u32); + let delay = Duration::new(delay_seconds as u64, delay_nanoseconds as u32); entry.set_crawl_delay(delay); } state = 2; - } - }, + } + } ref x if x == "sitemap" => { if state != 0 { entry.add_sitemap(&part1); state = 2; } } - _ => {}, + ref x if x == "request-rate" => { + if state != 0 { + let numbers: Vec> = part1.split('/').map(|x| x.parse::()).collect(); + if numbers.len() == 2 && numbers[0].is_ok() && numbers[1].is_ok() { + let req_rate = RequestRate { + requests: numbers[0].clone().unwrap(), + seconds: numbers[1].clone().unwrap(), + }; + entry.set_req_rate(req_rate); + } + state = 2; + } + } + _ => {} } } } @@ -403,8 +433,8 @@ impl<'a> RobotFileParser<'a> { } /// Returns the crawl delay for this user agent as a `Duration`, or None if no crawl delay is defined. - pub fn get_crawl_delay>(&self,useragent: T) -> Option { - let useragent = useragent.as_ref(); + pub fn get_crawl_delay>(&self, useragent: T) -> Option { + let useragent = useragent.as_ref(); if self.last_checked.get() == 0 { return None; } @@ -418,8 +448,8 @@ impl<'a> RobotFileParser<'a> { } /// Returns the sitemaps for this user agent as a `Vec`. 
- pub fn get_sitemaps>(&self,useragent: T) -> Vec { - let useragent = useragent.as_ref(); + pub fn get_sitemaps>(&self, useragent: T) -> Vec { + let useragent = useragent.as_ref(); if self.last_checked.get() == 0 { return Vec::new(); } @@ -431,4 +461,19 @@ impl<'a> RobotFileParser<'a> { } vec![] } + + /// Returns the request rate for this user agent as a `RequestRate`, or None if no request rate is defined + pub fn get_req_rate>(&self, useragent: T) -> Option { + let useragent = useragent.as_ref(); + if self.last_checked.get() == 0 { + return None; + } + let entries = self.entries.borrow(); + for entry in &*entries { + if entry.applies_to(useragent) { + return entry.get_req_rate(); + } + } + None + } } diff --git a/tests/lib.rs b/tests/lib.rs index abc9f24..ae9fff8 100644 --- a/tests/lib.rs +++ b/tests/lib.rs @@ -246,7 +246,24 @@ fn test_robots_text_sitemaps() { Url::parse("http://example.com/sitemap1.xml").unwrap(), Url::parse("http://example.com/sitemap2.xml").unwrap(), Url::parse("http://example.com/sitemap3.xml").unwrap() - ], + ], parser.get_sitemaps("Yandex") ); } + +#[test] +fn test_robots_text_request_rate() { + let parser = RobotFileParser::new("http://www.python.org/robots.txt"); + let doc = + "User-agent: Yandex\n\ + Request-rate: 3/15\n\ + Disallow: /search/\n"; + let lines: Vec<&str> = doc.split("\n").collect(); + parser.parse(&lines); + let req_rate = parser.get_req_rate("Yandex").unwrap(); + assert_eq!(3, req_rate.requests); + assert_eq!(15, req_rate.seconds); + + let req_rate = parser.get_req_rate("Google"); + assert!(req_rate.is_none()); +}