Add Request-rate support. Closes #4

This commit is contained in:
messense 2016-08-21 11:35:24 +08:00
parent 384ae57caa
commit 17a36159a7
2 changed files with 64 additions and 2 deletions

View file

@ -38,11 +38,12 @@ extern crate hyper;
use std::io::Read; use std::io::Read;
use std::cell::{Cell, RefCell}; use std::cell::{Cell, RefCell};
use std::borrow::Cow; use std::borrow::Cow;
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use url::Url; use url::Url;
use hyper::Client; use hyper::Client;
use hyper::header::UserAgent; use hyper::header::UserAgent;
use hyper::status::StatusCode; use hyper::status::StatusCode;
use std::time::{Duration, SystemTime, UNIX_EPOCH};
const USER_AGENT: &'static str = "robotparser-rs (https://crates.io/crates/robotparser)"; const USER_AGENT: &'static str = "robotparser-rs (https://crates.io/crates/robotparser)";
@ -54,6 +55,12 @@ struct RuleLine<'a> {
allowance: bool, allowance: bool,
} }
/// Value of a `Request-rate` directive attached to a robots.txt entry.
///
/// The directive is written as two numbers separated by `/` (for example
/// `3/15`); the first number is kept in `requests` and the second in `seconds`.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct RequestRate {
    /// The number given before the `/` in the directive.
    pub requests: usize,
    /// The number given after the `/` in the directive.
    pub seconds: usize,
}
/// An entry has one or more user-agents and zero or more rulelines /// An entry has one or more user-agents and zero or more rulelines
#[derive(Debug, Eq, PartialEq, Clone)] #[derive(Debug, Eq, PartialEq, Clone)]
struct Entry<'a> { struct Entry<'a> {
@ -61,6 +68,7 @@ struct Entry<'a> {
rulelines: RefCell<Vec<RuleLine<'a>>>, rulelines: RefCell<Vec<RuleLine<'a>>>,
crawl_delay: Option<Duration>, crawl_delay: Option<Duration>,
sitemaps: Vec<Url>, sitemaps: Vec<Url>,
req_rate: Option<RequestRate>,
} }
/// robots.txt file parser /// robots.txt file parser
@ -106,6 +114,7 @@ impl<'a> Entry<'a> {
rulelines: RefCell::new(vec![]), rulelines: RefCell::new(vec![]),
crawl_delay: None, crawl_delay: None,
sitemaps: Vec::new(), sitemaps: Vec::new(),
req_rate: None,
} }
} }
@ -176,6 +185,14 @@ impl<'a> Entry<'a> {
fn get_sitemaps(&self) -> Vec<Url> { fn get_sitemaps(&self) -> Vec<Url> {
self.sitemaps.clone() self.sitemaps.clone()
} }
/// Records `req_rate` as this entry's request rate, overwriting any value
/// that was stored before.
fn set_req_rate(&mut self, req_rate: RequestRate) {
    let rate = Some(req_rate);
    self.req_rate = rate;
}
/// Returns a clone of this entry's request rate, or `None` when the entry
/// has none stored.
fn get_req_rate(&self) -> Option<RequestRate> {
    self.req_rate.as_ref().cloned()
}
} }
@ -351,6 +368,19 @@ impl<'a> RobotFileParser<'a> {
state = 2; state = 2;
} }
} }
ref x if x == "request-rate" => {
if state != 0 {
let numbers: Vec<Result<usize, _>> = part1.split('/').map(|x| x.parse::<usize>()).collect();
if numbers.len() == 2 && numbers[0].is_ok() && numbers[1].is_ok() {
let req_rate = RequestRate {
requests: numbers[0].clone().unwrap(),
seconds: numbers[1].clone().unwrap(),
};
entry.set_req_rate(req_rate);
}
state = 2;
}
}
_ => {} _ => {}
} }
} }
@ -431,4 +461,19 @@ impl<'a> RobotFileParser<'a> {
} }
vec![] vec![]
} }
/// Returns the request rate for this user agent as a `RequestRate`, or `None` if no request
/// rate is defined.
///
/// `None` is also returned while `last_checked` is still zero, and when no entry applies to
/// `useragent`. Only the first entry that applies to `useragent` is consulted.
pub fn get_req_rate<T: AsRef<str>>(&self, useragent: T) -> Option<RequestRate> {
    let agent = useragent.as_ref();
    if self.last_checked.get() == 0 {
        return None;
    }
    let entries = self.entries.borrow();
    // Bound to a local so the iterator's borrow of `entries` is finished
    // before the `RefCell` guard goes out of scope.
    let rate = entries
        .iter()
        .find(|candidate| candidate.applies_to(agent))
        .and_then(|candidate| candidate.get_req_rate());
    rate
}
} }

View file

@ -250,3 +250,20 @@ fn test_robots_text_sitemaps() {
parser.get_sitemaps("Yandex") parser.get_sitemaps("Yandex")
); );
} }
/// A `Request-rate: 3/15` line under `User-agent: Yandex` must be reported for
/// "Yandex" as 3 requests / 15 seconds, and must not be reported for "Google".
#[test]
fn test_robots_text_request_rate() {
    let doc =
        "User-agent: Yandex\n\
        Request-rate: 3/15\n\
        Disallow: /search/\n";
    let parser = RobotFileParser::new("http://www.python.org/robots.txt");
    let lines = doc.split("\n").collect::<Vec<&str>>();
    parser.parse(&lines);

    // The agent named in the document gets the parsed rate.
    let yandex_rate = parser.get_req_rate("Yandex").unwrap();
    assert_eq!(3, yandex_rate.requests);
    assert_eq!(15, yandex_rate.seconds);

    // An agent the document never mentions has no rate.
    assert!(parser.get_req_rate("Google").is_none());
}