mirror of
https://github.com/danbulant/robotparser-rs
synced 2026-07-04 10:40:36 +00:00
Merge pull request #6 from messense/request-rate
Add Request-rate support
This commit is contained in:
commit
a732ad5085
4 changed files with 94 additions and 28 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -1,2 +1,3 @@
|
||||||
target
|
target
|
||||||
Cargo.lock
|
Cargo.lock
|
||||||
|
.vscode/
|
||||||
|
|
|
||||||
3
rustfmt.toml
Normal file
3
rustfmt.toml
Normal file
|
|
@ -0,0 +1,3 @@
|
||||||
|
max_width = 120
|
||||||
|
ideal_width = 100
|
||||||
|
write_mode = "Overwrite"
|
||||||
99
src/lib.rs
99
src/lib.rs
|
|
@ -1,4 +1,3 @@
|
||||||
//!
|
|
||||||
//! robots.txt parser for Rust
|
//! robots.txt parser for Rust
|
||||||
//!
|
//!
|
||||||
//! The robots.txt Exclusion Protocol is implemented as specified in
|
//! The robots.txt Exclusion Protocol is implemented as specified in
|
||||||
|
|
@ -39,12 +38,13 @@ extern crate hyper;
|
||||||
use std::io::Read;
|
use std::io::Read;
|
||||||
use std::cell::{Cell, RefCell};
|
use std::cell::{Cell, RefCell};
|
||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use url::Url;
|
|
||||||
use hyper::{Client};
|
|
||||||
use hyper::header::{UserAgent};
|
|
||||||
use hyper::status::StatusCode;
|
|
||||||
use std::time::{Duration, SystemTime, UNIX_EPOCH};
|
use std::time::{Duration, SystemTime, UNIX_EPOCH};
|
||||||
|
|
||||||
|
use url::Url;
|
||||||
|
use hyper::Client;
|
||||||
|
use hyper::header::UserAgent;
|
||||||
|
use hyper::status::StatusCode;
|
||||||
|
|
||||||
const USER_AGENT: &'static str = "robotparser-rs (https://crates.io/crates/robotparser)";
|
const USER_AGENT: &'static str = "robotparser-rs (https://crates.io/crates/robotparser)";
|
||||||
|
|
||||||
/// A rule line is a single "Allow:" (allowance==True) or "Disallow:"
|
/// A rule line is a single "Allow:" (allowance==True) or "Disallow:"
|
||||||
|
|
@ -55,6 +55,12 @@ struct RuleLine<'a> {
|
||||||
allowance: bool,
|
allowance: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Eq, PartialEq, Clone)]
|
||||||
|
pub struct RequestRate {
|
||||||
|
pub requests: usize,
|
||||||
|
pub seconds: usize,
|
||||||
|
}
|
||||||
|
|
||||||
/// An entry has one or more user-agents and zero or more rulelines
|
/// An entry has one or more user-agents and zero or more rulelines
|
||||||
#[derive(Debug, Eq, PartialEq, Clone)]
|
#[derive(Debug, Eq, PartialEq, Clone)]
|
||||||
struct Entry<'a> {
|
struct Entry<'a> {
|
||||||
|
|
@ -62,6 +68,7 @@ struct Entry<'a> {
|
||||||
rulelines: RefCell<Vec<RuleLine<'a>>>,
|
rulelines: RefCell<Vec<RuleLine<'a>>>,
|
||||||
crawl_delay: Option<Duration>,
|
crawl_delay: Option<Duration>,
|
||||||
sitemaps: Vec<Url>,
|
sitemaps: Vec<Url>,
|
||||||
|
req_rate: Option<RequestRate>,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// robots.txt file parser
|
/// robots.txt file parser
|
||||||
|
|
@ -107,6 +114,7 @@ impl<'a> Entry<'a> {
|
||||||
rulelines: RefCell::new(vec![]),
|
rulelines: RefCell::new(vec![]),
|
||||||
crawl_delay: None,
|
crawl_delay: None,
|
||||||
sitemaps: Vec::new(),
|
sitemaps: Vec::new(),
|
||||||
|
req_rate: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -125,7 +133,7 @@ impl<'a> Entry<'a> {
|
||||||
false
|
false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/// Preconditions:
|
/// Preconditions:
|
||||||
/// - our agent applies to this entry
|
/// - our agent applies to this entry
|
||||||
/// - filename is URL decoded
|
/// - filename is URL decoded
|
||||||
|
|
@ -133,7 +141,7 @@ impl<'a> Entry<'a> {
|
||||||
let rulelines = self.rulelines.borrow();
|
let rulelines = self.rulelines.borrow();
|
||||||
for line in &*rulelines {
|
for line in &*rulelines {
|
||||||
if line.applies_to(filename) {
|
if line.applies_to(filename) {
|
||||||
return line.allowance
|
return line.allowance;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
true
|
true
|
||||||
|
|
@ -168,7 +176,7 @@ impl<'a> Entry<'a> {
|
||||||
self.crawl_delay
|
self.crawl_delay
|
||||||
}
|
}
|
||||||
|
|
||||||
fn add_sitemap(&mut self,url:&str) {
|
fn add_sitemap(&mut self, url: &str) {
|
||||||
if let Ok(url) = Url::parse(url) {
|
if let Ok(url) = Url::parse(url) {
|
||||||
self.sitemaps.push(url);
|
self.sitemaps.push(url);
|
||||||
}
|
}
|
||||||
|
|
@ -177,6 +185,14 @@ impl<'a> Entry<'a> {
|
||||||
fn get_sitemaps(&self) -> Vec<Url> {
|
fn get_sitemaps(&self) -> Vec<Url> {
|
||||||
self.sitemaps.clone()
|
self.sitemaps.clone()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn set_req_rate(&mut self, req_rate: RequestRate) {
|
||||||
|
self.req_rate = Some(req_rate);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_req_rate(&self) -> Option<RequestRate> {
|
||||||
|
self.req_rate.clone()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -240,17 +256,17 @@ impl<'a> RobotFileParser<'a> {
|
||||||
match res.status {
|
match res.status {
|
||||||
StatusCode::Unauthorized | StatusCode::Forbidden => {
|
StatusCode::Unauthorized | StatusCode::Forbidden => {
|
||||||
self.disallow_all.set(true);
|
self.disallow_all.set(true);
|
||||||
},
|
}
|
||||||
status if status >= StatusCode::BadRequest && status < StatusCode::InternalServerError => {
|
status if status >= StatusCode::BadRequest && status < StatusCode::InternalServerError => {
|
||||||
self.allow_all.set(true);
|
self.allow_all.set(true);
|
||||||
},
|
}
|
||||||
StatusCode::Ok => {
|
StatusCode::Ok => {
|
||||||
let mut buf = String::new();
|
let mut buf = String::new();
|
||||||
res.read_to_string(&mut buf).unwrap();
|
res.read_to_string(&mut buf).unwrap();
|
||||||
let lines: Vec<&str> = buf.split('\n').collect();
|
let lines: Vec<&str> = buf.split('\n').collect();
|
||||||
self.parse(&lines);
|
self.parse(&lines);
|
||||||
},
|
}
|
||||||
_ => {},
|
_ => {}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -292,13 +308,13 @@ impl<'a> RobotFileParser<'a> {
|
||||||
1 => {
|
1 => {
|
||||||
entry = Entry::new();
|
entry = Entry::new();
|
||||||
state = 0;
|
state = 0;
|
||||||
},
|
}
|
||||||
2 => {
|
2 => {
|
||||||
self._add_entry(entry);
|
self._add_entry(entry);
|
||||||
entry = Entry::new();
|
entry = Entry::new();
|
||||||
state = 0;
|
state = 0;
|
||||||
},
|
}
|
||||||
_ => {},
|
_ => {}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// remove optional comment and strip line
|
// remove optional comment and strip line
|
||||||
|
|
@ -312,7 +328,8 @@ impl<'a> RobotFileParser<'a> {
|
||||||
let parts: Vec<&str> = ln.splitn(2, ':').collect();
|
let parts: Vec<&str> = ln.splitn(2, ':').collect();
|
||||||
if parts.len() == 2 {
|
if parts.len() == 2 {
|
||||||
let part0 = parts[0].trim().to_lowercase();
|
let part0 = parts[0].trim().to_lowercase();
|
||||||
let part1 = String::from_utf8(percent_decode(parts[1].trim().as_bytes()).collect()).unwrap_or("".to_owned());
|
let part1 = String::from_utf8(percent_decode(parts[1].trim().as_bytes()).collect())
|
||||||
|
.unwrap_or("".to_owned());
|
||||||
match part0 {
|
match part0 {
|
||||||
ref x if x == "user-agent" => {
|
ref x if x == "user-agent" => {
|
||||||
if state == 2 {
|
if state == 2 {
|
||||||
|
|
@ -321,37 +338,50 @@ impl<'a> RobotFileParser<'a> {
|
||||||
}
|
}
|
||||||
entry.push_useragent(&part1);
|
entry.push_useragent(&part1);
|
||||||
state = 1;
|
state = 1;
|
||||||
},
|
}
|
||||||
ref x if x == "disallow" => {
|
ref x if x == "disallow" => {
|
||||||
if state != 0 {
|
if state != 0 {
|
||||||
entry.push_ruleline(RuleLine::new(part1, false));
|
entry.push_ruleline(RuleLine::new(part1, false));
|
||||||
state = 2;
|
state = 2;
|
||||||
}
|
}
|
||||||
},
|
}
|
||||||
ref x if x == "allow" => {
|
ref x if x == "allow" => {
|
||||||
if state != 0 {
|
if state != 0 {
|
||||||
entry.push_ruleline(RuleLine::new(part1, true));
|
entry.push_ruleline(RuleLine::new(part1, true));
|
||||||
state = 2;
|
state = 2;
|
||||||
}
|
}
|
||||||
},
|
}
|
||||||
ref x if x == "crawl-delay" => {
|
ref x if x == "crawl-delay" => {
|
||||||
if state != 0 {
|
if state != 0 {
|
||||||
if let Ok(delay) = part1.parse::<f64>() {
|
if let Ok(delay) = part1.parse::<f64>() {
|
||||||
let delay_seconds = delay.trunc();
|
let delay_seconds = delay.trunc();
|
||||||
let delay_nanoseconds = delay.fract() * 10f64.powi(9);
|
let delay_nanoseconds = delay.fract() * 10f64.powi(9);
|
||||||
let delay = Duration::new(delay_seconds as u64,delay_nanoseconds as u32);
|
let delay = Duration::new(delay_seconds as u64, delay_nanoseconds as u32);
|
||||||
entry.set_crawl_delay(delay);
|
entry.set_crawl_delay(delay);
|
||||||
}
|
}
|
||||||
state = 2;
|
state = 2;
|
||||||
}
|
}
|
||||||
},
|
}
|
||||||
ref x if x == "sitemap" => {
|
ref x if x == "sitemap" => {
|
||||||
if state != 0 {
|
if state != 0 {
|
||||||
entry.add_sitemap(&part1);
|
entry.add_sitemap(&part1);
|
||||||
state = 2;
|
state = 2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
_ => {},
|
ref x if x == "request-rate" => {
|
||||||
|
if state != 0 {
|
||||||
|
let numbers: Vec<Result<usize, _>> = part1.split('/').map(|x| x.parse::<usize>()).collect();
|
||||||
|
if numbers.len() == 2 && numbers[0].is_ok() && numbers[1].is_ok() {
|
||||||
|
let req_rate = RequestRate {
|
||||||
|
requests: numbers[0].clone().unwrap(),
|
||||||
|
seconds: numbers[1].clone().unwrap(),
|
||||||
|
};
|
||||||
|
entry.set_req_rate(req_rate);
|
||||||
|
}
|
||||||
|
state = 2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -403,8 +433,8 @@ impl<'a> RobotFileParser<'a> {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the crawl delay for this user agent as a `Duration`, or None if no crawl delay is defined.
|
/// Returns the crawl delay for this user agent as a `Duration`, or None if no crawl delay is defined.
|
||||||
pub fn get_crawl_delay<T: AsRef<str>>(&self,useragent: T) -> Option<Duration> {
|
pub fn get_crawl_delay<T: AsRef<str>>(&self, useragent: T) -> Option<Duration> {
|
||||||
let useragent = useragent.as_ref();
|
let useragent = useragent.as_ref();
|
||||||
if self.last_checked.get() == 0 {
|
if self.last_checked.get() == 0 {
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
|
|
@ -418,8 +448,8 @@ impl<'a> RobotFileParser<'a> {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the sitemaps for this user agent as a `Vec<Url>`.
|
/// Returns the sitemaps for this user agent as a `Vec<Url>`.
|
||||||
pub fn get_sitemaps<T: AsRef<str>>(&self,useragent: T) -> Vec<Url> {
|
pub fn get_sitemaps<T: AsRef<str>>(&self, useragent: T) -> Vec<Url> {
|
||||||
let useragent = useragent.as_ref();
|
let useragent = useragent.as_ref();
|
||||||
if self.last_checked.get() == 0 {
|
if self.last_checked.get() == 0 {
|
||||||
return Vec::new();
|
return Vec::new();
|
||||||
}
|
}
|
||||||
|
|
@ -431,4 +461,19 @@ impl<'a> RobotFileParser<'a> {
|
||||||
}
|
}
|
||||||
vec![]
|
vec![]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the request rate for this user agent as a `RequestRate`, or None if no request rate is defined.
|
||||||
|
pub fn get_req_rate<T: AsRef<str>>(&self, useragent: T) -> Option<RequestRate> {
|
||||||
|
let useragent = useragent.as_ref();
|
||||||
|
if self.last_checked.get() == 0 {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
let entries = self.entries.borrow();
|
||||||
|
for entry in &*entries {
|
||||||
|
if entry.applies_to(useragent) {
|
||||||
|
return entry.get_req_rate();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
19
tests/lib.rs
19
tests/lib.rs
|
|
@ -246,7 +246,24 @@ fn test_robots_text_sitemaps() {
|
||||||
Url::parse("http://example.com/sitemap1.xml").unwrap(),
|
Url::parse("http://example.com/sitemap1.xml").unwrap(),
|
||||||
Url::parse("http://example.com/sitemap2.xml").unwrap(),
|
Url::parse("http://example.com/sitemap2.xml").unwrap(),
|
||||||
Url::parse("http://example.com/sitemap3.xml").unwrap()
|
Url::parse("http://example.com/sitemap3.xml").unwrap()
|
||||||
],
|
],
|
||||||
parser.get_sitemaps("Yandex")
|
parser.get_sitemaps("Yandex")
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_robots_text_request_rate() {
|
||||||
|
let parser = RobotFileParser::new("http://www.python.org/robots.txt");
|
||||||
|
let doc =
|
||||||
|
"User-agent: Yandex\n\
|
||||||
|
Request-rate: 3/15\n\
|
||||||
|
Disallow: /search/\n";
|
||||||
|
let lines: Vec<&str> = doc.split("\n").collect();
|
||||||
|
parser.parse(&lines);
|
||||||
|
let req_rate = parser.get_req_rate("Yandex").unwrap();
|
||||||
|
assert_eq!(3, req_rate.requests);
|
||||||
|
assert_eq!(15, req_rate.seconds);
|
||||||
|
|
||||||
|
let req_rate = parser.get_req_rate("Google");
|
||||||
|
assert!(req_rate.is_none());
|
||||||
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue