mirror of
https://github.com/danbulant/robotparser-rs
synced 2026-06-18 14:11:30 +00:00
Added crawl delay feature.
This commit is contained in:
parent
04c25dda65
commit
49a2b9c93d
2 changed files with 49 additions and 0 deletions
38
src/lib.rs
38
src/lib.rs
|
|
@ -27,6 +27,7 @@ struct RuleLine {
|
|||
struct Entry {
|
||||
useragents: RefCell<Vec<String>>,
|
||||
rulelines: RefCell<Vec<RuleLine>>,
|
||||
crawl_delay: Option<usize>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Eq, PartialEq, Clone)]
|
||||
|
|
@ -66,6 +67,7 @@ impl Entry {
|
|||
Entry {
|
||||
useragents: RefCell::new(vec![]),
|
||||
rulelines: RefCell::new(vec![]),
|
||||
crawl_delay: None,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -118,6 +120,14 @@ impl Entry {
|
|||
let rulelines = self.rulelines.borrow();
|
||||
useragents.is_empty() && rulelines.is_empty()
|
||||
}
|
||||
|
||||
fn set_crawl_delay(&mut self,delay: usize) {
|
||||
self.crawl_delay = Some(delay);
|
||||
}
|
||||
|
||||
fn get_crawl_delay(&self) -> Option<usize> {
|
||||
return self.crawl_delay.clone();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -270,6 +280,19 @@ impl RobotFileParser {
|
|||
state = 2;
|
||||
}
|
||||
},
|
||||
ref x if x == "crawl-delay" => {
    if state != 0 {
        // The value is given in (possibly fractional) seconds. `f64` parsing also
        // accepts negative numbers, "inf" and "NaN"; none of those is a meaningful
        // delay, so they are ignored exactly like unparsable input instead of being
        // turned into 0 or `usize::MAX` by the saturating cast below.
        if let Ok(seconds) = part1.parse::<f64>() {
            if seconds.is_finite() && seconds >= 0.0 {
                // Stored in milliseconds, rounded to the nearest integer. The
                // float-to-int `as` cast saturates for very large values.
                entry.set_crawl_delay((seconds * 1000.0).round() as usize);
            }
        }
        // The state advances whether or not the value was usable.
        state = 2;
    }
}
|
||||
_ => {},
|
||||
}
|
||||
}
|
||||
|
|
@ -320,4 +343,19 @@ impl RobotFileParser {
|
|||
// agent not found ==> access granted
|
||||
true
|
||||
}
|
||||
|
||||
/// Returns the crawl delay for this user agent as a `usize` in milliseconds, or None if no crawl delay is defined.
|
||||
pub fn get_crawl_delay<T: AsRef<str>>(&self,useragent: T) -> Option<usize> {
|
||||
let useragent = useragent.as_ref();
|
||||
if self.last_checked.get() == 0 {
|
||||
return None;
|
||||
}
|
||||
let entries = self.entries.borrow();
|
||||
for entry in &*entries {
|
||||
if entry.applies_to(useragent) {
|
||||
return entry.get_crawl_delay();
|
||||
}
|
||||
}
|
||||
return None;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
11
tests/lib.rs
11
tests/lib.rs
|
|
@ -217,3 +217,14 @@ fn test_robots_txt_read() {
|
|||
parser.read();
|
||||
assert!(parser.can_fetch("*", "http://www.python.org/robots.txt"));
|
||||
}
|
||||
|
||||
/// A fractional `Crawl-delay` given in seconds is reported in milliseconds.
#[test]
fn test_robots_text_crawl_delay() {
    let robots_txt = "User-agent: Yandex\n\
                      Crawl-delay: 2.35\n\
                      Disallow: /search/\n";
    let parser = RobotFileParser::new("http://www.python.org/robots.txt");
    // Splitting on "\n" (rather than `lines()`) keeps the trailing empty line.
    let lines = robots_txt.split("\n").collect::<Vec<&str>>();
    parser.parse(&lines);
    assert_eq!(Some(2350), parser.get_crawl_delay("Yandex"));
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue