From 49a2b9c93d256b8e09d135c961459d391f48e4b9 Mon Sep 17 00:00:00 2001 From: Mikhail Svetov Date: Wed, 13 Jan 2016 20:51:54 +0700 Subject: [PATCH 1/3] Added crawl delay feature. --- src/lib.rs | 38 ++++++++++++++++++++++++++++++++++++++ tests/lib.rs | 11 +++++++++++ 2 files changed, 49 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index fe24fa9..526e4b5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -27,6 +27,7 @@ struct RuleLine { struct Entry { useragents: RefCell>, rulelines: RefCell>, + crawl_delay: Option<usize>, } #[derive(Debug, Eq, PartialEq, Clone)] @@ -66,6 +67,7 @@ impl Entry { Entry { useragents: RefCell::new(vec![]), rulelines: RefCell::new(vec![]), + crawl_delay: None, } } @@ -118,6 +120,14 @@ impl Entry { let rulelines = self.rulelines.borrow(); useragents.is_empty() && rulelines.is_empty() } + + fn set_crawl_delay(&mut self,delay: usize) { + self.crawl_delay = Some(delay); + } + + fn get_crawl_delay(&self) -> Option<usize> { + return self.crawl_delay.clone(); + } } @@ -270,6 +280,19 @@ impl RobotFileParser { state = 2; } }, + ref x if x == "crawl-delay" => { + if state != 0 { + let delay = part1.parse::<f64>(); + match delay { + Ok(delay) => { + let delay = delay * 1000.0; + entry.set_crawl_delay(delay.round() as usize); + }, + Err(_) => {} + } + state = 2; + } + } _ => {}, } } @@ -320,4 +343,19 @@ impl RobotFileParser { // agent not found ==> access granted true } + + /// Returns the crawl delay for this user agent as a `usize` in milliseconds, or None if no crawl delay is defined. 
+ pub fn get_crawl_delay<T: AsRef<str>>(&self,useragent: T) -> Option<usize> { + let useragent = useragent.as_ref(); + if self.last_checked.get() == 0 { + return None; + } + let entries = self.entries.borrow(); + for entry in &*entries { + if entry.applies_to(useragent) { + return entry.get_crawl_delay(); + } + } + return None; + } } diff --git a/tests/lib.rs b/tests/lib.rs index bc5f9d4..6b4434b 100644 --- a/tests/lib.rs +++ b/tests/lib.rs @@ -217,3 +217,14 @@ fn test_robots_txt_read() { parser.read(); assert!(parser.can_fetch("*", "http://www.python.org/robots.txt")); } + +#[test] +fn test_robots_text_crawl_delay() { + let parser = RobotFileParser::new("http://www.python.org/robots.txt"); + let doc = "User-agent: Yandex\n\ + Crawl-delay: 2.35\n\ + Disallow: /search/\n"; + let lines: Vec<&str> = doc.split("\n").collect(); + parser.parse(&lines); + assert_eq!(2350, parser.get_crawl_delay("Yandex").unwrap()); +} From f6168c3ea26583d0c3f1d5252d57c569e4a469ec Mon Sep 17 00:00:00 2001 From: Mikhail Svetov Date: Wed, 13 Jan 2016 20:53:33 +0700 Subject: [PATCH 2/3] Changed version. --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index db96002..05dfafe 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "robotparser" -version = "0.2.0" +version = "0.3.0" authors = ["messense "] description = "robots.txt parser for Rust" repository = "https://github.com/messense/robotparser-rs" From 02eedd6246e82ba0e708c72e0b79de93b0004d56 Mon Sep 17 00:00:00 2001 From: Mikhail Svetov Date: Wed, 13 Jan 2016 21:18:07 +0700 Subject: [PATCH 3/3] Improvements. 
--- src/lib.rs | 17 ++++++++++------- tests/lib.rs | 4 ++-- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 526e4b5..d669754 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -13,6 +13,7 @@ use std::cell::{Cell, RefCell}; use url::Url; use hyper::{Client}; use hyper::status::StatusCode; +use std::time::Duration; /// A rule line is a single "Allow:" (allowance==True) or "Disallow:" /// (allowance==False) followed by a path.""" @@ -27,7 +28,7 @@ struct RuleLine { struct Entry { useragents: RefCell>, rulelines: RefCell>, - crawl_delay: Option<usize>, + crawl_delay: Option<Duration>, } #[derive(Debug, Eq, PartialEq, Clone)] @@ -121,11 +122,11 @@ impl Entry { useragents.is_empty() && rulelines.is_empty() } - fn set_crawl_delay(&mut self,delay: usize) { + fn set_crawl_delay(&mut self,delay: Duration) { self.crawl_delay = Some(delay); } - fn get_crawl_delay(&self) -> Option<usize> { + fn get_crawl_delay(&self) -> Option<Duration> { return self.crawl_delay.clone(); } } @@ -285,8 +286,10 @@ impl RobotFileParser { let delay = part1.parse::<f64>(); match delay { Ok(delay) => { - let delay = delay * 1000.0; - entry.set_crawl_delay(delay.round() as usize); + let delay_seconds = delay.trunc(); + let delay_nanoseconds = delay.fract()* 10f64.powi(9); + let delay = Duration::new(delay_seconds as u64,delay_nanoseconds as u32); + entry.set_crawl_delay(delay); }, Err(_) => {} } @@ -344,8 +347,8 @@ impl RobotFileParser { true } - /// Returns the crawl delay for this user agent as a `usize` in milliseconds, or None if no crawl delay is defined. - pub fn get_crawl_delay<T: AsRef<str>>(&self,useragent: T) -> Option<usize> { + /// Returns the crawl delay for this user agent as a `Duration`, or None if no crawl delay is defined. 
+ pub fn get_crawl_delay<T: AsRef<str>>(&self,useragent: T) -> Option<Duration> { let useragent = useragent.as_ref(); if self.last_checked.get() == 0 { return None; diff --git a/tests/lib.rs b/tests/lib.rs index 6b4434b..ada37cb 100644 --- a/tests/lib.rs +++ b/tests/lib.rs @@ -1,10 +1,10 @@ extern crate robotparser; use robotparser::RobotFileParser; +use std::time::Duration; const AGENT: &'static str = "test_robotparser"; - fn robot_test(doc: &str, good_urls: Vec<&str>, bad_urls: Vec<&str>, agent: &str) { let parser = RobotFileParser::new("http://www.baidu.com/robots.txt"); let lines: Vec<&str> = doc.split("\n").collect(); @@ -226,5 +226,5 @@ fn test_robots_text_crawl_delay() { Disallow: /search/\n"; let lines: Vec<&str> = doc.split("\n").collect(); parser.parse(&lines); - assert_eq!(2350, parser.get_crawl_delay("Yandex").unwrap()); + assert_eq!(Duration::new(2,350 * 1000 * 1000), parser.get_crawl_delay("Yandex").unwrap()); }