Improvements: represent the crawl delay as a `std::time::Duration` instead of a `usize` number of milliseconds.

This commit is contained in:
Mikhail Svetov 2016-01-13 21:18:07 +07:00
parent f6168c3ea2
commit 02eedd6246
2 changed files with 12 additions and 9 deletions

View file

@ -13,6 +13,7 @@ use std::cell::{Cell, RefCell};
use url::Url; use url::Url;
use hyper::{Client}; use hyper::{Client};
use hyper::status::StatusCode; use hyper::status::StatusCode;
use std::time::Duration;
/// A rule line is a single "Allow:" (allowance==True) or "Disallow:" /// A rule line is a single "Allow:" (allowance==True) or "Disallow:"
/// (allowance==False) followed by a path. /// (allowance==False) followed by a path.
@ -27,7 +28,7 @@ struct RuleLine {
struct Entry { struct Entry {
useragents: RefCell<Vec<String>>, useragents: RefCell<Vec<String>>,
rulelines: RefCell<Vec<RuleLine>>, rulelines: RefCell<Vec<RuleLine>>,
crawl_delay: Option<usize>, crawl_delay: Option<Duration>,
} }
#[derive(Debug, Eq, PartialEq, Clone)] #[derive(Debug, Eq, PartialEq, Clone)]
@ -121,11 +122,11 @@ impl Entry {
useragents.is_empty() && rulelines.is_empty() useragents.is_empty() && rulelines.is_empty()
} }
fn set_crawl_delay(&mut self,delay: usize) { fn set_crawl_delay(&mut self,delay: Duration) {
self.crawl_delay = Some(delay); self.crawl_delay = Some(delay);
} }
fn get_crawl_delay(&self) -> Option<usize> { fn get_crawl_delay(&self) -> Option<Duration> {
return self.crawl_delay.clone(); return self.crawl_delay.clone();
} }
} }
@ -285,8 +286,10 @@ impl RobotFileParser {
let delay = part1.parse::<f64>(); let delay = part1.parse::<f64>();
match delay { match delay {
Ok(delay) => { Ok(delay) => {
let delay = delay * 1000.0; let delay_seconds = delay.trunc();
entry.set_crawl_delay(delay.round() as usize); let delay_nanoseconds = delay.fract()* 10f64.powi(9);
let delay = Duration::new(delay_seconds as u64,delay_nanoseconds as u32);
entry.set_crawl_delay(delay);
}, },
Err(_) => {} Err(_) => {}
} }
@ -344,8 +347,8 @@ impl RobotFileParser {
true true
} }
/// Returns the crawl delay for this user agent as a `usize` in milliseconds, or None if no crawl delay is defined. /// Returns the crawl delay for this user agent as a `Duration`, or None if no crawl delay is defined.
pub fn get_crawl_delay<T: AsRef<str>>(&self,useragent: T) -> Option<usize> { pub fn get_crawl_delay<T: AsRef<str>>(&self,useragent: T) -> Option<Duration> {
let useragent = useragent.as_ref(); let useragent = useragent.as_ref();
if self.last_checked.get() == 0 { if self.last_checked.get() == 0 {
return None; return None;

View file

@ -1,10 +1,10 @@
extern crate robotparser; extern crate robotparser;
use robotparser::RobotFileParser; use robotparser::RobotFileParser;
use std::time::Duration;
const AGENT: &'static str = "test_robotparser"; const AGENT: &'static str = "test_robotparser";
fn robot_test(doc: &str, good_urls: Vec<&str>, bad_urls: Vec<&str>, agent: &str) { fn robot_test(doc: &str, good_urls: Vec<&str>, bad_urls: Vec<&str>, agent: &str) {
let parser = RobotFileParser::new("http://www.baidu.com/robots.txt"); let parser = RobotFileParser::new("http://www.baidu.com/robots.txt");
let lines: Vec<&str> = doc.split("\n").collect(); let lines: Vec<&str> = doc.split("\n").collect();
@ -226,5 +226,5 @@ fn test_robots_text_crawl_delay() {
Disallow: /search/\n"; Disallow: /search/\n";
let lines: Vec<&str> = doc.split("\n").collect(); let lines: Vec<&str> = doc.split("\n").collect();
parser.parse(&lines); parser.parse(&lines);
assert_eq!(2350, parser.get_crawl_delay("Yandex").unwrap()); assert_eq!(Duration::new(2,350 * 1000 * 1000), parser.get_crawl_delay("Yandex").unwrap());
} }