mirror of
https://github.com/danbulant/robotparser-rs
synced 2026-05-26 21:41:55 +00:00
commit
255166313f
3 changed files with 54 additions and 2 deletions
|
|
@ -1,6 +1,6 @@
|
||||||
[package]
|
[package]
|
||||||
name = "robotparser"
|
name = "robotparser"
|
||||||
version = "0.2.0"
|
version = "0.3.0"
|
||||||
authors = ["messense <messense@icloud.com>"]
|
authors = ["messense <messense@icloud.com>"]
|
||||||
description = "robots.txt parser for Rust"
|
description = "robots.txt parser for Rust"
|
||||||
repository = "https://github.com/messense/robotparser-rs"
|
repository = "https://github.com/messense/robotparser-rs"
|
||||||
|
|
|
||||||
41
src/lib.rs
41
src/lib.rs
|
|
@ -13,6 +13,7 @@ use std::cell::{Cell, RefCell};
|
||||||
use url::Url;
|
use url::Url;
|
||||||
use hyper::{Client};
|
use hyper::{Client};
|
||||||
use hyper::status::StatusCode;
|
use hyper::status::StatusCode;
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
/// A rule line is a single "Allow:" (allowance==True) or "Disallow:"
|
/// A rule line is a single "Allow:" (allowance==True) or "Disallow:"
|
||||||
/// (allowance==False) followed by a path."""
|
/// (allowance==False) followed by a path."""
|
||||||
|
|
@ -27,6 +28,7 @@ struct RuleLine {
|
||||||
struct Entry {
|
struct Entry {
|
||||||
useragents: RefCell<Vec<String>>,
|
useragents: RefCell<Vec<String>>,
|
||||||
rulelines: RefCell<Vec<RuleLine>>,
|
rulelines: RefCell<Vec<RuleLine>>,
|
||||||
|
crawl_delay: Option<Duration>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Eq, PartialEq, Clone)]
|
#[derive(Debug, Eq, PartialEq, Clone)]
|
||||||
|
|
@ -66,6 +68,7 @@ impl Entry {
|
||||||
Entry {
|
Entry {
|
||||||
useragents: RefCell::new(vec![]),
|
useragents: RefCell::new(vec![]),
|
||||||
rulelines: RefCell::new(vec![]),
|
rulelines: RefCell::new(vec![]),
|
||||||
|
crawl_delay: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -118,6 +121,14 @@ impl Entry {
|
||||||
let rulelines = self.rulelines.borrow();
|
let rulelines = self.rulelines.borrow();
|
||||||
useragents.is_empty() && rulelines.is_empty()
|
useragents.is_empty() && rulelines.is_empty()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn set_crawl_delay(&mut self,delay: Duration) {
|
||||||
|
self.crawl_delay = Some(delay);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_crawl_delay(&self) -> Option<Duration> {
|
||||||
|
return self.crawl_delay.clone();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -270,6 +281,21 @@ impl RobotFileParser {
|
||||||
state = 2;
|
state = 2;
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
ref x if x == "crawl-delay" => {
|
||||||
|
if state != 0 {
|
||||||
|
let delay = part1.parse::<f64>();
|
||||||
|
match delay {
|
||||||
|
Ok(delay) => {
|
||||||
|
let delay_seconds = delay.trunc();
|
||||||
|
let delay_nanoseconds = delay.fract()* 10f64.powi(9);
|
||||||
|
let delay = Duration::new(delay_seconds as u64,delay_nanoseconds as u32);
|
||||||
|
entry.set_crawl_delay(delay);
|
||||||
|
},
|
||||||
|
Err(_) => {}
|
||||||
|
}
|
||||||
|
state = 2;
|
||||||
|
}
|
||||||
|
}
|
||||||
_ => {},
|
_ => {},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -320,4 +346,19 @@ impl RobotFileParser {
|
||||||
// agent not found ==> access granted
|
// agent not found ==> access granted
|
||||||
true
|
true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the crawl delay for this user agent as a `Duration`, or None if no crawl delay is defined.
|
||||||
|
pub fn get_crawl_delay<T: AsRef<str>>(&self,useragent: T) -> Option<Duration> {
|
||||||
|
let useragent = useragent.as_ref();
|
||||||
|
if self.last_checked.get() == 0 {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
let entries = self.entries.borrow();
|
||||||
|
for entry in &*entries {
|
||||||
|
if entry.applies_to(useragent) {
|
||||||
|
return entry.get_crawl_delay();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return None;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
13
tests/lib.rs
13
tests/lib.rs
|
|
@ -1,10 +1,10 @@
|
||||||
extern crate robotparser;
|
extern crate robotparser;
|
||||||
|
|
||||||
use robotparser::RobotFileParser;
|
use robotparser::RobotFileParser;
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
const AGENT: &'static str = "test_robotparser";
|
const AGENT: &'static str = "test_robotparser";
|
||||||
|
|
||||||
|
|
||||||
fn robot_test(doc: &str, good_urls: Vec<&str>, bad_urls: Vec<&str>, agent: &str) {
|
fn robot_test(doc: &str, good_urls: Vec<&str>, bad_urls: Vec<&str>, agent: &str) {
|
||||||
let parser = RobotFileParser::new("http://www.baidu.com/robots.txt");
|
let parser = RobotFileParser::new("http://www.baidu.com/robots.txt");
|
||||||
let lines: Vec<&str> = doc.split("\n").collect();
|
let lines: Vec<&str> = doc.split("\n").collect();
|
||||||
|
|
@ -217,3 +217,14 @@ fn test_robots_txt_read() {
|
||||||
parser.read();
|
parser.read();
|
||||||
assert!(parser.can_fetch("*", "http://www.python.org/robots.txt"));
|
assert!(parser.can_fetch("*", "http://www.python.org/robots.txt"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// A fractional `Crawl-delay` value is exposed as a `Duration` for the
/// user agent it was declared for.
#[test]
fn test_robots_text_crawl_delay() {
    let parser = RobotFileParser::new("http://www.python.org/robots.txt");
    let robots_txt = "User-agent: Yandex\n\
                      Crawl-delay: 2.35\n\
                      Disallow: /search/\n";
    let lines = robots_txt.split("\n").collect::<Vec<&str>>();
    parser.parse(&lines);

    // 2.35 seconds == 2350 milliseconds.
    let expected = Duration::from_millis(2350);
    assert_eq!(expected, parser.get_crawl_delay("Yandex").unwrap());
}
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue