mirror of
https://github.com/danbulant/robotparser-rs
synced 2026-05-24 12:35:49 +00:00
commit
a18dcd4c26
3 changed files with 59 additions and 1 deletions
|
|
@ -1,6 +1,6 @@
|
||||||
[package]
|
[package]
|
||||||
name = "robotparser"
|
name = "robotparser"
|
||||||
version = "0.3.0"
|
version = "0.4.0"
|
||||||
authors = ["messense <messense@icloud.com>"]
|
authors = ["messense <messense@icloud.com>"]
|
||||||
description = "robots.txt parser for Rust"
|
description = "robots.txt parser for Rust"
|
||||||
repository = "https://github.com/messense/robotparser-rs"
|
repository = "https://github.com/messense/robotparser-rs"
|
||||||
|
|
|
||||||
36
src/lib.rs
36
src/lib.rs
|
|
@ -29,6 +29,7 @@ struct Entry {
|
||||||
useragents: RefCell<Vec<String>>,
|
useragents: RefCell<Vec<String>>,
|
||||||
rulelines: RefCell<Vec<RuleLine>>,
|
rulelines: RefCell<Vec<RuleLine>>,
|
||||||
crawl_delay: Option<Duration>,
|
crawl_delay: Option<Duration>,
|
||||||
|
sitemaps: Vec<Url>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Eq, PartialEq, Clone)]
|
#[derive(Debug, Eq, PartialEq, Clone)]
|
||||||
|
|
@ -69,6 +70,7 @@ impl Entry {
|
||||||
useragents: RefCell::new(vec![]),
|
useragents: RefCell::new(vec![]),
|
||||||
rulelines: RefCell::new(vec![]),
|
rulelines: RefCell::new(vec![]),
|
||||||
crawl_delay: None,
|
crawl_delay: None,
|
||||||
|
sitemaps: Vec::new(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -129,6 +131,19 @@ impl Entry {
|
||||||
fn get_crawl_delay(&self) -> Option<Duration> {
|
fn get_crawl_delay(&self) -> Option<Duration> {
|
||||||
return self.crawl_delay.clone();
|
return self.crawl_delay.clone();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn add_sitemap(&mut self,url:&str) {
|
||||||
|
match Url::parse(url) {
|
||||||
|
Ok(url) => {
|
||||||
|
self.sitemaps.push(url);
|
||||||
|
},
|
||||||
|
Err(_) => {},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_sitemaps(&self) -> Vec<Url> {
|
||||||
|
return self.sitemaps.clone();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -295,6 +310,12 @@ impl RobotFileParser {
|
||||||
}
|
}
|
||||||
state = 2;
|
state = 2;
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
ref x if x == "sitemap" => {
|
||||||
|
if state != 0 {
|
||||||
|
entry.add_sitemap(&part1);
|
||||||
|
state = 2;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
_ => {},
|
_ => {},
|
||||||
}
|
}
|
||||||
|
|
@ -361,4 +382,19 @@ impl RobotFileParser {
|
||||||
}
|
}
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the sitemaps for this user agent as a `Vec<Url>`.
|
||||||
|
pub fn get_sitemaps<T: AsRef<str>>(&self,useragent: T) -> Vec<Url> {
|
||||||
|
let useragent = useragent.as_ref();
|
||||||
|
if self.last_checked.get() == 0 {
|
||||||
|
return Vec::new();
|
||||||
|
}
|
||||||
|
let entries = self.entries.borrow();
|
||||||
|
for entry in &*entries {
|
||||||
|
if entry.applies_to(useragent) {
|
||||||
|
return entry.get_sitemaps();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return Vec::new();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
22
tests/lib.rs
22
tests/lib.rs
|
|
@ -1,7 +1,9 @@
|
||||||
extern crate robotparser;
|
extern crate robotparser;
|
||||||
|
extern crate url;
|
||||||
|
|
||||||
use robotparser::RobotFileParser;
|
use robotparser::RobotFileParser;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
|
use url::Url;
|
||||||
|
|
||||||
const AGENT: &'static str = "test_robotparser";
|
const AGENT: &'static str = "test_robotparser";
|
||||||
|
|
||||||
|
|
@ -228,3 +230,23 @@ fn test_robots_text_crawl_delay() {
|
||||||
parser.parse(&lines);
|
parser.parse(&lines);
|
||||||
assert_eq!(Duration::new(2,350 * 1000 * 1000), parser.get_crawl_delay("Yandex").unwrap());
|
assert_eq!(Duration::new(2,350 * 1000 * 1000), parser.get_crawl_delay("Yandex").unwrap());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_robots_text_sitemaps() {
|
||||||
|
let parser = RobotFileParser::new("http://www.python.org/robots.txt");
|
||||||
|
let doc = "User-agent: Yandex\n\
|
||||||
|
Sitemap: http://example.com/sitemap1.xml
|
||||||
|
Sitemap: http://example.com/sitemap2.xml
|
||||||
|
Sitemap: http://example.com/sitemap3.xml
|
||||||
|
Disallow: /search/\n";
|
||||||
|
let lines: Vec<&str> = doc.split("\n").collect();
|
||||||
|
parser.parse(&lines);
|
||||||
|
assert_eq!(
|
||||||
|
vec![
|
||||||
|
Url::parse("http://example.com/sitemap1.xml").unwrap(),
|
||||||
|
Url::parse("http://example.com/sitemap2.xml").unwrap(),
|
||||||
|
Url::parse("http://example.com/sitemap3.xml").unwrap()
|
||||||
|
],
|
||||||
|
parser.get_sitemaps("Yandex")
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue