Add per-entry sitemap support and bump robotparser to 0.4.0.

NOTE(review): this patch was rebuilt from a copy that had lost its line
breaks, its indentation, and any text written between angle brackets.
  * Line breaks follow the hunk line counts.
  * Indentation (4 spaces per level) is assumed; confirm against the tree
    or apply with `git apply --ignore-whitespace`.
  * Generic arguments on added lines were restored from their use in this
    patch: `Vec<Url>`, and `T: AsRef<str>` (the latter to be confirmed).
  * Context lines are left as found, so `RefCell>`, `Option` and the
    `authors` entry still lack their bracketed text and will not match the
    tree until that text is filled in.
  * Added lines were revised in review (doc comments, `if let` in
    `add_sitemap`, explicit `\n\` in the test fixture), so the post-image
    blob ids on the `index` lines no longer describe the result.

diff --git a/Cargo.toml b/Cargo.toml
index 05dfafe..8e4195c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "robotparser"
-version = "0.3.0"
+version = "0.4.0"
 authors = ["messense "]
 description = "robots.txt parser for Rust"
 repository = "https://github.com/messense/robotparser-rs"
diff --git a/src/lib.rs b/src/lib.rs
index d669754..d052e6b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -29,6 +29,7 @@ struct Entry {
     useragents: RefCell>,
     rulelines: RefCell>,
     crawl_delay: Option,
+    sitemaps: Vec<Url>,
 }
 
 #[derive(Debug, Eq, PartialEq, Clone)]
@@ -69,6 +70,7 @@ impl Entry {
             useragents: RefCell::new(vec![]),
             rulelines: RefCell::new(vec![]),
             crawl_delay: None,
+            sitemaps: Vec::new(),
         }
     }
 
@@ -129,6 +131,20 @@ impl Entry {
     fn get_crawl_delay(&self) -> Option {
         return self.crawl_delay.clone();
     }
+
+    /// Adds `url` to this entry's sitemaps.
+    ///
+    /// A value that does not parse as a URL is ignored.
+    fn add_sitemap(&mut self, url: &str) {
+        if let Ok(url) = Url::parse(url) {
+            self.sitemaps.push(url);
+        }
+    }
+
+    /// Returns a copy of the sitemaps recorded for this entry.
+    fn get_sitemaps(&self) -> Vec<Url> {
+        return self.sitemaps.clone();
+    }
 }
 
 
@@ -295,6 +311,14 @@ impl RobotFileParser {
                             }
                             state = 2;
                         }
+                    },
+                    ref x if x == "sitemap" => {
+                        // NOTE(review): a `Sitemap:` line seen while `state == 0` is dropped.
+                        // Confirm that is intended; sitemaps are usually file-wide.
+                        if state != 0 {
+                            entry.add_sitemap(&part1);
+                            state = 2;
+                        }
                     }
                     _ => {},
                 }
@@ -361,4 +385,22 @@ impl RobotFileParser {
         }
         return None;
     }
+
+    /// Returns the sitemaps of the first entry that applies to `useragent`.
+    ///
+    /// The result is empty while `last_checked` is still 0, and when no entry
+    /// applies to `useragent`.
+    pub fn get_sitemaps<T: AsRef<str>>(&self, useragent: T) -> Vec<Url> {
+        let useragent = useragent.as_ref();
+        if self.last_checked.get() == 0 {
+            return Vec::new();
+        }
+        let entries = self.entries.borrow();
+        for entry in &*entries {
+            if entry.applies_to(useragent) {
+                return entry.get_sitemaps();
+            }
+        }
+        return Vec::new();
+    }
 }
diff --git a/tests/lib.rs b/tests/lib.rs
index ada37cb..abc9f24 100644
--- a/tests/lib.rs
+++ b/tests/lib.rs
@@ -1,7 +1,9 @@
 extern crate robotparser;
+extern crate url;
 
 use robotparser::RobotFileParser;
 use std::time::Duration;
+use url::Url;
 
 const AGENT: &'static str = "test_robotparser";
 
@@ -228,3 +230,23 @@ fn test_robots_text_crawl_delay() {
     parser.parse(&lines);
     assert_eq!(Duration::new(2,350 * 1000 * 1000), parser.get_crawl_delay("Yandex").unwrap());
 }
+
+#[test]
+fn test_robots_text_sitemaps() {
+    let parser = RobotFileParser::new("http://www.python.org/robots.txt");
+    let doc = "User-agent: Yandex\n\
+    Sitemap: http://example.com/sitemap1.xml\n\
+    Sitemap: http://example.com/sitemap2.xml\n\
+    Sitemap: http://example.com/sitemap3.xml\n\
+    Disallow: /search/\n";
+    let lines: Vec<&str> = doc.split("\n").collect();
+    parser.parse(&lines);
+    assert_eq!(
+        vec![
+            Url::parse("http://example.com/sitemap1.xml").unwrap(),
+            Url::parse("http://example.com/sitemap2.xml").unwrap(),
+            Url::parse("http://example.com/sitemap3.xml").unwrap()
+        ],
+        parser.get_sitemaps("Yandex")
+    );
+}