Merge pull request #2 from svmk/master

Added sitemap support
2026-07-03 10:10:47 +00:00 · 2016-02-08 19:37:19 +08:00 · 2016-02-08 19:37:19 +08:00 · a18dcd4c26
commit a18dcd4c26
parent 255166313f fcfb6d9df6
3 changed files with 59 additions and 1 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@ -1,6 +1,6 @@
 [package]
 name = "robotparser"
-version = "0.3.0"
+version = "0.4.0"
 authors = ["messense <messense@icloud.com>"]
 description = "robots.txt parser for Rust"
 repository = "https://github.com/messense/robotparser-rs"
--- a/src/lib.rs
+++ b/src/lib.rs
@ -29,6 +29,7 @@ struct Entry {
    useragents: RefCell<Vec<String>>,
    rulelines: RefCell<Vec<RuleLine>>,
    crawl_delay: Option<Duration>,
+    sitemaps: Vec<Url>,
 }

 #[derive(Debug, Eq, PartialEq, Clone)]
@ -69,6 +70,7 @@ impl Entry {
            useragents: RefCell::new(vec![]),
            rulelines: RefCell::new(vec![]),
            crawl_delay: None,
+            sitemaps: Vec::new(),
        }
    }

@ -129,6 +131,19 @@ impl Entry {
    fn get_crawl_delay(&self) -> Option<Duration> {
        return self.crawl_delay.clone();
    }
+
+    fn add_sitemap(&mut self,url:&str) { // Records `url` as a sitemap of this entry, but only if it parses as a `Url`.
+        match Url::parse(url) {
+            Ok(url) => {
+                self.sitemaps.push(url);
+            },
+            Err(_) => {}, // A value that fails to parse is dropped silently; no error reaches the caller.
+        }
+    }
+
+    fn get_sitemaps(&self) -> Vec<Url> { // Returns an owned copy of this entry's sitemap list.
+        return self.sitemaps.clone(); // Clones the whole vector on every call.
+    }
 }


@ -295,6 +310,12 @@ impl RobotFileParser {
                            }
                            state = 2;
                        }  
+                    },
+                    ref x if x == "sitemap" => {
+                        if state != 0 {
+                            entry.add_sitemap(&part1);
+                            state = 2;
+                        }
                    }
                    _ => {},
                }
@ -361,4 +382,19 @@ impl RobotFileParser {
        }
        return None;
    }
+
+    /// Returns the sitemaps for this user agent as a `Vec<Url>`: those of the first entry that applies to `useragent`, or an empty vector when no entry applies.
+    pub fn get_sitemaps<T: AsRef<str>>(&self,useragent: T) -> Vec<Url> {
+        let useragent = useragent.as_ref();        
+        if self.last_checked.get() == 0 { // NOTE(review): presumably 0 means nothing has been fetched/parsed yet — confirm.
+            return Vec::new();
+        }
+        let entries = self.entries.borrow();
+        for entry in &*entries { // First matching entry wins; later entries are not consulted.
+            if entry.applies_to(useragent) {
+                return entry.get_sitemaps();
+            }
+        }
+        return Vec::new(); // No entry applies to this user agent.
+    }
 }
--- a/tests/lib.rs
+++ b/tests/lib.rs
@ -1,7 +1,9 @@
 extern crate robotparser;
+extern crate url;

 use robotparser::RobotFileParser;
 use std::time::Duration;
+use url::Url;

 const AGENT: &'static str = "test_robotparser";

@ -228,3 +230,23 @@ fn test_robots_text_crawl_delay() {
    parser.parse(&lines);
    assert_eq!(Duration::new(2,350 * 1000 * 1000), parser.get_crawl_delay("Yandex").unwrap());
 }
+
+#[test]
+fn test_robots_text_sitemaps() { // Checks that three `Sitemap:` lines under one user agent come back in order.
+    let parser = RobotFileParser::new("http://www.python.org/robots.txt");
+    let doc = "User-agent: Yandex\n\
+    Sitemap:  http://example.com/sitemap1.xml
+    Sitemap:  http://example.com/sitemap2.xml
+    Sitemap:  http://example.com/sitemap3.xml
+    Disallow: /search/\n";
+    let lines: Vec<&str> = doc.split("\n").collect(); // NOTE(review): only the first literal line ends in `\n\`; the others keep their raw newline and leading indentation inside the string — presumably the parser trims each line; confirm.
+    parser.parse(&lines);
+    assert_eq!(
+        vec![
+            Url::parse("http://example.com/sitemap1.xml").unwrap(),
+            Url::parse("http://example.com/sitemap2.xml").unwrap(),
+            Url::parse("http://example.com/sitemap3.xml").unwrap()
+        ], 
+        parser.get_sitemaps("Yandex")
+    );
+}