From 8e2944d8a470b2ef92f6caaaa5eda094c87c7f5f Mon Sep 17 00:00:00 2001
From: messense
Date: Wed, 1 Jul 2015 00:01:15 +0800
Subject: [PATCH] Bug fixes and add more test cases

---
 src/lib.rs   |  4 +--
 tests/lib.rs | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 97 insertions(+), 2 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 8192535..c14cc5a 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -100,7 +100,7 @@ impl Entry {
 
     fn push_useragent(&self, useragent: &str) {
         let mut useragents = self.useragents.borrow_mut();
-        useragents.push(useragent.to_owned());
+        useragents.push(useragent.to_lowercase()); // stored lower-cased; `to_lowercase` already yields an owned String
     }
 
     fn push_ruleline(&self, ruleline: RuleLine) {
@@ -298,7 +298,7 @@ impl RobotFileParser {
         }
         // search for given user agent matches
         // the first match counts
-        let decoded_url = String::from_utf8(percent_decode(url.as_bytes())).unwrap_or("".to_owned());
+        let decoded_url = String::from_utf8(percent_decode(url.trim().as_bytes())).unwrap_or_else(|_| String::new());
         let url_str = match decoded_url {
             ref u if !u.is_empty() => u.to_owned(),
             _ => "/".to_owned(),
diff --git a/tests/lib.rs b/tests/lib.rs
index 221d807..1460fd2 100644
--- a/tests/lib.rs
+++ b/tests/lib.rs
@@ -115,3 +115,98 @@ fn test_robots_txt_6() {
     let bad = vec![];
     robot_test_simple(doc, good, bad);
 }
+
+#[test]
+fn test_robots_txt_7() {
+    let doc = "\n\
+    User-agent: Googlebot\n\
+    Allow: /folder1/myfile.html\n\
+    Disallow: /folder1/\n\
+    ";
+    let good = vec!["/folder1/myfile.html"];
+    let bad = vec!["/folder1/anotherfile.html"];
+    robot_test(doc, good, bad, "Googlebot"); // the Allow line precedes the broader Disallow
+}
+
+/// This file is incorrect because "Googlebot" is a substring of "Googlebot-Mobile"
+#[test]
+fn test_robots_txt_8() {
+    let doc = "\n\
+    User-agent: Googlebot\n\
+    Disallow: /\n\
+    \n\
+    User-agent: Googlebot-Mobile\n\
+    Allow: /\n\
+    ";
+    let good = vec![];
+    let bad = vec!["/something.jpg"];
+    robot_test(doc, good.clone(), bad.clone(), "Googlebot");
+    robot_test(doc, good, bad, "Googlebot-Mobile"); // same expectation for both agents
+}
+
+#[test]
+fn test_robots_txt_9() {
+    let doc = "\n\
+    User-agent: Googlebot-Mobile\n\
+    Allow: /\n\
+    \n\
+    User-agent: Googlebot\n\
+    Disallow: /\n\
+    ";
+    let good = vec![];
+    let bad = vec!["/something.jpg"];
+    robot_test(doc, good.clone(), bad.clone(), "Googlebot");
+    robot_test(doc, bad, good, "Googlebot-Mobile"); // note: `bad` is passed in the `good` slot here
+}
+
+#[test]
+fn test_robots_txt_10() {
+    let doc = "\n\
+    User-agent: Googlebot\n\
+    Allow: /folder1/myfile.html\n\
+    Disallow: /folder1/\n\
+    ";
+    let good = vec!["/folder1/myfile.html"];
+    let bad = vec!["/folder1/anotherfile.html"];
+    robot_test(doc, good, bad, "googlebot"); // agent differs from the record only by case
+}
+
+/// Query string support: the rule carries a query; the bare path is expected to stay fetchable.
+#[test]
+fn test_robots_txt_11() {
+    let doc = "\n\
+    User-agent: *\n\
+    Disallow: /some/path?name=value\n\
+    ";
+    let good = vec!["/some/path"];
+    let bad = vec!["/some/path?name=value"];
+    robot_test_simple(doc, good, bad);
+}
+
+/// Obey only the first `User-agent: *` entry; the second entry's path is expected to be allowed.
+#[test]
+fn test_robots_txt_12() {
+    let doc = "\n\
+    User-agent: *\n\
+    Disallow: /some/path\n\
+    \n\
+    User-agent: *\n\
+    Disallow: /another/path\n\
+    ";
+    let good = vec!["/another/path"];
+    let bad = vec!["/some/path"];
+    robot_test_simple(doc, good, bad);
+}
+
+/// Empty query string (trailing `?`). The URL is normalized first.
+#[test]
+fn test_robots_txt_13() {
+    let doc = "\n\
+    User-agent: *\n\
+    Allow: /some/path?\n\
+    Disallow: /another/path?\n\
+    ";
+    let good = vec!["/some/path?"];
+    let bad = vec!["/another/path?"];
+    robot_test_simple(doc, good, bad);
+}