Fix bugs and add more test cases

This commit is contained in:
messense 2015-07-01 00:01:15 +08:00
parent e8928ec198
commit 8e2944d8a4
2 changed files with 97 additions and 2 deletions

View file

@ -100,7 +100,7 @@ impl Entry {
/// Appends `useragent` to this entry's list of user-agent names.
///
/// The name is stored once, in lower-cased form. Storing the raw spelling as
/// well would leave a second, case-sensitive copy of every agent in the list.
// NOTE(review): presumably lookups compare against a lower-cased agent name —
// confirm against the matching code, which is not visible here.
fn push_useragent(&self, useragent: &str) {
    // `to_lowercase` already yields an owned `String`; no extra `to_owned` needed.
    self.useragents.borrow_mut().push(useragent.to_lowercase());
}
fn push_ruleline(&self, ruleline: RuleLine) {
@ -298,7 +298,7 @@ impl RobotFileParser {
}
// search for given user agent matches
// the first match counts
let decoded_url = String::from_utf8(percent_decode(url.as_bytes())).unwrap_or("".to_owned());
let decoded_url = String::from_utf8(percent_decode(url.trim().as_bytes())).unwrap_or("".to_owned());
let url_str = match decoded_url {
ref u if !u.is_empty() => u.to_owned(),
_ => "/".to_owned(),

View file

@ -115,3 +115,98 @@ fn test_robots_txt_6() {
let bad = vec![];
robot_test_simple(doc, good, bad);
}
/// A single-file `Allow` rule listed ahead of a `Disallow` rule for its
/// directory, checked with the agent name "Googlebot".
#[test]
fn test_robots_txt_7() {
    let robots_txt = "\n\
        User-agent: Googlebot\n\
        Allow: /folder1/myfile.html\n\
        Disallow: /folder1/\n\
        ";
    // The explicitly allowed file is passed as the `good` list, its sibling
    // in the same directory as the `bad` list.
    robot_test(
        robots_txt,
        vec!["/folder1/myfile.html"],
        vec!["/folder1/anotherfile.html"],
        "Googlebot",
    );
}
/// This file is incorrect because "Googlebot" is a substring of "Googlebot-Mobile".
///
/// The same expectations are therefore checked for both agent names.
#[test]
fn test_robots_txt_8() {
    let robots_txt = "\n\
        User-agent: Googlebot\n\
        Disallow: /\n\
        \n\
        User-agent: Googlebot-Mobile\n\
        Allow: /\n\
        ";
    // Identical `good` (empty) and `bad` lists for each of the two agents.
    robot_test(robots_txt, vec![], vec!["/something.jpg"], "Googlebot");
    robot_test(robots_txt, vec![], vec!["/something.jpg"], "Googlebot-Mobile");
}
/// Same rules as `test_robots_txt_8`, but with the "Googlebot-Mobile" group
/// listed before the "Googlebot" group.
#[test]
fn test_robots_txt_9() {
    let robots_txt = "\n\
        User-agent: Googlebot-Mobile\n\
        Allow: /\n\
        \n\
        User-agent: Googlebot\n\
        Disallow: /\n\
        ";
    // "Googlebot": nothing in the `good` list, the image in the `bad` list.
    robot_test(robots_txt, vec![], vec!["/something.jpg"], "Googlebot");
    // "Googlebot-Mobile": the two lists trade places.
    robot_test(robots_txt, vec!["/something.jpg"], vec![], "Googlebot-Mobile");
}
/// Same document and URL lists as `test_robots_txt_7`, but the agent name is
/// passed in lower case ("googlebot") while the file spells it "Googlebot".
#[test]
fn test_robots_txt_10() {
    let robots_txt = "\n\
        User-agent: Googlebot\n\
        Allow: /folder1/myfile.html\n\
        Disallow: /folder1/\n\
        ";
    let allowed = vec!["/folder1/myfile.html"];
    let denied = vec!["/folder1/anotherfile.html"];
    robot_test(robots_txt, allowed, denied, "googlebot");
}
/// Query string support: the `Disallow` rule carries a query string, and the
/// same path without the query is passed in the `good` list.
#[test]
fn test_robots_txt_11() {
    let robots_txt = "\n\
        User-agent: *\n\
        Disallow: /some/path?name=value\n\
        ";
    robot_test_simple(
        robots_txt,
        vec!["/some/path"],
        vec!["/some/path?name=value"],
    );
}
/// Obey the first `*` entry: the document has two `User-agent: *` groups, and
/// only the path from the first group is passed in the `bad` list.
#[test]
fn test_robots_txt_12() {
    let robots_txt = "\n\
        User-agent: *\n\
        Disallow: /some/path\n\
        \n\
        User-agent: *\n\
        Disallow: /another/path\n\
        ";
    let allowed = vec!["/another/path"];
    let denied = vec!["/some/path"];
    robot_test_simple(robots_txt, allowed, denied);
}
/// Empty query (a trailing `?` with nothing after it). Normalizing the url first.
#[test]
fn test_robots_txt_13() {
    let robots_txt = "\n\
        User-agent: *\n\
        Allow: /some/path?\n\
        Disallow: /another/path?\n\
        ";
    // Both the rules and the probed URLs end in a bare `?`.
    robot_test_simple(
        robots_txt,
        vec!["/some/path?"],
        vec!["/another/path?"],
    );
}