mirror of
https://github.com/danbulant/robotparser-rs
synced 2026-05-19 04:18:42 +00:00
Bug fixes and add more test cases
This commit is contained in:
parent
e8928ec198
commit
8e2944d8a4
2 changed files with 97 additions and 2 deletions
|
|
@ -100,7 +100,7 @@ impl Entry {
|
|||
|
||||
fn push_useragent(&self, useragent: &str) {
|
||||
let mut useragents = self.useragents.borrow_mut();
|
||||
useragents.push(useragent.to_owned());
|
||||
useragents.push(useragent.to_lowercase().to_owned());
|
||||
}
|
||||
|
||||
fn push_ruleline(&self, ruleline: RuleLine) {
|
||||
|
|
@ -298,7 +298,7 @@ impl RobotFileParser {
|
|||
}
|
||||
// search for given user agent matches
|
||||
// the first match counts
|
||||
let decoded_url = String::from_utf8(percent_decode(url.as_bytes())).unwrap_or("".to_owned());
|
||||
let decoded_url = String::from_utf8(percent_decode(url.trim().as_bytes())).unwrap_or("".to_owned());
|
||||
let url_str = match decoded_url {
|
||||
ref u if !u.is_empty() => u.to_owned(),
|
||||
_ => "/".to_owned(),
|
||||
|
|
|
|||
95
tests/lib.rs
95
tests/lib.rs
|
|
@ -115,3 +115,98 @@ fn test_robots_txt_6() {
|
|||
let bad = vec![];
|
||||
robot_test_simple(doc, good, bad);
|
||||
}
|
||||
|
||||
/// An `Allow` rule for a single file listed before a `Disallow` rule for its
/// folder: the file is expected to be fetchable, a sibling file is not.
#[test]
fn test_robots_txt_7() {
    let robots_txt = concat!(
        "\n",
        "User-agent: Googlebot\n",
        "Allow: /folder1/myfile.html\n",
        "Disallow: /folder1/\n",
    );
    let allowed = vec!["/folder1/myfile.html"];
    let denied = vec!["/folder1/anotherfile.html"];
    robot_test(robots_txt, allowed, denied, "Googlebot");
}
|
||||
|
||||
/// This robots.txt is incorrect: "Googlebot" is a substring of
/// "Googlebot-Mobile", and its entry comes first. Both agents are therefore
/// expected to be denied `/something.jpg`.
#[test]
fn test_robots_txt_8() {
    let robots_txt = concat!(
        "\n",
        "User-agent: Googlebot\n",
        "Disallow: /\n",
        "\n",
        "User-agent: Googlebot-Mobile\n",
        "Allow: /\n",
    );
    let no_urls = vec![];
    let image = vec!["/something.jpg"];
    // Same expectation for both agents: nothing allowed, the image denied.
    robot_test(robots_txt, no_urls.clone(), image.clone(), "Googlebot");
    robot_test(robots_txt, no_urls, image, "Googlebot-Mobile");
}
|
||||
|
||||
/// Same rules as the previous test but with the more specific
/// "Googlebot-Mobile" entry listed first: "Googlebot" is expected to be
/// denied `/something.jpg`, while "Googlebot-Mobile" is expected to be allowed.
#[test]
fn test_robots_txt_9() {
    let robots_txt = concat!(
        "\n",
        "User-agent: Googlebot-Mobile\n",
        "Allow: /\n",
        "\n",
        "User-agent: Googlebot\n",
        "Disallow: /\n",
    );
    let no_urls = vec![];
    let image = vec!["/something.jpg"];
    // Googlebot: nothing allowed, the image denied.
    robot_test(robots_txt, no_urls.clone(), image.clone(), "Googlebot");
    // Googlebot-Mobile: expectations swapped, so the image is allowed.
    robot_test(robots_txt, image, no_urls, "Googlebot-Mobile");
}
|
||||
|
||||
/// Same document as `test_robots_txt_7`, but the crawler name is passed in
/// lowercase ("googlebot") while the file spells it "Googlebot".
#[test]
fn test_robots_txt_10() {
    let robots_txt = concat!(
        "\n",
        "User-agent: Googlebot\n",
        "Allow: /folder1/myfile.html\n",
        "Disallow: /folder1/\n",
    );
    let allowed = vec!["/folder1/myfile.html"];
    let denied = vec!["/folder1/anotherfile.html"];
    robot_test(robots_txt, allowed, denied, "googlebot");
}
|
||||
|
||||
/// Query string support: a `Disallow` rule that includes a query string is
/// expected to block that exact URL but not the bare path.
#[test]
fn test_robots_txt_11() {
    let robots_txt = concat!(
        "\n",
        "User-agent: *\n",
        "Disallow: /some/path?name=value\n",
    );
    let allowed = vec!["/some/path"];
    let denied = vec!["/some/path?name=value"];
    robot_test_simple(robots_txt, allowed, denied);
}
|
||||
|
||||
/// Only the first `*` entry is obeyed: the path disallowed by the second `*`
/// entry is expected to remain fetchable.
#[test]
fn test_robots_txt_12() {
    let robots_txt = concat!(
        "\n",
        "User-agent: *\n",
        "Disallow: /some/path\n",
        "\n",
        "User-agent: *\n",
        "Disallow: /another/path\n",
    );
    let allowed = vec!["/another/path"];
    let denied = vec!["/some/path"];
    robot_test_simple(robots_txt, allowed, denied);
}
|
||||
|
||||
/// Empty query strings: both the rules and the tested URLs end in a bare `?`.
/// The original note on this test says the URL is normalized first.
#[test]
fn test_robots_txt_13() {
    let robots_txt = concat!(
        "\n",
        "User-agent: *\n",
        "Allow: /some/path?\n",
        "Disallow: /another/path?\n",
    );
    let allowed = vec!["/some/path?"];
    let denied = vec!["/another/path?"];
    robot_test_simple(robots_txt, allowed, denied);
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue