mirror of
https://github.com/danbulant/robotparser-rs
synced 2026-05-26 21:41:55 +00:00
Bug fixes and add more test cases
This commit is contained in:
parent
e8928ec198
commit
8e2944d8a4
2 changed files with 97 additions and 2 deletions
|
|
@ -100,7 +100,7 @@ impl Entry {
|
||||||
|
|
||||||
fn push_useragent(&self, useragent: &str) {
|
fn push_useragent(&self, useragent: &str) {
|
||||||
let mut useragents = self.useragents.borrow_mut();
|
let mut useragents = self.useragents.borrow_mut();
|
||||||
useragents.push(useragent.to_owned());
|
useragents.push(useragent.to_lowercase().to_owned());
|
||||||
}
|
}
|
||||||
|
|
||||||
fn push_ruleline(&self, ruleline: RuleLine) {
|
fn push_ruleline(&self, ruleline: RuleLine) {
|
||||||
|
|
@ -298,7 +298,7 @@ impl RobotFileParser {
|
||||||
}
|
}
|
||||||
// search for given user agent matches
|
// search for given user agent matches
|
||||||
// the first match counts
|
// the first match counts
|
||||||
let decoded_url = String::from_utf8(percent_decode(url.as_bytes())).unwrap_or("".to_owned());
|
let decoded_url = String::from_utf8(percent_decode(url.trim().as_bytes())).unwrap_or("".to_owned());
|
||||||
let url_str = match decoded_url {
|
let url_str = match decoded_url {
|
||||||
ref u if !u.is_empty() => u.to_owned(),
|
ref u if !u.is_empty() => u.to_owned(),
|
||||||
_ => "/".to_owned(),
|
_ => "/".to_owned(),
|
||||||
|
|
|
||||||
95
tests/lib.rs
95
tests/lib.rs
|
|
@ -115,3 +115,98 @@ fn test_robots_txt_6() {
|
||||||
let bad = vec![];
|
let bad = vec![];
|
||||||
robot_test_simple(doc, good, bad);
|
robot_test_simple(doc, good, bad);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// A single-file `Allow` rule listed ahead of a `Disallow` rule for the
/// enclosing folder, checked for the "Googlebot" agent.
#[test]
fn test_robots_txt_7() {
    let robots_txt = "\n\
        User-agent: Googlebot\n\
        Allow: /folder1/myfile.html\n\
        Disallow: /folder1/\n\
    ";
    // NOTE(review): assumes `robot_test` takes (document, paths expected to be
    // allowed, paths expected to be refused, agent) — the helper is defined
    // outside this view.
    let allowed = vec!["/folder1/myfile.html"];
    let refused = vec!["/folder1/anotherfile.html"];
    robot_test(robots_txt, allowed, refused, "Googlebot");
}
|
||||||
|
|
||||||
|
/// The robots.txt below is badly ordered: "Googlebot" is a substring of
/// "Googlebot-Mobile", and the test expects both agents to get the same
/// result even though a later group names "Googlebot-Mobile" with `Allow: /`.
#[test]
fn test_robots_txt_8() {
    let robots_txt = "\n\
        User-agent: Googlebot\n\
        Disallow: /\n\
        \n\
        User-agent: Googlebot-Mobile\n\
        Allow: /\n\
    ";
    // NOTE(review): assumes the second and third arguments of `robot_test`
    // are the expected-allowed and expected-refused paths — confirm against
    // the helper, which is defined outside this view.
    let nothing = vec![];
    let everything = vec!["/something.jpg"];
    robot_test(robots_txt, nothing.clone(), everything.clone(), "Googlebot");
    robot_test(robots_txt, nothing, everything, "Googlebot-Mobile");
}
|
||||||
|
|
||||||
|
/// Same two groups as the previous test but with the "Googlebot-Mobile"
/// group listed first; the two agents are now checked with opposite
/// expectations for the same path.
#[test]
fn test_robots_txt_9() {
    let robots_txt = "\n\
        User-agent: Googlebot-Mobile\n\
        Allow: /\n\
        \n\
        User-agent: Googlebot\n\
        Disallow: /\n\
    ";
    let nothing = vec![];
    let everything = vec!["/something.jpg"];
    // NOTE(review): assumes the second and third arguments of `robot_test`
    // are the expected-allowed and expected-refused paths — confirm against
    // the helper, which is defined outside this view.
    robot_test(robots_txt, nothing.clone(), everything.clone(), "Googlebot");
    // The two lists trade places for the mobile agent.
    robot_test(robots_txt, everything, nothing, "Googlebot-Mobile");
}
|
||||||
|
|
||||||
|
/// Same document as `test_robots_txt_7`, but the agent is passed in lower
/// case ("googlebot") while the file spells it "Googlebot".
#[test]
fn test_robots_txt_10() {
    let robots_txt = "\n\
        User-agent: Googlebot\n\
        Allow: /folder1/myfile.html\n\
        Disallow: /folder1/\n\
    ";
    // NOTE(review): assumes `robot_test` takes (document, paths expected to be
    // allowed, paths expected to be refused, agent) — the helper is defined
    // outside this view.
    let allowed = vec!["/folder1/myfile.html"];
    let refused = vec!["/folder1/anotherfile.html"];
    robot_test(robots_txt, allowed, refused, "googlebot");
}
|
||||||
|
|
||||||
|
/// Query string support: the `Disallow` rule includes a query string, and the
/// bare path is checked separately from the path with that query.
#[test]
fn test_robots_txt_11() {
    let robots_txt = "\n\
        User-agent: *\n\
        Disallow: /some/path?name=value\n\
    ";
    // NOTE(review): assumes `robot_test_simple` takes (document, paths
    // expected to be allowed, paths expected to be refused) — the helper is
    // defined outside this view.
    let allowed = vec!["/some/path"];
    let refused = vec!["/some/path?name=value"];
    robot_test_simple(robots_txt, allowed, refused);
}
|
||||||
|
|
||||||
|
/// Obey the first `*` entry: the document holds two `User-agent: *` groups,
/// and only the path named in the first group is listed as refused.
#[test]
fn test_robots_txt_12() {
    let robots_txt = "\n\
        User-agent: *\n\
        Disallow: /some/path\n\
        \n\
        User-agent: *\n\
        Disallow: /another/path\n\
    ";
    // NOTE(review): assumes `robot_test_simple` takes (document, paths
    // expected to be allowed, paths expected to be refused) — the helper is
    // defined outside this view.
    let allowed = vec!["/another/path"];
    let refused = vec!["/some/path"];
    robot_test_simple(robots_txt, allowed, refused);
}
|
||||||
|
|
||||||
|
/// Empty query strings: both rules and both checked paths end in a bare `?`.
/// The original note on this test says the URL is normalized first.
#[test]
fn test_robots_txt_13() {
    let robots_txt = "\n\
        User-agent: *\n\
        Allow: /some/path?\n\
        Disallow: /another/path?\n\
    ";
    // NOTE(review): assumes `robot_test_simple` takes (document, paths
    // expected to be allowed, paths expected to be refused) — the helper is
    // defined outside this view.
    let allowed = vec!["/some/path?"];
    let refused = vec!["/another/path?"];
    robot_test_simple(robots_txt, allowed, refused);
}
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue