mirror of
https://github.com/danbulant/robotparser-rs
synced 2026-05-19 04:18:42 +00:00
339 lines
9.3 KiB
Rust
339 lines
9.3 KiB
Rust
use robotparser::parser::parse_robots_txt;
|
|
use robotparser::service::RobotsTxtService;
|
|
use std::time::Duration;
|
|
use url::Url;
|
|
|
|
/// User-agent name that `robot_test_simple` passes to `robot_test`.
const AGENT: &str = "test_robotparser";
|
|
|
|
fn robot_test(doc: &str, good_urls: Vec<&str>, bad_urls: Vec<&str>, agent: &str) {
|
|
let url = Url::parse("https://www.baidu.com/robots.txt").unwrap();
|
|
let parser = parse_robots_txt(url.origin(), doc).get_result();
|
|
for url in &good_urls {
|
|
let url = format!("https://www.baidu.com{}", url);
|
|
let url = Url::parse(&url).unwrap();
|
|
assert!(parser.can_fetch(agent, &url));
|
|
}
|
|
for url in &bad_urls {
|
|
let url = format!("https://www.baidu.com{}", url);
|
|
let url = Url::parse(&url).unwrap();
|
|
assert!(!parser.can_fetch(agent, &url));
|
|
}
|
|
}
|
|
|
|
/// Runs `robot_test` with the default test user agent, `AGENT`.
fn robot_test_simple(doc: &str, good_urls: Vec<&str>, bad_urls: Vec<&str>) {
    let agent = AGENT;
    robot_test(doc, good_urls, bad_urls, agent)
}
|
|
|
|
/// A robots.txt that starts with a byte-order mark and uses CRLF line endings.
#[test]
fn test_robots_txt_rn_bom() {
    let robots = concat!(
        "\u{feff}\r\n",
        "User-agent: *\r\n",
        "Disallow: /cyberworld/map/ # This is an infinite virtual URL space\r\n",
        "Disallow: /tmp/ # these will soon disappear\r\n",
        "Disallow: /foo.html\r\n",
    );
    let allowed = vec!["/", "/test.html"];
    let denied = vec!["/cyberworld/map/index.html", "/tmp/xxx", "/foo.html"];
    robot_test_simple(robots, allowed, denied);
}
|
|
|
|
/// Plain `Disallow` rules with trailing comments under the wildcard agent.
#[test]
fn test_robots_txt_1() {
    let robots = concat!(
        "\n",
        "User-agent: *\n",
        "Disallow: /cyberworld/map/ # This is an infinite virtual URL space\n",
        "Disallow: /tmp/ # these will soon disappear\n",
        "Disallow: /foo.html\n",
    );
    let allowed = vec!["/", "/test.html"];
    let denied = vec!["/cyberworld/map/index.html", "/tmp/xxx", "/foo.html"];
    robot_test_simple(robots, allowed, denied);
}
|
|
|
|
/// A wildcard group plus a dedicated `cybermapper` group with an empty `Disallow`.
#[test]
fn test_robots_txt_2() {
    let robots = concat!(
        "\n",
        "# robots.txt for https://www.example.com/\n",
        "\n",
        "User-agent: *\n",
        "Disallow: /cyberworld/map/ # This is an infinite virtual URL space\n",
        "\n",
        "# Cybermapper knows where to go.\n",
        "User-agent: cybermapper\n",
        "Disallow:\n",
        "\n",
    );

    // The default agent falls under the wildcard group.
    robot_test_simple(
        robots,
        vec!["/", "/test.html"],
        vec!["/cyberworld/map/index.html"],
    );

    // `cybermapper` has its own group, so the same path is expected to be fetchable.
    robot_test(
        robots,
        vec!["/cyberworld/map/index.html"],
        vec![],
        "cybermapper",
    );
}
|
|
|
|
/// `Disallow: /` for the wildcard agent denies every listed path.
#[test]
fn test_robots_txt_3() {
    let robots = concat!(
        "\n",
        "# go away\n",
        "User-agent: *\n",
        "Disallow: /\n",
    );
    let denied = vec!["/cyberworld/map/index.html", "/", "/tmp/"];
    robot_test_simple(robots, Vec::new(), denied);
}
|
|
|
|
/// Percent-encoded rules in a `figtree` group, checked against a short and a long
/// user-agent string.
#[test]
fn test_robots_txt_4() {
    let robots = concat!(
        "\n",
        "User-agent: figtree\n",
        "Disallow: /tmp\n",
        "Disallow: /a%3cd.html\n",
        "Disallow: /a%2fb.html\n",
        "Disallow: /%7ejoe/index.html\n",
    );
    let denied = vec![
        "/tmp",
        "/tmp.html",
        "/tmp/a.html",
        "/a%3cd.html",
        "/a%3Cd.html",
        "/a%2fb.html",
        "/~joe/index.html",
    ];
    // Both agent spellings are expected to be denied the same set of paths.
    for &agent in &["figtree", "FigTree Robot libwww-perl/5.04"] {
        robot_test(robots, Vec::new(), denied.clone(), agent);
    }
}
|
|
|
|
/// Percent-encoding case differences and a trailing-slash rule under the wildcard agent.
#[test]
fn test_robots_txt_5() {
    let robots = concat!(
        "\n",
        "User-agent: *\n",
        "Disallow: /tmp/\n",
        "Disallow: /a%3Cd.html\n",
        "Disallow: /a/b.html\n",
        "Disallow: /%7ejoe/index.html\n",
    );
    // `/tmp` lacks the trailing slash of the rule, so it is expected to stay fetchable.
    let allowed = vec!["/tmp"];
    let denied = vec![
        "/tmp/",
        "/tmp/a.html",
        "/a%3cd.html",
        "/a%3Cd.html",
        "/a/b.html",
        "/%7Ejoe/index.html",
    ];
    robot_test_simple(robots, allowed, denied);
}
|
|
|
|
/// A `Disallow: /.` rule is expected to leave `/foo.html` fetchable.
#[test]
fn test_robots_txt_6() {
    let robots = concat!(
        "\n",
        "User-Agent: *\n",
        "Disallow: /.\n",
    );
    robot_test_simple(robots, vec!["/foo.html"], Vec::new());
}
|
|
|
|
/// An `Allow` rule listed before a broader `Disallow` in the `Googlebot` group.
#[test]
fn test_robots_txt_7() {
    let robots = concat!(
        "\n",
        "User-agent: Googlebot\n",
        "Allow: /folder1/myfile.html\n",
        "Disallow: /folder1/\n",
    );
    robot_test(
        robots,
        vec!["/folder1/myfile.html"],
        vec!["/folder1/anotherfile.html"],
        "Googlebot",
    );
}
|
|
|
|
/// This file is incorrect because "Googlebot" is a substring of "Googlebot-Mobile":
/// the test expects both agents to be denied, even though a later group allows
/// everything for "Googlebot-Mobile".
#[test]
fn test_robots_txt_8() {
    let robots = concat!(
        "\n",
        "User-agent: Googlebot\n",
        "Disallow: /\n",
        "\n",
        "User-agent: Googlebot-Mobile\n",
        "Allow: /\n",
    );
    let denied = vec!["/something.jpg"];
    for &agent in &["Googlebot", "Googlebot-Mobile"] {
        robot_test(robots, Vec::new(), denied.clone(), agent);
    }
}
|
|
|
|
/// Same groups as `test_robots_txt_8` but with the more specific
/// "Googlebot-Mobile" group listed first.
#[test]
fn test_robots_txt_9() {
    let robots = concat!(
        "\n",
        "User-agent: Googlebot-Mobile\n",
        "Allow: /\n",
        "\n",
        "User-agent: Googlebot\n",
        "Disallow: /\n",
    );
    let path = "/something.jpg";

    // "Googlebot" is expected to be denied the path...
    robot_test(robots, Vec::new(), vec![path], "Googlebot");
    // ...while "Googlebot-Mobile" is expected to be allowed to fetch it.
    robot_test(robots, vec![path], Vec::new(), "Googlebot-Mobile");
}
|
|
|
|
/// Same rules as `test_robots_txt_7`, queried with a lower-case agent name.
#[test]
fn test_robots_txt_10() {
    let robots = concat!(
        "\n",
        "User-agent: Googlebot\n",
        "Allow: /folder1/myfile.html\n",
        "Disallow: /folder1/\n",
    );
    robot_test(
        robots,
        vec!["/folder1/myfile.html"],
        vec!["/folder1/anotherfile.html"],
        "googlebot",
    );
}
|
|
|
|
/// Query string support: a rule containing a query only denies the URL carrying it.
#[test]
fn test_robots_txt_11() {
    let robots = concat!(
        "\n",
        "User-agent: *\n",
        "Disallow: /some/path?name=value\n",
    );
    robot_test_simple(
        robots,
        vec!["/some/path"],
        vec!["/some/path?name=value"],
    );
}
|
|
|
|
/// Obey the first `*` entry: rules of a second wildcard group are expected to be ignored.
#[test]
fn test_robots_txt_12() {
    let robots = concat!(
        "\n",
        "User-agent: *\n",
        "Disallow: /some/path\n",
        "\n",
        "User-agent: *\n",
        "Disallow: /another/path\n",
    );
    robot_test_simple(robots, vec!["/another/path"], vec!["/some/path"]);
}
|
|
|
|
/// Empty query. Normalizing the url first.
#[test]
fn test_robots_txt_13() {
    let robots = concat!(
        "\n",
        "User-agent: *\n",
        "Allow: /some/path?\n",
        "Disallow: /another/path?\n",
    );
    robot_test_simple(robots, vec!["/some/path?"], vec!["/another/path?"]);
}
|
|
|
|
/// Using patterns with `*` and `$` symbols.
#[test]
fn test_robots_txt_14() {
    // Every fixture line ends with `\n\`: the trailing backslash is a string
    // continuation that drops the source newline and the following indentation,
    // so the robots.txt text does not depend on how this file is formatted.
    let doc = "\n\
               User-agent: *\n\
               Allow: /*video.html\n\
               Allow: */?amp*\n\
               Disallow: */rss$\n\
               Disallow: */rss/$\n\
               Disallow: /rate/\n\
               ";
    let good = vec!["/rss/test", "/sdfvsdvs-sdfvsdv-video.html", "/rate"];
    let bad = vec!["/rss", "/rss/", "/rate/", "/rate/0/9"];
    robot_test_simple(doc, good, bad);
}
|
|
|
|
/// Fetches python.org's live robots.txt over HTTP and checks that the wildcard agent
/// may fetch the robots.txt URL itself. Only built with the `reqwest` feature.
#[cfg(feature = "reqwest")]
#[test]
fn test_robots_txt_read() {
    use reqwest::{Client, Request};
    use robotparser::http::{CreateRobotsTxtRequest, ParseRobotsTxtResponse};

    let robots_txt_url = Url::parse("https://www.python.org/robots.txt").unwrap();
    let request = Request::create_robots_txt_request(robots_txt_url.origin());

    let client = Client::new();
    let mut response = client.execute(request).unwrap();

    let robots_txt = response.parse_robots_txt_response().unwrap().get_result();
    assert!(robots_txt.can_fetch("*", &robots_txt_url));
}
|
|
|
|
/// A fractional `Crawl-delay` of 2.35 is reported as a `Duration` of 2.35 seconds.
#[test]
fn test_robots_text_crawl_delay() {
    let robots = concat!(
        "User-agent: Yandex\n",
        "Crawl-delay: 2.35\n",
        "Disallow: /search/\n",
    );
    let origin = Url::parse("https://www.python.org/robots.txt")
        .unwrap()
        .origin();
    let parser = parse_robots_txt(origin, robots).get_result();

    // 2.35 s == 2350 ms.
    let expected = Duration::from_millis(2_350);
    let actual = parser.get_crawl_delay("Yandex").unwrap();
    assert_eq!(expected, actual);
}
|
|
|
|
/// `Sitemap` directives are collected in file order, including one written with
/// extra whitespace before the colon.
#[test]
fn test_robots_text_sitemaps() {
    let robots_txt_url = Url::parse("https://www.python.org/robots.txt").unwrap();
    // Every fixture line ends with `\n\`: the trailing backslash is a string
    // continuation that drops the source newline and the following indentation,
    // so the robots.txt text does not depend on how this file is formatted.
    let doc = "User-agent: Yandex\n\
               Sitemap \t : https://example.com/sitemap1.xml\n\
               Sitemap: https://example.com/sitemap2.xml\n\
               Sitemap: https://example.com/sitemap3.xml\n\
               Disallow: /search/\n";
    let parser = parse_robots_txt(robots_txt_url.origin(), doc).get_result();
    assert_eq!(
        &[
            Url::parse("https://example.com/sitemap1.xml").unwrap(),
            Url::parse("https://example.com/sitemap2.xml").unwrap(),
            Url::parse("https://example.com/sitemap3.xml").unwrap()
        ],
        parser.get_sitemaps()
    );
}
|
|
|
|
/// `Request-rate: 3/15` is exposed as 3 requests per 15 seconds for the matching agent
/// and as `None` for an agent the file does not mention.
#[test]
fn test_robots_text_request_rate() {
    let robots = concat!(
        "User-agent: Yandex\n",
        "Request-rate: 3/15\n",
        "Disallow: /search/\n",
    );
    let origin = Url::parse("https://www.python.org/robots.txt")
        .unwrap()
        .origin();
    let parser = parse_robots_txt(origin, robots).get_result();

    let yandex_rate = parser.get_req_rate("Yandex").unwrap();
    assert_eq!(3, yandex_rate.requests);
    assert_eq!(15, yandex_rate.seconds);

    let google_rate = parser.get_req_rate("Google");
    assert!(google_rate.is_none());
}
|
|
|
|
/// `Clean-param` directives strip the listed query parameters from URLs of the site the
/// robots.txt was parsed for, and leave URLs of other sites untouched.
#[test]
fn test_robots_text_clean_params() {
    let doc = "\
               User-Agent: *\n\
               Clean-param: mode\n\
               Clean-param: from\n\
               Clean-param: pid\n\
               Clean-param: gid\n\
               Clean-param: tm\n\
               Clean-param: amp\n\
               ";
    let url = Url::parse("https://www.baidu.com/robots.txt").unwrap();
    let parser = parse_robots_txt(url.origin(), doc).get_result();

    // Same site: every parameter named by a `Clean-param` line (including the trailing
    // `amp`) is expected to be removed, leaving only `post_id`.
    let mut site_url = Url::parse(
        "https://www.baidu.com/test?post_id=7777&mode=99&from=google&pid=99&gid=88&tm=777&amp=1",
    )
    .unwrap();
    let was_updated = parser.normalize_url(&mut site_url);
    assert!(was_updated);
    assert_eq!(site_url.as_str(), "https://www.baidu.com/test?post_id=7777");

    // Different site: the rules do not apply, so the URL must come back unchanged.
    let mut site_url = Url::parse(
        "https://www.google.com/test?post_id=7777&mode=99&from=google&pid=99&gid=88&tm=777&amp=1",
    )
    .unwrap();
    let was_updated = parser.normalize_url(&mut site_url);
    assert!(!was_updated);
    assert_eq!(
        site_url.as_str(),
        "https://www.google.com/test?post_id=7777&mode=99&from=google&pid=99&gid=88&tm=777&amp=1"
    );
}
|