mirror of
https://github.com/danbulant/robotparser-rs
synced 2026-06-17 05:31:17 +00:00
178 lines
No EOL
6.5 KiB
Rust
178 lines
No EOL
6.5 KiB
Rust
use robotparser::parser::{parse_robots_txt, WarningReason};
|
|
use url::{Host, Origin};
|
|
use std::convert::From;
|
|
|
|
/// Payload-free mirror of `robotparser::parser::WarningReason`.
///
/// Several `WarningReason` variants carry fields (they are matched with
/// `{ .. }` in the `From<&WarningReason>` impl in this file). This enum keeps
/// only the variant identity, so tests can compare the reported warnings with
/// plain equality.
///
/// The enum is fieldless, so it is `Copy`; callers never need to `.clone()` it.
#[derive(PartialEq, Eq, Debug, Clone, Copy)]
enum WarningReasonKind {
    /// Counterpart of `WarningReason::InvalidDirectiveFormat`.
    InvalidDirectiveFormat,
    /// Counterpart of `WarningReason::DirectiveKeyIsEmpty`.
    DirectiveKeyIsEmpty,
    /// Counterpart of `WarningReason::UnsupportedDirectiveKey { .. }` (fields dropped).
    UnsupportedDirectiveKey,
    /// Counterpart of `WarningReason::UserAgentCannotBeEmpty`.
    UserAgentCannotBeEmpty,
    /// Counterpart of `WarningReason::DirectiveWithoutUserAgent`.
    DirectiveWithoutUserAgent,
    /// Counterpart of `WarningReason::ParseCrawlDelayError { .. }` (fields dropped).
    ParseCrawlDelayError,
    /// Counterpart of `WarningReason::WrongRequestRateFormat`.
    WrongRequestRateFormat,
    /// Counterpart of `WarningReason::ParseRequestRate { .. }` (fields dropped).
    ParseRequestRate,
    /// Counterpart of `WarningReason::ParseUrl { .. }` (fields dropped).
    ParseUrl,
    /// Counterpart of `WarningReason::WrongCleanParamFormat`.
    WrongCleanParamFormat,
    /// Counterpart of `WarningReason::IgnoredCleanParams { .. }` (fields dropped).
    IgnoredCleanParams,
    /// Counterpart of `WarningReason::WrongPathFormat`.
    WrongPathFormat,
}
|
|
|
|
fn validate_warnings(input: &str, expected_warnings: &[WarningReasonKind]) {
|
|
let host = Host::Domain("python.org".into());
|
|
let origin = Origin::Tuple("http".into(), host, 80);
|
|
let warnings = parse_robots_txt(origin, &input).get_warnings().to_vec();
|
|
assert_eq!(warnings.len(), expected_warnings.len());
|
|
for (warning, expected_warning) in warnings.iter().zip(expected_warnings.iter()) {
|
|
let warning: WarningReasonKind = warning.get_reason().into();
|
|
assert_eq!(expected_warning.clone(), warning);
|
|
}
|
|
}
|
|
|
|
/// A lone backtick, with or without surrounding spaces and tabs, must produce
/// a single `InvalidDirectiveFormat` warning.
#[test]
fn test_warning_invalid_directive_format() {
    let expected = [WarningReasonKind::InvalidDirectiveFormat];
    for &input in &["`", " \t ` \t "] {
        validate_warnings(input, &expected);
    }
}
|
|
|
|
/// A line consisting only of `:` must produce a single `DirectiveKeyIsEmpty`
/// warning.
#[test]
fn test_warning_directive_key_is_empty() {
    let expected = [WarningReasonKind::DirectiveKeyIsEmpty];
    validate_warnings(":", &expected);
}
|
|
|
|
/// The `X-Directive` key, with or without surrounding spaces and tabs, must
/// produce a single `UnsupportedDirectiveKey` warning.
///
/// NOTE(review): despite the name, this test covers the *unsupported* key
/// case; the name is kept so existing test filters continue to match.
#[test]
fn test_warning_supported_directive_key() {
    let expected = [WarningReasonKind::UnsupportedDirectiveKey];
    for &input in &["X-Directive:", "\t X-Directive\t :\t "] {
        validate_warnings(input, &expected);
    }
}
|
|
|
|
|
|
/// A `User-Agent` directive with no value must produce a single
/// `UserAgentCannotBeEmpty` warning, while the same line with the value `*`
/// must produce no warnings.
#[test]
fn test_warning_user_agent_cannot_be_empty() {
    // (robots.txt body, warnings expected for it), checked in order.
    let cases: &[(&str, &[WarningReasonKind])] = &[
        ("User-Agent:", &[WarningReasonKind::UserAgentCannotBeEmpty]),
        ("\t User-Agent\t :\t ", &[WarningReasonKind::UserAgentCannotBeEmpty]),
        ("\t User-Agent\t :\t *", &[]),
    ];
    for &(input, expected) in cases {
        validate_warnings(input, expected);
    }
}
|
|
|
|
/// A `Crawl-Delay` line with no preceding `User-Agent` line must produce a
/// single `DirectiveWithoutUserAgent` warning; with a `User-Agent: *` line in
/// front of it no warnings are expected.
#[test]
fn test_warning_directive_without_user_agent() {
    let orphan_directive = "Crawl-Delay: 5s";
    let directive_in_group = "User-Agent: *\nCrawl-Delay: 5";

    validate_warnings(
        orphan_directive,
        &[WarningReasonKind::DirectiveWithoutUserAgent],
    );
    validate_warnings(directive_in_group, &[]);
}
|
|
|
|
/// `Crawl-Delay` values that are empty, `-` or `5h9` must each produce a
/// single `ParseCrawlDelayError` warning; the value `5` must produce none.
#[test]
fn test_warning_parse_crawl_delay_error() {
    let rejected = [
        "User-Agent: *\nCrawl-Delay: ",
        "User-Agent: *\nCrawl-Delay: -",
        "User-Agent: *\nCrawl-Delay: 5h9",
    ];
    for &input in &rejected {
        validate_warnings(input, &[WarningReasonKind::ParseCrawlDelayError]);
    }

    let accepted = "User-Agent: *\nCrawl-Delay: 5";
    validate_warnings(accepted, &[]);
}
|
|
|
|
/// A `Request-rate` of `1/5` must produce no warnings, while `1//5` and `1`
/// must each produce a single `WrongRequestRateFormat` warning.
#[test]
fn test_warning_request_rate_format() {
    let accepted = "User-Agent: *\nRequest-rate: 1/5";
    validate_warnings(accepted, &[]);

    let malformed = [
        "User-Agent: *\nRequest-rate: 1//5",
        "User-Agent: *\nRequest-rate: 1",
    ];
    for &input in &malformed {
        validate_warnings(input, &[WarningReasonKind::WrongRequestRateFormat]);
    }
}
|
|
|
|
/// `Request-rate` values shaped like `x/y` whose parts are `a/b`, `a/5`,
/// `5/b` or `1.0/5.0` must each produce a single `ParseRequestRate` warning.
#[test]
fn test_warning_request_rate() {
    let expected = [WarningReasonKind::ParseRequestRate];
    let inputs = [
        "User-Agent: *\nRequest-rate: a/b",
        "User-Agent: *\nRequest-rate: a/5",
        "User-Agent: *\nRequest-rate: 5/b",
        "User-Agent: *\nRequest-rate: 1.0/5.0",
    ];
    for &input in &inputs {
        validate_warnings(input, &expected);
    }
}
|
|
|
|
/// A `Sitemap` pointing at `https://python.org/sitemap.xml` must produce no
/// warnings, while the scheme `http$$$` must produce a single `ParseUrl`
/// warning.
#[test]
fn test_warning_parsing_url() {
    let valid_sitemap = "User-Agent: *\nSitemap: https://python.org/sitemap.xml";
    let broken_sitemap = "User-Agent: *\nSitemap: http$$$://python.org/sitemap.xml";

    validate_warnings(valid_sitemap, &[]);
    validate_warnings(broken_sitemap, &[WarningReasonKind::ParseUrl]);
}
|
|
|
|
/// Checks the warnings reported for a series of `Clean-param` values:
/// `ref ` and `&` produce none, an empty value produces
/// `WrongCleanParamFormat`, and `?` and `abc$` produce `IgnoredCleanParams`.
#[test]
fn test_wrong_clean_param() {
    // (robots.txt body, warnings expected for it), checked in order.
    let cases: &[(&str, &[WarningReasonKind])] = &[
        ("User-Agent: *\nClean-param: ref ", &[]),
        (
            "User-Agent: *\nClean-param: ",
            &[WarningReasonKind::WrongCleanParamFormat],
        ),
        ("User-Agent: *\nClean-param: &", &[]),
        (
            "User-Agent: *\nClean-param: ?",
            &[WarningReasonKind::IgnoredCleanParams],
        ),
        (
            "User-Agent: *\nClean-param: abc$",
            &[WarningReasonKind::IgnoredCleanParams],
        ),
    ];
    for &(input, expected) in cases {
        validate_warnings(input, expected);
    }
}
|
|
|
|
/// A single backslash as the path of an `Allow` or `Disallow` directive must
/// produce a single `WrongPathFormat` warning.
#[test]
fn test_warning_wrong_path_format() {
    let expected = [WarningReasonKind::WrongPathFormat];
    let inputs = ["User-Agent: *\nAllow: \\", "User-Agent: *\nDisallow: \\"];
    for &input in &inputs {
        validate_warnings(input, &expected);
    }
}
|
|
|
|
impl From<&WarningReason> for WarningReasonKind {
|
|
fn from(reason: &WarningReason) -> Self {
|
|
match reason {
|
|
&WarningReason::InvalidDirectiveFormat => {
|
|
return WarningReasonKind::InvalidDirectiveFormat;
|
|
},
|
|
&WarningReason::DirectiveKeyIsEmpty => {
|
|
return WarningReasonKind::DirectiveKeyIsEmpty;
|
|
},
|
|
&WarningReason::UnsupportedDirectiveKey {..} => {
|
|
return WarningReasonKind::UnsupportedDirectiveKey;
|
|
},
|
|
&WarningReason::UserAgentCannotBeEmpty => {
|
|
return WarningReasonKind::UserAgentCannotBeEmpty;
|
|
},
|
|
&WarningReason::DirectiveWithoutUserAgent => {
|
|
return WarningReasonKind::DirectiveWithoutUserAgent;
|
|
},
|
|
&WarningReason::ParseCrawlDelayError {..} => {
|
|
return WarningReasonKind::ParseCrawlDelayError;
|
|
},
|
|
&WarningReason::WrongRequestRateFormat => {
|
|
return WarningReasonKind::WrongRequestRateFormat;
|
|
},
|
|
&WarningReason::ParseRequestRate {..} => {
|
|
return WarningReasonKind::ParseRequestRate;
|
|
},
|
|
&WarningReason::ParseUrl {..} => {
|
|
return WarningReasonKind::ParseUrl;
|
|
},
|
|
&WarningReason::WrongCleanParamFormat => {
|
|
return WarningReasonKind::WrongCleanParamFormat;
|
|
},
|
|
&WarningReason::IgnoredCleanParams {..} => {
|
|
return WarningReasonKind::IgnoredCleanParams;
|
|
},
|
|
&WarningReason::WrongPathFormat => {
|
|
return WarningReasonKind::WrongPathFormat;
|
|
},
|
|
}
|
|
}
|
|
} |