robotparser-rs/tests/test_warnings.rs
2020-03-02 15:20:44 +08:00

178 lines
No EOL
6.5 KiB
Rust

use robotparser::parser::{parse_robots_txt, WarningReason};
use url::{Host, Origin};
use std::convert::From;
/// Payload-free counterpart of `WarningReason` used by the tests in this file.
///
/// Each variant corresponds to the `WarningReason` variant of the same name
/// (see the `From<&WarningReason>` conversion in this file). Any data carried
/// by the parser's variant is discarded, which lets tests compare warnings by
/// kind only.
///
/// The enum has no fields, so it is `Copy` and can be passed around by value
/// without explicit cloning.
#[derive(PartialEq, Eq, Debug, Clone, Copy)]
enum WarningReasonKind {
    InvalidDirectiveFormat,
    DirectiveKeyIsEmpty,
    UnsupportedDirectiveKey,
    UserAgentCannotBeEmpty,
    DirectiveWithoutUserAgent,
    ParseCrawlDelayError,
    WrongRequestRateFormat,
    ParseRequestRate,
    ParseUrl,
    WrongCleanParamFormat,
    IgnoredCleanParams,
    WrongPathFormat,
}
fn validate_warnings(input: &str, expected_warnings: &[WarningReasonKind]) {
let host = Host::Domain("python.org".into());
let origin = Origin::Tuple("http".into(), host, 80);
let warnings = parse_robots_txt(origin, &input).get_warnings().to_vec();
assert_eq!(warnings.len(), expected_warnings.len());
for (warning, expected_warning) in warnings.iter().zip(expected_warnings.iter()) {
let warning: WarningReasonKind = warning.get_reason().into();
assert_eq!(expected_warning.clone(), warning);
}
}
/// A lone backtick, with or without surrounding blanks, is expected to yield
/// exactly one `InvalidDirectiveFormat` warning.
#[test]
fn test_warning_invalid_directive_format() {
    let expected = [WarningReasonKind::InvalidDirectiveFormat];
    for input in &["`", " \t ` \t "] {
        validate_warnings(input, &expected);
    }
}
/// A bare `:` is expected to yield exactly one `DirectiveKeyIsEmpty` warning.
#[test]
fn test_warning_directive_key_is_empty() {
    validate_warnings(":", &[WarningReasonKind::DirectiveKeyIsEmpty]);
}
/// The `X-Directive` key, with or without surrounding blanks, is expected to
/// yield exactly one `UnsupportedDirectiveKey` warning.
#[test]
fn test_warning_supported_directive_key() {
    let expected = [WarningReasonKind::UnsupportedDirectiveKey];
    for input in &["X-Directive:", "\t X-Directive\t :\t "] {
        validate_warnings(input, &expected);
    }
}
/// A `User-Agent` directive with no value is expected to yield one
/// `UserAgentCannotBeEmpty` warning; with the value `*` no warning is expected.
#[test]
fn test_warning_user_agent_cannot_be_empty() {
    let empty_agent = [WarningReasonKind::UserAgentCannotBeEmpty];
    validate_warnings("User-Agent:", &empty_agent);
    validate_warnings("\t User-Agent\t :\t ", &empty_agent);

    // Same layout as the previous case, but with a value present.
    validate_warnings("\t User-Agent\t :\t *", &[]);
}
/// A `Crawl-Delay` line with no preceding `User-Agent` line is expected to
/// yield one `DirectiveWithoutUserAgent` warning; preceded by `User-Agent: *`
/// no warning is expected.
#[test]
fn test_warning_directive_without_user_agent() {
    validate_warnings(
        "Crawl-Delay: 5s",
        &[WarningReasonKind::DirectiveWithoutUserAgent],
    );
    validate_warnings("User-Agent: *\nCrawl-Delay: 5", &[]);
}
/// `Crawl-Delay` values that are empty, `-`, or `5h9` are each expected to
/// yield one `ParseCrawlDelayError` warning; the value `5` is expected to
/// yield none.
#[test]
fn test_warning_parse_crawl_delay_error() {
    let parse_error = [WarningReasonKind::ParseCrawlDelayError];
    let rejected_inputs = [
        "User-Agent: *\nCrawl-Delay: ",
        "User-Agent: *\nCrawl-Delay: -",
        "User-Agent: *\nCrawl-Delay: 5h9",
    ];
    for input in &rejected_inputs {
        validate_warnings(input, &parse_error);
    }

    validate_warnings("User-Agent: *\nCrawl-Delay: 5", &[]);
}
/// `Request-rate: 1/5` is expected to produce no warning, while `1//5` and `1`
/// are each expected to yield one `WrongRequestRateFormat` warning.
#[test]
fn test_warning_request_rate_format() {
    validate_warnings("User-Agent: *\nRequest-rate: 1/5", &[]);

    let wrong_format = [WarningReasonKind::WrongRequestRateFormat];
    validate_warnings("User-Agent: *\nRequest-rate: 1//5", &wrong_format);
    validate_warnings("User-Agent: *\nRequest-rate: 1", &wrong_format);
}
/// `Request-rate` values `a/b`, `a/5`, `5/b` and `1.0/5.0` are each expected
/// to yield one `ParseRequestRate` warning.
#[test]
fn test_warning_request_rate() {
    let expected = [WarningReasonKind::ParseRequestRate];
    let inputs = [
        "User-Agent: *\nRequest-rate: a/b",
        "User-Agent: *\nRequest-rate: a/5",
        "User-Agent: *\nRequest-rate: 5/b",
        "User-Agent: *\nRequest-rate: 1.0/5.0",
    ];
    for input in &inputs {
        validate_warnings(input, &expected);
    }
}
/// A `Sitemap` with an `https://` URL is expected to produce no warning; one
/// whose scheme is written `http$$$` is expected to yield a `ParseUrl` warning.
#[test]
fn test_warning_parsing_url() {
    validate_warnings(
        "User-Agent: *\nSitemap: https://python.org/sitemap.xml",
        &[],
    );
    validate_warnings(
        "User-Agent: *\nSitemap: http$$$://python.org/sitemap.xml",
        &[WarningReasonKind::ParseUrl],
    );
}
/// Checks the warnings expected for several `Clean-param` values:
/// `ref ` and `&` produce none, an empty value produces
/// `WrongCleanParamFormat`, and `?` and `abc$` produce `IgnoredCleanParams`.
#[test]
fn test_wrong_clean_param() {
    let no_warnings: [WarningReasonKind; 0] = [];
    let wrong_format = [WarningReasonKind::WrongCleanParamFormat];
    let ignored_params = [WarningReasonKind::IgnoredCleanParams];

    validate_warnings("User-Agent: *\nClean-param: ref ", &no_warnings);
    validate_warnings("User-Agent: *\nClean-param: ", &wrong_format);
    validate_warnings("User-Agent: *\nClean-param: &", &no_warnings);
    validate_warnings("User-Agent: *\nClean-param: ?", &ignored_params);
    validate_warnings("User-Agent: *\nClean-param: abc$", &ignored_params);
}
/// A single backslash as the value of `Allow` or `Disallow` is expected to
/// yield one `WrongPathFormat` warning.
#[test]
fn test_warning_wrong_path_format() {
    let expected = [WarningReasonKind::WrongPathFormat];
    for input in &["User-Agent: *\nAllow: \\", "User-Agent: *\nDisallow: \\"] {
        validate_warnings(input, &expected);
    }
}
/// Reduces a parser `WarningReason` to its payload-free [`WarningReasonKind`].
impl From<&WarningReason> for WarningReasonKind {
    /// Maps each `WarningReason` variant to the `WarningReasonKind` variant of
    /// the same name, ignoring any data the source variant carries.
    ///
    /// The match deliberately has no catch-all arm, so it stops compiling if
    /// the `WarningReason` patterns listed here no longer cover the enum.
    fn from(reason: &WarningReason) -> Self {
        match reason {
            WarningReason::InvalidDirectiveFormat => WarningReasonKind::InvalidDirectiveFormat,
            WarningReason::DirectiveKeyIsEmpty => WarningReasonKind::DirectiveKeyIsEmpty,
            WarningReason::UnsupportedDirectiveKey { .. } => {
                WarningReasonKind::UnsupportedDirectiveKey
            }
            WarningReason::UserAgentCannotBeEmpty => WarningReasonKind::UserAgentCannotBeEmpty,
            WarningReason::DirectiveWithoutUserAgent => {
                WarningReasonKind::DirectiveWithoutUserAgent
            }
            WarningReason::ParseCrawlDelayError { .. } => WarningReasonKind::ParseCrawlDelayError,
            WarningReason::WrongRequestRateFormat => WarningReasonKind::WrongRequestRateFormat,
            WarningReason::ParseRequestRate { .. } => WarningReasonKind::ParseRequestRate,
            WarningReason::ParseUrl { .. } => WarningReasonKind::ParseUrl,
            WarningReason::WrongCleanParamFormat => WarningReasonKind::WrongCleanParamFormat,
            WarningReason::IgnoredCleanParams { .. } => WarningReasonKind::IgnoredCleanParams,
            WarningReason::WrongPathFormat => WarningReasonKind::WrongPathFormat,
        }
    }
}