use robotparser::parser::{parse_robots_txt, WarningReason};
use url::{Host, Origin};
use std::convert::From;

/// Payload-free mirror of [`WarningReason`].
///
/// Several `WarningReason` variants carry data (they are matched with `{ .. }`
/// in the `From` impl below); this enum keeps only the variant identity so the
/// tests can compare reasons with `assert_eq!`.
#[derive(PartialEq, Eq, Debug, Clone)]
enum WarningReasonKind {
    InvalidDirectiveFormat,
    DirectiveKeyIsEmpty,
    UnsupportedDirectiveKey,
    UserAgentCannotBeEmpty,
    DirectiveWithoutUserAgent,
    ParseCrawlDelayError,
    WrongRequestRateFormat,
    ParseRequestRate,
    ParseUrl,
    WrongCleanParamFormat,
    IgnoredCleanParams,
    WrongPathFormat,
}

/// Parses `input` as a robots.txt body for the origin `http://python.org:80`
/// and asserts that the parser reported exactly `expected_warnings`, in order.
///
/// Panics (failing the calling test) when the reported warning kinds differ
/// from the expected ones in number, order, or kind.
fn validate_warnings(input: &str, expected_warnings: &[WarningReasonKind]) {
    let host = Host::Domain("python.org".into());
    let origin = Origin::Tuple("http".into(), host, 80);

    // Keep the parse result alive in a local so the warnings can be borrowed
    // from it instead of being cloned into a temporary vector.
    let result = parse_robots_txt(origin, input);
    let actual: Vec<WarningReasonKind> = result
        .get_warnings()
        .iter()
        .map(|warning| WarningReasonKind::from(warning.get_reason()))
        .collect();

    // Comparing the whole sequence at once checks both length and content and
    // prints the complete actual/expected lists on failure.
    assert_eq!(
        actual.as_slice(),
        expected_warnings,
        "unexpected warnings for input {:?}",
        input
    );
}

#[test]
fn test_warning_invalid_directive_format() {
    validate_warnings("`", &[WarningReasonKind::InvalidDirectiveFormat]);
    validate_warnings(" \t ` \t ", &[WarningReasonKind::InvalidDirectiveFormat]);
}

#[test]
fn test_warning_directive_key_is_empty() {
    validate_warnings(":", &[WarningReasonKind::DirectiveKeyIsEmpty]);
}

// NOTE(review): despite its name, this test covers the *unsupported* directive
// key warning. The name is left unchanged so existing test filters keep matching.
#[test]
fn test_warning_supported_directive_key() {
    validate_warnings("X-Directive:", &[WarningReasonKind::UnsupportedDirectiveKey]);
    validate_warnings(
        "\t X-Directive\t :\t ",
        &[WarningReasonKind::UnsupportedDirectiveKey],
    );
}

#[test]
fn test_warning_user_agent_cannot_be_empty() {
    validate_warnings("User-Agent:", &[WarningReasonKind::UserAgentCannotBeEmpty]);
    validate_warnings(
        "\t User-Agent\t :\t ",
        &[WarningReasonKind::UserAgentCannotBeEmpty],
    );
    validate_warnings("\t User-Agent\t :\t *", &[]);
}

#[test]
fn test_warning_directive_without_user_agent() {
    validate_warnings(
        "Crawl-Delay: 5s",
        &[WarningReasonKind::DirectiveWithoutUserAgent],
    );
    validate_warnings("User-Agent: *\nCrawl-Delay: 5", &[]);
}

#[test]
fn test_warning_parse_crawl_delay_error() {
    validate_warnings(
        "User-Agent: *\nCrawl-Delay: ",
        &[WarningReasonKind::ParseCrawlDelayError],
    );
    validate_warnings(
        "User-Agent: *\nCrawl-Delay: -",
        &[WarningReasonKind::ParseCrawlDelayError],
    );
    validate_warnings(
        "User-Agent: *\nCrawl-Delay: 5h9",
        &[WarningReasonKind::ParseCrawlDelayError],
    );
    validate_warnings("User-Agent: *\nCrawl-Delay: 5", &[]);
}

#[test]
fn test_warning_request_rate_format() {
    validate_warnings("User-Agent: *\nRequest-rate: 1/5", &[]);
    validate_warnings(
        "User-Agent: *\nRequest-rate: 1//5",
        &[WarningReasonKind::WrongRequestRateFormat],
    );
    validate_warnings(
        "User-Agent: *\nRequest-rate: 1",
        &[WarningReasonKind::WrongRequestRateFormat],
    );
}

#[test]
fn test_warning_request_rate() {
    validate_warnings(
        "User-Agent: *\nRequest-rate: a/b",
        &[WarningReasonKind::ParseRequestRate],
    );
    validate_warnings(
        "User-Agent: *\nRequest-rate: a/5",
        &[WarningReasonKind::ParseRequestRate],
    );
    validate_warnings(
        "User-Agent: *\nRequest-rate: 5/b",
        &[WarningReasonKind::ParseRequestRate],
    );
    validate_warnings(
        "User-Agent: *\nRequest-rate: 1.0/5.0",
        &[WarningReasonKind::ParseRequestRate],
    );
}

#[test]
fn test_warning_parsing_url() {
    validate_warnings(
        "User-Agent: *\nSitemap: https://python.org/sitemap.xml",
        &[],
    );
    validate_warnings(
        "User-Agent: *\nSitemap: http$$$://python.org/sitemap.xml",
        &[WarningReasonKind::ParseUrl],
    );
}

#[test]
fn test_wrong_clean_param() {
    validate_warnings("User-Agent: *\nClean-param: ref ", &[]);
    validate_warnings(
        "User-Agent: *\nClean-param: ",
        &[WarningReasonKind::WrongCleanParamFormat],
    );
    validate_warnings("User-Agent: *\nClean-param: &", &[]);
    validate_warnings(
        "User-Agent: *\nClean-param: ?",
        &[WarningReasonKind::IgnoredCleanParams],
    );
    validate_warnings(
        "User-Agent: *\nClean-param: abc$",
        &[WarningReasonKind::IgnoredCleanParams],
    );
}

#[test]
fn test_warning_wrong_path_format() {
    validate_warnings(
        "User-Agent: *\nAllow: \\",
        &[WarningReasonKind::WrongPathFormat],
    );
    validate_warnings(
        "User-Agent: *\nDisallow: \\",
        &[WarningReasonKind::WrongPathFormat],
    );
}

/// Maps a parser warning reason to its payload-free kind.
///
/// The match has no catch-all arm on purpose: a variant added to
/// `WarningReason` should surface here as a compile error.
impl From<&WarningReason> for WarningReasonKind {
    fn from(reason: &WarningReason) -> Self {
        match reason {
            WarningReason::InvalidDirectiveFormat => WarningReasonKind::InvalidDirectiveFormat,
            WarningReason::DirectiveKeyIsEmpty => WarningReasonKind::DirectiveKeyIsEmpty,
            WarningReason::UnsupportedDirectiveKey { .. } => {
                WarningReasonKind::UnsupportedDirectiveKey
            }
            WarningReason::UserAgentCannotBeEmpty => WarningReasonKind::UserAgentCannotBeEmpty,
            WarningReason::DirectiveWithoutUserAgent => {
                WarningReasonKind::DirectiveWithoutUserAgent
            }
            WarningReason::ParseCrawlDelayError { .. } => WarningReasonKind::ParseCrawlDelayError,
            WarningReason::WrongRequestRateFormat => WarningReasonKind::WrongRequestRateFormat,
            WarningReason::ParseRequestRate { .. } => WarningReasonKind::ParseRequestRate,
            WarningReason::ParseUrl { .. } => WarningReasonKind::ParseUrl,
            WarningReason::WrongCleanParamFormat => WarningReasonKind::WrongCleanParamFormat,
            WarningReason::IgnoredCleanParams { .. } => WarningReasonKind::IgnoredCleanParams,
            WarningReason::WrongPathFormat => WarningReasonKind::WrongPathFormat,
        }
    }
}