mirror of
https://github.com/danbulant/robotparser-rs
synced 2026-05-19 04:18:42 +00:00
* Initial github-actions mostly taken from starship project ref #21 * rustfmt config remove unknown configuration options * Run rustfmt * clippy: use any instead of find..is_some * clippy: Remove the `clone` call: `self.crawl_delay` * Clippy fixes * Rustfmt fixes * clippy: fix don't need to add `&` to all patterns * clippy: fix needless `fn main` in doctest * clippy: fix if-then-else expression returns a bool literal * clippy: fix very complex type BoxFuture response * clippy: fix variable `line_no` is used as a loop counter * clippy: dereference the expression on tests * clippy: fix assert(true) will be optimized out by the compiler * github: name workflow
153 lines
6 KiB
Rust
153 lines
6 KiB
Rust
use robotparser::parser::{parse_robots_txt, WarningReason};
|
|
use std::convert::From;
|
|
use url::{Host, Origin};
|
|
|
|
/// Test-side mirror of the parser's `WarningReason` that uses unit variants
/// only, so expected warnings can be written and compared without having to
/// construct whatever data the parser's variants carry.
///
/// Each variant corresponds to the `WarningReason` variant of the same name
/// (see the `From<&WarningReason>` conversion in this file).
///
/// The enum is fieldless, so it is `Copy`; values can be passed around and
/// compared without explicit clones.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum WarningReasonKind {
    InvalidDirectiveFormat,
    DirectiveKeyIsEmpty,
    UnsupportedDirectiveKey,
    UserAgentCannotBeEmpty,
    DirectiveWithoutUserAgent,
    ParseCrawlDelayError,
    WrongRequestRateFormat,
    ParseRequestRate,
    ParseUrl,
    WrongCleanParamFormat,
    IgnoredCleanParams,
    WrongPathFormat,
}
|
|
|
|
fn validate_warnings(input: &str, expected_warnings: &[WarningReasonKind]) {
|
|
let host = Host::Domain("python.org".into());
|
|
let origin = Origin::Tuple("http".into(), host, 80);
|
|
let warnings = parse_robots_txt(origin, &input).get_warnings().to_vec();
|
|
assert_eq!(warnings.len(), expected_warnings.len());
|
|
for (warning, expected_warning) in warnings.iter().zip(expected_warnings.iter()) {
|
|
let warning: WarningReasonKind = warning.get_reason().into();
|
|
assert_eq!(expected_warning.clone(), warning);
|
|
}
|
|
}
|
|
|
|
/// A lone backtick, bare or padded with spaces and tabs, must be reported as
/// a single `InvalidDirectiveFormat` warning.
#[test]
fn test_warning_invalid_directive_format() {
    let expected = [WarningReasonKind::InvalidDirectiveFormat];
    let malformed_lines = ["`", " \t ` \t "];
    for &line in malformed_lines.iter() {
        validate_warnings(line, &expected);
    }
}
|
|
|
|
/// A line consisting only of the `:` separator must be reported as a single
/// `DirectiveKeyIsEmpty` warning.
#[test]
fn test_warning_directive_key_is_empty() {
    let expected = [WarningReasonKind::DirectiveKeyIsEmpty];
    validate_warnings(":", &expected);
}
|
|
|
|
/// The directive key `X-Directive`, with or without surrounding whitespace,
/// must be reported as a single `UnsupportedDirectiveKey` warning.
// NOTE(review): the function name says "supported" but the test covers the
// *unsupported* key warning; the name is kept as-is.
#[test]
fn test_warning_supported_directive_key() {
    let expected = [WarningReasonKind::UnsupportedDirectiveKey];
    let unknown_key_lines = ["X-Directive:", "\t X-Directive\t :\t "];
    for &line in unknown_key_lines.iter() {
        validate_warnings(line, &expected);
    }
}
|
|
|
|
/// A `User-Agent` directive with an empty value must be reported as
/// `UserAgentCannotBeEmpty`; the same directive with the value `*` must
/// produce no warnings.
#[test]
fn test_warning_user_agent_cannot_be_empty() {
    let cases: &[(&str, &[WarningReasonKind])] = &[
        ("User-Agent:", &[WarningReasonKind::UserAgentCannotBeEmpty]),
        (
            "\t User-Agent\t :\t ",
            &[WarningReasonKind::UserAgentCannotBeEmpty],
        ),
        ("\t User-Agent\t :\t *", &[]),
    ];
    for &(robots_txt, expected) in cases {
        validate_warnings(robots_txt, expected);
    }
}
|
|
|
|
/// A `Crawl-Delay` line that is not preceded by a `User-Agent` line must be
/// reported as `DirectiveWithoutUserAgent`; with a preceding `User-Agent: *`
/// line no warning is expected.
#[test]
fn test_warning_directive_without_user_agent() {
    let orphan_directive = "Crawl-Delay: 5s";
    let directive_in_group = "User-Agent: *\nCrawl-Delay: 5";

    validate_warnings(
        orphan_directive,
        &[WarningReasonKind::DirectiveWithoutUserAgent],
    );
    validate_warnings(directive_in_group, &[]);
}
|
|
|
|
/// `Crawl-Delay` values that are empty, `-`, or `5h9` must each be reported
/// as `ParseCrawlDelayError`; the value `5` must produce no warnings.
#[test]
fn test_warning_parse_crawl_delay_error() {
    let rejected = [WarningReasonKind::ParseCrawlDelayError];
    let bad_delays = [
        "User-Agent: *\nCrawl-Delay: ",
        "User-Agent: *\nCrawl-Delay: -",
        "User-Agent: *\nCrawl-Delay: 5h9",
    ];
    for &robots_txt in bad_delays.iter() {
        validate_warnings(robots_txt, &rejected);
    }

    // Control case: a plain integer delay is accepted silently.
    validate_warnings("User-Agent: *\nCrawl-Delay: 5", &[]);
}
|
|
|
|
/// `Request-rate: 1/5` must produce no warnings, while `1//5` and `1` must
/// each be reported as `WrongRequestRateFormat`.
#[test]
fn test_warning_request_rate_format() {
    let cases: &[(&str, &[WarningReasonKind])] = &[
        ("User-Agent: *\nRequest-rate: 1/5", &[]),
        (
            "User-Agent: *\nRequest-rate: 1//5",
            &[WarningReasonKind::WrongRequestRateFormat],
        ),
        (
            "User-Agent: *\nRequest-rate: 1",
            &[WarningReasonKind::WrongRequestRateFormat],
        ),
    ];
    for &(robots_txt, expected) in cases {
        validate_warnings(robots_txt, expected);
    }
}
|
|
|
|
/// `Request-rate` values `a/b`, `a/5`, `5/b` and `1.0/5.0` must each be
/// reported as a single `ParseRequestRate` warning.
#[test]
fn test_warning_request_rate() {
    let expected = [WarningReasonKind::ParseRequestRate];
    let unparsable_rates = [
        "User-Agent: *\nRequest-rate: a/b",
        "User-Agent: *\nRequest-rate: a/5",
        "User-Agent: *\nRequest-rate: 5/b",
        "User-Agent: *\nRequest-rate: 1.0/5.0",
    ];
    for &robots_txt in unparsable_rates.iter() {
        validate_warnings(robots_txt, &expected);
    }
}
|
|
|
|
/// A `Sitemap` directive with the URL `https://python.org/sitemap.xml` must
/// produce no warnings; one whose scheme is written `http$$$` must be
/// reported as `ParseUrl`.
#[test]
fn test_warning_parsing_url() {
    let cases: &[(&str, &[WarningReasonKind])] = &[
        ("User-Agent: *\nSitemap: https://python.org/sitemap.xml", &[]),
        (
            "User-Agent: *\nSitemap: http$$$://python.org/sitemap.xml",
            &[WarningReasonKind::ParseUrl],
        ),
    ];
    for &(robots_txt, expected) in cases {
        validate_warnings(robots_txt, expected);
    }
}
|
|
|
|
/// Exercises the `Clean-param` directive: the values `ref ` and `&` produce
/// no warnings, an empty value is reported as `WrongCleanParamFormat`, and
/// the values `?` and `abc$` are reported as `IgnoredCleanParams`.
#[test]
fn test_wrong_clean_param() {
    let cases: &[(&str, &[WarningReasonKind])] = &[
        ("User-Agent: *\nClean-param: ref ", &[]),
        (
            "User-Agent: *\nClean-param: ",
            &[WarningReasonKind::WrongCleanParamFormat],
        ),
        ("User-Agent: *\nClean-param: &", &[]),
        (
            "User-Agent: *\nClean-param: ?",
            &[WarningReasonKind::IgnoredCleanParams],
        ),
        (
            "User-Agent: *\nClean-param: abc$",
            &[WarningReasonKind::IgnoredCleanParams],
        ),
    ];
    for &(robots_txt, expected) in cases {
        validate_warnings(robots_txt, expected);
    }
}
|
|
|
|
/// A single backslash used as the value of `Allow` or `Disallow` must be
/// reported as a single `WrongPathFormat` warning.
#[test]
fn test_warning_wrong_path_format() {
    let expected = [WarningReasonKind::WrongPathFormat];
    let bad_path_rules = ["User-Agent: *\nAllow: \\", "User-Agent: *\nDisallow: \\"];
    for &robots_txt in bad_path_rules.iter() {
        validate_warnings(robots_txt, &expected);
    }
}
|
|
|
|
impl From<&WarningReason> for WarningReasonKind {
|
|
fn from(reason: &WarningReason) -> Self {
|
|
match *reason {
|
|
WarningReason::InvalidDirectiveFormat => WarningReasonKind::InvalidDirectiveFormat,
|
|
WarningReason::DirectiveKeyIsEmpty => WarningReasonKind::DirectiveKeyIsEmpty,
|
|
WarningReason::UnsupportedDirectiveKey { .. } => WarningReasonKind::UnsupportedDirectiveKey,
|
|
WarningReason::UserAgentCannotBeEmpty => WarningReasonKind::UserAgentCannotBeEmpty,
|
|
WarningReason::DirectiveWithoutUserAgent => WarningReasonKind::DirectiveWithoutUserAgent,
|
|
WarningReason::ParseCrawlDelayError { .. } => WarningReasonKind::ParseCrawlDelayError,
|
|
WarningReason::WrongRequestRateFormat => WarningReasonKind::WrongRequestRateFormat,
|
|
WarningReason::ParseRequestRate { .. } => WarningReasonKind::ParseRequestRate,
|
|
WarningReason::ParseUrl { .. } => WarningReasonKind::ParseUrl,
|
|
WarningReason::WrongCleanParamFormat => WarningReasonKind::WrongCleanParamFormat,
|
|
WarningReason::IgnoredCleanParams { .. } => WarningReasonKind::IgnoredCleanParams,
|
|
WarningReason::WrongPathFormat => WarningReasonKind::WrongPathFormat,
|
|
}
|
|
}
|
|
}
|