mirror of
https://github.com/danbulant/robotparser-rs
synced 2026-05-19 04:18:42 +00:00
271 lines
10 KiB
Rust
271 lines
10 KiB
Rust
use crate::model::{CleanParams, PathPattern, RequestRate, RobotsTxt, Rule};
|
|
use crate::parser::line::Line;
|
|
use crate::parser::parse_result::ParseResult;
|
|
use crate::parser::warning::ParseWarning;
|
|
use std::time::Duration;
|
|
use url::{Origin, Url};
|
|
mod directive;
|
|
use self::directive::Directive;
|
|
mod group_builder;
|
|
pub use self::group_builder::GroupBuilder;
|
|
|
|
const COMMENT_BEGIN_CHAR: char = '#';
|
|
const KV_SEPARATOR: &str = ":";
|
|
|
|
/// Parses the text of the robots.txt file located in the specified origin.
|
|
pub fn parse(origin: Origin, input: &str) -> ParseResult<RobotsTxt> {
|
|
let parser = Parser::new(origin);
|
|
parser.parse(input)
|
|
}
|
|
|
|
struct Parser {
|
|
result: RobotsTxt,
|
|
group_builder: GroupBuilder,
|
|
warnings: Vec<ParseWarning>,
|
|
}
|
|
|
|
impl Parser {
|
|
pub fn new(origin: Origin) -> Parser {
|
|
Parser {
|
|
result: RobotsTxt::new(origin),
|
|
group_builder: GroupBuilder::new(),
|
|
warnings: Vec::new(),
|
|
}
|
|
}
|
|
|
|
pub fn parse(mut self, input: &str) -> ParseResult<RobotsTxt> {
|
|
let input = ignore_bom(input);
|
|
for (line_no, line) in input.lines().enumerate() {
|
|
let line = Line::new(line, line_no + 1);
|
|
match Self::parse_line(&line) {
|
|
Ok(Some(line_value)) => {
|
|
self.process_line_value(&line, &line_value);
|
|
}
|
|
Err(warning) => {
|
|
self.warnings.push(warning);
|
|
}
|
|
_ => {}
|
|
}
|
|
}
|
|
self.group_builder.fill_entries(&mut self.result);
|
|
ParseResult::new_with_warnings(self.result, self.warnings)
|
|
}
|
|
|
|
fn parse_line<'a>(line: &'a Line) -> Result<Option<Directive<'a>>, ParseWarning> {
|
|
let mut kv_part = line.get_line_text();
|
|
if let Some(comment_separator_position) = line.get_line_text().find(COMMENT_BEGIN_CHAR) {
|
|
kv_part = &kv_part[0..comment_separator_position];
|
|
}
|
|
if kv_part.is_empty() {
|
|
return Ok(None);
|
|
}
|
|
let separator_index = kv_part
|
|
.find(KV_SEPARATOR)
|
|
.ok_or_else(|| ParseWarning::invalid_directive_format(line))?;
|
|
if separator_index >= kv_part.len() {
|
|
return Err(ParseWarning::invalid_directive_format(line));
|
|
}
|
|
let key = &kv_part[0..separator_index];
|
|
let key = key.trim();
|
|
if key.is_empty() {
|
|
return Err(ParseWarning::directive_key_is_empty(line));
|
|
}
|
|
let value = &kv_part[separator_index + 1..];
|
|
let value = value.trim();
|
|
let result = Directive::new(key, value);
|
|
Ok(Some(result))
|
|
}
|
|
|
|
fn process_line_value(&mut self, line: &Line, directive: &Directive) {
|
|
let key = directive.get_key_lowercase();
|
|
match key.as_str() {
|
|
// Group specific directives
|
|
"user-agent" => {
|
|
self.process_directive_user_agent(line, directive);
|
|
}
|
|
"allow" => {
|
|
self.process_directive_allow(line, directive);
|
|
}
|
|
"disallow" => {
|
|
self.process_directive_disallow(line, directive);
|
|
}
|
|
"crawl-delay" => {
|
|
self.process_directive_crawl_delay(line, directive);
|
|
}
|
|
"request-rate" => {
|
|
self.process_directive_request_rate(line, directive);
|
|
}
|
|
// Non-group directives
|
|
"sitemap" => {
|
|
self.process_directive_sitemap(line, directive);
|
|
}
|
|
"clean-param" => {
|
|
self.process_directive_clean_param(line, directive);
|
|
}
|
|
_ => {
|
|
self.warnings.push(ParseWarning::unsupported_directive_key(line, key));
|
|
}
|
|
}
|
|
}
|
|
|
|
fn process_directive_user_agent(&mut self, line: &Line, directive: &Directive) {
|
|
let user_agent = directive.get_value();
|
|
if user_agent.is_empty() {
|
|
self.warnings.push(ParseWarning::user_agent_cannot_be_empty(line));
|
|
return;
|
|
}
|
|
self.group_builder.handle_user_agent(user_agent);
|
|
}
|
|
|
|
fn process_directive_allow(&mut self, line: &Line, directive: &Directive) {
|
|
if let Some(group) = self.group_builder.get_mut_active_group() {
|
|
if directive.get_value() == "" {
|
|
// Nothing to do. Ignoring.
|
|
} else if directive.get_value().starts_with('*') || directive.get_value().starts_with('/') {
|
|
group.push_rule(Rule::new(directive.get_value(), true));
|
|
} else {
|
|
self.warnings.push(ParseWarning::wrong_path_format(line));
|
|
}
|
|
} else {
|
|
self.warnings.push(ParseWarning::directive_without_user_agent(line));
|
|
}
|
|
}
|
|
|
|
fn process_directive_disallow(&mut self, line: &Line, directive: &Directive) {
|
|
if let Some(group) = self.group_builder.get_mut_active_group() {
|
|
if directive.get_value() == "" {
|
|
// Allow all.
|
|
group.push_rule(Rule::new(PathPattern::all(), true));
|
|
} else if directive.get_value().starts_with('*') || directive.get_value().starts_with('/') {
|
|
group.push_rule(Rule::new(directive.get_value(), false));
|
|
} else {
|
|
self.warnings.push(ParseWarning::wrong_path_format(line));
|
|
}
|
|
} else {
|
|
self.warnings.push(ParseWarning::directive_without_user_agent(line));
|
|
}
|
|
}
|
|
|
|
fn process_directive_crawl_delay(&mut self, line: &Line, directive: &Directive) {
|
|
if let Some(group) = self.group_builder.get_mut_active_group() {
|
|
match directive.get_value().parse::<f64>() {
|
|
Ok(delay) => {
|
|
let delay_seconds = delay.trunc();
|
|
let delay_nanoseconds = delay.fract() * 10f64.powi(9);
|
|
let delay = Duration::new(delay_seconds as u64, delay_nanoseconds as u32);
|
|
group.set_crawl_delay(delay);
|
|
}
|
|
Err(error) => {
|
|
self.warnings.push(ParseWarning::parse_crawl_delay_error(line, error));
|
|
}
|
|
}
|
|
} else {
|
|
self.warnings.push(ParseWarning::directive_without_user_agent(line));
|
|
}
|
|
}
|
|
|
|
fn process_directive_request_rate(&mut self, line: &Line, directive: &Directive) {
|
|
if let Some(group) = self.group_builder.get_mut_active_group() {
|
|
let numbers: Vec<&str> = directive.get_value().split('/').collect();
|
|
if numbers.len() != 2 {
|
|
self.warnings.push(ParseWarning::wrong_request_rate_format(line));
|
|
return;
|
|
}
|
|
let requests = match numbers[0].parse::<usize>() {
|
|
Ok(requests) => requests,
|
|
Err(error) => {
|
|
self.warnings.push(ParseWarning::parse_request_rate(line, error));
|
|
return;
|
|
}
|
|
};
|
|
let seconds = match numbers[1].parse::<usize>() {
|
|
Ok(seconds) => seconds,
|
|
Err(error) => {
|
|
self.warnings.push(ParseWarning::parse_request_rate(line, error));
|
|
return;
|
|
}
|
|
};
|
|
group.set_req_rate(RequestRate { requests, seconds });
|
|
} else {
|
|
self.warnings.push(ParseWarning::directive_without_user_agent(line));
|
|
}
|
|
}
|
|
|
|
fn process_directive_sitemap(&mut self, line: &Line, directive: &Directive) {
|
|
match Url::parse(directive.get_value()) {
|
|
Ok(sitemap_url) => {
|
|
self.result.add_sitemap(sitemap_url);
|
|
}
|
|
Err(error) => {
|
|
self.warnings.push(ParseWarning::parse_url(line, error));
|
|
}
|
|
}
|
|
}
|
|
|
|
fn process_directive_clean_param(&mut self, line: &Line, directive: &Directive) {
|
|
let parts: Vec<&str> = directive.get_value().split_whitespace().collect();
|
|
if parts.len() >= 3 || parts.is_empty() {
|
|
self.warnings.push(ParseWarning::wrong_clean_param_format(line));
|
|
return;
|
|
}
|
|
if parts[0].is_empty() {
|
|
self.warnings.push(ParseWarning::wrong_clean_param_format(line));
|
|
return;
|
|
}
|
|
let clean_params_path_pattern;
|
|
let clean_params;
|
|
if let Some(second_param) = parts.get(1) {
|
|
if second_param.is_empty() {
|
|
self.warnings.push(ParseWarning::wrong_clean_param_format(line));
|
|
return;
|
|
}
|
|
clean_params_path_pattern = PathPattern::new(parts[0]);
|
|
clean_params = *second_param;
|
|
} else {
|
|
clean_params_path_pattern = PathPattern::all();
|
|
clean_params = parts[0];
|
|
}
|
|
let (valid_clean_params, invalid_clean_params) = Self::parse_clean_params(clean_params);
|
|
if !invalid_clean_params.is_empty() {
|
|
self.warnings
|
|
.push(ParseWarning::ignored_clean_params(line, invalid_clean_params));
|
|
}
|
|
self.result
|
|
.add_clean_params(CleanParams::new(clean_params_path_pattern, valid_clean_params));
|
|
}
|
|
|
|
fn parse_clean_params(clean_params: &str) -> (Vec<String>, Vec<String>) {
|
|
let mut valid = Vec::new();
|
|
let mut invalid = Vec::new();
|
|
for clean_param in clean_params.split('&') {
|
|
if !clean_param.is_empty() {
|
|
if Self::is_valid_clean_param(clean_param) {
|
|
valid.push(clean_param.into());
|
|
} else {
|
|
invalid.push(clean_param.into());
|
|
}
|
|
}
|
|
}
|
|
(valid, invalid)
|
|
}
|
|
|
|
fn is_valid_clean_param(clean_param: &str) -> bool {
|
|
for c in clean_param.chars() {
|
|
let is_valid = ('A'..'Z').contains(&c)
|
|
|| ('a'..'z').contains(&c)
|
|
|| ('0'..'9').contains(&c)
|
|
|| c == '.'
|
|
|| c == '-'
|
|
|| c == '_';
|
|
if !is_valid {
|
|
return false;
|
|
}
|
|
}
|
|
true
|
|
}
|
|
}
|
|
|
|
fn ignore_bom(input: &str) -> &str {
|
|
const BOM: &str = "\u{feff}";
|
|
input.trim_start_matches(BOM)
|
|
}
|