use crate::model::{CleanParams, PathPattern, RequestRate, RobotsTxt, Rule}; use crate::parser::line::Line; use crate::parser::parse_result::ParseResult; use crate::parser::warning::ParseWarning; use std::time::Duration; use url::{Origin, Url}; mod directive; use self::directive::Directive; mod group_builder; pub use self::group_builder::GroupBuilder; const COMMENT_BEGIN_CHAR: char = '#'; const KV_SEPARATOR: &str = ":"; /// Parses the text of the robots.txt file located in the specified origin. pub fn parse(origin: Origin, input: &str) -> ParseResult { let parser = Parser::new(origin); parser.parse(input) } struct Parser { result: RobotsTxt, group_builder: GroupBuilder, warnings: Vec, } impl Parser { pub fn new(origin: Origin) -> Parser { Parser { result: RobotsTxt::new(origin), group_builder: GroupBuilder::new(), warnings: Vec::new(), } } pub fn parse(mut self, input: &str) -> ParseResult { let input = ignore_bom(input); for (line_no, line) in input.lines().enumerate() { let line = Line::new(line, line_no + 1); match Self::parse_line(&line) { Ok(Some(line_value)) => { self.process_line_value(&line, &line_value); } Err(warning) => { self.warnings.push(warning); } _ => {} } } self.group_builder.fill_entries(&mut self.result); ParseResult::new_with_warnings(self.result, self.warnings) } fn parse_line<'a>(line: &'a Line) -> Result>, ParseWarning> { let mut kv_part = line.get_line_text(); if let Some(comment_separator_position) = line.get_line_text().find(COMMENT_BEGIN_CHAR) { kv_part = &kv_part[0..comment_separator_position]; } if kv_part.is_empty() { return Ok(None); } let separator_index = kv_part .find(KV_SEPARATOR) .ok_or_else(|| ParseWarning::invalid_directive_format(line))?; if separator_index >= kv_part.len() { return Err(ParseWarning::invalid_directive_format(line)); } let key = &kv_part[0..separator_index]; let key = key.trim(); if key.is_empty() { return Err(ParseWarning::directive_key_is_empty(line)); } let value = &kv_part[separator_index + 1..]; let value = value.trim(); let result = Directive::new(key, value); Ok(Some(result)) } fn process_line_value(&mut self, line: &Line, directive: &Directive) { let key = directive.get_key_lowercase(); match key.as_str() { // Group specific directives "user-agent" => { self.process_directive_user_agent(line, directive); } "allow" => { self.process_directive_allow(line, directive); } "disallow" => { self.process_directive_disallow(line, directive); } "crawl-delay" => { self.process_directive_crawl_delay(line, directive); } "request-rate" => { self.process_directive_request_rate(line, directive); } // Non-group directives "sitemap" => { self.process_directive_sitemap(line, directive); } "clean-param" => { self.process_directive_clean_param(line, directive); } _ => { self.warnings.push(ParseWarning::unsupported_directive_key(line, key)); } } } fn process_directive_user_agent(&mut self, line: &Line, directive: &Directive) { let user_agent = directive.get_value(); if user_agent.is_empty() { self.warnings.push(ParseWarning::user_agent_cannot_be_empty(line)); return; } self.group_builder.handle_user_agent(user_agent); } fn process_directive_allow(&mut self, line: &Line, directive: &Directive) { if let Some(group) = self.group_builder.get_mut_active_group() { if directive.get_value() == "" { // Nothing to do. Ignoring. } else if directive.get_value().starts_with('*') || directive.get_value().starts_with('/') { group.push_rule(Rule::new(directive.get_value(), true)); } else { self.warnings.push(ParseWarning::wrong_path_format(line)); } } else { self.warnings.push(ParseWarning::directive_without_user_agent(line)); } } fn process_directive_disallow(&mut self, line: &Line, directive: &Directive) { if let Some(group) = self.group_builder.get_mut_active_group() { if directive.get_value() == "" { // Allow all. group.push_rule(Rule::new(PathPattern::all(), true)); } else if directive.get_value().starts_with('*') || directive.get_value().starts_with('/') { group.push_rule(Rule::new(directive.get_value(), false)); } else { self.warnings.push(ParseWarning::wrong_path_format(line)); } } else { self.warnings.push(ParseWarning::directive_without_user_agent(line)); } } fn process_directive_crawl_delay(&mut self, line: &Line, directive: &Directive) { if let Some(group) = self.group_builder.get_mut_active_group() { match directive.get_value().parse::() { Ok(delay) => { let delay_seconds = delay.trunc(); let delay_nanoseconds = delay.fract() * 10f64.powi(9); let delay = Duration::new(delay_seconds as u64, delay_nanoseconds as u32); group.set_crawl_delay(delay); } Err(error) => { self.warnings.push(ParseWarning::parse_crawl_delay_error(line, error)); } } } else { self.warnings.push(ParseWarning::directive_without_user_agent(line)); } } fn process_directive_request_rate(&mut self, line: &Line, directive: &Directive) { if let Some(group) = self.group_builder.get_mut_active_group() { let numbers: Vec<&str> = directive.get_value().split('/').collect(); if numbers.len() != 2 { self.warnings.push(ParseWarning::wrong_request_rate_format(line)); return; } let requests = match numbers[0].parse::() { Ok(requests) => requests, Err(error) => { self.warnings.push(ParseWarning::parse_request_rate(line, error)); return; } }; let seconds = match numbers[1].parse::() { Ok(seconds) => seconds, Err(error) => { self.warnings.push(ParseWarning::parse_request_rate(line, error)); return; } }; group.set_req_rate(RequestRate { requests, seconds }); } else { self.warnings.push(ParseWarning::directive_without_user_agent(line)); } } fn process_directive_sitemap(&mut self, line: &Line, directive: &Directive) { match Url::parse(directive.get_value()) { Ok(sitemap_url) => { self.result.add_sitemap(sitemap_url); } Err(error) => { self.warnings.push(ParseWarning::parse_url(line, error)); } } } fn process_directive_clean_param(&mut self, line: &Line, directive: &Directive) { let parts: Vec<&str> = directive.get_value().split_whitespace().collect(); if parts.len() >= 3 || parts.is_empty() { self.warnings.push(ParseWarning::wrong_clean_param_format(line)); return; } if parts[0].is_empty() { self.warnings.push(ParseWarning::wrong_clean_param_format(line)); return; } let clean_params_path_pattern; let clean_params; if let Some(second_param) = parts.get(1) { if second_param.is_empty() { self.warnings.push(ParseWarning::wrong_clean_param_format(line)); return; } clean_params_path_pattern = PathPattern::new(parts[0]); clean_params = *second_param; } else { clean_params_path_pattern = PathPattern::all(); clean_params = parts[0]; } let (valid_clean_params, invalid_clean_params) = Self::parse_clean_params(clean_params); if !invalid_clean_params.is_empty() { self.warnings .push(ParseWarning::ignored_clean_params(line, invalid_clean_params)); } self.result .add_clean_params(CleanParams::new(clean_params_path_pattern, valid_clean_params)); } fn parse_clean_params(clean_params: &str) -> (Vec, Vec) { let mut valid = Vec::new(); let mut invalid = Vec::new(); for clean_param in clean_params.split('&') { if !clean_param.is_empty() { if Self::is_valid_clean_param(clean_param) { valid.push(clean_param.into()); } else { invalid.push(clean_param.into()); } } } (valid, invalid) } fn is_valid_clean_param(clean_param: &str) -> bool { for c in clean_param.chars() { let is_valid = ('A'..'Z').contains(&c) || ('a'..'z').contains(&c) || ('0'..'9').contains(&c) || c == '.' || c == '-' || c == '_'; if !is_valid { return false; } } true } } fn ignore_bom(input: &str) -> &str { const BOM: &str = "\u{feff}"; input.trim_start_matches(BOM) }