mirror of
https://github.com/danbulant/robotparser-rs
synced 2026-07-05 11:10:39 +00:00
Use Cow instead of owned String
This commit is contained in:
parent
4862c704ad
commit
caca158589
1 changed file with 26 additions and 22 deletions
48
src/lib.rs
48
src/lib.rs
|
|
@ -39,6 +39,7 @@ extern crate hyper;
|
||||||
|
|
||||||
use std::io::Read;
|
use std::io::Read;
|
||||||
use std::cell::{Cell, RefCell};
|
use std::cell::{Cell, RefCell};
|
||||||
|
use std::borrow::Cow;
|
||||||
use url::Url;
|
use url::Url;
|
||||||
use hyper::{Client};
|
use hyper::{Client};
|
||||||
use hyper::status::StatusCode;
|
use hyper::status::StatusCode;
|
||||||
|
|
@ -47,25 +48,25 @@ use std::time::Duration;
|
||||||
/// A rule line is a single "Allow:" (allowance == true) or "Disallow:"
|
/// A rule line is a single "Allow:" (allowance == true) or "Disallow:"
|
||||||
/// (allowance == false) followed by a path.
|
/// (allowance == false) followed by a path.
|
||||||
#[derive(Debug, Eq, PartialEq, Clone)]
|
#[derive(Debug, Eq, PartialEq, Clone)]
|
||||||
struct RuleLine {
|
struct RuleLine<'a> {
|
||||||
path: String,
|
path: Cow<'a, str>,
|
||||||
allowance: bool,
|
allowance: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// An entry has one or more user-agents and zero or more rulelines
|
/// An entry has one or more user-agents and zero or more rulelines
|
||||||
#[derive(Debug, Eq, PartialEq, Clone)]
|
#[derive(Debug, Eq, PartialEq, Clone)]
|
||||||
struct Entry {
|
struct Entry<'a> {
|
||||||
useragents: RefCell<Vec<String>>,
|
useragents: RefCell<Vec<String>>,
|
||||||
rulelines: RefCell<Vec<RuleLine>>,
|
rulelines: RefCell<Vec<RuleLine<'a>>>,
|
||||||
crawl_delay: Option<Duration>,
|
crawl_delay: Option<Duration>,
|
||||||
sitemaps: Vec<Url>,
|
sitemaps: Vec<Url>,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// robots.txt file parser
|
/// robots.txt file parser
|
||||||
#[derive(Debug, Eq, PartialEq, Clone)]
|
#[derive(Debug, Eq, PartialEq, Clone)]
|
||||||
pub struct RobotFileParser {
|
pub struct RobotFileParser<'a> {
|
||||||
entries: RefCell<Vec<Entry>>,
|
entries: RefCell<Vec<Entry<'a>>>,
|
||||||
default_entry: RefCell<Entry>,
|
default_entry: RefCell<Entry<'a>>,
|
||||||
disallow_all: Cell<bool>,
|
disallow_all: Cell<bool>,
|
||||||
allow_all: Cell<bool>,
|
allow_all: Cell<bool>,
|
||||||
url: Url,
|
url: Url,
|
||||||
|
|
@ -75,27 +76,30 @@ pub struct RobotFileParser {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
impl RuleLine {
|
impl<'a> RuleLine<'a> {
|
||||||
fn new(path: &str, allowance: bool) -> RuleLine {
|
fn new<S>(path: S, allowance: bool) -> RuleLine<'a>
|
||||||
|
where S: Into<Cow<'a, str>>
|
||||||
|
{
|
||||||
|
let path = path.into();
|
||||||
let mut allow = allowance;
|
let mut allow = allowance;
|
||||||
if path == "" && !allowance {
|
if path == "" && !allowance {
|
||||||
// an empty value means allow all
|
// an empty value means allow all
|
||||||
allow = true;
|
allow = true;
|
||||||
}
|
}
|
||||||
RuleLine {
|
RuleLine {
|
||||||
path: path.to_owned(),
|
path: path,
|
||||||
allowance: allow,
|
allowance: allow,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn applies_to(&self, filename: &str) -> bool {
|
fn applies_to(&self, filename: &str) -> bool {
|
||||||
&self.path == "*" || filename.starts_with(&self.path)
|
self.path == "*" || filename.starts_with(&self.path[..])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
impl Entry {
|
impl<'a> Entry<'a> {
|
||||||
fn new() -> Entry {
|
fn new() -> Entry<'a> {
|
||||||
Entry {
|
Entry {
|
||||||
useragents: RefCell::new(vec![]),
|
useragents: RefCell::new(vec![]),
|
||||||
rulelines: RefCell::new(vec![]),
|
rulelines: RefCell::new(vec![]),
|
||||||
|
|
@ -138,7 +142,7 @@ impl Entry {
|
||||||
useragents.push(useragent.to_lowercase().to_owned());
|
useragents.push(useragent.to_lowercase().to_owned());
|
||||||
}
|
}
|
||||||
|
|
||||||
fn push_ruleline(&self, ruleline: RuleLine) {
|
fn push_ruleline(&self, ruleline: RuleLine<'a>) {
|
||||||
let mut rulelines = self.rulelines.borrow_mut();
|
let mut rulelines = self.rulelines.borrow_mut();
|
||||||
rulelines.push(ruleline);
|
rulelines.push(ruleline);
|
||||||
}
|
}
|
||||||
|
|
@ -154,7 +158,7 @@ impl Entry {
|
||||||
useragents.is_empty() && rulelines.is_empty()
|
useragents.is_empty() && rulelines.is_empty()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn set_crawl_delay(&mut self,delay: Duration) {
|
fn set_crawl_delay(&mut self, delay: Duration) {
|
||||||
self.crawl_delay = Some(delay);
|
self.crawl_delay = Some(delay);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -174,15 +178,15 @@ impl Entry {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
impl Default for Entry {
|
impl<'a> Default for Entry<'a> {
|
||||||
fn default() -> Entry {
|
fn default() -> Entry<'a> {
|
||||||
Entry::new()
|
Entry::new()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
impl RobotFileParser {
|
impl<'a> RobotFileParser<'a> {
|
||||||
pub fn new<T: AsRef<str>>(url: T) -> RobotFileParser {
|
pub fn new<T: AsRef<str>>(url: T) -> RobotFileParser<'a> {
|
||||||
let parsed_url = Url::parse(url.as_ref()).unwrap();
|
let parsed_url = Url::parse(url.as_ref()).unwrap();
|
||||||
RobotFileParser {
|
RobotFileParser {
|
||||||
entries: RefCell::new(vec![]),
|
entries: RefCell::new(vec![]),
|
||||||
|
|
@ -247,7 +251,7 @@ impl RobotFileParser {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn _add_entry(&self, entry: Entry) {
|
fn _add_entry(&self, entry: Entry<'a>) {
|
||||||
if entry.has_useragent("*") {
|
if entry.has_useragent("*") {
|
||||||
// the default entry is considered last
|
// the default entry is considered last
|
||||||
let mut default_entry = self.default_entry.borrow_mut();
|
let mut default_entry = self.default_entry.borrow_mut();
|
||||||
|
|
@ -317,13 +321,13 @@ impl RobotFileParser {
|
||||||
},
|
},
|
||||||
ref x if x == "disallow" => {
|
ref x if x == "disallow" => {
|
||||||
if state != 0 {
|
if state != 0 {
|
||||||
entry.push_ruleline(RuleLine::new(&part1, false));
|
entry.push_ruleline(RuleLine::new(part1, false));
|
||||||
state = 2;
|
state = 2;
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
ref x if x == "allow" => {
|
ref x if x == "allow" => {
|
||||||
if state != 0 {
|
if state != 0 {
|
||||||
entry.push_ruleline(RuleLine::new(&part1, true));
|
entry.push_ruleline(RuleLine::new(part1, true));
|
||||||
state = 2;
|
state = 2;
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue