mirror of
https://github.com/danbulant/robotparser-rs
synced 2026-06-24 17:21:45 +00:00
Refactor some API to be more generic over Rust strings
This commit is contained in:
parent
685853e729
commit
ac7d7aa8db
2 changed files with 13 additions and 10 deletions
21
src/lib.rs
21
src/lib.rs
|
|
@ -122,8 +122,8 @@ impl Entry {
|
||||||
|
|
||||||
|
|
||||||
impl RobotFileParser {
|
impl RobotFileParser {
|
||||||
pub fn new(url: &str) -> RobotFileParser {
|
pub fn new<T: AsRef<str>>(url: T) -> RobotFileParser {
|
||||||
let parsed_url = Url::parse(url).unwrap();
|
let parsed_url = Url::parse(url.as_ref()).unwrap();
|
||||||
RobotFileParser {
|
RobotFileParser {
|
||||||
entries: RefCell::new(vec![]),
|
entries: RefCell::new(vec![]),
|
||||||
default_entry: RefCell::new(Entry::new()),
|
default_entry: RefCell::new(Entry::new()),
|
||||||
|
|
@ -153,8 +153,8 @@ impl RobotFileParser {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Sets the URL referring to a robots.txt file.
|
/// Sets the URL referring to a robots.txt file.
|
||||||
pub fn set_url(&mut self, url: &str) {
|
pub fn set_url<T: AsRef<str>>(&mut self, url: T) {
|
||||||
let parsed_url = Url::parse(url).unwrap();
|
let parsed_url = Url::parse(url.as_ref()).unwrap();
|
||||||
self.url = parsed_url.clone();
|
self.url = parsed_url.clone();
|
||||||
self.host = parsed_url.domain().unwrap().to_owned();
|
self.host = parsed_url.domain().unwrap().to_owned();
|
||||||
self.path = parsed_url.path().unwrap().join("/");
|
self.path = parsed_url.path().unwrap().join("/");
|
||||||
|
|
@ -181,7 +181,7 @@ impl RobotFileParser {
|
||||||
let mut buf = String::new();
|
let mut buf = String::new();
|
||||||
res.read_to_string(&mut buf).unwrap();
|
res.read_to_string(&mut buf).unwrap();
|
||||||
let lines: Vec<&str> = buf.split("\n").collect();
|
let lines: Vec<&str> = buf.split("\n").collect();
|
||||||
self.parse(lines);
|
self.parse(&lines);
|
||||||
},
|
},
|
||||||
_ => {},
|
_ => {},
|
||||||
}
|
}
|
||||||
|
|
@ -207,7 +207,7 @@ impl RobotFileParser {
|
||||||
/// We allow that a user-agent: line is not preceded by
|
/// We allow that a user-agent: line is not preceded by
|
||||||
/// one or more blank lines.
|
/// one or more blank lines.
|
||||||
///
|
///
|
||||||
pub fn parse(&self, lines: Vec<&str>) {
|
pub fn parse<T: AsRef<str>>(&self, lines: &[T]) {
|
||||||
use url::percent_encoding::percent_decode;
|
use url::percent_encoding::percent_decode;
|
||||||
|
|
||||||
// states:
|
// states:
|
||||||
|
|
@ -218,8 +218,8 @@ impl RobotFileParser {
|
||||||
let mut entry = Entry::new();
|
let mut entry = Entry::new();
|
||||||
|
|
||||||
self.modified();
|
self.modified();
|
||||||
for line in &lines {
|
for line in lines {
|
||||||
let mut ln = line.clone();
|
let mut ln = line.as_ref().clone();
|
||||||
if ln.is_empty() {
|
if ln.is_empty() {
|
||||||
match state {
|
match state {
|
||||||
1 => {
|
1 => {
|
||||||
|
|
@ -280,9 +280,12 @@ impl RobotFileParser {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Using the parsed robots.txt decide if useragent can fetch url
|
/// Using the parsed robots.txt decide if useragent can fetch url
|
||||||
pub fn can_fetch(&self, useragent: &str, url: &str) -> bool {
|
pub fn can_fetch<T: AsRef<str>>(&self, useragent: T, url: T) -> bool {
|
||||||
use url::percent_encoding::percent_decode;
|
use url::percent_encoding::percent_decode;
|
||||||
|
|
||||||
|
let useragent = useragent.as_ref();
|
||||||
|
let url = url.as_ref();
|
||||||
|
|
||||||
if self.disallow_all.get() {
|
if self.disallow_all.get() {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -8,7 +8,7 @@ const AGENT: &'static str = "test_robotparser";
|
||||||
fn robot_test(doc: &str, good_urls: Vec<&str>, bad_urls: Vec<&str>, agent: &str) {
|
fn robot_test(doc: &str, good_urls: Vec<&str>, bad_urls: Vec<&str>, agent: &str) {
|
||||||
let parser = RobotFileParser::new("http://www.baidu.com/robots.txt");
|
let parser = RobotFileParser::new("http://www.baidu.com/robots.txt");
|
||||||
let lines: Vec<&str> = doc.split("\n").collect();
|
let lines: Vec<&str> = doc.split("\n").collect();
|
||||||
parser.parse(lines);
|
parser.parse(&lines);
|
||||||
for url in &good_urls {
|
for url in &good_urls {
|
||||||
assert!(parser.can_fetch(agent, url));
|
assert!(parser.can_fetch(agent, url));
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue