mirror of
https://github.com/danbulant/robotparser-rs
synced 2026-06-15 12:41:29 +00:00
Initial commit.
This commit is contained in:
parent
2a2ebd6a13
commit
ece1e1f3c4
4 changed files with 387 additions and 0 deletions
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
target
|
||||
Cargo.lock
|
||||
9
Cargo.toml
Normal file
9
Cargo.toml
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
[package]
|
||||
name = "robotparser"
|
||||
version = "0.1.0"
|
||||
authors = ["messense <messense@icloud.com>"]
|
||||
|
||||
[dependencies]
|
||||
hyper = "*"
|
||||
url = "*"
|
||||
time = "*"
|
||||
320
src/lib.rs
Normal file
320
src/lib.rs
Normal file
|
|
@ -0,0 +1,320 @@
|
|||
//!
|
||||
//! robots.txt parser for Rust
|
||||
//!
|
||||
//! The robots.txt Exclusion Protocol is implemented as specified in
|
||||
//! http://www.robotstxt.org/norobots-rfc.txt
|
||||
//!
|
||||
extern crate url;
|
||||
extern crate time;
|
||||
extern crate hyper;
|
||||
|
||||
use std::io::Read;
|
||||
use std::cell::{Cell, RefCell};
|
||||
use url::Url;
|
||||
use hyper::{Client};
|
||||
use hyper::status::StatusCode;
|
||||
|
||||
/// One access rule taken from a robots.txt record.
///
/// A rule comes from a single `Allow:` line (`allowance == true`) or
/// `Disallow:` line (`allowance == false`) and carries the path that
/// followed the directive.
#[derive(Clone, Debug, PartialEq, Eq)]
struct RuleLine {
    /// Path (prefix) the rule is compared against.
    path: String,
    /// `true` when a matching path may be fetched, `false` when it may not.
    allowance: bool,
}
|
||||
|
||||
/// An entry has one or more user-agents and zero or more rulelines
|
||||
#[derive(Debug, Eq, PartialEq, Clone)]
|
||||
struct Entry {
|
||||
useragents: RefCell<Vec<String>>,
|
||||
rulelines: RefCell<Vec<RuleLine>>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Eq, PartialEq, Clone)]
|
||||
pub struct RobotFileParser {
|
||||
entries: RefCell<Vec<Entry>>,
|
||||
default_entry: RefCell<Entry>,
|
||||
disallow_all: Cell<bool>,
|
||||
allow_all: Cell<bool>,
|
||||
url: Url,
|
||||
host: String,
|
||||
path: String,
|
||||
last_checked: Cell<i64>,
|
||||
}
|
||||
|
||||
|
||||
impl RuleLine {
|
||||
fn new(path: &str, allowance: bool) -> RuleLine {
|
||||
let mut allow = allowance;
|
||||
if path == "" && !allowance {
|
||||
// an empty value means allow all
|
||||
allow = true;
|
||||
}
|
||||
RuleLine {
|
||||
path: path.to_owned(),
|
||||
allowance: allow,
|
||||
}
|
||||
}
|
||||
|
||||
fn applies_to(&self, filename: &str) -> bool {
|
||||
&self.path == "*" || filename.starts_with(&self.path)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl Entry {
|
||||
fn new() -> Entry {
|
||||
Entry {
|
||||
useragents: RefCell::new(vec![]),
|
||||
rulelines: RefCell::new(vec![]),
|
||||
}
|
||||
}
|
||||
|
||||
/// check if this entry applies to the specified agent
|
||||
fn applies_to(&self, useragent: &str) -> bool {
|
||||
let ua = useragent.split("/").nth(0).unwrap_or("");
|
||||
let useragents = self.useragents.borrow().clone();
|
||||
for agent in &useragents {
|
||||
if agent == "*" {
|
||||
return true;
|
||||
}
|
||||
if ua.contains(agent) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
|
||||
/// Preconditions:
|
||||
/// - our agent applies to this entry
|
||||
/// - filename is URL decoded
|
||||
fn allowance(&self, filename: &str) -> bool {
|
||||
let rulelines = self.rulelines.borrow().clone();
|
||||
for line in &rulelines {
|
||||
if line.applies_to(filename) {
|
||||
return line.allowance
|
||||
}
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
fn push_useragent(&self, useragent: &str) {
|
||||
let mut useragents = self.useragents.borrow_mut();
|
||||
useragents.push(useragent.to_owned());
|
||||
}
|
||||
|
||||
fn push_ruleline(&self, ruleline: RuleLine) {
|
||||
let mut rulelines = self.rulelines.borrow_mut();
|
||||
rulelines.push(ruleline);
|
||||
}
|
||||
|
||||
fn has_useragent(&self, useragent: &str) -> bool {
|
||||
let useragents = self.useragents.borrow().clone();
|
||||
useragents.contains(&useragent.to_owned())
|
||||
}
|
||||
|
||||
fn is_empty(&self) -> bool {
|
||||
let useragents = self.useragents.borrow().clone();
|
||||
let rulelines = self.rulelines.borrow().clone();
|
||||
useragents.is_empty() && rulelines.is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl RobotFileParser {
|
||||
pub fn new(url: &str) -> RobotFileParser {
|
||||
let parsed_url = Url::parse(url).unwrap();
|
||||
RobotFileParser {
|
||||
entries: RefCell::new(vec![]),
|
||||
default_entry: RefCell::new(Entry::new()),
|
||||
disallow_all: Cell::new(false),
|
||||
allow_all: Cell::new(false),
|
||||
url: parsed_url.clone(),
|
||||
host: parsed_url.domain().unwrap().to_owned(),
|
||||
path: parsed_url.path().unwrap().connect("/"),
|
||||
last_checked: Cell::new(0i64),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the time the robots.txt file was last fetched.
|
||||
///
|
||||
/// This is useful for long-running web spiders that need to
|
||||
/// check for new robots.txt files periodically.
|
||||
pub fn mtime(&self) -> i64 {
|
||||
self.last_checked.get()
|
||||
}
|
||||
|
||||
/// Sets the time the robots.txt file was last fetched to the
|
||||
/// current time.
|
||||
pub fn modified(&self) {
|
||||
use time::get_time;
|
||||
|
||||
self.last_checked.set(get_time().sec);
|
||||
}
|
||||
|
||||
/// Sets the URL referring to a robots.txt file.
|
||||
pub fn set_url(&mut self, url: &str) {
|
||||
let parsed_url = Url::parse(url).unwrap();
|
||||
self.url = parsed_url.clone();
|
||||
self.host = parsed_url.domain().unwrap().to_owned();
|
||||
self.path = parsed_url.path().unwrap().connect("/");
|
||||
self.last_checked.set(0i64);
|
||||
}
|
||||
|
||||
/// Reads the robots.txt URL and feeds it to the parser.
|
||||
pub fn read(&self) {
|
||||
let client = Client::new();
|
||||
let mut res = match client.get(self.url.clone()).send() {
|
||||
Ok(res) => res,
|
||||
Err(_) => {
|
||||
return;
|
||||
}
|
||||
};
|
||||
match res.status {
|
||||
StatusCode::Unauthorized | StatusCode::Forbidden => {
|
||||
self.disallow_all.set(true);
|
||||
},
|
||||
status if status >= StatusCode::BadRequest && status < StatusCode::InternalServerError => {
|
||||
self.allow_all.set(true);
|
||||
},
|
||||
StatusCode::Ok => {
|
||||
let mut buf = String::new();
|
||||
res.read_to_string(&mut buf).unwrap();
|
||||
let lines: Vec<&str> = buf.split("\n").collect();
|
||||
self.parse(lines);
|
||||
},
|
||||
_ => {},
|
||||
}
|
||||
}
|
||||
|
||||
fn _add_entry(&self, entry: Entry) {
|
||||
if entry.has_useragent("*") {
|
||||
// the default entry is considered last
|
||||
let mut default_entry = self.default_entry.borrow_mut();
|
||||
if default_entry.is_empty() {
|
||||
// the first default entry wins
|
||||
*default_entry = entry;
|
||||
}
|
||||
} else {
|
||||
let mut entries = self.entries.borrow_mut();
|
||||
entries.push(entry);
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Parse the input lines from a robots.txt file
|
||||
///
|
||||
/// We allow that a user-agent: line is not preceded by
|
||||
/// one or more blank lines.
|
||||
///
|
||||
pub fn parse(&self, lines: Vec<&str>) {
|
||||
use url::percent_encoding::percent_decode;
|
||||
|
||||
// states:
|
||||
// 0: start state
|
||||
// 1: saw user-agent line
|
||||
// 2: saw an allow or disallow line
|
||||
let mut state = 0;
|
||||
let mut entry = Entry::new();
|
||||
|
||||
self.modified();
|
||||
for line in &lines {
|
||||
let mut ln = line.clone();
|
||||
if ln.is_empty() {
|
||||
match state {
|
||||
1 => {
|
||||
entry = Entry::new();
|
||||
state = 0;
|
||||
},
|
||||
2 => {
|
||||
self._add_entry(entry);
|
||||
entry = Entry::new();
|
||||
state = 0;
|
||||
},
|
||||
_ => {},
|
||||
}
|
||||
}
|
||||
// remove optional comment and strip line
|
||||
match ln.find("#") {
|
||||
Some(i) => {
|
||||
ln = &ln[0..i];
|
||||
},
|
||||
None => {},
|
||||
}
|
||||
ln = ln.trim();
|
||||
if ln.is_empty() {
|
||||
continue;
|
||||
}
|
||||
let parts: Vec<&str> = ln.splitn(2, ":").collect();
|
||||
if parts.len() == 2 {
|
||||
let part0 = parts[0].trim().to_lowercase();
|
||||
let part1 = String::from_utf8(percent_decode(parts[1].trim().as_bytes())).unwrap_or("".to_owned());
|
||||
match part0 {
|
||||
ref x if x == "user-agent" => {
|
||||
if state == 2 {
|
||||
self._add_entry(entry);
|
||||
entry = Entry::new();
|
||||
}
|
||||
entry.push_useragent(&part1);
|
||||
state = 1;
|
||||
},
|
||||
ref x if x == "disallow" => {
|
||||
if state != 0 {
|
||||
entry.push_ruleline(RuleLine::new(&part1, false));
|
||||
state = 2;
|
||||
}
|
||||
},
|
||||
ref x if x == "allow" => {
|
||||
if state != 0 {
|
||||
entry.push_ruleline(RuleLine::new(&part1, true));
|
||||
state = 2;
|
||||
}
|
||||
},
|
||||
_ => {},
|
||||
}
|
||||
}
|
||||
}
|
||||
if state == 2 {
|
||||
self._add_entry(entry);
|
||||
}
|
||||
}
|
||||
|
||||
/// Using the parsed robots.txt decide if useragent can fetch url
|
||||
pub fn can_fetch(&self, useragent: &str, url: &str) -> bool {
|
||||
use url::percent_encoding::percent_decode;
|
||||
|
||||
if self.disallow_all.get() {
|
||||
return false;
|
||||
}
|
||||
if self.allow_all.get() {
|
||||
return true;
|
||||
}
|
||||
// Until the robots.txt file has been read or found not
|
||||
// to exist, we must assume that no url is allowable.
|
||||
// This prevents false positives when a user erronenously
|
||||
// calls can_fetch() before calling read().
|
||||
if self.last_checked.get() == 0 {
|
||||
return false;
|
||||
}
|
||||
// search for given user agent matches
|
||||
// the first match counts
|
||||
let decoded_url = String::from_utf8(percent_decode(url.as_bytes())).unwrap_or("".to_owned());
|
||||
let url_str = match decoded_url {
|
||||
ref u if !u.is_empty() => u.to_owned(),
|
||||
_ => "/".to_owned(),
|
||||
};
|
||||
let entries = self.entries.borrow().clone();
|
||||
for entry in &entries {
|
||||
if entry.applies_to(useragent) {
|
||||
return entry.allowance(&url_str);
|
||||
}
|
||||
}
|
||||
// try the default entry last
|
||||
let default_entry = self.default_entry.borrow().clone();
|
||||
if !default_entry.is_empty() {
|
||||
return default_entry.allowance(&url_str);
|
||||
}
|
||||
// agent not found ==> access granted
|
||||
true
|
||||
}
|
||||
}
|
||||
56
tests/lib.rs
Normal file
56
tests/lib.rs
Normal file
|
|
@ -0,0 +1,56 @@
|
|||
extern crate robotparser;
|
||||
|
||||
use robotparser::RobotFileParser;
|
||||
|
||||
const AGENT: &'static str = "test_robotparser";
|
||||
|
||||
|
||||
fn robot_test(doc: &str, good_urls: Vec<&str>, bad_urls: Vec<&str>, agent: &str) {
|
||||
let parser = RobotFileParser::new("http://www.baidu.com/robots.txt");
|
||||
let lines: Vec<&str> = doc.split("\n").collect();
|
||||
parser.parse(lines);
|
||||
for url in &good_urls {
|
||||
assert!(parser.can_fetch(agent, url));
|
||||
}
|
||||
for url in &bad_urls {
|
||||
assert!(!parser.can_fetch(agent, url));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
fn robot_test_simple(doc: &str, good_urls: Vec<&str>, bad_urls: Vec<&str>) {
|
||||
robot_test(doc, good_urls, bad_urls, AGENT);
|
||||
}
|
||||
|
||||
|
||||
/// A single `User-agent: *` record with three `Disallow` rules, two of
/// them followed by trailing comments.
#[test]
fn test_robots_txt_1() {
    let doc = "\n\
               User-agent: *\n\
               Disallow: /cyberworld/map/ # This is an infinite virtual URL space\n\
               Disallow: /tmp/ # these will soon disappear\n\
               Disallow: /foo.html\n\
               ";
    robot_test_simple(
        doc,
        vec!["/", "/test.html"],
        vec!["/cyberworld/map/index.html", "/tmp/xxx", "/foo.html"],
    );
}
|
||||
|
||||
|
||||
/// A default record plus a `cybermapper` record with an empty `Disallow`;
/// the test agent is expected to fall under the default record.
#[test]
fn test_robots_txt_2() {
    let doc = "\n\
               # robots.txt for http://www.example.com/\n\
               \n\
               User-agent: *\n\
               Disallow: /cyberworld/map/ # This is an infinite virtual URL space\n\
               \n\
               # Cybermapper knows where to go.\n\
               User-agent: cybermapper\n\
               Disallow:\n\
               \n\
               ";
    robot_test_simple(
        doc,
        vec!["/", "/test.html"],
        vec!["/cyberworld/map/index.html"],
    );
}
|
||||
Loading…
Reference in a new issue