mirror of
https://github.com/danbulant/robotparser-rs
synced 2026-07-03 10:10:47 +00:00
Switch to https for urls (#23)
This commit is contained in:
parent
2d19755779
commit
1474a8cce9
7 changed files with 32 additions and 32 deletions
|
|
@ -31,9 +31,9 @@ use url::Url;
|
||||||
|
|
||||||
fn main() {
|
fn main() {
|
||||||
let client = Client::new();
|
let client = Client::new();
|
||||||
let robots_txt_url = Url::parse("http://www.python.org/robots.txt").unwrap();
|
let robots_txt_url = Url::parse("https://www.python.org/robots.txt").unwrap();
|
||||||
let robots_txt = client.fetch_robots_txt(robots_txt_url.origin()).unwrap().get_result();
|
let robots_txt = client.fetch_robots_txt(robots_txt_url.origin()).unwrap().get_result();
|
||||||
let fetch_url = Url::parse("http://www.python.org/robots.txt").unwrap();
|
let fetch_url = Url::parse("https://www.python.org/robots.txt").unwrap();
|
||||||
assert!(robots_txt.can_fetch("*", &fetch_url));
|
assert!(robots_txt.can_fetch("*", &fetch_url));
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
//! robots.txt parser for Rust
|
//! robots.txt parser for Rust
|
||||||
//!
|
//!
|
||||||
//! The robots.txt Exclusion Protocol is implemented as specified in
|
//! The robots.txt Exclusion Protocol is implemented as specified in
|
||||||
//! <http://www.robotstxt.org/norobots-rfc.txt>
|
//! <https://www.robotstxt.org/norobots-rfc.txt>
|
||||||
//!
|
//!
|
||||||
//! # Installation
|
//! # Installation
|
||||||
//!
|
//!
|
||||||
|
|
@ -23,9 +23,9 @@
|
||||||
//!
|
//!
|
||||||
//! fn main() {
|
//! fn main() {
|
||||||
//! let client = Client::new();
|
//! let client = Client::new();
|
||||||
//! let robots_txt_url = Url::parse("http://www.python.org/robots.txt").unwrap();
|
//! let robots_txt_url = Url::parse("https://www.python.org/robots.txt").unwrap();
|
||||||
//! let robots_txt = client.fetch_robots_txt(robots_txt_url.origin()).unwrap().get_result();
|
//! let robots_txt = client.fetch_robots_txt(robots_txt_url.origin()).unwrap().get_result();
|
||||||
//! let fetch_url = Url::parse("http://www.python.org/robots.txt").unwrap();
|
//! let fetch_url = Url::parse("https://www.python.org/robots.txt").unwrap();
|
||||||
//! assert!(robots_txt.can_fetch("*", &fetch_url));
|
//! assert!(robots_txt.can_fetch("*", &fetch_url));
|
||||||
//! }
|
//! }
|
||||||
//! ```
|
//! ```
|
||||||
|
|
|
||||||
|
|
@ -16,13 +16,13 @@
|
||||||
//! use url::Url;
|
//! use url::Url;
|
||||||
//!
|
//!
|
||||||
//! fn main() {
|
//! fn main() {
|
||||||
//! let robots_txt_url = Url::parse("http://google.com/robots.txt").unwrap();
|
//! let robots_txt_url = Url::parse("https://google.com/robots.txt").unwrap();
|
||||||
//! let robots_txt = "User-agent: *\nDisallow: /search";
|
//! let robots_txt = "User-agent: *\nDisallow: /search";
|
||||||
//! let robots_txt = parse_robots_txt(robots_txt_url.origin(), robots_txt);
|
//! let robots_txt = parse_robots_txt(robots_txt_url.origin(), robots_txt);
|
||||||
//! assert_eq!(robots_txt.get_warnings().len(), 0);
|
//! assert_eq!(robots_txt.get_warnings().len(), 0);
|
||||||
//! let robots_txt = robots_txt.get_result();
|
//! let robots_txt = robots_txt.get_result();
|
||||||
//! let good_url = Url::parse("http://google.com/test").unwrap();
|
//! let good_url = Url::parse("https://google.com/test").unwrap();
|
||||||
//! let bad_url = Url::parse("http://google.com/search/vvv").unwrap();
|
//! let bad_url = Url::parse("https://google.com/search/vvv").unwrap();
|
||||||
//! assert_eq!(robots_txt.can_fetch("*", &bad_url), false);
|
//! assert_eq!(robots_txt.can_fetch("*", &bad_url), false);
|
||||||
//! assert_eq!(robots_txt.can_fetch("*", &good_url), true);
|
//! assert_eq!(robots_txt.can_fetch("*", &good_url), true);
|
||||||
//! }
|
//! }
|
||||||
|
|
|
||||||
|
|
@ -6,15 +6,15 @@ use url::Url;
|
||||||
const AGENT: &'static str = "test_robotparser";
|
const AGENT: &'static str = "test_robotparser";
|
||||||
|
|
||||||
fn robot_test(doc: &str, good_urls: Vec<&str>, bad_urls: Vec<&str>, agent: &str) {
|
fn robot_test(doc: &str, good_urls: Vec<&str>, bad_urls: Vec<&str>, agent: &str) {
|
||||||
let url = Url::parse("http://www.baidu.com/robots.txt").unwrap();
|
let url = Url::parse("https://www.baidu.com/robots.txt").unwrap();
|
||||||
let parser = parse_robots_txt(url.origin(), doc).get_result();
|
let parser = parse_robots_txt(url.origin(), doc).get_result();
|
||||||
for url in &good_urls {
|
for url in &good_urls {
|
||||||
let url = format!("http://www.baidu.com{}", url);
|
let url = format!("https://www.baidu.com{}", url);
|
||||||
let url = Url::parse(&url).unwrap();
|
let url = Url::parse(&url).unwrap();
|
||||||
assert!(parser.can_fetch(agent, &url));
|
assert!(parser.can_fetch(agent, &url));
|
||||||
}
|
}
|
||||||
for url in &bad_urls {
|
for url in &bad_urls {
|
||||||
let url = format!("http://www.baidu.com{}", url);
|
let url = format!("https://www.baidu.com{}", url);
|
||||||
let url = Url::parse(&url).unwrap();
|
let url = Url::parse(&url).unwrap();
|
||||||
assert!(!parser.can_fetch(agent, &url));
|
assert!(!parser.can_fetch(agent, &url));
|
||||||
}
|
}
|
||||||
|
|
@ -56,7 +56,7 @@ fn test_robots_txt_1() {
|
||||||
#[test]
|
#[test]
|
||||||
fn test_robots_txt_2() {
|
fn test_robots_txt_2() {
|
||||||
let doc = "\n\
|
let doc = "\n\
|
||||||
# robots.txt for http://www.example.com/\n\
|
# robots.txt for https://www.example.com/\n\
|
||||||
\n\
|
\n\
|
||||||
User-agent: *\n\
|
User-agent: *\n\
|
||||||
Disallow: /cyberworld/map/ # This is an infinite virtual URL space\n\
|
Disallow: /cyberworld/map/ # This is an infinite virtual URL space\n\
|
||||||
|
|
@ -249,7 +249,7 @@ fn test_robots_txt_read() {
|
||||||
use robotparser::http::{CreateRobotsTxtRequest, ParseRobotsTxtResponse};
|
use robotparser::http::{CreateRobotsTxtRequest, ParseRobotsTxtResponse};
|
||||||
use reqwest::{Client, Request};
|
use reqwest::{Client, Request};
|
||||||
let http_client = Client::new();
|
let http_client = Client::new();
|
||||||
let url = Url::parse("http://www.python.org/robots.txt").unwrap();
|
let url = Url::parse("https://www.python.org/robots.txt").unwrap();
|
||||||
let request = Request::create_robots_txt_request(url.origin());
|
let request = Request::create_robots_txt_request(url.origin());
|
||||||
let mut response = http_client.execute(request).unwrap();
|
let mut response = http_client.execute(request).unwrap();
|
||||||
let parser = response.parse_robots_txt_response().unwrap().get_result();
|
let parser = response.parse_robots_txt_response().unwrap().get_result();
|
||||||
|
|
@ -258,7 +258,7 @@ fn test_robots_txt_read() {
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_robots_text_crawl_delay() {
|
fn test_robots_text_crawl_delay() {
|
||||||
let robots_txt_url = Url::parse("http://www.python.org/robots.txt").unwrap();
|
let robots_txt_url = Url::parse("https://www.python.org/robots.txt").unwrap();
|
||||||
let doc = "User-agent: Yandex\n\
|
let doc = "User-agent: Yandex\n\
|
||||||
Crawl-delay: 2.35\n\
|
Crawl-delay: 2.35\n\
|
||||||
Disallow: /search/\n";
|
Disallow: /search/\n";
|
||||||
|
|
@ -268,18 +268,18 @@ fn test_robots_text_crawl_delay() {
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_robots_text_sitemaps() {
|
fn test_robots_text_sitemaps() {
|
||||||
let robots_txt_url = Url::parse("http://www.python.org/robots.txt").unwrap();
|
let robots_txt_url = Url::parse("https://www.python.org/robots.txt").unwrap();
|
||||||
let doc = "User-agent: Yandex\n\
|
let doc = "User-agent: Yandex\n\
|
||||||
Sitemap \t : http://example.com/sitemap1.xml\n
|
Sitemap \t : https://example.com/sitemap1.xml\n
|
||||||
Sitemap: http://example.com/sitemap2.xml\n
|
Sitemap: https://example.com/sitemap2.xml\n
|
||||||
Sitemap: http://example.com/sitemap3.xml\n
|
Sitemap: https://example.com/sitemap3.xml\n
|
||||||
Disallow: /search/\n";
|
Disallow: /search/\n";
|
||||||
let parser = parse_robots_txt(robots_txt_url.origin(), doc).get_result();
|
let parser = parse_robots_txt(robots_txt_url.origin(), doc).get_result();
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
&[
|
&[
|
||||||
Url::parse("http://example.com/sitemap1.xml").unwrap(),
|
Url::parse("https://example.com/sitemap1.xml").unwrap(),
|
||||||
Url::parse("http://example.com/sitemap2.xml").unwrap(),
|
Url::parse("https://example.com/sitemap2.xml").unwrap(),
|
||||||
Url::parse("http://example.com/sitemap3.xml").unwrap()
|
Url::parse("https://example.com/sitemap3.xml").unwrap()
|
||||||
],
|
],
|
||||||
parser.get_sitemaps()
|
parser.get_sitemaps()
|
||||||
);
|
);
|
||||||
|
|
@ -287,7 +287,7 @@ fn test_robots_text_sitemaps() {
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_robots_text_request_rate() {
|
fn test_robots_text_request_rate() {
|
||||||
let robots_txt_url = Url::parse("http://www.python.org/robots.txt").unwrap();
|
let robots_txt_url = Url::parse("https://www.python.org/robots.txt").unwrap();
|
||||||
let doc =
|
let doc =
|
||||||
"User-agent: Yandex\n\
|
"User-agent: Yandex\n\
|
||||||
Request-rate: 3/15\n\
|
Request-rate: 3/15\n\
|
||||||
|
|
@ -313,15 +313,15 @@ Clean-param: gid\n\
|
||||||
Clean-param: tm\n\
|
Clean-param: tm\n\
|
||||||
Clean-param: amp\n\
|
Clean-param: amp\n\
|
||||||
";
|
";
|
||||||
let url = Url::parse("http://www.baidu.com/robots.txt").unwrap();
|
let url = Url::parse("https://www.baidu.com/robots.txt").unwrap();
|
||||||
let parser = parse_robots_txt(url.origin(), doc).get_result();
|
let parser = parse_robots_txt(url.origin(), doc).get_result();
|
||||||
let mut site_url = Url::parse("http://www.baidu.com/test?post_id=7777&mode=99&from=google&pid=99&gid=88&tm=777&=1").unwrap();
|
let mut site_url = Url::parse("https://www.baidu.com/test?post_id=7777&mode=99&from=google&pid=99&gid=88&tm=777&=1").unwrap();
|
||||||
let was_updated = parser.normalize_url(&mut site_url);
|
let was_updated = parser.normalize_url(&mut site_url);
|
||||||
assert_eq!(was_updated, true);
|
assert_eq!(was_updated, true);
|
||||||
assert_eq!(site_url.as_str(), "http://www.baidu.com/test?post_id=7777");
|
assert_eq!(site_url.as_str(), "https://www.baidu.com/test?post_id=7777");
|
||||||
|
|
||||||
let mut site_url = Url::parse("http://www.google.com/test?post_id=7777&mode=99&from=google&pid=99&gid=88&tm=777&=1").unwrap();
|
let mut site_url = Url::parse("https://www.google.com/test?post_id=7777&mode=99&from=google&pid=99&gid=88&tm=777&=1").unwrap();
|
||||||
let was_updated = parser.normalize_url(&mut site_url);
|
let was_updated = parser.normalize_url(&mut site_url);
|
||||||
assert_eq!(was_updated, false);
|
assert_eq!(was_updated, false);
|
||||||
assert_eq!(site_url.as_str(), "http://www.google.com/test?post_id=7777&mode=99&from=google&pid=99&gid=88&tm=777&=1");
|
assert_eq!(site_url.as_str(), "https://www.google.com/test?post_id=7777&mode=99&from=google&pid=99&gid=88&tm=777&=1");
|
||||||
}
|
}
|
||||||
|
|
@ -8,9 +8,9 @@ use tokio::runtime::Runtime;
|
||||||
fn test_reqwest_async() {
|
fn test_reqwest_async() {
|
||||||
let mut runtime = Runtime::new().unwrap();
|
let mut runtime = Runtime::new().unwrap();
|
||||||
let client = Client::new();
|
let client = Client::new();
|
||||||
let robots_txt_url = Url::parse("http://www.python.org/robots.txt").unwrap();
|
let robots_txt_url = Url::parse("https://www.python.org/robots.txt").unwrap();
|
||||||
let robots_txt_response = runtime.block_on(client.fetch_robots_txt(robots_txt_url.origin()));
|
let robots_txt_response = runtime.block_on(client.fetch_robots_txt(robots_txt_url.origin()));
|
||||||
let robots_txt = robots_txt_response.unwrap().get_result();
|
let robots_txt = robots_txt_response.unwrap().get_result();
|
||||||
let fetch_url = Url::parse("http://www.python.org/robots.txt").unwrap();
|
let fetch_url = Url::parse("https://www.python.org/robots.txt").unwrap();
|
||||||
assert!(robots_txt.can_fetch("*", &fetch_url));
|
assert!(robots_txt.can_fetch("*", &fetch_url));
|
||||||
}
|
}
|
||||||
|
|
@ -6,8 +6,8 @@ use url::Url;
|
||||||
#[test]
|
#[test]
|
||||||
fn test_reqwest_blocking() {
|
fn test_reqwest_blocking() {
|
||||||
let client = Client::new();
|
let client = Client::new();
|
||||||
let robots_txt_url = Url::parse("http://www.python.org/robots.txt").unwrap();
|
let robots_txt_url = Url::parse("https://www.python.org/robots.txt").unwrap();
|
||||||
let robots_txt = client.fetch_robots_txt(robots_txt_url.origin()).unwrap().get_result();
|
let robots_txt = client.fetch_robots_txt(robots_txt_url.origin()).unwrap().get_result();
|
||||||
let fetch_url = Url::parse("http://www.python.org/robots.txt").unwrap();
|
let fetch_url = Url::parse("https://www.python.org/robots.txt").unwrap();
|
||||||
assert!(robots_txt.can_fetch("*", &fetch_url));
|
assert!(robots_txt.can_fetch("*", &fetch_url));
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -106,7 +106,7 @@ fn test_warning_request_rate() {
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_warning_parsing_url() {
|
fn test_warning_parsing_url() {
|
||||||
let input = "User-Agent: *\nSitemap: http://python.org/sitemap.xml";
|
let input = "User-Agent: *\nSitemap: https://python.org/sitemap.xml";
|
||||||
validate_warnings(input, &[]);
|
validate_warnings(input, &[]);
|
||||||
let input = "User-Agent: *\nSitemap: http$$$://python.org/sitemap.xml";
|
let input = "User-Agent: *\nSitemap: http$$$://python.org/sitemap.xml";
|
||||||
validate_warnings(input, &[WarningReasonKind::ParseUrl]);
|
validate_warnings(input, &[WarningReasonKind::ParseUrl]);
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue