mirror of
https://github.com/danbulant/robotparser-rs
synced 2026-06-17 13:41:26 +00:00
Switch to https for urls (#23)
This commit is contained in:
parent
2d19755779
commit
1474a8cce9
7 changed files with 32 additions and 32 deletions
|
|
@ -31,9 +31,9 @@ use url::Url;
|
|||
|
||||
fn main() {
|
||||
let client = Client::new();
|
||||
let robots_txt_url = Url::parse("http://www.python.org/robots.txt").unwrap();
|
||||
let robots_txt_url = Url::parse("https://www.python.org/robots.txt").unwrap();
|
||||
let robots_txt = client.fetch_robots_txt(robots_txt_url.origin()).unwrap().get_result();
|
||||
let fetch_url = Url::parse("http://www.python.org/robots.txt").unwrap();
|
||||
let fetch_url = Url::parse("https://www.python.org/robots.txt").unwrap();
|
||||
assert!(robots_txt.can_fetch("*", &fetch_url));
|
||||
}
|
||||
```
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
//! robots.txt parser for Rust
|
||||
//!
|
||||
//! The robots.txt Exclusion Protocol is implemented as specified in
|
||||
//! <http://www.robotstxt.org/norobots-rfc.txt>
|
||||
//! <https://www.robotstxt.org/norobots-rfc.txt>
|
||||
//!
|
||||
//! # Installation
|
||||
//!
|
||||
|
|
@ -23,9 +23,9 @@
|
|||
//!
|
||||
//! fn main() {
|
||||
//! let client = Client::new();
|
||||
//! let robots_txt_url = Url::parse("http://www.python.org/robots.txt").unwrap();
|
||||
//! let robots_txt_url = Url::parse("https://www.python.org/robots.txt").unwrap();
|
||||
//! let robots_txt = client.fetch_robots_txt(robots_txt_url.origin()).unwrap().get_result();
|
||||
//! let fetch_url = Url::parse("http://www.python.org/robots.txt").unwrap();
|
||||
//! let fetch_url = Url::parse("https://www.python.org/robots.txt").unwrap();
|
||||
//! assert!(robots_txt.can_fetch("*", &fetch_url));
|
||||
//! }
|
||||
//! ```
|
||||
|
|
|
|||
|
|
@ -16,13 +16,13 @@
|
|||
//! use url::Url;
|
||||
//!
|
||||
//! fn main() {
|
||||
//! let robots_txt_url = Url::parse("http://google.com/robots.txt").unwrap();
|
||||
//! let robots_txt_url = Url::parse("https://google.com/robots.txt").unwrap();
|
||||
//! let robots_txt = "User-agent: *\nDisallow: /search";
|
||||
//! let robots_txt = parse_robots_txt(robots_txt_url.origin(), robots_txt);
|
||||
//! assert_eq!(robots_txt.get_warnings().len(), 0);
|
||||
//! let robots_txt = robots_txt.get_result();
|
||||
//! let good_url = Url::parse("http://google.com/test").unwrap();
|
||||
//! let bad_url = Url::parse("http://google.com/search/vvv").unwrap();
|
||||
//! let good_url = Url::parse("https://google.com/test").unwrap();
|
||||
//! let bad_url = Url::parse("https://google.com/search/vvv").unwrap();
|
||||
//! assert_eq!(robots_txt.can_fetch("*", &bad_url), false);
|
||||
//! assert_eq!(robots_txt.can_fetch("*", &good_url), true);
|
||||
//! }
|
||||
|
|
|
|||
|
|
@ -6,15 +6,15 @@ use url::Url;
|
|||
const AGENT: &'static str = "test_robotparser";
|
||||
|
||||
fn robot_test(doc: &str, good_urls: Vec<&str>, bad_urls: Vec<&str>, agent: &str) {
|
||||
let url = Url::parse("http://www.baidu.com/robots.txt").unwrap();
|
||||
let url = Url::parse("https://www.baidu.com/robots.txt").unwrap();
|
||||
let parser = parse_robots_txt(url.origin(), doc).get_result();
|
||||
for url in &good_urls {
|
||||
let url = format!("http://www.baidu.com{}", url);
|
||||
let url = format!("https://www.baidu.com{}", url);
|
||||
let url = Url::parse(&url).unwrap();
|
||||
assert!(parser.can_fetch(agent, &url));
|
||||
}
|
||||
for url in &bad_urls {
|
||||
let url = format!("http://www.baidu.com{}", url);
|
||||
let url = format!("https://www.baidu.com{}", url);
|
||||
let url = Url::parse(&url).unwrap();
|
||||
assert!(!parser.can_fetch(agent, &url));
|
||||
}
|
||||
|
|
@ -56,7 +56,7 @@ fn test_robots_txt_1() {
|
|||
#[test]
|
||||
fn test_robots_txt_2() {
|
||||
let doc = "\n\
|
||||
# robots.txt for http://www.example.com/\n\
|
||||
# robots.txt for https://www.example.com/\n\
|
||||
\n\
|
||||
User-agent: *\n\
|
||||
Disallow: /cyberworld/map/ # This is an infinite virtual URL space\n\
|
||||
|
|
@ -249,7 +249,7 @@ fn test_robots_txt_read() {
|
|||
use robotparser::http::{CreateRobotsTxtRequest, ParseRobotsTxtResponse};
|
||||
use reqwest::{Client, Request};
|
||||
let http_client = Client::new();
|
||||
let url = Url::parse("http://www.python.org/robots.txt").unwrap();
|
||||
let url = Url::parse("https://www.python.org/robots.txt").unwrap();
|
||||
let request = Request::create_robots_txt_request(url.origin());
|
||||
let mut response = http_client.execute(request).unwrap();
|
||||
let parser = response.parse_robots_txt_response().unwrap().get_result();
|
||||
|
|
@ -258,7 +258,7 @@ fn test_robots_txt_read() {
|
|||
|
||||
#[test]
|
||||
fn test_robots_text_crawl_delay() {
|
||||
let robots_txt_url = Url::parse("http://www.python.org/robots.txt").unwrap();
|
||||
let robots_txt_url = Url::parse("https://www.python.org/robots.txt").unwrap();
|
||||
let doc = "User-agent: Yandex\n\
|
||||
Crawl-delay: 2.35\n\
|
||||
Disallow: /search/\n";
|
||||
|
|
@ -268,18 +268,18 @@ fn test_robots_text_crawl_delay() {
|
|||
|
||||
#[test]
|
||||
fn test_robots_text_sitemaps() {
|
||||
let robots_txt_url = Url::parse("http://www.python.org/robots.txt").unwrap();
|
||||
let robots_txt_url = Url::parse("https://www.python.org/robots.txt").unwrap();
|
||||
let doc = "User-agent: Yandex\n\
|
||||
Sitemap \t : http://example.com/sitemap1.xml\n
|
||||
Sitemap: http://example.com/sitemap2.xml\n
|
||||
Sitemap: http://example.com/sitemap3.xml\n
|
||||
Sitemap \t : https://example.com/sitemap1.xml\n
|
||||
Sitemap: https://example.com/sitemap2.xml\n
|
||||
Sitemap: https://example.com/sitemap3.xml\n
|
||||
Disallow: /search/\n";
|
||||
let parser = parse_robots_txt(robots_txt_url.origin(), doc).get_result();
|
||||
assert_eq!(
|
||||
&[
|
||||
Url::parse("http://example.com/sitemap1.xml").unwrap(),
|
||||
Url::parse("http://example.com/sitemap2.xml").unwrap(),
|
||||
Url::parse("http://example.com/sitemap3.xml").unwrap()
|
||||
Url::parse("https://example.com/sitemap1.xml").unwrap(),
|
||||
Url::parse("https://example.com/sitemap2.xml").unwrap(),
|
||||
Url::parse("https://example.com/sitemap3.xml").unwrap()
|
||||
],
|
||||
parser.get_sitemaps()
|
||||
);
|
||||
|
|
@ -287,7 +287,7 @@ fn test_robots_text_sitemaps() {
|
|||
|
||||
#[test]
|
||||
fn test_robots_text_request_rate() {
|
||||
let robots_txt_url = Url::parse("http://www.python.org/robots.txt").unwrap();
|
||||
let robots_txt_url = Url::parse("https://www.python.org/robots.txt").unwrap();
|
||||
let doc =
|
||||
"User-agent: Yandex\n\
|
||||
Request-rate: 3/15\n\
|
||||
|
|
@ -313,15 +313,15 @@ Clean-param: gid\n\
|
|||
Clean-param: tm\n\
|
||||
Clean-param: amp\n\
|
||||
";
|
||||
let url = Url::parse("http://www.baidu.com/robots.txt").unwrap();
|
||||
let url = Url::parse("https://www.baidu.com/robots.txt").unwrap();
|
||||
let parser = parse_robots_txt(url.origin(), doc).get_result();
|
||||
let mut site_url = Url::parse("http://www.baidu.com/test?post_id=7777&mode=99&from=google&pid=99&gid=88&tm=777&=1").unwrap();
|
||||
let mut site_url = Url::parse("https://www.baidu.com/test?post_id=7777&mode=99&from=google&pid=99&gid=88&tm=777&=1").unwrap();
|
||||
let was_updated = parser.normalize_url(&mut site_url);
|
||||
assert_eq!(was_updated, true);
|
||||
assert_eq!(site_url.as_str(), "http://www.baidu.com/test?post_id=7777");
|
||||
assert_eq!(site_url.as_str(), "https://www.baidu.com/test?post_id=7777");
|
||||
|
||||
let mut site_url = Url::parse("http://www.google.com/test?post_id=7777&mode=99&from=google&pid=99&gid=88&tm=777&=1").unwrap();
|
||||
let mut site_url = Url::parse("https://www.google.com/test?post_id=7777&mode=99&from=google&pid=99&gid=88&tm=777&=1").unwrap();
|
||||
let was_updated = parser.normalize_url(&mut site_url);
|
||||
assert_eq!(was_updated, false);
|
||||
assert_eq!(site_url.as_str(), "http://www.google.com/test?post_id=7777&mode=99&from=google&pid=99&gid=88&tm=777&=1");
|
||||
assert_eq!(site_url.as_str(), "https://www.google.com/test?post_id=7777&mode=99&from=google&pid=99&gid=88&tm=777&=1");
|
||||
}
|
||||
|
|
@ -8,9 +8,9 @@ use tokio::runtime::Runtime;
|
|||
fn test_reqwest_async() {
|
||||
let mut runtime = Runtime::new().unwrap();
|
||||
let client = Client::new();
|
||||
let robots_txt_url = Url::parse("http://www.python.org/robots.txt").unwrap();
|
||||
let robots_txt_url = Url::parse("https://www.python.org/robots.txt").unwrap();
|
||||
let robots_txt_response = runtime.block_on(client.fetch_robots_txt(robots_txt_url.origin()));
|
||||
let robots_txt = robots_txt_response.unwrap().get_result();
|
||||
let fetch_url = Url::parse("http://www.python.org/robots.txt").unwrap();
|
||||
let fetch_url = Url::parse("https://www.python.org/robots.txt").unwrap();
|
||||
assert!(robots_txt.can_fetch("*", &fetch_url));
|
||||
}
|
||||
|
|
@ -6,8 +6,8 @@ use url::Url;
|
|||
#[test]
|
||||
fn test_reqwest_blocking() {
|
||||
let client = Client::new();
|
||||
let robots_txt_url = Url::parse("http://www.python.org/robots.txt").unwrap();
|
||||
let robots_txt_url = Url::parse("https://www.python.org/robots.txt").unwrap();
|
||||
let robots_txt = client.fetch_robots_txt(robots_txt_url.origin()).unwrap().get_result();
|
||||
let fetch_url = Url::parse("http://www.python.org/robots.txt").unwrap();
|
||||
let fetch_url = Url::parse("https://www.python.org/robots.txt").unwrap();
|
||||
assert!(robots_txt.can_fetch("*", &fetch_url));
|
||||
}
|
||||
|
|
|
|||
|
|
@ -106,7 +106,7 @@ fn test_warning_request_rate() {
|
|||
|
||||
#[test]
|
||||
fn test_warning_parsing_url() {
|
||||
let input = "User-Agent: *\nSitemap: http://python.org/sitemap.xml";
|
||||
let input = "User-Agent: *\nSitemap: https://python.org/sitemap.xml";
|
||||
validate_warnings(input, &[]);
|
||||
let input = "User-Agent: *\nSitemap: http$$$://python.org/sitemap.xml";
|
||||
validate_warnings(input, &[WarningReasonKind::ParseUrl]);
|
||||
|
|
|
|||
Loading…
Reference in a new issue