Switch to https for urls (#23)

This commit is contained in:
Laurent Arnoud 2020-03-02 07:20:44 +00:00 committed by GitHub
parent 2d19755779
commit 1474a8cce9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 32 additions and 32 deletions

View file

@ -31,9 +31,9 @@ use url::Url;
// Example: fetch the robots.txt for a site and check whether a URL may be crawled.
fn main() {
let client = Client::new();
// NOTE(review): each URL binding below appears twice, first with `http://` and then with
// `https://`. This looks like an unmarked before/after pair from the commit view; read as
// plain Rust, the second `let` shadows the first, so only the `https://` value is used.
let robots_txt_url = Url::parse("http://www.python.org/robots.txt").unwrap();
let robots_txt_url = Url::parse("https://www.python.org/robots.txt").unwrap();
// Only the origin of the URL is handed to the client; `unwrap()` stops the example if the
// fetch does not succeed, and `get_result()` extracts the parsed rules.
let robots_txt = client.fetch_robots_txt(robots_txt_url.origin()).unwrap().get_result();
let fetch_url = Url::parse("http://www.python.org/robots.txt").unwrap();
let fetch_url = Url::parse("https://www.python.org/robots.txt").unwrap();
// The example expects the "*" user agent to be allowed to fetch `fetch_url`.
assert!(robots_txt.can_fetch("*", &fetch_url));
}
```

View file

@ -1,7 +1,7 @@
//! robots.txt parser for Rust
//!
//! The robots.txt Exclusion Protocol is implemented as specified in
//! <http://www.robotstxt.org/norobots-rfc.txt>
//! <https://www.robotstxt.org/norobots-rfc.txt>
//!
//! # Installation
//!
@ -23,9 +23,9 @@
//!
//! fn main() {
//! let client = Client::new();
//! let robots_txt_url = Url::parse("http://www.python.org/robots.txt").unwrap();
//! let robots_txt_url = Url::parse("https://www.python.org/robots.txt").unwrap();
//! let robots_txt = client.fetch_robots_txt(robots_txt_url.origin()).unwrap().get_result();
//! let fetch_url = Url::parse("http://www.python.org/robots.txt").unwrap();
//! let fetch_url = Url::parse("https://www.python.org/robots.txt").unwrap();
//! assert!(robots_txt.can_fetch("*", &fetch_url));
//! }
//! ```

View file

@ -16,13 +16,13 @@
//! use url::Url;
//!
//! fn main() {
//! let robots_txt_url = Url::parse("http://google.com/robots.txt").unwrap();
//! let robots_txt_url = Url::parse("https://google.com/robots.txt").unwrap();
//! let robots_txt = "User-agent: *\nDisallow: /search";
//! let robots_txt = parse_robots_txt(robots_txt_url.origin(), robots_txt);
//! assert_eq!(robots_txt.get_warnings().len(), 0);
//! let robots_txt = robots_txt.get_result();
//! let good_url = Url::parse("http://google.com/test").unwrap();
//! let bad_url = Url::parse("http://google.com/search/vvv").unwrap();
//! let good_url = Url::parse("https://google.com/test").unwrap();
//! let bad_url = Url::parse("https://google.com/search/vvv").unwrap();
//! assert_eq!(robots_txt.can_fetch("*", &bad_url), false);
//! assert_eq!(robots_txt.can_fetch("*", &good_url), true);
//! }

View file

@ -6,15 +6,15 @@ use url::Url;
const AGENT: &'static str = "test_robotparser";
fn robot_test(doc: &str, good_urls: Vec<&str>, bad_urls: Vec<&str>, agent: &str) {
let url = Url::parse("http://www.baidu.com/robots.txt").unwrap();
let url = Url::parse("https://www.baidu.com/robots.txt").unwrap();
let parser = parse_robots_txt(url.origin(), doc).get_result();
for url in &good_urls {
let url = format!("http://www.baidu.com{}", url);
let url = format!("https://www.baidu.com{}", url);
let url = Url::parse(&url).unwrap();
assert!(parser.can_fetch(agent, &url));
}
for url in &bad_urls {
let url = format!("http://www.baidu.com{}", url);
let url = format!("https://www.baidu.com{}", url);
let url = Url::parse(&url).unwrap();
assert!(!parser.can_fetch(agent, &url));
}
@ -56,7 +56,7 @@ fn test_robots_txt_1() {
#[test]
fn test_robots_txt_2() {
let doc = "\n\
# robots.txt for http://www.example.com/\n\
# robots.txt for https://www.example.com/\n\
\n\
User-agent: *\n\
Disallow: /cyberworld/map/ # This is an infinite virtual URL space\n\
@ -249,7 +249,7 @@ fn test_robots_txt_read() {
use robotparser::http::{CreateRobotsTxtRequest, ParseRobotsTxtResponse};
use reqwest::{Client, Request};
let http_client = Client::new();
let url = Url::parse("http://www.python.org/robots.txt").unwrap();
let url = Url::parse("https://www.python.org/robots.txt").unwrap();
let request = Request::create_robots_txt_request(url.origin());
let mut response = http_client.execute(request).unwrap();
let parser = response.parse_robots_txt_response().unwrap().get_result();
@ -258,7 +258,7 @@ fn test_robots_txt_read() {
#[test]
fn test_robots_text_crawl_delay() {
let robots_txt_url = Url::parse("http://www.python.org/robots.txt").unwrap();
let robots_txt_url = Url::parse("https://www.python.org/robots.txt").unwrap();
let doc = "User-agent: Yandex\n\
Crawl-delay: 2.35\n\
Disallow: /search/\n";
@ -268,18 +268,18 @@ fn test_robots_text_crawl_delay() {
#[test]
fn test_robots_text_sitemaps() {
let robots_txt_url = Url::parse("http://www.python.org/robots.txt").unwrap();
let robots_txt_url = Url::parse("https://www.python.org/robots.txt").unwrap();
let doc = "User-agent: Yandex\n\
Sitemap \t : http://example.com/sitemap1.xml\n
Sitemap: http://example.com/sitemap2.xml\n
Sitemap: http://example.com/sitemap3.xml\n
Sitemap \t : https://example.com/sitemap1.xml\n
Sitemap: https://example.com/sitemap2.xml\n
Sitemap: https://example.com/sitemap3.xml\n
Disallow: /search/\n";
let parser = parse_robots_txt(robots_txt_url.origin(), doc).get_result();
assert_eq!(
&[
Url::parse("http://example.com/sitemap1.xml").unwrap(),
Url::parse("http://example.com/sitemap2.xml").unwrap(),
Url::parse("http://example.com/sitemap3.xml").unwrap()
Url::parse("https://example.com/sitemap1.xml").unwrap(),
Url::parse("https://example.com/sitemap2.xml").unwrap(),
Url::parse("https://example.com/sitemap3.xml").unwrap()
],
parser.get_sitemaps()
);
@ -287,7 +287,7 @@ fn test_robots_text_sitemaps() {
#[test]
fn test_robots_text_request_rate() {
let robots_txt_url = Url::parse("http://www.python.org/robots.txt").unwrap();
let robots_txt_url = Url::parse("https://www.python.org/robots.txt").unwrap();
let doc =
"User-agent: Yandex\n\
Request-rate: 3/15\n\
@ -313,15 +313,15 @@ Clean-param: gid\n\
Clean-param: tm\n\
Clean-param: amp\n\
";
let url = Url::parse("http://www.baidu.com/robots.txt").unwrap();
let url = Url::parse("https://www.baidu.com/robots.txt").unwrap();
let parser = parse_robots_txt(url.origin(), doc).get_result();
let mut site_url = Url::parse("http://www.baidu.com/test?post_id=7777&mode=99&from=google&pid=99&gid=88&tm=777&amp=1").unwrap();
let mut site_url = Url::parse("https://www.baidu.com/test?post_id=7777&mode=99&from=google&pid=99&gid=88&tm=777&amp=1").unwrap();
let was_updated = parser.normalize_url(&mut site_url);
assert_eq!(was_updated, true);
assert_eq!(site_url.as_str(), "http://www.baidu.com/test?post_id=7777");
assert_eq!(site_url.as_str(), "https://www.baidu.com/test?post_id=7777");
let mut site_url = Url::parse("http://www.google.com/test?post_id=7777&mode=99&from=google&pid=99&gid=88&tm=777&amp=1").unwrap();
let mut site_url = Url::parse("https://www.google.com/test?post_id=7777&mode=99&from=google&pid=99&gid=88&tm=777&amp=1").unwrap();
let was_updated = parser.normalize_url(&mut site_url);
assert_eq!(was_updated, false);
assert_eq!(site_url.as_str(), "http://www.google.com/test?post_id=7777&mode=99&from=google&pid=99&gid=88&tm=777&amp=1");
assert_eq!(site_url.as_str(), "https://www.google.com/test?post_id=7777&mode=99&from=google&pid=99&gid=88&tm=777&amp=1");
}

View file

@ -8,9 +8,9 @@ use tokio::runtime::Runtime;
// Drives the future returned by `client.fetch_robots_txt` to completion on a `Runtime`
// and checks that the resulting rules allow the "*" user agent to fetch the robots.txt URL.
// NOTE(review): presumably this performs a live request to www.python.org — confirm before
// running in an offline environment.
fn test_reqwest_async() {
let mut runtime = Runtime::new().unwrap();
let client = Client::new();
// NOTE(review): each URL binding appears twice (`http://` then `https://`). This looks like
// an unmarked before/after pair from the commit view; as plain Rust the second `let`
// shadows the first, so only the `https://` value is used.
let robots_txt_url = Url::parse("http://www.python.org/robots.txt").unwrap();
let robots_txt_url = Url::parse("https://www.python.org/robots.txt").unwrap();
// `block_on` waits synchronously for the fetch future, so the test body needs no `async`.
let robots_txt_response = runtime.block_on(client.fetch_robots_txt(robots_txt_url.origin()));
// `unwrap()` fails the test if the fetch did not succeed; `get_result()` extracts the rules.
let robots_txt = robots_txt_response.unwrap().get_result();
let fetch_url = Url::parse("http://www.python.org/robots.txt").unwrap();
let fetch_url = Url::parse("https://www.python.org/robots.txt").unwrap();
assert!(robots_txt.can_fetch("*", &fetch_url));
}

View file

@ -6,8 +6,8 @@ use url::Url;
// Fetches robots.txt through the client's synchronous `fetch_robots_txt` call and checks
// that the resulting rules allow the "*" user agent to fetch the robots.txt URL itself.
// NOTE(review): presumably this performs a live request to www.python.org — confirm before
// running in an offline environment.
#[test]
fn test_reqwest_blocking() {
let client = Client::new();
// NOTE(review): each URL binding appears twice (`http://` then `https://`). This looks like
// an unmarked before/after pair from the commit view; as plain Rust the second `let`
// shadows the first, so only the `https://` value is used.
let robots_txt_url = Url::parse("http://www.python.org/robots.txt").unwrap();
let robots_txt_url = Url::parse("https://www.python.org/robots.txt").unwrap();
// Only the origin of the URL is passed on; `unwrap()` fails the test if the fetch does not
// succeed, and `get_result()` extracts the parsed rules.
let robots_txt = client.fetch_robots_txt(robots_txt_url.origin()).unwrap().get_result();
let fetch_url = Url::parse("http://www.python.org/robots.txt").unwrap();
let fetch_url = Url::parse("https://www.python.org/robots.txt").unwrap();
assert!(robots_txt.can_fetch("*", &fetch_url));
}

View file

@ -106,7 +106,7 @@ fn test_warning_request_rate() {
#[test]
fn test_warning_parsing_url() {
let input = "User-Agent: *\nSitemap: http://python.org/sitemap.xml";
let input = "User-Agent: *\nSitemap: https://python.org/sitemap.xml";
validate_warnings(input, &[]);
let input = "User-Agent: *\nSitemap: http$$$://python.org/sitemap.xml";
validate_warnings(input, &[WarningReasonKind::ParseUrl]);