mirror of
https://github.com/danbulant/robotparser-rs
synced 2026-05-22 22:09:06 +00:00
Error handling (#24)
* Add test for the case where the URL is invalid and parsing panics
* Initial error handling, ref https://github.com/messense/robotparser-rs/issues/22
* Rename ErrorKind::HttpClient => ErrorKind::Http
* Implement std::error::Error and rename to Error
This commit is contained in:
parent
1474a8cce9
commit
df49f6bcf0
6 changed files with 73 additions and 17 deletions
|
|
@ -1,24 +1,26 @@
|
|||
use reqwest::{Client, Request};
|
||||
use reqwest::{Method, Error};
|
||||
use reqwest::Method;
|
||||
use reqwest::Error as ReqwestError;
|
||||
use reqwest::header::HeaderValue;
|
||||
use url::{Origin, Url};
|
||||
use reqwest::header::USER_AGENT;
|
||||
use crate::http::{RobotsTxtClient, DEFAULT_USER_AGENT};
|
||||
use crate::parser::{ParseResult, parse_fetched_robots_txt};
|
||||
use crate::model::FetchedRobotsTxt;
|
||||
use crate::model::{Error, ErrorKind};
|
||||
use std::pin::Pin;
|
||||
use futures::task::{Context, Poll};
|
||||
use futures::Future;
|
||||
use futures::future::TryFutureExt;
|
||||
use futures::future::ok as future_ok;
|
||||
|
||||
type FetchFuture = Box<dyn Future<Output=Result<(ResponseInfo, String), Error>>>;
|
||||
type FetchFuture = Box<dyn Future<Output=Result<(ResponseInfo, String), ReqwestError>>>;
|
||||
|
||||
impl RobotsTxtClient for Client {
|
||||
type Result = RobotsTxtResponse;
|
||||
type Result = Result<RobotsTxtResponse, Error>;
|
||||
fn fetch_robots_txt(&self, origin: Origin) -> Self::Result {
|
||||
let url = format!("{}/robots.txt", origin.unicode_serialization());
|
||||
let url = Url::parse(&url).expect("Unable to parse robots.txt url");
|
||||
let url = Url::parse(&url).map_err(|err| Error {kind: ErrorKind::Url(err)})?;
|
||||
let mut request = Request::new(Method::GET, url);
|
||||
let _ = request.headers_mut().insert(USER_AGENT, HeaderValue::from_static(DEFAULT_USER_AGENT));
|
||||
let response = self
|
||||
|
|
@ -29,11 +31,11 @@ impl RobotsTxtClient for Client {
|
|||
return future_ok((response_info, response_text));
|
||||
});
|
||||
});
|
||||
let response: Pin<Box<dyn Future<Output=Result<(ResponseInfo, String), Error>>>> = Box::pin(response);
|
||||
return RobotsTxtResponse {
|
||||
let response: Pin<Box<dyn Future<Output=Result<(ResponseInfo, String), ReqwestError>>>> = Box::pin(response);
|
||||
Ok(RobotsTxtResponse {
|
||||
origin,
|
||||
response,
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -55,7 +57,7 @@ impl RobotsTxtResponse {
|
|||
}
|
||||
|
||||
impl Future for RobotsTxtResponse {
|
||||
type Output = Result<ParseResult<FetchedRobotsTxt>, Error>;
|
||||
type Output = Result<ParseResult<FetchedRobotsTxt>, ReqwestError>;
|
||||
|
||||
fn poll(self: Pin<&mut Self>, cx: &mut Context) -> Poll<Self::Output> {
|
||||
let self_mut = self.get_mut();
|
||||
|
|
@ -73,4 +75,4 @@ impl Future for RobotsTxtResponse {
|
|||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,23 +1,24 @@
|
|||
use reqwest::blocking::{Client, Request};
|
||||
use reqwest::{Method, Error};
|
||||
use reqwest::Method;
|
||||
use reqwest::header::HeaderValue;
|
||||
use url::{Origin, Url};
|
||||
use reqwest::header::USER_AGENT;
|
||||
use crate::http::{RobotsTxtClient, DEFAULT_USER_AGENT};
|
||||
use crate::parser::{ParseResult, parse_fetched_robots_txt};
|
||||
use crate::model::FetchedRobotsTxt;
|
||||
use crate::model::{Error, ErrorKind};
|
||||
|
||||
impl RobotsTxtClient for Client {
|
||||
type Result = Result<ParseResult<FetchedRobotsTxt>, Error>;
|
||||
fn fetch_robots_txt(&self, origin: Origin) -> Self::Result {
|
||||
let url = format!("{}/robots.txt", origin.unicode_serialization());
|
||||
let url = Url::parse(&url).expect("Unable to parse robots.txt url");
|
||||
let url = Url::parse(&url).map_err(|err| Error {kind: ErrorKind::Url(err)})?;
|
||||
let mut request = Request::new(Method::GET, url);
|
||||
let _ = request.headers_mut().insert(USER_AGENT, HeaderValue::from_static(DEFAULT_USER_AGENT));
|
||||
let response = self.execute(request)?;
|
||||
let response = self.execute(request).map_err(|err| Error {kind: ErrorKind::Http(err)})?;
|
||||
let status_code = response.status().as_u16();
|
||||
let text = response.text()?;
|
||||
let text = response.text().map_err(|err| Error {kind: ErrorKind::Http(err)})?;
|
||||
let robots_txt = parse_fetched_robots_txt(origin, status_code, &text);
|
||||
return Ok(robots_txt);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -14,4 +14,6 @@ pub (crate) use self::fetched_robots_txt::FetchedRobotsTxtContainer;
|
|||
mod fetched_robots_txt;
|
||||
pub use self::robots_txt::RobotsTxt;
|
||||
mod path;
|
||||
pub (crate) use self::path::Path;
|
||||
pub (crate) use self::path::Path;
|
||||
mod errors;
|
||||
pub use self::errors::{Error, ErrorKind};
|
||||
|
|
|
|||
23
src/model/errors.rs
Normal file
23
src/model/errors.rs
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
use std::fmt;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Error {
|
||||
pub kind: ErrorKind,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum ErrorKind {
|
||||
Url(url::ParseError),
|
||||
Http(reqwest::Error),
|
||||
}
|
||||
|
||||
impl fmt::Display for Error {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
match self.kind {
|
||||
ErrorKind::Url(ref err) => err.fmt(f),
|
||||
ErrorKind::Http(ref err) => err.fmt(f),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::error::Error for Error {}
|
||||
|
|
@ -3,14 +3,28 @@ use robotparser::service::RobotsTxtService;
|
|||
use reqwest::Client;
|
||||
use url::Url;
|
||||
use tokio::runtime::Runtime;
|
||||
use url::{Host, Origin};
|
||||
|
||||
#[test]
|
||||
fn test_reqwest_async() {
|
||||
let mut runtime = Runtime::new().unwrap();
|
||||
let client = Client::new();
|
||||
let robots_txt_url = Url::parse("https://www.python.org/robots.txt").unwrap();
|
||||
let robots_txt_response = runtime.block_on(client.fetch_robots_txt(robots_txt_url.origin()));
|
||||
let robots_txt_response = runtime.block_on(client.fetch_robots_txt(robots_txt_url.origin()).unwrap());
|
||||
let robots_txt = robots_txt_response.unwrap().get_result();
|
||||
let fetch_url = Url::parse("https://www.python.org/robots.txt").unwrap();
|
||||
assert!(robots_txt.can_fetch("*", &fetch_url));
|
||||
}
|
||||
let fetch_url = Url::parse("http://www.python.org/webstats/").unwrap();
|
||||
assert!(!robots_txt.can_fetch("*", &fetch_url));
|
||||
}
|
||||
|
||||
/// Fetching robots.txt for an origin with a malformed host must return an `Err`
/// rather than panic.
// NOTE(review): the name says "blocking" but this test lives next to
// `test_reqwest_async` and uses the async `reqwest::Client` — consider renaming.
#[test]
fn test_reqwest_blocking_panic_url() {
    let client = Client::new();
    // The trailing "::" is meant to make the generated robots.txt URL unparsable.
    let host = Host::Domain("python.org::".into());
    let origin = Origin::Tuple("https".into(), host, 80);
    // Assert on the result itself instead of on `true`/`false` constants so a
    // failure reports a meaningful message.
    assert!(
        client.fetch_robots_txt(origin).is_err(),
        "fetching robots.txt for an invalid origin must return an error"
    );
}
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ use robotparser::http::RobotsTxtClient;
|
|||
use robotparser::service::RobotsTxtService;
|
||||
use reqwest::blocking::Client;
|
||||
use url::Url;
|
||||
use url::{Host, Origin};
|
||||
|
||||
#[test]
|
||||
fn test_reqwest_blocking() {
|
||||
|
|
@ -10,4 +11,17 @@ fn test_reqwest_blocking() {
|
|||
let robots_txt = client.fetch_robots_txt(robots_txt_url.origin()).unwrap().get_result();
|
||||
let fetch_url = Url::parse("https://www.python.org/robots.txt").unwrap();
|
||||
assert!(robots_txt.can_fetch("*", &fetch_url));
|
||||
let fetch_url = Url::parse("https://www.python.org/webstats/").unwrap();
|
||||
assert!(!robots_txt.can_fetch("*", &fetch_url));
|
||||
}
|
||||
|
||||
/// Fetching robots.txt for an origin with a malformed host must return an `Err`
/// rather than panic.
#[test]
fn test_reqwest_blocking_panic_url() {
    let client = Client::new();
    // The trailing "::" is meant to make the generated robots.txt URL unparsable.
    let host = Host::Domain("python.org::".into());
    let origin = Origin::Tuple("https".into(), host, 80);
    // Assert on the result itself instead of on `true`/`false` constants so a
    // failure reports a meaningful message.
    assert!(
        client.fetch_robots_txt(origin).is_err(),
        "fetching robots.txt for an invalid origin must return an error"
    );
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue