Skip to content

Commit cbe2dd2

Browse files
authored
tidy URL validator implementations (#1788)
1 parent 1cd5ee3 commit cbe2dd2

File tree

4 files changed

+85
-78
lines changed

4 files changed

+85
-78
lines changed

src/url.rs

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -271,16 +271,16 @@ impl PyMultiHostUrl {
271271
// string representation of the URL, with punycode decoded when appropriate
272272
pub fn unicode_string(&self) -> String {
273273
if let Some(extra_urls) = &self.extra_urls {
274-
let schema = self.ref_url.lib_url.scheme();
275-
let host_offset = schema.len() + 3;
274+
let scheme = self.ref_url.lib_url.scheme();
275+
let host_offset = scheme.len() + 3;
276276

277277
let mut full_url = self.ref_url.unicode_string();
278278
full_url.insert(host_offset, ',');
279279

280280
// special urls will have had a trailing slash added, non-special urls will not
281-
// hence we need to remove the last char if the schema is special
281+
// hence we need to remove the last char if the scheme is special
282282
#[allow(clippy::bool_to_int_with_if)]
283-
let sub = if schema_is_special(schema) { 1 } else { 0 };
283+
let sub = if scheme_is_special(scheme) { 1 } else { 0 };
284284

285285
let hosts = extra_urls
286286
.iter()
@@ -299,16 +299,16 @@ impl PyMultiHostUrl {
299299

300300
pub fn __str__(&self) -> String {
301301
if let Some(extra_urls) = &self.extra_urls {
302-
let schema = self.ref_url.lib_url.scheme();
303-
let host_offset = schema.len() + 3;
302+
let scheme = self.ref_url.lib_url.scheme();
303+
let host_offset = scheme.len() + 3;
304304

305305
let mut full_url = self.ref_url.lib_url.to_string();
306306
full_url.insert(host_offset, ',');
307307

308308
// special urls will have had a trailing slash added, non-special urls will not
309-
// hence we need to remove the last char if the schema is special
309+
// hence we need to remove the last char if the scheme is special
310310
#[allow(clippy::bool_to_int_with_if)]
311-
let sub = if schema_is_special(schema) { 1 } else { 0 };
311+
let sub = if scheme_is_special(scheme) { 1 } else { 0 };
312312

313313
let hosts = extra_urls
314314
.iter()
@@ -510,10 +510,10 @@ fn decode_punycode(domain: &str) -> Option<String> {
510510
static PUNYCODE_PREFIX: &str = "xn--";
511511

512512
fn is_punnycode_domain(lib_url: &Url, domain: &str) -> bool {
513-
schema_is_special(lib_url.scheme()) && domain.split('.').any(|part| part.starts_with(PUNYCODE_PREFIX))
513+
scheme_is_special(lib_url.scheme()) && domain.split('.').any(|part| part.starts_with(PUNYCODE_PREFIX))
514514
}
515515

516516
// based on https://github.com/servo/rust-url/blob/1c1e406874b3d2aa6f36c5d2f3a5c2ea74af9efb/url/src/parser.rs#L161-L167
517-
pub fn schema_is_special(schema: &str) -> bool {
518-
matches!(schema, "http" | "https" | "ws" | "wss" | "ftp" | "file")
517+
pub fn scheme_is_special(scheme: &str) -> bool {
518+
matches!(scheme, "http" | "https" | "ws" | "wss" | "ftp" | "file")
519519
}

src/validators/url.rs

Lines changed: 54 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -15,20 +15,21 @@ use crate::errors::ToErrorValue;
1515
use crate::errors::{ErrorType, ErrorTypeDefaults, ValError, ValResult};
1616
use crate::input::downcast_python_input;
1717
use crate::input::Input;
18+
use crate::input::ValidationMatch;
1819
use crate::tools::SchemaDict;
19-
use crate::url::{schema_is_special, PyMultiHostUrl, PyUrl};
20+
use crate::url::{scheme_is_special, PyMultiHostUrl, PyUrl};
2021

2122
use super::literal::expected_repr_name;
2223
use super::Exactness;
2324
use super::{BuildValidator, CombinedValidator, DefinitionsBuilder, ValidationState, Validator};
2425

25-
type AllowedSchemas = Option<(AHashSet<String>, String)>;
26+
type AllowedSchemes = Option<(AHashSet<String>, String)>;
2627

2728
#[derive(Debug, Clone)]
2829
pub struct UrlValidator {
2930
strict: bool,
3031
max_length: Option<usize>,
31-
allowed_schemes: AllowedSchemas,
32+
allowed_schemes: AllowedSchemes,
3233
host_required: bool,
3334
default_host: Option<String>,
3435
default_port: Option<u16>,
@@ -44,7 +45,7 @@ impl BuildValidator for UrlValidator {
4445
config: Option<&Bound<'_, PyDict>>,
4546
_definitions: &mut DefinitionsBuilder<CombinedValidator>,
4647
) -> PyResult<CombinedValidator> {
47-
let (allowed_schemes, name) = get_allowed_schemas(schema, Self::EXPECTED_TYPE)?;
48+
let (allowed_schemes, name) = get_allowed_schemes(schema, Self::EXPECTED_TYPE)?;
4849

4950
Ok(Self {
5051
strict: is_strict(schema, config)?,
@@ -107,31 +108,23 @@ impl Validator for UrlValidator {
107108

108109
impl UrlValidator {
109110
fn get_url<'py>(&self, input: &(impl Input<'py> + ?Sized), strict: bool) -> ValResult<EitherUrl<'py>> {
110-
match input.validate_str(strict, false) {
111-
Ok(val_match) => {
112-
let either_str = val_match.into_inner();
113-
let cow = either_str.as_cow()?;
114-
let url_str = cow.as_ref();
115-
116-
self.check_length(input, url_str)?;
117-
118-
parse_url(url_str, input, strict).map(EitherUrl::Rust)
119-
}
120-
Err(_) => {
121-
// we don't need to worry about whether the url was parsed in strict mode before,
122-
// even if it was, any syntax errors would have been fixed by the first validation
123-
if let Some(py_url) = downcast_python_input::<PyUrl>(input) {
124-
self.check_length(input, py_url.get().url().as_str())?;
125-
Ok(EitherUrl::Py(py_url.clone()))
126-
} else if let Some(multi_host_url) = downcast_python_input::<PyMultiHostUrl>(input) {
127-
let url_str = multi_host_url.get().__str__();
128-
self.check_length(input, &url_str)?;
129-
130-
parse_url(&url_str, input, strict).map(EitherUrl::Rust)
131-
} else {
132-
Err(ValError::new(ErrorTypeDefaults::UrlType, input))
133-
}
134-
}
111+
if let Some(py_url) = downcast_python_input::<PyUrl>(input) {
112+
// we don't need to worry about whether the url was parsed in strict mode before,
113+
// even if it was, any syntax errors would have been fixed by the first validation
114+
self.check_length(input, py_url.get().url().as_str())?;
115+
Ok(EitherUrl::Py(py_url.clone()))
116+
} else if let Some(multi_host_url) = downcast_python_input::<PyMultiHostUrl>(input) {
117+
let url_str = multi_host_url.get().__str__();
118+
self.check_length(input, &url_str)?;
119+
parse_url(&url_str, input, strict).map(EitherUrl::Rust)
120+
} else if let Ok(either_str) = input.validate_str(strict, false).map(ValidationMatch::into_inner) {
121+
let cow = either_str.as_cow()?;
122+
let url_str = cow.as_ref();
123+
124+
self.check_length(input, url_str)?;
125+
parse_url(url_str, input, strict).map(EitherUrl::Rust)
126+
} else {
127+
Err(ValError::new(ErrorTypeDefaults::UrlType, input))
135128
}
136129
}
137130

@@ -192,7 +185,7 @@ impl CopyFromPyUrl for EitherUrl<'_> {
192185
pub struct MultiHostUrlValidator {
193186
strict: bool,
194187
max_length: Option<usize>,
195-
allowed_schemes: AllowedSchemas,
188+
allowed_schemes: AllowedSchemes,
196189
host_required: bool,
197190
default_host: Option<String>,
198191
default_port: Option<u16>,
@@ -208,7 +201,7 @@ impl BuildValidator for MultiHostUrlValidator {
208201
config: Option<&Bound<'_, PyDict>>,
209202
_definitions: &mut DefinitionsBuilder<CombinedValidator>,
210203
) -> PyResult<CombinedValidator> {
211-
let (allowed_schemes, name) = get_allowed_schemas(schema, Self::EXPECTED_TYPE)?;
204+
let (allowed_schemes, name) = get_allowed_schemes(schema, Self::EXPECTED_TYPE)?;
212205

213206
let default_host: Option<String> = schema.get_as(intern!(schema.py(), "default_host"))?;
214207
if let Some(ref default_host) = default_host {
@@ -276,32 +269,26 @@ impl Validator for MultiHostUrlValidator {
276269

277270
impl MultiHostUrlValidator {
278271
fn get_url<'py>(&self, input: &(impl Input<'py> + ?Sized), strict: bool) -> ValResult<EitherMultiHostUrl<'py>> {
279-
match input.validate_str(strict, false) {
280-
Ok(val_match) => {
281-
let either_str = val_match.into_inner();
282-
let cow = either_str.as_cow()?;
283-
let url_str = cow.as_ref();
284-
285-
self.check_length(input, || url_str.len())?;
286-
287-
parse_multihost_url(url_str, input, strict).map(EitherMultiHostUrl::Rust)
288-
}
289-
Err(_) => {
290-
// we don't need to worry about whether the url was parsed in strict mode before,
291-
// even if it was, any syntax errors would have been fixed by the first validation
292-
if let Some(multi_url) = downcast_python_input::<PyMultiHostUrl>(input) {
293-
self.check_length(input, || multi_url.get().__str__().len())?;
294-
Ok(EitherMultiHostUrl::Py(multi_url.clone()))
295-
} else if let Some(py_url) = downcast_python_input::<PyUrl>(input) {
296-
self.check_length(input, || py_url.get().url().as_str().len())?;
297-
Ok(EitherMultiHostUrl::Rust(PyMultiHostUrl::new(
298-
py_url.get().url().clone(),
299-
None,
300-
)))
301-
} else {
302-
Err(ValError::new(ErrorTypeDefaults::UrlType, input))
303-
}
304-
}
272+
// we don't need to worry about whether the url was parsed in strict mode before,
273+
// even if it was, any syntax errors would have been fixed by the first validation
274+
if let Some(multi_url) = downcast_python_input::<PyMultiHostUrl>(input) {
275+
self.check_length(input, || multi_url.get().__str__().len())?;
276+
Ok(EitherMultiHostUrl::Py(multi_url.clone()))
277+
} else if let Some(py_url) = downcast_python_input::<PyUrl>(input) {
278+
self.check_length(input, || py_url.get().url().as_str().len())?;
279+
Ok(EitherMultiHostUrl::Rust(PyMultiHostUrl::new(
280+
py_url.get().url().clone(),
281+
None,
282+
)))
283+
} else if let Ok(either_str) = input.validate_str(strict, false).map(ValidationMatch::into_inner) {
284+
let cow = either_str.as_cow()?;
285+
let url_str = cow.as_ref();
286+
287+
self.check_length(input, || url_str.len())?;
288+
289+
parse_multihost_url(url_str, input, strict).map(EitherMultiHostUrl::Rust)
290+
} else {
291+
Err(ValError::new(ErrorTypeDefaults::UrlType, input))
305292
}
306293
}
307294

@@ -399,24 +386,24 @@ fn parse_multihost_url<'py>(
399386
}
400387
}
401388

402-
// consume the url schema, some logic from `parse_scheme`
389+
// consume the url scheme, some logic from `parse_scheme`
403390
// https://github.com/servo/rust-url/blob/v2.3.1/url/src/parser.rs#L387-L411
404-
let schema_start = chars.position;
405-
let schema_end = loop {
391+
let scheme_start = chars.position;
392+
let scheme_end = loop {
406393
match chars.next() {
407394
Some('a'..='z' | 'A'..='Z' | '0'..='9' | '+' | '-' | '.') => continue,
408395
Some(':') => {
409-
// require the schema to be non-empty
410-
let schema_end = chars.position - ':'.len_utf8();
411-
if schema_end > schema_start {
412-
break schema_end;
396+
// require the scheme to be non-empty
397+
let scheme_end = chars.position - ':'.len_utf8();
398+
if scheme_end > scheme_start {
399+
break scheme_end;
413400
}
414401
}
415402
_ => {}
416403
}
417404
return parsing_err!(ParseError::RelativeUrlWithoutBase);
418405
};
419-
let schema = url_str[schema_start..schema_end].to_ascii_lowercase();
406+
let scheme = url_str[scheme_start..scheme_end].to_ascii_lowercase();
420407

421408
// consume the double slash, or any number of slashes, including backslashes, taken from `parse_with_scheme`
422409
// https://github.com/servo/rust-url/blob/v2.3.1/url/src/parser.rs#L413-L456
@@ -437,7 +424,7 @@ fn parse_multihost_url<'py>(
437424
let mut start = chars.position;
438425
while let Some(c) = chars.next() {
439426
match c {
440-
'\\' if schema_is_special(&schema) => break,
427+
'\\' if scheme_is_special(&scheme) => break,
441428
'/' | '?' | '#' => break,
442429
',' => {
443430
// minus 1 because we know that the last char was a `,` with length 1
@@ -587,7 +574,7 @@ trait CopyFromPyUrl {
587574
fn url_mut(&mut self) -> &mut Url;
588575
}
589576

590-
fn get_allowed_schemas(schema: &Bound<'_, PyDict>, name: &'static str) -> PyResult<(AllowedSchemas, String)> {
577+
fn get_allowed_schemes(schema: &Bound<'_, PyDict>, name: &'static str) -> PyResult<(AllowedSchemes, String)> {
591578
match schema.get_as::<Bound<'_, PyList>>(intern!(schema.py(), "allowed_schemes"))? {
592579
Some(list) => {
593580
if list.is_empty() {

tests/benchmarks/test_micro_benchmarks.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1475,3 +1475,17 @@ def test_enum_str_core(benchmark):
14751475
assert v.validate_python('apple') is FooStr.a
14761476

14771477
benchmark(v.validate_python, 'apple')
1478+
1479+
1480+
@pytest.mark.benchmark(group='url')
1481+
def test_url_core(benchmark):
1482+
v = SchemaValidator(core_schema.url_schema())
1483+
1484+
benchmark(v.validate_python, 'https://example.com/some/path?query=string#fragment')
1485+
1486+
1487+
@pytest.mark.benchmark(group='url')
1488+
def test_multi_host_url_core(benchmark):
1489+
v = SchemaValidator(core_schema.multi_host_url_schema())
1490+
1491+
benchmark(v.validate_python, 'https://example.com,b:[email protected]:777/some/path?query=string#fragment')

tests/validators/test_url.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,12 @@ def url_test_case_helper(
121121
('http://example.com:65535', 'http://example.com:65535/'),
122122
('http:\\\\example.com', 'http://example.com/'),
123123
('http:example.com', 'http://example.com/'),
124+
('http:example.com/path', 'http://example.com/path'),
125+
('http:example.com/path/', 'http://example.com/path/'),
126+
('http:example.com?query=nopath', 'http://example.com/?query=nopath'),
127+
('http:example.com/?query=haspath', 'http://example.com/?query=haspath'),
128+
('http:example.com#nopath', 'http://example.com/#nopath'),
129+
('http:example.com/#haspath', 'http://example.com/#haspath'),
124130
('http://example.com:65536', Err('invalid port number')),
125131
('http://1...1', Err('invalid IPv4 address')),
126132
('https://[2001:0db8:85a3:0000:0000:8a2e:0370:7334[', Err('invalid IPv6 address')),

0 commit comments

Comments
 (0)