diff --git a/tests/test_url.py b/tests/test_url.py
index ca84745..319d76c 100644
--- a/tests/test_url.py
+++ b/tests/test_url.py
@@ -1384,6 +1384,12 @@ def test_domains_are_case_insensitive(self):
             canonicalize_url("http://www.EXAMPLE.com/"), "http://www.example.com/"
         )
 
+    def test_userinfo_is_case_sensitive(self):
+        self.assertEqual(
+            canonicalize_url("sftp://UsEr:PaSsWoRd@www.EXAMPLE.com/"),
+            "sftp://UsEr:PaSsWoRd@www.example.com/",
+        )
+
     def test_canonicalize_idns(self):
         self.assertEqual(
             canonicalize_url("http://www.bücher.de?q=bücher"),
diff --git a/w3lib/url.py b/w3lib/url.py
index 28e70cb..c142048 100644
--- a/w3lib/url.py
+++ b/w3lib/url.py
@@ -654,10 +654,13 @@ def canonicalize_url(
 
     fragment = "" if not keep_fragments else fragment
 
+    # Lowercase only the host (also dropping a trailing ':'); keep userinfo as is.
+    netloc_parts = netloc.split("@")
+    netloc_parts[-1] = netloc_parts[-1].lower().rstrip(":")
+    netloc = "@".join(netloc_parts)
+
     # every part should be safe already
-    return urlunparse(
-        (scheme, netloc.lower().rstrip(":"), path, params, query, fragment)
-    )
+    return urlunparse((scheme, netloc, path, params, query, fragment))
 
 
 def _unquotepath(path: str) -> bytes: