From 3312f87ada0020caa7aa82fcce7abf1e992fe686 Mon Sep 17 00:00:00 2001 From: pp-qq <p_qp__q@163.com> Date: Thu, 14 Sep 2017 17:12:12 +0800 Subject: [PATCH 1/4] =?UTF-8?q?fix(url):=20canonicalize=5Furl('http://?= =?UTF-8?q?=E6=82=A8=E5=A5=BD.=E4=B8=AD=E5=9B=BD:80/')=20failed?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit expect: 'http://xn--5usr0o.xn--fiqs8s:80/' actual: 'http://xn--5usr0o.xn--:80-u68dy61b/' --- w3lib/url.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/w3lib/url.py b/w3lib/url.py index 4be74f74..a2025df7 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -374,7 +374,15 @@ def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'): # IDNA encoding can fail for too long labels (>63 characters) # or missing labels (e.g. http://.example.com) try: - netloc = parts.netloc.encode('idna') + idx = parts.netloc.rfind(u':') + if idx != -1: + hostname = parts.netloc[:idx] + portpart = parts.netloc[idx:] + else: + hostname = parts.netloc + portpart = u'' + hostname = to_unicode(hostname.encode('idna')) + netloc = hostname + portpart except UnicodeError: netloc = parts.netloc From d6934daa399cadd52931c165a397834b430b08b7 Mon Sep 17 00:00:00 2001 From: pp-qq <p_qp__q@163.com> Date: Thu, 19 Oct 2017 11:12:36 +0800 Subject: [PATCH 2/4] =?UTF-8?q?fix(url):=20canonicalize=5Furl('http://?= =?UTF-8?q?=E6=82=A8=E5=A5=BD.=E4=B8=AD=E5=9B=BD:80/')=20failed?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit expect: 'http://xn--5usr0o.xn--fiqs8s:80/' actual: 'http://xn--5usr0o.xn--:80-u68dy61b/' --- w3lib/url.py | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/w3lib/url.py b/w3lib/url.py index a2025df7..cc93d534 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -19,6 +19,28 @@ from w3lib.util import to_bytes, to_native_str, to_unicode +def _encode_netloc(onetloc): + """ + :type onetloc: unicode + :rtype: unicode + """ + try: + idx = onetloc.rfind(u':') + if idx != -1: + hostname = onetloc[:idx] + portpart = onetloc[idx:] + else: + hostname = onetloc + portpart = u'' + # assert isinstance(hostname, unicode) + # assert isinstance(portpart, unicode) + hostname = to_unicode(hostname.encode('idna')) + netloc = hostname + portpart + except UnicodeError: + netloc = onetloc + return netloc + + # error handling function for bytes-to-Unicode decoding errors with URLs def _quote_byte(error): return (to_unicode(quote(error.object[error.start:error.end])), error.end) @@ -373,18 +395,7 @@ def parse_data_uri(uri): def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'): # IDNA encoding can fail for too long labels (>63 characters) # or missing labels (e.g. http://.example.com) - try: - idx = parts.netloc.rfind(u':') - if idx != -1: - hostname = parts.netloc[:idx] - portpart = parts.netloc[idx:] - else: - hostname = parts.netloc - portpart = u'' - hostname = to_unicode(hostname.encode('idna')) - netloc = hostname + portpart - except UnicodeError: - netloc = parts.netloc + netloc = _encode_netloc(parts.netloc) return ( to_native_str(parts.scheme), From 0606f77761d4358e6d6daea09f1fcaea9df42d3a Mon Sep 17 00:00:00 2001 From: pp-qq <p_qp__q@163.com> Date: Thu, 19 Oct 2017 11:13:36 +0800 Subject: [PATCH 3/4] fix(url): fix `safe_url_string()` with the change to `_safe_ParseResult()` --- w3lib/url.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/w3lib/url.py b/w3lib/url.py index cc93d534..6b9c2d49 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -83,10 +83,7 @@ def safe_url_string(url, encoding='utf8', path_encoding='utf8'): # IDNA encoding can fail for too long labels (>63 characters) # or missing labels (e.g. http://.example.com) - try: - netloc = parts.netloc.encode('idna') - except UnicodeError: - netloc = parts.netloc + netloc = _encode_netloc(parts.netloc) # quote() in Python2 return type follows input type; # quote() in Python3 always returns Unicode (native str) From 90037c7b24bad952be3554cb9ff09ad1c04aa6cf Mon Sep 17 00:00:00 2001 From: pp-qq <p_qp__q@163.com> Date: Thu, 19 Oct 2017 11:29:10 +0800 Subject: [PATCH 4/4] test(url): add test cases --- tests/test_url.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/test_url.py b/tests/test_url.py index 0df5bfdc..79e2af44 100644 --- a/tests/test_url.py +++ b/tests/test_url.py @@ -146,12 +146,14 @@ def test_safe_url_idna(self): # Japanese (u'http://はじめよう.みんな/?query=サ&maxResults=5', 'http://xn--p8j9a0d9c9a.xn--q9jyb4c/?query=%E3%82%B5&maxResults=5'), + (u'http://はじめよう.みんな:80/?query=サ&maxResults=5', 'http://xn--p8j9a0d9c9a.xn--q9jyb4c:80/?query=%E3%82%B5&maxResults=5'), # Russian (u'http://кто.рф/', 'http://xn--j1ail.xn--p1ai/'), (u'http://кто.рф/index.php?domain=Что', 'http://xn--j1ail.xn--p1ai/index.php?domain=%D0%A7%D1%82%D0%BE'), # Korean + (u'http://내도메인.한국:80/', 'http://xn--220b31d95hq8o.xn--3e0b707e:80/'), (u'http://내도메인.한국/', 'http://xn--220b31d95hq8o.xn--3e0b707e/'), (u'http://맨체스터시티축구단.한국/', 'http://xn--2e0b17htvgtvj9haj53ccob62ni8d.xn--3e0b707e/'), @@ -159,6 +161,8 @@ def test_safe_url_idna(self): (u'http://nic.شبكة', 'http://nic.xn--ngbc5azd'), # Chinese + (u'http://您好.中国/', 'http://xn--5usr0o.xn--fiqs8s/'), + (u'http://您好.中国:80/', 'http://xn--5usr0o.xn--fiqs8s:80/'), (u'https://www.贷款.在线', 'https://www.xn--0kwr83e.xn--3ds443g'), (u'https://www2.xn--0kwr83e.在线', 'https://www2.xn--0kwr83e.xn--3ds443g'), (u'https://www3.贷款.xn--3ds443g', 'https://www3.xn--0kwr83e.xn--3ds443g'), @@ -394,10 +398,15 @@ def test_typical_usage(self): def test_port_number(self): self.assertEqual(canonicalize_url("http://www.example.com:8888/do?a=1&b=2&c=3"), "http://www.example.com:8888/do?a=1&b=2&c=3") + + self.assertEqual(canonicalize_url(u'http://您好.中国:80/'), 'http://xn--5usr0o.xn--fiqs8s:80/') + # trailing empty ports are removed self.assertEqual(canonicalize_url("http://www.example.com:/do?a=1&b=2&c=3"), "http://www.example.com/do?a=1&b=2&c=3") + self.assertEqual(canonicalize_url(u'http://您好.中国:/'), 'http://xn--5usr0o.xn--fiqs8s/') + def test_sorting(self): self.assertEqual(canonicalize_url("http://www.example.com/do?c=3&b=5&b=2&a=50"), "http://www.example.com/do?a=50&b=2&b=5&c=3") @@ -522,10 +531,17 @@ def test_domains_are_case_insensitive(self): def test_canonicalize_idns(self): self.assertEqual(canonicalize_url(u'http://www.bücher.de?q=bücher'), 'http://www.xn--bcher-kva.de/?q=b%C3%BCcher') + + self.assertEqual(canonicalize_url(u'http://www.bücher.de:80?q=bücher'), + 'http://www.xn--bcher-kva.de:80/?q=b%C3%BCcher') + # Japanese (+ reordering query parameters) self.assertEqual(canonicalize_url(u'http://はじめよう.みんな/?query=サ&maxResults=5'), 'http://xn--p8j9a0d9c9a.xn--q9jyb4c/?maxResults=5&query=%E3%82%B5') + self.assertEqual(canonicalize_url(u'http://はじめよう.みんな:80/?query=サ&maxResults=5'), + 'http://xn--p8j9a0d9c9a.xn--q9jyb4c:80/?maxResults=5&query=%E3%82%B5') + def test_quoted_slash_and_question_sign(self): self.assertEqual(canonicalize_url("http://foo.com/AC%2FDC+rocks%3f/?yeah=1"), "http://foo.com/AC%2FDC+rocks%3F/?yeah=1")