From 3312f87ada0020caa7aa82fcce7abf1e992fe686 Mon Sep 17 00:00:00 2001
From: pp-qq <p_qp__q@163.com>
Date: Thu, 14 Sep 2017 17:12:12 +0800
Subject: [PATCH 1/4] =?UTF-8?q?fix(url):=20canonicalize=5Furl('http://?=
 =?UTF-8?q?=E6=82=A8=E5=A5=BD.=E4=B8=AD=E5=9B=BD:80/')=20failed?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

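IDNA-encode only the hostname of the netloc; the ":port" suffix is split
off first and appended back unchanged.

expected: 'http://xn--5usr0o.xn--fiqs8s:80/'
actual:   'http://xn--5usr0o.xn--:80-u68dy61b/'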
---
 w3lib/url.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/w3lib/url.py b/w3lib/url.py
index 4be74f74..a2025df7 100644
--- a/w3lib/url.py
+++ b/w3lib/url.py
@@ -374,7 +374,15 @@ def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'):
     # IDNA encoding can fail for too long labels (>63 characters)
     # or missing labels (e.g. http://.example.com)
     try:
-        netloc = parts.netloc.encode('idna')
+        idx = parts.netloc.rfind(u':')
+        if idx != -1:
+            hostname = parts.netloc[:idx]
+            portpart = parts.netloc[idx:]
+        else:
+            hostname = parts.netloc
+            portpart = u''
+        hostname = to_unicode(hostname.encode('idna'))
+        netloc = hostname + portpart
     except UnicodeError:
         netloc = parts.netloc
 

From d6934daa399cadd52931c165a397834b430b08b7 Mon Sep 17 00:00:00 2001
From: pp-qq <p_qp__q@163.com>
Date: Thu, 19 Oct 2017 11:12:36 +0800
Subject: [PATCH 2/4] =?UTF-8?q?fix(url):=20canonicalize=5Furl('http://?=
 =?UTF-8?q?=E6=82=A8=E5=A5=BD.=E4=B8=AD=E5=9B=BD:80/')=20failed?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

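Move the hostname/port splitting out of `_safe_ParseResult()` into a
module-level `_encode_netloc()` helper.

expected: 'http://xn--5usr0o.xn--fiqs8s:80/'
actual:   'http://xn--5usr0o.xn--:80-u68dy61b/'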
---
 w3lib/url.py | 35 +++++++++++++++++++++++------------
 1 file changed, 23 insertions(+), 12 deletions(-)

diff --git a/w3lib/url.py b/w3lib/url.py
index a2025df7..cc93d534 100644
--- a/w3lib/url.py
+++ b/w3lib/url.py
@@ -19,6 +19,28 @@
 from w3lib.util import to_bytes, to_native_str, to_unicode
 
 
+def _encode_netloc(onetloc):
+    """IDNA-encode the host part of a netloc, keeping userinfo and port as-is.
+
+    Returns ``onetloc`` unchanged when the host cannot be IDNA-encoded.
+
+    :type onetloc: unicode
+    :rtype: unicode
+    """
+    # "user:pass@host" holds colons that are not port separators, so peel
+    # the userinfo off before looking for the port.
+    userinfo, at, hostport = onetloc.rpartition(u'@')
+    hostname, colon, port = hostport.rpartition(u':')
+    if not colon or u']' in port:
+        # no port at all, or the last colon is inside an IPv6 literal
+        hostname, colon, port = hostport, u'', u''
+    try:
+        hostname = to_unicode(hostname.encode('idna'))
+    except UnicodeError:
+        return onetloc
+    return userinfo + at + hostname + colon + port
+
+
 # error handling function for bytes-to-Unicode decoding errors with URLs
 def _quote_byte(error):
     return (to_unicode(quote(error.object[error.start:error.end])), error.end)
@@ -373,18 +395,7 @@ def parse_data_uri(uri):
 def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'):
     # IDNA encoding can fail for too long labels (>63 characters)
     # or missing labels (e.g. http://.example.com)
-    try:
-        idx = parts.netloc.rfind(u':')
-        if idx != -1:
-            hostname = parts.netloc[:idx]
-            portpart = parts.netloc[idx:]
-        else:
-            hostname = parts.netloc
-            portpart = u''
-        hostname = to_unicode(hostname.encode('idna'))
-        netloc = hostname + portpart
-    except UnicodeError:
-        netloc = parts.netloc
+    netloc = _encode_netloc(parts.netloc)
 
     return (
         to_native_str(parts.scheme),

From 0606f77761d4358e6d6daea09f1fcaea9df42d3a Mon Sep 17 00:00:00 2001
From: pp-qq <p_qp__q@163.com>
Date: Thu, 19 Oct 2017 11:13:36 +0800
Subject: [PATCH 3/4] fix(url): fix `safe_url_string()` with the change to
 `_safe_ParseResult()`

---
 w3lib/url.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/w3lib/url.py b/w3lib/url.py
index cc93d534..6b9c2d49 100644
--- a/w3lib/url.py
+++ b/w3lib/url.py
@@ -83,10 +83,7 @@ def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
 
     # IDNA encoding can fail for too long labels (>63 characters)
     # or missing labels (e.g. http://.example.com)
-    try:
-        netloc = parts.netloc.encode('idna')
-    except UnicodeError:
-        netloc = parts.netloc
+    netloc = _encode_netloc(parts.netloc)
 
     # quote() in Python2 return type follows input type;
     # quote() in Python3 always returns Unicode (native str)

From 90037c7b24bad952be3554cb9ff09ad1c04aa6cf Mon Sep 17 00:00:00 2001
From: pp-qq <p_qp__q@163.com>
Date: Thu, 19 Oct 2017 11:29:10 +0800
Subject: [PATCH 4/4] test(url): add test cases

---
 tests/test_url.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/tests/test_url.py b/tests/test_url.py
index 0df5bfdc..79e2af44 100644
--- a/tests/test_url.py
+++ b/tests/test_url.py
@@ -146,12 +146,14 @@ def test_safe_url_idna(self):
 
             # Japanese
             (u'http://はじめよう.みんな/?query=サ&maxResults=5', 'http://xn--p8j9a0d9c9a.xn--q9jyb4c/?query=%E3%82%B5&maxResults=5'),
+            (u'http://はじめよう.みんな:80/?query=サ&maxResults=5', 'http://xn--p8j9a0d9c9a.xn--q9jyb4c:80/?query=%E3%82%B5&maxResults=5'),
 
             # Russian
             (u'http://кто.рф/', 'http://xn--j1ail.xn--p1ai/'),
             (u'http://кто.рф/index.php?domain=Что', 'http://xn--j1ail.xn--p1ai/index.php?domain=%D0%A7%D1%82%D0%BE'),
 
             # Korean
+            (u'http://내도메인.한국:80/', 'http://xn--220b31d95hq8o.xn--3e0b707e:80/'),
             (u'http://내도메인.한국/', 'http://xn--220b31d95hq8o.xn--3e0b707e/'),
             (u'http://맨체스터시티축구단.한국/', 'http://xn--2e0b17htvgtvj9haj53ccob62ni8d.xn--3e0b707e/'),
 
@@ -159,6 +161,8 @@ def test_safe_url_idna(self):
             (u'http://nic.شبكة', 'http://nic.xn--ngbc5azd'),
 
             # Chinese
+            (u'http://您好.中国/', 'http://xn--5usr0o.xn--fiqs8s/'),
+            (u'http://您好.中国:80/', 'http://xn--5usr0o.xn--fiqs8s:80/'),
             (u'https://www.贷款.在线', 'https://www.xn--0kwr83e.xn--3ds443g'),
             (u'https://www2.xn--0kwr83e.在线', 'https://www2.xn--0kwr83e.xn--3ds443g'),
             (u'https://www3.贷款.xn--3ds443g', 'https://www3.xn--0kwr83e.xn--3ds443g'),
@@ -394,10 +398,15 @@ def test_typical_usage(self):
     def test_port_number(self):
         self.assertEqual(canonicalize_url("http://www.example.com:8888/do?a=1&b=2&c=3"),
                                           "http://www.example.com:8888/do?a=1&b=2&c=3")
+
+        self.assertEqual(canonicalize_url(u'http://您好.中国:80/'), 'http://xn--5usr0o.xn--fiqs8s:80/')
+
         # trailing empty ports are removed
         self.assertEqual(canonicalize_url("http://www.example.com:/do?a=1&b=2&c=3"),
                                           "http://www.example.com/do?a=1&b=2&c=3")
 
+        self.assertEqual(canonicalize_url(u'http://您好.中国:/'), 'http://xn--5usr0o.xn--fiqs8s/')
+
     def test_sorting(self):
         self.assertEqual(canonicalize_url("http://www.example.com/do?c=3&b=5&b=2&a=50"),
                                           "http://www.example.com/do?a=50&b=2&b=5&c=3")
@@ -522,10 +531,17 @@ def test_domains_are_case_insensitive(self):
     def test_canonicalize_idns(self):
         self.assertEqual(canonicalize_url(u'http://www.bücher.de?q=bücher'),
                                            'http://www.xn--bcher-kva.de/?q=b%C3%BCcher')
+
+        self.assertEqual(canonicalize_url(u'http://www.bücher.de:80?q=bücher'),
+                                           'http://www.xn--bcher-kva.de:80/?q=b%C3%BCcher')
+
         # Japanese (+ reordering query parameters)
         self.assertEqual(canonicalize_url(u'http://はじめよう.みんな/?query=サ&maxResults=5'),
                                            'http://xn--p8j9a0d9c9a.xn--q9jyb4c/?maxResults=5&query=%E3%82%B5')
 
+        self.assertEqual(canonicalize_url(u'http://はじめよう.みんな:80/?query=サ&maxResults=5'),
+                                           'http://xn--p8j9a0d9c9a.xn--q9jyb4c:80/?maxResults=5&query=%E3%82%B5')
+
     def test_quoted_slash_and_question_sign(self):
         self.assertEqual(canonicalize_url("http://foo.com/AC%2FDC+rocks%3f/?yeah=1"),
                          "http://foo.com/AC%2FDC+rocks%3F/?yeah=1")