From 94c034f181aeaf7f388a2a7130adde97671a6ad8 Mon Sep 17 00:00:00 2001
From: Taro Sato <okomestudio@gmail.com>
Date: Fri, 27 Jul 2018 13:42:05 -0700
Subject: [PATCH 1/3] Fix a bug where a header item gets ignored when it
 extends over multiple lines

---
 tests/test_http.py | 22 +++++++++++++++++++---
 w3lib/http.py      | 12 +++++++++++-
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/tests/test_http.py b/tests/test_http.py
index 01f903e8..37de9a00 100644
--- a/tests/test_http.py
+++ b/tests/test_http.py
@@ -28,12 +28,28 @@ def test_headers_raw_dict_none(self):
         self.assertIsNone(headers_dict_to_raw(None))
 
     def test_headers_raw_to_dict(self):
-        raw = b"Content-type: text/html\n\rAccept: gzip\n\r\
-                Cache-Control: no-cache\n\rCache-Control: no-store\n\n"
-        dct = {b'Content-type': [b'text/html'], b'Accept': [b'gzip'], 
+        raw = b'\r\n'.join((b"Content-type: text/html",
+                            b"Accept: gzip",
+                            b"Cache-Control: no-cache",
+                            b"Cache-Control: no-store"))
+        dct = {b'Content-type': [b'text/html'], b'Accept': [b'gzip'],
                b'Cache-Control': [b'no-cache', b'no-store']}
         self.assertEqual(headers_raw_to_dict(raw), dct)
 
+    def test_headers_raw_to_dict_multiline(self):
+        raw = b'\r\n'.join((b'Content-Type: multipart/related;',
+                            b'  type="application/xop+xml";',
+                            b'\tboundary="example"',
+                            b'Cache-Control: no-cache'))
+        dct = {
+            b'Content-Type': [
+                b'\r\n'.join((b'multipart/related;',
+                              b'  type="application/xop+xml";',
+                              b'\tboundary="example"'))
+            ],
+            b'Cache-Control': [b'no-cache']}
+        self.assertEqual(headers_raw_to_dict(raw), dct)
+
     def test_headers_dict_to_raw(self):
         dct = OrderedDict([
             (b'Content-type', b'text/html'),
diff --git a/w3lib/http.py b/w3lib/http.py
index c7b94a23..0986b43a 100644
--- a/w3lib/http.py
+++ b/w3lib/http.py
@@ -27,7 +27,17 @@ def headers_raw_to_dict(headers_raw):
 
     if headers_raw is None:
         return None
-    headers = headers_raw.splitlines()
+
+    headers = []
+    for line in headers_raw.split(b'\r\n'):
+        if line.startswith(b' ') or line.startswith(b'\t'):
+            try:
+                headers[-1] += (b'\r\n' + line)
+            except IndexError:
+                raise ValueError('Malformed raw headers')
+        else:
+            headers.append(line)
+
     headers_tuples = [header.split(b':', 1) for header in headers]
 
     result_dict = {}

From f809060fa0e224bceb827ffcadfb749f1bd784ec Mon Sep 17 00:00:00 2001
From: Taro Sato <okomestudio@gmail.com>
Date: Wed, 5 Feb 2020 19:01:58 -0800
Subject: [PATCH 2/3] Add strict parameter to control line parsing behavior

---
 tests/test_http.py | 16 +++++++++++++++-
 w3lib/http.py      | 29 +++++++++++++++++++----------
 2 files changed, 34 insertions(+), 11 deletions(-)

diff --git a/tests/test_http.py b/tests/test_http.py
index 37de9a00..e5d22667 100644
--- a/tests/test_http.py
+++ b/tests/test_http.py
@@ -41,6 +41,20 @@ def test_headers_raw_to_dict_multiline(self):
                             b'  type="application/xop+xml";',
                             b'\tboundary="example"',
                             b'Cache-Control: no-cache'))
+        # With strict=False, the header value that spans across
+        # multiple lines does not get parsed fully, and only the first
+        # line is retained.
+        dct = {b'Content-Type': [b'multipart/related;'],
+               b'Cache-Control': [b'no-cache']}
+        self.assertEqual(headers_raw_to_dict(raw), dct)
+
+    def test_headers_raw_to_dict_multiline_strict(self):
+        raw = b'\r\n'.join((b'Content-Type: multipart/related;',
+                            b'  type="application/xop+xml";',
+                            b'\tboundary="example"',
+                            b'Cache-Control: no-cache'))
+        # With strict=True, the header value that spans across
+        # multiple lines does get parsed fully.
         dct = {
             b'Content-Type': [
                 b'\r\n'.join((b'multipart/related;',
@@ -48,7 +62,7 @@ def test_headers_raw_to_dict_multiline(self):
                               b'\tboundary="example"'))
             ],
             b'Cache-Control': [b'no-cache']}
-        self.assertEqual(headers_raw_to_dict(raw), dct)
+        self.assertEqual(headers_raw_to_dict(raw, strict=True), dct)
 
     def test_headers_dict_to_raw(self):
         dct = OrderedDict([
diff --git a/w3lib/http.py b/w3lib/http.py
index 0986b43a..38528ecd 100644
--- a/w3lib/http.py
+++ b/w3lib/http.py
@@ -1,11 +1,17 @@
 from base64 import urlsafe_b64encode
 
 
-def headers_raw_to_dict(headers_raw):
+def headers_raw_to_dict(headers_raw, strict=False):
     r"""
     Convert raw headers (single multi-line bytestring)
     to a dictionary.
 
+    `strict` is a bool parameter controlling the multi-line parsing behavior.
+    If 'True', only the character sequence '\r\n' is considered as the line
+    delimiter, as per the HTTP specification (e.g., RFC 2616). If 'False'
+    (default), lines are delimited by 'str.splitlines()' and a wider range
+    of character(s) are considered as line boundaries.
+
     For example:
 
     >>> import w3lib.http
@@ -28,15 +34,18 @@ def headers_raw_to_dict(headers_raw):
     if headers_raw is None:
         return None
 
-    headers = []
-    for line in headers_raw.split(b'\r\n'):
-        if line.startswith(b' ') or line.startswith(b'\t'):
-            try:
-                headers[-1] += (b'\r\n' + line)
-            except IndexError:
-                raise ValueError('Malformed raw headers')
-        else:
-            headers.append(line)
+    if strict:
+        headers = []
+        for line in headers_raw.split(b'\r\n'):
+            if line.startswith(b' ') or line.startswith(b'\t'):
+                try:
+                    headers[-1] += (b'\r\n' + line)
+                except IndexError:
+                    raise ValueError('Malformed raw headers')
+            else:
+                headers.append(line)
+    else:
+        headers = headers_raw.splitlines()
 
     headers_tuples = [header.split(b':', 1) for header in headers]
 

From e2e069313cca65c7e903040a484236ffecef6c9e Mon Sep 17 00:00:00 2001
From: Taro Sato <okomestudio@gmail.com>
Date: Tue, 11 Feb 2020 10:48:04 -0800
Subject: [PATCH 3/3] Remove cosmetic parentheses
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Adrián Chaves <adrian@chaves.io>
---
 w3lib/http.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/w3lib/http.py b/w3lib/http.py
index 38528ecd..e5eb236f 100644
--- a/w3lib/http.py
+++ b/w3lib/http.py
@@ -39,7 +39,7 @@ def headers_raw_to_dict(headers_raw, strict=False):
         for line in headers_raw.split(b'\r\n'):
             if line.startswith(b' ') or line.startswith(b'\t'):
                 try:
-                    headers[-1] += (b'\r\n' + line)
+                    headers[-1] += b'\r\n' + line
                 except IndexError:
                     raise ValueError('Malformed raw headers')
             else: