From 94c034f181aeaf7f388a2a7130adde97671a6ad8 Mon Sep 17 00:00:00 2001 From: Taro Sato Date: Fri, 27 Jul 2018 13:42:05 -0700 Subject: [PATCH 1/3] Fix a bug when header item gets ignored when it extends over multiple lines --- tests/test_http.py | 22 +++++++++++++++++++--- w3lib/http.py | 12 +++++++++++- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/tests/test_http.py b/tests/test_http.py index 01f903e8..37de9a00 100644 --- a/tests/test_http.py +++ b/tests/test_http.py @@ -28,12 +28,28 @@ def test_headers_raw_dict_none(self): self.assertIsNone(headers_dict_to_raw(None)) def test_headers_raw_to_dict(self): - raw = b"Content-type: text/html\n\rAccept: gzip\n\r\ - Cache-Control: no-cache\n\rCache-Control: no-store\n\n" - dct = {b'Content-type': [b'text/html'], b'Accept': [b'gzip'], + raw = b'\r\n'.join((b"Content-type: text/html", + b"Accept: gzip", + b"Cache-Control: no-cache", + b"Cache-Control: no-store")) + dct = {b'Content-type': [b'text/html'], b'Accept': [b'gzip'], b'Cache-Control': [b'no-cache', b'no-store']} self.assertEqual(headers_raw_to_dict(raw), dct) + def test_headers_raw_to_dict_multiline(self): + raw = b'\r\n'.join((b'Content-Type: multipart/related;', + b' type="application/xop+xml";', + b'\tboundary="example"', + b'Cache-Control: no-cache')) + dct = { + b'Content-Type': [ + b'\r\n'.join((b'multipart/related;', + b' type="application/xop+xml";', + b'\tboundary="example"')) + ], + b'Cache-Control': [b'no-cache']} + self.assertEqual(headers_raw_to_dict(raw), dct) + def test_headers_dict_to_raw(self): dct = OrderedDict([ (b'Content-type', b'text/html'), diff --git a/w3lib/http.py b/w3lib/http.py index c7b94a23..0986b43a 100644 --- a/w3lib/http.py +++ b/w3lib/http.py @@ -27,7 +27,17 @@ def headers_raw_to_dict(headers_raw): if headers_raw is None: return None - headers = headers_raw.splitlines() + + headers = [] + for line in headers_raw.split(b'\r\n'): + if line.startswith(b' ') or line.startswith(b'\t'): + try: + headers[-1] += (b'\r\n' + line) + except IndexError: + raise ValueError('Malformed raw headers') + else: + headers.append(line) + headers_tuples = [header.split(b':', 1) for header in headers] result_dict = {} From f809060fa0e224bceb827ffcadfb749f1bd784ec Mon Sep 17 00:00:00 2001 From: Taro Sato Date: Wed, 5 Feb 2020 19:01:58 -0800 Subject: [PATCH 2/3] Add strict parameter to control line parsing behavior --- tests/test_http.py | 16 +++++++++++++++- w3lib/http.py | 29 +++++++++++++++++++---------- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/tests/test_http.py b/tests/test_http.py index 37de9a00..e5d22667 100644 --- a/tests/test_http.py +++ b/tests/test_http.py @@ -41,6 +41,20 @@ def test_headers_raw_to_dict_multiline(self): b' type="application/xop+xml";', b'\tboundary="example"', b'Cache-Control: no-cache')) + # With strict=False, the header value that spans across + # multiple lines does not get parsed fully, and only the first + # line is retained. + dct = {b'Content-Type': [b'multipart/related;'], + b'Cache-Control': [b'no-cache']} + self.assertEqual(headers_raw_to_dict(raw), dct) + + def test_headers_raw_to_dict_multiline_strict(self): + raw = b'\r\n'.join((b'Content-Type: multipart/related;', + b' type="application/xop+xml";', + b'\tboundary="example"', + b'Cache-Control: no-cache')) + # With strict=True, the header value that spans across + # multiple lines does get parsed fully. dct = { b'Content-Type': [ b'\r\n'.join((b'multipart/related;', @@ -48,7 +62,7 @@ def test_headers_raw_to_dict_multiline(self): b'\tboundary="example"')) ], b'Cache-Control': [b'no-cache']} - self.assertEqual(headers_raw_to_dict(raw), dct) + self.assertEqual(headers_raw_to_dict(raw, strict=True), dct) def test_headers_dict_to_raw(self): dct = OrderedDict([ diff --git a/w3lib/http.py b/w3lib/http.py index 0986b43a..38528ecd 100644 --- a/w3lib/http.py +++ b/w3lib/http.py @@ -1,11 +1,17 @@ from base64 import urlsafe_b64encode -def headers_raw_to_dict(headers_raw): +def headers_raw_to_dict(headers_raw, strict=False): r""" Convert raw headers (single multi-line bytestring) to a dictionary. + `strict` is a bool parameter controlling the multi-line parsing behavior. + If 'True', only the character sequence '\r\n' is considered as the line + delimiter, as per the HTTP specification (e.g., RFC 2616). If 'False' + (default), lines are delimited by 'str.splitlines()' and a wider range + of character(s) are considered as line boundaries. + For example: >>> import w3lib.http @@ -28,15 +34,18 @@ def headers_raw_to_dict(headers_raw): if headers_raw is None: return None - headers = [] - for line in headers_raw.split(b'\r\n'): - if line.startswith(b' ') or line.startswith(b'\t'): - try: - headers[-1] += (b'\r\n' + line) - except IndexError: - raise ValueError('Malformed raw headers') - else: - headers.append(line) + if strict: + headers = [] + for line in headers_raw.split(b'\r\n'): + if line.startswith(b' ') or line.startswith(b'\t'): + try: + headers[-1] += (b'\r\n' + line) + except IndexError: + raise ValueError('Malformed raw headers') + else: + headers.append(line) + else: + headers = headers_raw.splitlines() headers_tuples = [header.split(b':', 1) for header in headers] From e2e069313cca65c7e903040a484236ffecef6c9e Mon Sep 17 00:00:00 2001 From: Taro Sato Date: Tue, 11 Feb 2020 10:48:04 -0800 Subject: [PATCH 3/3] Remove cosmetic parentheses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Adrián Chaves --- w3lib/http.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/w3lib/http.py b/w3lib/http.py index 38528ecd..e5eb236f 100644 --- a/w3lib/http.py +++ b/w3lib/http.py @@ -39,7 +39,7 @@ def headers_raw_to_dict(headers_raw, strict=False): for line in headers_raw.split(b'\r\n'): if line.startswith(b' ') or line.startswith(b'\t'): try: - headers[-1] += (b'\r\n' + line) + headers[-1] += b'\r\n' + line except IndexError: raise ValueError('Malformed raw headers') else: