diff --git a/tests/test_http.py b/tests/test_http.py index 01f903e8..e5d22667 100644 --- a/tests/test_http.py +++ b/tests/test_http.py @@ -28,12 +28,42 @@ def test_headers_raw_dict_none(self): self.assertIsNone(headers_dict_to_raw(None)) def test_headers_raw_to_dict(self): - raw = b"Content-type: text/html\n\rAccept: gzip\n\r\ - Cache-Control: no-cache\n\rCache-Control: no-store\n\n" - dct = {b'Content-type': [b'text/html'], b'Accept': [b'gzip'], + raw = b'\r\n'.join((b"Content-type: text/html", + b"Accept: gzip", + b"Cache-Control: no-cache", + b"Cache-Control: no-store")) + dct = {b'Content-type': [b'text/html'], b'Accept': [b'gzip'], b'Cache-Control': [b'no-cache', b'no-store']} self.assertEqual(headers_raw_to_dict(raw), dct) + def test_headers_raw_to_dict_multiline(self): + raw = b'\r\n'.join((b'Content-Type: multipart/related;', + b' type="application/xop+xml";', + b'\tboundary="example"', + b'Cache-Control: no-cache')) + # With strict=False, the header value that spans across + # multiple lines does not get parsed fully, and only the first + # line is retained. + dct = {b'Content-Type': [b'multipart/related;'], + b'Cache-Control': [b'no-cache']} + self.assertEqual(headers_raw_to_dict(raw), dct) + + def test_headers_raw_to_dict_multiline_strict(self): + raw = b'\r\n'.join((b'Content-Type: multipart/related;', + b' type="application/xop+xml";', + b'\tboundary="example"', + b'Cache-Control: no-cache')) + # With strict=True, the header value that spans across + # multiple lines does get parsed fully. + dct = { + b'Content-Type': [ + b'\r\n'.join((b'multipart/related;', + b' type="application/xop+xml";', + b'\tboundary="example"')) + ], + b'Cache-Control': [b'no-cache']} + self.assertEqual(headers_raw_to_dict(raw, strict=True), dct) + def test_headers_dict_to_raw(self): dct = OrderedDict([ (b'Content-type', b'text/html'), diff --git a/w3lib/http.py b/w3lib/http.py index c7b94a23..e5eb236f 100644 --- a/w3lib/http.py +++ b/w3lib/http.py @@ -1,11 +1,17 @@ from base64 import urlsafe_b64encode -def headers_raw_to_dict(headers_raw): +def headers_raw_to_dict(headers_raw, strict=False): r""" Convert raw headers (single multi-line bytestring) to a dictionary. + `strict` is a bool parameter controlling the multi-line parsing behavior. + If 'True', only the character sequence '\r\n' is considered as the line + delimiter, as per the HTTP specification (e.g., RFC 2616). If 'False' + (default), lines are delimited by 'str.splitlines()' and a wider range + of character(s) are considered as line boundaries. + For example: >>> import w3lib.http @@ -27,7 +33,20 @@ def headers_raw_to_dict(headers_raw): if headers_raw is None: return None - headers = headers_raw.splitlines() + + if strict: + headers = [] + for line in headers_raw.split(b'\r\n'): + if line.startswith(b' ') or line.startswith(b'\t'): + try: + headers[-1] += b'\r\n' + line + except IndexError: + raise ValueError('Malformed raw headers') + else: + headers.append(line) + else: + headers = headers_raw.splitlines() + headers_tuples = [header.split(b':', 1) for header in headers] result_dict = {}