Refactored warcio's sections that heavily concatenate strings/bytes inside loops to gain performance increases (https://docs.python.org/3/faq/programming.html#id37)

N0taN3rd · N0taN3rd · commit 8fb69948a572 · 2019-02-20T12:07:19.000-05:00
Changed the caught exception in try_brotli_init from ImportError to Exception due to the finding that caused pywb PR #444 (webrecorder/pywb#444)
diff --git a/warcio/bufferedreaders.py b/warcio/bufferedreaders.py
@@ -32,7 +32,7 @@ def brotli_decompressor():
             return decomp
 
         BufferedReader.DECOMPRESSORS['br'] = brotli_decompressor
-    except ImportError:  #pragma: no cover
+    except Exception:  # pragma: no cover
         pass
 
 
@@ -157,20 +157,19 @@ def read(self, length=None):
         if at buffer boundary, will attempt to read again until
         specified length is read
         """
-        all_buffs = []
+        # bytes are immutable, in-place concatenation via bytearray avoids the quadratic runtime cost
+        all_buffs = bytearray()
         while length is None or length > 0:
             self._fillbuff()
             if self.empty():
                 break
 
             buff = self.buff.read(length)
-            all_buffs.append(buff)
+            all_buffs += buff
             if length:
                 length -= len(buff)
 
-        return b''.join(all_buffs)
-
-
+        return bytes(all_buffs)
 
     def readline(self, length=None):
         """
@@ -187,10 +186,12 @@ def readline(self, length=None):
         if self.empty():
             return b''
 
-        linebuff = self.buff.readline(length)
+        # bytes are immutable, in-place concatenation via bytearray avoids the quadratic runtime cost
+        linebuff = bytearray(self.buff.readline(length))
+        newline_b = b'\n'
 
         # we may be at a boundary
-        while not linebuff.endswith(b'\n'):
+        while not linebuff.endswith(newline_b):
             if length:
                 length -= len(linebuff)
                 if length <= 0:
@@ -203,7 +204,7 @@ def readline(self, length=None):
 
             linebuff += self.buff.readline(length)
 
-        return linebuff
+        return bytes(linebuff)
 
     def empty(self):
         if not self.buff or self.buff.tell() >= self.buff_size:
@@ -336,7 +337,8 @@ def _try_decode(self, length_header):
             return
 
         data_len = 0
-        data = b''
+        # bytes are immutable, in-place concatenation via bytearray avoids the quadratic runtime cost
+        data = bytearray()
 
         # read chunk
         while data_len < chunk_size:
@@ -348,7 +350,7 @@ def _try_decode(self, length_header):
             if not new_data:
                 if self.raise_chunked_data_exceptions:
                     msg = 'Ran out of data before end of chunk'
-                    raise ChunkedDataException(msg, data)
+                    raise ChunkedDataException(msg, bytes(data))
                 else:
                     chunk_size = data_len
                     self.all_chunks_read = True
@@ -362,10 +364,10 @@ def _try_decode(self, length_header):
             clrf = self.stream.read(2)
             if clrf != b'\r\n':
                 raise ChunkedDataException(b"Chunk terminator not found.",
-                                           data)
+                                           bytes(data))
 
         # hand to base class for further processing
-        self._process_read(data)
+        self._process_read(bytes(data))
 
 
 #=================================================================
diff --git a/warcio/capture_http.py b/warcio/capture_http.py
@@ -129,12 +129,14 @@ def _extract_url(self, data):
         path = line.split(' ', 2)[1]
 
         scheme = 'https' if self.default_port == 443 else 'http'
-        url = scheme + '://' + self.host
+        # strings are immutable, in-place concatenation via list avoids the quadratic runtime cost
+        url = [scheme, '://', self.host]
         if self.port != self.default_port:
-            url += ':' + str(self.port)
+            url.append(':')
+            url.append(str(self.port))
 
-        url += path
-        return url
+        url.append(path)
+        return ''.join(url)
 
 
 # ============================================================================
diff --git a/warcio/statusandheaders.py b/warcio/statusandheaders.py
@@ -134,26 +134,28 @@ def __bool__(self):
     __nonzero__ = __bool__
 
     def to_str(self, filter_func=None):
-        string = self.protocol
+        # strings are immutable, in-place concatenation via list avoids the quadratic runtime cost
+        crlf = '\r\n'
+        string = [self.protocol]
 
-        if string and self.statusline:
-            string += ' '
+        if self.protocol and self.statusline:
+            string.append(' ')
 
         if self.statusline:
-            string += self.statusline
+            string.append(self.statusline)
 
-        if string:
-            string += '\r\n'
+        if self.protocol or self.statusline:
+            string.append(crlf)
 
         for h in self.headers:
             if filter_func:
                 h = filter_func(h)
                 if not h:
                     continue
+            string.append(': '.join(h))
+            string.append(crlf)
 
-            string += ': '.join(h) + '\r\n'
-
-        return string
+        return ''.join(string)
 
     def to_bytes(self, filter_func=None, encoding='utf-8'):
         return self.to_str(filter_func).encode(encoding) + b'\r\n'
@@ -247,6 +249,14 @@ def parse(self, stream, full_statusline=None):
                                     protocol='',
                                     total_len=total_read)
 
+        # strings and tuples are immutable, create these objects before the loop
+        # in order to only create them once per parse invocation
+        spacestr = ' '
+        tabstr = '\t'
+        strip_space_tab = spacestr + tabstr
+        colonstr = ':'
+        split_on_space_or_tab = (spacestr, tabstr)
+
         # validate only if verify is set
         if self.verify:
             protocol_status = self.split_prefix(statusline, self.statuslist)
@@ -256,14 +266,15 @@ def parse(self, stream, full_statusline=None):
                 msg = msg.format(self.statuslist, statusline)
                 raise StatusAndHeadersParserException(msg, full_statusline)
         else:
-            protocol_status = statusline.split(' ', 1)
+            protocol_status = statusline.split(spacestr, 1)
 
         line, total_read = _strip_count(self.decode_header(stream.readline()), total_read)
         while line:
-            result = line.split(':', 1)
+            result = line.split(colonstr, 1)
             if len(result) == 2:
-                name = result[0].rstrip(' \t')
-                value = result[1].lstrip()
+                name = result[0].rstrip(strip_space_tab)
+                # strings are immutable, in-place concatenation via list avoids the quadratic runtime cost
+                value = [result[1].lstrip()]
             else:
                 name = result[0]
                 value = None
@@ -272,14 +283,14 @@ def parse(self, stream, full_statusline=None):
                                                  total_read)
 
             # append continuation lines, if any
-            while next_line and next_line.startswith((' ', '\t')):
+            while next_line and next_line.startswith(split_on_space_or_tab):
                 if value is not None:
-                    value += next_line
+                    value.append(next_line)
                 next_line, total_read = _strip_count(self.decode_header(stream.readline()),
                                                      total_read)
 
             if value is not None:
-                header = (name, value)
+                header = (name, ''.join(value))
                 headers.append(header)
 
             line = next_line