From 8fb69948a5729587d9b89363ad6ad1e0e256138e Mon Sep 17 00:00:00 2001 From: John Berlin Date: Wed, 20 Feb 2019 12:07:19 -0500 Subject: [PATCH] Refactored warcio's loop-heavy string/byte concatenation sections to improve performance (https://docs.python.org/3/faq/programming.html#id37). Changed the exception caught in try_brotli_init from ImportError to Exception because of the finding that prompted pywb PR #444 (https://github.com/webrecorder/pywb/pull/444). --- warcio/bufferedreaders.py | 28 +++++++++++++------------ warcio/capture_http.py | 10 +++++---- warcio/statusandheaders.py | 43 ++++++++++++++++++++++++-------------- 3 files changed, 48 insertions(+), 33 deletions(-) diff --git a/warcio/bufferedreaders.py b/warcio/bufferedreaders.py index 5b11522b..19f19288 100644 --- a/warcio/bufferedreaders.py +++ b/warcio/bufferedreaders.py @@ -32,7 +32,7 @@ def brotli_decompressor(): return decomp BufferedReader.DECOMPRESSORS['br'] = brotli_decompressor - except ImportError: #pragma: no cover + except Exception: # pragma: no cover pass @@ -157,20 +157,19 @@ def read(self, length=None): if at buffer boundary, will attempt to read again until specified length is read """ - all_buffs = [] + # bytes are immutable, in-place concatenation via bytearray avoids the quadratic runtime cost + all_buffs = bytearray() while length is None or length > 0: self._fillbuff() if self.empty(): break buff = self.buff.read(length) - all_buffs.append(buff) + all_buffs += buff if length: length -= len(buff) - return b''.join(all_buffs) - - + return bytes(all_buffs) def readline(self, length=None): """ @@ -187,10 +186,12 @@ def readline(self, length=None): if self.empty(): return b'' - linebuff = self.buff.readline(length) + # bytes are immutable, in-place concatenation via bytearray avoids the quadratic runtime cost + linebuff = bytearray(self.buff.readline(length)) + newline_b = b'\n' # we may be at a boundary - while not linebuff.endswith(b'\n'): + while not linebuff.endswith(newline_b): if 
length: length -= len(linebuff) if length <= 0: @@ -203,7 +204,7 @@ def readline(self, length=None): linebuff += self.buff.readline(length) - return linebuff + return bytes(linebuff) def empty(self): if not self.buff or self.buff.tell() >= self.buff_size: @@ -336,7 +337,8 @@ def _try_decode(self, length_header): return data_len = 0 - data = b'' + # bytes are immutable, in-place concatenation via bytearray avoids the quadratic runtime cost + data = bytearray() # read chunk while data_len < chunk_size: @@ -348,7 +350,7 @@ def _try_decode(self, length_header): if not new_data: if self.raise_chunked_data_exceptions: msg = 'Ran out of data before end of chunk' - raise ChunkedDataException(msg, data) + raise ChunkedDataException(msg, bytes(data)) else: chunk_size = data_len self.all_chunks_read = True @@ -362,10 +364,10 @@ def _try_decode(self, length_header): clrf = self.stream.read(2) if clrf != b'\r\n': raise ChunkedDataException(b"Chunk terminator not found.", - data) + bytes(data)) # hand to base class for further processing - self._process_read(data) + self._process_read(bytes(data)) #================================================================= diff --git a/warcio/capture_http.py b/warcio/capture_http.py index f3d1e43a..77ec55fd 100644 --- a/warcio/capture_http.py +++ b/warcio/capture_http.py @@ -129,12 +129,14 @@ def _extract_url(self, data): path = line.split(' ', 2)[1] scheme = 'https' if self.default_port == 443 else 'http' - url = scheme + '://' + self.host + # string are immutable, in-place concatenation via list avoids the quadratic runtime cost + url = [scheme, '://', self.host] if self.port != self.default_port: - url += ':' + str(self.port) + url.append(':') + url.append(str(self.port)) - url += path - return url + url.append(path) + return ''.join(url) # ============================================================================ diff --git a/warcio/statusandheaders.py b/warcio/statusandheaders.py index e0241361..20946387 100644 --- 
a/warcio/statusandheaders.py +++ b/warcio/statusandheaders.py @@ -134,26 +134,28 @@ def __bool__(self): __nonzero__ = __bool__ def to_str(self, filter_func=None): - string = self.protocol + # strings are immutable, in-place concatenation via list avoids the quadratic runtime cost + crlf = '\r\n' + string = [self.protocol] - if string and self.statusline: - string += ' ' + if self.protocol and self.statusline: + string.append(' ') if self.statusline: - string += self.statusline + string.append(self.statusline) - if string: - string += '\r\n' + if self.protocol or self.statusline: + string.append(crlf) for h in self.headers: if filter_func: h = filter_func(h) if not h: continue + string.append(': '.join(h)) + string.append(crlf) - string += ': '.join(h) + '\r\n' - - return string + return ''.join(string) def to_bytes(self, filter_func=None, encoding='utf-8'): return self.to_str(filter_func).encode(encoding) + b'\r\n' @@ -247,6 +249,14 @@ def parse(self, stream, full_statusline=None): protocol='', total_len=total_read) + # strings and tuples are immutable, create these objects before the loop + # in order to only create them once per parse invocation + spacestr = ' ' + tabstr = '\t' + strip_space_tab = spacestr + tabstr + colonstr = ':' + split_on_space_or_tab = (spacestr, tabstr) + # validate only if verify is set if self.verify: protocol_status = self.split_prefix(statusline, self.statuslist) @@ -256,14 +266,15 @@ def parse(self, stream, full_statusline=None): msg = msg.format(self.statuslist, statusline) raise StatusAndHeadersParserException(msg, full_statusline) else: - protocol_status = statusline.split(' ', 1) + protocol_status = statusline.split(spacestr, 1) line, total_read = _strip_count(self.decode_header(stream.readline()), total_read) while line: - result = line.split(':', 1) + result = line.split(colonstr, 1) if len(result) == 2: - name = result[0].rstrip(' \t') - value = result[1].lstrip() + name = result[0].rstrip(strip_space_tab) + # string are 
immutable, in-place concatenation via list avoids the quadratic runtime cost + value = [result[1].lstrip()] else: name = result[0] value = None @@ -272,14 +283,14 @@ def parse(self, stream, full_statusline=None): total_read) # append continuation lines, if any - while next_line and next_line.startswith((' ', '\t')): + while next_line and next_line.startswith(split_on_space_or_tab): if value is not None: - value += next_line + value.append(next_line) next_line, total_read = _strip_count(self.decode_header(stream.readline()), total_read) if value is not None: - header = (name, value) + header = (name, ''.join(value)) headers.append(header) line = next_line