Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

warcio and perf #72

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 15 additions & 13 deletions warcio/bufferedreaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def brotli_decompressor():
return decomp

BufferedReader.DECOMPRESSORS['br'] = brotli_decompressor
except ImportError: #pragma: no cover
except Exception: # pragma: no cover
pass


Expand Down Expand Up @@ -157,20 +157,19 @@ def read(self, length=None):
if at buffer boundary, will attempt to read again until
specified length is read
"""
all_buffs = []
# bytes are immutable, in-place concatenation via bytearray avoids the quadratic runtime cost
all_buffs = bytearray()
while length is None or length > 0:
self._fillbuff()
if self.empty():
break

buff = self.buff.read(length)
all_buffs.append(buff)
all_buffs += buff
if length:
length -= len(buff)

return b''.join(all_buffs)


return bytes(all_buffs)

def readline(self, length=None):
"""
Expand All @@ -187,10 +186,12 @@ def readline(self, length=None):
if self.empty():
return b''

linebuff = self.buff.readline(length)
# bytes are immutable, in-place concatenation via bytearray avoids the quadratic runtime cost
linebuff = bytearray(self.buff.readline(length))
newline_b = b'\n'

# we may be at a boundary
while not linebuff.endswith(b'\n'):
while not linebuff.endswith(newline_b):
if length:
length -= len(linebuff)
if length <= 0:
Expand All @@ -203,7 +204,7 @@ def readline(self, length=None):

linebuff += self.buff.readline(length)

return linebuff
return bytes(linebuff)

def empty(self):
if not self.buff or self.buff.tell() >= self.buff_size:
Expand Down Expand Up @@ -336,7 +337,8 @@ def _try_decode(self, length_header):
return

data_len = 0
data = b''
# bytes are immutable, in-place concatenation via bytearray avoids the quadratic runtime cost
data = bytearray()

# read chunk
while data_len < chunk_size:
Expand All @@ -348,7 +350,7 @@ def _try_decode(self, length_header):
if not new_data:
if self.raise_chunked_data_exceptions:
msg = 'Ran out of data before end of chunk'
raise ChunkedDataException(msg, data)
raise ChunkedDataException(msg, bytes(data))
else:
chunk_size = data_len
self.all_chunks_read = True
Expand All @@ -362,10 +364,10 @@ def _try_decode(self, length_header):
clrf = self.stream.read(2)
if clrf != b'\r\n':
raise ChunkedDataException(b"Chunk terminator not found.",
data)
bytes(data))

# hand to base class for further processing
self._process_read(data)
self._process_read(bytes(data))


#=================================================================
Expand Down
10 changes: 6 additions & 4 deletions warcio/capture_http.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,12 +129,14 @@ def _extract_url(self, data):
path = line.split(' ', 2)[1]

scheme = 'https' if self.default_port == 443 else 'http'
url = scheme + '://' + self.host
# strings are immutable, in-place concatenation via list avoids the quadratic runtime cost
url = [scheme, '://', self.host]
if self.port != self.default_port:
url += ':' + str(self.port)
url.append(':')
url.append(str(self.port))

url += path
return url
url.append(path)
return ''.join(url)


# ============================================================================
Expand Down
43 changes: 27 additions & 16 deletions warcio/statusandheaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,26 +134,28 @@ def __bool__(self):
__nonzero__ = __bool__

def to_str(self, filter_func=None):
string = self.protocol
# strings are immutable, in-place concatenation via list avoids the quadratic runtime cost
crlf = '\r\n'
string = [self.protocol]

if string and self.statusline:
string += ' '
if self.protocol and self.statusline:
string.append(' ')

if self.statusline:
string += self.statusline
string.append(self.statusline)

if string:
string += '\r\n'
if self.protocol or self.statusline:
string.append(crlf)

for h in self.headers:
if filter_func:
h = filter_func(h)
if not h:
continue
string.append(': '.join(h))
string.append(crlf)

string += ': '.join(h) + '\r\n'

return string
return ''.join(string)

def to_bytes(self, filter_func=None, encoding='utf-8'):
return self.to_str(filter_func).encode(encoding) + b'\r\n'
Expand Down Expand Up @@ -247,6 +249,14 @@ def parse(self, stream, full_statusline=None):
protocol='',
total_len=total_read)

# strings and tuples are immutable, create these objects before the loop
# in order to only create them once per parse invocation
spacestr = ' '
tabstr = '\t'
strip_space_tab = spacestr + tabstr
colonstr = ':'
split_on_space_or_tab = (spacestr, tabstr)

# validate only if verify is set
if self.verify:
protocol_status = self.split_prefix(statusline, self.statuslist)
Expand All @@ -256,14 +266,15 @@ def parse(self, stream, full_statusline=None):
msg = msg.format(self.statuslist, statusline)
raise StatusAndHeadersParserException(msg, full_statusline)
else:
protocol_status = statusline.split(' ', 1)
protocol_status = statusline.split(spacestr, 1)

line, total_read = _strip_count(self.decode_header(stream.readline()), total_read)
while line:
result = line.split(':', 1)
result = line.split(colonstr, 1)
if len(result) == 2:
name = result[0].rstrip(' \t')
value = result[1].lstrip()
name = result[0].rstrip(strip_space_tab)
# strings are immutable, in-place concatenation via list avoids the quadratic runtime cost
value = [result[1].lstrip()]
else:
name = result[0]
value = None
Expand All @@ -272,14 +283,14 @@ def parse(self, stream, full_statusline=None):
total_read)

# append continuation lines, if any
while next_line and next_line.startswith((' ', '\t')):
while next_line and next_line.startswith(split_on_space_or_tab):
if value is not None:
value += next_line
value.append(next_line)
next_line, total_read = _strip_count(self.decode_header(stream.readline()),
total_read)

if value is not None:
header = (name, value)
header = (name, ''.join(value))
headers.append(header)

line = next_line
Expand Down