Skip to content

Commit 8fb6994

Browse files
committed
Refactored the sections of warcio that concatenate strings/bytes heavily inside loops, to gain performance (https://docs.python.org/3/faq/programming.html#id37)
Changed the exception caught in try_brotli_init from ImportError to Exception, because of the finding that led to pywb PR #444 (webrecorder/pywb#444)
1 parent 7f533c8 commit 8fb6994

File tree

3 files changed

+48
-33
lines changed

3 files changed

+48
-33
lines changed

warcio/bufferedreaders.py

+15-13
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ def brotli_decompressor():
3232
return decomp
3333

3434
BufferedReader.DECOMPRESSORS['br'] = brotli_decompressor
35-
except ImportError: #pragma: no cover
35+
except Exception: # pragma: no cover
3636
pass
3737

3838

@@ -157,20 +157,19 @@ def read(self, length=None):
157157
if at buffer boundary, will attempt to read again until
158158
specified length is read
159159
"""
160-
all_buffs = []
160+
# bytes are immutable, in-place concatenation via bytearray avoids the quadratic runtime cost
161+
all_buffs = bytearray()
161162
while length is None or length > 0:
162163
self._fillbuff()
163164
if self.empty():
164165
break
165166

166167
buff = self.buff.read(length)
167-
all_buffs.append(buff)
168+
all_buffs += buff
168169
if length:
169170
length -= len(buff)
170171

171-
return b''.join(all_buffs)
172-
173-
172+
return bytes(all_buffs)
174173

175174
def readline(self, length=None):
176175
"""
@@ -187,10 +186,12 @@ def readline(self, length=None):
187186
if self.empty():
188187
return b''
189188

190-
linebuff = self.buff.readline(length)
189+
# bytes are immutable, in-place concatenation via bytearray avoids the quadratic runtime cost
190+
linebuff = bytearray(self.buff.readline(length))
191+
newline_b = b'\n'
191192

192193
# we may be at a boundary
193-
while not linebuff.endswith(b'\n'):
194+
while not linebuff.endswith(newline_b):
194195
if length:
195196
length -= len(linebuff)
196197
if length <= 0:
@@ -203,7 +204,7 @@ def readline(self, length=None):
203204

204205
linebuff += self.buff.readline(length)
205206

206-
return linebuff
207+
return bytes(linebuff)
207208

208209
def empty(self):
209210
if not self.buff or self.buff.tell() >= self.buff_size:
@@ -336,7 +337,8 @@ def _try_decode(self, length_header):
336337
return
337338

338339
data_len = 0
339-
data = b''
340+
# bytes are immutable, in-place concatenation via bytearray avoids the quadratic runtime cost
341+
data = bytearray()
340342

341343
# read chunk
342344
while data_len < chunk_size:
@@ -348,7 +350,7 @@ def _try_decode(self, length_header):
348350
if not new_data:
349351
if self.raise_chunked_data_exceptions:
350352
msg = 'Ran out of data before end of chunk'
351-
raise ChunkedDataException(msg, data)
353+
raise ChunkedDataException(msg, bytes(data))
352354
else:
353355
chunk_size = data_len
354356
self.all_chunks_read = True
@@ -362,10 +364,10 @@ def _try_decode(self, length_header):
362364
clrf = self.stream.read(2)
363365
if clrf != b'\r\n':
364366
raise ChunkedDataException(b"Chunk terminator not found.",
365-
data)
367+
bytes(data))
366368

367369
# hand to base class for further processing
368-
self._process_read(data)
370+
self._process_read(bytes(data))
369371

370372

371373
#=================================================================

warcio/capture_http.py

+6-4
Original file line numberDiff line numberDiff line change
@@ -129,12 +129,14 @@ def _extract_url(self, data):
129129
path = line.split(' ', 2)[1]
130130

131131
scheme = 'https' if self.default_port == 443 else 'http'
132-
url = scheme + '://' + self.host
132+
# strings are immutable, in-place concatenation via list avoids the quadratic runtime cost
133+
url = [scheme, '://', self.host]
133134
if self.port != self.default_port:
134-
url += ':' + str(self.port)
135+
url.append(':')
136+
url.append(str(self.port))
135137

136-
url += path
137-
return url
138+
url.append(path)
139+
return ''.join(url)
138140

139141

140142
# ============================================================================

warcio/statusandheaders.py

+27-16
Original file line numberDiff line numberDiff line change
@@ -134,26 +134,28 @@ def __bool__(self):
134134
__nonzero__ = __bool__
135135

136136
def to_str(self, filter_func=None):
137-
string = self.protocol
137+
# strings are immutable, in-place concatenation via list avoids the quadratic runtime cost
138+
crlf = '\r\n'
139+
string = [self.protocol]
138140

139-
if string and self.statusline:
140-
string += ' '
141+
if self.protocol and self.statusline:
142+
string.append(' ')
141143

142144
if self.statusline:
143-
string += self.statusline
145+
string.append(self.statusline)
144146

145-
if string:
146-
string += '\r\n'
147+
if self.protocol or self.statusline:
148+
string.append(crlf)
147149

148150
for h in self.headers:
149151
if filter_func:
150152
h = filter_func(h)
151153
if not h:
152154
continue
155+
string.append(': '.join(h))
156+
string.append(crlf)
153157

154-
string += ': '.join(h) + '\r\n'
155-
156-
return string
158+
return ''.join(string)
157159

158160
def to_bytes(self, filter_func=None, encoding='utf-8'):
159161
return self.to_str(filter_func).encode(encoding) + b'\r\n'
@@ -247,6 +249,14 @@ def parse(self, stream, full_statusline=None):
247249
protocol='',
248250
total_len=total_read)
249251

252+
# strings and tuples are immutable, create these objects before the loop
253+
# in order to only create them once per parse invocation
254+
spacestr = ' '
255+
tabstr = '\t'
256+
strip_space_tab = spacestr + tabstr
257+
colonstr = ':'
258+
split_on_space_or_tab = (spacestr, tabstr)
259+
250260
# validate only if verify is set
251261
if self.verify:
252262
protocol_status = self.split_prefix(statusline, self.statuslist)
@@ -256,14 +266,15 @@ def parse(self, stream, full_statusline=None):
256266
msg = msg.format(self.statuslist, statusline)
257267
raise StatusAndHeadersParserException(msg, full_statusline)
258268
else:
259-
protocol_status = statusline.split(' ', 1)
269+
protocol_status = statusline.split(spacestr, 1)
260270

261271
line, total_read = _strip_count(self.decode_header(stream.readline()), total_read)
262272
while line:
263-
result = line.split(':', 1)
273+
result = line.split(colonstr, 1)
264274
if len(result) == 2:
265-
name = result[0].rstrip(' \t')
266-
value = result[1].lstrip()
275+
name = result[0].rstrip(strip_space_tab)
276+
# strings are immutable, in-place concatenation via list avoids the quadratic runtime cost
277+
value = [result[1].lstrip()]
267278
else:
268279
name = result[0]
269280
value = None
@@ -272,14 +283,14 @@ def parse(self, stream, full_statusline=None):
272283
total_read)
273284

274285
# append continuation lines, if any
275-
while next_line and next_line.startswith((' ', '\t')):
286+
while next_line and next_line.startswith(split_on_space_or_tab):
276287
if value is not None:
277-
value += next_line
288+
value.append(next_line)
278289
next_line, total_read = _strip_count(self.decode_header(stream.readline()),
279290
total_read)
280291

281292
if value is not None:
282-
header = (name, value)
293+
header = (name, ''.join(value))
283294
headers.append(header)
284295

285296
line = next_line

0 commit comments

Comments
 (0)