class _InvalidContentLength(ValueError):
    """Raised when a ``Content-Length`` header value is not a usable size.

    The offending value is embedded in the exception message. ``bytes``
    values are rendered as ``0x``-prefixed hexadecimal so that arbitrary
    binary data stays printable.
    """

    def __init__(self, value):
        if isinstance(value, bytes):
            value = '0x' + value.hex()
        message = 'Invalid Content-Length header value: {}'.format(value)
        super().__init__(message)


def _parse_content_length(raw):
    """Parse the raw ``bytes`` value of a ``Content-Length`` header.

    Only the first item of a comma-separated value is considered.
    Surrounding ASCII whitespace is ignored; apart from that the item must
    consist solely of ASCII digits, so forms that ``int()`` alone would
    tolerate (``+5``, ``1_000``) are rejected, as are negative numbers.

    Returns the size as a non-negative ``int``.
    Raises ``_InvalidContentLength`` for anything else.
    """
    first = raw.split(b',', 1)[0]
    digits = first.strip()
    # bytes.isdigit() is true only for a non-empty run of ASCII 0-9.
    if digits.isdigit():
        try:
            return int(digits)
        except ValueError:
            # int() may still refuse, e.g. when the interpreter enforces a
            # maximum number of digits; treat that as an invalid header.
            pass
    # latin1 maps every byte to a character, so this decode cannot fail.
    raise _InvalidContentLength(first.decode('latin1'))


def _get_content_length(reply):
    """Return the size announced by the ``Content-Length`` header of *reply*.

    The header name is matched case-insensitively and the first matching
    header wins. Returns ``None`` when the reply has no such header.

    Raises ``_InvalidContentLength`` when the header is present but its
    value is not a plain non-negative decimal integer.
    """
    # NOTE(review): qt_header_items is assumed to yield (name, value) pairs
    # convertible with bytes() -- confirm against splash.qtutils.
    for name, value in qt_header_items(reply):
        if bytes(name).lower() == b'content-length':
            return _parse_content_length(bytes(value))
    return None


def _resolve_size_limit(render_options, log):
    """Work out the effective maximum response size for a request.

    The ``response_size_limit`` rendering option is used when it converts
    to an integer that is not negative and does not exceed
    ``render_options.max_response_size_limit`` (when that is set). An
    unusable option value is reported through *log* and ignored; the limit
    then falls back to ``render_options.max_response_size_limit`` or, when
    that is ``None``, to ``defaults.RESPONSE_SIZE_LIMIT``.

    Returns an ``int``, or ``None`` when no limit applies.
    """
    option = "response_size_limit"
    requested = render_options.get(option, None)
    ceiling = render_options.max_response_size_limit
    limit = None
    if requested is not None:
        try:
            limit = int(requested)
        except (TypeError, ValueError):
            # TypeError covers values int() cannot convert at all (lists,
            # dicts, ...); without it the error would escape the caller.
            log("Non-integer value received for rendering option '{}': {}"
                .format(option, requested), min_level=1)
            log(traceback.format_exc(), min_level=1, format_msg=False)
        else:
            if limit < 0:
                log("The value of rendering option '{}' ({}) must be 0 or "
                    "higher.".format(option, limit), min_level=1)
                limit = None
            elif ceiling is not None and limit > ceiling:
                log("The value of rendering option '{}' ({}) exceeds the "
                    "maximum value allowed.".format(option, limit),
                    min_level=1)
                limit = None
    if limit is not None:
        return limit
    if ceiling is not None:
        return ceiling
    return defaults.RESPONSE_SIZE_LIMIT


def _size_warrants_abort(sizes_and_sources, render_options, log, reply):
    """Tell whether any reported size exceeds the allowed response size.

    *sizes_and_sources* is an iterable of ``(size, source)`` pairs, where
    *size* is a number or ``None`` (unknown, skipped) and *source* is a
    human-readable description used in the log message. *log* is called
    as ``log(message, [reply], min_level=..., format_msg=...)``.

    Returns ``False`` when *render_options* is ``None``, when no limit is
    in effect, or when every known size is within the limit. Returns
    ``True`` (after logging the reason) for the first size over the limit.
    """
    if render_options is None:
        return False
    max_size = _resolve_size_limit(render_options, log)
    if max_size is None:
        return False
    for size, source in sizes_and_sources:
        if size is None or size <= max_size:
            continue
        log("The {} ({}) exceeds the maximum response size ({}), aborting: "
            "{{url}}".format(source, size, max_size), reply, min_level=1)
        log(render_options, reply, min_level=1, format_msg=False)
        return True
    return False
def _size_caused_abort(self, sizes_and_sources):
    """Abort the reply behind the current signal if it is too large.

    The reply is obtained from ``self.sender()`` and the rendering
    options from the reply's request. *sizes_and_sources* is passed
    through unchanged to ``_size_warrants_abort``.

    Returns ``True`` when the reply was aborted, ``False`` otherwise.
    """
    # NOTE(review): relies on being called from a signal handler so that
    # sender() yields the reply -- confirm for any new call site.
    sender = self.sender()
    options = self._get_render_options(sender.request())
    too_big = _size_warrants_abort(
        sizes_and_sources, options, self.log, sender)
    if not too_big:
        return False
    sender.abort()
    return True
max_timeout @classmethod @@ -29,7 +30,7 @@ def raise_error(cls, argument, description, type='bad_argument', **kwargs): raise BadOption(params) @classmethod - def fromrequest(cls, request, max_timeout): + def fromrequest(cls, request, max_timeout, max_response_size_limit=defaults.MAX_RESPONSE_SIZE_LIMIT): """ Initialize options from a Twisted Request. """ @@ -60,7 +61,7 @@ def fromrequest(cls, request, max_timeout): request.content.seek(0) data['uid'] = id(request) - return cls(data, max_timeout) + return cls(data, max_timeout, max_response_size_limit=max_response_size_limit) def get_expired_args(self, cache): """ diff --git a/splash/resources.py b/splash/resources.py index 2c205ef5b..d57bd79e6 100644 --- a/splash/resources.py +++ b/splash/resources.py @@ -17,6 +17,7 @@ import splash from splash.argument_cache import ArgumentCache +from splash import defaults from splash.qtrender import ( HtmlRender, PngRender, JsonRender, HarRender, JpegRender ) @@ -85,17 +86,18 @@ class BaseRenderResource(_ValidatingResource): isLeaf = True content_type = "text/html; charset=utf-8" - def __init__(self, pool, max_timeout, argument_cache): + def __init__(self, pool, max_timeout, argument_cache, max_response_size_limit=defaults.MAX_RESPONSE_SIZE_LIMIT): Resource.__init__(self) self.pool = pool self.js_profiles_path = self.pool.js_profiles_path self.max_timeout = max_timeout self.argument_cache = argument_cache + self.max_response_size_limit = max_response_size_limit def render_GET(self, request): #log.msg("%s %s %s %s" % (id(request), request.method, request.path, request.args)) request.starttime = time.time() - render_options = RenderOptions.fromrequest(request, self.max_timeout) + render_options = RenderOptions.fromrequest(request, self.max_timeout, max_response_size_limit=self.max_response_size_limit) # process argument cache original_options = render_options.data.copy() @@ -281,8 +283,9 @@ def __init__(self, pool, sandboxed, argument_cache, strict, implicit_main, + 
max_response_size_limit=defaults.MAX_RESPONSE_SIZE_LIMIT, ): - BaseRenderResource.__init__(self, pool, max_timeout, argument_cache) + BaseRenderResource.__init__(self, pool, max_timeout, argument_cache, max_response_size_limit=max_response_size_limit) self.sandboxed = sandboxed self.lua_package_path = lua_package_path self.lua_sandbox_allowed_modules = lua_sandbox_allowed_modules @@ -434,20 +437,22 @@ class DemoUI(_ValidatingResource): PATH = b'info' - def __init__(self, pool, lua_enabled, max_timeout): + def __init__(self, pool, lua_enabled, max_timeout, max_response_size_limit=defaults.MAX_RESPONSE_SIZE_LIMIT): Resource.__init__(self) self.pool = pool self.lua_enabled = lua_enabled self.max_timeout = max_timeout + self.max_response_size_limit = max_response_size_limit def _validate_params(self, request): - options = RenderOptions.fromrequest(request, self.max_timeout) + options = RenderOptions.fromrequest(request, self.max_timeout, max_response_size_limit=self.max_response_size_limit) options.get_filters(self.pool) # check params = options.get_common_params(self.pool.js_profiles_path) params.update({ 'save_args': options.get_save_args(), 'load_args': options.get_load_args(), 'timeout': options.get_timeout(), + 'response_size_limit': options.get_response_size_limit(), 'request_body': options.get_request_body(), 'response_body': options.get_response_body(), 'har': 1, @@ -471,6 +476,7 @@ def render_GET(self, request): url = 'http://' + url params['url'] = url timeout = params['timeout'] + response_size_limit = params['response_size_limit'] params = {k: v for k, v in params.items() if v is not None} # disable "phases" HAR Viewer feature @@ -514,6 +520,7 @@ def render_GET(self, request): +