diff --git a/CHANGES.rst b/CHANGES.rst
index ea0af90..00966da 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -6,6 +6,12 @@ TBR
 
 * removed support for Python 3.6
 * added support for Python 3.10
+* added *optional* setting ``AUTOEXTRACT_RESPONSE_ERROR_LOG_LEVEL`` to set the
+  logging level used when logging AutoExtract response errors.
+* added *optional* setting ``AUTOEXTRACT_ALLOWED_RESPONSE_ERRORS``, a
+  collection of error strings from AutoExtract responses for which
+  ``AutoExtractError`` should not be raised *(e.g. a website returns a valid
+  page but with a 404 response status)*.
 
 0.7.0 (2021-08-05)
 ------------------
diff --git a/README.rst b/README.rst
index 730eead..a3e8048 100644
--- a/README.rst
+++ b/README.rst
@@ -239,6 +239,11 @@ Middleware settings
 
 - ``AUTOEXTRACT_SLOT_POLICY`` [optional] Download concurrency options. Defaults to ``SlotPolicy.PER_DOMAIN``
   - If set to ``SlotPolicy.PER_DOMAIN``, then consider setting ``SCHEDULER_PRIORITY_QUEUE = 'scrapy.pqueues.DownloaderAwarePriorityQueue'`` to make better usage of AutoExtract concurrency and avoid delays.
+- ``AUTOEXTRACT_RESPONSE_ERROR_LOG_LEVEL`` [optional] The logging level used when logging errors from AutoExtract
+  responses (default: ``logging.DEBUG``).
+- ``AUTOEXTRACT_ALLOWED_RESPONSE_ERRORS`` [optional] A collection of error strings from AutoExtract responses for
+  which ``AutoExtractError`` should not be raised; the strings defined here are merged with the default values
+  (default: ``{"Downloader error: http404"}``, which prevents valid 404 website responses from erroring out).
 
 Provider settings
 -----------------
diff --git a/scrapy_autoextract/middlewares.py b/scrapy_autoextract/middlewares.py
index eb65b20..ae96fa9 100644
--- a/scrapy_autoextract/middlewares.py
+++ b/scrapy_autoextract/middlewares.py
@@ -43,6 +43,9 @@ class AutoExtractMiddleware(object):
     DEFAULT_URL = 'https://autoextract.scrapinghub.com/v1/extract'
     DEFAULT_TIMEOUT = 660
     DEFAULT_SLOT_POLICY = SlotPolicy.PER_DOMAIN
+    DEFAULT_ALLOWED_RESPONSE_ERRORS = {
+        "Downloader error: http404",  # some sites return 404 as a valid response
+    }
 
     def __init__(self, crawler):
         self.crawler = crawler
@@ -50,6 +53,13 @@ def __init__(self, crawler):
         self._api_user = self.settings['AUTOEXTRACT_USER']
         self._api_pass = ''
         self.page_type = self.settings['AUTOEXTRACT_PAGE_TYPE']
+        self._log_response_error_level = self.settings.get(
+            "AUTOEXTRACT_RESPONSE_ERROR_LOG_LEVEL", logging.DEBUG
+        )
+        self._allowed_response_errors = (
+            set(self.settings.get("AUTOEXTRACT_ALLOWED_RESPONSE_ERRORS", []))
+            | self.DEFAULT_ALLOWED_RESPONSE_ERRORS
+        )
         if not self.page_type:
             self.page_type = getattr(crawler.spider, 'page_type', None)
         self.timeout = max(
@@ -164,13 +174,13 @@ def process_response(self, request, response, spider):
             response_object = json.loads(body)
         except Exception:
             self.inc_metric('autoextract/errors/json_decode')
-            self._log_debug_error(response, body)
+            self._log_response_error(response, body)
             raise AutoExtractError('Cannot parse JSON response from AutoExtract'
                                    ' for {}: {}'.format(url, response.body[:MAX_ERROR_BODY]))
 
         if response.status != 200:
             self.inc_metric('autoextract/errors/response_error/{}'.format(response.status))
-            self._log_debug_error(response, body)
+            self._log_response_error(response, body)
             raise AutoExtractError('Received error from AutoExtract for '
                                    '{}: {}'.format(url, response_object))
 
@@ -179,13 +189,14 @@ def process_response(self, request, response, spider):
             result = response_object[0]
         else:
             self.inc_metric('autoextract/errors/type_error')
-            self._log_debug_error(response, body)
+            self._log_response_error(response, body)
             raise AutoExtractError('Received invalid response from AutoExtract for '
                                    '{}: {}'.format(url, response_object))
 
-        if result.get('error'):
+        error = result.get('error')
+        if error and error not in self._allowed_response_errors:
             self.inc_metric('autoextract/errors/result_error')
-            self._log_debug_error(response, body)
+            self._log_response_error(response, body)
             raise AutoExtractError('Received error from AutoExtract for '
                                    '{}: {}'.format(url, error))
 
@@ -284,12 +295,17 @@ def inc_metric(self, key, **kwargs):
     def set_metric(self, key, value):
         self.crawler.stats.set_value(key, value)
 
-    def _log_debug_error(self, response, body):
+    def _log_response_error(self, response, body):
         if len(body) > MAX_ERROR_BODY:
             half_body = MAX_ERROR_BODY // 2
             body = body[:half_body] + ' [...] ' + body[-half_body:]
-        logger.debug('AutoExtract response status=%i headers=%s content=%s', response.status,
-                     response.headers.to_unicode_dict(), body)
+        logger.log(
+            self._log_response_error_level,
+            'AutoExtract response status=%i headers=%s content=%s',
+            response.status,
+            response.headers.to_unicode_dict(),
+            body,
+        )
 
     def autoextract_latency_stats(self):
         self.set_metric('autoextract/response_count', self.nr_resp)
diff --git a/tests/test_autoextract.py b/tests/test_autoextract.py
index 139d9aa..77c7387 100644
--- a/tests/test_autoextract.py
+++ b/tests/test_autoextract.py
@@ -6,6 +6,6 @@
-from scrapy.http import Request, Response
+from scrapy.http import HtmlResponse, Request, Response
 from scrapy.spiders import Spider
 from scrapy.utils.test import get_crawler
 
 from scrapy_autoextract import AutoExtractMiddleware
 from scrapy_autoextract.middlewares import AutoExtractError, AutoExtractConfigError
@@ -103,6 +103,29 @@ def test_request_error():
         mw.process_response(out, resp, spider)
 
 
+def test_response_allowed_error():
+    settings = {**MW_SETTINGS, "AUTOEXTRACT_ALLOWED_RESPONSE_ERRORS": {"custom error"}}
+    mw = _mock_mw(spider, settings)
+    req = Request('http://quotes.toscrape.com', meta=AUTOX_META)
+    out = mw.process_request(req, spider)
+
+    # Allowed errors such as this http404 shouldn't raise AutoExtractError,
+    # since the website itself returned the 404 response status.
+    err = b'[{"query":{},"error":"Downloader error: http404"}]'
+    resp = Response(out.url, request=out, body=err)
+    result = mw.process_response(out, resp, spider)
+    assert isinstance(result, HtmlResponse)
+
+    req = Request('http://quotes.toscrape.com', meta=AUTOX_META)
+    out = mw.process_request(req, spider)
+
+    # User-specified errors in AUTOEXTRACT_ALLOWED_RESPONSE_ERRORS are also allowed.
+    err = b'[{"query":{},"error":"custom error"}]'
+    resp = Response(out.url, request=out, body=err)
+    result = mw.process_response(out, resp, spider)
+    assert isinstance(result, HtmlResponse)
+
+
 def test_timeout():
     config = dict(MW_SETTINGS)
     # add a very low timeout - the middleware will ignore it
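
For reviewers, a minimal sketch of how the two new settings would be enabled in a
project's ``settings.py``. The ``Downloader error: http500`` string is a hypothetical
example (only ``Downloader error: http404`` is confirmed as a default in this PR), and
the log level chosen is illustrative::

    # settings.py (sketch): enabling the options introduced in this PR
    import logging

    # Log AutoExtract response errors at WARNING instead of the default DEBUG.
    AUTOEXTRACT_RESPONSE_ERROR_LOG_LEVEL = logging.WARNING

    # Hypothetical extra error string to treat as non-fatal. The middleware
    # unions this set with DEFAULT_ALLOWED_RESPONSE_ERRORS, i.e.
    # {"Downloader error: http404"}, so 404 downloader errors stay allowed
    # even when this setting is overridden.
    AUTOEXTRACT_ALLOWED_RESPONSE_ERRORS = {"Downloader error: http500"}

Note the design choice here: because user-supplied values are merged with
``DEFAULT_ALLOWED_RESPONSE_ERRORS`` rather than replacing them, there is currently no
way to make the default http404 error fatal again via this setting.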