6 changes: 6 additions & 0 deletions CHANGES.rst
@@ -6,6 +6,12 @@ TBR

* removed support for Python 3.6
* added support for Python 3.10
+* added *optional* setting ``AUTOEXTRACT_RESPONSE_ERROR_LOG_LEVEL`` to set the
+  logging level used when logging errors in AutoExtract responses
+* added *optional* setting ``AUTOEXTRACT_ALLOWED_RESPONSE_ERRORS``, a
+  collection of error strings from AutoExtract's responses for which
+  ``AutoExtractError`` should not be raised *(e.g. when a website returns a
+  valid page with a 404 response status)*

0.7.0 (2021-08-05)
------------------
5 changes: 5 additions & 0 deletions README.rst
@@ -239,6 +239,11 @@ Middleware settings
- ``AUTOEXTRACT_SLOT_POLICY`` [optional] Download concurrency options. Defaults to ``SlotPolicy.PER_DOMAIN``
- If set to ``SlotPolicy.PER_DOMAIN``, then consider setting ``SCHEDULER_PRIORITY_QUEUE = 'scrapy.pqueues.DownloaderAwarePriorityQueue'``
to make better usage of AutoExtract concurrency and avoid delays.
+- ``AUTOEXTRACT_RESPONSE_ERROR_LOG_LEVEL`` [optional] Sets the logging level used when logging errors in AutoExtract
+  responses. *(default: ``logging.DEBUG``)*
+- ``AUTOEXTRACT_ALLOWED_RESPONSE_ERRORS`` [optional] A collection of error strings from AutoExtract's responses for
+  which ``AutoExtractError`` should not be raised. The strings defined here are combined with the default values;
+  see the sketch below. *(default: ``{"Downloader error: http404"}``, which prevents valid website responses served
+  with a 404 status from erroring out)*

Provider settings
-----------------
32 changes: 24 additions & 8 deletions scrapy_autoextract/middlewares.py
@@ -43,13 +43,23 @@ class AutoExtractMiddleware(object):
DEFAULT_URL = 'https://autoextract.scrapinghub.com/v1/extract'
DEFAULT_TIMEOUT = 660
DEFAULT_SLOT_POLICY = SlotPolicy.PER_DOMAIN
+    DEFAULT_ALLOWED_RESPONSE_ERRORS = {
+        "Downloader error: http404",  # some sites return 404 as a valid response
+    }

def __init__(self, crawler):
self.crawler = crawler
self.settings = crawler.settings
self._api_user = self.settings['AUTOEXTRACT_USER']
self._api_pass = ''
self.page_type = self.settings['AUTOEXTRACT_PAGE_TYPE']
+        self._log_response_error_level = self.settings.get(
+            "AUTOEXTRACT_RESPONSE_ERROR_LOG_LEVEL", logging.DEBUG
+        )
+        self._allowed_response_errors = (
+            set(self.settings.get("AUTOEXTRACT_ALLOWED_RESPONSE_ERRORS", []))
+            | self.DEFAULT_ALLOWED_RESPONSE_ERRORS
+        )
if not self.page_type:
self.page_type = getattr(crawler.spider, 'page_type', None)
self.timeout = max(
@@ -164,13 +174,13 @@ def process_response(self, request, response, spider):
response_object = json.loads(body)
except Exception:
self.inc_metric('autoextract/errors/json_decode')
-            self._log_debug_error(response, body)
+            self._log_response_error(response, body)
raise AutoExtractError('Cannot parse JSON response from AutoExtract'
' for {}: {}'.format(url, response.body[:MAX_ERROR_BODY]))

if response.status != 200:
self.inc_metric('autoextract/errors/response_error/{}'.format(response.status))
-            self._log_debug_error(response, body)
+            self._log_response_error(response, body)
raise AutoExtractError('Received error from AutoExtract for '
'{}: {}'.format(url, response_object))

@@ -179,13 +189,14 @@ def process_response(self, request, response, spider):
result = response_object[0]
else:
self.inc_metric('autoextract/errors/type_error')
-            self._log_debug_error(response, body)
+            self._log_response_error(response, body)
raise AutoExtractError('Received invalid response from AutoExtract for '
'{}: {}'.format(url, response_object))

-        if result.get('error'):
+        error = result.get('error')
+        if error and error not in self._allowed_response_errors:
            self.inc_metric('autoextract/errors/result_error')
-            self._log_debug_error(response, body)
+            self._log_response_error(response, body)
raise AutoExtractError('Received error from AutoExtract for '
'{}: {}'.format(url, result["error"]))

@@ -284,12 +295,17 @@ def inc_metric(self, key, **kwargs):
def set_metric(self, key, value):
self.crawler.stats.set_value(key, value)

-    def _log_debug_error(self, response, body):
+    def _log_response_error(self, response, body):
if len(body) > MAX_ERROR_BODY:
half_body = MAX_ERROR_BODY // 2
body = body[:half_body] + ' [...] ' + body[-half_body:]
-        logger.debug('AutoExtract response status=%i headers=%s content=%s', response.status,
-                     response.headers.to_unicode_dict(), body)
+        logger.log(
+            self._log_response_error_level,
+            'AutoExtract response status=%i headers=%s content=%s',
+            response.status,
+            response.headers.to_unicode_dict(),
+            body
+        )

def autoextract_latency_stats(self):
self.set_metric('autoextract/response_count', self.nr_resp)
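
A standalone sketch of the allow-list semantics above: user-supplied strings are unioned with the class default, so the built-in http404 entry cannot be removed, only extended. The names mirror the middleware; the result dict is illustrative:

DEFAULT_ALLOWED_RESPONSE_ERRORS = {"Downloader error: http404"}
user_setting = {"custom error"}  # AUTOEXTRACT_ALLOWED_RESPONSE_ERRORS

allowed = set(user_setting) | DEFAULT_ALLOWED_RESPONSE_ERRORS
assert allowed == {"Downloader error: http404", "custom error"}

# An allow-listed error no longer raises AutoExtractError; the (valid)
# 404 page is passed on to the spider instead.
result = {"query": {}, "error": "Downloader error: http404"}
error = result.get("error")
assert error in allowed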
24 changes: 24 additions & 0 deletions tests/test_autoextract.py
@@ -6,6 +6,7 @@
from scrapy.http import Request, Response
from scrapy.spiders import Spider
from scrapy.utils.test import get_crawler
+from scrapy.http import HtmlResponse

from scrapy_autoextract import AutoExtractMiddleware
from scrapy_autoextract.middlewares import AutoExtractError, AutoExtractConfigError
@@ -103,6 +104,29 @@ def test_request_error():
mw.process_response(out, resp, spider)


+def test_response_allowed_error():
+    settings = {**MW_SETTINGS, "AUTOEXTRACT_ALLOWED_RESPONSE_ERRORS": {"custom error"}}
+    mw = _mock_mw(spider, settings)
+    req = Request('http://quotes.toscrape.com', meta=AUTOX_META)
+    out = mw.process_request(req, spider)
+
+    # Allowed errors such as this http404 one shouldn't raise AutoExtractError,
+    # since the website itself returned a valid page with a 404 response status.
+    err = b'[{"query":{},"error":"Downloader error: http404"}]'
+    resp = Response(out.url, request=out, body=err)
+    result = mw.process_response(out, resp, spider)
+    assert isinstance(result, HtmlResponse)
+
+    req = Request('http://quotes.toscrape.com', meta=AUTOX_META)
+    out = mw.process_request(req, spider)
+
+    # Errors specified by the user via "AUTOEXTRACT_ALLOWED_RESPONSE_ERRORS"
+    # shouldn't raise AutoExtractError either.
+    err = b'[{"query":{},"error":"custom error"}]'
+    resp = Response(out.url, request=out, body=err)
+    result = mw.process_response(out, resp, spider)
+    assert isinstance(result, HtmlResponse)


def test_timeout():
config = dict(MW_SETTINGS)
# add a very low timeout - the middleware will ignore it