6 changes: 6 additions & 0 deletions CHANGES.rst
@@ -6,6 +6,12 @@ TBR

* removed support for Python 3.6
* added support for Python 3.10
+* added *optional* setting ``AUTOEXTRACT_RESPONSE_ERROR_LOG_LEVEL`` to set the
+  logging level used when logging errors in AutoExtract responses
+* added *optional* setting ``AUTOEXTRACT_ALLOWED_RESPONSE_ERRORS``, a
+  collection of error strings from AutoExtract's responses for which
+  ``AutoExtractError`` should not be raised *(e.g. when a website returns a
+  valid page with a 404 response status)*

0.7.0 (2021-08-05)
------------------
5 changes: 5 additions & 0 deletions README.rst
@@ -239,6 +239,11 @@ Middleware settings
- ``AUTOEXTRACT_SLOT_POLICY`` [optional] Download concurrency options. Defaults to ``SlotPolicy.PER_DOMAIN``
- If set to ``SlotPolicy.PER_DOMAIN``, then consider setting ``SCHEDULER_PRIORITY_QUEUE = 'scrapy.pqueues.DownloaderAwarePriorityQueue'``
to make better usage of AutoExtract concurrency and avoid delays.
+- ``AUTOEXTRACT_RESPONSE_ERROR_LOG_LEVEL`` [optional] Sets the logging level used when logging errors in AutoExtract
+  responses. *(default: ``logging.DEBUG``)*
+- ``AUTOEXTRACT_ALLOWED_RESPONSE_ERRORS`` [optional] A collection of error strings from AutoExtract's responses for
+  which ``AutoExtractError`` should not be raised. The strings defined here are combined with the default values;
+  see the sketch below. *(default: ``{"Downloader error: http404"}``, which prevents valid website responses served
+  with a 404 status from erroring out)*

Provider settings
-----------------
32 changes: 24 additions & 8 deletions scrapy_autoextract/middlewares.py
@@ -43,13 +43,23 @@ class AutoExtractMiddleware(object):
DEFAULT_URL = 'https://autoextract.scrapinghub.com/v1/extract'
DEFAULT_TIMEOUT = 660
DEFAULT_SLOT_POLICY = SlotPolicy.PER_DOMAIN
+    DEFAULT_ALLOWED_RESPONSE_ERRORS = {
+        "Downloader error: http404",  # some sites return 404 as a valid response
+    }

def __init__(self, crawler):
self.crawler = crawler
self.settings = crawler.settings
self._api_user = self.settings['AUTOEXTRACT_USER']
self._api_pass = ''
self.page_type = self.settings['AUTOEXTRACT_PAGE_TYPE']
+        self._log_response_error_level = self.settings.get(
+            "AUTOEXTRACT_RESPONSE_ERROR_LOG_LEVEL", logging.DEBUG
+        )
+        self._allowed_response_errors = (
+            set(self.settings.get("AUTOEXTRACT_ALLOWED_RESPONSE_ERRORS", []))
+            | self.DEFAULT_ALLOWED_RESPONSE_ERRORS
+        )
if not self.page_type:
self.page_type = getattr(crawler.spider, 'page_type', None)
self.timeout = max(
@@ -164,13 +174,13 @@ def process_response(self, request, response, spider):
response_object = json.loads(body)
except Exception:
self.inc_metric('autoextract/errors/json_decode')
-            self._log_debug_error(response, body)
+            self._log_response_error(response, body)
raise AutoExtractError('Cannot parse JSON response from AutoExtract'
' for {}: {}'.format(url, response.body[:MAX_ERROR_BODY]))

if response.status != 200:
self.inc_metric('autoextract/errors/response_error/{}'.format(response.status))
-            self._log_debug_error(response, body)
+            self._log_response_error(response, body)
raise AutoExtractError('Received error from AutoExtract for '
'{}: {}'.format(url, response_object))

@@ -179,13 +189,14 @@ def process_response(self, request, response, spider):
result = response_object[0]
else:
self.inc_metric('autoextract/errors/type_error')
-            self._log_debug_error(response, body)
+            self._log_response_error(response, body)
raise AutoExtractError('Received invalid response from AutoExtract for '
'{}: {}'.format(url, response_object))

-        if result.get('error'):
+        error = result.get('error')
+        if error and error not in self._allowed_response_errors:
            self.inc_metric('autoextract/errors/result_error')
-            self._log_debug_error(response, body)
+            self._log_response_error(response, body)
raise AutoExtractError('Received error from AutoExtract for '
'{}: {}'.format(url, result["error"]))

@@ -284,12 +295,17 @@ def inc_metric(self, key, **kwargs):
def set_metric(self, key, value):
self.crawler.stats.set_value(key, value)

-    def _log_debug_error(self, response, body):
+    def _log_response_error(self, response, body):
if len(body) > MAX_ERROR_BODY:
half_body = MAX_ERROR_BODY // 2
body = body[:half_body] + ' [...] ' + body[-half_body:]
-        logger.debug('AutoExtract response status=%i headers=%s content=%s', response.status,
-                     response.headers.to_unicode_dict(), body)
+        logger.log(
+            self._log_response_error_level,
+            'AutoExtract response status=%i headers=%s content=%s',
+            response.status,
+            response.headers.to_unicode_dict(),
+            body
+        )

def autoextract_latency_stats(self):
self.set_metric('autoextract/response_count', self.nr_resp)
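
A standalone sketch of the allow-list semantics above: user-supplied strings are unioned with the class default, so the built-in http404 entry cannot be removed, only extended. The names mirror the middleware; the result dict is illustrative:

DEFAULT_ALLOWED_RESPONSE_ERRORS = {"Downloader error: http404"}
user_setting = {"custom error"}  # AUTOEXTRACT_ALLOWED_RESPONSE_ERRORS

allowed = set(user_setting) | DEFAULT_ALLOWED_RESPONSE_ERRORS
assert allowed == {"Downloader error: http404", "custom error"}

# An allow-listed error no longer raises AutoExtractError; the (valid)
# 404 page is passed on to the spider instead.
result = {"query": {}, "error": "Downloader error: http404"}
error = result.get("error")
assert error in allowed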
24 changes: 24 additions & 0 deletions tests/test_autoextract.py
@@ -6,6 +6,7 @@
from scrapy.http import Request, Response
from scrapy.spiders import Spider
from scrapy.utils.test import get_crawler
+from scrapy.http import HtmlResponse

from scrapy_autoextract import AutoExtractMiddleware
from scrapy_autoextract.middlewares import AutoExtractError, AutoExtractConfigError
@@ -103,6 +104,29 @@ def test_request_error():
mw.process_response(out, resp, spider)


+def test_response_allowed_error():
+    settings = {**MW_SETTINGS, "AUTOEXTRACT_ALLOWED_RESPONSE_ERRORS": {"custom error"}}
+    mw = _mock_mw(spider, settings)
+    req = Request('http://quotes.toscrape.com', meta=AUTOX_META)
+    out = mw.process_request(req, spider)
+
+    # Allowed errors such as this http404 one shouldn't raise AutoExtractError,
+    # since the website itself returned a valid page with a 404 response status.
+    err = b'[{"query":{},"error":"Downloader error: http404"}]'
+    resp = Response(out.url, request=out, body=err)
+    result = mw.process_response(out, resp, spider)
+    assert isinstance(result, HtmlResponse)
+
+    req = Request('http://quotes.toscrape.com', meta=AUTOX_META)
+    out = mw.process_request(req, spider)
+
+    # Errors specified by the user via "AUTOEXTRACT_ALLOWED_RESPONSE_ERRORS"
+    # shouldn't raise AutoExtractError either.
+    err = b'[{"query":{},"error":"custom error"}]'
+    resp = Response(out.url, request=out, body=err)
+    result = mw.process_response(out, resp, spider)
+    assert isinstance(result, HtmlResponse)


def test_timeout():
config = dict(MW_SETTINGS)
# add a very low timeout - the middleware will ignore it