diff --git a/scraper/src/custom_downloader_middleware.py b/scraper/src/custom_downloader_middleware.py index c3bea000..94bd1995 100644 --- a/scraper/src/custom_downloader_middleware.py +++ b/scraper/src/custom_downloader_middleware.py @@ -10,9 +10,11 @@ class CustomDownloaderMiddleware: driver = None + auth_cookie = None def __init__(self): self.driver = CustomDownloaderMiddleware.driver + self.initialized_auth = False def process_request(self, request, spider): if not spider.js_render: @@ -23,6 +25,11 @@ def process_request(self, request, spider): url_without_params = o.scheme + "://" + o.netloc + o.path request = request.replace(url=url_without_params) + if self.auth_cookie and not self.initialized_auth: + self.driver.get(unquote_plus(request.url)) + self.driver.add_cookie(self.auth_cookie) + self.initialized_auth = True + print("Getting " + request.url + " from selenium") self.driver.get(unquote_plus( diff --git a/scraper/src/index.py b/scraper/src/index.py index 0f5aeafd..75423e8d 100644 --- a/scraper/src/index.py +++ b/scraper/src/index.py @@ -72,6 +72,13 @@ def run_config(config): DEFAULT_REQUEST_HEADERS = headers + if os.getenv('AUTH_COOKIE_NAME') and os.getenv('AUTH_COOKIE_VALUE'): + auth_cookie = { + 'name': os.getenv('AUTH_COOKIE_NAME'), + 'value': os.getenv('AUTH_COOKIE_VALUE'), + } + CustomDownloaderMiddleware.auth_cookie = auth_cookie + process = CrawlerProcess({ 'LOG_ENABLED': '1', 'LOG_LEVEL': 'ERROR',