From da8ac8335ee5452843b51d63317fb61b35e8ede0 Mon Sep 17 00:00:00 2001
From: eracle
Date: Sun, 27 Sep 2020 17:44:42 +0200
Subject: [PATCH] fixed random scraper

---
 linkedin/middlewares.py      |  2 +-
 linkedin/spiders/random.py   |  2 +-
 linkedin/spiders/search.py   |  3 +--
 linkedin/spiders/selenium.py | 11 +++++------
 4 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/linkedin/middlewares.py b/linkedin/middlewares.py
index 4ecabbe..d5b7ed7 100644
--- a/linkedin/middlewares.py
+++ b/linkedin/middlewares.py
@@ -2,7 +2,7 @@
 from scrapy.http import HtmlResponse
 from scrapy.utils.python import to_bytes
 
-from linkedin.spiders.selenium import get_by_xpath, get_by_xpath_or_none, init_chromium
+from linkedin.spiders.selenium import init_chromium
 
 
 class SeleniumDownloaderMiddleware:
diff --git a/linkedin/spiders/random.py b/linkedin/spiders/random.py
index 958304e..14b740e 100644
--- a/linkedin/spiders/random.py
+++ b/linkedin/spiders/random.py
@@ -33,7 +33,7 @@ def wait_page_completion(self, driver):
         :return:
         """
         # waiting links to other users are shown so the crawl can continue
-        get_by_xpath_or_none(driver, '//*/span/span/span[1]', wait_timeout=3)
+        get_by_xpath_or_none(driver, "//*/div[@class='pv-deferred-area ember-view']", wait_timeout=3)
 
     def extract_profile_id_from_url(self, response):
         # extract_profile_id_from_url
diff --git a/linkedin/spiders/search.py b/linkedin/spiders/search.py
index 742d5c8..18a9587 100644
--- a/linkedin/spiders/search.py
+++ b/linkedin/spiders/search.py
@@ -43,8 +43,7 @@ def parser_search_results_page(self, response):
         no_result_found_xpath = '//*[text()="No results found."]'
         no_result_response = get_by_xpath_or_none(driver=driver,
                                                   xpath=no_result_found_xpath,
-                                                  wait_timeout=NO_RESULT_WAIT_TIMEOUT,
-                                                  logs=False)
+                                                  wait_timeout=NO_RESULT_WAIT_TIMEOUT)
 
         if no_result_response is not None:
             # no results message shown: stop crawling this company
diff --git a/linkedin/spiders/selenium.py b/linkedin/spiders/selenium.py
index a113262..72e0a27 100644
--- a/linkedin/spiders/selenium.py
+++ b/linkedin/spiders/selenium.py
@@ -1,3 +1,5 @@
+import logging
+
 from selenium import webdriver
 from selenium.common.exceptions import TimeoutException, StaleElementReferenceException, WebDriverException
 from selenium.webdriver import DesiredCapabilities
@@ -52,7 +54,7 @@ def wait_invisibility_xpath(driver, xpath, wait_timeout=None):
     WebDriverWait(driver, wait_timeout).until(ec.invisibility_of_element_located((By.XPATH, xpath)))
 
 
-def get_by_xpath_or_none(driver, xpath, wait_timeout=None, logs=True):
+def get_by_xpath_or_none(driver, xpath, wait_timeout=None):
     """
     Get a web element through the xpath string passed.
     If a TimeoutException is raised the else_case is called and None is returned.
@@ -65,10 +67,7 @@ def get_by_xpath_or_none(driver, xpath, wait_timeout=None, logs=True):
     try:
         return get_by_xpath(driver, xpath, wait_timeout=wait_timeout)
     except (TimeoutException, StaleElementReferenceException, WebDriverException) as e:
-        if logs:
-            print("Exception Occurred:")
-            print(f"XPATH:{xpath}")
-            print(f"Error:{e}")
+        logging.warning(f"Exception Occurred:\nXPATH:{xpath}\nError:{e}")
         return None
 
 
@@ -103,7 +102,7 @@ def init_chromium(selenium_host, cookies=None):
     driver = webdriver.Remote(command_executor=selenium_url,
                               desired_capabilities=chrome_options)
 
-    if cookies:
+    if cookies is not None:
         driver.get("https://www.linkedin.com/404error")
         for cookie in cookies:
             if 'expiry' in cookie:
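
Reviewer note, not part of the patch: the sketch below shows the post-patch behaviour of get_by_xpath_or_none in isolation, i.e. the dropped logs flag and the switch from print() to logging.warning(). The body of get_by_xpath and the WAIT_TIMEOUT default are assumptions reconstructed from the hunk context, not copied from the repository.

import logging

from selenium.common.exceptions import (
    StaleElementReferenceException,
    TimeoutException,
    WebDriverException,
)
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait

WAIT_TIMEOUT = 10  # assumed module-level default, not taken from the repo


def get_by_xpath(driver, xpath, wait_timeout=None):
    """Assumed stand-in: wait until the element located by `xpath` is
    present and return it, raising TimeoutException otherwise."""
    if wait_timeout is None:
        wait_timeout = WAIT_TIMEOUT
    return WebDriverWait(driver, wait_timeout).until(
        ec.presence_of_element_located((By.XPATH, xpath))
    )


def get_by_xpath_or_none(driver, xpath, wait_timeout=None):
    """As above, but swallow lookup errors: log a warning and return None."""
    try:
        return get_by_xpath(driver, xpath, wait_timeout=wait_timeout)
    except (TimeoutException, StaleElementReferenceException, WebDriverException) as e:
        # After the patch, callers can no longer silence this per call via
        # logs=False (see the search.py hunk); verbosity is now controlled
        # centrally through the standard logging configuration.
        logging.warning(f"Exception Occurred:\nXPATH:{xpath}\nError:{e}")
        return None


# Example call, mirroring the random spider's page-completion wait:
#     get_by_xpath_or_none(driver, "//*/div[@class='pv-deferred-area ember-view']", wait_timeout=3)
# On timeout this logs a warning and returns None instead of raising.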