Skip to content

Commit

Permalink
fixed random scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
eracle committed Sep 27, 2020
1 parent 5d44d5d commit da8ac83
Show file tree
Hide file tree
Showing 4 changed files with 8 additions and 10 deletions.
2 changes: 1 addition & 1 deletion linkedin/middlewares.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from scrapy.http import HtmlResponse
from scrapy.utils.python import to_bytes

from linkedin.spiders.selenium import get_by_xpath, get_by_xpath_or_none, init_chromium
from linkedin.spiders.selenium import init_chromium


class SeleniumDownloaderMiddleware:
Expand Down
2 changes: 1 addition & 1 deletion linkedin/spiders/random.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def wait_page_completion(self, driver):
:return:
"""
    # wait until links to other users are shown, so the crawl can continue
get_by_xpath_or_none(driver, '//*/span/span/span[1]', wait_timeout=3)
get_by_xpath_or_none(driver, "//*/div[@class='pv-deferred-area ember-view']", wait_timeout=3)

def extract_profile_id_from_url(self, response):
# extract_profile_id_from_url
Expand Down
3 changes: 1 addition & 2 deletions linkedin/spiders/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,7 @@ def parser_search_results_page(self, response):
no_result_found_xpath = '//*[text()="No results found."]'
no_result_response = get_by_xpath_or_none(driver=driver,
xpath=no_result_found_xpath,
wait_timeout=NO_RESULT_WAIT_TIMEOUT,
logs=False)
wait_timeout=NO_RESULT_WAIT_TIMEOUT)

if no_result_response is not None:
# no results message shown: stop crawling this company
Expand Down
11 changes: 5 additions & 6 deletions linkedin/spiders/selenium.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import logging

from selenium import webdriver
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException, WebDriverException
from selenium.webdriver import DesiredCapabilities
Expand Down Expand Up @@ -52,7 +54,7 @@ def wait_invisibility_xpath(driver, xpath, wait_timeout=None):
WebDriverWait(driver, wait_timeout).until(ec.invisibility_of_element_located((By.XPATH, xpath)))


def get_by_xpath_or_none(driver, xpath, wait_timeout=None, logs=True):
def get_by_xpath_or_none(driver, xpath, wait_timeout=None):
"""
Get a web element through the xpath string passed.
If a TimeoutException is raised the else_case is called and None is returned.
Expand All @@ -65,10 +67,7 @@ def get_by_xpath_or_none(driver, xpath, wait_timeout=None, logs=True):
try:
return get_by_xpath(driver, xpath, wait_timeout=wait_timeout)
except (TimeoutException, StaleElementReferenceException, WebDriverException) as e:
if logs:
print("Exception Occurred:")
print(f"XPATH:{xpath}")
print(f"Error:{e}")
logging.warning(f"Exception Occurred:\nXPATH:{xpath}\nError:{e}")
return None


Expand Down Expand Up @@ -103,7 +102,7 @@ def init_chromium(selenium_host, cookies=None):
driver = webdriver.Remote(command_executor=selenium_url,
desired_capabilities=chrome_options)

if cookies:
if cookies is not None:
driver.get("https://www.linkedin.com/404error")
for cookie in cookies:
if 'expiry' in cookie:
Expand Down

0 comments on commit da8ac83

Please sign in to comment.