From da8ac8335ee5452843b51d63317fb61b35e8ede0 Mon Sep 17 00:00:00 2001
From: eracle
Date: Sun, 27 Sep 2020 17:44:42 +0200
Subject: [PATCH] fixed random scraper

---
 linkedin/middlewares.py      |  2 +-
 linkedin/spiders/random.py   |  2 +-
 linkedin/spiders/search.py   |  3 +--
 linkedin/spiders/selenium.py | 11 +++++------
 4 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/linkedin/middlewares.py b/linkedin/middlewares.py
index 4ecabbe..d5b7ed7 100644
--- a/linkedin/middlewares.py
+++ b/linkedin/middlewares.py
@@ -2,7 +2,7 @@
 from scrapy.http import HtmlResponse
 from scrapy.utils.python import to_bytes
 
-from linkedin.spiders.selenium import get_by_xpath, get_by_xpath_or_none, init_chromium
+from linkedin.spiders.selenium import init_chromium
 
 
 class SeleniumDownloaderMiddleware:
diff --git a/linkedin/spiders/random.py b/linkedin/spiders/random.py
index 958304e..14b740e 100644
--- a/linkedin/spiders/random.py
+++ b/linkedin/spiders/random.py
@@ -33,7 +33,7 @@ def wait_page_completion(self, driver):
         :return:
         """
         # waiting links to other users are shown so the crawl can continue
-        get_by_xpath_or_none(driver, '//*/span/span/span[1]', wait_timeout=3)
+        get_by_xpath_or_none(driver, "//*/div[@class='pv-deferred-area ember-view']", wait_timeout=3)
 
     def extract_profile_id_from_url(self, response):
         # extract_profile_id_from_url
diff --git a/linkedin/spiders/search.py b/linkedin/spiders/search.py
index 742d5c8..18a9587 100644
--- a/linkedin/spiders/search.py
+++ b/linkedin/spiders/search.py
@@ -43,8 +43,7 @@ def parser_search_results_page(self, response):
         no_result_found_xpath = '//*[text()="No results found."]'
         no_result_response = get_by_xpath_or_none(driver=driver,
                                                   xpath=no_result_found_xpath,
-                                                  wait_timeout=NO_RESULT_WAIT_TIMEOUT,
-                                                  logs=False)
+                                                  wait_timeout=NO_RESULT_WAIT_TIMEOUT)
 
         if no_result_response is not None:
             # no results message shown: stop crawling this company
diff --git a/linkedin/spiders/selenium.py b/linkedin/spiders/selenium.py
index a113262..72e0a27 100644
--- a/linkedin/spiders/selenium.py
+++ b/linkedin/spiders/selenium.py
@@ -1,3 +1,5 @@
+import logging
+
 from selenium import webdriver
 from selenium.common.exceptions import TimeoutException, StaleElementReferenceException, WebDriverException
 from selenium.webdriver import DesiredCapabilities
@@ -52,7 +54,7 @@ def wait_invisibility_xpath(driver, xpath, wait_timeout=None):
     WebDriverWait(driver, wait_timeout).until(ec.invisibility_of_element_located((By.XPATH, xpath)))
 
 
-def get_by_xpath_or_none(driver, xpath, wait_timeout=None, logs=True):
+def get_by_xpath_or_none(driver, xpath, wait_timeout=None):
     """
     Get a web element through the xpath string passed.
     If a TimeoutException is raised the else_case is called and None is returned.
@@ -65,10 +67,7 @@ def get_by_xpath_or_none(driver, xpath, wait_timeout=None, logs=True):
     try:
         return get_by_xpath(driver, xpath, wait_timeout=wait_timeout)
     except (TimeoutException, StaleElementReferenceException, WebDriverException) as e:
-        if logs:
-            print("Exception Occurred:")
-            print(f"XPATH:{xpath}")
-            print(f"Error:{e}")
+        logging.warning(f"Exception Occurred:\nXPATH:{xpath}\nError:{e}")
         return None
 
 
@@ -103,7 +102,7 @@ def init_chromium(selenium_host, cookies=None):
     driver = webdriver.Remote(command_executor=selenium_url,
                               desired_capabilities=chrome_options)
 
-    if cookies:
+    if cookies is not None:
         driver.get("https://www.linkedin.com/404error")
         for cookie in cookies:
             if 'expiry' in cookie:
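
Reviewer note, not part of the patch: the sketch below shows the post-patch behaviour of get_by_xpath_or_none in isolation, i.e. the dropped logs flag and the switch from print() to logging.warning(). The body of get_by_xpath and the WAIT_TIMEOUT default are assumptions reconstructed from the hunk context, not copied from the repository.

import logging

from selenium.common.exceptions import (
    StaleElementReferenceException,
    TimeoutException,
    WebDriverException,
)
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait

WAIT_TIMEOUT = 10  # assumed module-level default, not taken from the repo


def get_by_xpath(driver, xpath, wait_timeout=None):
    """Assumed stand-in: wait until the element located by `xpath` is
    present and return it, raising TimeoutException otherwise."""
    if wait_timeout is None:
        wait_timeout = WAIT_TIMEOUT
    return WebDriverWait(driver, wait_timeout).until(
        ec.presence_of_element_located((By.XPATH, xpath))
    )


def get_by_xpath_or_none(driver, xpath, wait_timeout=None):
    """As above, but swallow lookup errors: log a warning and return None."""
    try:
        return get_by_xpath(driver, xpath, wait_timeout=wait_timeout)
    except (TimeoutException, StaleElementReferenceException, WebDriverException) as e:
        # After the patch, callers can no longer silence this per call via
        # logs=False (see the search.py hunk); verbosity is now controlled
        # centrally through the standard logging configuration.
        logging.warning(f"Exception Occurred:\nXPATH:{xpath}\nError:{e}")
        return None


# Example call, mirroring the random spider's page-completion wait:
#     get_by_xpath_or_none(driver, "//*/div[@class='pv-deferred-area ember-view']", wait_timeout=3)
# On timeout this logs a warning and returns None instead of raising.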