Skip to content

Commit

Permalink
fixed random scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
eracle committed Sep 27, 2020
1 parent 5d44d5d commit da8ac83
Show file tree
Hide file tree
Showing 4 changed files with 8 additions and 10 deletions.
2 changes: 1 addition & 1 deletion linkedin/middlewares.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from scrapy.http import HtmlResponse
from scrapy.utils.python import to_bytes

from linkedin.spiders.selenium import get_by_xpath, get_by_xpath_or_none, init_chromium
from linkedin.spiders.selenium import init_chromium


class SeleniumDownloaderMiddleware:
Expand Down
2 changes: 1 addition & 1 deletion linkedin/spiders/random.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def wait_page_completion(self, driver):
:return:
"""
    # wait until links to other users are shown, so the crawl can continue
get_by_xpath_or_none(driver, '//*/span/span/span[1]', wait_timeout=3)
get_by_xpath_or_none(driver, "//*/div[@class='pv-deferred-area ember-view']", wait_timeout=3)

def extract_profile_id_from_url(self, response):
# extract_profile_id_from_url
Expand Down
3 changes: 1 addition & 2 deletions linkedin/spiders/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,7 @@ def parser_search_results_page(self, response):
no_result_found_xpath = '//*[text()="No results found."]'
no_result_response = get_by_xpath_or_none(driver=driver,
xpath=no_result_found_xpath,
wait_timeout=NO_RESULT_WAIT_TIMEOUT,
logs=False)
wait_timeout=NO_RESULT_WAIT_TIMEOUT)

if no_result_response is not None:
# no results message shown: stop crawling this company
Expand Down
11 changes: 5 additions & 6 deletions linkedin/spiders/selenium.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import logging

from selenium import webdriver
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException, WebDriverException
from selenium.webdriver import DesiredCapabilities
Expand Down Expand Up @@ -52,7 +54,7 @@ def wait_invisibility_xpath(driver, xpath, wait_timeout=None):
WebDriverWait(driver, wait_timeout).until(ec.invisibility_of_element_located((By.XPATH, xpath)))


def get_by_xpath_or_none(driver, xpath, wait_timeout=None, logs=True):
def get_by_xpath_or_none(driver, xpath, wait_timeout=None):
"""
Get a web element through the xpath string passed.
If a TimeoutException is raised the else_case is called and None is returned.
Expand All @@ -65,10 +67,7 @@ def get_by_xpath_or_none(driver, xpath, wait_timeout=None, logs=True):
try:
return get_by_xpath(driver, xpath, wait_timeout=wait_timeout)
except (TimeoutException, StaleElementReferenceException, WebDriverException) as e:
if logs:
print("Exception Occurred:")
print(f"XPATH:{xpath}")
print(f"Error:{e}")
logging.warning(f"Exception Occurred:\nXPATH:{xpath}\nError:{e}")
return None


Expand Down Expand Up @@ -103,7 +102,7 @@ def init_chromium(selenium_host, cookies=None):
driver = webdriver.Remote(command_executor=selenium_url,
desired_capabilities=chrome_options)

if cookies:
if cookies is not None:
driver.get("https://www.linkedin.com/404error")
for cookie in cookies:
if 'expiry' in cookie:
Expand Down

0 comments on commit da8ac83

Please sign in to comment.