diff --git a/.gitignore b/.gitignore
index 3f560a0..3f40e84 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,5 @@ selenium_prototype.py
 .pytest_cache/
 interactive.py
 users.csv
+.vscode
+.dockerignore
diff --git a/linkedin/integration.py b/linkedin/integration.py
new file mode 100644
index 0000000..ebfb4c8
--- /dev/null
+++ b/linkedin/integration.py
@@ -0,0 +1,27 @@
+from time import sleep
+
+import random
+from linkedin_api import Linkedin
+
+
+def my_default_evade():
+    """
+    A catch-all method to try and evade suspension from Linkedin.
+    Currently, it just delays the request by a random (bounded) time.
+    """
+    sleep(random.uniform(0.2, 0.7))  # sleep for a random duration to try and evade suspension
+
+
+class CustomLinkedinClient(Linkedin):
+
+    def _fetch(self, uri, evade=my_default_evade, **kwargs):
+        """
+        GET request to Linkedin API
+        """
+        return super()._fetch(uri, evade, **kwargs)
+
+    def _post(self, uri, evade=my_default_evade, **kwargs):
+        """
+        POST request to Linkedin API
+        """
+        return super()._post(uri, evade, **kwargs)
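The evade hook above is passed straight through to linkedin_api's _fetch/_post, so every API call is preceded by a short random sleep. A minimal standalone sketch of how the wrapped client might be exercised, assuming the EMAIL and PASSWORD in conf.py hold valid credentials ('some-public-id' is a placeholder, not a real profile id):

    from conf import EMAIL, PASSWORD
    from linkedin.integration import CustomLinkedinClient

    # same constructor call the spider mixin makes in selenium.py below
    client = CustomLinkedinClient(EMAIL, PASSWORD, debug=True)

    # _fetch defaults evade to my_default_evade, so this GET is delayed
    # by a random 0.2-0.7s before it is issued
    profile = client.get_profile('some-public-id')  # placeholder public id
    print(profile.get('firstName'), profile.get('lastName'))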
diff --git a/linkedin/middlewares.py b/linkedin/middlewares.py
index a07a03e..3c4ea03 100644
--- a/linkedin/middlewares.py
+++ b/linkedin/middlewares.py
@@ -1,8 +1,8 @@
+
 from scrapy.http import HtmlResponse
-from scrapy.http import Response
 from scrapy.utils.python import to_bytes
 
-from linkedin.spiders.selenium import get_by_xpath
+from linkedin.spiders.selenium import get_by_xpath, get_by_xpath_or_none
 
 
 class SeleniumDownloaderMiddleware:
@@ -19,8 +19,12 @@ def process_request(self, request, spider):
         profile_xpath = "//*[@id='nav-settings__dropdown-trigger']/img"
         get_by_xpath(driver, profile_xpath)
 
+        # wait until links to other users are shown so the crawl can continue
+        get_by_xpath_or_none(driver, '//*/span/span/span[1]', wait_timeout=3)
+
         print('SeleniumMiddleware - retrieving body')
         body = to_bytes(driver.page_source)  # body must be of type bytes
 
-        #return Response(driver.current_url)
+        return HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request)
+
 
diff --git a/linkedin/settings.py b/linkedin/settings.py
index 4c7da6d..a3c97ec 100644
--- a/linkedin/settings.py
+++ b/linkedin/settings.py
@@ -14,9 +14,8 @@
 SPIDER_MODULES = ['linkedin.spiders']
 NEWSPIDER_MODULE = 'linkedin.spiders'
 
-
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = 'linkedin (+http://www.yourdomain.com)'
+# USER_AGENT = 'linkedin (+http://www.yourdomain.com)'
 
 # Obey robots.txt rules
 ROBOTSTXT_OBEY = False
@@ -27,7 +26,7 @@
 # Configure a delay for requests for the same website (default: 0)
 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-DOWNLOAD_DELAY = 2
+DOWNLOAD_DELAY = 0.25
 # The download delay setting will honor only one of:
 CONCURRENT_REQUESTS_PER_DOMAIN = 1
 CONCURRENT_REQUESTS_PER_IP = 1
@@ -39,32 +38,32 @@
 TELNETCONSOLE_ENABLED = False
 
 # Override the default request headers:
-#DEFAULT_REQUEST_HEADERS = {
+# DEFAULT_REQUEST_HEADERS = {
 #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 #   'Accept-Language': 'en',
-#}
+# }
 
 # Enable or disable spider middlewares
 # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
-#SPIDER_MIDDLEWARES = {
+# SPIDER_MIDDLEWARES = {
 #    'linkedin.middlewares.MyCustomSpiderMiddleware': 543,
-#}
+# }
 
 # Enable or disable extensions
 # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
-#EXTENSIONS = {
+# EXTENSIONS = {
 #    'scrapy.extensions.telnet.TelnetConsole': None,
-#}
+# }
 
 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
+# ITEM_PIPELINES = {
 #    'linkedin.pipelines.SomePipeline': 300,
-#}
+# }
 
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
-AUTOTHROTTLE_ENABLED = True
+AUTOTHROTTLE_ENABLED = False
 # The initial download delay
 AUTOTHROTTLE_START_DELAY = 5
 # The maximum download delay to be set in case of high latencies
@@ -73,15 +72,15 @@
 # each remote server
 AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
 # Enable showing throttling stats for every response received:
-#AUTOTHROTTLE_DEBUG = False
+# AUTOTHROTTLE_DEBUG = False
 
 # Enable and configure HTTP caching (disabled by default)
 # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
-#HTTPCACHE_ENABLED = True
-#HTTPCACHE_EXPIRATION_SECS = 0
-#HTTPCACHE_DIR = 'httpcache'
-#HTTPCACHE_IGNORE_HTTP_CODES = []
-#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+# HTTPCACHE_ENABLED = True
+# HTTPCACHE_EXPIRATION_SECS = 0
+# HTTPCACHE_DIR = 'httpcache'
+# HTTPCACHE_IGNORE_HTTP_CODES = []
+# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
 
 DOWNLOADER_MIDDLEWARES = {
     'linkedin.middlewares.SeleniumDownloaderMiddleware': 200,
diff --git a/linkedin/spiders/linkedin.py b/linkedin/spiders/linkedin.py
index 794b407..cbe3dd6 100644
--- a/linkedin/spiders/linkedin.py
+++ b/linkedin/spiders/linkedin.py
@@ -7,6 +7,31 @@
 NETWORK_URL = 'https://www.linkedin.com/mynetwork/invite-connect/connections/'
 
 
+def extract_contact_info(api_client, contact_public_id):
+    contact_profile = api_client.get_profile(contact_public_id)
+    contact_info = api_client.get_profile_contact_info(contact_public_id)
+
+    lastName = contact_profile['lastName']
+    firstName = contact_profile['firstName']
+
+    email_address = contact_info['email_address']
+    phone_numbers = contact_info['phone_numbers']
+
+    education = contact_profile['education']
+    experience = contact_profile['experience']
+
+    current_work = [exp for exp in experience if exp.get('timePeriod', {}).get('endDate') is None]
+
+    return dict(lastName=lastName,
+                firstName=firstName,
+                email_address=email_address,
+                phone_numbers=phone_numbers,
+                education=education,
+                experience=experience,
+                current_work=current_work,
+                )
+
+
 class Linkedin(SeleniumSpiderMixin, CrawlSpider):
     name = "linkedin"
     start_urls = [
@@ -15,9 +40,15 @@ class Linkedin(SeleniumSpiderMixin, CrawlSpider):
 
     rules = (
         # Extract links matching a single user
-        Rule(LinkExtractor(allow=('https:\/\/.*\/in\/.*',), deny=('https:\/\/.*\/in\/edit\/.*',)),
+        Rule(LinkExtractor(allow=('https:\/\/.*\/in\/\w*\/$',), deny=('https:\/\/.*\/in\/edit\/.*',)),
+             callback='extract_profile_id_from_url',
+             follow=True,
             ),
     )
 
+    def extract_profile_id_from_url(self, response):
+        # the public profile id is the second-to-last segment of the profile URL
+        profile_id = response.url.split('/')[-2]
+        item = extract_contact_info(self.api_client, profile_id)
-
+        yield item
 
diff --git a/linkedin/spiders/selenium.py b/linkedin/spiders/selenium.py
index b7064be..b1a51b2 100644
--- a/linkedin/spiders/selenium.py
+++ b/linkedin/spiders/selenium.py
@@ -8,6 +8,7 @@
 from selenium.webdriver.support.wait import WebDriverWait
 
 from conf import EMAIL, PASSWORD
+from linkedin.integration import CustomLinkedinClient
 from linkedin.items import LinkedinUser
 
 """
@@ -176,10 +177,8 @@ def __init__(self,
                  selenium_hostname=None,
                  **kwargs):
         self.driver = init_chromium(selenium_hostname)
 
-        # Stop web page from asking me if really want to leave - past implementation, FIREFOX
-        # profile = webdriver.FirefoxProfile()
-        # profile.set_preference('dom.disable_beforeunload', True)
-        # self.driver = webdriver.Firefox(profile)
+        # also initialize the API client
+        self.api_client = CustomLinkedinClient(EMAIL, PASSWORD, debug=True)
 
         login(self.driver)
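The current_work filter in extract_contact_info above treats a position as ongoing when its timePeriod carries no endDate, matching the shape of linkedin_api's get_profile() output. A quick illustration with made-up experience entries:

    # made-up entries, shaped like get_profile()['experience']
    experience = [
        {'companyName': 'Acme', 'timePeriod': {'startDate': {'year': 2018}}},
        {'companyName': 'Oldco', 'timePeriod': {'startDate': {'year': 2015},
                                                'endDate': {'year': 2018}}},
    ]

    # same predicate as the diff: no endDate means the job is current
    current_work = [exp for exp in experience
                    if exp.get('timePeriod', {}).get('endDate') is None]

    assert [exp['companyName'] for exp in current_work] == ['Acme']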
diff --git a/requirements/production.txt b/requirements/production.txt
index 7db0b9c..20ec6f1 100644
--- a/requirements/production.txt
+++ b/requirements/production.txt
@@ -8,3 +8,6 @@ selenium>=3.14.0  # pyup: < 4.0  # https://github.com/SeleniumHQ/selenium
 # todo: move them to a dedicated req file
 pytest==5.3.1
 pytest-sugar==0.9.2
+
+# Linkedin API library
+-e git+https://github.com/tomquirk/linkedin-api.git@f5962d05e92d135b1be21146a5ce9d41eaf6d423#egg=linkedin_api
\ No newline at end of file
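For reference, the tightened allow pattern in the spider's Rule now only matches canonical profile URLs ending in a trailing slash; a quick sanity check against illustrative (non-real) URLs:

    import re

    # allow pattern from the diff; LinkExtractor applies patterns with re.search
    allow = re.compile(r'https:\/\/.*\/in\/\w*\/$')

    assert allow.search('https://www.linkedin.com/in/someuser/')
    assert not allow.search('https://www.linkedin.com/in/someuser')  # no trailing slash
    assert not allow.search('https://www.linkedin.com/in/someuser/detail/contact-info/')

Note that \w does not match hyphens, so public ids such as 'john-doe' fall outside this pattern.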