linkedin crawler now leverages linkedin-api functionalities
eracle committed Jan 2, 2020
1 parent 9a8e919 commit b99f43b
Showing 7 changed files with 92 additions and 27 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -7,3 +7,5 @@ selenium_prototype.py
.pytest_cache/
interactive.py
users.csv
.vscode
.dockerignore
27 changes: 27 additions & 0 deletions linkedin/integration.py
@@ -0,0 +1,27 @@
import random
from time import sleep

from linkedin_api import Linkedin


def my_default_evade():
    """
    A catch-all method to try and evade suspension from Linkedin.
    Currently, it just delays the request by a random (bounded) amount of time.
    """
    sleep(random.uniform(0.2, 0.7))  # sleep for a random duration to try and evade suspension


class CustomLinkedinClient(Linkedin):

    def _fetch(self, uri, evade=my_default_evade, **kwargs):
        """
        GET request to the Linkedin API.
        """
        return super()._fetch(uri, evade, **kwargs)

    def _post(self, uri, evade=my_default_evade, **kwargs):
        """
        POST request to the Linkedin API.
        """
        return super()._post(uri, evade, **kwargs)
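For context, a minimal usage sketch of this client. The credentials and the public id below are placeholders; elsewhere in this commit the spider builds the client from EMAIL and PASSWORD in conf.py and calls get_profile / get_profile_contact_info.

from linkedin.integration import CustomLinkedinClient

# placeholder credentials; the spider reads the real ones from conf.py
client = CustomLinkedinClient('you@example.com', 'secret', debug=True)

# every request issued by the client now goes through my_default_evade,
# which sleeps 0.2-0.7 seconds before hitting the API
profile = client.get_profile('some-public-id')  # hypothetical public id
contact_info = client.get_profile_contact_info('some-public-id')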
10 changes: 7 additions & 3 deletions linkedin/middlewares.py
@@ -1,8 +1,8 @@

from scrapy.http import HtmlResponse
from scrapy.http import Response
from scrapy.utils.python import to_bytes

from linkedin.spiders.selenium import get_by_xpath
from linkedin.spiders.selenium import get_by_xpath, get_by_xpath_or_none


class SeleniumDownloaderMiddleware:
@@ -19,8 +19,12 @@ def process_request(self, request, spider):
        profile_xpath = "//*[@id='nav-settings__dropdown-trigger']/img"
        get_by_xpath(driver, profile_xpath)

        # wait until links to other users are shown so the crawl can continue
        get_by_xpath_or_none(driver, '//*/span/span/span[1]', wait_timeout=3)

        print('SeleniumMiddleware - retrieving body')
        body = to_bytes(driver.page_source)  # body must be of type bytes
        # return Response(driver.current_url)

        return HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request)
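The helper get_by_xpath_or_none lives in linkedin/spiders/selenium.py and is not shown in this diff. A plausible sketch of its shape, assuming it wraps WebDriverWait and swallows the timeout; only the wait_timeout keyword is confirmed by the call above.

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.wait import WebDriverWait


def get_by_xpath_or_none(driver, xpath, wait_timeout=10):
    """Return the element matching xpath, or None if it never shows up (assumed behaviour)."""
    try:
        return WebDriverWait(driver, wait_timeout).until(
            ec.presence_of_element_located((By.XPATH, xpath))
        )
    except TimeoutException:
        return None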


35 changes: 17 additions & 18 deletions linkedin/settings.py
@@ -14,9 +14,8 @@
SPIDER_MODULES = ['linkedin.spiders']
NEWSPIDER_MODULE = 'linkedin.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'linkedin (+http://www.yourdomain.com)'
# USER_AGENT = 'linkedin (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
@@ -27,7 +26,7 @@
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 2
DOWNLOAD_DELAY = 0.25
# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 1
CONCURRENT_REQUESTS_PER_IP = 1
@@ -39,32 +38,32 @@
TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# }

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# SPIDER_MIDDLEWARES = {
# 'linkedin.middlewares.MyCustomSpiderMiddleware': 543,
#}
# }

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# }

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# ITEM_PIPELINES = {
# 'linkedin.pipelines.SomePipeline': 300,
#}
# }

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_ENABLED = False
# The initial download delay
AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
@@ -73,15 +72,15 @@
# each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

DOWNLOADER_MIDDLEWARES = {
'linkedin.middlewares.SeleniumDownloaderMiddleware': 200,
35 changes: 33 additions & 2 deletions linkedin/spiders/linkedin.py
@@ -7,6 +7,31 @@
NETWORK_URL = 'https://www.linkedin.com/mynetwork/invite-connect/connections/'


def extract_contact_info(api_client, contact_public_id):
    contact_profile = api_client.get_profile(contact_public_id)
    contact_info = api_client.get_profile_contact_info(contact_public_id)

    lastName = contact_profile['lastName']
    firstName = contact_profile['firstName']

    email_address = contact_info['email_address']
    phone_numbers = contact_info['phone_numbers']

    education = contact_profile['education']
    experience = contact_profile['experience']

    # experiences without an end date are the contact's current positions
    current_work = [exp for exp in experience if exp.get('timePeriod', {}).get('endDate') is None]

    return dict(lastName=lastName,
                firstName=firstName,
                email_address=email_address,
                phone_numbers=phone_numbers,
                education=education,
                experience=experience,
                current_work=current_work,
                )


class Linkedin(SeleniumSpiderMixin, CrawlSpider):
    name = "linkedin"
    start_urls = [
@@ -15,9 +40,15 @@ class Linkedin(SeleniumSpiderMixin, CrawlSpider):

    rules = (
        # Extract links matching a single user
        Rule(LinkExtractor(allow=('https:\/\/.*\/in\/.*',), deny=('https:\/\/.*\/in\/edit\/.*',)),
        Rule(LinkExtractor(allow=('https:\/\/.*\/in\/\w*\/$',), deny=('https:\/\/.*\/in\/edit\/.*',)),
             callback='extract_profile_id_from_url',
             follow=True,
             ),
    )

    def extract_profile_id_from_url(self, response):
        # the public profile id is the second-to-last segment of the profile url
        profile_id = response.url.split('/')[-2]
        item = extract_contact_info(self.api_client, profile_id)

        yield item
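The tightened allow pattern now only follows canonical profile URLs ending in /in/<id>/ instead of every /in/ sub-page. A quick check with re.search, which is how LinkExtractor applies allow patterns; the example URLs are made up, and note that \w does not match hyphens, so hyphenated public ids fall outside this pattern.

import re

ALLOW = r'https:\/\/.*\/in\/\w*\/$'

assert re.search(ALLOW, 'https://www.linkedin.com/in/johndoe/')                          # followed
assert not re.search(ALLOW, 'https://www.linkedin.com/in/johndoe/detail/contact-info/')  # skipped
assert not re.search(ALLOW, 'https://www.linkedin.com/in/john-doe/')                     # skipped: '-' is not \w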
7 changes: 3 additions & 4 deletions linkedin/spiders/selenium.py
@@ -8,6 +8,7 @@
from selenium.webdriver.support.wait import WebDriverWait

from conf import EMAIL, PASSWORD
from linkedin.integration import CustomLinkedinClient
from linkedin.items import LinkedinUser

"""
@@ -176,10 +177,8 @@ def __init__(self, selenium_hostname=None, **kwargs):

        self.driver = init_chromium(selenium_hostname)

        # Stop web page from asking me if really want to leave - past implementation, FIREFOX
        # profile = webdriver.FirefoxProfile()
        # profile.set_preference('dom.disable_beforeunload', True)
        # self.driver = webdriver.Firefox(profile)
        # also initialize the linkedin-api client
        self.api_client = CustomLinkedinClient(EMAIL, PASSWORD, debug=True)

        login(self.driver)

3 changes: 3 additions & 0 deletions requirements/production.txt
@@ -8,3 +8,6 @@ selenium>=3.14.0 # pyup: < 4.0 # https://github.com/SeleniumHQ/selenium
# todo: move them to a dedicated req file
pytest==5.3.1
pytest-sugar==0.9.2

# Linkedin API library
-e git+https://github.com/tomquirk/linkedin-api.git@f5962d05e92d135b1be21146a5ce9d41eaf6d423#egg=linkedin_api
