linkedin crawler now leverages linkedin-api functionalities
eracle committed Jan 2, 2020
1 parent 9a8e919 commit b99f43b
Showing 7 changed files with 92 additions and 27 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -7,3 +7,5 @@ selenium_prototype.py
.pytest_cache/
interactive.py
users.csv
.vscode
.dockerignore
27 changes: 27 additions & 0 deletions linkedin/integration.py
@@ -0,0 +1,27 @@
import random
from time import sleep

from linkedin_api import Linkedin


def my_default_evade():
    """
    A catch-all method to try and evade suspension from Linkedin.
    Currently, it just delays the request by a random (bounded) amount of time.
    """
    sleep(random.uniform(0.2, 0.7))  # sleep for a random duration to try and evade suspension


class CustomLinkedinClient(Linkedin):

    def _fetch(self, uri, evade=my_default_evade, **kwargs):
        """
        GET request to the Linkedin API.
        """
        return super()._fetch(uri, evade, **kwargs)

    def _post(self, uri, evade=my_default_evade, **kwargs):
        """
        POST request to the Linkedin API.
        """
        return super()._post(uri, evade, **kwargs)
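For context, a minimal usage sketch of this client. The credentials and the public id below are placeholders; elsewhere in this commit the spider builds the client from EMAIL and PASSWORD in conf.py and calls get_profile / get_profile_contact_info.

from linkedin.integration import CustomLinkedinClient

# placeholder credentials; the spider reads the real ones from conf.py
client = CustomLinkedinClient('you@example.com', 'secret', debug=True)

# every request issued by the client now goes through my_default_evade,
# which sleeps 0.2-0.7 seconds before hitting the API
profile = client.get_profile('some-public-id')  # hypothetical public id
contact_info = client.get_profile_contact_info('some-public-id')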
10 changes: 7 additions & 3 deletions linkedin/middlewares.py
@@ -1,8 +1,8 @@

from scrapy.http import HtmlResponse
from scrapy.http import Response
from scrapy.utils.python import to_bytes

from linkedin.spiders.selenium import get_by_xpath
from linkedin.spiders.selenium import get_by_xpath, get_by_xpath_or_none


class SeleniumDownloaderMiddleware:
@@ -19,8 +19,12 @@ def process_request(self, request, spider):
        profile_xpath = "//*[@id='nav-settings__dropdown-trigger']/img"
        get_by_xpath(driver, profile_xpath)

        # wait until links to other users are shown so the crawl can continue
        get_by_xpath_or_none(driver, '//*/span/span/span[1]', wait_timeout=3)

        print('SeleniumMiddleware - retrieving body')
        body = to_bytes(driver.page_source)  # body must be of type bytes
        # return Response(driver.current_url)

        return HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request)
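The helper get_by_xpath_or_none lives in linkedin/spiders/selenium.py and is not shown in this diff. A plausible sketch of its shape, assuming it wraps WebDriverWait and swallows the timeout; only the wait_timeout keyword is confirmed by the call above.

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.wait import WebDriverWait


def get_by_xpath_or_none(driver, xpath, wait_timeout=10):
    """Return the element matching xpath, or None if it never shows up (assumed behaviour)."""
    try:
        return WebDriverWait(driver, wait_timeout).until(
            ec.presence_of_element_located((By.XPATH, xpath))
        )
    except TimeoutException:
        return None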


35 changes: 17 additions & 18 deletions linkedin/settings.py
@@ -14,9 +14,8 @@
SPIDER_MODULES = ['linkedin.spiders']
NEWSPIDER_MODULE = 'linkedin.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'linkedin (+http://www.yourdomain.com)'
# USER_AGENT = 'linkedin (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
@@ -27,7 +26,7 @@
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 2
DOWNLOAD_DELAY = 0.25
# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 1
CONCURRENT_REQUESTS_PER_IP = 1
@@ -39,32 +38,32 @@
TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# }

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# SPIDER_MIDDLEWARES = {
# 'linkedin.middlewares.MyCustomSpiderMiddleware': 543,
#}
# }

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# }

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# ITEM_PIPELINES = {
# 'linkedin.pipelines.SomePipeline': 300,
#}
# }

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_ENABLED = False
# The initial download delay
AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
@@ -73,15 +72,15 @@
# each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

DOWNLOADER_MIDDLEWARES = {
'linkedin.middlewares.SeleniumDownloaderMiddleware': 200,
35 changes: 33 additions & 2 deletions linkedin/spiders/linkedin.py
@@ -7,6 +7,31 @@
NETWORK_URL = 'https://www.linkedin.com/mynetwork/invite-connect/connections/'


def extract_contact_info(api_client, contact_public_id):
    contact_profile = api_client.get_profile(contact_public_id)
    contact_info = api_client.get_profile_contact_info(contact_public_id)

    lastName = contact_profile['lastName']
    firstName = contact_profile['firstName']

    email_address = contact_info['email_address']
    phone_numbers = contact_info['phone_numbers']

    education = contact_profile['education']
    experience = contact_profile['experience']

    # experiences without an end date are the contact's current positions
    current_work = [exp for exp in experience if exp.get('timePeriod', {}).get('endDate') is None]

    return dict(lastName=lastName,
                firstName=firstName,
                email_address=email_address,
                phone_numbers=phone_numbers,
                education=education,
                experience=experience,
                current_work=current_work,
                )


class Linkedin(SeleniumSpiderMixin, CrawlSpider):
    name = "linkedin"
    start_urls = [
@@ -15,9 +40,15 @@ class Linkedin(SeleniumSpiderMixin, CrawlSpider):

    rules = (
        # Extract links matching a single user
        Rule(LinkExtractor(allow=('https:\/\/.*\/in\/.*',), deny=('https:\/\/.*\/in\/edit\/.*',)),
        Rule(LinkExtractor(allow=('https:\/\/.*\/in\/\w*\/$',), deny=('https:\/\/.*\/in\/edit\/.*',)),
             callback='extract_profile_id_from_url',
             follow=True,
             ),
    )

    def extract_profile_id_from_url(self, response):
        # the public profile id is the second-to-last segment of the profile url
        profile_id = response.url.split('/')[-2]
        item = extract_contact_info(self.api_client, profile_id)

        yield item
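The tightened allow pattern now only follows canonical profile URLs ending in /in/<id>/ instead of every /in/ sub-page. A quick check with re.search, which is how LinkExtractor applies allow patterns; the example URLs are made up, and note that \w does not match hyphens, so hyphenated public ids fall outside this pattern.

import re

ALLOW = r'https:\/\/.*\/in\/\w*\/$'

assert re.search(ALLOW, 'https://www.linkedin.com/in/johndoe/')                          # followed
assert not re.search(ALLOW, 'https://www.linkedin.com/in/johndoe/detail/contact-info/')  # skipped
assert not re.search(ALLOW, 'https://www.linkedin.com/in/john-doe/')                     # skipped: '-' is not \w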
7 changes: 3 additions & 4 deletions linkedin/spiders/selenium.py
@@ -8,6 +8,7 @@
from selenium.webdriver.support.wait import WebDriverWait

from conf import EMAIL, PASSWORD
from linkedin.integration import CustomLinkedinClient
from linkedin.items import LinkedinUser

"""
@@ -176,10 +177,8 @@ def __init__(self, selenium_hostname=None, **kwargs):

        self.driver = init_chromium(selenium_hostname)

        # Stop web page from asking me if really want to leave - past implementation, FIREFOX
        # profile = webdriver.FirefoxProfile()
        # profile.set_preference('dom.disable_beforeunload', True)
        # self.driver = webdriver.Firefox(profile)
        # also initialize the linkedin-api client
        self.api_client = CustomLinkedinClient(EMAIL, PASSWORD, debug=True)

        login(self.driver)

3 changes: 3 additions & 0 deletions requirements/production.txt
@@ -8,3 +8,6 @@ selenium>=3.14.0 # pyup: < 4.0 # https://github.com/SeleniumHQ/selenium
# todo: move them to a dedicated req file
pytest==5.3.1
pytest-sugar==0.9.2

# Linkedin API library
-e git+https://github.com/tomquirk/linkedin-api.git@f5962d05e92d135b1be21146a5ce9d41eaf6d423#egg=linkedin_api
