Skip to content

Commit

Permalink
company spider now uses linkedin api client
Browse files Browse the repository at this point in the history
  • Loading branch information
eracle committed Jan 23, 2020
1 parent f99302d commit 5c469ff
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 11 deletions.
4 changes: 1 addition & 3 deletions linkedin/middlewares.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,10 @@ def process_request(self, request, spider):
profile_xpath = "//*[@id='nav-settings__dropdown-trigger']/img"
get_by_xpath(driver, profile_xpath)

# waiting links to other users are shown so the crawl can continue
get_by_xpath_or_none(driver, '//*/span/span/span[1]', wait_timeout=3)
spider.wait_page_completion(driver=driver)

print('SeleniumMiddleware - retrieving body')
body = to_bytes(driver.page_source) # body must be of type bytes

return HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request)


13 changes: 11 additions & 2 deletions linkedin/spiders/companies.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from scrapy.spiders import Spider

from linkedin.spiders.selenium import SeleniumSpiderMixin, extracts_see_all_url, extracts_linkedin_users, \
get_by_xpath_or_none, wait_invisibility_xpath, extract_company
get_by_xpath_or_none, extract_company

"""
Number of seconds to wait checking if the page is a "No Result" type.
Expand All @@ -25,6 +25,15 @@ class CompaniesSpider(SeleniumSpiderMixin, Spider):
with open(URLS_FILE, "rt") as f:
start_urls = [url.strip() for url in f]

def wait_page_completion(self, driver):
    """
    Hook that lets each spider define how to wait for a page to finish loading.

    This spider does not need any extra waiting, so the hook is a no-op.

    :param driver: the Selenium webdriver positioned on the page just loaded.
    :return: None
    """
    pass

def parse(self, response):
url = extracts_see_all_url(self.driver) + f'&page={FIRST_PAGE_INDEX}'
return Request(url=url,
Expand All @@ -49,7 +58,7 @@ def parser_search_results_page(self, response):
company = extract_company(self.driver)
print(f'Company:{company}')

users = extracts_linkedin_users(self.driver, company=company)
users = extracts_linkedin_users(self.driver, company=company, api_client=self.api_client)
for user in users:
yield user

Expand Down
12 changes: 11 additions & 1 deletion linkedin/spiders/linkedin.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from scrapy.spiders import CrawlSpider
from scrapy.spiders import Rule

from linkedin.spiders.selenium import SeleniumSpiderMixin
from linkedin.spiders.selenium import SeleniumSpiderMixin, get_by_xpath_or_none

NETWORK_URL = 'https://www.linkedin.com/mynetwork/invite-connect/connections/'

Expand Down Expand Up @@ -46,6 +46,16 @@ class Linkedin(SeleniumSpiderMixin, CrawlSpider):
),
)

def wait_page_completion(self, driver):
    """
    Hook that lets each spider define how to wait for a page to finish loading.

    For this spider, the page is considered ready once links to other users
    are rendered, so the crawl can continue from them. If the element does
    not appear within the timeout, the wait gives up silently
    (get_by_xpath_or_none returns None rather than raising).

    :param driver: the Selenium webdriver positioned on the page just loaded.
    :return: None
    """
    # waiting links to other users are shown so the crawl can continue
    get_by_xpath_or_none(driver, '//*/span/span/span[1]', wait_timeout=3)

def extract_profile_id_from_url(self, response):
# extract_profile_id_from_url
profile_id = response.url.split('/')[-2]
Expand Down
9 changes: 5 additions & 4 deletions linkedin/spiders/selenium.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@

from conf import EMAIL, PASSWORD
from linkedin.integration import CustomLinkedinClient
from linkedin.items import LinkedinUser

"""
number of seconds used to wait the web page's loading.
Expand Down Expand Up @@ -124,13 +123,14 @@ def extracts_see_all_url(driver):
return see_all_url


def extracts_linkedin_users(driver, company):
def extracts_linkedin_users(driver, company, api_client):
"""
Gets from a page containing a list of users, all the users.
For instance: https://www.linkedin.com/search/results/people/?facetCurrentCompany=[%22221027%22]
:param driver: The webdriver, logged in, and located in the page which lists users.
:return: Iterator on LinkedinUser.
"""
from linkedin.spiders.linkedin import extract_contact_info

for i in range(1, 11):
print(f'loading {i}th user')
Expand All @@ -140,7 +140,6 @@ def extracts_linkedin_users(driver, company):
result = get_by_xpath_or_none(driver, last_result_xpath)
if result is not None:


link_elem = get_by_xpath_or_none(result, './/*[@class="search-result__result-link ember-view"]')
link = link_elem.get_attribute('href') if link_elem is not None else None

Expand All @@ -150,7 +149,9 @@ def extracts_linkedin_users(driver, company):
title_elem = get_by_xpath_or_none(result, './/p')
title = title_elem.text if name_elem is not None else None

user = LinkedinUser(name=name, title=title, company=company, link=link)
# extract_profile_id_from_url
profile_id = link.split('/')[-2]
user = extract_contact_info(api_client, profile_id)

yield user

Expand Down
2 changes: 1 addition & 1 deletion urls.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
https://www.linkedin.com/company/google
https://www.linkedin.com/company/twitter

0 comments on commit 5c469ff

Please sign in to comment.