Added by-name search spider
eracle committed Mar 21, 2020
1 parent dacbec9 commit f2ab33e
Showing 10 changed files with 260 additions and 177 deletions.
6 changes: 6 additions & 0 deletions Makefile
@@ -7,5 +7,11 @@ view:
companies:
scrapy crawl companies -a selenium_hostname=localhost -o users.csv

random:
scrapy crawl random -a selenium_hostname=localhost -o users.csv

byname:
scrapy crawl byname -a selenium_hostname=localhost -o users.csv

tests:
pytest linkedin/tests/*
12 changes: 8 additions & 4 deletions README.md
@@ -24,7 +24,7 @@ Needed:
- python3.6;
- virtualenvs;

###### 0. Preparations;
###### 0. Prepare your environment:

Install docker from the official website [https://www.docker.com/](https://www.docker.com/)

@@ -36,10 +36,10 @@ sudo apt-get update
sudo apt-get install vinagre
```

###### 1. Set your Linkedin login and password;
###### 1. Set up Linkedin login and password:
Copy `conf_template.py` to `conf.py` and fill in the quotes with your credentials.
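For reference, a filled-in `conf.py` might look like the minimal sketch below; the actual variable names are defined by `conf_template.py`, so `EMAIL` and `PASSWORD` here are assumptions, not confirmed names:

```python
# conf.py -- hypothetical sketch; check conf_template.py for the real variable names.
EMAIL = "your-linkedin-login@example.com"  # the LinkedIn account the spiders log in with
PASSWORD = "your-password"                 # keep this file out of version control
```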

###### 2. Run and build containers with docker-compose;
###### 2. Run and build containers with docker-compose:
This runs only the linkedin spider, not the companies spider.
Open your terminal, move to the project folder and type:

@@ -90,7 +90,11 @@ For more details have a look at the Makefile (here is used to shortcut and not t
```
or
```bash
scrapy crawl linkedin -a selenium_hostname=localhost -o output.csv
scrapy crawl random -a selenium_hostname=localhost -o output.csv
```
or
```bash
scrapy crawl byname -a selenium_hostname=localhost -o output.csv
```
## Legal

4 changes: 2 additions & 2 deletions linkedin/middlewares.py
@@ -16,8 +16,8 @@ def process_request(self, request, spider):
# request.meta['driver'] = self.driver # to access driver from response

print('waiting for page loading')
profile_xpath = "//*[@id='nav-settings__dropdown-trigger']/img"
get_by_xpath(driver, profile_xpath)
# profile_xpath = "//*[@id='nav-settings__dropdown-trigger']/img"
# get_by_xpath_or_none(driver, profile_xpath)

spider.wait_page_completion(driver=driver)

40 changes: 40 additions & 0 deletions linkedin/spiders/by_name.py
@@ -0,0 +1,40 @@
import urllib.parse

from scrapy import Request

from linkedin.spiders.search import SearchSpider

NAMES_FILE = 'names.txt'


class ByNameSpider(SearchSpider):
"""
Spider that searches people by name.
"""
name = 'byname'
allowed_domains = ['www.linkedin.com']

start_urls = []

with open(NAMES_FILE, "rt") as f:
names = [name.strip() for name in f if name.strip()]  # drop trailing newlines and blank lines

def start_requests(self):
for name in self.names:
encoded_name = urllib.parse.quote(name.lower())
url = f"https://www.linkedin.com/search/results/people/?origin=GLOBAL_SEARCH_HEADER&keywords={encoded_name}&page=1"

yield Request(url=url,
callback=super().parser_search_results_page,
dont_filter=True,
meta={'max_page': 1},
)

def wait_page_completion(self, driver):
"""
Abstract method, used to customize how the specific spider waits for search page completion.
Blank by default
:param driver:
:return:
"""
pass
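As a quick illustration of what `ByNameSpider` builds for each line of `names.txt`, here is a standalone sketch of the search-URL construction; the sample name is made up:

```python
import urllib.parse

# One raw line as read from names.txt (newline included), as in ByNameSpider.
name = "Jane Doe\n"
encoded_name = urllib.parse.quote(name.strip().lower())  # -> 'jane%20doe'
url = ("https://www.linkedin.com/search/results/people/"
       f"?origin=GLOBAL_SEARCH_HEADER&keywords={encoded_name}&page=1")
print(url)  # the first results page requested for this name
```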
88 changes: 42 additions & 46 deletions linkedin/spiders/companies.py
@@ -1,24 +1,20 @@
# -*- coding: utf-8 -*-
from scrapy import Request
from scrapy.spiders import Spider

from linkedin.spiders.selenium import SeleniumSpiderMixin, extracts_see_all_url, extracts_linkedin_users, \
get_by_xpath_or_none, extract_company
from linkedin.spiders.search import SearchSpider
from linkedin.spiders.selenium import get_by_xpath_or_none, get_by_xpath

"""
Number of seconds to wait checking if the page is a "No Result" type.
"""
NO_RESULT_WAIT_TIMEOUT = 3

URLS_FILE = "urls.txt"

"""
First page to scrape from on the search results list (default to 1).
Placeholder used to recognize the 'See all 27,569 employees on LinkedIn' clickable button,
in the 'https://www.linkedin.com/company/*/' style pages.
"""
FIRST_PAGE_INDEX = 1

URLS_FILE = "urls.txt"
SEE_ALL_PLACEHOLDER = 'See all'


class CompaniesSpider(SeleniumSpiderMixin, Spider):
class CompaniesSpider(SearchSpider):
name = 'companies'
allowed_domains = ['www.linkedin.com']

@@ -27,49 +27,23 @@ class CompaniesSpider(SeleniumSpiderMixin, Spider):

def wait_page_completion(self, driver):
"""
Abstract function, used to customize how the specific spider have to wait for page completion.
Abstract method, used to customize how the specific spider waits for search page completion.
Blank by default
:param driver:
:return:
"""
pass

def parse(self, response):
url = extracts_see_all_url(self.driver) + f'&page={FIRST_PAGE_INDEX}'
url = extracts_see_all_url(self.driver) + f'&page=1'
return Request(url=url,
callback=self.parser_search_results_page,
callback=super().parser_search_results_page,
dont_filter=True,
)

def parser_search_results_page(self, response):
print('Now parsing search result page')

no_result_found_xpath = '//*[text()="No results found."]'

no_result_response = get_by_xpath_or_none(driver=self.driver,
xpath=no_result_found_xpath,
wait_timeout=NO_RESULT_WAIT_TIMEOUT,
logs=False)

if no_result_response is not None:
print('"No results" message shown, stop crawling this company')
return
else:
company = extract_company(self.driver)
print(f'Company:{company}')

users = extracts_linkedin_users(self.driver, company=company, api_client=self.api_client)
for user in users:
yield user

# incrementing the index at the end of the url
url = response.request.url
next_url_split = url.split('=')
index = int(next_url_split[-1])
next_url = '='.join(next_url_split[:-1]) + '=' + str(index + 1)

yield Request(url=next_url,
callback=self.parser_search_results_page,
meta={'company': company},
dont_filter=True,
)
######################
# Module's functions:
######################

def extracts_see_all_url(driver):
"""
Retrieve, from the company front page, the URL of the page containing the list of its employees.
:param driver: The already opened (and logged in) webdriver, already located at the company's front page.
:return: String: The "See All" URL.
"""
print('Searching for the "See all * employees on LinkedIn" btn')
see_all_xpath = f'//*[starts-with(text(),"{SEE_ALL_PLACEHOLDER}")]'
see_all_elem = get_by_xpath(driver, see_all_xpath)
see_all_ex_text = see_all_elem.text

a_elem = driver.find_element_by_link_text(see_all_ex_text)
see_all_url = a_elem.get_attribute('href')
print(f'Found the following URL: {see_all_url}')
return see_all_url


def extract_company(driver):
"""
Extract company name from a search result page.
:param driver: The selenium webdriver.
:return: The company name string, or None if it could not be extracted.
"""
company_xpath = '//li[@class="search-s-facet search-s-facet--facetCurrentCompany inline-block ' \
'search-s-facet--is-closed ember-view"]/form/button/div/div/h3 '
company_elem = get_by_xpath_or_none(driver, company_xpath)
return company_elem.text if company_elem is not None else None
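To show what the `starts-with` XPath in `extracts_see_all_url` matches, here is a self-contained sketch run against a made-up HTML fragment with lxml (which Scrapy installs as a dependency) instead of a live company page:

```python
from lxml import html

# Hypothetical fragment standing in for the company front page.
fragment = html.fromstring(
    '<div>'
    '<a href="/search/results/people/?facetCurrentCompany=%5B%22221027%22%5D">'
    'See all 27,569 employees on LinkedIn</a>'
    '</div>'
)
matches = fragment.xpath('//*[starts-with(text(), "See all")]')
print(matches[0].get('href'))  # the employees-list URL the spider then paginates through
```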
33 changes: 6 additions & 27 deletions linkedin/spiders/linkedin.py → linkedin/spiders/random.py
@@ -2,38 +2,17 @@
from scrapy.spiders import CrawlSpider
from scrapy.spiders import Rule

from linkedin.spiders.search import extract_contact_info
from linkedin.spiders.selenium import SeleniumSpiderMixin, get_by_xpath_or_none

"""
Variable holding where to search for first profiles to scrape.
"""
NETWORK_URL = 'https://www.linkedin.com/mynetwork/invite-connect/connections/'


def extract_contact_info(api_client, contact_public_id):
contact_profile = api_client.get_profile(contact_public_id)
contact_info = api_client.get_profile_contact_info(contact_public_id)

lastName = contact_profile['lastName']
firstName = contact_profile['firstName']

email_address = contact_info['email_address']
phone_numbers = contact_info['phone_numbers']

education = contact_profile['education']
experience = contact_profile['experience']

current_work = [exp for exp in experience if exp.get('timePeriod', {}).get('endDate') is None]

return dict(lastName=lastName,
firstName=firstName,
email_address=email_address,
phone_numbers=phone_numbers,
education=education,
experience=experience,
current_work=current_work,
)


class Linkedin(SeleniumSpiderMixin, CrawlSpider):
name = "linkedin"
class RandomSpider(SeleniumSpiderMixin, CrawlSpider):
name = "random"
start_urls = [
NETWORK_URL,
]
129 changes: 129 additions & 0 deletions linkedin/spiders/search.py
@@ -0,0 +1,129 @@
import time

from scrapy import Spider
from scrapy import Request

from linkedin.spiders.selenium import get_by_xpath_or_none, SeleniumSpiderMixin

"""
Number of seconds to wait checking if the page is a "No Result" type.
"""
NO_RESULT_WAIT_TIMEOUT = 3


class SearchSpider(SeleniumSpiderMixin, Spider):
"""
Abstract class for generic searches on LinkedIn.
"""

def parser_search_results_page(self, response):
print('Now parsing search result page')

no_result_found_xpath = '//*[text()="No results found."]'

no_result_response = get_by_xpath_or_none(driver=self.driver,
xpath=no_result_found_xpath,
wait_timeout=NO_RESULT_WAIT_TIMEOUT,
logs=False)

if no_result_response is not None:
print('"No results" message shown, stop crawling this company')
return
else:
# company extraction temporary disabled
# company = extract_company(self.driver)
# print(f'Company:{company}')

users = extracts_linkedin_users(self.driver,
#company=company,
api_client=self.api_client)
for user in users:
yield user


# incrementing the index at the end of the url
url = response.request.url
next_url_split = url.split('=')
index = int(next_url_split[-1])
next_url = '='.join(next_url_split[:-1]) + '=' + str(index + 1)

max_page = response.meta.get('max_page', None)
if max_page is not None:
if index >= max_page:
return

yield Request(url=next_url,
callback=self.parser_search_results_page,
meta={'max_page': max_page},
dont_filter=True,
)

######################
# Module's functions:
######################
def extracts_linkedin_users(driver, api_client, company=None):
"""
Extracts all the users from a page that contains a list of users.
For instance: https://www.linkedin.com/search/results/people/?facetCurrentCompany=[%22221027%22]
:param driver: The webdriver, logged in and located on the page that lists the users.
:param api_client: The authenticated API client used to fetch each user's profile.
:return: Iterator over the extracted users.
"""

for i in range(1, 11):
print(f'loading user {i}')

last_result_xpath = f'//li[{i}]/*/div[@class="search-result__wrapper"]'

result = get_by_xpath_or_none(driver, last_result_xpath)
if result is not None:
link_elem = get_by_xpath_or_none(result, './/*[@class="search-result__result-link ember-view"]')
link = link_elem.get_attribute('href') if link_elem is not None else None

name_elem = get_by_xpath_or_none(result, './/*[@class="name actor-name"]')
name = name_elem.text if name_elem is not None else None

title_elem = get_by_xpath_or_none(result, './/p')
title = title_elem.text if title_elem is not None else None

# extract_profile_id_from_url
profile_id = link.split('/')[-2]
user = extract_contact_info(api_client, profile_id)

yield user

if link_elem is not None:
driver.execute_script("arguments[0].scrollIntoView();", link_elem)
elif name_elem is not None:
driver.execute_script("arguments[0].scrollIntoView();", name_elem)
elif title_elem is not None:
driver.execute_script("arguments[0].scrollIntoView();", title_elem)
else:
print("Was not possible to scroll")

time.sleep(0.7)


def extract_contact_info(api_client, contact_public_id):
contact_profile = api_client.get_profile(contact_public_id)
contact_info = api_client.get_profile_contact_info(contact_public_id)

lastName = contact_profile['lastName']
firstName = contact_profile['firstName']

email_address = contact_info['email_address']
phone_numbers = contact_info['phone_numbers']

education = contact_profile['education']
experience = contact_profile['experience']

current_work = [exp for exp in experience if exp.get('timePeriod', {}).get('endDate') is None]

return dict(lastName=lastName,
firstName=firstName,
email_address=email_address,
phone_numbers=phone_numbers,
education=education,
experience=experience,
current_work=current_work,
)
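To make the pagination step in `parser_search_results_page` concrete, here is a minimal standalone sketch of how the next-page URL is derived from the current one; the sample URL is made up:

```python
# The spider assumes every search URL ends in "...&page=N" and bumps N by one.
url = "https://www.linkedin.com/search/results/people/?keywords=jane%20doe&page=3"
next_url_split = url.split('=')
index = int(next_url_split[-1])                                  # 3
next_url = '='.join(next_url_split[:-1]) + '=' + str(index + 1)  # ...&page=4
print(next_url)
```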
