diff --git a/Makefile b/Makefile
index ee008be..ed5c6fe 100644
--- a/Makefile
+++ b/Makefile
@@ -7,5 +7,11 @@ view:
 companies:
 	scrapy crawl companies -a selenium_hostname=localhost -o users.csv
 
+random:
+	scrapy crawl random -a selenium_hostname=localhost -o users.csv
+
+byname:
+	scrapy crawl byname -a selenium_hostname=localhost -o users.csv
+
 tests:
 	pytest linkedin/tests/*
diff --git a/README.md b/README.md
index b0c1dea..18c8fbb 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@ Needed:
 - python3.6;
 - virtualenvs;
 
-###### 0. Preparations;
+###### 0. Prepare your environment:
 
 Install docker from the official website [https://www.docker.com/](https://www.docker.com/)
@@ -36,10 +36,10 @@
 sudo apt-get update
 sudo apt-get install vinagre
 ```
 
-###### 1. Set your Linkedin login and password;
+###### 1. Set up your Linkedin login and password:
 Copy `conf_template.py` in `conf.py` and fill the quotes with your credentials.
 
-###### 2. Run and build containers with docker-compose;
+###### 2. Run and build containers with docker-compose:
 Only linkedin spider, not the companies spider.
 Open your terminal, move to the project folder and type:
@@ -90,7 +90,11 @@ For more details have a look at the Makefile (here is used to shortcut and not t
 ```
 or
 ```bash
-    scrapy crawl linkedin -a selenium_hostname=localhost -o output.csv
+    scrapy crawl random -a selenium_hostname=localhost -o output.csv
+```
+or
+```bash
+    scrapy crawl byname -a selenium_hostname=localhost -o output.csv
 ```
 
 ## Legal
diff --git a/linkedin/middlewares.py b/linkedin/middlewares.py
index 6ec4980..3685a87 100644
--- a/linkedin/middlewares.py
+++ b/linkedin/middlewares.py
@@ -16,8 +16,8 @@ def process_request(self, request, spider):
         # request.meta['driver'] = self.driver  # to access driver from response
         print('waiting for page loading')
 
-        profile_xpath = "//*[@id='nav-settings__dropdown-trigger']/img"
-        get_by_xpath(driver, profile_xpath)
+        # profile_xpath = "//*[@id='nav-settings__dropdown-trigger']/img"
+        # get_by_xpath_or_none(driver, profile_xpath)
 
         spider.wait_page_completion(driver=driver)
diff --git a/linkedin/spiders/by_name.py b/linkedin/spiders/by_name.py
new file mode 100644
index 0000000..833478c
--- /dev/null
+++ b/linkedin/spiders/by_name.py
@@ -0,0 +1,40 @@
+import urllib.parse
+
+from scrapy import Request
+
+from linkedin.spiders.search import SearchSpider
+
+NAMES_FILE = 'names.txt'
+
+
+class ByNameSpider(SearchSpider):
+    """
+    Spider that searches people by name.
+    """
+    name = 'byname'
+    allowed_domains = ['www.linkedin.com']
+
+    start_urls = []
+
+    with open(NAMES_FILE, "rt") as f:
+        names = [name.strip() for name in f if name.strip()]
+
+    def start_requests(self):
+        for name in self.names:
+            encoded_name = urllib.parse.quote(name.lower())
+            url = f"https://www.linkedin.com/search/results/people/?origin=GLOBAL_SEARCH_HEADER&keywords={encoded_name}&page=1"
+
+            yield Request(url=url,
+                          callback=super().parser_search_results_page,
+                          dont_filter=True,
+                          meta={'max_page': 1},
+                          )
+
+    def wait_page_completion(self, driver):
+        """
+        Abstract method, used to customize how the specific spider waits for page completion.
+        Blank by default.
+        :param driver:
+        :return:
+        """
+        pass
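For reference, the URL construction in `ByNameSpider.start_requests` can be exercised on its own. A minimal standalone sketch (plain Python, not part of the patch), assuming a `names.txt` like the one added at the end of this diff:

```python
import urllib.parse

# One full name per line, blanks and trailing newlines dropped,
# exactly as ByNameSpider loads the file at class-definition time.
with open('names.txt', 'rt') as f:
    names = [line.strip() for line in f if line.strip()]

for name in names:
    encoded_name = urllib.parse.quote(name.lower())
    url = ("https://www.linkedin.com/search/results/people/"
           f"?origin=GLOBAL_SEARCH_HEADER&keywords={encoded_name}&page=1")
    print(url)  # 'Mario Rossi' -> ...keywords=mario%20rossi&page=1
```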
diff --git a/linkedin/spiders/companies.py b/linkedin/spiders/companies.py
index ec68c89..3071c9d 100644
--- a/linkedin/spiders/companies.py
+++ b/linkedin/spiders/companies.py
@@ -1,24 +1,20 @@
 # -*- coding: utf-8 -*-
 from scrapy import Request
-from scrapy.spiders import Spider
 
-from linkedin.spiders.selenium import SeleniumSpiderMixin, extracts_see_all_url, extracts_linkedin_users, \
-    get_by_xpath_or_none, extract_company
+from linkedin.spiders.search import SearchSpider
+from linkedin.spiders.selenium import get_by_xpath_or_none, get_by_xpath
 
-"""
-Number of seconds to wait checking if the page is a "No Result" type.
-"""
-NO_RESULT_WAIT_TIMEOUT = 3
+
+URLS_FILE = "urls.txt"
 
 """
-First page to scrape from on the search results list (default to 1).
+Placeholder used to recognize the 'See all 27,569 employees on LinkedIn' clickable button,
+in the 'https://www.linkedin.com/company/*/' style pages.
 """
-FIRST_PAGE_INDEX = 1
-
-URLS_FILE = "urls.txt"
+SEE_ALL_PLACEHOLDER = 'See all'
 
 
-class CompaniesSpider(SeleniumSpiderMixin, Spider):
+class CompaniesSpider(SearchSpider):
     name = 'companies'
     allowed_domains = ['www.linkedin.com']
@@ -27,7 +23,7 @@ class CompaniesSpider(SeleniumSpiderMixin, Spider):
 
     def wait_page_completion(self, driver):
         """
-        Abstract function, used to customize how the specific spider have to wait for page completion.
+        Abstract method, used to customize how the specific spider waits for page completion.
         Blank by default
         :param driver:
         :return:
@@ -35,41 +31,41 @@ def wait_page_completion(self, driver):
         pass
 
     def parse(self, response):
-        url = extracts_see_all_url(self.driver) + f'&page={FIRST_PAGE_INDEX}'
+        url = extracts_see_all_url(self.driver) + '&page=1'
         return Request(url=url,
-                       callback=self.parser_search_results_page,
+                       callback=super().parser_search_results_page,
                        dont_filter=True,
                        )
 
-    def parser_search_results_page(self, response):
-        print('Now parsing search result page')
-
-        no_result_found_xpath = '//*[text()="No results found."]'
-
-        no_result_response = get_by_xpath_or_none(driver=self.driver,
-                                                  xpath=no_result_found_xpath,
-                                                  wait_timeout=NO_RESULT_WAIT_TIMEOUT,
-                                                  logs=False)
-
-        if no_result_response is not None:
-            print('"No results" message shown, stop crawling this company')
-            return
-        else:
-            company = extract_company(self.driver)
-            print(f'Company:{company}')
-
-            users = extracts_linkedin_users(self.driver, company=company, api_client=self.api_client)
-            for user in users:
-                yield user
-
-            # incrementing the index at the end of the url
-            url = response.request.url
-            next_url_split = url.split('=')
-            index = int(next_url_split[-1])
-            next_url = '='.join(next_url_split[:-1]) + '=' + str(index + 1)
-            yield Request(url=next_url,
-                          callback=self.parser_search_results_page,
-                          meta={'company': company},
-                          dont_filter=True,
-                          )
+
+######################
+# Module's functions:
+######################
+
+def extracts_see_all_url(driver):
+    """
+    Retrieve from the company front page the URL of the page containing the list of its employees.
+    :param driver: The already opened (and logged in) webdriver, already located at the company's front page.
+    :return: String: the "See all" URL.
+    """
+    print('Searching for the "See all * employees on LinkedIn" btn')
+    see_all_xpath = f'//*[starts-with(text(),"{SEE_ALL_PLACEHOLDER}")]'
+    see_all_elem = get_by_xpath(driver, see_all_xpath)
+    see_all_ex_text = see_all_elem.text
+
+    a_elem = driver.find_element_by_link_text(see_all_ex_text)
+    see_all_url = a_elem.get_attribute('href')
+    print(f'Found the following URL: {see_all_url}')
+    return see_all_url
+
+
+def extract_company(driver):
+    """
+    Extract the company name from a search result page.
+    :param driver: The selenium webdriver.
+    :return: The company string, None if something is wrong.
+    """
+    company_xpath = '//li[@class="search-s-facet search-s-facet--facetCurrentCompany inline-block ' \
+                    'search-s-facet--is-closed ember-view"]/form/button/div/div/h3 '
+    company_elem = get_by_xpath_or_none(driver, company_xpath)
+    return company_elem.text if company_elem is not None else None
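As a quick, hedged illustration, these module functions can also be driven outside Scrapy, mirroring the flow in `linkedin/tests/companies.py`; this sketch assumes the dockerized selenium service is reachable as `localhost` and that `conf.py` holds valid credentials:

```python
from linkedin.spiders.selenium import init_chromium, login
from linkedin.spiders.companies import extracts_see_all_url

driver = init_chromium('localhost')  # hostname of the selenium container
login(driver)                        # logs in with the credentials from conf.py

driver.get('https://www.linkedin.com/company/google')
see_all_url = extracts_see_all_url(driver)
# Per the test suite, the URL should start with:
# https://www.linkedin.com/search/results/people/?facetCurrentCompany=
print(see_all_url)

driver.close()
```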
+ """ + print('Searching for the "See all * employees on LinkedIn" btn') + see_all_xpath = f'//*[starts-with(text(),"{SEE_ALL_PLACEHOLDER}")]' + see_all_elem = get_by_xpath(driver, see_all_xpath) + see_all_ex_text = see_all_elem.text + + a_elem = driver.find_element_by_link_text(see_all_ex_text) + see_all_url = a_elem.get_attribute('href') + print(f'Found the following URL: {see_all_url}') + return see_all_url + + +def extract_company(driver): + """ + Extract company name from a search result page. + :param driver: The selenium webdriver. + :return: The company string, None if something wrong. + """ + company_xpath = '//li[@class="search-s-facet search-s-facet--facetCurrentCompany inline-block ' \ + 'search-s-facet--is-closed ember-view"]/form/button/div/div/h3 ' + company_elem = get_by_xpath_or_none(driver, company_xpath) + return company_elem.text if company_elem is not None else None diff --git a/linkedin/spiders/linkedin.py b/linkedin/spiders/random.py similarity index 55% rename from linkedin/spiders/linkedin.py rename to linkedin/spiders/random.py index 7888b78..a0bcb10 100644 --- a/linkedin/spiders/linkedin.py +++ b/linkedin/spiders/random.py @@ -2,38 +2,17 @@ from scrapy.spiders import CrawlSpider from scrapy.spiders import Rule +from linkedin.spiders.search import extract_contact_info from linkedin.spiders.selenium import SeleniumSpiderMixin, get_by_xpath_or_none +""" +Variable holding where to search for first profiles to scrape. +""" NETWORK_URL = 'https://www.linkedin.com/mynetwork/invite-connect/connections/' -def extract_contact_info(api_client, contact_public_id): - contact_profile = api_client.get_profile(contact_public_id) - contact_info = api_client.get_profile_contact_info(contact_public_id) - - lastName = contact_profile['lastName'] - firstName = contact_profile['firstName'] - - email_address = contact_info['email_address'] - phone_numbers = contact_info['phone_numbers'] - - education = contact_profile['education'] - experience = contact_profile['experience'] - - current_work = [exp for exp in experience if exp.get('timePeriod', {}).get('endDate') is None] - - return dict(lastName=lastName, - firstName=firstName, - email_address=email_address, - phone_numbers=phone_numbers, - education=education, - experience=experience, - current_work=current_work, - ) - - -class Linkedin(SeleniumSpiderMixin, CrawlSpider): - name = "linkedin" +class RandomSpider(SeleniumSpiderMixin, CrawlSpider): + name = "random" start_urls = [ NETWORK_URL, ] diff --git a/linkedin/spiders/search.py b/linkedin/spiders/search.py new file mode 100644 index 0000000..590be5b --- /dev/null +++ b/linkedin/spiders/search.py @@ -0,0 +1,129 @@ +import time + +from scrapy import Spider +from scrapy import Request + +from linkedin.spiders.selenium import get_by_xpath_or_none, SeleniumSpiderMixin + +""" +Number of seconds to wait checking if the page is a "No Result" type. +""" +NO_RESULT_WAIT_TIMEOUT = 3 + + +class SearchSpider(SeleniumSpiderMixin, Spider): + """ + Abstract class for for generic search on linkedin. 
+ """ + + def parser_search_results_page(self, response): + print('Now parsing search result page') + + no_result_found_xpath = '//*[text()="No results found."]' + + no_result_response = get_by_xpath_or_none(driver=self.driver, + xpath=no_result_found_xpath, + wait_timeout=NO_RESULT_WAIT_TIMEOUT, + logs=False) + + if no_result_response is not None: + print('"No results" message shown, stop crawling this company') + return + else: + # company extraction temporary disabled + # company = extract_company(self.driver) + # print(f'Company:{company}') + + users = extracts_linkedin_users(self.driver, + #company=company, + api_client=self.api_client) + for user in users: + yield user + + + # incrementing the index at the end of the url + url = response.request.url + next_url_split = url.split('=') + index = int(next_url_split[-1]) + next_url = '='.join(next_url_split[:-1]) + '=' + str(index + 1) + + max_page = response.meta.get('max_page', None) + if max_page is not None: + if index >= max_page: + return + + yield Request(url=next_url, + callback=self.parser_search_results_page, + meta={'max_page': max_page}, + dont_filter=True, + ) + +###################### +# Module's functions: +###################### +def extracts_linkedin_users(driver, api_client, company=None): + """ + Gets from a page containing a list of users, all the users. + For instance: https://www.linkedin.com/search/results/people/?facetCurrentCompany=[%22221027%22] + :param driver: The webdriver, logged in, and located in the page which lists users. + :return: Iterator on LinkedinUser. + """ + + for i in range(1, 11): + print(f'loading {i}th user') + + last_result_xpath = f'//li[{i}]/*/div[@class="search-result__wrapper"]' + + result = get_by_xpath_or_none(driver, last_result_xpath) + if result is not None: + link_elem = get_by_xpath_or_none(result, './/*[@class="search-result__result-link ember-view"]') + link = link_elem.get_attribute('href') if link_elem is not None else None + + name_elem = get_by_xpath_or_none(result, './/*[@class="name actor-name"]') + name = name_elem.text if name_elem is not None else None + + title_elem = get_by_xpath_or_none(result, './/p') + title = title_elem.text if name_elem is not None else None + + # extract_profile_id_from_url + profile_id = link.split('/')[-2] + user = extract_contact_info(api_client, profile_id) + + yield user + + if link_elem is not None: + driver.execute_script("arguments[0].scrollIntoView();", link_elem) + elif name_elem is not None: + driver.execute_script("arguments[0].scrollIntoView();", name_elem) + elif title_elem is not None: + driver.execute_script("arguments[0].scrollIntoView();", title_elem) + else: + print("Was not possible to scroll") + + time.sleep(0.7) + + +def extract_contact_info(api_client, contact_public_id): + contact_profile = api_client.get_profile(contact_public_id) + contact_info = api_client.get_profile_contact_info(contact_public_id) + + lastName = contact_profile['lastName'] + firstName = contact_profile['firstName'] + + email_address = contact_info['email_address'] + phone_numbers = contact_info['phone_numbers'] + + education = contact_profile['education'] + experience = contact_profile['experience'] + + current_work = [exp for exp in experience if exp.get('timePeriod', {}).get('endDate') is None] + + return dict(lastName=lastName, + firstName=firstName, + email_address=email_address, + phone_numbers=phone_numbers, + education=education, + experience=experience, + current_work=current_work, + ) + diff --git a/linkedin/spiders/selenium.py 
diff --git a/linkedin/spiders/selenium.py b/linkedin/spiders/selenium.py
index 778276d..2fcc71b 100644
--- a/linkedin/spiders/selenium.py
+++ b/linkedin/spiders/selenium.py
@@ -1,5 +1,3 @@
-import time
-
 from selenium import webdriver
 from selenium.common.exceptions import TimeoutException, StaleElementReferenceException, WebDriverException
 from selenium.webdriver import DesiredCapabilities
@@ -21,11 +19,27 @@
 from the scrapy controller to the selenium instance."""
 SELENIUM_HOSTNAME = 'selenium'
 
-"""
-Placeholder used to recognize the 'See all 27,569 employees on LinkedIn' clickable button,
-in the 'https://www.linkedin.com/company/*/' style pages.
-"""
-SEE_ALL_PLACEHOLDER = 'See all'
+
+class SeleniumSpiderMixin:
+    """
+    Abstract spider based on Selenium.
+    It takes care of the login on LinkedIn.
+    """
+    def __init__(self, selenium_hostname=None, **kwargs):
+        if selenium_hostname is None:
+            selenium_hostname = SELENIUM_HOSTNAME
+
+        self.driver = init_chromium(selenium_hostname)
+
+        # initializing also the API client
+        self.api_client = CustomLinkedinClient(EMAIL, PASSWORD, debug=True)
+
+        login(self.driver)
+
+        super().__init__(**kwargs)
+
+    def closed(self, reason):
+        self.driver.close()
 
 
 def wait_invisibility_xpath(driver, xpath, wait_timeout=None):
@@ -105,92 +119,3 @@ def login(driver):
 
     print('Searching for the submit')
     get_by_xpath(driver, '//*[@type="submit"]').click()
-
-
-def extracts_see_all_url(driver):
-    """
-    Retrieve from the the Company front page the url of the page containing the list of its employees.
-    :param driver: The already opened (and logged in) webdriver, already located to the company's front page.
-    :return: String: The "See All" URL.
-    """
-    print('Searching for the "See all * employees on LinkedIn" btn')
-    see_all_xpath = f'//*[starts-with(text(),"{SEE_ALL_PLACEHOLDER}")]'
-    see_all_elem = get_by_xpath(driver, see_all_xpath)
-    see_all_ex_text = see_all_elem.text
-
-    a_elem = driver.find_element_by_link_text(see_all_ex_text)
-    see_all_url = a_elem.get_attribute('href')
-    print(f'Found the following URL: {see_all_url}')
-    return see_all_url
-
-
-def extracts_linkedin_users(driver, company, api_client):
-    """
-    Gets from a page containing a list of users, all the users.
-    For instance: https://www.linkedin.com/search/results/people/?facetCurrentCompany=[%22221027%22]
-    :param driver: The webdriver, logged in, and located in the page which lists users.
-    :return: Iterator on LinkedinUser.
-    """
-    from linkedin.spiders.linkedin import extract_contact_info
-
-    for i in range(1, 11):
-        print(f'loading {i}th user')
-
-        last_result_xpath = f'//li[{i}]/*/div[@class="search-result__wrapper"]'
-
-        result = get_by_xpath_or_none(driver, last_result_xpath)
-        if result is not None:
-            link_elem = get_by_xpath_or_none(result, './/*[@class="search-result__result-link ember-view"]')
-            link = link_elem.get_attribute('href') if link_elem is not None else None
-
-            name_elem = get_by_xpath_or_none(result, './/*[@class="name actor-name"]')
-            name = name_elem.text if name_elem is not None else None
-
-            title_elem = get_by_xpath_or_none(result, './/p')
-            title = title_elem.text if name_elem is not None else None
-
-            # extract_profile_id_from_url
-            profile_id = link.split('/')[-2]
-            user = extract_contact_info(api_client, profile_id)
-
-            yield user
-
-            if link_elem is not None:
-                driver.execute_script("arguments[0].scrollIntoView();", link_elem)
-            elif name_elem is not None:
-                driver.execute_script("arguments[0].scrollIntoView();", name_elem)
-            elif title_elem is not None:
-                driver.execute_script("arguments[0].scrollIntoView();", title_elem)
-            else:
-                print("Was not possible to scroll")
-
-        time.sleep(0.7)
-
-
-def extract_company(driver):
-    """
-    Extract company name from a search result page.
-    :param driver: The selenium webdriver.
-    :return: The company string, None if something wrong.
-    """
-    company_xpath = '//li[@class="search-s-facet search-s-facet--facetCurrentCompany inline-block ' \
-                    'search-s-facet--is-closed ember-view"]/form/button/div/div/h3 '
-    company_elem = get_by_xpath_or_none(driver, company_xpath)
-    return company_elem.text if company_elem is not None else None
-
-
-class SeleniumSpiderMixin:
-    def __init__(self, selenium_hostname=None, **kwargs):
-        if selenium_hostname is None:
-            selenium_hostname = SELENIUM_HOSTNAME
-
-        self.driver = init_chromium(selenium_hostname)
-
-        # initializing also API's client
-        self.api_client = CustomLinkedinClient(EMAIL, PASSWORD, debug=True)
-
-        login(self.driver)
-
-        super().__init__(**kwargs)
-
-    def closed(self, reason):
-        self.driver.close()
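With login and driver setup now hosted by the mixin, a new Selenium-backed spider only needs a name, its start URLs, and a `wait_page_completion` hook for the middleware to call. A hypothetical sketch following the same pattern as `RandomSpider` (the spider name and start URL here are made up):

```python
from scrapy import Spider

from linkedin.spiders.selenium import SeleniumSpiderMixin


class ExampleSpider(SeleniumSpiderMixin, Spider):
    """Hypothetical spider, shown only to illustrate the mixin contract."""
    name = 'example'
    start_urls = ['https://www.linkedin.com/feed/']

    def wait_page_completion(self, driver):
        # Customize how this spider waits for page completion; blank by default.
        pass
```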
- """ - from linkedin.spiders.linkedin import extract_contact_info - - for i in range(1, 11): - print(f'loading {i}th user') - - last_result_xpath = f'//li[{i}]/*/div[@class="search-result__wrapper"]' - - result = get_by_xpath_or_none(driver, last_result_xpath) - if result is not None: - link_elem = get_by_xpath_or_none(result, './/*[@class="search-result__result-link ember-view"]') - link = link_elem.get_attribute('href') if link_elem is not None else None - - name_elem = get_by_xpath_or_none(result, './/*[@class="name actor-name"]') - name = name_elem.text if name_elem is not None else None - - title_elem = get_by_xpath_or_none(result, './/p') - title = title_elem.text if name_elem is not None else None - - # extract_profile_id_from_url - profile_id = link.split('/')[-2] - user = extract_contact_info(api_client, profile_id) - - yield user - - if link_elem is not None: - driver.execute_script("arguments[0].scrollIntoView();", link_elem) - elif name_elem is not None: - driver.execute_script("arguments[0].scrollIntoView();", name_elem) - elif title_elem is not None: - driver.execute_script("arguments[0].scrollIntoView();", title_elem) - else: - print("Was not possible to scroll") - - time.sleep(0.7) - - -def extract_company(driver): - """ - Extract company name from a search result page. - :param driver: The selenium webdriver. - :return: The company string, None if something wrong. - """ - company_xpath = '//li[@class="search-s-facet search-s-facet--facetCurrentCompany inline-block ' \ - 'search-s-facet--is-closed ember-view"]/form/button/div/div/h3 ' - company_elem = get_by_xpath_or_none(driver, company_xpath) - return company_elem.text if company_elem is not None else None - - -class SeleniumSpiderMixin: - def __init__(self, selenium_hostname=None, **kwargs): - if selenium_hostname is None: - selenium_hostname = SELENIUM_HOSTNAME - - self.driver = init_chromium(selenium_hostname) - - # initializing also API's client - self.api_client = CustomLinkedinClient(EMAIL, PASSWORD, debug=True) - - login(self.driver) - - super().__init__(**kwargs) - - def closed(self, reason): - self.driver.close() diff --git a/linkedin/tests/companies.py b/linkedin/tests/companies.py index 0cd2843..7151b9d 100644 --- a/linkedin/tests/companies.py +++ b/linkedin/tests/companies.py @@ -1,6 +1,8 @@ import pytest -from linkedin.spiders.selenium import login, extracts_see_all_url, extracts_linkedin_users, extract_company +from linkedin.spiders.selenium import login +from linkedin.spiders.search import extracts_linkedin_users +from linkedin.spiders.companies import extracts_see_all_url, extract_company from linkedin.tests.selenium import SeleniumTest GOOGLE = 'https://www.linkedin.com/company/google' @@ -12,13 +14,14 @@ def setUp(self): super().setUp() login(self.driver) - #@pytest.mark.skip + @pytest.mark.skip def test_extracts_see_all_url(self): self.driver.get(GOOGLE) url = extracts_see_all_url(self.driver) print(url) assert url.startswith("https://www.linkedin.com/search/results/people/?facetCurrentCompany=") + @pytest.mark.skip def test_extracts_linkedin_users(self): self.driver.get(GOOGLE_USERS_LIST) diff --git a/names.txt b/names.txt new file mode 100644 index 0000000..a119588 --- /dev/null +++ b/names.txt @@ -0,0 +1 @@ +Mario Rossi