Added by-name search spider
eracle committed Mar 21, 2020
1 parent dacbec9 commit f2ab33e
Showing 10 changed files with 260 additions and 177 deletions.
6 changes: 6 additions & 0 deletions Makefile
@@ -7,5 +7,11 @@ view:
companies:
scrapy crawl companies -a selenium_hostname=localhost -o users.csv

random:
scrapy crawl random -a selenium_hostname=localhost -o users.csv

byname:
scrapy crawl byname -a selenium_hostname=localhost -o users.csv

tests:
pytest linkedin/tests/*
12 changes: 8 additions & 4 deletions README.md
@@ -24,7 +24,7 @@ Needed:
- python3.6;
- virtualenvs;

###### 0. Preparations;
###### 0. Prepare your environment:

Install docker from the official website [https://www.docker.com/](https://www.docker.com/)

@@ -36,10 +36,10 @@ sudo apt-get update
sudo apt-get install vinagre
```

###### 1. Set your Linkedin login and password;
###### 1. Set up Linkedin login and password:
Copy `conf_template.py` to `conf.py` and fill in the quotes with your credentials.
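For reference, a filled-in `conf.py` might look like the minimal sketch below; the actual variable names are defined by `conf_template.py`, so `EMAIL` and `PASSWORD` here are assumptions, not confirmed names:

```python
# conf.py -- hypothetical sketch; check conf_template.py for the real variable names.
EMAIL = "your-linkedin-login@example.com"  # the LinkedIn account the spiders log in with
PASSWORD = "your-password"                 # keep this file out of version control
```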

###### 2. Run and build containers with docker-compose;
###### 2. Run and build containers with docker-compose:
This runs only the linkedin spider, not the companies spider.
Open your terminal, move to the project folder and type:

@@ -90,7 +90,11 @@ For more details have a look at the Makefile (here is used to shortcut and not t
```
or
```bash
scrapy crawl linkedin -a selenium_hostname=localhost -o output.csv
scrapy crawl random -a selenium_hostname=localhost -o output.csv
```
or
```bash
scrapy crawl byname -a selenium_hostname=localhost -o output.csv
```
## Legal

4 changes: 2 additions & 2 deletions linkedin/middlewares.py
@@ -16,8 +16,8 @@ def process_request(self, request, spider):
# request.meta['driver'] = self.driver # to access driver from response

print('waiting for page loading')
profile_xpath = "//*[@id='nav-settings__dropdown-trigger']/img"
get_by_xpath(driver, profile_xpath)
# profile_xpath = "//*[@id='nav-settings__dropdown-trigger']/img"
# get_by_xpath_or_none(driver, profile_xpath)

spider.wait_page_completion(driver=driver)

40 changes: 40 additions & 0 deletions linkedin/spiders/by_name.py
@@ -0,0 +1,40 @@
import urllib.parse

from scrapy import Request

from linkedin.spiders.search import SearchSpider

NAMES_FILE = 'names.txt'


class ByNameSpider(SearchSpider):
"""
Spider that searches people by name.
"""
name = 'byname'
allowed_domains = ['www.linkedin.com']

start_urls = []

with open(NAMES_FILE, "rt") as f:
names = [name.strip() for name in f if name.strip()]  # drop trailing newlines and blank lines

def start_requests(self):
for name in self.names:
encoded_name = urllib.parse.quote(name.lower())
url = f"https://www.linkedin.com/search/results/people/?origin=GLOBAL_SEARCH_HEADER&keywords={encoded_name}&page=1"

yield Request(url=url,
callback=super().parser_search_results_page,
dont_filter=True,
meta={'max_page': 1},
)

def wait_page_completion(self, driver):
"""
Abstract method, used to customize how the specific spider waits for search page completion.
Blank by default
:param driver:
:return:
"""
pass
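As a quick illustration of what `ByNameSpider` builds for each line of `names.txt`, here is a standalone sketch of the search-URL construction; the sample name is made up:

```python
import urllib.parse

# One raw line as read from names.txt (newline included), as in ByNameSpider.
name = "Jane Doe\n"
encoded_name = urllib.parse.quote(name.strip().lower())  # -> 'jane%20doe'
url = ("https://www.linkedin.com/search/results/people/"
       f"?origin=GLOBAL_SEARCH_HEADER&keywords={encoded_name}&page=1")
print(url)  # the first results page requested for this name
```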
88 changes: 42 additions & 46 deletions linkedin/spiders/companies.py
@@ -1,24 +1,20 @@
# -*- coding: utf-8 -*-
from scrapy import Request
from scrapy.spiders import Spider

from linkedin.spiders.selenium import SeleniumSpiderMixin, extracts_see_all_url, extracts_linkedin_users, \
get_by_xpath_or_none, extract_company
from linkedin.spiders.search import SearchSpider
from linkedin.spiders.selenium import get_by_xpath_or_none, get_by_xpath

"""
Number of seconds to wait checking if the page is a "No Result" type.
"""
NO_RESULT_WAIT_TIMEOUT = 3

URLS_FILE = "urls.txt"

"""
First page to scrape from on the search results list (default to 1).
Placeholder used to recognize the 'See all 27,569 employees on LinkedIn' clickable button,
in the 'https://www.linkedin.com/company/*/' style pages.
"""
FIRST_PAGE_INDEX = 1

URLS_FILE = "urls.txt"
SEE_ALL_PLACEHOLDER = 'See all'


class CompaniesSpider(SeleniumSpiderMixin, Spider):
class CompaniesSpider(SearchSpider):
name = 'companies'
allowed_domains = ['www.linkedin.com']

@@ -27,49 +27,23 @@ class CompaniesSpider(SeleniumSpiderMixin, Spider):

def wait_page_completion(self, driver):
"""
Abstract function, used to customize how the specific spider have to wait for page completion.
Abstract method, used to customize how the specific spider waits for search page completion.
Blank by default
:param driver:
:return:
"""
pass

def parse(self, response):
url = extracts_see_all_url(self.driver) + f'&page={FIRST_PAGE_INDEX}'
url = extracts_see_all_url(self.driver) + f'&page=1'
return Request(url=url,
callback=self.parser_search_results_page,
callback=super().parser_search_results_page,
dont_filter=True,
)

def parser_search_results_page(self, response):
print('Now parsing search result page')

no_result_found_xpath = '//*[text()="No results found."]'

no_result_response = get_by_xpath_or_none(driver=self.driver,
xpath=no_result_found_xpath,
wait_timeout=NO_RESULT_WAIT_TIMEOUT,
logs=False)

if no_result_response is not None:
print('"No results" message shown, stop crawling this company')
return
else:
company = extract_company(self.driver)
print(f'Company:{company}')

users = extracts_linkedin_users(self.driver, company=company, api_client=self.api_client)
for user in users:
yield user

# incrementing the index at the end of the url
url = response.request.url
next_url_split = url.split('=')
index = int(next_url_split[-1])
next_url = '='.join(next_url_split[:-1]) + '=' + str(index + 1)

yield Request(url=next_url,
callback=self.parser_search_results_page,
meta={'company': company},
dont_filter=True,
)
######################
# Module's functions:
######################

def extracts_see_all_url(driver):
"""
Retrieve, from the company front page, the URL of the page containing the list of its employees.
:param driver: The already opened (and logged in) webdriver, already located at the company's front page.
:return: String: The "See All" URL.
"""
print('Searching for the "See all * employees on LinkedIn" btn')
see_all_xpath = f'//*[starts-with(text(),"{SEE_ALL_PLACEHOLDER}")]'
see_all_elem = get_by_xpath(driver, see_all_xpath)
see_all_ex_text = see_all_elem.text

a_elem = driver.find_element_by_link_text(see_all_ex_text)
see_all_url = a_elem.get_attribute('href')
print(f'Found the following URL: {see_all_url}')
return see_all_url


def extract_company(driver):
"""
Extract company name from a search result page.
:param driver: The selenium webdriver.
:return: The company name string, or None if it could not be extracted.
"""
company_xpath = '//li[@class="search-s-facet search-s-facet--facetCurrentCompany inline-block ' \
'search-s-facet--is-closed ember-view"]/form/button/div/div/h3 '
company_elem = get_by_xpath_or_none(driver, company_xpath)
return company_elem.text if company_elem is not None else None
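To show what the `starts-with` XPath in `extracts_see_all_url` matches, here is a self-contained sketch run against a made-up HTML fragment with lxml (which Scrapy installs as a dependency) instead of a live company page:

```python
from lxml import html

# Hypothetical fragment standing in for the company front page.
fragment = html.fromstring(
    '<div>'
    '<a href="/search/results/people/?facetCurrentCompany=%5B%22221027%22%5D">'
    'See all 27,569 employees on LinkedIn</a>'
    '</div>'
)
matches = fragment.xpath('//*[starts-with(text(), "See all")]')
print(matches[0].get('href'))  # the employees-list URL the spider then paginates through
```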
33 changes: 6 additions & 27 deletions linkedin/spiders/linkedin.py → linkedin/spiders/random.py
@@ -2,38 +2,17 @@
from scrapy.spiders import CrawlSpider
from scrapy.spiders import Rule

from linkedin.spiders.search import extract_contact_info
from linkedin.spiders.selenium import SeleniumSpiderMixin, get_by_xpath_or_none

"""
Variable holding where to search for first profiles to scrape.
"""
NETWORK_URL = 'https://www.linkedin.com/mynetwork/invite-connect/connections/'


def extract_contact_info(api_client, contact_public_id):
contact_profile = api_client.get_profile(contact_public_id)
contact_info = api_client.get_profile_contact_info(contact_public_id)

lastName = contact_profile['lastName']
firstName = contact_profile['firstName']

email_address = contact_info['email_address']
phone_numbers = contact_info['phone_numbers']

education = contact_profile['education']
experience = contact_profile['experience']

current_work = [exp for exp in experience if exp.get('timePeriod', {}).get('endDate') is None]

return dict(lastName=lastName,
firstName=firstName,
email_address=email_address,
phone_numbers=phone_numbers,
education=education,
experience=experience,
current_work=current_work,
)


class Linkedin(SeleniumSpiderMixin, CrawlSpider):
name = "linkedin"
class RandomSpider(SeleniumSpiderMixin, CrawlSpider):
name = "random"
start_urls = [
NETWORK_URL,
]
129 changes: 129 additions & 0 deletions linkedin/spiders/search.py
@@ -0,0 +1,129 @@
import time

from scrapy import Spider
from scrapy import Request

from linkedin.spiders.selenium import get_by_xpath_or_none, SeleniumSpiderMixin

"""
Number of seconds to wait checking if the page is a "No Result" type.
"""
NO_RESULT_WAIT_TIMEOUT = 3


class SearchSpider(SeleniumSpiderMixin, Spider):
"""
Abstract class for generic searches on LinkedIn.
"""

def parser_search_results_page(self, response):
print('Now parsing search result page')

no_result_found_xpath = '//*[text()="No results found."]'

no_result_response = get_by_xpath_or_none(driver=self.driver,
xpath=no_result_found_xpath,
wait_timeout=NO_RESULT_WAIT_TIMEOUT,
logs=False)

if no_result_response is not None:
print('"No results" message shown, stop crawling this company')
return
else:
# company extraction temporary disabled
# company = extract_company(self.driver)
# print(f'Company:{company}')

users = extracts_linkedin_users(self.driver,
#company=company,
api_client=self.api_client)
for user in users:
yield user


# incrementing the index at the end of the url
url = response.request.url
next_url_split = url.split('=')
index = int(next_url_split[-1])
next_url = '='.join(next_url_split[:-1]) + '=' + str(index + 1)

max_page = response.meta.get('max_page', None)
if max_page is not None:
if index >= max_page:
return

yield Request(url=next_url,
callback=self.parser_search_results_page,
meta={'max_page': max_page},
dont_filter=True,
)

######################
# Module's functions:
######################
def extracts_linkedin_users(driver, api_client, company=None):
"""
Extracts all the users from a page that contains a list of users.
For instance: https://www.linkedin.com/search/results/people/?facetCurrentCompany=[%22221027%22]
:param driver: The webdriver, logged in and located on the page that lists the users.
:param api_client: The authenticated API client used to fetch each user's profile.
:return: Iterator over the extracted users.
"""

for i in range(1, 11):
print(f'loading user {i}')

last_result_xpath = f'//li[{i}]/*/div[@class="search-result__wrapper"]'

result = get_by_xpath_or_none(driver, last_result_xpath)
if result is not None:
link_elem = get_by_xpath_or_none(result, './/*[@class="search-result__result-link ember-view"]')
link = link_elem.get_attribute('href') if link_elem is not None else None

name_elem = get_by_xpath_or_none(result, './/*[@class="name actor-name"]')
name = name_elem.text if name_elem is not None else None

title_elem = get_by_xpath_or_none(result, './/p')
title = title_elem.text if title_elem is not None else None

# extract_profile_id_from_url
profile_id = link.split('/')[-2]
user = extract_contact_info(api_client, profile_id)

yield user

if link_elem is not None:
driver.execute_script("arguments[0].scrollIntoView();", link_elem)
elif name_elem is not None:
driver.execute_script("arguments[0].scrollIntoView();", name_elem)
elif title_elem is not None:
driver.execute_script("arguments[0].scrollIntoView();", title_elem)
else:
print("Was not possible to scroll")

time.sleep(0.7)


def extract_contact_info(api_client, contact_public_id):
contact_profile = api_client.get_profile(contact_public_id)
contact_info = api_client.get_profile_contact_info(contact_public_id)

lastName = contact_profile['lastName']
firstName = contact_profile['firstName']

email_address = contact_info['email_address']
phone_numbers = contact_info['phone_numbers']

education = contact_profile['education']
experience = contact_profile['experience']

current_work = [exp for exp in experience if exp.get('timePeriod', {}).get('endDate') is None]

return dict(lastName=lastName,
firstName=firstName,
email_address=email_address,
phone_numbers=phone_numbers,
education=education,
experience=experience,
current_work=current_work,
)
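To make the pagination step in `parser_search_results_page` concrete, here is a minimal standalone sketch of how the next-page URL is derived from the current one; the sample URL is made up:

```python
# The spider assumes every search URL ends in "...&page=N" and bumps N by one.
url = "https://www.linkedin.com/search/results/people/?keywords=jane%20doe&page=3"
next_url_split = url.split('=')
index = int(next_url_split[-1])                                  # 3
next_url = '='.join(next_url_split[:-1]) + '=' + str(index + 1)  # ...&page=4
print(next_url)
```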
