Skip to content

Commit

Permalink
company spider now uses linkedin api client
Browse files Browse the repository at this point in the history
  • Loading branch information
eracle committed Jan 23, 2020
1 parent f99302d commit 5c469ff
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 11 deletions.
4 changes: 1 addition & 3 deletions linkedin/middlewares.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,10 @@ def process_request(self, request, spider):
profile_xpath = "//*[@id='nav-settings__dropdown-trigger']/img"
get_by_xpath(driver, profile_xpath)

# waiting links to other users are shown so the crawl can continue
get_by_xpath_or_none(driver, '//*/span/span/span[1]', wait_timeout=3)
spider.wait_page_completion(driver=driver)

print('SeleniumMiddleware - retrieving body')
body = to_bytes(driver.page_source) # body must be of type bytes

return HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request)


13 changes: 11 additions & 2 deletions linkedin/spiders/companies.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from scrapy.spiders import Spider

from linkedin.spiders.selenium import SeleniumSpiderMixin, extracts_see_all_url, extracts_linkedin_users, \
get_by_xpath_or_none, wait_invisibility_xpath, extract_company
get_by_xpath_or_none, extract_company

"""
Number of seconds to wait checking if the page is a "No Result" type.
Expand All @@ -25,6 +25,15 @@ class CompaniesSpider(SeleniumSpiderMixin, Spider):
with open(URLS_FILE, "rt") as f:
start_urls = [url.strip() for url in f]

def wait_page_completion(self, driver):
    """
    Hook that lets each spider define how to wait for a page to finish loading.

    This spider does not need any extra waiting, so the hook is a no-op.

    :param driver: the Selenium webdriver positioned on the page just loaded.
    :return: None
    """
    pass

def parse(self, response):
url = extracts_see_all_url(self.driver) + f'&page={FIRST_PAGE_INDEX}'
return Request(url=url,
Expand All @@ -49,7 +58,7 @@ def parser_search_results_page(self, response):
company = extract_company(self.driver)
print(f'Company:{company}')

users = extracts_linkedin_users(self.driver, company=company)
users = extracts_linkedin_users(self.driver, company=company, api_client=self.api_client)
for user in users:
yield user

Expand Down
12 changes: 11 additions & 1 deletion linkedin/spiders/linkedin.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from scrapy.spiders import CrawlSpider
from scrapy.spiders import Rule

from linkedin.spiders.selenium import SeleniumSpiderMixin
from linkedin.spiders.selenium import SeleniumSpiderMixin, get_by_xpath_or_none

NETWORK_URL = 'https://www.linkedin.com/mynetwork/invite-connect/connections/'

Expand Down Expand Up @@ -46,6 +46,16 @@ class Linkedin(SeleniumSpiderMixin, CrawlSpider):
),
)

def wait_page_completion(self, driver):
    """
    Hook that lets each spider define how to wait for a page to finish loading.

    For this spider, the page is considered ready once links to other users
    are rendered, so the crawl can continue from them. If the element does
    not appear within the timeout, the wait gives up silently
    (get_by_xpath_or_none returns None rather than raising).

    :param driver: the Selenium webdriver positioned on the page just loaded.
    :return: None
    """
    # waiting links to other users are shown so the crawl can continue
    get_by_xpath_or_none(driver, '//*/span/span/span[1]', wait_timeout=3)

def extract_profile_id_from_url(self, response):
# extract_profile_id_from_url
profile_id = response.url.split('/')[-2]
Expand Down
9 changes: 5 additions & 4 deletions linkedin/spiders/selenium.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@

from conf import EMAIL, PASSWORD
from linkedin.integration import CustomLinkedinClient
from linkedin.items import LinkedinUser

"""
number of seconds used to wait the web page's loading.
Expand Down Expand Up @@ -124,13 +123,14 @@ def extracts_see_all_url(driver):
return see_all_url


def extracts_linkedin_users(driver, company):
def extracts_linkedin_users(driver, company, api_client):
"""
Gets from a page containing a list of users, all the users.
For instance: https://www.linkedin.com/search/results/people/?facetCurrentCompany=[%22221027%22]
:param driver: The webdriver, logged in, and located in the page which lists users.
:return: Iterator on LinkedinUser.
"""
from linkedin.spiders.linkedin import extract_contact_info

for i in range(1, 11):
print(f'loading {i}th user')
Expand All @@ -140,7 +140,6 @@ def extracts_linkedin_users(driver, company):
result = get_by_xpath_or_none(driver, last_result_xpath)
if result is not None:


link_elem = get_by_xpath_or_none(result, './/*[@class="search-result__result-link ember-view"]')
link = link_elem.get_attribute('href') if link_elem is not None else None

Expand All @@ -150,7 +149,9 @@ def extracts_linkedin_users(driver, company):
title_elem = get_by_xpath_or_none(result, './/p')
title = title_elem.text if name_elem is not None else None

user = LinkedinUser(name=name, title=title, company=company, link=link)
# extract_profile_id_from_url
profile_id = link.split('/')[-2]
user = extract_contact_info(api_client, profile_id)

yield user

Expand Down
2 changes: 1 addition & 1 deletion urls.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
https://www.linkedin.com/company/google
https://www.linkedin.com/company/twitter

0 comments on commit 5c469ff

Please sign in to comment.