diff --git a/linkedin/middlewares.py b/linkedin/middlewares.py index 3c4ea03..6ec4980 100644 --- a/linkedin/middlewares.py +++ b/linkedin/middlewares.py @@ -19,12 +19,10 @@ def process_request(self, request, spider): profile_xpath = "//*[@id='nav-settings__dropdown-trigger']/img" get_by_xpath(driver, profile_xpath) - # waiting links to other users are shown so the crawl can continue - get_by_xpath_or_none(driver, '//*/span/span/span[1]', wait_timeout=3) + spider.wait_page_completion(driver=driver) print('SeleniumMiddleware - retrieving body') body = to_bytes(driver.page_source) # body must be of type bytes return HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request) - diff --git a/linkedin/spiders/companies.py b/linkedin/spiders/companies.py index a7fa83f..ec68c89 100644 --- a/linkedin/spiders/companies.py +++ b/linkedin/spiders/companies.py @@ -3,7 +3,7 @@ from scrapy.spiders import Spider from linkedin.spiders.selenium import SeleniumSpiderMixin, extracts_see_all_url, extracts_linkedin_users, \ - get_by_xpath_or_none, wait_invisibility_xpath, extract_company + get_by_xpath_or_none, extract_company """ Number of seconds to wait checking if the page is a "No Result" type. @@ -25,6 +25,15 @@ class CompaniesSpider(SeleniumSpiderMixin, Spider): with open(URLS_FILE, "rt") as f: start_urls = [url.strip() for url in f] + def wait_page_completion(self, driver): + """ + Abstract function, used to customize how the specific spider has to wait for page completion.
+ Blank by default + :param driver: + :return: + """ + pass + def parse(self, response): url = extracts_see_all_url(self.driver) + f'&page={FIRST_PAGE_INDEX}' return Request(url=url, @@ -49,7 +58,7 @@ def parser_search_results_page(self, response): company = extract_company(self.driver) print(f'Company:{company}') - users = extracts_linkedin_users(self.driver, company=company) + users = extracts_linkedin_users(self.driver, company=company, api_client=self.api_client) for user in users: yield user diff --git a/linkedin/spiders/linkedin.py b/linkedin/spiders/linkedin.py index cbe3dd6..7888b78 100644 --- a/linkedin/spiders/linkedin.py +++ b/linkedin/spiders/linkedin.py @@ -2,7 +2,7 @@ from scrapy.spiders import CrawlSpider from scrapy.spiders import Rule -from linkedin.spiders.selenium import SeleniumSpiderMixin +from linkedin.spiders.selenium import SeleniumSpiderMixin, get_by_xpath_or_none NETWORK_URL = 'https://www.linkedin.com/mynetwork/invite-connect/connections/' @@ -46,6 +46,16 @@ class Linkedin(SeleniumSpiderMixin, CrawlSpider): ), ) + def wait_page_completion(self, driver): + """ + Abstract function, used to customize how the specific spider has to wait for page completion. + Blank by default + :param driver: + :return: + """ + # waiting links to other users are shown so the crawl can continue + get_by_xpath_or_none(driver, '//*/span/span/span[1]', wait_timeout=3) + def extract_profile_id_from_url(self, response): # extract_profile_id_from_url profile_id = response.url.split('/')[-2] diff --git a/linkedin/spiders/selenium.py b/linkedin/spiders/selenium.py index f271be4..d85c366 100644 --- a/linkedin/spiders/selenium.py +++ b/linkedin/spiders/selenium.py @@ -9,7 +9,6 @@ from conf import EMAIL, PASSWORD from linkedin.integration import CustomLinkedinClient -from linkedin.items import LinkedinUser """ number of seconds used to wait the web page's loading.
@@ -124,13 +123,14 @@ def extracts_see_all_url(driver): return see_all_url -def extracts_linkedin_users(driver, company): +def extracts_linkedin_users(driver, company, api_client): """ Gets from a page containing a list of users, all the users. For instance: https://www.linkedin.com/search/results/people/?facetCurrentCompany=[%22221027%22] :param driver: The webdriver, logged in, and located in the page which lists users. :return: Iterator on LinkedinUser. """ + from linkedin.spiders.linkedin import extract_contact_info for i in range(1, 11): print(f'loading {i}th user') @@ -140,7 +140,6 @@ def extracts_linkedin_users(driver, company): result = get_by_xpath_or_none(driver, last_result_xpath) if result is not None: - link_elem = get_by_xpath_or_none(result, './/*[@class="search-result__result-link ember-view"]') link = link_elem.get_attribute('href') if link_elem is not None else None @@ -150,7 +149,9 @@ def extracts_linkedin_users(driver, company): title_elem = get_by_xpath_or_none(result, './/p') title = title_elem.text if name_elem is not None else None - user = LinkedinUser(name=name, title=title, company=company, link=link) + # extract_profile_id_from_url + profile_id = link.split('/')[-2] + user = extract_contact_info(api_client, profile_id) yield user diff --git a/urls.txt b/urls.txt index 1471369..8b2c61e 100644 --- a/urls.txt +++ b/urls.txt @@ -1 +1 @@ -https://www.linkedin.com/company/google \ No newline at end of file +https://www.linkedin.com/company/twitter \ No newline at end of file