Commit

implemented stop criteria on search spider
eracle committed Mar 22, 2020
1 parent 65432a8 commit 9fef516
Showing 2 changed files with 31 additions and 16 deletions.
18 changes: 15 additions & 3 deletions linkedin/spiders/by_name.py
@@ -7,6 +7,16 @@
NAMES_FILE = 'names.txt'


def name_not_matching_stop_criteria(user, name):
    """Return True (i.e. stop the search) when the searched name is not part of the user's full name."""
    name_set = set(name.lower().split())

    last_name = user['lastName']
    first_name = user['firstName']
    user_name_set = set(last_name.lower().split() + first_name.lower().split())

    return not name_set.issubset(user_name_set)


class ByNameSpider(SearchSpider):
"""
Spider that searches people by name.
@@ -16,8 +26,7 @@ class ByNameSpider(SearchSpider):

start_urls = []

with open(NAMES_FILE, "rt") as f:
names = [name for name in f]
names = filter(None, (line.rstrip() for line in open(NAMES_FILE, "rt")))

def start_requests(self):
for name in self.names:
@@ -27,7 +36,10 @@ def start_requests(self):
yield Request(url=url,
callback=super().parser_search_results_page,
dont_filter=True,
meta={'max_page': 1},
meta={'max_page': 1,
'stop_criteria': name_not_matching_stop_criteria,
'stop_criteria_args': name,
},
)

def wait_page_completion(self, driver):
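A quick illustration of how the new stop criterion behaves (a hypothetical snippet, not part of the commit; it assumes LinkedIn user dicts carry 'firstName' and 'lastName' keys, as the function above does). Search results are presumably relevance-ordered, so the first profile that no longer matches the searched name signals that the useful results are exhausted:

user = {'firstName': 'Ada', 'lastName': 'Lovelace'}

name_not_matching_stop_criteria(user, 'Ada Lovelace')  # False: full match, keep crawling
name_not_matching_stop_criteria(user, 'ada')           # False: every searched token matches
name_not_matching_stop_criteria(user, 'Alan Turing')   # True: no match, stop the search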
29 changes: 16 additions & 13 deletions linkedin/spiders/search.py
@@ -17,44 +17,47 @@ class SearchSpider(SeleniumSpiderMixin, Spider):
"""

def parser_search_results_page(self, response):
print('Now parsing search result page')
# get the optional callback arguments:

no_result_found_xpath = '//*[text()="No results found."]'
# maximum number for pagination
max_page = response.meta.get('max_page', None)

# stop_criteria: returns True if the search must stop
stop_criteria = response.meta.get('stop_criteria', None)
stop_criteria_args = response.meta.get('stop_criteria_args', None)

# Now parsing search result page
no_result_found_xpath = '//*[text()="No results found."]'
no_result_response = get_by_xpath_or_none(driver=self.driver,
xpath=no_result_found_xpath,
wait_timeout=NO_RESULT_WAIT_TIMEOUT,
logs=False)

if no_result_response is not None:
print('"No results" message shown, stop crawling this company')
# no results message shown: stop crawling this company
return
else:
# company extraction temporary disabled
# company = extract_company(self.driver)
# print(f'Company:{company}')

users = extracts_linkedin_users(self.driver,
#company=company,
api_client=self.api_client)
users = extracts_linkedin_users(self.driver, api_client=self.api_client)
for user in users:
if stop_criteria is not None:
if stop_criteria(user, stop_criteria_args):
# if the stop criteria is met, stop crawling, including any next pages
return
yield user


# increment the pagination index at the end of the URL
url = response.request.url
next_url_split = url.split('=')
index = int(next_url_split[-1])
next_url = '='.join(next_url_split[:-1]) + '=' + str(index + 1)

max_page = response.meta.get('max_page', None)
if max_page is not None:
if index >= max_page:
return

yield Request(url=next_url,
callback=self.parser_search_results_page,
meta={'max_page': max_page},
meta=response.meta,
dont_filter=True,
)
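The pagination step above bumps the trailing page index of the current search URL and re-yields the same callback. A minimal standalone sketch of that URL arithmetic (next_page_url is a hypothetical helper, assuming the page number is the last '='-separated value, as in the URLs this spider builds):

def next_page_url(url):
    # split off the trailing page index, increment it, and reassemble
    head, _, index = url.rpartition('=')
    return head + '=' + str(int(index) + 1)

# e.g. (made-up URL):
# next_page_url('https://www.linkedin.com/search/results/people/?keywords=ada&page=3')
# -> 'https://www.linkedin.com/search/results/people/?keywords=ada&page=4'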

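Because the criterion travels through the request meta and is forwarded unchanged to the next page (meta=response.meta), other spiders can plug in their own predicate without touching SearchSpider. A hypothetical sketch (the predicate, the 'companyName' key, and the company string are all made up for illustration):

def not_at_company_stop_criteria(user, company_name):
    # assumes the scraped user dict carries a 'companyName' key (not verified)
    return user.get('companyName', '').lower() != company_name.lower()

# inside some spider's start_requests():
yield Request(url=url,
              callback=super().parser_search_results_page,
              dont_filter=True,
              meta={'max_page': 10,
                    'stop_criteria': not_at_company_stop_criteria,
                    'stop_criteria_args': 'ACME Corp',
                    },
              )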
