#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 13 23:29:48 2020
@author: takutakamatsu
"""
import pandas as pd
from requests import get
from bs4 import BeautifulSoup
from datetime import date
import dateutil.relativedelta
import time
from pyvirtualdisplay import Display #PythonAnywhere
from selenium import webdriver
import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
import secret
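#note: Display is imported for PythonAnywhere but never started below; if a virtual display were needed
#there, a minimal sketch (assuming pyvirtualdisplay's standard API) would be:
#    display = Display(visible=0, size=(1024, 768))
#    display.start()
#    ...run the Selenium scrapers...
#    display.stop()
#the local 'secret' module is assumed to expose login_email() and login_password(), used for the AngelList login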


def ent_careers():
    '''scrapes all job listings with keyword 'data' from Entertainment Careers and returns dataframe'''
    base_url = 'https://www.entertainmentcareers.net/2/search/search2.asp?zoom_query=data&FULLORPART=-1&JOBSTATE=NY&zoom_page=1&zoom_per_page=100&zoom_cat=-1&zoom_and=0&zoom_sort=0'
    url = 'https://www.entertainmentcareers.net/2/search/search2.asp?zoom_query=data&FULLORPART=-1&JOBSTATE=NY&zoom_page={}&zoom_per_page=100&zoom_cat=-1&zoom_and=0&zoom_sort=0'
    response = get(base_url)
    soup = BeautifulSoup(response.text, "html.parser")
    #find number of pages
    pages = int(soup.find("div", class_="result_pagescount").text.split(' ')[0])
    job_list = []
    #loop through each page
    for page in range(1, pages+1):
        soup = BeautifulSoup(get(url.format(page)).text, "html.parser")
        #every job listing sits in either a 'result_block' or a 'result_altblock' container with the same structure
        for container in soup.find_all('div', class_='result_block') + soup.find_all('div', class_='result_altblock'):
            job_url = 'https://www.entertainmentcareers.net' + container.a['href']
            heading = container.a.text.split(' - (')
            title = str(heading[0].strip())
            company = str(heading[1].split('-')[1].strip())
            location = str(heading[1].split('-')[2].strip())
            job_type = container.find('span', class_='category').text.strip()[1:-1]
            #try to get posted date if visible in description
            try:
                context_list = container.find('div', class_='context').text.split(' ')
                posted_date = ' '.join(context_list[context_list.index('Posted:')+1 : context_list.index('Posted:')+4])
            except:
                posted_date = None
            job_list.append((title, company, location, job_type, job_url, posted_date))
    #return jobs as dataframe
    job_df = pd.DataFrame.from_records(job_list, columns=['Title', 'Company', 'Location', 'Job Type', 'Job URL', 'Posted Date'])
    return job_df


#builtinnyc
def builtinnyc():
    '''scrapes Built In NYC's job listings with keywords "data analytics" and "python", returning dataframe'''
    #search for data analytics + python in dev/engineer field
    base_url = 'https://www.builtinnyc.com/jobs?f%5B0%5D=job-category_developer-engineer-python&f%5B1%5D=job-category_data-analytics&f%5B2%5D=level_entry'
    url = 'https://www.builtinnyc.com/jobs?f%5B0%5D=job-category_developer-engineer-python&f%5B1%5D=job-category_data-analytics&f%5B2%5D=level_entry&page={}'
    soup = BeautifulSoup(get(base_url).text, "html.parser")
    #find number of pages from the pager, comparing numerically rather than lexicographically
    pager_items = soup.find_all('li', class_="pager__item")
    pages = max(int(page.text.split(' ')[-1].strip()) for page in pager_items if page.text.split(' ')[-1].strip().isnumeric())
    job_lists = []
    #loop through pages (pagination starts at 0)
    for page in range(pages):
        soup = BeautifulSoup(get(url.format(page)).text, "html.parser")
        #find all job containers
        for container in soup.find_all("div", class_="views-row"):
            title = container.find("h2", class_="title").text.strip()
            company = container.find("div", class_="company-title").text.strip()
            location = container.find("div", class_="job-location").text.strip()
            description = container.find("div", class_="description").text
            job_url = 'https://builtinnyc.com' + str(container.find("div", class_="wrap-view-page").find('a')['href'])
            job_lists.append((title, company, location, description, job_url))
    #return as dataframe
    job_df = pd.DataFrame.from_records(job_lists, columns=['Title', 'Company', 'Location', 'Description', 'Job URL'])
    return job_df
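#note: the URL-encoded f[0..2] facets in the search URL above restrict the Built In NYC search to the
#developer-engineer (python) and data-analytics categories at entry level, matching the docstring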


def linkedin():
    '''scrapes LinkedIn using Selenium - dynamically scrolling down the page and returning dataframe
    Search term: data analytics
    Location: New York City
    Range: up to 1 month
    '''
    #LinkedIn Jobs, when not signed in, refreshes dynamically using JavaScript, so Selenium is used to scroll through the page as it loads
    #set Chrome to headless -- to work with PythonAnywhere
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    #open headless Chrome
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get('https://www.linkedin.com/jobs/search?keywords=Data%2BAnalyst&location=New%2BYork%2C%2BUnited%2BStates&trk=public_jobs_jobs-search-bar_search-submit&f_PP=102571732&f_E=2%2C3&sortBy=DD&f_TP=1%2C2%2C3%2C4&redirect=false&position=1&pageNum=0')
        print("Opened:", driver.title)
        time.sleep(5)
        #check for pop-ups and dismiss if found
        try:
            driver.find_element_by_xpath('//button[@class="cta-modal__dismiss-btn"]').click()
        except:
            pass
        #display number of job listings to scrape
        num_jobs = driver.find_element_by_xpath('//span[@class="results-context-header__job-count"]').text.replace(',', '').replace('+', '')
        print("Number of results to scrape:", num_jobs)
        jobs = []
        results = 1
        while True:
            #check for 'load more' button
            try:
                driver.find_element_by_xpath('//button[contains(@aria-label, "Load more results")]').click()
                time.sleep(3)
            except:
                pass
            #scroll to the bottom of the page
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(5)
            #find newly loaded containers, skipping those already scraped
            loaded = driver.find_elements_by_xpath('//li[contains(@class,"job-result-card")][position()>=' + str(results) + ']')
            print("loaded:", len(loaded), "jobs")
            #add newly loaded containers to the results counter
            results += len(loaded)
            #loop through each container and append search results to list
            for container in loaded:
                title = container.find_element_by_xpath('.//h3[contains(@class, "result-card__title")]').text
                company = container.find_element_by_xpath('.//h4[contains(@class, "result-card__subtitle")]').text
                location = container.find_element_by_xpath('.//span[@class="job-result-card__location"]').text
                posted_date = container.find_element_by_xpath('.//time[contains(@class,"job-result-card__listdate")]').get_attribute('datetime')
                job_href = container.find_element_by_xpath('.//a[@class="result-card__full-card-link"]').get_attribute('href')
                jobs.append((title, company, location, posted_date, job_href))
            print("total scraped:", results)
            #if results count matches number of results, break loop
            if results >= int(num_jobs):
                break
        #create dataframe with job list
        jobs_df = pd.DataFrame.from_records(jobs, columns=['Title', 'Company', 'Location', 'Posted Date', 'Job URL'])
        #return dataframe
        return jobs_df
    finally:
        driver.quit()
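#note on the search URL: keywords, location and pagination are plain query parameters; the f_E, f_TP and
#sortBy=DD parameters appear to restrict experience level and posting age and sort newest first,
#consistent with the docstring's New York, up-to-1-month search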


def angellist():
    '''scrapes AngelList using Selenium, dynamically scrolling down the page whilst extracting key job insights
    Search term: data analytics
    Returns dataframe of current job listings
    '''
    #as AngelList is dynamic, we'll have to use Selenium
    #set Chrome to headless
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    #open Chrome with Selenium
    driver = webdriver.Chrome(options=chrome_options)
    try:
        #try to log in if not already
        try:
            driver.get('https://angel.co/login')
            driver.find_element_by_id('user_email').send_keys(secret.login_email())
            driver.find_element_by_id('user_password').send_keys(secret.login_password())
            driver.find_element_by_xpath('/html/body/div[1]/div[4]/div/div/div/div/div/div[1]/div[1]/form/div[2]/input').click()
            print("Opened:", driver.title)
            time.sleep(5)
        except:
            pass
        #number of results
        num_results = int(driver.find_element_by_xpath('//*[@id="main"]/div/div[5]/div[2]/div/div[3]/h4').text.split(' ')[0])
        print("Number of results:", num_results)
        jobs = []
        results = 1
        while True:
            #scroll down
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            #let the page load
            time.sleep(3)
            #find newly loaded components, starting from the first position not yet scraped
            loaded_posts = driver.find_elements_by_xpath('//div[@class="component_4d072"][position()>=' + str(results) + ']')
            print("loaded posts:", len(loaded_posts))
            #add length of loaded posts to results to only scrape new data
            results += len(loaded_posts)
            #loop through each loaded post
            for post in loaded_posts:
                title = post.find_element_by_class_name("title_2148e").text
                company = post.find_element_by_xpath('.//a[@class="component_21e4d defaultLink_7325e name_5fa89"]').text
                try:
                    salary_estimate = post.find_element_by_class_name("salaryEstimate_b0878").text
                except:
                    salary_estimate = ''
                try:
                    description = post.find_element_by_xpath('.//span[@class="subheader_755b1"]').text
                except:
                    description = ''
                #grab location, remove salary from the scrape and convert the list to a comma-separated string
                location = list(filter(None, post.find_element_by_xpath('.//span[contains(@class,"__halo_fontSizeMap_size--sm __halo_color_slate--900")]').text.replace(salary_estimate, '').split(' • ')))
                location = ", ".join(location)
                #dates are relative to the current date - subtract the offset to estimate the post date
                try:
                    posted_date_relative = post.find_element_by_xpath('.//span[contains(@class,"tablet_6074f")]').text.split(' ')
                    if posted_date_relative[0].lower() == 'this':
                        posted_date = date.today() - dateutil.relativedelta.relativedelta(days=4)
                    elif posted_date_relative[0].lower() == 'yesterday':
                        posted_date = date.today() - dateutil.relativedelta.relativedelta(days=1)
                    elif posted_date_relative[1].lower() in ('week', 'weeks'):
                        posted_date = date.today() - dateutil.relativedelta.relativedelta(weeks=int(posted_date_relative[0]))
                    elif posted_date_relative[1].lower() in ('month', 'months'):
                        posted_date = date.today() - dateutil.relativedelta.relativedelta(months=int(posted_date_relative[0]))
                    elif posted_date_relative[1].lower() in ('day', 'days'):
                        posted_date = date.today() - dateutil.relativedelta.relativedelta(days=int(posted_date_relative[0]))
                except:
                    posted_date = ''
                #get job url
                job_url = post.find_element_by_xpath('.//div[@class="component_07bb9"]/a').get_attribute('href')
                #append all to list
                jobs.append((title, company, description, location, str(posted_date), salary_estimate, job_url))
            #number of results scraped
            print("total scraped:", results)
            #if results count matches number of results, break loop
            if results >= num_results:
                break
        #convert list to dataframe
        jobs_df = pd.DataFrame.from_records(jobs, columns=['Title', 'Company', 'Description', 'Location', 'Posted Date', 'Salary Estimate', 'Job URL'])
        #return dataframe
        return jobs_df
    finally:
        driver.quit()
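#illustrative example of the relative-date conversion above (values are hypothetical): a post labelled
#"3 weeks ago" scraped on 2020-04-13 would be dated
#    date(2020, 4, 13) - dateutil.relativedelta.relativedelta(weeks=3)  ->  date(2020, 3, 23)
#and a post labelled "this week" is approximated as four days before the scrape date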


def combine_jobs(ent_careers, built_in_nyc, linkedin, angellist):
    '''tags each dataframe with its source website, concatenates them and deduplicates by title and company'''
    #add website name to each dataframe
    ent_careers.loc[:, 'Website'] = 'Entertainment Careers'
    built_in_nyc.loc[:, 'Website'] = 'Built In NYC'
    linkedin.loc[:, 'Website'] = 'LinkedIn'
    angellist.loc[:, 'Website'] = 'Angel List'
    #concatenate all posts
    all_jobs = pd.concat([ent_careers, built_in_nyc, linkedin, angellist])
    all_jobs = all_jobs[['Title', 'Company', 'Location', 'Description', 'Salary Estimate', 'Job Type', 'Posted Date', 'Website', 'Job URL']]
    #deduplicate jobs by title and company name, keeping the first occurrence
    all_jobs.drop_duplicates(['Title', 'Company'], keep='first', inplace=True)
    #return combined dataframe
    return all_jobs


def write_to_gsheets(df):
    '''sorts the combined dataframe by posted date and writes it to the first sheet of the Google Sheet'''
    gc = gspread.service_account('service_account.json')
    sheet = gc.open("Taku-data_job_scraper").sheet1
    #sort by date
    df['Posted Date'] = pd.to_datetime(df['Posted Date'])
    df = df.sort_values('Posted Date', ascending=False)
    #fill null with empty string
    df.fillna('', inplace=True)
    #update google sheets
    set_with_dataframe(sheet, df)
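#note on credentials: gspread.service_account() reads a Google service-account JSON key
#('service_account.json' here), and the target spreadsheet must be shared with that service account's
#client_email for gc.open("Taku-data_job_scraper") to succeed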


def main():
    #initiate global variables
    global ent_careers
    global built_in_nyc
    global linkedin
    global angellist
    #time each scrape
    start_time = time.time()
    ent_careers = ent_careers()
    elapsed = time.time() - start_time
    print("Returned Entertainment Careers; ", len(ent_careers), " results on ", date.today(), ". Took ", format(elapsed, ".2f"), " seconds", sep="")
    start_time = time.time()
    built_in_nyc = builtinnyc()
    elapsed = time.time() - start_time
    print("Returned Built In NYC; ", len(built_in_nyc), " results on ", date.today(), ". Took ", format(elapsed, ".2f"), " seconds", sep="")
    start_time = time.time()
    linkedin = linkedin()
    elapsed = time.time() - start_time
    print("Returned LinkedIn; ", len(linkedin), " results on ", date.today(), ". Took ", format(elapsed, ".2f"), " seconds", sep="")
    start_time = time.time()
    angellist = angellist()
    elapsed = time.time() - start_time
    print("Returned AngelList; ", len(angellist), " results on ", date.today(), ". Took ", format(elapsed, ".2f"), " seconds", sep="")
    #combine all dataframes and deduplicate
    all_jobs = combine_jobs(ent_careers, built_in_nyc, linkedin, angellist)
    #write to spreadsheet
    write_to_gsheets(all_jobs)


if __name__ == '__main__':
    main()
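#usage: the script is intended to run end-to-end, e.g.
#    python3 job_scraper.py
#on a host with Chrome and chromedriver available; on PythonAnywhere it could be run as a scheduled task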