Indeed_job_scraper.py
import os
import csv
import requests
from bs4 import BeautifulSoup  # requires beautifulsoup4 (and lxml for the parser used below)

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"
}
# Skill, place of work, and number of result pages to scrape
skill = input('Enter your Skill: ').strip()
place = input('Enter the location: ').strip()
no_of_pages = int(input('Enter the #pages to scrape: '))
# The CSV file is saved in the current working directory, which always
# exists, so no directory needs to be created
main_dir = os.getcwd()
# Name of the CSV file
file_name = skill.title() + '_' + place.title() + '_Jobs.csv'
# Path of the CSV file (os.path.join is portable across operating systems)
file_path = os.path.join(main_dir, file_name)
# Writing to the CSV file; newline='' prevents blank rows on Windows
with open(file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Adding the column names to the CSV file
    writer.writerow(
        ['JOB_NAME', 'COMPANY', 'LOCATION', 'POSTED', 'APPLY_LINK'])
    # Requesting and getting each results page using requests
    print('\nScraping in progress...\n')
    for page in range(no_of_pages):
        # Passing the query as params lets requests URL-encode spaces etc.
        params = {'q': skill, 'l': place, 'start': page * 10}
        response = requests.get('https://www.indeed.co.in/jobs',
                                params=params, headers=headers)
        # Scraping the page
        soup = BeautifulSoup(response.text, 'lxml')
        base_url = 'https://in.indeed.com/viewjob?jk='
        # Each job card is an <a class="tapItem"> inside the jobcards
        # container; these class names are set by Indeed and may change,
        # in which case no jobs will be found
        container = soup.find('div', attrs={'id': 'mosaic-provider-jobcards'})
        jobs = container.find_all('a', class_='tapItem') if container else []
        for job in jobs:
            # The card id looks like 'job_<jk>'; the jk token builds the link
            job_id = job['id'].split('_')[-1]
            job_title = job.find('span', title=True).text.strip()
            company = job.find('span', class_='companyName').text.strip()
            location = job.find('div', class_='companyLocation').text.strip()
            posted = job.find('span', class_='date').text.strip()
            job_link = base_url + job_id
            # Writing one row per job card to the CSV file
            writer.writerow(
                [job_title, company, location.title(), posted, job_link])

print(f'Jobs data written to <{file_name}> successfully.')
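
# ---------------------------------------------------------------------------
# Example session: a sketch with hypothetical inputs; the output file name
# follows from the skill/place title-casing above.
#
#   $ python Indeed_job_scraper.py
#   Enter your Skill: python
#   Enter the location: bangalore
#   Enter the #pages to scrape: 2
#
#   Scraping in progress...
#
#   Jobs data written to <Python_Bangalore_Jobs.csv> successfully.
# ---------------------------------------------------------------------------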