wuzzuf.py
# Imports
import requests
import csv
# BeautifulSoup to scrape the basic landing page
from bs4 import BeautifulSoup
# Selenium to scrape the individual job posts and extract the fields populated by scripts
from selenium import webdriver
# By locators for Selenium element lookups
from selenium.webdriver.common.by import By
# itertools for handling the lists of values and transposing
import itertools
# The first page of the search results for the search input "python"
URL = 'https://wuzzuf.net/search/jobs/?a=hpb&q=python&start=0'
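# The 'start' query parameter appears to be a zero-based page index (assumed from how
# the loop below uses it): stripping the trailing '0' and appending the loop counter
# yields each result page in turn, e.g. (illustrative):
#   page 0 -> https://wuzzuf.net/search/jobs/?a=hpb&q=python&start=0
#   page 1 -> https://wuzzuf.net/search/jobs/?a=hpb&q=python&start=1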
# Scraping the search result pages
# Empty container lists that will be filled with the extracted data
page = 0
companies = []
titles = []
locations = []
skills = []
contracts = []
links = []
# Use a while loop to iterate through the search pages until there are no more results
# each iteration extracts one page's data and appends it to the container lists
while True:
    # Search result URL, updated to match the current page number
    page_url = f'{URL[:-1]}{page}'
    # Use requests and BeautifulSoup to parse the content of the page
    result = requests.get(page_url)
    result_content = result.content
    soup = BeautifulSoup(result_content, 'lxml')
    # The total number of ads found in the search results
    ad_number = int(soup.find('strong').text)
    # Extract the info from the page soup
    # result: lists of HTML tags with their text
    job_titles = soup.find_all('a', {'class': 'css-o171kl', 'rel': 'noreferrer'})
    location_names = soup.find_all('span', {'class': 'css-5wys0k'})
    contract_type = soup.find_all('div', {'class': 'css-1lh32fc'})
    job_skills = soup.find_all('div', {'class': 'css-y4udm8'})
    company_names = soup.find_all('a', {'class': 'css-17s97q8'})
    page_len = len(company_names)
    # Populate the container lists with the text extracted from the page
    # each pass of the for loop fills in the data for one job post
    for item in range(page_len):
        companies.append(company_names[item].text)
        links.append(job_titles[item].attrs['href'])
        titles.append(job_titles[item].text)
        locations.append(location_names[item].text)
        skills.append(job_skills[item].text)
        contracts.append(contract_type[item].text)
    # Move on to the next search result page
    page += 1
    reached_ad = int(soup.find('li', class_='css-8neukt').text.split()[3])
    print(f'Page: {page}, ads: {reached_ad}')
    # Exit the while loop only when the number of scraped posts reaches the total number of search results
    if reached_ad >= ad_number:
        print('get out')
        break
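# Note on the loop-exit logic above: the 'css-8neukt' element is the results counter on
# the page, and .split()[3] assumes its text splits on whitespace so that token index 3
# is the number of the last ad shown so far (e.g. a counter reading 'Jobs 1 - 15 of 364'
# would yield 15). That format is an assumption; check the live page if parsing breaks.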
# Selenium for extracting the salary
# The salary field is inside each post's page and is filled in using JavaScript,
# so it can't be extracted with BeautifulSoup: the field has no value in the raw HTML.
# We have to visit each post page and extract the salary using Selenium.
# Run headless to stop Selenium from opening a Firefox window
options = webdriver.FirefoxOptions()
options.add_argument('--headless')
driver = webdriver.Firefox(options=options)
# Salary
# Since extracting with Selenium is a slow process, extract just the first few items as a proof it works
salaries = []
for i, link in enumerate(links):
    driver.get(link)
    # Extract the salary field using Selenium's find_elements
    salaries.append(driver.find_elements(By.CLASS_NAME, 'css-4xky9y')[3].text)
    # Break to extract only a sample of 4 links;
    # remove the break when extracting all the data
    if i == 3:
        break
driver.quit()
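# A more defensive variant of the salary lookup (a sketch, not wired in above): wait
# explicitly for the JavaScript-rendered elements instead of assuming they are present
# as soon as the page loads. The names below come from Selenium's standard support module.
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# fields = WebDriverWait(driver, 10).until(
#     EC.presence_of_all_elements_located((By.CLASS_NAME, 'css-4xky9y')))
# salaries.append(fields[3].text)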
# Creating the CSV file
# Header and row value lists
col_names = ['Title', 'Company', 'Location', 'Contract', 'Skills', 'Link', 'Salaries']
col_values = [titles, companies, locations, contracts, skills, links, salaries]
# Transpose the values list (the initial col_values has each row holding one data type)
# e.g. in col_values the 1st row is all titles, the 2nd row is all companies
# we want one value of each type per row, e.g. 1st row: 1 title, 1 company, 1 location ...
# itertools.zip_longest transposes and handles missing values at the same time
col_values2 = list(map(list, itertools.zip_longest(*col_values, fillvalue='not fetched')))
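# A minimal sketch of the transpose with made-up values (not real scraped data):
#   col_values  = [['t1', 't2', 't3'], ['c1', 'c2', 'c3'], ['s1']]
#   col_values2 = [['t1', 'c1', 's1'],
#                  ['t2', 'c2', 'not fetched'],
#                  ['t3', 'c3', 'not fetched']]
# Each inner list of col_values2 is one CSV row; short columns (here the sampled
# salaries) are padded with the fillvalue.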
# Write the list of values to a CSV file
with open('jobs.csv', 'w', newline='', encoding='UTF8') as f:
    wr = csv.writer(f)
    wr.writerow(col_names)
    wr.writerows(col_values2)
print('file created')
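# A quick way to sanity-check the output (a sketch using only the csv module imported above):
# with open('jobs.csv', encoding='UTF8') as f:
#     for row in csv.DictReader(f):
#         print(row['Title'], '|', row['Salaries'])
#         break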