Showing 1 changed file with 114 additions and 0 deletions.

# Imports
import requests
import csv
# BeautifulSoup to scrape the basic landing page
from bs4 import BeautifulSoup
# Selenium to scrape the individual job posts and extract the fields populated by scripts
from selenium import webdriver
from selenium.webdriver.common.by import By
# itertools for handling the list of values and transposing
import itertools

# The first page of the search results for the search input "python"
URL = 'https://wuzzuf.net/search/jobs/?a=hpb&q=python&start=0'
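
# Illustration of the pagination trick used below (a hypothetical check, not part
# of the scraper): the trailing `start` value selects the result page, so stripping
# the last character of URL and appending a page number yields each page's URL
assert f'{URL[:-1]}2' == 'https://wuzzuf.net/search/jobs/?a=hpb&q=python&start=2'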

# Scraping the search result page
# Page counter and empty container lists, to be filled with the extracted data
page = 0
companies = []
titles = []
locations = []
skills = []
contracts = []
links = []

# Use a while loop to iterate through the search pages until there are no more results
# Each iteration extracts one page of data and fills the container lists
while True:
    # Search result URL, with the current page number swapped in
    page_url = f'{URL[:-1]}{page}'

    # Use requests and BeautifulSoup to extract the content of the page
    result = requests.get(page_url)
    result_content = result.content
    soup = BeautifulSoup(result_content, 'lxml')

    # The total number of ads found in the search result
    ad_number = int(soup.find('strong').text)
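
    # Illustration (assumed page markup, not verified here): the first <strong>
    # tag holds the total result count, e.g. <strong>250</strong> gives ad_number == 250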

    # Extract the info from the page soup
    # Each find_all returns a list of HTML tags with their text
    job_titles = soup.find_all('a', {'class': 'css-o171kl', 'rel': 'noreferrer'})
    location_names = soup.find_all('span', {'class': 'css-5wys0k'})
    contract_type = soup.find_all('div', {'class': 'css-1lh32fc'})
    job_skills = soup.find_all('div', {'class': 'css-y4udm8'})
    company_names = soup.find_all('a', {'class': 'css-17s97q8'})

    page_len = len(company_names)

    # Populate the container lists with the text extracted from the page
    # Each loop iteration fills the data for one job post
    for item in range(page_len):
        companies.append(company_names[item].text)
        links.append(job_titles[item].attrs['href'])
        titles.append(job_titles[item].text)
        locations.append(location_names[item].text)
        skills.append(job_skills[item].text)
        contracts.append(contract_type[item].text)

    # Move on to the next search result page
    page += 1
    reached_ad = int(soup.find('li', class_='css-8neukt').text.split()[3])
    print(f'Page: {page}, ads: {reached_ad}')

    # Exit the while loop only when the number of scraped posts reaches the total number of search results
    if reached_ad >= ad_number:
        print('Reached the end of the search results')
        break
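
# Illustrative sanity check (an assumed addition): the scraping loop appends one
# entry per job post to every container list, so their lengths should match
assert len(companies) == len(titles) == len(locations) == len(skills) == len(contracts) == len(links)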

# Selenium for extracting the salary
# The salary field is inside each post page and is filled by JavaScript,
# so it can't be extracted with BeautifulSoup (the field has no value in the raw HTML)
# We have to visit each post page and extract the salary using Selenium
# Run Firefox headless so Selenium doesn't open a browser window (Selenium 4 style)
options = webdriver.FirefoxOptions()
options.add_argument('--headless')
driver = webdriver.Firefox(options=options)

# Salary
# Since extracting with Selenium is a slow process, try extracting just the first few items as proof it works
salaries = []

for i, link in enumerate(links):
    driver.get(link)
    # Extract the salary field using Selenium's find_elements
    salaries.append(driver.find_elements(By.CLASS_NAME, 'css-4xky9y')[3].text)

    # Break to extract only a sample of 4 links
    # Remove the break when extracting all the data
    if i == 3:
        break
driver.quit()
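
# Note: with the sample break above, salaries holds only 4 entries while the other
# lists hold one entry per post; the zip_longest below pads the gap with 'not fetched'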

# Creating the CSV file
# Header and row values lists
col_names = ['Title', 'Company', 'Location', 'Contract', 'Skills', 'Link', 'Salaries']
col_values = [titles, companies, locations, contracts, skills, links, salaries]

# Transpose the values list (the initial col_values has one value type per row,
# e.g. 1st row all titles, 2nd row all companies)
# We want one value type in each cell of a row, e.g. 1st row: 1 title, 1 company, 1 location, ...
# itertools.zip_longest transposes and handles missing values at the same time
col_values2 = list(map(list, itertools.zip_longest(*col_values, fillvalue='not fetched')))
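
# A minimal toy illustration of the transpose above (hypothetical values):
demo = list(map(list, itertools.zip_longest(*[['t1', 't2'], ['c1']], fillvalue='not fetched')))
# demo == [['t1', 'c1'], ['t2', 'not fetched']]  -> one job post per row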

# Write the list of values to a CSV file
with open('jobs.csv', 'w', newline='', encoding='UTF8') as f:
    wr = csv.writer(f)
    wr.writerow(col_names)
    wr.writerows(col_values2)
print('file created')
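
# Optional sanity check (illustrative, not in the original script): read the file
# back and print the header plus the first data row
with open('jobs.csv', encoding='UTF8') as f:
    for row in itertools.islice(csv.reader(f), 2):
        print(row)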