Commit 8bd0995: Add files via upload
Amr-YA authored Aug 29, 2021 (1 parent: c7855b3)
Showing 1 changed file with 114 additions and 0 deletions: wuzzuf.py

# Imports
import requests
import csv
# BeautifulSoup to scrape the basic landing page
from bs4 import BeautifulSoup
# Selenium to scrape the individual job posts and extract the fields populated by scripts
from selenium import webdriver
from selenium.webdriver.common.by import By
# itertools for handling the lists of values and transposing them
import itertools

# The first page of the search results for the query "python"
URL = 'https://wuzzuf.net/search/jobs/?a=hpb&q=python&start=0'
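# The loop below pages through the results by replacing the trailing "0" of
# "start=0" with the current page number, producing URLs like:
#   page 0 -> https://wuzzuf.net/search/jobs/?a=hpb&q=python&start=0
#   page 1 -> https://wuzzuf.net/search/jobs/?a=hpb&q=python&start=1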


# Scraping the search result pages
# Empty container lists; they will be filled with the extracted data
page = 0
companies = []
titles = []
locations = []
skills = []
contracts = []
links = []

# Use a while loop to iterate through the search pages until there are no more results
# Each iteration extracts one page of data and fills the container lists
while True:
    # Search result URL, with the trailing page number swapped in
    page_url = f'{URL[:-1]}{page}'

    # Use requests and BeautifulSoup to extract the content of the page
    result = requests.get(page_url)
    result_content = result.content
    soup = BeautifulSoup(result_content, 'lxml')

    # The total number of ads found by the search
    ad_number = int(soup.find('strong').text)

    # Extract the info from the page soup
    # Each find_all returns a list of HTML tags with their text
    job_titles = soup.find_all('a', {'class': 'css-o171kl', 'rel': 'noreferrer'})
    location_names = soup.find_all('span', {'class': 'css-5wys0k'})
    contract_type = soup.find_all('div', {'class': 'css-1lh32fc'})
    job_skills = soup.find_all('div', {'class': 'css-y4udm8'})
    company_names = soup.find_all('a', {'class': 'css-17s97q8'})

    page_len = len(company_names)

    # Populate the container lists with the text extracted from the page
    # Each loop iteration fills in the data for one job post
    for item in range(page_len):
        companies.append(company_names[item].text)
        links.append(job_titles[item].attrs['href'])
        titles.append(job_titles[item].text)
        locations.append(location_names[item].text)
        skills.append(job_skills[item].text)
        contracts.append(contract_type[item].text)

    # Move on to the next search result page
    page += 1
    # The pager element shows how far the results have reached;
    # its fourth whitespace-separated token is taken as the number of the last ad shown
    reached_ad = int(soup.find('li', class_='css-8neukt').text.split()[3])
    print(f'Page: {page}, ads: {reached_ad}')

    # Exit the loop only when the number of scraped posts reaches the total number of search results
    if reached_ad >= ad_number:
        print('Reached the end of the search results')
        break
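# Optional sanity check: after the loop, every container list should hold
# one entry per scraped job post, aligned by index.
assert len(titles) == len(companies) == len(locations) == len(contracts) == len(skills) == len(links)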


# Selenium for extracting the salary
# The salary field is inside each post's page and is filled in by JavaScript,
# so it can't be extracted with BeautifulSoup: the field has no value in the raw HTML.
# We have to visit each post's page and extract the salary using Selenium.
# Run headless so Selenium doesn't open a Firefox window
# (webdriver.Firefox assumes geckodriver is installed and on the PATH)
options = webdriver.FirefoxOptions()
options.add_argument('--headless')
driver = webdriver.Firefox(options=options)


# Salary
# Since extracting with Selenium is a very slow process, try extracting
# just a sample first as proof that it works
salaries = []

for i, link in enumerate(links):
    driver.get(link)
    # Extract the salary field; the fourth 'css-4xky9y' element on the post page holds the salary
    # find_elements(By.CLASS_NAME, ...) replaces the deprecated find_elements_by_class_name
    salaries.append(driver.find_elements(By.CLASS_NAME, 'css-4xky9y')[3].text)

    # Break to extract only a sample of 4 links;
    # remove the break when extracting all the data
    if i == 3:
        break

driver.quit()
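# A more robust variant (a sketch, assuming the same 'css-4xky9y' class) would
# wait for the JavaScript-rendered elements instead of reading them right after
# driver.get:
#   from selenium.webdriver.support.ui import WebDriverWait
#   from selenium.webdriver.support import expected_conditions as EC
#   elements = WebDriverWait(driver, 10).until(
#       EC.presence_of_all_elements_located((By.CLASS_NAME, 'css-4xky9y')))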


# Creating the CSV file
# Header and row-value lists
col_names = ['Title', 'Company', 'Location', 'Contract', 'Skills', 'Link', 'Salaries']
col_values = [titles, companies, locations, contracts, skills, links, salaries]

# Transpose the values list (the initial col_values holds one field type per row,
# e.g. the 1st row is all titles and the 2nd row all companies).
# We want one value of each type in each cell of a row, e.g. 1st row: 1 title, 1 company, 1 location, ...
# itertools.zip_longest transposes and handles missing values at the same time
col_values2 = list(map(list, itertools.zip_longest(*col_values, fillvalue='not fetched')))
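# A worked illustration with hypothetical values (salaries is shorter than the
# other lists, since only a sample of salaries was fetched):
#   col_values = [['T1', 'T2', 'T3'], ['S1']]
#   zip_longest(*col_values, fillvalue='not fetched')
#     -> ('T1', 'S1'), ('T2', 'not fetched'), ('T3', 'not fetched')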

# Write the list of values to a CSV file
with open('jobs.csv', 'w', newline='', encoding='UTF8') as f:
    wr = csv.writer(f)
    wr.writerow(col_names)
    wr.writerows(col_values2)
print('file created')
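# To spot-check the output, the file can be read back with the csv module
# imported above (hypothetical usage, not part of the scraping flow):
#   with open('jobs.csv', newline='', encoding='UTF8') as f:
#       for row in csv.DictReader(f):
#           print(row['Title'], row['Salaries'])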
