From 8bd099508142c534dfb5a7d76a7cf86e1e2f722c Mon Sep 17 00:00:00 2001
From: Amr Yasser <74487632+Odd-Baron@users.noreply.github.com>
Date: Sun, 29 Aug 2021 07:08:35 +0200
Subject: [PATCH] Add files via upload

---
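Note: an illustrative, offline sketch of the paging logic used in wuzzuf.py below; the
total ad count and per-page size are invented for illustration only, and the min() call
stands in for the counter the script parses from the results bar:

    URL = 'https://wuzzuf.net/search/jobs/?a=hpb&q=python&start=0'
    ad_number = 45                  # stand-in for the total parsed from the <strong> tag
    page, per_page = 0, 15          # per-page count is assumed only for this sketch
    while True:
        page_url = f'{URL[:-1]}{page}'                  # ...&start=0, ...&start=1, ...
        page += 1
        reached_ad = min(page * per_page, ad_number)    # stand-in for the css-8neukt counter
        print(page_url, reached_ad)
        if reached_ad >= ad_number:                     # same exit condition as the script
            break
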
 wuzzuf.py | 114 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 114 insertions(+)
 create mode 100644 wuzzuf.py

diff --git a/wuzzuf.py b/wuzzuf.py
new file mode 100644
index 0000000..a05d49e
--- /dev/null
+++ b/wuzzuf.py
@@ -0,0 +1,114 @@
+
+# Imports
+import requests
+import csv
+# BeautifulSoup to scrape the basic search result pages
+from bs4 import BeautifulSoup
+# Selenium to scrape the individual job posts and extract the fields populated by scripts
+from selenium import webdriver
+from selenium.webdriver.common.keys import Keys
+# itertools for transposing the lists of values
+import itertools
+
+# The first page of the search results for the search term "python"
+URL = 'https://wuzzuf.net/search/jobs/?a=hpb&q=python&start=0'
+
+
+# Scraping the search result pages
+# Empty container lists, to be filled with the extracted data
+page = 0
+companies = []
+titles = []
+locations = []
+skills = []
+contracts = []
+links = []
+
+# Iterate through the search pages with a while loop until there are no more results;
+# each pass extracts one page and appends its data to the container lists
+while True:
+    # Search result URL, updated to match the page number
+    page_url = f'{URL[:-1]}{page}'
+
+    # Use requests and BeautifulSoup to extract the content of the page
+    result = requests.get(page_url)
+    result_content = result.content
+    soup = BeautifulSoup(result_content, 'lxml')
+
+    # The total number of ads found for the search
+    ad_number = int(soup.find('strong').text)
+
+    # Extract the info from the page soup;
+    # each call returns a list of HTML tags with their text
+    job_titles = soup.find_all("a", {'class': 'css-o171kl', 'rel': 'noreferrer'})
+    location_names = soup.find_all('span', {'class': 'css-5wys0k'})
+    contract_type = soup.find_all('div', {'class': 'css-1lh32fc'})
+    job_skills = soup.find_all('div', {'class': 'css-y4udm8'})
+    company_names = soup.find_all("a", {'class': 'css-17s97q8'})
+
+    page_len = len(company_names)
+
+    # Populate the container lists with the text extracted from the page;
+    # each iteration fills the data for one job post
+    for item in range(page_len):
+        companies.append(company_names[item].text)
+        links.append(job_titles[item].attrs['href'])
+        titles.append(job_titles[item].text)
+        locations.append(location_names[item].text)
+        skills.append(job_skills[item].text)
+        contracts.append(contract_type[item].text)
+
+    # Move on to the next search result page
+    page += 1
+    reached_ad = int(soup.find('li', class_='css-8neukt').text.split()[3])
+    print(f'Page: {page}, ads: {reached_ad}')
+
+    # Exit the loop only when the number of scraped posts reaches the total number of search results
+    if reached_ad >= ad_number:
+        print('get out')
+        break
+
+
+# Selenium for extracting the salary
+# The salary field is inside each post page and is filled by JavaScript,
+# so it can't be extracted with BeautifulSoup (the field has no value in the raw HTML);
+# we have to visit each post page and extract the salary with Selenium,
+# using Options & headless mode to stop Selenium from opening a Firefox window
+options = webdriver.FirefoxOptions()
+options.headless = True
+driver = webdriver.Firefox(options=options)
+
+
+# Salary
+# Since extracting with Selenium is a slow process, try just the first few items as a proof that it works
+salaries = []
+
+for i, link in enumerate(links):
+    driver.get(link)
+    # Extract the salary field using Selenium's find method
+    salaries.append(driver.find_elements_by_class_name('css-4xky9y')[3].text)
+
+    # Break to extract only a sample of 4 links;
+    # remove the break when extracting all the data
+    if i == 3:
+        break
+driver.quit()
+
+
+# Creating the CSV file
+# Header and row value lists
+col_names = ['Title', 'Company', 'Location', 'Contract', 'Skills', 'Link', 'Salaries']
+col_values = [titles, companies, locations, contracts, skills, links, salaries]
+
+# Transpose the values list (the initial col_values has one field type per row,
+# e.g. 1st row all titles, 2nd row all companies);
+# we want each output row to hold one value of each type, e.g. 1st row: 1 title, 1 company, 1 location ...
+# itertools.zip_longest transposes and handles the missing values at the same time
+col_values2 = list(map(list, itertools.zip_longest(*col_values, fillvalue='not fetched')))
+
+# Write the values to a CSV file
+with open('jobs.csv', 'w', newline='', encoding='UTF8') as f:
+    wr = csv.writer(f)
+    wr.writerow(col_names)
+    wr.writerows(col_values2)
+print('file created')
\ No newline at end of file
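
A minimal sketch of what the itertools.zip_longest transpose at the end of wuzzuf.py
produces, using made-up values and a hypothetical output file name (the real script feeds
it the scraped lists, where only a sample of salaries is fetched):

    import csv
    import itertools

    # Column-wise data of unequal lengths, as in the script when only some salaries are fetched
    col_names = ['Title', 'Company', 'Salaries']
    col_values = [['Dev', 'Tester'], ['Acme', 'Globex'], ['1000 EGP']]

    # Transpose to row-wise records; missing cells become 'not fetched'
    rows = list(map(list, itertools.zip_longest(*col_values, fillvalue='not fetched')))
    # rows == [['Dev', 'Acme', '1000 EGP'], ['Tester', 'Globex', 'not fetched']]

    with open('jobs_sample.csv', 'w', newline='', encoding='UTF8') as f:
        wr = csv.writer(f)
        wr.writerow(col_names)
        wr.writerows(rows)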