Showing 1 changed file with 114 additions and 0 deletions.

# Imports
import requests
import csv
# BeautifulSoup to scrape the basic landing page
from bs4 import BeautifulSoup
# Selenium to scrape the individual job posts and extract the fields populated by scripts
from selenium import webdriver
from selenium.webdriver.common.by import By
# itertools for handling the list of values and transposing
import itertools

# The first page of the search results for the search input "python"
URL = 'https://wuzzuf.net/search/jobs/?a=hpb&q=python&start=0'
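
# Illustration of the pagination trick used below (a hypothetical check, not part
# of the scraper): the trailing `start` value selects the result page, so stripping
# the last character of URL and appending a page number yields each page's URL
assert f'{URL[:-1]}2' == 'https://wuzzuf.net/search/jobs/?a=hpb&q=python&start=2'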

# Scraping the search result page
# Page counter and empty container lists, to be filled with the extracted data
page = 0
companies = []
titles = []
locations = []
skills = []
contracts = []
links = []

# Use a while loop to iterate through the search pages until there are no more results
# Each iteration extracts one page of data and fills the container lists
while True:
    # Search result URL, with the current page number swapped in
    page_url = f'{URL[:-1]}{page}'

    # Use requests and BeautifulSoup to extract the content of the page
    result = requests.get(page_url)
    result_content = result.content
    soup = BeautifulSoup(result_content, 'lxml')

    # The total number of ads found in the search result
    ad_number = int(soup.find('strong').text)
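
    # Illustration (assumed page markup, not verified here): the first <strong>
    # tag holds the total result count, e.g. <strong>250</strong> gives ad_number == 250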

    # Extract the info from the page soup
    # Each find_all returns a list of HTML tags with their text
    job_titles = soup.find_all('a', {'class': 'css-o171kl', 'rel': 'noreferrer'})
    location_names = soup.find_all('span', {'class': 'css-5wys0k'})
    contract_type = soup.find_all('div', {'class': 'css-1lh32fc'})
    job_skills = soup.find_all('div', {'class': 'css-y4udm8'})
    company_names = soup.find_all('a', {'class': 'css-17s97q8'})

    page_len = len(company_names)

    # Populate the container lists with the text extracted from the page
    # Each loop iteration fills the data for one job post
    for item in range(page_len):
        companies.append(company_names[item].text)
        links.append(job_titles[item].attrs['href'])
        titles.append(job_titles[item].text)
        locations.append(location_names[item].text)
        skills.append(job_skills[item].text)
        contracts.append(contract_type[item].text)

    # Move on to the next search result page
    page += 1
    reached_ad = int(soup.find('li', class_='css-8neukt').text.split()[3])
    print(f'Page: {page}, ads: {reached_ad}')

    # Exit the while loop only when the number of scraped posts reaches the total number of search results
    if reached_ad >= ad_number:
        print('Reached the end of the search results')
        break
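
# Illustrative sanity check (an assumed addition): the scraping loop appends one
# entry per job post to every container list, so their lengths should match
assert len(companies) == len(titles) == len(locations) == len(skills) == len(contracts) == len(links)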

# Selenium for extracting the salary
# The salary field is inside each post page and is filled by JavaScript,
# so it can't be extracted with BeautifulSoup (the field has no value in the raw HTML)
# We have to visit each post page and extract the salary using Selenium
# Run Firefox headless so Selenium doesn't open a browser window (Selenium 4 style)
options = webdriver.FirefoxOptions()
options.add_argument('--headless')
driver = webdriver.Firefox(options=options)

# Salary
# Since extracting with Selenium is a slow process, try extracting just the first few items as proof it works
salaries = []

for i, link in enumerate(links):
    driver.get(link)
    # Extract the salary field using Selenium's find_elements
    salaries.append(driver.find_elements(By.CLASS_NAME, 'css-4xky9y')[3].text)

    # Break to extract only a sample of 4 links
    # Remove the break when extracting all the data
    if i == 3:
        break
driver.quit()
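
# Note: with the sample break above, salaries holds only 4 entries while the other
# lists hold one entry per post; the zip_longest below pads the gap with 'not fetched'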

# Creating the CSV file
# Header and row values lists
col_names = ['Title', 'Company', 'Location', 'Contract', 'Skills', 'Link', 'Salaries']
col_values = [titles, companies, locations, contracts, skills, links, salaries]

# Transpose the values list (the initial col_values has one value type per row,
# e.g. 1st row all titles, 2nd row all companies)
# We want one value type in each cell of a row, e.g. 1st row: 1 title, 1 company, 1 location, ...
# itertools.zip_longest transposes and handles missing values at the same time
col_values2 = list(map(list, itertools.zip_longest(*col_values, fillvalue='not fetched')))
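
# A minimal toy illustration of the transpose above (hypothetical values):
demo = list(map(list, itertools.zip_longest(*[['t1', 't2'], ['c1']], fillvalue='not fetched')))
# demo == [['t1', 'c1'], ['t2', 'not fetched']]  -> one job post per row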

# Write the list of values to a CSV file
with open('jobs.csv', 'w', newline='', encoding='UTF8') as f:
    wr = csv.writer(f)
    wr.writerow(col_names)
    wr.writerows(col_values2)
print('file created')
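
# Optional sanity check (illustrative, not in the original script): read the file
# back and print the header plus the first data row
with open('jobs.csv', encoding='UTF8') as f:
    for row in itertools.islice(csv.reader(f), 2):
        print(row)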