import requests
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
import re
from selenium.webdriver.common.by import By
# Scrape Wuzzuf job postings for a given job title.
def Wuzzuf_scrapping(job_type, job_num, driver):
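    """Scrape up to `job_num` postings matching `job_type` from wuzzuf.net.

    Listing pages are fetched with requests/BeautifulSoup; the per-job detail
    sections (categories, skills, description, requirements) are read through
    the supplied Selenium `driver`. Returns a DataFrame of at most `job_num`
    rows and also writes it to WUZZUF_scrapping.xlsx.
    """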
    # Encode the search query: spaces in the job title become %20
    # (works for titles of any number of words)
    query = '%20'.join(job_type.split())
    link1 = 'https://wuzzuf.net/search/jobs/?a=navbl&q=' + query
title = []
location = []
country = []
job_description = []
Job_Requirements =[]
company_name = []
links = []
    Job_type = []
Career_Level = []
company_logo = []
Job_Categories = []
Skills_And_Tools = []
Experience_Needed =[]
post_time = []
Title = []
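    # Wuzzuf shows roughly 15 postings per results page, so visit
    # ceil(job_num / 15) pages to collect at least job_num cards.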
pages_num = np.ceil(job_num/15)
    for page in range(int(pages_num)):
        link_new = link1 + '&start=' + str(page)
try:
data = requests.get(link_new)
data.raise_for_status() # Check for HTTP errors
soup = BeautifulSoup(data.content, 'html.parser')
Title = soup.find_all('h2', {'class': 'css-m604qf'})
        except requests.exceptions.RequestException as e:
            print(f"Request failed for {link_new}: {e}")
            continue  # skip this results page and move on
        # Parse every job card found on this results page
        for x in range(len(Title)):
            # Job title: keep the part before any "(" or "-" qualifier
            t = re.split(r'\(|-', Title[x].find('a').text)
            title.append(t[0].strip())
            # Location text looks like "District, City, Country"; the last token is the country
            loc = re.split(',', soup.find_all('span', {'class': 'css-5wys0k'})[x].text)
            location.append(', '.join(part.strip() for part in loc[:-1]))
            country.append(loc[-1].strip())
#print("---",Title[x].find('a').attrs['href'])
links.append(Title[x].find('a').attrs['href'])
m = " ".join(re.findall("[a-zA-Z\d+]+", (soup.find_all('div' , {'class': 'css-d7j1kk'})[x].find('a').text)))
company_name.append(m)
            # Job type spans (e.g. Full Time, Remote); a card may list several
            c = soup.find_all('div', {'class': 'css-1lh32fc'})[x].find_all('span')
            if len(c) == 1:
                Job_type.append(c[0].text)
            else:
                Job_type.append([span.text for span in c])
            # Career level and years of experience share one details row
            n = soup.find_all('div', {'class': 'css-y4udm8'})[x].find_all('div')[1].find_all(['a', 'span'])
            Career_Level.append(n[0].text)
            # Keep digit/range tokens such as "3-5" or "2+" from the experience text
            yy = re.findall('[0-9-+]*', n[1].text.replace('·', ' ').strip())
            y1 = "".join(part for part in yy if part)
            if y1 != "":
                Experience_Needed.append(y1)
            else:
                Experience_Needed.append("Not Specified")
            # Relative posting time, e.g. "5 days ago"; avoid shadowing the time module
            posted = soup.find_all('div', {'class': 'css-d7j1kk'})[x].find('div')
            post_time.append(posted.text)
            # Fetch the company logo from the og:image meta tag on the detail page
            try:
                data1 = requests.get(links[x])
                data1.raise_for_status()  # raise on HTTP errors
                soup1 = BeautifulSoup(data1.content, 'html.parser')
                logo_meta = soup1.find('meta', {'property': 'og:image'})
                if logo_meta:
                    company_logo.append(logo_meta['content'])
                else:
                    print("No logo meta tag found.")
                    company_logo.append("No logo found")
            except requests.exceptions.RequestException as e:
                print(f"Failed to fetch company logo: {e}")
                company_logo.append("Error fetching logo")
            # Optional: sleep a few seconds here to throttle requests to the site
            # The detail sections (Job_Categories, Skills_And_Tools, job_description,
            # Job_Requirements) are read with the caller-supplied Selenium driver.
driver.get(links[x])
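            # NOTE: the XPaths below are position-based and tied to Wuzzuf's current
            # page layout; they will need updating if the site's markup changes.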
Job_Categories.append(driver.find_element(By.XPATH ,'//*[@id="app"]/div/main/section[2]/div[5]').text.split("\n")[1:])
Skills_And_Tools.append(driver.find_element(By.XPATH ,'//*[@id="app"]/div/main/section[2]/div[6]').text.split("\n")[1:])
job_description.append(driver.find_element(By.XPATH ,'//*[@id="app"]/div/main/section[3]').text.split("\n")[1:])
            # The requirements section is one or more headed blocks separated by
            # blank lines; avoid naming the result `all`, which shadows the built-in
            sections = driver.find_elements(By.XPATH, '//*[@id="app"]/div/main/section[4]/div')
            dict_other = {}
            new = sections[0].text.split("\n\n")
            if len(new) != 1:
                # Several headed blocks: map each heading to its list of lines
                for block in new:
                    lines = block.split('\n')
                    dict_other[lines[0]] = [k.replace("\u202f", " ") for k in lines[1:]]
                Job_Requirements.append(dict_other)
            else:
                # Single block: store it as a flat list of lines
                nn = new[0].replace("\u202f", " ")
                Job_Requirements.append(nn.split('\n'))
    # Combine everything into one DataFrame
    df = pd.DataFrame({'Title': title, 'Location': location, 'country': country, 'URLs': links,
                       'Company_Name': company_name, 'Career_Level': Career_Level,
                       'post_time': post_time, 'Experience_Needed': Experience_Needed,
                       'Company_Logo': company_logo, 'Job_Categories': Job_Categories,
                       'Skills_And_Tools': Skills_And_Tools, 'job_description': job_description,
                       'Job_Requirements': Job_Requirements})
    df[:job_num].to_excel('WUZZUF_scrapping.xlsx', index=False)  # modern pandas' to_excel accepts no `encoding` argument
return df[:job_num]
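
# A minimal usage sketch, assuming Chrome and a matching chromedriver are
# installed; the job title "Data Analyst" and the count of 30 are illustrative.
if __name__ == "__main__":
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options

    options = Options()
    options.add_argument("--headless=new")  # run Chrome without a visible window
    driver = webdriver.Chrome(options=options)
    try:
        jobs = Wuzzuf_scrapping("Data Analyst", 30, driver)
        print(jobs[['Title', 'Company_Name', 'country']].head())
    finally:
        driver.quit()  # always release the browser session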