import requests
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
import re
from selenium.webdriver.common.by import By

# wuzzuf function
def Wuzzuf_scrapping(job_type, job_num, driver):
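    """Scrape job postings from Wuzzuf and return them as a DataFrame.

    Parameters:
        job_type (str): search query; assumed to contain at least two words
            (e.g. "data analyst"), since the first two words build the search URL.
        job_num (int): number of job postings to return.
        driver: an already-initialised Selenium WebDriver, used to render the
            individual job pages.

    Returns:
        pandas.DataFrame with one row per posting; the same data is also
        written to 'WUZZUF_scrapping.xlsx'.
    """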
    job1 = job_type.split(" ")[0]
    job2 = job_type.split(" ")[1]
    link1 = 'https://wuzzuf.net/search/jobs/?a=navbl&q=' + job1 + '%20' + job2
    title = []
    location = []
    country = []
    job_description = []
    Job_Requirements = []
    company_name = []
    links = []
    Job_Type = []
    Career_Level = []
    company_logo = []
    Job_Categories = []
    Skills_And_Tools = []
    Experience_Needed = []
    post_time = []
    Title = []
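    # Wuzzuf search results are paginated; each results page holds roughly 15
    # postings (an assumption based on the site's current layout), so the
    # number of pages to visit is estimated from job_num.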
    pages_num = np.ceil(job_num / 15)
    for i in range(int(pages_num)):
        link_new = link1 + '&start=' + str(i)
        try:
            data = requests.get(link_new)
            data.raise_for_status()  # Check for HTTP errors
            soup = BeautifulSoup(data.content, 'html.parser')
            Title = soup.find_all('h2', {'class': 'css-m604qf'})
        except requests.exceptions.RequestException as e:
            # print(f"Request failed: {e}")
            continue  # Skip to the next page if there's an error
        # to get the info about jobs
        for x in range(len(Title)):
            t = re.split(r'\(|-', Title[x].find('a').text)
            title.append(t[0].strip())
            loc = re.split(',', soup.find_all('span', {'class': 'css-5wys0k'})[x].text)
            location.append(', '.join(part.strip() for part in loc[:-1]))
            country.append(loc[-1].strip())
            links.append(Title[x].find('a').attrs['href'])
            m = " ".join(re.findall(r"[a-zA-Z\d+]+", soup.find_all('div', {'class': 'css-d7j1kk'})[x].find('a').text))
            company_name.append(m)
            c = soup.find_all('div', {'class': 'css-1lh32fc'})[x].find_all('span')
            if len(c) == 1:
                Job_Type.append(c[0].text)
            else:
                Job_Type.append([span.text for span in c])
            n = soup.find_all('div', {'class': 'css-y4udm8'})[x].find_all('div')[1].find_all(['a', 'span'])
            Career_Level.append(n[0].text)
            yy = n[1].text.replace('·', ' ').strip()
            yy = re.findall('[0-9-+]*', yy)
            y1 = "".join(part for part in yy if part)
            if y1 != "":
                Experience_Needed.append(y1)
            else:
                Experience_Needed.append("Not Specified")
            time = soup.find_all('div', {'class': 'css-d7j1kk'})[x].find('div')
            post_time.append(time.text)
            # Fetch the company logo
            try:
                data1 = requests.get(links[x])
                data1.raise_for_status()  # Check for HTTP errors
                soup1 = BeautifulSoup(data1.content, 'html.parser')
                logo_meta = soup1.find_all('meta', {'property': "og:image"})
                if logo_meta:
                    company_logo.append(logo_meta[0]['content'])
                else:
                    print("No logo meta tag found.")
                    company_logo.append("No logo found")
            except requests.exceptions.RequestException as e:
                print(f"Failed to fetch company logo: {e}")
                company_logo.append("Error fetching logo")
            # get Job_Categories, Skills_And_Tools, job_description, and Job_Requirements from the job URL
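            # NOTE: the XPaths below are tied to Wuzzuf's current job-page layout
            # and may need updating if the site's markup changes.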
            driver.get(links[x])
            Job_Categories.append(driver.find_element(By.XPATH, '//*[@id="app"]/div/main/section[2]/div[5]').text.split("\n")[1:])
            Skills_And_Tools.append(driver.find_element(By.XPATH, '//*[@id="app"]/div/main/section[2]/div[6]').text.split("\n")[1:])
            job_description.append(driver.find_element(By.XPATH, '//*[@id="app"]/div/main/section[3]').text.split("\n")[1:])
            all_sections = driver.find_elements(By.XPATH, '//*[@id="app"]/div/main/section[4]/div')
            dict_other = {}
            new = all_sections[0].text.split("\n\n")
            if len(new) != 1:
                for block in new:
                    lines = block.split('\n')
                    result = [k.replace("\u202f", " ") for k in lines[1:]]
                    dict_other[lines[0]] = result
                Job_Requirements.append(dict_other)
            else:
                nn = new[0].replace("\u202f", " ")
                Job_Requirements.append(nn.split('\n'))
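            # Depending on the page layout, each Job_Requirements entry is either a
            # dict keyed by section heading or a flat list of requirement lines.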
    # create data frame to combine all together
    df = pd.DataFrame({'Title': title, 'Location': location, 'country': country, 'URLs': links,
                       'Company_Name': company_name, 'Job_Type': Job_Type, 'Career_Level': Career_Level,
                       'post_time': post_time, 'Experience_Needed': Experience_Needed,
                       'Company_Logo': company_logo, 'Job_Categories': Job_Categories,
                       'Skills_And_Tools': Skills_And_Tools, 'job_description': job_description,
                       'Job_Requirements': Job_Requirements})
    df[:job_num].to_excel('WUZZUF_scrapping.xlsx', index=False)
    return df[:job_num]
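
# Example usage (a minimal sketch): assumes Selenium 4+, where webdriver.Chrome()
# resolves chromedriver automatically via Selenium Manager; adjust the options
# and driver creation to match your environment.
if __name__ == "__main__":
    from selenium import webdriver

    options = webdriver.ChromeOptions()
    options.add_argument("--headless=new")  # run Chrome without a visible window
    driver = webdriver.Chrome(options=options)
    try:
        jobs_df = Wuzzuf_scrapping("data analyst", 30, driver)
        print(jobs_df.head())
    finally:
        driver.quit()  # always release the browser, even if scraping fails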