import requests
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
import re
from selenium.webdriver.common.by import By
# Scrape Wuzzuf job postings for a given job title.
def Wuzzuf_scrapping(job_type, job_num, driver):
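    """Scrape up to `job_num` postings matching `job_type` from wuzzuf.net.

    Listing pages are fetched with requests/BeautifulSoup; the per-job detail
    sections (categories, skills, description, requirements) are read through
    the supplied Selenium `driver`. Returns a DataFrame of at most `job_num`
    rows and also writes it to WUZZUF_scrapping.xlsx.
    """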
    # Encode the search query: spaces in the job title become %20
    # (works for titles of any number of words)
    query = '%20'.join(job_type.split())
    link1 = 'https://wuzzuf.net/search/jobs/?a=navbl&q=' + query
title = []
location = []
country = []
job_description = []
Job_Requirements =[]
company_name = []
links = []
    Job_type = []
Career_Level = []
company_logo = []
Job_Categories = []
Skills_And_Tools = []
Experience_Needed =[]
post_time = []
Title = []
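    # Wuzzuf shows roughly 15 postings per results page, so visit
    # ceil(job_num / 15) pages to collect at least job_num cards.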
pages_num = np.ceil(job_num/15)
    for page in range(int(pages_num)):
        link_new = link1 + '&start=' + str(page)
try:
data = requests.get(link_new)
data.raise_for_status() # Check for HTTP errors
soup = BeautifulSoup(data.content, 'html.parser')
Title = soup.find_all('h2', {'class': 'css-m604qf'})
        except requests.exceptions.RequestException as e:
            print(f"Request failed for {link_new}: {e}")
            continue  # skip this results page and move on
        # Parse every job card found on this results page
        for x in range(len(Title)):
            # Job title: keep the part before any "(" or "-" qualifier
            t = re.split(r'\(|-', Title[x].find('a').text)
            title.append(t[0].strip())
            # Location text looks like "District, City, Country"; the last token is the country
            loc = re.split(',', soup.find_all('span', {'class': 'css-5wys0k'})[x].text)
            location.append(', '.join(part.strip() for part in loc[:-1]))
            country.append(loc[-1].strip())
#print("---",Title[x].find('a').attrs['href'])
links.append(Title[x].find('a').attrs['href'])
m = " ".join(re.findall("[a-zA-Z\d+]+", (soup.find_all('div' , {'class': 'css-d7j1kk'})[x].find('a').text)))
company_name.append(m)
            # Job type spans (e.g. Full Time, Remote); a card may list several
            c = soup.find_all('div', {'class': 'css-1lh32fc'})[x].find_all('span')
            if len(c) == 1:
                Job_type.append(c[0].text)
            else:
                Job_type.append([span.text for span in c])
            # Career level and years of experience share one details row
            n = soup.find_all('div', {'class': 'css-y4udm8'})[x].find_all('div')[1].find_all(['a', 'span'])
            Career_Level.append(n[0].text)
            # Keep digit/range tokens such as "3-5" or "2+" from the experience text
            yy = re.findall('[0-9-+]*', n[1].text.replace('·', ' ').strip())
            y1 = "".join(part for part in yy if part)
            if y1 != "":
                Experience_Needed.append(y1)
            else:
                Experience_Needed.append("Not Specified")
            # Relative posting time, e.g. "5 days ago"; avoid shadowing the time module
            posted = soup.find_all('div', {'class': 'css-d7j1kk'})[x].find('div')
            post_time.append(posted.text)
            # Fetch the company logo from the og:image meta tag on the detail page
            try:
                data1 = requests.get(links[x])
                data1.raise_for_status()  # raise on HTTP errors
                soup1 = BeautifulSoup(data1.content, 'html.parser')
                logo_meta = soup1.find('meta', {'property': 'og:image'})
                if logo_meta:
                    company_logo.append(logo_meta['content'])
                else:
                    print("No logo meta tag found.")
                    company_logo.append("No logo found")
            except requests.exceptions.RequestException as e:
                print(f"Failed to fetch company logo: {e}")
                company_logo.append("Error fetching logo")
            # Optional: sleep a few seconds here to throttle requests to the site
            # The detail sections (Job_Categories, Skills_And_Tools, job_description,
            # Job_Requirements) are read with the caller-supplied Selenium driver.
driver.get(links[x])
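            # NOTE: the XPaths below are position-based and tied to Wuzzuf's current
            # page layout; they will need updating if the site's markup changes.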
Job_Categories.append(driver.find_element(By.XPATH ,'//*[@id="app"]/div/main/section[2]/div[5]').text.split("\n")[1:])
Skills_And_Tools.append(driver.find_element(By.XPATH ,'//*[@id="app"]/div/main/section[2]/div[6]').text.split("\n")[1:])
job_description.append(driver.find_element(By.XPATH ,'//*[@id="app"]/div/main/section[3]').text.split("\n")[1:])
            # The requirements section is one or more headed blocks separated by
            # blank lines; avoid naming the result `all`, which shadows the built-in
            sections = driver.find_elements(By.XPATH, '//*[@id="app"]/div/main/section[4]/div')
            dict_other = {}
            new = sections[0].text.split("\n\n")
            if len(new) != 1:
                # Several headed blocks: map each heading to its list of lines
                for block in new:
                    lines = block.split('\n')
                    dict_other[lines[0]] = [k.replace("\u202f", " ") for k in lines[1:]]
                Job_Requirements.append(dict_other)
            else:
                # Single block: store it as a flat list of lines
                nn = new[0].replace("\u202f", " ")
                Job_Requirements.append(nn.split('\n'))
    # Combine everything into one DataFrame
    df = pd.DataFrame({'Title': title, 'Location': location, 'country': country, 'URLs': links,
                       'Company_Name': company_name, 'Career_Level': Career_Level,
                       'post_time': post_time, 'Experience_Needed': Experience_Needed,
                       'Company_Logo': company_logo, 'Job_Categories': Job_Categories,
                       'Skills_And_Tools': Skills_And_Tools, 'job_description': job_description,
                       'Job_Requirements': Job_Requirements})
    df[:job_num].to_excel('WUZZUF_scrapping.xlsx', index=False)  # modern pandas' to_excel accepts no `encoding` argument
return df[:job_num]
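
# A minimal usage sketch, assuming Chrome and a matching chromedriver are
# installed; the job title "Data Analyst" and the count of 30 are illustrative.
if __name__ == "__main__":
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options

    options = Options()
    options.add_argument("--headless=new")  # run Chrome without a visible window
    driver = webdriver.Chrome(options=options)
    try:
        jobs = Wuzzuf_scrapping("Data Analyst", 30, driver)
        print(jobs[['Title', 'Company_Name', 'country']].head())
    finally:
        driver.quit()  # always release the browser session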