import requests
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
import re
from selenium.webdriver.common.by import By


# Wuzzuf scraping function
def Wuzzuf_scrapping(job_type, job_num, driver):
    """Scrape up to `job_num` Wuzzuf postings matching the two-word `job_type`,
    using an already-initialised Selenium `driver` for dynamic page content."""
    job1 = job_type.split(" ")[0]
    job2 = job_type.split(" ")[1]
    link1 = 'https://wuzzuf.net/search/jobs/?a=navbl&q=' + job1 + '%20' + job2
    title = []
    location = []
    country = []
    job_description = []
    Job_Requirements = []
    company_name = []
    links = []
    Job_type = []
    Career_Level = []
    company_logo = []
    Job_Categories = []
    Skills_And_Tools = []
    Experience_Needed = []
    post_time = []
    Title = []
    pages_num = np.ceil(job_num / 15)  # Wuzzuf shows 15 jobs per results page
    for page in range(int(pages_num)):
        link_new = link1 + '&start=' + str(page)
        try:
            data = requests.get(link_new)
            data.raise_for_status()  # Check for HTTP errors
            soup = BeautifulSoup(data.content, 'html.parser')
            Title = soup.find_all('h2', {'class': 'css-m604qf'})
        except requests.exceptions.RequestException as e:
            # print(f"Request failed: {e}")
            continue  # Skip to the next page if there's an error

        # Extract the basic info for every job card on this results page.
        for x in range(0, len(Title)):
            t = re.split(r'\(|\-', Title[x].find('a').text)
            title.append(t[0].strip())
            loc = re.split(',', soup.find_all('span', {'class': 'css-5wys0k'})[x].text)
            r = ""
            for part in loc[:-1]:
                r = r + ', ' + part.strip()
            location.append(r.replace(',', '', 1).strip())
            country.append(loc[-1].strip())
            links.append(Title[x].find('a').attrs['href'])
            m = " ".join(re.findall(r"[a-zA-Z\d+]+", soup.find_all('div', {'class': 'css-d7j1kk'})[x].find('a').text))
            company_name.append(m)
            # Employment type (full time, part time, ...); a card can list several.
            c = soup.find_all('div', {'class': 'css-1lh32fc'})[x].find_all('span')
            if len(c) == 1:
                Job_type.append(c[0].text)
            else:
                Job_type.append([span.text for span in c])
            # Career level and years of experience share the same container.
            n = soup.find_all('div', {'class': 'css-y4udm8'})[x].find_all('div')[1].find_all(['a', 'span'])
            Career_Level.append(n[0].text)
            yy = n[1].text.replace('·', ' ').strip()
            yy = re.findall('[0-9-+]*', yy)
            y1 = ""
            for part in yy:
                if any(part):
                    y1 = y1 + part
            if y1 != "":
                Experience_Needed.append(y1)
            else:
                Experience_Needed.append("Not Specified")
            time = soup.find_all('div', {'class': 'css-d7j1kk'})[x].find('div')
            post_time.append(time.text)

            # Fetch the company logo from the job page's og:image meta tag.
            try:
                data1 = requests.get(links[x])
                data1.raise_for_status()  # Check for HTTP errors
                soup1 = BeautifulSoup(data1.content, 'html.parser')
                logo_meta = soup1.find_all('meta', {'property': "og:image"})
                if logo_meta:
                    company_logo.append(logo_meta[0]['content'])
                else:
                    print("No logo meta tag found.")
                    company_logo.append("No logo found")
            except requests.exceptions.RequestException as e:
                print(f"Failed to fetch company logo: {e}")
                company_logo.append("Error fetching logo")

            # Get Job_Categories, Skills_And_Tools, job_description and Job_Requirements
            # from the job page, using the Selenium driver passed in by the caller.
            driver.get(links[x])
            Job_Categories.append(driver.find_element(By.XPATH, '//*[@id="app"]/div/main/section[2]/div[5]').text.split("\n")[1:])
            Skills_And_Tools.append(driver.find_element(By.XPATH, '//*[@id="app"]/div/main/section[2]/div[6]').text.split("\n")[1:])
            job_description.append(driver.find_element(By.XPATH, '//*[@id="app"]/div/main/section[3]').text.split("\n")[1:])
            requirements_sections = driver.find_elements(By.XPATH, '//*[@id="app"]/div/main/section[4]/div')
            dict_other = {}
            new = requirements_sections[0].text.split("\n\n")
            if len(new) != 1:
                # Requirements grouped under sub-headings: build a {heading: [items]} dict.
                for block in new:
                    result = []
                    for k in block.split('\n')[1:]:
                        result.append(k.replace("\u202f", " "))
                    dict_other[block.split('\n')[0]] = result
                Job_Requirements.append(dict_other)
            else:
                # A single flat list of requirements, with no sub-headings.
                nn = new[0].replace("\u202f", " ")
                Job_Requirements.append(nn.split('\n'))

    # Combine everything into a single data frame.
    df = pd.DataFrame({'Title': title, 'Location': location, 'country': country, 'URLs': links,
                       'Company_Name': company_name, 'Career_Level': Career_Level, 'post_time': post_time,
                       'Experience_Needed': Experience_Needed, 'Company_Logo': company_logo,
                       'Job_Categories': Job_Categories, 'Skills_And_Tools': Skills_And_Tools,
                       'job_description': job_description, 'Job_Requirements': Job_Requirements})
    df[:job_num].to_excel('WUZZUF_scrapping.xlsx', index=False)  # `encoding` kwarg no longer accepted by to_excel
    return df[:job_num]
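

# Usage sketch (not part of the original file): one way the function might be called
# with a headless Chrome driver. The job title, job count and the --headless flag are
# illustrative assumptions; adjust them to your environment.
if __name__ == "__main__":
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options

    options = Options()
    options.add_argument("--headless=new")      # run Chrome without opening a window
    driver = webdriver.Chrome(options=options)  # assumes a compatible chromedriver is available
    try:
        # Writes WUZZUF_scrapping.xlsx as a side effect and returns the data frame.
        jobs_df = Wuzzuf_scrapping("data analyst", 30, driver)
        print(jobs_df.head())
    finally:
        driver.quit()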