import re
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

# Chrome runs headless so no browser window pops up while scraping
options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')


def Wuzzuf_scrapping(job_type, job_num):
    """Scrape up to `job_num` postings matching `job_type` from Wuzzuf,
    save them to WUZZUF_scrapping.xlsx and return them as a DataFrame."""
    job1 = job_type.split(" ")[0]
    job2 = job_type.split(" ")[1]
    link1 = 'https://wuzzuf.net/search/jobs/?a=navbl&q=' + job1 + '%20' + job2

    title = []
    location = []
    country = []
    job_description = []
    Job_Requirements = []
    company_name = []
    links = []
    Job_Type = []
    Career_Level = []
    company_logo = []
    Job_Categories = []
    Skills_And_Tools = []
    Experience_Needed = []
    post_time = []

    # Wuzzuf shows 15 results per page; `start` is the zero-based page index
    pages_num = int(np.ceil(job_num / 15))
    for page in range(pages_num):
        link_new = link1 + '&start=' + str(page)
        data = requests.get(link_new)
        soup = BeautifulSoup(data.content, 'html.parser')
        Title = soup.find_all('h2', {'class': 'css-m604qf'})

        # Collect the info shown on the search-results page for each job card
        for x in range(len(Title)):
            # Job title: keep only the part before the first '(' or '-'
            t = re.split(r'\(|\-', Title[x].find('a').text)
            title.append(t[0].strip())

            # Location reads "district, city, country"; keep the country separately
            loc = re.split(',', soup.find_all('span', {'class': 'css-5wys0k'})[x].text)
            r = ""
            for part in loc[:-1]:
                r = r + ', ' + part.strip()
            location.append(r.replace(',', '', 1).strip())
            country.append(loc[-1].strip())

            links.append('https://wuzzuf.net' + Title[x].find('a').attrs['href'])

            m = " ".join(re.findall(r"[a-zA-Z\d+]+",
                                    soup.find_all('div', {'class': 'css-d7j1kk'})[x].find('a').text))
            company_name.append(m)

            # A posting may carry one or several type tags (Full Time, Remote, ...)
            c = soup.find_all('div', {'class': 'css-1lh32fc'})[x].find_all('span')
            if len(c) == 1:
                Job_Type.append(c[0].text)
            else:
                Job_Type.append([tag.text for tag in c])

            n = soup.find_all('div', {'class': 'css-y4udm8'})[x].find_all('div')[1].find_all(['a', 'span'])
            Career_Level.append(n[0].text)

            # Experience, e.g. "· 3-5 Yrs of Exp": keep only digits, '-' and '+'
            yy = n[1].text.replace('·', ' ').strip()
            yy = re.findall('[0-9-+]*', yy)
            y1 = ""
            for part in yy:
                if any(part):
                    y1 = y1 + part
            Experience_Needed.append(y1 if y1 != "" else "Not Specified")

            posted = soup.find_all('div', {'class': 'css-d7j1kk'})[x].find('div')
            post_time.append(posted.text)

            # The company logo only appears on the job's own page (og:image meta tag)
            data1 = requests.get(links[x])
            soup1 = BeautifulSoup(data1.content, 'html.parser')
            company_logo.append(soup1.find_all('meta', {'property': 'og:image'})[0]['content'])
            # time.sleep(4)  # optional politeness delay between requests

            # Job_Categories, Skills_And_Tools, job_description and Job_Requirements
            # are rendered client-side, so Selenium is needed to read them
            driver = webdriver.Chrome(options=options)
            driver.get(links[x])
            Job_Categories.append(driver.find_element(
                By.XPATH, '//*[@id="app"]/div/main/section[2]/div[5]').text.split("\n")[1:])
            Skills_And_Tools.append(driver.find_element(
                By.XPATH, '//*[@id="app"]/div/main/section[2]/div[6]').text.split("\n")[1:])
            job_description.append(driver.find_element(
                By.XPATH, '//*[@id="app"]/div/main/section[3]').text.split("\n")[1:])

            # The requirements section may contain several titled sub-lists;
            # if so, store them as a {sub-title: [items]} dict
            sections = driver.find_elements(By.XPATH, '//*[@id="app"]/div/main/section[4]/div')
            new = sections[0].text.split("\n\n")
            if len(new) != 1:
                dict_other = {}
                for block in new:
                    lines = block.split('\n')
                    dict_other[lines[0]] = [k.replace("\u202f", " ") for k in lines[1:]]
                Job_Requirements.append(dict_other)
            else:
                Job_Requirements.append(new[0].replace("\u202f", " ").split('\n'))
            driver.quit()

    # Combine everything into one data frame and keep only the first job_num rows
    df = pd.DataFrame({'Title': title, 'Location': location, 'country': country,
                       'URLs': links, 'Company_Name': company_name,
                       'Job_Type': Job_Type, 'Career_Level': Career_Level,
                       'post_time': post_time, 'Experience_Needed': Experience_Needed,
                       'Company_Logo': company_logo, 'Job_Categories': Job_Categories,
                       'Skills_And_Tools': Skills_And_Tools,
                       'job_description': job_description,
                       'Job_Requirements': Job_Requirements})
    df[:job_num].to_excel('WUZZUF_scrapping.xlsx', index=False)
    return df[:job_num]