Yassmen committed on
Commit
6d228e9
1 Parent(s): e9b17c3

Create wuzzuf_scraper.py

Files changed (1)
  1. wuzzuf_scraper.py +118 -0
wuzzuf_scraper.py ADDED
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By


def Wuzzuf_scrapping(job_type, job_num):
    """Scrape up to job_num postings matching job_type from wuzzuf.net."""
    # Percent-encode the query so multi-word searches build a valid URL
    # (the original concatenated the first word twice).
    query = '%20'.join(job_type.split())
    link1 = 'https://wuzzuf.net/search/jobs/?a=navbl&q=' + query

    title = []
    location = []
    country = []
    job_description = []
    Job_Requirements = []
    company_name = []
    links = []
    Job_Type = []
    Career_Level = []
    company_logo = []
    Job_Categories = []
    Skills_And_Tools = []
    Experience_Needed = []
    post_time = []

    # Wuzzuf returns 15 results per page.
    pages_num = int(np.ceil(job_num / 15))

    # One headless Chrome instance is reused for all postings instead of
    # launching a new browser per job; chromedriver must be available on PATH.
    options = Options()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)

    for page in range(pages_num):
        link_new = link1 + '&start=' + str(page)
        data = requests.get(link_new)
        soup = BeautifulSoup(data.content, 'html.parser')
        Title = soup.find_all('h2', {'class': 'css-m604qf'})

        # Collect the info shown on each job card of the results page.
        # (The CSS class names and XPaths below are specific to the current
        # Wuzzuf markup and will break if the site changes.)
        for x in range(len(Title)):
            t = re.split(r'\(|-', Title[x].find('a').text)
            title.append(t[0].strip())

            loc = soup.find_all('span', {'class': 'css-5wys0k'})[x].text.split(',')
            location.append(', '.join(part.strip() for part in loc[:-1]))
            country.append(loc[-1].strip())

            links.append('https://wuzzuf.net' + Title[x].find('a').attrs['href'])

            company = soup.find_all('div', {'class': 'css-d7j1kk'})[x].find('a').text
            company_name.append(' '.join(re.findall(r'[a-zA-Z\d+]+', company)))

            c = soup.find_all('div', {'class': 'css-1lh32fc'})[x].find_all('span')
            if len(c) == 1:
                Job_Type.append(c[0].text)
            else:
                Job_Type.append([span.text for span in c])

            n = soup.find_all('div', {'class': 'css-y4udm8'})[x].find_all('div')[1].find_all(['a', 'span'])
            Career_Level.append(n[0].text)

            # Keep only the digit/range part of e.g. "· 3 - 5 Yrs of Exp".
            yy = n[1].text.replace('·', ' ').strip()
            y1 = ''.join(part for part in re.findall(r'[0-9-+]*', yy) if part)
            Experience_Needed.append(y1 if y1 else 'Not Specified')

            posted = soup.find_all('div', {'class': 'css-d7j1kk'})[x].find('div')
            post_time.append(posted.text)

            # The company logo is exposed through the og:image meta tag
            # on the posting page.
            data1 = requests.get(links[x])
            soup1 = BeautifulSoup(data1.content, 'html.parser')
            company_logo.append(soup1.find_all('meta', {'property': 'og:image'})[0]['content'])

            # Job_Categories, Skills_And_Tools, job_description and
            # Job_Requirements are rendered by JavaScript, so they are read
            # through Selenium rather than requests.
            driver.get(links[x])
            Job_Categories.append(driver.find_element(By.XPATH, '//*[@id="app"]/div/main/section[2]/div[5]').text.split('\n')[1:])
            Skills_And_Tools.append(driver.find_element(By.XPATH, '//*[@id="app"]/div/main/section[2]/div[6]').text.split('\n')[1:])
            job_description.append(driver.find_element(By.XPATH, '//*[@id="app"]/div/main/section[3]').text.split('\n')[1:])

            sections = driver.find_elements(By.XPATH, '//*[@id="app"]/div/main/section[4]/div')
            new = sections[0].text.split('\n\n')
            if len(new) != 1:
                # Several requirement blocks: map each heading to its lines.
                dict_other = {}
                for block in new:
                    lines = block.split('\n')
                    dict_other[lines[0]] = [k.replace('\u202f', ' ') for k in lines[1:]]
                Job_Requirements.append(dict_other)
            else:
                Job_Requirements.append(new[0].replace('\u202f', ' ').split('\n'))

    driver.quit()

    # Combine everything into one data frame.
    df = pd.DataFrame({'Title': title, 'Location': location, 'country': country,
                       'URLs': links, 'Company_Name': company_name,
                       'Job_Type': Job_Type, 'Career_Level': Career_Level,
                       'post_time': post_time, 'Experience_Needed': Experience_Needed,
                       'Company_Logo': company_logo, 'Job_Categories': Job_Categories,
                       'Skills_And_Tools': Skills_And_Tools,
                       'job_description': job_description,
                       'Job_Requirements': Job_Requirements})

    # to_excel() requires openpyxl; recent pandas no longer accepts an
    # encoding argument here.
    df[:job_num].to_excel('WUZZUF_scrapping.xlsx', index=False)
    return df[:job_num]
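

# Minimal usage sketch: the query string and job count below are example
# values only, and running this assumes Chrome plus a matching chromedriver
# are installed locally.
if __name__ == '__main__':
    jobs = Wuzzuf_scrapping('data scientist', 30)
    print(jobs.head())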