Yassmen committed

Commit 29fc20d
Parent(s): 6343888

Update app.py

Files changed (1): app.py +6 -462
app.py CHANGED
@@ -2,7 +2,6 @@
  import streamlit as st
  import requests
  import numpy as np
- from streamlit_lottie import st_lottie
  from PIL import Image
  import warnings
  warnings.filterwarnings("ignore")
@@ -43,467 +42,12 @@ options.add_argument("disable-infobars")
  options.add_argument("--disable-extensions")
  driver = webdriver.Chrome('chromedriver',options=options)
 
-
- # wuzzuf function
- def Wuzzuf_scrapping(job_type , job_num):
-     job1 = job_type.split(" ")[0]
-     job2 = job_type.split(" ")[1]
-     link1 = 'https://wuzzuf.net/search/jobs/?a=navbl&q='+job1+'%20'+job1
-     title = []
-     location = []
-     country = []
-     job_description = []
-     Job_Requirements =[]
-     company_name = []
-     links = []
-     Jop_type = []
-     Career_Level = []
-     company_logo = []
-     Job_Categories = []
-     Skills_And_Tools = []
-     Experience_Needed =[]
-     post_time = []
-     Title = []
-     pages_num = np.ceil(job_num/15)
-
-
-     for i in range(int(pages_num) ):
-         link_new = link1 +'&start='+str(i)
-         data = requests.get(link_new)
-         soup = BeautifulSoup(data.content)
-         Title = soup.find_all('h2' , {'class': 'css-m604qf'})
-
-         # to get the info about jobs
-
-         for x in range(0,len(Title)):
-             t = re.split('\(|\-',Title[x].find('a').text)
-             title.append(t[0].strip())
-             loc = re.split(',' , soup.find_all('span' , {'class': 'css-5wys0k'})[x].text)
-             r = ""
-             for i in range(len(loc[:-1])):
-                 r= r+ ', ' +loc[:-1][i].strip()
-             location.append(r.replace(',', '', 1).strip())
-             country.append(loc[-1].strip())
-             links.append('https://wuzzuf.net' + Title[x].find('a').attrs['href'])
-             m = " ".join(re.findall("[a-zA-Z\d+]+", (soup.find_all('div' , {'class': 'css-d7j1kk'})[x].find('a').text)))
-             company_name.append(m)
-             c = soup.find_all('div' ,{'class':'css-1lh32fc'})[x].find_all('span')
-             if len(c) ==1:
-                 Jop_type.append(c[0].text)
-             else:
-                 n =[]
-                 for i in range(len(c)):
-                     n.append(c[i].text)
-                 Jop_type.append(n)
-             n =soup.find_all('div' ,{'class':'css-y4udm8'})[x].find_all('div')[1].find_all(['a','span'])
-             Career_Level.append(n[0].text)
-             n =soup.find_all('div' ,{'class':'css-y4udm8'})[x].find_all('div')[1].find_all(['a','span'])
-
-             yy = n[1].text.replace('·',' ').strip()
-             yy = re.findall('[0-9-+]*',yy)
-             y1 =""
-             for i in range(len(yy)):
-
-                 if any(yy[i]):
-                     y1 = y1+yy[i]
-             if y1 != "":
-                 Experience_Needed.append(y1)
-             else:
-                 Experience_Needed.append("Not Specified")
-             time = (soup.find_all('div' ,{'class':'css-d7j1kk'}))[x].find('div')
-             post_time.append(time.text)
-
-             # to get the logo of the company
-
-             data1 = requests.get(links[x])
-             soup1 = BeautifulSoup(data1.content)
-             company_logo.append(soup1.find_all('meta',{'property':"og:image"})[0]['content'])
-             #time.sleep(4)
-
-
-             # get Job_Categories , Skills_And_Tools , job_description , and job_requirements from urls
-             driver = webdriver.Chrome('chromedriver',options=options)
-             #driver.implicitly_wait(10)
-             driver.get(links[x])
-             Job_Categories.append(driver.find_element(By.XPATH ,'//*[@id="app"]/div/main/section[2]/div[5]').text.split("\n")[1:])
-             Skills_And_Tools.append(driver.find_element(By.XPATH ,'//*[@id="app"]/div/main/section[2]/div[6]').text.split("\n")[1:])
-             job_description.append(driver.find_element(By.XPATH ,'//*[@id="app"]/div/main/section[3]').text.split("\n")[1:])
-             all =driver.find_elements(By.XPATH ,'//*[@id="app"]/div/main/section[4]/div')
-             dict_other = {}
-
-             new = all[0].text.split("\n\n")
-
-             if len(new)!=1 :
-                 for i in range(len(new)):
-                     result =[]
-                     for k in (new[i].split('\n')[1:]):
-                         result.append(k.replace("\u202f"," "))
-                     dict_other[new[i].split('\n')[0]] = result
-
-                 #result = re.sub('[\W_]+', '', ini_string)
-
-                 Job_Requirements.append(dict_other)
-
-             else:
-                 nn = new[0].replace("\u202f"," ")
-                 Job_Requirements.append(nn.split('\n'))
-
-
-     # create data frame to combine all together
-
-     df = pd.DataFrame({'Title' : title , 'Location' : location ,'country':country,'URLs':links ,'Company_Name' : company_name,'Career_Level':Career_Level,'post_time':post_time,'Experience_Needed':Experience_Needed,'Company_Logo':company_logo,"Job_Categories":Job_Categories , "Skills_And_Tools":Skills_And_Tools , "job_description":job_description,"Job_Requirements":Job_Requirements})
-
-     df[:job_num].to_excel('WUZZUF_scrapping.xlsx',index=False,encoding='utf-8')
-     return df[:job_num]
-
-
- # linkedin function
-
-
- def LINKEDIN_Scrapping(job_search , num_jobs):
-     job1 = job_search.split(" ")[0]
-     job2 = job_search.split(" ")[1]
-
-     link1 = 'https://www.linkedin.com/jobs/search?keywords='+job1 +'%20' +job2 +'&location=&geoId=&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0'
-
-     # FIRST get main informations about jobs
-
-     title = []
-     location = []
-     country = []
-     company_name = []
-     post_time = []
-     links =[]
-     # get the specific numbers of jobs
-     l1 = ""
-     ll =""
-     driver = webdriver.Chrome('chromedriver',options=options)
-     driver.get(link1)
-     SCROLL_PAUSE_TIME = 0.5
-     while True :
-         l1 = driver.find_elements(By.XPATH,'//*[@id="main-content"]/section[2]/ul/li[*]/div')
-         ll= driver.find_elements(By.XPATH ,'//*[@id="main-content"]/section[2]/ul/li[*]/div/a')
-
-         if len(l1) >= num_jobs:
-             break
-         time.sleep(3)
-         # Get scroll height
-         last_height = driver.execute_script("return document.body.scrollHeight")
-         while True:
-
-             # Scroll down to bottom
-             driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
-
-             # Wait to load page
-             time.sleep(SCROLL_PAUSE_TIME)
-
-             # Calculate new scroll height and compare with last scroll height
-             new_height = driver.execute_script("return document.body.scrollHeight")
-             if new_height == last_height:
-                 break
-             last_height = new_height
-
-         options.add_argument("window-size=1200x600")
-         WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="main-content"]/section[2]/button'))).click()
-         print(len(l1))
-         time.sleep(2)
-
-
-
-     l2 = l1[:num_jobs]
-
-     for info in l2:
-         info_tot = info.text.split("\n")
-         if len(info_tot)==5:
-             title.append(info_tot[1])
-             location.append(info_tot[3])
-             company_name.append(info_tot[2])
-             post_time.append(info_tot[4])
-         else:
-             title.append(info_tot[1])
-             location.append(info_tot[3])
-             company_name.append(info_tot[2])
-             post_time.append(info_tot[5])
-
-     # get links for jobs
-     l3 = ll[:num_jobs]
-     for i in l3:
-         links.append(i.get_attribute('href'))
-
-     df_ml = pd.DataFrame({'Title' : title , 'Location' : location ,'URLs':links ,'Company_Name' : company_name ,'post_time':post_time})
-
-
-
-
-     # GET DESCRIPTION AND LOGO
-     def all_description_LOGO(urls):
-         description =[]
-         LOGO =[]
-         for link in urls:
-             driver = webdriver.Chrome('chromedriver',options=options)
-             driver.get(link)
-             options.add_argument("window-size=1200x600")
-             WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="main-content"]/section[1]/div/div[1]/section[1]/div/div/section/button[1]'))).click()
-             qqq= 4+444*58/7+65
-             K = driver.find_element(By.XPATH,'//*[@id="main-content"]/section[1]/div/section[2]/div/a/img')
-             LOGO.append(K.get_attribute('src'))
-             time.sleep(3)
-             t = driver.find_element(By.XPATH ,'//*[@id="main-content"]/section[1]/div/div[1]/section[1]/div/div/section/div')
-             t_reverse=t.text[::-1]
-
-             if t_reverse[:9] =="erom wohs":
-                 l = len(t.text)
-                 strings=t.text[:l-9].split("\n")
-                 strings[:] = [x for x in strings if x]
-                 description.append(strings)
-             else:
-                 strings=t.text.split("\n")
-                 strings[:] = [x for x in strings if x]
-                 description.append(strings)
-         df_ml = pd.DataFrame({'all_about_job' : description ,'company_logo':LOGO})
-
-         return df_ml
-
-     # apply desc. and logo function
-     E = all_description_LOGO(links)
-
-     # other info function
-     def other(urls):
-         frames =[]
-         for url in urls:
-             data1 = requests.get(url)
-             soup1 = BeautifulSoup(data1.content)
-             j = soup1.find('ul' , {'class': 'description__job-criteria-list'})
-             time.sleep(4)
-             jj=j.find_all('h3')
-             dic ={}
-             for i in range(len(jj)):
-                 dic[jj[i].text.replace('\n',' ').strip()] = j.find_all('span')[i].text.replace('\n',' ').strip()
-             output = pd.DataFrame()
-             output = output.append(dic, ignore_index=True)
-             frames.append(output)
-         result = pd.concat(frames)
-         return result
-
-     # apply Other function
-     df = other(links)
-     df.fillna('Not_Found',inplace= True)
-     df.reset_index(inplace=True, drop=True)
-
-     # combine all together
-     result = pd.concat([df_ml,E, df ], axis=1)
-
-     return result
-
-
- ##################### map_bubble #####################
-
- #### function to show map for loaction of the job
-
-
-
- def map_bubble(df):
-
-     import requests
-     import urllib.parse
-     g =[]
-     for i in range(len(df.Location)):
-
-         if df.Location.loc[i].split(","):
-             g.append(df.Location.loc[i].split(",")[0])
-         else:
-             g.append(df.Location.loc[i])
-     df['new_loc']=g
-     if 'country' in df.columns:
-         df["full_location"] = df["new_loc"] + ", " +df["country"]
-         dict_cities = dict(df.full_location.value_counts())
-     else :
-         dict_cities = dict(df.new_loc.value_counts())
-     lat = []
-     lon = []
-     bubble_df = pd.DataFrame()
-     add=[]
-     val=[]
-     try:
-         for address in dict_cities.keys():
-             url = 'https://nominatim.openstreetmap.org/search/' + urllib.parse.quote(address) +'?format=json'
-
-             response = requests.get(url).json()
-             lat.append(response[0]["lat"])
-             lon.append(response[0]["lon"])
-             add.append(address)
-             val.append(dict_cities[address])
-     except:
-         pass
-
-     bubble_df['address'] =add
-     bubble_df['lat'] = lat
-     bubble_df['lon'] = lon
-     bubble_df['value'] = val
-
-
-     # import the library
-     import folium
-
-     # Make an empty map
-     m = folium.Map(location=[20,0], tiles="OpenStreetMap", zoom_start=2)
-     # add marker one by one on the map
-     for i in range(0,len(bubble_df)):
-         folium.Circle(
-             location=[bubble_df.iloc[i]['lat'], bubble_df.iloc[i]['lon']],
-
-             popup=bubble_df.iloc[i][['address','value']].values,
-             radius=float(bubble_df.iloc[i]['value'])*500,
-             color='#69b3a2',
-             fill=True,
-             fill_color='#69b3a2'
-         ).add_to(m)
-     m
-     # Show the map again
-     return m
-
-
- ##########################
-
-
-
-
-
- #########################
- #### wuzzuf analysis
- def wuzzuf_exp(df1):
-     top10_job_title = df1['Title'].value_counts()[:10]
-     fig1 = px.bar(y=top10_job_title.values,
-                   x=top10_job_title.index,
-                   color = top10_job_title.index,
-                   color_discrete_sequence=px.colors.sequential.deep,
-                   text=top10_job_title.values,
-                   title= 'Top 10 Job Titles',
-                   template= 'plotly_dark')
-     fig1.update_layout(height=500,width=500,
-                        xaxis_title="Job Titles",
-                        yaxis_title="count",
-                        font = dict(size=17,family="Franklin Gothic"))
-     st.plotly_chart(fig1)
-
-     type_grouped = df1['Career_Level'].value_counts()
-     #e_type = ['Full-Time','Part-Time','Contract','Freelance']
-     e_type =dict(df1['Career_Level'].value_counts()).keys()
-     fig2 = px.bar(x = e_type, y = type_grouped.values,
-                   color = type_grouped.index,
-                   color_discrete_sequence=px.colors.sequential.dense,
-                   template = 'plotly_dark',
-                   text = type_grouped.values, title = 'Career Level Distribution')
-     fig2.update_layout( height=500, width=500,
-                         xaxis_title="Career Level",
-                         yaxis_title="count",
-                         font = dict(size=17,family="Franklin Gothic"))
-     fig2.update_traces(width=0.5)
-     st.plotly_chart(fig2)
-     residence = df1['Location'].value_counts()
-     top10_employee_location = residence[:10]
-     fig3 = px.bar(y=top10_employee_location.values,
-                   x=top10_employee_location.index,
-                   color = top10_employee_location.index,
-                   color_discrete_sequence=px.colors.sequential.deep,
-                   text=top10_employee_location.values,
-                   title= 'Top 10 Location of job',
-                   template= 'plotly_dark')
-     fig3.update_layout(height=500,width=500,
-                        xaxis_title="Location of job",
-                        yaxis_title="count",
-                        font = dict(size=17,family="Franklin Gothic"))
-     st.plotly_chart(fig3)
-
-     type_grouped = df1['Experience_Needed'].value_counts()
-     #e_type = ['Full-Time','Part-Time','Contract','Freelance']
-     e_type =dict(df1['Experience_Needed'].value_counts()).keys()
-     fig4 = px.bar(x = e_type, y = type_grouped.values,
-                   color = type_grouped.index,
-                   color_discrete_sequence=px.colors.sequential.dense,
-                   template = 'plotly_dark',
-                   text = type_grouped.values, title = ' Experience Level Distribution')
-     fig4.update_layout(height=500,width=500,
-                        xaxis_title=" Experience Level (years)",
-                        yaxis_title="count",
-                        font = dict(size=17,family="Franklin Gothic"))
-     fig4.update_traces(width=0.5)
-     st.plotly_chart(fig4)
-     return
-
-
-
- #########################
- ### linkedin analysis
-
- def linkedin_exp(df1):
-     top10_job_title = df1['Title'].value_counts()[:10]
-     fig1 = px.bar(y=top10_job_title.values,
-                   x=top10_job_title.index,
-                   color = top10_job_title.index,
-                   color_discrete_sequence=px.colors.sequential.deep,
-                   text=top10_job_title.values,
-                   title= 'Top 10 Job Titles',
-                   template= 'plotly_dark')
-     fig1.update_layout(height=500,width=500,
-                        xaxis_title="Job Titles",
-                        yaxis_title="count",
-                        font = dict(size=17,family="Franklin Gothic"))
-     st.plotly_chart(fig1)
-
-     type_grouped = df1['Employment type'].value_counts()
-     #e_type = ['Full-Time','Part-Time','Contract','Freelance']
-     e_type =dict(df1['Employment type'].value_counts()).keys()
-     fig2 = px.bar(x = e_type, y = type_grouped.values,
-                   color = type_grouped.index,
-                   color_discrete_sequence=px.colors.sequential.dense,
-                   template = 'plotly_dark',
-                   text = type_grouped.values, title = 'Employment type Distribution')
-     fig2.update_layout( height=500, width=500,
-                         xaxis_title="Employment type",
-                         yaxis_title="count",
-                         font = dict(size=17,family="Franklin Gothic"))
-     fig2.update_traces(width=0.5)
-     st.plotly_chart(fig2)
-     residence = df1['Location'].value_counts()
-     top10_employee_location = residence[:10]
-     fig3 = px.bar(y=top10_employee_location.values,
-                   x=top10_employee_location.index,
-                   color = top10_employee_location.index,
-                   color_discrete_sequence=px.colors.sequential.deep,
-                   text=top10_employee_location.values,
-                   title= 'Top 10 Location of job',
-                   template= 'plotly_dark')
-     fig3.update_layout(height=500,width=500,
-                        xaxis_title="Location of job",
-                        yaxis_title="count",
-                        font = dict(size=17,family="Franklin Gothic"))
-     st.plotly_chart(fig3)
-
-     type_grouped = df1['Seniority level'].value_counts()
-     #e_type = ['Full-Time','Part-Time','Contract','Freelance']
-     e_type =dict(df1['Seniority level'].value_counts()).keys()
-     fig4 = px.bar(x = e_type, y = type_grouped.values,
-                   color = type_grouped.index,
-                   color_discrete_sequence=px.colors.sequential.dense,
-                   template = 'plotly_dark',
-                   text = type_grouped.values, title = 'Seniority level Distribution')
-     fig4.update_layout(height=500,width=500,
-                        xaxis_title="Seniority level",
-                        yaxis_title="count",
-                        font = dict(size=17,family="Franklin Gothic"))
-     fig4.update_traces(width=0.5)
-     st.plotly_chart(fig4)
-     return
-
-
- ########################
+ from wuzzuf_scraper import Wuzzuf_scrapping
+ from linkedin_scraper import LINKEDIN_Scrapping
+ from data_analysis import map_bubble,linkedin_exp,wuzzuf_exp
 
  ####################### stream lit app ################################
 
- #site = ""
- #job =""
- #num_jobs = 0
 
  st.set_page_config(page_title="My Web_Scrap Page", page_icon=":tada:", layout="wide")
@@ -519,7 +63,7 @@ with st.container():
      )
      st.write("[Reach me >](https://www.linkedin.com/in/yassmen-youssef-48439a166/)")
      with right_column:
-         pass
+         st.image("im.gif", use_column_width=True)
          # st_lottie(lottie_coding, height=300, key="coding")
 
 
@@ -556,7 +100,7 @@ if st.sidebar.button('Start Scrapping'):
      st.write("---")
      tab1, tab2 ,tab3= st.tabs([" Data", " Bubble Map","Data Exploration"])
      with tab1 :
-         with hc.HyLoader('✨Now loading' ,hc.Loaders.standard_loaders,index=[3,0,5]):
+         with st.spinner('✨Now loading...' ):
              time.sleep(5)
              n1 = Wuzzuf_scrapping(job ,num_jobs )
              try:
@@ -578,7 +122,7 @@ if st.sidebar.button('Start Scrapping'):
      st.write("---")
      tab1, tab2 ,tab3= st.tabs([" Data", " Bubble Map","Data Exploration"])
      with tab1 :
-         with hc.HyLoader('✨Now loading' ,hc.Loaders.standard_loaders,index=[3,0,5]):
+         with st.spinner('✨Now loading...' ):
              time.sleep(5)
              n1 = LINKEDIN_Scrapping(job ,num_jobs )
              try:
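
Note: the three imports added in this commit assume companion modules sitting next to app.py in the repo. A minimal sketch of the interface those modules would need to expose, inferred from the definitions deleted above (a hypothetical illustration, not files touched by this commit):

# wuzzuf_scraper.py -- would need to export the Wuzzuf scraper
def Wuzzuf_scrapping(job_type, job_num):
    """Scrape up to job_num Wuzzuf postings matching job_type; return a pandas DataFrame."""
    ...

# linkedin_scraper.py -- would need to export the LinkedIn scraper
def LINKEDIN_Scrapping(job_search, num_jobs):
    """Scrape up to num_jobs LinkedIn postings matching job_search; return a pandas DataFrame."""
    ...

# data_analysis.py -- would need to export the map and chart helpers
def map_bubble(df): ...       # folium bubble map of posting locations
def wuzzuf_exp(df1): ...      # Plotly charts for Wuzzuf results
def linkedin_exp(df1): ...    # Plotly charts for LinkedIn results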