import streamlit as st
import requests
import numpy as np
import pandas as pd
from streamlit_lottie import st_lottie
from PIL import Image
import warnings
warnings.filterwarnings("ignore")
from bs4 import BeautifulSoup
import bs4
from urllib.request import urlopen
import time
import re
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
import plotly
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as py
from plotly.offline import iplot
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Settings for running Chrome without a UI (headless)
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome('chromedriver', options=options)
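# Note: with Selenium 4.x the positional driver-path argument above is no longer
# accepted; if that call fails, a Service object can be used instead (assuming
# 'chromedriver' is on PATH or its path is known), e.g.:
#   from selenium.webdriver.chrome.service import Service
#   driver = webdriver.Chrome(service=Service('chromedriver'), options=options)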
# Wuzzuf scraping function
def Wuzzuf_scrapping(job_type, job_num):
    job1 = job_type.split(" ")[0]
    job2 = job_type.split(" ")[1]
    link1 = 'https://wuzzuf.net/search/jobs/?a=navbl&q=' + job1 + '%20' + job2
    title = []
    location = []
    country = []
    job_description = []
    Job_Requirements = []
    company_name = []
    links = []
    Jop_type = []
    Career_Level = []
    company_logo = []
    Job_Categories = []
    Skills_And_Tools = []
    Experience_Needed = []
    post_time = []
    Title = []
    pages_num = np.ceil(job_num/15)
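    # Wuzzuf lists roughly 15 results per page, so ceil(job_num / 15) pages are fetched below.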
    for i in range(int(pages_num)):
        link_new = link1 + '&start=' + str(i)
        data = requests.get(link_new)
        soup = BeautifulSoup(data.content)
        Title = soup.find_all('h2', {'class': 'css-m604qf'})
        # get the info about the jobs on this page
        for x in range(0, len(Title)):
            t = re.split(r'\(|\-', Title[x].find('a').text)
            title.append(t[0].strip())
            loc = re.split(',', soup.find_all('span', {'class': 'css-5wys0k'})[x].text)
            r = ""
            for j in range(len(loc[:-1])):
                r = r + ', ' + loc[:-1][j].strip()
            location.append(r.replace(',', '', 1).strip())
            country.append(loc[-1].strip())
            links.append('https://wuzzuf.net' + Title[x].find('a').attrs['href'])
            m = " ".join(re.findall(r"[a-zA-Z\d+]+", (soup.find_all('div', {'class': 'css-d7j1kk'})[x].find('a').text)))
            company_name.append(m)
            c = soup.find_all('div', {'class': 'css-1lh32fc'})[x].find_all('span')
            if len(c) == 1:
                Jop_type.append(c[0].text)
            else:
                n = []
                for j in range(len(c)):
                    n.append(c[j].text)
                Jop_type.append(n)
            n = soup.find_all('div', {'class': 'css-y4udm8'})[x].find_all('div')[1].find_all(['a', 'span'])
            Career_Level.append(n[0].text)
            yy = n[1].text.replace('·', ' ').strip()
            yy = re.findall('[0-9-+]*', yy)
            y1 = ""
            for j in range(len(yy)):
                if any(yy[j]):
                    y1 = y1 + yy[j]
            if y1 != "":
                Experience_Needed.append(y1)
            else:
                Experience_Needed.append("Not Specified")
            posted = (soup.find_all('div', {'class': 'css-d7j1kk'}))[x].find('div')  # renamed from `time` to avoid shadowing the time module
            post_time.append(posted.text)
            # get the logo of the company
            data1 = requests.get(links[x])
            soup1 = BeautifulSoup(data1.content)
            company_logo.append(soup1.find_all('meta', {'property': "og:image"})[0]['content'])
            # time.sleep(4)
            # get Job_Categories, Skills_And_Tools, job_description and Job_Requirements from the job page
            driver = webdriver.Chrome('chromedriver', options=options)
            # driver.implicitly_wait(10)
            driver.get(links[x])
            Job_Categories.append(driver.find_element(By.XPATH, '//*[@id="app"]/div/main/section[2]/div[5]').text.split("\n")[1:])
            Skills_And_Tools.append(driver.find_element(By.XPATH, '//*[@id="app"]/div/main/section[2]/div[6]').text.split("\n")[1:])
            job_description.append(driver.find_element(By.XPATH, '//*[@id="app"]/div/main/section[3]').text.split("\n")[1:])
            other_sections = driver.find_elements(By.XPATH, '//*[@id="app"]/div/main/section[4]/div')  # renamed from `all` to avoid shadowing the built-in
            dict_other = {}
            new = other_sections[0].text.split("\n\n")
            if len(new) != 1:
                for k in range(len(new)):
                    result = []
                    for item in (new[k].split('\n')[1:]):
                        result.append(item.replace("\u202f", " "))
                    dict_other[new[k].split('\n')[0]] = result
                Job_Requirements.append(dict_other)
            else:
                nn = new[0].replace("\u202f", " ")
                Job_Requirements.append(nn.split('\n'))
    # create a data frame to combine everything together
    df = pd.DataFrame({'Title': title, 'Location': location, 'country': country, 'URLs': links, 'Company_Name': company_name, 'Career_Level': Career_Level, 'post_time': post_time, 'Experience_Needed': Experience_Needed, 'Company_Logo': company_logo, "Job_Categories": Job_Categories, "Skills_And_Tools": Skills_And_Tools, "job_description": job_description, "Job_Requirements": Job_Requirements})
    df[:job_num].to_excel('WUZZUF_scrapping.xlsx', index=False)
    return df[:job_num]
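# Example usage (hypothetical job title and count):
#   wuzzuf_df = Wuzzuf_scrapping("Machine Learning", 30)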
# LinkedIn scraping function
def LINKEDIN_Scrapping(job_search, num_jobs):
    job1 = job_search.split(" ")[0]
    job2 = job_search.split(" ")[1]
    link1 = 'https://www.linkedin.com/jobs/search?keywords=' + job1 + '%20' + job2 + '&location=&geoId=&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0'
    # FIRST: get the main information about the jobs
    title = []
    location = []
    country = []
    company_name = []
    post_time = []
    links = []
    # get the requested number of jobs
    l1 = ""
    ll = ""
    driver = webdriver.Chrome('chromedriver', options=options)
    driver.get(link1)
    SCROLL_PAUSE_TIME = 0.5
    while True:
        l1 = driver.find_elements(By.XPATH, '//*[@id="main-content"]/section[2]/ul/li[*]/div')
        ll = driver.find_elements(By.XPATH, '//*[@id="main-content"]/section[2]/ul/li[*]/div/a')
        if len(l1) >= num_jobs:
            break
        time.sleep(3)
        # Get scroll height
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            # Scroll down to the bottom
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            # Wait for the page to load
            time.sleep(SCROLL_PAUSE_TIME)
            # Calculate the new scroll height and compare it with the last scroll height
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
        options.add_argument("window-size=1200x600")  # note: options added after the driver is created have no effect
        # click the "see more jobs" button to load the next batch of results
        WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="main-content"]/section[2]/button'))).click()
        print(len(l1))
        time.sleep(2)
    l2 = l1[:num_jobs]
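    # Each job card's text splits into lines: the title is usually repeated first,
    # followed by company, location and posting date; cards with an extra badge line
    # carry one more field, which is why both branches below are needed.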
    for info in l2:
        info_tot = info.text.split("\n")
        if len(info_tot) == 5:
            title.append(info_tot[1])
            location.append(info_tot[3])
            company_name.append(info_tot[2])
            post_time.append(info_tot[4])
        else:
            title.append(info_tot[1])
            location.append(info_tot[3])
            company_name.append(info_tot[2])
            post_time.append(info_tot[5])
    # get the links of the jobs
    l3 = ll[:num_jobs]
    for i in l3:
        links.append(i.get_attribute('href'))
    df_ml = pd.DataFrame({'Title': title, 'Location': location, 'URLs': links, 'Company_Name': company_name, 'post_time': post_time})
    # GET DESCRIPTION AND LOGO
    def all_description_LOGO(urls):
        description = []
        LOGO = []
        for link in urls:
            driver = webdriver.Chrome('chromedriver', options=options)
            driver.get(link)
            options.add_argument("window-size=1200x600")
            # expand the "show more" section of the job description
            WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="main-content"]/section[1]/div/div[1]/section[1]/div/div/section/button[1]'))).click()
            K = driver.find_element(By.XPATH, '//*[@id="main-content"]/section[1]/div/section[2]/div/a/img')
            LOGO.append(K.get_attribute('src'))
            time.sleep(3)
            t = driver.find_element(By.XPATH, '//*[@id="main-content"]/section[1]/div/div[1]/section[1]/div/div/section/div')
            # strip a trailing "show more" label from the description text if it is still there
            t_reverse = t.text[::-1]
            if t_reverse[:9] == "erom wohs":
                l = len(t.text)
                strings = t.text[:l-9].split("\n")
                strings[:] = [x for x in strings if x]
                description.append(strings)
            else:
                strings = t.text.split("\n")
                strings[:] = [x for x in strings if x]
                description.append(strings)
        df_ml = pd.DataFrame({'all_about_job': description, 'company_logo': LOGO})
        return df_ml
    # apply the description & logo function
    E = all_description_LOGO(links)
    # other info function
    def other(urls):
        frames = []
        for url in urls:
            data1 = requests.get(url)
            soup1 = BeautifulSoup(data1.content)
            j = soup1.find('ul', {'class': 'description__job-criteria-list'})
            time.sleep(4)
            jj = j.find_all('h3')
            dic = {}
            for i in range(len(jj)):
                dic[jj[i].text.replace('\n', ' ').strip()] = j.find_all('span')[i].text.replace('\n', ' ').strip()
            # DataFrame.append was removed in recent pandas, so build the one-row frame directly
            output = pd.DataFrame([dic])
            frames.append(output)
        result = pd.concat(frames)
        return result
    # apply the other-criteria function
    df = other(links)
    df.fillna('Not_Found', inplace=True)
    df.reset_index(inplace=True, drop=True)
    # combine all the pieces together
    result = pd.concat([df_ml, E, df], axis=1)
    return result
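# Example usage (hypothetical job title and count):
#   linkedin_df = LINKEDIN_Scrapping("Data Analysis", 20)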
##################### map_bubble #####################
#### function to show a map of the job locations
def map_bubble(df):
    import requests
    import urllib.parse
    g = []
    for i in range(len(df.Location)):
        if df.Location.loc[i].split(","):
            g.append(df.Location.loc[i].split(",")[0])
        else:
            g.append(df.Location.loc[i])
    df['new_loc'] = g
    if 'country' in df.columns:
        df["full_location"] = df["new_loc"] + ", " + df["country"]
        dict_cities = dict(df.full_location.value_counts())
    else:
        dict_cities = dict(df.new_loc.value_counts())
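    # Geocode each distinct location with Nominatim; the try/except wraps the whole
    # loop, so a failed lookup stops further geocoding but the results gathered so
    # far are still plotted.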
    lat = []
    lon = []
    bubble_df = pd.DataFrame()
    add = []
    val = []
    try:
        for address in dict_cities.keys():
            url = 'https://nominatim.openstreetmap.org/search/' + urllib.parse.quote(address) + '?format=json'
            response = requests.get(url).json()
            lat.append(response[0]["lat"])
            lon.append(response[0]["lon"])
            add.append(address)
            val.append(dict_cities[address])
    except:
        pass
    bubble_df['address'] = add
    bubble_df['lat'] = lat
    bubble_df['lon'] = lon
    bubble_df['value'] = val
    # import the library
    import folium
    # Make an empty map
    m = folium.Map(location=[20, 0], tiles="OpenStreetMap", zoom_start=2)
    # add the markers to the map one by one
    for i in range(0, len(bubble_df)):
        folium.Circle(
            location=[bubble_df.iloc[i]['lat'], bubble_df.iloc[i]['lon']],
            popup=bubble_df.iloc[i][['address', 'value']].values,
            radius=float(bubble_df.iloc[i]['value'])*500,
            color='#69b3a2',
            fill=True,
            fill_color='#69b3a2'
        ).add_to(m)
    # return the map so the caller can render it
    return m
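# Note: Streamlit does not render a returned folium map on its own; the caller
# would typically display it with the streamlit_folium package (e.g.
# folium_static(m) or st_folium(m)), assuming that package is installed.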
##########################
#########################
#### Wuzzuf analysis
def wuzzuf_exp(df1):
    top10_job_title = df1['Title'].value_counts()[:10]
    fig1 = px.bar(y=top10_job_title.values,
                  x=top10_job_title.index,
                  color=top10_job_title.index,
                  color_discrete_sequence=px.colors.sequential.deep,
                  text=top10_job_title.values,
                  title='Top 10 Job Titles',
                  template='plotly_dark')
    fig1.update_layout(height=500, width=500,
                       xaxis_title="Job Titles",
                       yaxis_title="count",
                       font=dict(size=17, family="Franklin Gothic"))
    st.plotly_chart(fig1)
    type_grouped = df1['Career_Level'].value_counts()
    # e_type = ['Full-Time','Part-Time','Contract','Freelance']
    e_type = dict(df1['Career_Level'].value_counts()).keys()
    fig2 = px.bar(x=e_type, y=type_grouped.values,
                  color=type_grouped.index,
                  color_discrete_sequence=px.colors.sequential.dense,
                  template='plotly_dark',
                  text=type_grouped.values, title='Career Level Distribution')
    fig2.update_layout(height=500, width=500,
                       xaxis_title="Career Level",
                       yaxis_title="count",
                       font=dict(size=17, family="Franklin Gothic"))
    fig2.update_traces(width=0.5)
    st.plotly_chart(fig2)
    residence = df1['Location'].value_counts()
    top10_employee_location = residence[:10]
    fig3 = px.bar(y=top10_employee_location.values,
                  x=top10_employee_location.index,
                  color=top10_employee_location.index,
                  color_discrete_sequence=px.colors.sequential.deep,
                  text=top10_employee_location.values,
                  title='Top 10 Job Locations',
                  template='plotly_dark')
    fig3.update_layout(height=500, width=500,
                       xaxis_title="Job Location",
                       yaxis_title="count",
                       font=dict(size=17, family="Franklin Gothic"))
    st.plotly_chart(fig3)
    type_grouped = df1['Experience_Needed'].value_counts()
    # e_type = ['Full-Time','Part-Time','Contract','Freelance']
    e_type = dict(df1['Experience_Needed'].value_counts()).keys()
    fig4 = px.bar(x=e_type, y=type_grouped.values,
                  color=type_grouped.index,
                  color_discrete_sequence=px.colors.sequential.dense,
                  template='plotly_dark',
                  text=type_grouped.values, title='Experience Level Distribution')
    fig4.update_layout(height=500, width=500,
                       xaxis_title="Experience Level (years)",
                       yaxis_title="count",
                       font=dict(size=17, family="Franklin Gothic"))
    fig4.update_traces(width=0.5)
    st.plotly_chart(fig4)
    return
#########################
### LinkedIn analysis
def linkedin_exp(df1):
    top10_job_title = df1['Title'].value_counts()[:10]
    fig1 = px.bar(y=top10_job_title.values,
                  x=top10_job_title.index,
                  color=top10_job_title.index,
                  color_discrete_sequence=px.colors.sequential.deep,
                  text=top10_job_title.values,
                  title='Top 10 Job Titles',
                  template='plotly_dark')
    fig1.update_layout(height=500, width=500,
                       xaxis_title="Job Titles",
                       yaxis_title="count",
                       font=dict(size=17, family="Franklin Gothic"))
    st.plotly_chart(fig1)
    type_grouped = df1['Employment type'].value_counts()
    # e_type = ['Full-Time','Part-Time','Contract','Freelance']
    e_type = dict(df1['Employment type'].value_counts()).keys()
    fig2 = px.bar(x=e_type, y=type_grouped.values,
                  color=type_grouped.index,
                  color_discrete_sequence=px.colors.sequential.dense,
                  template='plotly_dark',
                  text=type_grouped.values, title='Employment Type Distribution')
    fig2.update_layout(height=500, width=500,
                       xaxis_title="Employment type",
                       yaxis_title="count",
                       font=dict(size=17, family="Franklin Gothic"))
    fig2.update_traces(width=0.5)
    st.plotly_chart(fig2)
    residence = df1['Location'].value_counts()
    top10_employee_location = residence[:10]
    fig3 = px.bar(y=top10_employee_location.values,
                  x=top10_employee_location.index,
                  color=top10_employee_location.index,
                  color_discrete_sequence=px.colors.sequential.deep,
                  text=top10_employee_location.values,
                  title='Top 10 Job Locations',
                  template='plotly_dark')
    fig3.update_layout(height=500, width=500,
                       xaxis_title="Job Location",
                       yaxis_title="count",
                       font=dict(size=17, family="Franklin Gothic"))
    st.plotly_chart(fig3)
    type_grouped = df1['Seniority level'].value_counts()
    # e_type = ['Full-Time','Part-Time','Contract','Freelance']
    e_type = dict(df1['Seniority level'].value_counts()).keys()
    fig4 = px.bar(x=e_type, y=type_grouped.values,
                  color=type_grouped.index,
                  color_discrete_sequence=px.colors.sequential.dense,
                  template='plotly_dark',
                  text=type_grouped.values, title='Seniority Level Distribution')
    fig4.update_layout(height=500, width=500,
                       xaxis_title="Seniority level",
                       yaxis_title="count",
                       font=dict(size=17, family="Franklin Gothic"))
    fig4.update_traces(width=0.5)
    st.plotly_chart(fig4)
    return
########################
####################### Streamlit app ################################
# site = ""
# job = ""
# num_jobs = 0
st.set_page_config(page_title="My Web_Scrap Page", page_icon=":tada:", layout="wide")
# ---- HEADER SECTION ----
with st.container():
    left_column, right_column = st.columns(2)
    with left_column:
        st.subheader("Hi! I am Yassmen :wave:")
        st.title("An Electronics and Communication Engineer")
        st.write(
            "In this app we will scrape jobs from the LinkedIn and Wuzzuf websites, let's get started :boom:"
        )
        st.write("[Reach me >](https://www.linkedin.com/in/yassmen-youssef-48439a166/)")
    with right_column:
        pass
        # st_lottie(lottie_coding, height=300, key="coding")
from streamlit_option_menu import option_menu
# with st.sidebar:
#     selected = option_menu("Main Menu", ["select website", 'search job', 'numbers of jobs'], icons=['linkedin', 'search', '123'], menu_icon="cast", default_index=1)
webs = ["Wuzzuf", "Linkedin"]
jobs = ["Machine Learning", "AI Engineer", "Data Analysis", "Software Testing"]
nums = np.arange(1, 1000)
# with st.sidebar:
# if selected == "select website":
site = st.sidebar.selectbox("select one website", webs)
# elif selected == "search job":
job = st.sidebar.selectbox("select one job", jobs)
# elif selected == "numbers of jobs":
num_jobs = st.sidebar.selectbox("select the number of jobs you want to scrape", nums)
import streamlit.components.v1 as components
import hydralit_components as hc
n2 = pd.DataFrame()
if st.sidebar.button('Start Scraping'):
    if site == "Wuzzuf":
        with st.container():
            st.write("---")
            tab1, tab2, tab3 = st.tabs([" Data", " Bubble Map", "Data Exploration"])
            with tab1:
                with hc.HyLoader('✨Now loading', hc.Loaders.standard_loaders, index=[3, 0, 5]):
                    time.sleep(5)
                    n1 = Wuzzuf_scrapping(job, num_jobs)
                try:
                    tab1.dataframe(n1)
                except:
                    try:
                        tab1.write(n1.astype(str).set_index(n1.index.astype(str)))  # Success
                    except:
                        tab1.table(n1)
            with tab2:
                map_bubble(n1)
            with tab3:
                # tab3.plotly_chart(wuzzuf_exp(n1))
                wuzzuf_exp(n1)
    if site == "Linkedin":
        with st.container():
            st.write("---")
            tab1, tab2, tab3 = st.tabs([" Data", " Bubble Map", "Data Exploration"])
            with tab1:
                with hc.HyLoader('✨Now loading', hc.Loaders.standard_loaders, index=[3, 0, 5]):
                    time.sleep(5)
                    n1 = LINKEDIN_Scrapping(job, num_jobs)
                try:
                    tab1.dataframe(n1)
                except:
                    try:
                        tab1.write(n1.astype(str).set_index(n1.index.astype(str)))  # Success
                    except:
                        tab1.table(n1)
            with tab2:
                map_bubble(n1)
            with tab3:
                linkedin_exp(n1)  # WILL CHANGE