import streamlit as st
import requests
import numpy as np
import pandas as pd
from streamlit_lottie import st_lottie
from PIL import Image
import warnings
warnings.filterwarnings("ignore")
from bs4 import BeautifulSoup
import bs4
from urllib.request import urlopen
import time
import re
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
import plotly
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as py
from plotly.offline import iplot
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Settings for running Chrome without a UI (headless)
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome('chromedriver', options=options)
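# Note: with Selenium 4.x the positional driver-path argument above is no longer
# accepted; if that call fails, a Service object can be used instead (assuming
# 'chromedriver' is on PATH or its path is known), e.g.:
#   from selenium.webdriver.chrome.service import Service
#   driver = webdriver.Chrome(service=Service('chromedriver'), options=options)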
# Wuzzuf scraping function
def Wuzzuf_scrapping(job_type, job_num):
    job1 = job_type.split(" ")[0]
    job2 = job_type.split(" ")[1]
    link1 = 'https://wuzzuf.net/search/jobs/?a=navbl&q=' + job1 + '%20' + job2
    title = []
    location = []
    country = []
    job_description = []
    Job_Requirements = []
    company_name = []
    links = []
    Jop_type = []
    Career_Level = []
    company_logo = []
    Job_Categories = []
    Skills_And_Tools = []
    Experience_Needed = []
    post_time = []
    Title = []
    pages_num = np.ceil(job_num/15)
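    # Wuzzuf lists roughly 15 results per page, so ceil(job_num / 15) pages are fetched below.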
    for i in range(int(pages_num)):
        link_new = link1 + '&start=' + str(i)
        data = requests.get(link_new)
        soup = BeautifulSoup(data.content)
        Title = soup.find_all('h2', {'class': 'css-m604qf'})
        # get the info about the jobs on this page
        for x in range(0, len(Title)):
            t = re.split(r'\(|\-', Title[x].find('a').text)
            title.append(t[0].strip())
            loc = re.split(',', soup.find_all('span', {'class': 'css-5wys0k'})[x].text)
            r = ""
            for j in range(len(loc[:-1])):
                r = r + ', ' + loc[:-1][j].strip()
            location.append(r.replace(',', '', 1).strip())
            country.append(loc[-1].strip())
            links.append('https://wuzzuf.net' + Title[x].find('a').attrs['href'])
            m = " ".join(re.findall(r"[a-zA-Z\d+]+", (soup.find_all('div', {'class': 'css-d7j1kk'})[x].find('a').text)))
            company_name.append(m)
            c = soup.find_all('div', {'class': 'css-1lh32fc'})[x].find_all('span')
            if len(c) == 1:
                Jop_type.append(c[0].text)
            else:
                n = []
                for j in range(len(c)):
                    n.append(c[j].text)
                Jop_type.append(n)
            n = soup.find_all('div', {'class': 'css-y4udm8'})[x].find_all('div')[1].find_all(['a', 'span'])
            Career_Level.append(n[0].text)
            yy = n[1].text.replace('·', ' ').strip()
            yy = re.findall('[0-9-+]*', yy)
            y1 = ""
            for j in range(len(yy)):
                if any(yy[j]):
                    y1 = y1 + yy[j]
            if y1 != "":
                Experience_Needed.append(y1)
            else:
                Experience_Needed.append("Not Specified")
            posted = (soup.find_all('div', {'class': 'css-d7j1kk'}))[x].find('div')  # renamed from `time` to avoid shadowing the time module
            post_time.append(posted.text)
            # get the logo of the company
            data1 = requests.get(links[x])
            soup1 = BeautifulSoup(data1.content)
            company_logo.append(soup1.find_all('meta', {'property': "og:image"})[0]['content'])
            # time.sleep(4)
            # get Job_Categories, Skills_And_Tools, job_description and Job_Requirements from the job page
            driver = webdriver.Chrome('chromedriver', options=options)
            # driver.implicitly_wait(10)
            driver.get(links[x])
            Job_Categories.append(driver.find_element(By.XPATH, '//*[@id="app"]/div/main/section[2]/div[5]').text.split("\n")[1:])
            Skills_And_Tools.append(driver.find_element(By.XPATH, '//*[@id="app"]/div/main/section[2]/div[6]').text.split("\n")[1:])
            job_description.append(driver.find_element(By.XPATH, '//*[@id="app"]/div/main/section[3]').text.split("\n")[1:])
            other_sections = driver.find_elements(By.XPATH, '//*[@id="app"]/div/main/section[4]/div')  # renamed from `all` to avoid shadowing the built-in
            dict_other = {}
            new = other_sections[0].text.split("\n\n")
            if len(new) != 1:
                for k in range(len(new)):
                    result = []
                    for item in (new[k].split('\n')[1:]):
                        result.append(item.replace("\u202f", " "))
                    dict_other[new[k].split('\n')[0]] = result
                Job_Requirements.append(dict_other)
            else:
                nn = new[0].replace("\u202f", " ")
                Job_Requirements.append(nn.split('\n'))
    # create a data frame to combine everything together
    df = pd.DataFrame({'Title': title, 'Location': location, 'country': country, 'URLs': links, 'Company_Name': company_name, 'Career_Level': Career_Level, 'post_time': post_time, 'Experience_Needed': Experience_Needed, 'Company_Logo': company_logo, "Job_Categories": Job_Categories, "Skills_And_Tools": Skills_And_Tools, "job_description": job_description, "Job_Requirements": Job_Requirements})
    df[:job_num].to_excel('WUZZUF_scrapping.xlsx', index=False)
    return df[:job_num]
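# Example usage (hypothetical job title and count):
#   wuzzuf_df = Wuzzuf_scrapping("Machine Learning", 30)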
# LinkedIn scraping function
def LINKEDIN_Scrapping(job_search, num_jobs):
    job1 = job_search.split(" ")[0]
    job2 = job_search.split(" ")[1]
    link1 = 'https://www.linkedin.com/jobs/search?keywords=' + job1 + '%20' + job2 + '&location=&geoId=&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0'
    # FIRST: get the main information about the jobs
    title = []
    location = []
    country = []
    company_name = []
    post_time = []
    links = []
    # get the requested number of jobs
    l1 = ""
    ll = ""
    driver = webdriver.Chrome('chromedriver', options=options)
    driver.get(link1)
    SCROLL_PAUSE_TIME = 0.5
    while True:
        l1 = driver.find_elements(By.XPATH, '//*[@id="main-content"]/section[2]/ul/li[*]/div')
        ll = driver.find_elements(By.XPATH, '//*[@id="main-content"]/section[2]/ul/li[*]/div/a')
        if len(l1) >= num_jobs:
            break
        time.sleep(3)
        # Get scroll height
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            # Scroll down to the bottom
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            # Wait for the page to load
            time.sleep(SCROLL_PAUSE_TIME)
            # Calculate the new scroll height and compare it with the last scroll height
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
        options.add_argument("window-size=1200x600")  # note: options added after the driver is created have no effect
        # click the "see more jobs" button to load the next batch of results
        WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="main-content"]/section[2]/button'))).click()
        print(len(l1))
        time.sleep(2)
    l2 = l1[:num_jobs]
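    # Each job card's text splits into lines: the title is usually repeated first,
    # followed by company, location and posting date; cards with an extra badge line
    # carry one more field, which is why both branches below are needed.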
    for info in l2:
        info_tot = info.text.split("\n")
        if len(info_tot) == 5:
            title.append(info_tot[1])
            location.append(info_tot[3])
            company_name.append(info_tot[2])
            post_time.append(info_tot[4])
        else:
            title.append(info_tot[1])
            location.append(info_tot[3])
            company_name.append(info_tot[2])
            post_time.append(info_tot[5])
    # get the links of the jobs
    l3 = ll[:num_jobs]
    for i in l3:
        links.append(i.get_attribute('href'))
    df_ml = pd.DataFrame({'Title': title, 'Location': location, 'URLs': links, 'Company_Name': company_name, 'post_time': post_time})
    # GET DESCRIPTION AND LOGO
    def all_description_LOGO(urls):
        description = []
        LOGO = []
        for link in urls:
            driver = webdriver.Chrome('chromedriver', options=options)
            driver.get(link)
            options.add_argument("window-size=1200x600")
            # expand the "show more" section of the job description
            WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="main-content"]/section[1]/div/div[1]/section[1]/div/div/section/button[1]'))).click()
            K = driver.find_element(By.XPATH, '//*[@id="main-content"]/section[1]/div/section[2]/div/a/img')
            LOGO.append(K.get_attribute('src'))
            time.sleep(3)
            t = driver.find_element(By.XPATH, '//*[@id="main-content"]/section[1]/div/div[1]/section[1]/div/div/section/div')
            # strip a trailing "show more" label from the description text if it is still there
            t_reverse = t.text[::-1]
            if t_reverse[:9] == "erom wohs":
                l = len(t.text)
                strings = t.text[:l-9].split("\n")
                strings[:] = [x for x in strings if x]
                description.append(strings)
            else:
                strings = t.text.split("\n")
                strings[:] = [x for x in strings if x]
                description.append(strings)
        df_ml = pd.DataFrame({'all_about_job': description, 'company_logo': LOGO})
        return df_ml
    # apply the description & logo function
    E = all_description_LOGO(links)
    # other info function
    def other(urls):
        frames = []
        for url in urls:
            data1 = requests.get(url)
            soup1 = BeautifulSoup(data1.content)
            j = soup1.find('ul', {'class': 'description__job-criteria-list'})
            time.sleep(4)
            jj = j.find_all('h3')
            dic = {}
            for i in range(len(jj)):
                dic[jj[i].text.replace('\n', ' ').strip()] = j.find_all('span')[i].text.replace('\n', ' ').strip()
            # DataFrame.append was removed in recent pandas, so build the one-row frame directly
            output = pd.DataFrame([dic])
            frames.append(output)
        result = pd.concat(frames)
        return result
    # apply the other-criteria function
    df = other(links)
    df.fillna('Not_Found', inplace=True)
    df.reset_index(inplace=True, drop=True)
    # combine all the pieces together
    result = pd.concat([df_ml, E, df], axis=1)
    return result
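# Example usage (hypothetical job title and count):
#   linkedin_df = LINKEDIN_Scrapping("Data Analysis", 20)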
##################### map_bubble #####################
#### function to show a map of the job locations
def map_bubble(df):
    import requests
    import urllib.parse
    g = []
    for i in range(len(df.Location)):
        if df.Location.loc[i].split(","):
            g.append(df.Location.loc[i].split(",")[0])
        else:
            g.append(df.Location.loc[i])
    df['new_loc'] = g
    if 'country' in df.columns:
        df["full_location"] = df["new_loc"] + ", " + df["country"]
        dict_cities = dict(df.full_location.value_counts())
    else:
        dict_cities = dict(df.new_loc.value_counts())
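    # Geocode each distinct location with Nominatim; the try/except wraps the whole
    # loop, so a failed lookup stops further geocoding but the results gathered so
    # far are still plotted.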
    lat = []
    lon = []
    bubble_df = pd.DataFrame()
    add = []
    val = []
    try:
        for address in dict_cities.keys():
            url = 'https://nominatim.openstreetmap.org/search/' + urllib.parse.quote(address) + '?format=json'
            response = requests.get(url).json()
            lat.append(response[0]["lat"])
            lon.append(response[0]["lon"])
            add.append(address)
            val.append(dict_cities[address])
    except:
        pass
    bubble_df['address'] = add
    bubble_df['lat'] = lat
    bubble_df['lon'] = lon
    bubble_df['value'] = val
    # import the library
    import folium
    # Make an empty map
    m = folium.Map(location=[20, 0], tiles="OpenStreetMap", zoom_start=2)
    # add the markers to the map one by one
    for i in range(0, len(bubble_df)):
        folium.Circle(
            location=[bubble_df.iloc[i]['lat'], bubble_df.iloc[i]['lon']],
            popup=bubble_df.iloc[i][['address', 'value']].values,
            radius=float(bubble_df.iloc[i]['value'])*500,
            color='#69b3a2',
            fill=True,
            fill_color='#69b3a2'
        ).add_to(m)
    # return the map so the caller can render it
    return m
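# Note: Streamlit does not render a returned folium map on its own; the caller
# would typically display it with the streamlit_folium package (e.g.
# folium_static(m) or st_folium(m)), assuming that package is installed.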
##########################
#########################
#### Wuzzuf analysis
def wuzzuf_exp(df1):
    top10_job_title = df1['Title'].value_counts()[:10]
    fig1 = px.bar(y=top10_job_title.values,
                  x=top10_job_title.index,
                  color=top10_job_title.index,
                  color_discrete_sequence=px.colors.sequential.deep,
                  text=top10_job_title.values,
                  title='Top 10 Job Titles',
                  template='plotly_dark')
    fig1.update_layout(height=500, width=500,
                       xaxis_title="Job Titles",
                       yaxis_title="count",
                       font=dict(size=17, family="Franklin Gothic"))
    st.plotly_chart(fig1)
    type_grouped = df1['Career_Level'].value_counts()
    # e_type = ['Full-Time','Part-Time','Contract','Freelance']
    e_type = dict(df1['Career_Level'].value_counts()).keys()
    fig2 = px.bar(x=e_type, y=type_grouped.values,
                  color=type_grouped.index,
                  color_discrete_sequence=px.colors.sequential.dense,
                  template='plotly_dark',
                  text=type_grouped.values, title='Career Level Distribution')
    fig2.update_layout(height=500, width=500,
                       xaxis_title="Career Level",
                       yaxis_title="count",
                       font=dict(size=17, family="Franklin Gothic"))
    fig2.update_traces(width=0.5)
    st.plotly_chart(fig2)
    residence = df1['Location'].value_counts()
    top10_employee_location = residence[:10]
    fig3 = px.bar(y=top10_employee_location.values,
                  x=top10_employee_location.index,
                  color=top10_employee_location.index,
                  color_discrete_sequence=px.colors.sequential.deep,
                  text=top10_employee_location.values,
                  title='Top 10 Job Locations',
                  template='plotly_dark')
    fig3.update_layout(height=500, width=500,
                       xaxis_title="Job Location",
                       yaxis_title="count",
                       font=dict(size=17, family="Franklin Gothic"))
    st.plotly_chart(fig3)
    type_grouped = df1['Experience_Needed'].value_counts()
    # e_type = ['Full-Time','Part-Time','Contract','Freelance']
    e_type = dict(df1['Experience_Needed'].value_counts()).keys()
    fig4 = px.bar(x=e_type, y=type_grouped.values,
                  color=type_grouped.index,
                  color_discrete_sequence=px.colors.sequential.dense,
                  template='plotly_dark',
                  text=type_grouped.values, title='Experience Level Distribution')
    fig4.update_layout(height=500, width=500,
                       xaxis_title="Experience Level (years)",
                       yaxis_title="count",
                       font=dict(size=17, family="Franklin Gothic"))
    fig4.update_traces(width=0.5)
    st.plotly_chart(fig4)
    return
#########################
### LinkedIn analysis
def linkedin_exp(df1):
    top10_job_title = df1['Title'].value_counts()[:10]
    fig1 = px.bar(y=top10_job_title.values,
                  x=top10_job_title.index,
                  color=top10_job_title.index,
                  color_discrete_sequence=px.colors.sequential.deep,
                  text=top10_job_title.values,
                  title='Top 10 Job Titles',
                  template='plotly_dark')
    fig1.update_layout(height=500, width=500,
                       xaxis_title="Job Titles",
                       yaxis_title="count",
                       font=dict(size=17, family="Franklin Gothic"))
    st.plotly_chart(fig1)
    type_grouped = df1['Employment type'].value_counts()
    # e_type = ['Full-Time','Part-Time','Contract','Freelance']
    e_type = dict(df1['Employment type'].value_counts()).keys()
    fig2 = px.bar(x=e_type, y=type_grouped.values,
                  color=type_grouped.index,
                  color_discrete_sequence=px.colors.sequential.dense,
                  template='plotly_dark',
                  text=type_grouped.values, title='Employment Type Distribution')
    fig2.update_layout(height=500, width=500,
                       xaxis_title="Employment type",
                       yaxis_title="count",
                       font=dict(size=17, family="Franklin Gothic"))
    fig2.update_traces(width=0.5)
    st.plotly_chart(fig2)
    residence = df1['Location'].value_counts()
    top10_employee_location = residence[:10]
    fig3 = px.bar(y=top10_employee_location.values,
                  x=top10_employee_location.index,
                  color=top10_employee_location.index,
                  color_discrete_sequence=px.colors.sequential.deep,
                  text=top10_employee_location.values,
                  title='Top 10 Job Locations',
                  template='plotly_dark')
    fig3.update_layout(height=500, width=500,
                       xaxis_title="Job Location",
                       yaxis_title="count",
                       font=dict(size=17, family="Franklin Gothic"))
    st.plotly_chart(fig3)
    type_grouped = df1['Seniority level'].value_counts()
    # e_type = ['Full-Time','Part-Time','Contract','Freelance']
    e_type = dict(df1['Seniority level'].value_counts()).keys()
    fig4 = px.bar(x=e_type, y=type_grouped.values,
                  color=type_grouped.index,
                  color_discrete_sequence=px.colors.sequential.dense,
                  template='plotly_dark',
                  text=type_grouped.values, title='Seniority Level Distribution')
    fig4.update_layout(height=500, width=500,
                       xaxis_title="Seniority level",
                       yaxis_title="count",
                       font=dict(size=17, family="Franklin Gothic"))
    fig4.update_traces(width=0.5)
    st.plotly_chart(fig4)
    return
########################
####################### Streamlit app ################################
# site = ""
# job = ""
# num_jobs = 0
st.set_page_config(page_title="My Web_Scrap Page", page_icon=":tada:", layout="wide")
# ---- HEADER SECTION ----
with st.container():
    left_column, right_column = st.columns(2)
    with left_column:
        st.subheader("Hi! I am Yassmen :wave:")
        st.title("An Electronics and Communication Engineer")
        st.write(
            "In this app we will scrape jobs from the LinkedIn and Wuzzuf websites, let's get started :boom:"
        )
        st.write("[Reach me >](https://www.linkedin.com/in/yassmen-youssef-48439a166/)")
    with right_column:
        pass
        # st_lottie(lottie_coding, height=300, key="coding")
from streamlit_option_menu import option_menu
# with st.sidebar:
#     selected = option_menu("Main Menu", ["select website", 'search job', 'numbers of jobs'], icons=['linkedin', 'search', '123'], menu_icon="cast", default_index=1)
webs = ["Wuzzuf", "Linkedin"]
jobs = ["Machine Learning", "AI Engineer", "Data Analysis", "Software Testing"]
nums = np.arange(1, 1000)
# with st.sidebar:
# if selected == "select website":
site = st.sidebar.selectbox("select one website", webs)
# elif selected == "search job":
job = st.sidebar.selectbox("select one job", jobs)
# elif selected == "numbers of jobs":
num_jobs = st.sidebar.selectbox("select the number of jobs you want to scrape", nums)
import streamlit.components.v1 as components
import hydralit_components as hc
n2 = pd.DataFrame()
if st.sidebar.button('Start Scraping'):
    if site == "Wuzzuf":
        with st.container():
            st.write("---")
            tab1, tab2, tab3 = st.tabs([" Data", " Bubble Map", "Data Exploration"])
            with tab1:
                with hc.HyLoader('✨Now loading', hc.Loaders.standard_loaders, index=[3, 0, 5]):
                    time.sleep(5)
                    n1 = Wuzzuf_scrapping(job, num_jobs)
                try:
                    tab1.dataframe(n1)
                except:
                    try:
                        tab1.write(n1.astype(str).set_index(n1.index.astype(str)))  # Success
                    except:
                        tab1.table(n1)
            with tab2:
                map_bubble(n1)
            with tab3:
                # tab3.plotly_chart(wuzzuf_exp(n1))
                wuzzuf_exp(n1)
    if site == "Linkedin":
        with st.container():
            st.write("---")
            tab1, tab2, tab3 = st.tabs([" Data", " Bubble Map", "Data Exploration"])
            with tab1:
                with hc.HyLoader('✨Now loading', hc.Loaders.standard_loaders, index=[3, 0, 5]):
                    time.sleep(5)
                    n1 = LINKEDIN_Scrapping(job, num_jobs)
                try:
                    tab1.dataframe(n1)
                except:
                    try:
                        tab1.write(n1.astype(str).set_index(n1.index.astype(str)))  # Success
                    except:
                        tab1.table(n1)
            with tab2:
                map_bubble(n1)
            with tab3:
                linkedin_exp(n1)  # WILL CHANGE