Yassmen committed

Commit 29fc20d
Parent(s): 6343888

Update app.py

Files changed (1): app.py +6 -462
app.py CHANGED
@@ -2,7 +2,6 @@
  import streamlit as st
  import requests
  import numpy as np
- from streamlit_lottie import st_lottie
  from PIL import Image
  import warnings
  warnings.filterwarnings("ignore")
@@ -43,467 +42,12 @@ options.add_argument("disable-infobars")
  options.add_argument("--disable-extensions")
  driver = webdriver.Chrome('chromedriver',options=options)
 
-
- # wuzzuf function
- def Wuzzuf_scrapping(job_type , job_num):
-     job1 = job_type.split(" ")[0]
-     job2 = job_type.split(" ")[1]
-     link1 = 'https://wuzzuf.net/search/jobs/?a=navbl&q='+job1+'%20'+job1
-     title = []
-     location = []
-     country = []
-     job_description = []
-     Job_Requirements =[]
-     company_name = []
-     links = []
-     Jop_type = []
-     Career_Level = []
-     company_logo = []
-     Job_Categories = []
-     Skills_And_Tools = []
-     Experience_Needed =[]
-     post_time = []
-     Title = []
-     pages_num = np.ceil(job_num/15)
-
-
-     for i in range(int(pages_num) ):
-         link_new = link1 +'&start='+str(i)
-         data = requests.get(link_new)
-         soup = BeautifulSoup(data.content)
-         Title = soup.find_all('h2' , {'class': 'css-m604qf'})
-
-         # to get the info about jobs
-
-         for x in range(0,len(Title)):
-             t = re.split('\(|\-',Title[x].find('a').text)
-             title.append(t[0].strip())
-             loc = re.split(',' , soup.find_all('span' , {'class': 'css-5wys0k'})[x].text)
-             r = ""
-             for i in range(len(loc[:-1])):
-                 r= r+ ', ' +loc[:-1][i].strip()
-             location.append(r.replace(',', '', 1).strip())
-             country.append(loc[-1].strip())
-             links.append('https://wuzzuf.net' + Title[x].find('a').attrs['href'])
-             m = " ".join(re.findall("[a-zA-Z\d+]+", (soup.find_all('div' , {'class': 'css-d7j1kk'})[x].find('a').text)))
-             company_name.append(m)
-             c = soup.find_all('div' ,{'class':'css-1lh32fc'})[x].find_all('span')
-             if len(c) ==1:
-                 Jop_type.append(c[0].text)
-             else:
-                 n =[]
-                 for i in range(len(c)):
-                     n.append(c[i].text)
-                 Jop_type.append(n)
-             n =soup.find_all('div' ,{'class':'css-y4udm8'})[x].find_all('div')[1].find_all(['a','span'])
-             Career_Level.append(n[0].text)
-             n =soup.find_all('div' ,{'class':'css-y4udm8'})[x].find_all('div')[1].find_all(['a','span'])
-
-             yy = n[1].text.replace('·',' ').strip()
-             yy = re.findall('[0-9-+]*',yy)
-             y1 =""
-             for i in range(len(yy)):
-
-                 if any(yy[i]):
-                     y1 = y1+yy[i]
-             if y1 != "":
-                 Experience_Needed.append(y1)
-             else:
-                 Experience_Needed.append("Not Specified")
-             time = (soup.find_all('div' ,{'class':'css-d7j1kk'}))[x].find('div')
-             post_time.append(time.text)
-
-             # to get the logo of the company
-
-             data1 = requests.get(links[x])
-             soup1 = BeautifulSoup(data1.content)
-             company_logo.append(soup1.find_all('meta',{'property':"og:image"})[0]['content'])
-             #time.sleep(4)
-
-
-             # get Job_Categories , Skills_And_Tools , job_description , and job_requirements from urls
-             driver = webdriver.Chrome('chromedriver',options=options)
-             #driver.implicitly_wait(10)
-             driver.get(links[x])
-             Job_Categories.append(driver.find_element(By.XPATH ,'//*[@id="app"]/div/main/section[2]/div[5]').text.split("\n")[1:])
-             Skills_And_Tools.append(driver.find_element(By.XPATH ,'//*[@id="app"]/div/main/section[2]/div[6]').text.split("\n")[1:])
-             job_description.append(driver.find_element(By.XPATH ,'//*[@id="app"]/div/main/section[3]').text.split("\n")[1:])
-             all =driver.find_elements(By.XPATH ,'//*[@id="app"]/div/main/section[4]/div')
-             dict_other = {}
-
-             new = all[0].text.split("\n\n")
-
-             if len(new)!=1 :
-                 for i in range(len(new)):
-                     result =[]
-                     for k in (new[i].split('\n')[1:]):
-                         result.append(k.replace("\u202f"," "))
-                     dict_other[new[i].split('\n')[0]] = result
-
-                 #result = re.sub('[\W_]+', '', ini_string)
-
-                 Job_Requirements.append(dict_other)
-
-             else:
-                 nn = new[0].replace("\u202f"," ")
-                 Job_Requirements.append(nn.split('\n'))
-
-
-     # create data frame to combine all together
-
-     df = pd.DataFrame({'Title' : title , 'Location' : location ,'country':country,'URLs':links ,'Company_Name' : company_name,'Career_Level':Career_Level,'post_time':post_time,'Experience_Needed':Experience_Needed,'Company_Logo':company_logo,"Job_Categories":Job_Categories , "Skills_And_Tools":Skills_And_Tools , "job_description":job_description,"Job_Requirements":Job_Requirements})
-
-     df[:job_num].to_excel('WUZZUF_scrapping.xlsx',index=False,encoding='utf-8')
-     return df[:job_num]
-
-
- # linkedin function
-
-
- def LINKEDIN_Scrapping(job_search , num_jobs):
-     job1 = job_search.split(" ")[0]
-     job2 = job_search.split(" ")[1]
-
-     link1 = 'https://www.linkedin.com/jobs/search?keywords='+job1 +'%20' +job2 +'&location=&geoId=&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0'
-
-     # FIRST get main informations about jobs
-
-     title = []
-     location = []
-     country = []
-     company_name = []
-     post_time = []
-     links =[]
-     # get the specific numbers of jobs
-     l1 = ""
-     ll =""
-     driver = webdriver.Chrome('chromedriver',options=options)
-     driver.get(link1)
-     SCROLL_PAUSE_TIME = 0.5
-     while True :
-         l1 = driver.find_elements(By.XPATH,'//*[@id="main-content"]/section[2]/ul/li[*]/div')
-         ll= driver.find_elements(By.XPATH ,'//*[@id="main-content"]/section[2]/ul/li[*]/div/a')
-
-         if len(l1) >= num_jobs:
-             break
-         time.sleep(3)
-         # Get scroll height
-         last_height = driver.execute_script("return document.body.scrollHeight")
-         while True:
-
-             # Scroll down to bottom
-             driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
-
-             # Wait to load page
-             time.sleep(SCROLL_PAUSE_TIME)
-
-             # Calculate new scroll height and compare with last scroll height
-             new_height = driver.execute_script("return document.body.scrollHeight")
-             if new_height == last_height:
-                 break
-             last_height = new_height
-
-         options.add_argument("window-size=1200x600")
-         WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="main-content"]/section[2]/button'))).click()
-         print(len(l1))
-         time.sleep(2)
-
-
-
-     l2 = l1[:num_jobs]
-
-     for info in l2:
-         info_tot = info.text.split("\n")
-         if len(info_tot)==5:
-             title.append(info_tot[1])
-             location.append(info_tot[3])
-             company_name.append(info_tot[2])
-             post_time.append(info_tot[4])
-         else:
-             title.append(info_tot[1])
-             location.append(info_tot[3])
-             company_name.append(info_tot[2])
-             post_time.append(info_tot[5])
-
-     # get links for jobs
-     l3 = ll[:num_jobs]
-     for i in l3:
-         links.append(i.get_attribute('href'))
-
-     df_ml = pd.DataFrame({'Title' : title , 'Location' : location ,'URLs':links ,'Company_Name' : company_name ,'post_time':post_time})
-
-
-
-
-     # GET DESCRIPTION AND LOGO
-     def all_description_LOGO(urls):
-         description =[]
-         LOGO =[]
-         for link in urls:
-             driver = webdriver.Chrome('chromedriver',options=options)
-             driver.get(link)
-             options.add_argument("window-size=1200x600")
-             WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="main-content"]/section[1]/div/div[1]/section[1]/div/div/section/button[1]'))).click()
-             qqq= 4+444*58/7+65
-             K = driver.find_element(By.XPATH,'//*[@id="main-content"]/section[1]/div/section[2]/div/a/img')
-             LOGO.append(K.get_attribute('src'))
-             time.sleep(3)
-             t = driver.find_element(By.XPATH ,'//*[@id="main-content"]/section[1]/div/div[1]/section[1]/div/div/section/div')
-             t_reverse=t.text[::-1]
-
-             if t_reverse[:9] =="erom wohs":
-                 l = len(t.text)
-                 strings=t.text[:l-9].split("\n")
-                 strings[:] = [x for x in strings if x]
-                 description.append(strings)
-             else:
-                 strings=t.text.split("\n")
-                 strings[:] = [x for x in strings if x]
-                 description.append(strings)
-         df_ml = pd.DataFrame({'all_about_job' : description ,'company_logo':LOGO})
-
-         return df_ml
-
-     # apply desc. and logo function
-     E = all_description_LOGO(links)
-
-     # other info function
-     def other(urls):
-         frames =[]
-         for url in urls:
-             data1 = requests.get(url)
-             soup1 = BeautifulSoup(data1.content)
-             j = soup1.find('ul' , {'class': 'description__job-criteria-list'})
-             time.sleep(4)
-             jj=j.find_all('h3')
-             dic ={}
-             for i in range(len(jj)):
-                 dic[jj[i].text.replace('\n',' ').strip()] = j.find_all('span')[i].text.replace('\n',' ').strip()
-             output = pd.DataFrame()
-             output = output.append(dic, ignore_index=True)
-             frames.append(output)
-         result = pd.concat(frames)
-         return result
-
-     # apply Other function
-     df = other(links)
-     df.fillna('Not_Found',inplace= True)
-     df.reset_index(inplace=True, drop=True)
-
-     # combine all together
-     result = pd.concat([df_ml,E, df ], axis=1)
-
-     return result
-
-
- ##################### map_bubble #####################
-
- #### function to show map for loaction of the job
-
-
-
- def map_bubble(df):
-
-     import requests
-     import urllib.parse
-     g =[]
-     for i in range(len(df.Location)):
-
-         if df.Location.loc[i].split(","):
-             g.append(df.Location.loc[i].split(",")[0])
-         else:
-             g.append(df.Location.loc[i])
-     df['new_loc']=g
-     if 'country' in df.columns:
-         df["full_location"] = df["new_loc"] + ", " +df["country"]
-         dict_cities = dict(df.full_location.value_counts())
-     else :
-         dict_cities = dict(df.new_loc.value_counts())
-     lat = []
-     lon = []
-     bubble_df = pd.DataFrame()
-     add=[]
-     val=[]
-     try:
-         for address in dict_cities.keys():
-             url = 'https://nominatim.openstreetmap.org/search/' + urllib.parse.quote(address) +'?format=json'
-
-             response = requests.get(url).json()
-             lat.append(response[0]["lat"])
-             lon.append(response[0]["lon"])
-             add.append(address)
-             val.append(dict_cities[address])
-     except:
-         pass
-
-     bubble_df['address'] =add
-     bubble_df['lat'] = lat
-     bubble_df['lon'] = lon
-     bubble_df['value'] = val
-
-
-     # import the library
-     import folium
-
-     # Make an empty map
-     m = folium.Map(location=[20,0], tiles="OpenStreetMap", zoom_start=2)
-     # add marker one by one on the map
-     for i in range(0,len(bubble_df)):
-         folium.Circle(
-             location=[bubble_df.iloc[i]['lat'], bubble_df.iloc[i]['lon']],
-
-             popup=bubble_df.iloc[i][['address','value']].values,
-             radius=float(bubble_df.iloc[i]['value'])*500,
-             color='#69b3a2',
-             fill=True,
-             fill_color='#69b3a2'
-         ).add_to(m)
-     m
-     # Show the map again
-     return m
-
-
- ##########################
-
-
-
-
-
- #########################
- #### wuzzuf analysis
- def wuzzuf_exp(df1):
-     top10_job_title = df1['Title'].value_counts()[:10]
-     fig1 = px.bar(y=top10_job_title.values,
-                   x=top10_job_title.index,
-                   color = top10_job_title.index,
-                   color_discrete_sequence=px.colors.sequential.deep,
-                   text=top10_job_title.values,
-                   title= 'Top 10 Job Titles',
-                   template= 'plotly_dark')
-     fig1.update_layout(height=500,width=500,
-                        xaxis_title="Job Titles",
-                        yaxis_title="count",
-                        font = dict(size=17,family="Franklin Gothic"))
-     st.plotly_chart(fig1)
-
-     type_grouped = df1['Career_Level'].value_counts()
-     #e_type = ['Full-Time','Part-Time','Contract','Freelance']
-     e_type =dict(df1['Career_Level'].value_counts()).keys()
-     fig2 = px.bar(x = e_type, y = type_grouped.values,
-                   color = type_grouped.index,
-                   color_discrete_sequence=px.colors.sequential.dense,
-                   template = 'plotly_dark',
-                   text = type_grouped.values, title = 'Career Level Distribution')
-     fig2.update_layout( height=500, width=500,
-                         xaxis_title="Career Level",
-                         yaxis_title="count",
-                         font = dict(size=17,family="Franklin Gothic"))
-     fig2.update_traces(width=0.5)
-     st.plotly_chart(fig2)
-     residence = df1['Location'].value_counts()
-     top10_employee_location = residence[:10]
-     fig3 = px.bar(y=top10_employee_location.values,
-                   x=top10_employee_location.index,
-                   color = top10_employee_location.index,
-                   color_discrete_sequence=px.colors.sequential.deep,
-                   text=top10_employee_location.values,
-                   title= 'Top 10 Location of job',
-                   template= 'plotly_dark')
-     fig3.update_layout(height=500,width=500,
-                        xaxis_title="Location of job",
-                        yaxis_title="count",
-                        font = dict(size=17,family="Franklin Gothic"))
-     st.plotly_chart(fig3)
-
-     type_grouped = df1['Experience_Needed'].value_counts()
-     #e_type = ['Full-Time','Part-Time','Contract','Freelance']
-     e_type =dict(df1['Experience_Needed'].value_counts()).keys()
-     fig4 = px.bar(x = e_type, y = type_grouped.values,
-                   color = type_grouped.index,
-                   color_discrete_sequence=px.colors.sequential.dense,
-                   template = 'plotly_dark',
-                   text = type_grouped.values, title = ' Experience Level Distribution')
-     fig4.update_layout(height=500,width=500,
-                        xaxis_title=" Experience Level (years)",
-                        yaxis_title="count",
-                        font = dict(size=17,family="Franklin Gothic"))
-     fig4.update_traces(width=0.5)
-     st.plotly_chart(fig4)
-     return
-
-
-
- #########################
- ### linkedin analysis
-
- def linkedin_exp(df1):
-     top10_job_title = df1['Title'].value_counts()[:10]
-     fig1 = px.bar(y=top10_job_title.values,
-                   x=top10_job_title.index,
-                   color = top10_job_title.index,
-                   color_discrete_sequence=px.colors.sequential.deep,
-                   text=top10_job_title.values,
-                   title= 'Top 10 Job Titles',
-                   template= 'plotly_dark')
-     fig1.update_layout(height=500,width=500,
-                        xaxis_title="Job Titles",
-                        yaxis_title="count",
-                        font = dict(size=17,family="Franklin Gothic"))
-     st.plotly_chart(fig1)
-
-     type_grouped = df1['Employment type'].value_counts()
-     #e_type = ['Full-Time','Part-Time','Contract','Freelance']
-     e_type =dict(df1['Employment type'].value_counts()).keys()
-     fig2 = px.bar(x = e_type, y = type_grouped.values,
-                   color = type_grouped.index,
-                   color_discrete_sequence=px.colors.sequential.dense,
-                   template = 'plotly_dark',
-                   text = type_grouped.values, title = 'Employment type Distribution')
-     fig2.update_layout( height=500, width=500,
-                         xaxis_title="Employment type",
-                         yaxis_title="count",
-                         font = dict(size=17,family="Franklin Gothic"))
-     fig2.update_traces(width=0.5)
-     st.plotly_chart(fig2)
-     residence = df1['Location'].value_counts()
-     top10_employee_location = residence[:10]
-     fig3 = px.bar(y=top10_employee_location.values,
-                   x=top10_employee_location.index,
-                   color = top10_employee_location.index,
-                   color_discrete_sequence=px.colors.sequential.deep,
-                   text=top10_employee_location.values,
-                   title= 'Top 10 Location of job',
-                   template= 'plotly_dark')
-     fig3.update_layout(height=500,width=500,
-                        xaxis_title="Location of job",
-                        yaxis_title="count",
-                        font = dict(size=17,family="Franklin Gothic"))
-     st.plotly_chart(fig3)
-
-     type_grouped = df1['Seniority level'].value_counts()
-     #e_type = ['Full-Time','Part-Time','Contract','Freelance']
-     e_type =dict(df1['Seniority level'].value_counts()).keys()
-     fig4 = px.bar(x = e_type, y = type_grouped.values,
-                   color = type_grouped.index,
-                   color_discrete_sequence=px.colors.sequential.dense,
-                   template = 'plotly_dark',
-                   text = type_grouped.values, title = 'Seniority level Distribution')
-     fig4.update_layout(height=500,width=500,
-                        xaxis_title="Seniority level",
-                        yaxis_title="count",
-                        font = dict(size=17,family="Franklin Gothic"))
-     fig4.update_traces(width=0.5)
-     st.plotly_chart(fig4)
-     return
-
-
- ########################
+ from wuzzuf_scraper import Wuzzuf_scrapping
+ from linkedin_scraper import LINKEDIN_Scrapping
+ from data_analysis import map_bubble,linkedin_exp,wuzzuf_exp
 
  ####################### stream lit app ################################
 
- #site = ""
- #job =""
- #num_jobs = 0
 
  st.set_page_config(page_title="My Web_Scrap Page", page_icon=":tada:", layout="wide")
@@ -519,7 +63,7 @@ with st.container():
      )
      st.write("[Reach me >](https://www.linkedin.com/in/yassmen-youssef-48439a166/)")
      with right_column:
-         pass
+         st.image("im.gif", use_column_width=True)
          # st_lottie(lottie_coding, height=300, key="coding")
 
 
@@ -556,7 +100,7 @@ if st.sidebar.button('Start Scrapping'):
      st.write("---")
      tab1, tab2 ,tab3= st.tabs([" Data", " Bubble Map","Data Exploration"])
      with tab1 :
-         with hc.HyLoader('✨Now loading' ,hc.Loaders.standard_loaders,index=[3,0,5]):
+         with st.spinner('✨Now loading...' ):
              time.sleep(5)
              n1 = Wuzzuf_scrapping(job ,num_jobs )
              try:
@@ -578,7 +122,7 @@ if st.sidebar.button('Start Scrapping'):
      st.write("---")
      tab1, tab2 ,tab3= st.tabs([" Data", " Bubble Map","Data Exploration"])
      with tab1 :
-         with hc.HyLoader('✨Now loading' ,hc.Loaders.standard_loaders,index=[3,0,5]):
+         with st.spinner('✨Now loading...' ):
              time.sleep(5)
              n1 = LINKEDIN_Scrapping(job ,num_jobs )
              try:
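
Note: the three imports added in this commit assume companion modules sitting next to app.py in the repo. A minimal sketch of the interface those modules would need to expose, inferred from the definitions deleted above (a hypothetical illustration, not files touched by this commit):

# wuzzuf_scraper.py -- would need to export the Wuzzuf scraper
def Wuzzuf_scrapping(job_type, job_num):
    """Scrape up to job_num Wuzzuf postings matching job_type; return a pandas DataFrame."""
    ...

# linkedin_scraper.py -- would need to export the LinkedIn scraper
def LINKEDIN_Scrapping(job_search, num_jobs):
    """Scrape up to num_jobs LinkedIn postings matching job_search; return a pandas DataFrame."""
    ...

# data_analysis.py -- would need to export the map and chart helpers
def map_bubble(df): ...       # folium bubble map of posting locations
def wuzzuf_exp(df1): ...      # Plotly charts for Wuzzuf results
def linkedin_exp(df1): ...    # Plotly charts for LinkedIn results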