Spaces:

Yassmen
/

Job.web.scrapping

Sleeping

Job.web.scrapping

File size: 4,763 Bytes

5a1c96e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7b36e20
5a1c96e
3ce60d5
261b9b6
3ce60d5
261b9b6
3a477b5
4364f4a
a5b8861
 
 
 
 
 
 
 
29fc20d
 
a5b8861
5a1c96e
a5b8861
5a1c96e
 
 
 
 
 
 
a5b8861
 
5a1c96e
 
a5b8861
5a1c96e
a5b8861
 
 
 
5a1c96e
a5b8861
 
 
5a1c96e
1be82b7
ed0a961
 
 
 
a5b8861
 
03c511d
a5b8861
ed0a961
 
 
1be82b7
03c511d
1be82b7
 
 
ed0a961
5a1c96e
a5b8861
 
5a1c96e
 
 
 
 
 
 
 
 
 
 
 
 
 
29fc20d
5a1c96e
a5b8861
 
 
5a1c96e
 
 
 
 
 
 
 
 
 
 
 
 
 
3cf74b0
f2d0d7c
 
a5b8861


import streamlit as st
import requests
import numpy as np
from PIL import Image
import warnings
warnings.filterwarnings("ignore")
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import bs4
from urllib.request import urlopen
import time
import re
import time
import matplotlib.pyplot as plt 
import seaborn as sns 
import matplotlib as mpl
import plotly 
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as py
from plotly.offline import iplot
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service

import requests
import platform
import zipfile
import os
import subprocess

import streamlit as st
import numpy as np
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from wuzzuf_scraper import Wuzzuf_scrapping
from linkedin_scraper import LINKEDIN_Scrapping
from data_analysis import map_bubble, linkedin_exp, wuzzuf_exp

# Page-wide Streamlit settings: browser tab title, tab icon and wide layout.
st.set_page_config(
    page_title="My Web_Scrap Page",
    page_icon=":tada:",
    layout="wide",
)

# ---- HEADER SECTION ----
# Two equal columns: introduction text on the left, animated image on the right.
with st.container():
    left_column, right_column = st.columns(2)

    # Left column: greeting, title, short description and contact link.
    left_column.subheader("Hi! I am Yassmen :wave:")
    left_column.title("An Electronics and Communication Engineer")
    left_column.write("In this app we will scrap jobs from LinkedIn and Wuzzuf websites, let's get it started :boom:")
    left_column.write("[Reach me >](https://www.linkedin.com/in/yassmen-youssef-48439a166/)")

    # Right column: image file read from the app's working directory.
    right_column.image("im.gif", use_column_width=True)

# Sidebar selections
# Options offered to the user: target website, job title and result count.
webs = ["Wuzzuf", "Linkedin"]
jobs = [
    "Machine Learning",
    "AI Engineer",
    "Data Analysis",
    "Software Testing",
]
nums = np.arange(1, 1000)  # integers 1 through 999

# Widgets created inside the `with` block are placed in the sidebar.
with st.sidebar:
    site = st.selectbox("Select one website", webs)
    job = st.selectbox("Select one job", jobs)
    num_jobs = st.selectbox("Select number of jobs you want to scrap", nums)

# Function to get Selenium driver
from selenium import webdriver
from selenium.webdriver.firefox.service import Service as FirefoxService
from webdriver_manager.firefox import GeckoDriverManager

def get_driver():
    """Create a fresh headless Chrome WebDriver for a single scraping run.

    The function is intentionally NOT decorated with ``st.cache_resource``.
    The callers in this script call ``driver.quit()`` once scraping finishes;
    a cached driver would be the same, already-quit object on every later
    run, so each call must build a new browser session instead.

    Returns:
        A ``webdriver.Chrome`` instance on success, or ``None`` when Chrome
        could not be started. In the failure case the error is shown to the
        user through ``st.error``.
    """
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # no visible browser window
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    try:
        return webdriver.Chrome(options=options)
    except Exception as e:
        # Best-effort start-up: report the problem in the UI and let the
        # caller decide what to do with the missing driver.
        st.error(f"Error initializing WebDriver: {e}")
        return None

import streamlit as st
from streamlit_option_menu import option_menu



import streamlit.components.v1 as components

n2 = pd.DataFrame()

# For each supported site: (scraping function, data-exploration function).
_SITE_HANDLERS = {
    "Wuzzuf": (Wuzzuf_scrapping, wuzzuf_exp),
    "Linkedin": (LINKEDIN_Scrapping, linkedin_exp),
}


def _show_frame(tab, frame):
    """Render ``frame`` in ``tab``, falling back to simpler renderers.

    Tries an interactive dataframe first, then a stringified copy of the
    data, and finally a static table. ``except Exception`` is used rather
    than a bare ``except`` so that interrupts and other non-``Exception``
    signals still propagate.
    """
    try:
        tab.dataframe(frame)
    except Exception:
        try:
            tab.write(frame.astype(str).set_index(frame.index.astype(str)))
        except Exception:
            tab.table(frame)


# Run one scrape when the sidebar button is pressed, then show the results
# in three tabs: raw data, bubble map and data exploration.
if st.sidebar.button('Start Scrapping') and site in _SITE_HANDLERS:
    scrape, explore = _SITE_HANDLERS[site]

    with st.container():
        st.write("---")
        tab1, tab2, tab3 = st.tabs([" Data", " Bubble Map", "Data Exploration"])

        with tab1:
            with st.spinner('✨Now loading...'):
                time.sleep(5)
                driver = get_driver()
                if driver is None:
                    # get_driver() reports the start-up failure with st.error;
                    # without a browser there is nothing to scrape or plot.
                    st.stop()
                try:
                    n1 = scrape(job, num_jobs, driver)
                finally:
                    # Always close the browser, even if the scraper raised,
                    # so no Chrome process is left behind.
                    driver.quit()
                _show_frame(tab1, n1)

        with tab2:
            map_bubble(n1)

        with tab3:
            explore(n1)