# Spaces: Yassmen / Job.web.scrapping (Sleeping)
# File size: 4,433 Bytes
# Revision: e9b17c3

import time
from urllib.parse import quote

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def LINKEDIN_Scrapping(job_search , num_jobs):
  job1 = job_search.split(" ")[0]
  job2 = job_search.split(" ")[1]

  link1 = 'https://www.linkedin.com/jobs/search?keywords='+job1 +'%20' +job2 +'&location=&geoId=&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0'
  
  # FIRST get main informations about jobs

  title = []
  location = []
  country = []
  company_name = []
  post_time = []
  links =[]
  # get the specific numbers of jobs
  l1 = ""
  ll =""
  driver = webdriver.Chrome('chromedriver',options=options)
  driver.get(link1)
  SCROLL_PAUSE_TIME = 0.5
  while True :
    l1 = driver.find_elements(By.XPATH,'//*[@id="main-content"]/section[2]/ul/li[*]/div')
    ll= driver.find_elements(By.XPATH ,'//*[@id="main-content"]/section[2]/ul/li[*]/div/a') 

    if len(l1) >= num_jobs:
      break
    time.sleep(3)
    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait to load page
        time.sleep(SCROLL_PAUSE_TIME)

        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
        
    options.add_argument("window-size=1200x600")
    WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="main-content"]/section[2]/button'))).click()
    print(len(l1))
    time.sleep(2)
    


  l2 = l1[:num_jobs]

  for info in l2:   
    info_tot = info.text.split("\n")
    if len(info_tot)==5:
      title.append(info_tot[1])
      location.append(info_tot[3])
      company_name.append(info_tot[2])
      post_time.append(info_tot[4])
    else:
      title.append(info_tot[1])
      location.append(info_tot[3])
      company_name.append(info_tot[2])
      post_time.append(info_tot[5])

  # get links for jobs
  l3 = ll[:num_jobs]
  for i in l3:
    links.append(i.get_attribute('href'))
  
  df_ml = pd.DataFrame({'Title' : title , 'Location' : location ,'URLs':links ,'Company_Name' : company_name ,'post_time':post_time})




    # GET DESCRIPTION AND LOGO 
  def all_description_LOGO(urls):
    description =[]
    LOGO =[]
    for link in urls:         
      driver = webdriver.Chrome('chromedriver',options=options)
      driver.get(link)
      options.add_argument("window-size=1200x600")
      WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="main-content"]/section[1]/div/div[1]/section[1]/div/div/section/button[1]'))).click()
      qqq= 4+444*58/7+65
      K = driver.find_element(By.XPATH,'//*[@id="main-content"]/section[1]/div/section[2]/div/a/img')
      LOGO.append(K.get_attribute('src'))
      time.sleep(3)
      t = driver.find_element(By.XPATH ,'//*[@id="main-content"]/section[1]/div/div[1]/section[1]/div/div/section/div')
      t_reverse=t.text[::-1]

      if t_reverse[:9] =="erom wohs":
        l = len(t.text)
        strings=t.text[:l-9].split("\n")
        strings[:] = [x for x in strings if x]
        description.append(strings)
      else:
        strings=t.text.split("\n")
        strings[:] = [x for x in strings if x]
        description.append(strings)
    df_ml = pd.DataFrame({'all_about_job' : description ,'company_logo':LOGO})

    return df_ml

  # apply desc. and logo function
  E = all_description_LOGO(links)

  # other info function
  def other(urls):
    frames =[]
    for url in urls:
      data1 = requests.get(url)
      soup1 = BeautifulSoup(data1.content)
      j =  soup1.find('ul' , {'class': 'description__job-criteria-list'})
      time.sleep(4)
      jj=j.find_all('h3')
      dic ={}
      for i in range(len(jj)):
        dic[jj[i].text.replace('\n',' ').strip()] = j.find_all('span')[i].text.replace('\n',' ').strip()
      output = pd.DataFrame()
      output = output.append(dic, ignore_index=True) 
      frames.append(output)
    result = pd.concat(frames)
    return result

  # apply Other function
  df = other(links)
  df.fillna('Not_Found',inplace= True)
  df.reset_index(inplace=True, drop=True)
 
 # combine all together
  result = pd.concat([df_ml,E, df ], axis=1)

  return result