# linkedin_scraper.py
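"""Scrape LinkedIn public job-search results.

LINKEDIN_Scrapping(job_search, num_jobs) collects job cards with Selenium,
opens each posting for its full description and company logo, fetches the
job-criteria list with requests/BeautifulSoup, and returns one combined
pandas DataFrame.
"""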
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# The original referenced an `options` object without ever defining it, and set
# the window size only after the driver was created (too late to take effect).
# This setup is a reasonable reconstruction; headless mode is an assumption.
options = Options()
options.add_argument("--headless")
options.add_argument("window-size=1200x600")
def LINKEDIN_Scrapping(job_search, num_jobs):
    # Build the public job-search URL, URL-encoding spaces in the query.
    # (The original handled exactly two search words and crashed on one.)
    keywords = "%20".join(job_search.split())
    link1 = ('https://www.linkedin.com/jobs/search?keywords=' + keywords +
             '&location=&geoId=&trk=public_jobs_jobs-search-bar_search-submit'
             '&position=1&pageNum=0')
    # FIRST: get the main information about each job card
    title = []
    location = []
    company_name = []
    post_time = []
    links = []

    # load the search results page and keep fetching until we have enough jobs
    l1 = []
    ll = []
    driver = webdriver.Chrome(options=options)
    driver.get(link1)
    SCROLL_PAUSE_TIME = 0.5
    while True:
        l1 = driver.find_elements(By.XPATH, '//*[@id="main-content"]/section[2]/ul/li[*]/div')
        ll = driver.find_elements(By.XPATH, '//*[@id="main-content"]/section[2]/ul/li[*]/div/a')
        if len(l1) >= num_jobs:
            break
        time.sleep(3)
        # scroll to the bottom of the page until its height stops growing
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            # wait for the page to load
            time.sleep(SCROLL_PAUSE_TIME)
            # compare the new scroll height with the last one
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
        # click the "See more jobs" button to load the next batch of results
        WebDriverWait(driver, 15).until(
            EC.element_to_be_clickable((By.XPATH, '//*[@id="main-content"]/section[2]/button'))
        ).click()
    print(len(l1))
    time.sleep(2)
    # A card's text usually splits into 5 lines: indices 1-3 hold title,
    # company and location, and the post time sits at index 4 (or at index 5
    # when the card carries an extra line, e.g. a hiring badge).
    l2 = l1[:num_jobs]
    for info in l2:
        info_tot = info.text.split("\n")
        title.append(info_tot[1])
        company_name.append(info_tot[2])
        location.append(info_tot[3])
        if len(info_tot) == 5:
            post_time.append(info_tot[4])
        else:
            post_time.append(info_tot[5])

    # collect the link of each job card
    l3 = ll[:num_jobs]
    for i in l3:
        links.append(i.get_attribute('href'))

    df_ml = pd.DataFrame({'Title': title, 'Location': location, 'URLs': links,
                          'Company_Name': company_name, 'post_time': post_time})
    # SECOND: open each job page to get the full description and company logo
    def all_description_LOGO(urls):
        description = []
        LOGO = []
        for link in urls:
            driver = webdriver.Chrome(options=options)
            driver.get(link)
            # expand the description by clicking the "Show more" button
            WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH,
                '//*[@id="main-content"]/section[1]/div/div[1]/section[1]/div/div/section/button[1]'))).click()
            K = driver.find_element(By.XPATH, '//*[@id="main-content"]/section[1]/div/section[2]/div/a/img')
            LOGO.append(K.get_attribute('src'))
            time.sleep(3)
            t = driver.find_element(By.XPATH,
                '//*[@id="main-content"]/section[1]/div/div[1]/section[1]/div/div/section/div')
            # strip a trailing "show more" label if it survived the click,
            # then drop empty lines from the description text
            text = t.text
            if text.endswith("show more"):
                text = text[:-9]
            strings = [x for x in text.split("\n") if x]
            description.append(strings)
            # close the per-page driver so Chrome processes don't pile up
            driver.quit()
        df_desc = pd.DataFrame({'all_about_job': description, 'company_logo': LOGO})
        return df_desc
    # apply the description-and-logo function
    E = all_description_LOGO(links)

    # THIRD: scrape the job-criteria list (seniority level, employment type, ...)
    def other(urls):
        frames = []
        for url in urls:
            data1 = requests.get(url)
            soup1 = BeautifulSoup(data1.content, 'html.parser')
            j = soup1.find('ul', {'class': 'description__job-criteria-list'})
            time.sleep(4)
            jj = j.find_all('h3')
            dic = {}
            for i in range(len(jj)):
                dic[jj[i].text.replace('\n', ' ').strip()] = \
                    j.find_all('span')[i].text.replace('\n', ' ').strip()
            # DataFrame.append was removed in pandas 2.x; build the row directly
            frames.append(pd.DataFrame([dic]))
        result = pd.concat(frames)
        return result

    # apply the other() function
    df = other(links)
    df.fillna('Not_Found', inplace=True)
    df.reset_index(inplace=True, drop=True)

    # combine all three frames column-wise
    result = pd.concat([df_ml, E, df], axis=1)
    return result
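

# A minimal usage sketch, not part of the original file. It assumes Chrome and
# a matching chromedriver are installed and on PATH, and that LinkedIn's markup
# still matches the XPaths above (they change often). The output file name is
# illustrative.
if __name__ == "__main__":
    jobs_df = LINKEDIN_Scrapping("machine learning", 5)
    print(jobs_df.head())
    jobs_df.to_csv("linkedin_jobs.csv", index=False)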