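"""Scrape LinkedIn job postings with Selenium + BeautifulSoup: collect job
cards for a search phrase, then each job's description, company logo and
job-criteria fields, and return everything as one pandas DataFrame."""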
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
def LINKEDIN_Scrapping(job_search, num_jobs):
    # Build the search URL; spaces in the search phrase become %20 so that
    # one-word and multi-word searches both work.
    keywords = job_search.replace(" ", "%20")
    link1 = ('https://www.linkedin.com/jobs/search?keywords=' + keywords
             + '&location=&geoId=&trk=public_jobs_jobs-search-bar_search-submit&position=1&pageNum=0')
    # FIRST: get the main information about each job card.
    title = []
    location = []
    company_name = []
    post_time = []
    links = []

    # Chrome options must be configured before the driver is created.
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # assumption: run without a visible browser window
    options.add_argument("window-size=1200x600")
    driver = webdriver.Chrome(options=options)
    driver.get(link1)

    SCROLL_PAUSE_TIME = 0.5
    # Keep loading results until at least num_jobs job cards are on the page.
    while True:
        l1 = driver.find_elements(By.XPATH, '//*[@id="main-content"]/section[2]/ul/li[*]/div')
        ll = driver.find_elements(By.XPATH, '//*[@id="main-content"]/section[2]/ul/li[*]/div/a')
        if len(l1) >= num_jobs:
            break
        time.sleep(3)

        # Scroll down until the page height stops growing.
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            # Scroll to the bottom and wait for the page to load.
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(SCROLL_PAUSE_TIME)
            # Compare the new scroll height with the last scroll height.
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # When scrolling stalls, click the "See more jobs" button to load more.
        try:
            WebDriverWait(driver, 15).until(EC.element_to_be_clickable(
                (By.XPATH, '//*[@id="main-content"]/section[2]/button'))).click()
        except Exception:
            break  # button not clickable: no more results to load
        print(len(l1))
        time.sleep(2)

    l2 = l1[:num_jobs]
    # Each job card's text holds title, company, location and posting time.
    for info in l2:
        info_tot = info.text.split("\n")
        if len(info_tot) == 5:
            title.append(info_tot[1])
            location.append(info_tot[3])
            company_name.append(info_tot[2])
            post_time.append(info_tot[4])
        else:
            title.append(info_tot[1])
            location.append(info_tot[3])
            company_name.append(info_tot[2])
            post_time.append(info_tot[5])

    # Get the link for each job, then release the listing-page driver.
    l3 = ll[:num_jobs]
    for i in l3:
        links.append(i.get_attribute('href'))
    driver.quit()

    df_ml = pd.DataFrame({'Title': title, 'Location': location, 'URLs': links,
                          'Company_Name': company_name, 'post_time': post_time})
    # GET DESCRIPTION AND LOGO from every job page.
    def all_description_LOGO(urls):
        description = []
        LOGO = []
        for link in urls:
            driver = webdriver.Chrome(options=options)
            driver.get(link)
            # Expand the description via the "Show more" button when present.
            try:
                WebDriverWait(driver, 15).until(EC.element_to_be_clickable(
                    (By.XPATH, '//*[@id="main-content"]/section[1]/div/div[1]/section[1]/div/div/section/button[1]'))).click()
            except Exception:
                pass
            K = driver.find_element(By.XPATH, '//*[@id="main-content"]/section[1]/div/section[2]/div/a/img')
            LOGO.append(K.get_attribute('src'))
            time.sleep(3)
            t = driver.find_element(By.XPATH, '//*[@id="main-content"]/section[1]/div/div[1]/section[1]/div/div/section/div')
            # Strip a trailing "show more" label, then drop empty lines.
            text = t.text
            if text.endswith("show more"):
                text = text[:-len("show more")]
            strings = [x for x in text.split("\n") if x]
            description.append(strings)
            driver.quit()
        return pd.DataFrame({'all_about_job': description, 'company_logo': LOGO})
    # Apply the description-and-logo function.
    E = all_description_LOGO(links)

    # Other info: scrape the job-criteria list (seniority level, employment
    # type, job function, industries) with requests + BeautifulSoup.
    def other(urls):
        frames = []
        for url in urls:
            data1 = requests.get(url)
            soup1 = BeautifulSoup(data1.content, 'html.parser')
            j = soup1.find('ul', {'class': 'description__job-criteria-list'})
            time.sleep(4)
            dic = {}
            if j is not None:
                headers = j.find_all('h3')
                values = j.find_all('span')
                for i in range(len(headers)):
                    dic[headers[i].text.replace('\n', ' ').strip()] = values[i].text.replace('\n', ' ').strip()
            frames.append(pd.DataFrame([dic]))
        result = pd.concat(frames)
        return result

    # Apply the other-info function.
    df = other(links)
    df.fillna('Not_Found', inplace=True)
    df.reset_index(inplace=True, drop=True)

    # Combine all pieces together.
    result = pd.concat([df_ml, E, df], axis=1)
    return result
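
# Usage sketch (an assumption, not part of the original Space): requires a
# local ChromeDriver that Selenium can find on PATH; the search phrase and
# job count below are illustrative only.
if __name__ == "__main__":
    jobs = LINKEDIN_Scrapping("machine learning", 10)
    print(jobs.head())
    jobs.to_csv("linkedin_jobs.csv", index=False)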