import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Shared Chrome options; the window size is set once, before any driver is created.
options = Options()
options.add_argument("window-size=1200x600")
def LINKEDIN_Scrapping(job_search, num_jobs, driver):
    # Build the public LinkedIn job-search URL; multi-word queries are joined with %20.
    keywords = "%20".join(job_search.split())
    link1 = ('https://www.linkedin.com/jobs/search?keywords=' + keywords +
             '&location=&geoId=&trk=public_jobs_jobs-search-bar_search-submit'
             '&position=1&pageNum=0')
    # FIRST: get the main information about each job card.
    title = []
    location = []
    company_name = []
    post_time = []
    links = []

    # Element lists for the requested number of jobs.
    l1 = []
    ll = []
    driver.get(link1)
    SCROLL_PAUSE_TIME = 0.5

    # Keep loading results (scroll, then "See more jobs") until enough
    # job cards are present on the page.
    while True:
        l1 = driver.find_elements(By.XPATH, '//*[@id="main-content"]/section[2]/ul/li[*]/div')
        ll = driver.find_elements(By.XPATH, '//*[@id="main-content"]/section[2]/ul/li[*]/div/a')
        if len(l1) >= num_jobs:
            break

        # Scroll to the bottom until the page height stops growing.
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(SCROLL_PAUSE_TIME)  # wait for the page to load
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # Click "See more jobs" to load the next batch of results.
        try:
            WebDriverWait(driver, 15).until(EC.element_to_be_clickable(
                (By.XPATH, '//*[@id="main-content"]/section[2]/button'))).click()
        except TimeoutException:
            break  # button never became clickable: no more results to load
        time.sleep(2)

    print(len(l1))
    l2 = l1[:num_jobs]
    # Parse each card's text: line 1 is the title, 2 the company, 3 the
    # location; the post time is line 4 on five-line cards and line 5 on
    # longer ones (cards that carry an extra badge line).
    for info in l2:
        info_tot = info.text.split("\n")
        title.append(info_tot[1])
        company_name.append(info_tot[2])
        location.append(info_tot[3])
        if len(info_tot) == 5:
            post_time.append(info_tot[4])
        else:
            post_time.append(info_tot[5])
    # get links for jobs
    l3 = ll[:num_jobs]
    for i in l3:
        links.append(i.get_attribute('href'))

    df_ml = pd.DataFrame({'Title': title, 'Location': location, 'URLs': links,
                          'Company_Name': company_name, 'post_time': post_time})
    # GET DESCRIPTION AND LOGO
    def all_description_LOGO(urls):
        description = []
        LOGO = []
        for link in urls:
            # One fresh driver per job page.
            drv = webdriver.Chrome(options=options)
            drv.get(link)
            # Expand the truncated description via its "Show more" button.
            WebDriverWait(drv, 15).until(EC.element_to_be_clickable((By.XPATH,
                '//*[@id="main-content"]/section[1]/div/div[1]/section[1]/div/div/section/button[1]'))).click()
            K = drv.find_element(By.XPATH, '//*[@id="main-content"]/section[1]/div/section[2]/div/a/img')
            LOGO.append(K.get_attribute('src'))
            time.sleep(3)
            t = drv.find_element(By.XPATH, '//*[@id="main-content"]/section[1]/div/div[1]/section[1]/div/div/section/div')
            text = t.text
            # Strip the trailing "show more" label if it is still present.
            if text.endswith("show more"):
                text = text[:-len("show more")]
            strings = [x for x in text.split("\n") if x]
            description.append(strings)
            drv.quit()
        return pd.DataFrame({'all_about_job': description, 'company_logo': LOGO})
    # apply desc. and logo function
    E = all_description_LOGO(links)
    # other info function: pull each entry of the job-criteria list
    # (h3 label -> span value) from the static HTML with requests + BeautifulSoup.
    def other(urls):
        frames = []
        for url in urls:
            data1 = requests.get(url)
            soup1 = BeautifulSoup(data1.content, 'html.parser')
            j = soup1.find('ul', {'class': 'description__job-criteria-list'})
            time.sleep(4)
            jj = j.find_all('h3')
            dic = {}
            for i in range(len(jj)):
                dic[jj[i].text.replace('\n', ' ').strip()] = \
                    j.find_all('span')[i].text.replace('\n', ' ').strip()
            # DataFrame.append was removed in pandas 2.x; build the row directly.
            frames.append(pd.DataFrame([dic]))
        result = pd.concat(frames)
        return result
    # apply Other function
    df = other(links)
    df.fillna('Not_Found', inplace=True)
    df.reset_index(inplace=True, drop=True)

    # combine all together
    result = pd.concat([df_ml, E, df], axis=1)
    return result
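

# Minimal usage sketch: assumes chromedriver is installed and on PATH, and
# that LinkedIn's guest job-search pages are reachable without logging in.
# "data analyst", 5, and the output filename are illustrative values only.
if __name__ == "__main__":
    main_driver = webdriver.Chrome(options=options)
    try:
        jobs = LINKEDIN_Scrapping("data analyst", 5, main_driver)
        print(jobs.head())
        jobs.to_csv("linkedin_jobs.csv", index=False)  # example output path
    finally:
        main_driver.quit()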