# linkedin_scraper.py
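"""Scrape LinkedIn public job-search results.

LINKEDIN_Scrapping(job_search, num_jobs) collects job cards with Selenium,
opens each posting for its full description and company logo, fetches the
job-criteria list with requests/BeautifulSoup, and returns one combined
pandas DataFrame.
"""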
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# The original referenced an `options` object without ever defining it, and set
# the window size only after the driver was created (too late to take effect).
# This setup is a reasonable reconstruction; headless mode is an assumption.
options = Options()
options.add_argument("--headless")
options.add_argument("window-size=1200x600")
def LINKEDIN_Scrapping(job_search, num_jobs):
    # Build the public job-search URL, URL-encoding spaces in the query.
    # (The original handled exactly two search words and crashed on one.)
    keywords = "%20".join(job_search.split())
    link1 = ('https://www.linkedin.com/jobs/search?keywords=' + keywords +
             '&location=&geoId=&trk=public_jobs_jobs-search-bar_search-submit'
             '&position=1&pageNum=0')
    # FIRST: get the main information about each job card
    title = []
    location = []
    company_name = []
    post_time = []
    links = []

    # load the search results page and keep fetching until we have enough jobs
    l1 = []
    ll = []
    driver = webdriver.Chrome(options=options)
    driver.get(link1)
    SCROLL_PAUSE_TIME = 0.5
    while True:
        l1 = driver.find_elements(By.XPATH, '//*[@id="main-content"]/section[2]/ul/li[*]/div')
        ll = driver.find_elements(By.XPATH, '//*[@id="main-content"]/section[2]/ul/li[*]/div/a')
        if len(l1) >= num_jobs:
            break
        time.sleep(3)
        # scroll to the bottom of the page until its height stops growing
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            # wait for the page to load
            time.sleep(SCROLL_PAUSE_TIME)
            # compare the new scroll height with the last one
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
        # click the "See more jobs" button to load the next batch of results
        WebDriverWait(driver, 15).until(
            EC.element_to_be_clickable((By.XPATH, '//*[@id="main-content"]/section[2]/button'))
        ).click()
    print(len(l1))
    time.sleep(2)
    # A card's text usually splits into 5 lines: indices 1-3 hold title,
    # company and location, and the post time sits at index 4 (or at index 5
    # when the card carries an extra line, e.g. a hiring badge).
    l2 = l1[:num_jobs]
    for info in l2:
        info_tot = info.text.split("\n")
        title.append(info_tot[1])
        company_name.append(info_tot[2])
        location.append(info_tot[3])
        if len(info_tot) == 5:
            post_time.append(info_tot[4])
        else:
            post_time.append(info_tot[5])

    # collect the link of each job card
    l3 = ll[:num_jobs]
    for i in l3:
        links.append(i.get_attribute('href'))

    df_ml = pd.DataFrame({'Title': title, 'Location': location, 'URLs': links,
                          'Company_Name': company_name, 'post_time': post_time})
    # SECOND: open each job page to get the full description and company logo
    def all_description_LOGO(urls):
        description = []
        LOGO = []
        for link in urls:
            driver = webdriver.Chrome(options=options)
            driver.get(link)
            # expand the description by clicking the "Show more" button
            WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH,
                '//*[@id="main-content"]/section[1]/div/div[1]/section[1]/div/div/section/button[1]'))).click()
            K = driver.find_element(By.XPATH, '//*[@id="main-content"]/section[1]/div/section[2]/div/a/img')
            LOGO.append(K.get_attribute('src'))
            time.sleep(3)
            t = driver.find_element(By.XPATH,
                '//*[@id="main-content"]/section[1]/div/div[1]/section[1]/div/div/section/div')
            # strip a trailing "show more" label if it survived the click,
            # then drop empty lines from the description text
            text = t.text
            if text.endswith("show more"):
                text = text[:-9]
            strings = [x for x in text.split("\n") if x]
            description.append(strings)
            # close the per-page driver so Chrome processes don't pile up
            driver.quit()
        df_desc = pd.DataFrame({'all_about_job': description, 'company_logo': LOGO})
        return df_desc
    # apply the description-and-logo function
    E = all_description_LOGO(links)

    # THIRD: scrape the job-criteria list (seniority level, employment type, ...)
    def other(urls):
        frames = []
        for url in urls:
            data1 = requests.get(url)
            soup1 = BeautifulSoup(data1.content, 'html.parser')
            j = soup1.find('ul', {'class': 'description__job-criteria-list'})
            time.sleep(4)
            jj = j.find_all('h3')
            dic = {}
            for i in range(len(jj)):
                dic[jj[i].text.replace('\n', ' ').strip()] = \
                    j.find_all('span')[i].text.replace('\n', ' ').strip()
            # DataFrame.append was removed in pandas 2.x; build the row directly
            frames.append(pd.DataFrame([dic]))
        result = pd.concat(frames)
        return result

    # apply the other() function
    df = other(links)
    df.fillna('Not_Found', inplace=True)
    df.reset_index(inplace=True, drop=True)

    # combine all three frames column-wise
    result = pd.concat([df_ml, E, df], axis=1)
    return result
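

# A minimal usage sketch, not part of the original file. It assumes Chrome and
# a matching chromedriver are installed and on PATH, and that LinkedIn's markup
# still matches the XPaths above (they change often). The output file name is
# illustrative.
if __name__ == "__main__":
    jobs_df = LINKEDIN_Scrapping("machine learning", 5)
    print(jobs_df.head())
    jobs_df.to_csv("linkedin_jobs.csv", index=False)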