Create linkedin_scraper.py
linkedin_scraper.py (ADDED)
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

# Chrome setup. The original snippet referenced an `options` object defined
# elsewhere, so a headless configuration is assumed here; the window size must
# be set before the driver starts, not after.
options = Options()
options.add_argument("--headless")
options.add_argument("window-size=1200x600")


def LINKEDIN_Scrapping(job_search, num_jobs):
    # Build the public job-search URL; join all query words with %20 so that
    # single- and multi-word searches both work.
    keywords = "%20".join(job_search.split())
    link1 = ('https://www.linkedin.com/jobs/search?keywords=' + keywords +
             '&location=&geoId=&trk=public_jobs_jobs-search-bar_search-submit'
             '&position=1&pageNum=0')

    # First, collect the main information shown on each job card.
    title = []
    location = []
    company_name = []
    post_time = []
    links = []

    driver = webdriver.Chrome(options=options)
    driver.get(link1)
    SCROLL_PAUSE_TIME = 0.5

    # Keep scrolling (and clicking "See more jobs") until enough cards are loaded.
    l1 = []
    ll = []
    while True:
        l1 = driver.find_elements(By.XPATH, '//*[@id="main-content"]/section[2]/ul/li[*]/div')
        ll = driver.find_elements(By.XPATH, '//*[@id="main-content"]/section[2]/ul/li[*]/div/a')
        if len(l1) >= num_jobs:
            break
        time.sleep(3)

        # Scroll to the bottom until the page height stops growing.
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(SCROLL_PAUSE_TIME)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # Click the "See more jobs" button if it appears; keep looping if it doesn't.
        try:
            WebDriverWait(driver, 15).until(EC.element_to_be_clickable(
                (By.XPATH, '//*[@id="main-content"]/section[2]/button'))).click()
        except TimeoutException:
            pass
        print(len(l1))
        time.sleep(2)

    # Parse the visible text of the first num_jobs cards. A card normally has
    # five lines; the else branch handles cards carrying one extra line.
    l2 = l1[:num_jobs]
    for info in l2:
        info_tot = info.text.split("\n")
        title.append(info_tot[1])
        company_name.append(info_tot[2])
        location.append(info_tot[3])
        if len(info_tot) == 5:
            post_time.append(info_tot[4])
        else:
            post_time.append(info_tot[5])

    # Collect the URL of each job.
    l3 = ll[:num_jobs]
    for i in l3:
        links.append(i.get_attribute('href'))
    driver.quit()

    df_ml = pd.DataFrame({'Title': title, 'Location': location, 'URLs': links,
                          'Company_Name': company_name, 'post_time': post_time})

    # Get the full description and the company logo for each job.
    def all_description_LOGO(urls):
        description = []
        LOGO = []
        for link in urls:
            driver = webdriver.Chrome(options=options)
            driver.get(link)
            # Expand the description via the "Show more" button, if present.
            try:
                WebDriverWait(driver, 15).until(EC.element_to_be_clickable(
                    (By.XPATH, '//*[@id="main-content"]/section[1]/div/div[1]/section[1]/div/div/section/button[1]'))).click()
            except TimeoutException:
                pass
            K = driver.find_element(By.XPATH, '//*[@id="main-content"]/section[1]/div/section[2]/div/a/img')
            LOGO.append(K.get_attribute('src'))
            time.sleep(3)
            t = driver.find_element(By.XPATH, '//*[@id="main-content"]/section[1]/div/div[1]/section[1]/div/div/section/div')
            text = t.text
            # Strip the trailing "show more" caption left over from the toggle.
            if text.endswith("show more"):
                text = text[:-len("show more")]
            strings = [x for x in text.split("\n") if x]
            description.append(strings)
            driver.quit()
        return pd.DataFrame({'all_about_job': description, 'company_logo': LOGO})

    # Apply the description-and-logo function.
    E = all_description_LOGO(links)

    # Scrape the job-criteria list (seniority level, employment type, ...) with
    # plain HTTP requests instead of a browser.
    def other(urls):
        frames = []
        for url in urls:
            data1 = requests.get(url)
            soup1 = BeautifulSoup(data1.content, 'html.parser')
            j = soup1.find('ul', {'class': 'description__job-criteria-list'})
            time.sleep(4)
            dic = {}
            if j is not None:
                headers = j.find_all('h3')
                values = j.find_all('span')
                for h, v in zip(headers, values):
                    dic[h.text.replace('\n', ' ').strip()] = v.text.replace('\n', ' ').strip()
            frames.append(pd.DataFrame([dic]))
        return pd.concat(frames)

    # Apply the criteria function.
    df = other(links)
    df.fillna('Not_Found', inplace=True)
    df.reset_index(inplace=True, drop=True)

    # Combine everything into one DataFrame.
    result = pd.concat([df_ml, E, df], axis=1)
    return result
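
# A minimal usage sketch, assuming ChromeDriver is installed and on PATH and
# that selenium, pandas, requests, and beautifulsoup4 are available; the query
# string, job count, and output filename below are hypothetical placeholders.
if __name__ == "__main__":
    jobs_df = LINKEDIN_Scrapping("data analyst", 10)
    print(jobs_df.head())
    jobs_df.to_csv("linkedin_jobs.csv", index=False)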