Yassmen committed on
Commit
e9b17c3
1 Parent(s): 5a1c96e

Create linkedin_scraper.py

Files changed (1)
  1. linkedin_scraper.py +135 -0
linkedin_scraper.py ADDED
@@ -0,0 +1,135 @@
+ import time
+
+ import pandas as pd
+ import requests
+ from bs4 import BeautifulSoup
+ from selenium import webdriver
+ from selenium.webdriver.chrome.options import Options
+ from selenium.webdriver.common.by import By
+ from selenium.webdriver.support import expected_conditions as EC
+ from selenium.webdriver.support.ui import WebDriverWait
+
+ # Shared Chrome options; the window size must be set before a driver is created.
+ options = Options()
+ options.add_argument("window-size=1200x600")
+
+ def LINKEDIN_Scrapping(job_search, num_jobs):
+     # Build the public job-search URL from the two space-separated keywords.
+     job1 = job_search.split(" ")[0]
+     job2 = job_search.split(" ")[1]
+     link1 = ('https://www.linkedin.com/jobs/search?keywords=' + job1 + '%20' + job2
+              + '&location=&geoId=&trk=public_jobs_jobs-search-bar_search-submit'
+              + '&position=1&pageNum=0')
+
+     # FIRST: get the main information about each job.
+     title = []
+     location = []
+     company_name = []
+     post_time = []
+     links = []
+
+     driver = webdriver.Chrome(options=options)
+     driver.get(link1)
+     SCROLL_PAUSE_TIME = 0.5
+
+     # Load job cards until the requested number is on the page.
+     l1 = []
+     ll = []
+     while True:
+         l1 = driver.find_elements(By.XPATH, '//*[@id="main-content"]/section[2]/ul/li[*]/div')
+         ll = driver.find_elements(By.XPATH, '//*[@id="main-content"]/section[2]/ul/li[*]/div/a')
+         if len(l1) >= num_jobs:
+             break
+         time.sleep(3)
+
+         # Scroll to the bottom until the page height stops growing.
+         last_height = driver.execute_script("return document.body.scrollHeight")
+         while True:
+             driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+             time.sleep(SCROLL_PAUSE_TIME)
+             new_height = driver.execute_script("return document.body.scrollHeight")
+             if new_height == last_height:
+                 break
+             last_height = new_height
+
+         # Click the "See more jobs" button to load the next batch.
+         WebDriverWait(driver, 15).until(EC.element_to_be_clickable(
+             (By.XPATH, '//*[@id="main-content"]/section[2]/button'))).click()
+         print(len(l1))
+         time.sleep(2)
+
+     # Parse the visible text of the first num_jobs cards.
+     l2 = l1[:num_jobs]
+     for info in l2:
+         info_tot = info.text.split("\n")
+         title.append(info_tot[1])
+         company_name.append(info_tot[2])
+         location.append(info_tot[3])
+         # Cards with an extra badge line carry the post time one line later.
+         if len(info_tot) == 5:
+             post_time.append(info_tot[4])
+         else:
+             post_time.append(info_tot[5])
+
+     # Collect the links of the job postings.
+     l3 = ll[:num_jobs]
+     for i in l3:
+         links.append(i.get_attribute('href'))
+     driver.quit()
+
+     df_ml = pd.DataFrame({'Title': title, 'Location': location, 'URLs': links,
+                           'Company_Name': company_name, 'post_time': post_time})
+
+     # SECOND: get the description and company logo of each posting.
+     def all_description_LOGO(urls):
+         description = []
+         LOGO = []
+         for link in urls:
+             driver = webdriver.Chrome(options=options)
+             driver.get(link)
+             # Expand the full description via the "Show more" button.
+             WebDriverWait(driver, 15).until(EC.element_to_be_clickable(
+                 (By.XPATH, '//*[@id="main-content"]/section[1]/div/div[1]/section[1]/div/div/section/button[1]'))).click()
+             K = driver.find_element(By.XPATH, '//*[@id="main-content"]/section[1]/div/section[2]/div/a/img')
+             LOGO.append(K.get_attribute('src'))
+             time.sleep(3)
+             t = driver.find_element(By.XPATH, '//*[@id="main-content"]/section[1]/div/div[1]/section[1]/div/div/section/div')
+             # Strip a trailing "show more" label if the text was not expanded.
+             if t.text[::-1][:9] == "erom wohs":
+                 strings = t.text[:len(t.text) - 9].split("\n")
+             else:
+                 strings = t.text.split("\n")
+             strings[:] = [x for x in strings if x]
+             description.append(strings)
+             driver.quit()
+         df_ml = pd.DataFrame({'all_about_job': description, 'company_logo': LOGO})
+         return df_ml
+
+     # Apply the description-and-logo function.
+     E = all_description_LOGO(links)
+
+     # THIRD: scrape the job-criteria list (seniority level, employment type, ...).
+     def other(urls):
+         frames = []
+         for url in urls:
+             data1 = requests.get(url)
+             soup1 = BeautifulSoup(data1.content, 'html.parser')
+             j = soup1.find('ul', {'class': 'description__job-criteria-list'})
+             time.sleep(4)
+             jj = j.find_all('h3')
+             dic = {}
+             for i in range(len(jj)):
+                 dic[jj[i].text.replace('\n', ' ').strip()] = j.find_all('span')[i].text.replace('\n', ' ').strip()
+             # DataFrame.append was removed in pandas 2.x; build the row directly.
+             frames.append(pd.DataFrame([dic]))
+         result = pd.concat(frames)
+         return result
+
+     # Apply the job-criteria function.
+     df = other(links)
+     df.fillna('Not_Found', inplace=True)
+     df.reset_index(inplace=True, drop=True)
+
+     # Combine the three frames side by side.
+     result = pd.concat([df_ml, E, df], axis=1)
+     return result
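
A minimal usage sketch of the new module, assuming chromedriver is installed and on PATH and that LinkedIn's public job-search markup still matches the XPaths above (the query and output filename are illustrative):

from linkedin_scraper import LINKEDIN_Scrapping

# The function expects exactly two space-separated keywords and returns a
# single pandas DataFrame combining card info, description/logo, and criteria.
jobs = LINKEDIN_Scrapping("machine learning", 10)
jobs.to_csv("linkedin_jobs.csv", index=False)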