import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def LINKEDIN_Scrapping(job_search, num_jobs):
    # Build the search URL; multi-word queries are joined with '%20'.
    keywords = '%20'.join(job_search.split())
    link1 = ('https://www.linkedin.com/jobs/search?keywords=' + keywords +
             '&location=&geoId=&trk=public_jobs_jobs-search-bar_search-submit'
             '&position=1&pageNum=0')

    # First, collect the main information about each job posting.
    title = []
    location = []
    company_name = []
    post_time = []
    links = []

    # Configure the browser before creating the driver: add_argument() has
    # no effect once the Chrome session has started.
    options = Options()
    options.add_argument("window-size=1200x600")
    driver = webdriver.Chrome(options=options)
    driver.get(link1)
    SCROLL_PAUSE_TIME = 0.5

    # Keep loading results until at least num_jobs job cards are present.
    l1 = []
    ll = []
    while True:
        l1 = driver.find_elements(By.XPATH, '//*[@id="main-content"]/section[2]/ul/li[*]/div')
        ll = driver.find_elements(By.XPATH, '//*[@id="main-content"]/section[2]/ul/li[*]/div/a')
        if len(l1) >= num_jobs:
            break
        time.sleep(3)

        # Scroll to the bottom of the page until the height stops changing.
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(SCROLL_PAUSE_TIME)  # wait for the page to load
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # Click the "See more jobs" button to load the next batch of results.
        WebDriverWait(driver, 15).until(EC.element_to_be_clickable(
            (By.XPATH, '//*[@id="main-content"]/section[2]/button'))).click()
        print(len(l1))  # progress: job cards loaded so far
        time.sleep(2)

    # Each job card's text holds the title, company, location and posting
    # date on separate lines; cards with one extra line (e.g. a hiring
    # badge) push the posting date one line later.
    for info in l1[:num_jobs]:
        info_tot = info.text.split("\n")
        title.append(info_tot[1])
        company_name.append(info_tot[2])
        location.append(info_tot[3])
        post_time.append(info_tot[4] if len(info_tot) == 5 else info_tot[5])

    # Collect the link of each job posting.
    for i in ll[:num_jobs]:
        links.append(i.get_attribute('href'))
    driver.quit()

    df_ml = pd.DataFrame({'Title': title, 'Location': location, 'URLs': links,
                          'Company_Name': company_name, 'post_time': post_time})

    # Fetch the full description and the company logo from each job page.
    def all_description_LOGO(urls):
        description = []
        LOGO = []
        for link in urls:
            driver = webdriver.Chrome(options=options)
            driver.get(link)
            # Expand the description by clicking the "Show more" button.
            WebDriverWait(driver, 15).until(EC.element_to_be_clickable(
                (By.XPATH, '//*[@id="main-content"]/section[1]/div/div[1]/section[1]/div/div/section/button[1]'))).click()
            K = driver.find_element(By.XPATH, '//*[@id="main-content"]/section[1]/div/section[2]/div/a/img')
            LOGO.append(K.get_attribute('src'))
            time.sleep(3)
            t = driver.find_element(By.XPATH, '//*[@id="main-content"]/section[1]/div/div[1]/section[1]/div/div/section/div')
            # Strip a trailing "Show more" label if it was captured with the text.
            text = t.text
            if text.lower().endswith("show more"):
                text = text[:-len("show more")]
            strings = [x for x in text.split("\n") if x]
            description.append(strings)
            driver.quit()
        return pd.DataFrame({'all_about_job': description, 'company_logo': LOGO})

    # Apply the description-and-logo function.
    E = all_description_LOGO(links)

    # Scrape the job-criteria list (seniority level, employment type, etc.)
    # from each page with requests + BeautifulSoup.
    def other(urls):
        frames = []
        for url in urls:
            data1 = requests.get(url)
            soup1 = BeautifulSoup(data1.content, 'html.parser')
            j = soup1.find('ul', {'class': 'description__job-criteria-list'})
            time.sleep(4)
            jj = j.find_all('h3')
            spans = j.find_all('span')
            dic = {}
            for i in range(len(jj)):
                dic[jj[i].text.replace('\n', ' ').strip()] = spans[i].text.replace('\n', ' ').strip()
            # DataFrame.append() was removed in pandas 2.0; build the
            # one-row frame directly instead.
            frames.append(pd.DataFrame([dic]))
        return pd.concat(frames)

    # Apply the job-criteria function.
    df = other(links)
    df.fillna('Not_Found', inplace=True)
    df.reset_index(inplace=True, drop=True)

    # Combine everything into one frame.
    result = pd.concat([df_ml, E, df], axis=1)
    return result
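

# A minimal usage sketch. The query string, job count, and output filename
# below are illustrative values, not from the original. LinkedIn's markup
# changes often, so the XPath selectors above may need updating, and any
# scraping should respect LinkedIn's terms of service.
if __name__ == "__main__":
    jobs_df = LINKEDIN_Scrapping("machine learning", 20)
    print(jobs_df.head())
    jobs_df.to_csv("linkedin_jobs.csv", index=False)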