Create linkedin_scraper.py
linkedin_scraper.py (ADDED)
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

# Chrome setup. The original snippet referenced an `options` object defined
# elsewhere, so a headless configuration is assumed here; the window size must
# be set before the driver starts, not after.
options = Options()
options.add_argument("--headless")
options.add_argument("window-size=1200x600")


def LINKEDIN_Scrapping(job_search, num_jobs):
    # Build the public job-search URL; join all query words with %20 so that
    # single- and multi-word searches both work.
    keywords = "%20".join(job_search.split())
    link1 = ('https://www.linkedin.com/jobs/search?keywords=' + keywords +
             '&location=&geoId=&trk=public_jobs_jobs-search-bar_search-submit'
             '&position=1&pageNum=0')

    # First, collect the main information shown on each job card.
    title = []
    location = []
    company_name = []
    post_time = []
    links = []

    driver = webdriver.Chrome(options=options)
    driver.get(link1)
    SCROLL_PAUSE_TIME = 0.5

    # Keep scrolling (and clicking "See more jobs") until enough cards are loaded.
    l1 = []
    ll = []
    while True:
        l1 = driver.find_elements(By.XPATH, '//*[@id="main-content"]/section[2]/ul/li[*]/div')
        ll = driver.find_elements(By.XPATH, '//*[@id="main-content"]/section[2]/ul/li[*]/div/a')
        if len(l1) >= num_jobs:
            break
        time.sleep(3)

        # Scroll to the bottom until the page height stops growing.
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(SCROLL_PAUSE_TIME)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # Click the "See more jobs" button if it appears; keep looping if it doesn't.
        try:
            WebDriverWait(driver, 15).until(EC.element_to_be_clickable(
                (By.XPATH, '//*[@id="main-content"]/section[2]/button'))).click()
        except TimeoutException:
            pass
        print(len(l1))
        time.sleep(2)

    # Parse the visible text of the first num_jobs cards. A card normally has
    # five lines; the else branch handles cards carrying one extra line.
    l2 = l1[:num_jobs]
    for info in l2:
        info_tot = info.text.split("\n")
        title.append(info_tot[1])
        company_name.append(info_tot[2])
        location.append(info_tot[3])
        if len(info_tot) == 5:
            post_time.append(info_tot[4])
        else:
            post_time.append(info_tot[5])

    # Collect the URL of each job.
    l3 = ll[:num_jobs]
    for i in l3:
        links.append(i.get_attribute('href'))
    driver.quit()

    df_ml = pd.DataFrame({'Title': title, 'Location': location, 'URLs': links,
                          'Company_Name': company_name, 'post_time': post_time})

    # Get the full description and the company logo for each job.
    def all_description_LOGO(urls):
        description = []
        LOGO = []
        for link in urls:
            driver = webdriver.Chrome(options=options)
            driver.get(link)
            # Expand the description via the "Show more" button, if present.
            try:
                WebDriverWait(driver, 15).until(EC.element_to_be_clickable(
                    (By.XPATH, '//*[@id="main-content"]/section[1]/div/div[1]/section[1]/div/div/section/button[1]'))).click()
            except TimeoutException:
                pass
            K = driver.find_element(By.XPATH, '//*[@id="main-content"]/section[1]/div/section[2]/div/a/img')
            LOGO.append(K.get_attribute('src'))
            time.sleep(3)
            t = driver.find_element(By.XPATH, '//*[@id="main-content"]/section[1]/div/div[1]/section[1]/div/div/section/div')
            text = t.text
            # Strip the trailing "show more" caption left over from the toggle.
            if text.endswith("show more"):
                text = text[:-len("show more")]
            strings = [x for x in text.split("\n") if x]
            description.append(strings)
            driver.quit()
        return pd.DataFrame({'all_about_job': description, 'company_logo': LOGO})

    # Apply the description-and-logo function.
    E = all_description_LOGO(links)

    # Scrape the job-criteria list (seniority level, employment type, ...) with
    # plain HTTP requests instead of a browser.
    def other(urls):
        frames = []
        for url in urls:
            data1 = requests.get(url)
            soup1 = BeautifulSoup(data1.content, 'html.parser')
            j = soup1.find('ul', {'class': 'description__job-criteria-list'})
            time.sleep(4)
            dic = {}
            if j is not None:
                headers = j.find_all('h3')
                values = j.find_all('span')
                for h, v in zip(headers, values):
                    dic[h.text.replace('\n', ' ').strip()] = v.text.replace('\n', ' ').strip()
            frames.append(pd.DataFrame([dic]))
        return pd.concat(frames)

    # Apply the criteria function.
    df = other(links)
    df.fillna('Not_Found', inplace=True)
    df.reset_index(inplace=True, drop=True)

    # Combine everything into one DataFrame.
    result = pd.concat([df_ml, E, df], axis=1)
    return result
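
# A minimal usage sketch, assuming ChromeDriver is installed and on PATH and
# that selenium, pandas, requests, and beautifulsoup4 are available; the query
# string, job count, and output filename below are hypothetical placeholders.
if __name__ == "__main__":
    jobs_df = LINKEDIN_Scrapping("data analyst", 10)
    print(jobs_df.head())
    jobs_df.to_csv("linkedin_jobs.csv", index=False)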