from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from googlesearch import search
from bs4 import BeautifulSoup
import trafilatura
import requests
import urllib.parse
import urllib.request


def setup_driver(headless=True):
    """Build a Chrome driver (headless by default) with console logging suppressed."""
    chrome_options = webdriver.ChromeOptions()
    if headless:
        chrome_options.add_argument('--headless')
    chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
    driver = webdriver.Chrome(
        service=ChromeService(ChromeDriverManager().install()),
        options=chrome_options)
    return driver


def scrape_article(url):
    """Fetch a page and extract its main text with trafilatura; returns None on failure."""
    downloaded = trafilatura.fetch_url(url)
    return trafilatura.extract(downloaded) if downloaded else None


# Placeholder texts returned by JavaScript-only pages; any extraction matching
# one of these carries no content and is skipped.
JS_PLACEHOLDERS = {
    "When you have eliminated the JavaScript, whatever remains must be an empty page.",
    "When you have eliminated the\nJavaScript\n, whatever remains must be an empty page.\nEnable JavaScript to see Google Maps.",
    "Something went wrong. Wait a moment and try again.\nTry again\nPlease enable Javascript and refresh the page to continue",
}


def get_articles(urls, deep=False):
    """Scrape up to two articles from urls.

    With deep=True the full text is kept; otherwise it is capped at 2000
    characters. Either way the trailing partial sentence is dropped and any
    "Site | Headline" prefix before the last "|" is stripped.
    """
    articles = []
    for url in urls:
        article_text = scrape_article(url)
        if article_text and article_text not in JS_PLACEHOLDERS:
            text = article_text[:-1] if deep else article_text[:2000]
            articles.append(".".join(text.split('.')[:-1]).split("|")[-1])
        if len(articles) == 2:
            break
    return articles


def web_scraper(web_query, deep=False):
    """Google web_query and return a top result (answer box, definition,
    Wikipedia blurb, or YouTube transcript) plus up to two scraped articles."""
    driver = setup_driver()
    urls = list(search(web_query, num_results=10, sleep_interval=0.1))
    url = "https://www.google.com/search?q=" + urllib.parse.quote(web_query)
    driver.get(url)
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')

    parts_of_speech = ['noun', 'adjective', 'verb', 'adverb', 'pronoun',
                       'preposition', 'conjunction', 'interjection',
                       'exclamation', 'numeral', 'article', 'determiner']
    snippets = []
    articles = get_articles(urls, deep)
    # Definition and snippet text lives in nested divs that share this class.
    for outer in soup.find_all('div', class_='BNeawe s3v9rd AP7Wnd'):
        for inner in outer.find_all('div', class_='BNeawe s3v9rd AP7Wnd'):
            snippets.append(inner.text)

    # Probe the answer-box variants in turn. All of these are obfuscated,
    # Google-internal class names that can change without notice. Note that
    # By.CLASS_NAME cannot take space-separated compound classes, so those
    # are located with CSS selectors instead.
    top_result_element = soup.find('div', class_='BNeawe iBp4i AP7Wnd')
    if not top_result_element:
        try:
            top_result_element = driver.find_element(By.CLASS_NAME, "IZ6rdc")
        except Exception:
            pass
    if not top_result_element:
        try:
            top_result_element = driver.find_element(
                By.CSS_SELECTOR, ".Z0LcW.CfV8xf")
        except Exception:
            pass
    if not top_result_element:
        try:
            top_result_element = driver.find_element(
                By.CSS_SELECTOR, ".ayqGOc.kno-fb-ctx.KBXm4e")
        except Exception:
            pass
    top_result = top_result_element.text if top_result_element else ""

    # Dictionary lookups: the first snippet starts with a part of speech and
    # the second holds the definition.
    try:
        pos = snippets[0].split()[0]
        if pos in parts_of_speech:
            article_word = "an" if pos[0] in "aeiou" else "a"
            top_result += f'As {article_word} {pos} it means {snippets[1]}'
    except IndexError:
        pass

    # Fallback: snippets ending in "Wikipedia" are knowledge-panel summaries.
    if not top_result:
        for text in snippets:
            words = text.split()
            if words and words[-1] == 'Wikipedia':
                top_result = f'According to Wikipedia, {" ".join(words[:-1])}'

    # If the first hit is a YouTube video, prefer its transcript when the
    # page exposes one.
    try:
        if "youtube.com" in urls[0]:
            driver.get(urls[0])
            transcript_elements = driver.find_elements(
                By.CLASS_NAME, "ytd-transcript-segment-renderer")
            transcript = "\n".join(
                element.text for element in transcript_elements)
            if transcript:
                top_result = transcript
    except Exception:
        pass

    driver.quit()
    article_text = "".join(f"Article {index + 1}: {article}\n"
                           for index, article in enumerate(articles))
    return f"Top Results: {top_result}\n{article_text}"
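# The selector probing above repeats the same try/except pattern once per
# class name. A minimal sketch of collapsing it into one helper; the default
# selector list mirrors the (volatile, Google-internal) class names used
# above, and any selector added beyond those is an assumption about the
# current results page, not a documented interface.
def find_first_text(driver, selectors=(".IZ6rdc", ".Z0LcW.CfV8xf",
                                       ".ayqGOc.kno-fb-ctx.KBXm4e")):
    """Return the text of the first element matching any selector, else None."""
    for selector in selectors:
        try:
            return driver.find_element(By.CSS_SELECTOR, selector).text
        except Exception:
            continue
    return None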
def get_weather_data():
    """Scrape Google's weather widget and return location, temperature,
    conditions, and the current-weather label."""
    driver = setup_driver()
    driver.get('https://www.google.com/search?q=weather')
    weather_data = driver.find_element(By.CLASS_NAME, 'UQt4rd').text
    data_list = weather_data.split('\n')
    data_list[0] = data_list[0][:-2]  # drop the trailing degree unit, e.g. "°F"
    data_list.append(driver.find_element(By.ID, 'wob_dc').text)
    location = driver.find_element(By.CLASS_NAME, "eKPi4").text
    location = location.replace("Results for\n", "")
    weather_icon_link = driver.find_element(
        By.ID, 'wob_tci').get_attribute('src')
    # Download the weather icon; the bytes are currently unused.
    with urllib.request.urlopen(weather_icon_link) as response:
        icon_bytes = response.read()
    temp = data_list[0]
    weather_details = f'{data_list[1]}, {data_list[2]}, {data_list[3]}'
    weather_name = data_list[-1]
    print(f'Weather in {location} is: {temp}, {weather_details}, {weather_name}')
    driver.quit()
    return {"location": location, "temperature": temp,
            "details": weather_details, "name": weather_name}


if __name__ == "__main__":
    data = get_weather_data()
    location = data["location"]
    temperature = data["temperature"]
    details = data["details"]
    name = data["name"]
    weather = f"{location} is {name} with {temperature} and {details}"
    print(weather)
    print(web_scraper("top news", True))
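# get_weather_data() downloads the weather icon bytes but never uses them. A
# minimal sketch of persisting the icon instead, assuming a writable
# destination; "weather_icon.png" is a hypothetical filename, not something
# the original code writes.
def save_weather_icon(icon_url, path="weather_icon.png"):
    """Download the icon at icon_url and write it to path."""
    with urllib.request.urlopen(icon_url) as response:
        with open(path, "wb") as f:
            f.write(response.read())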