"""Web-scraping helpers: Google search answers, article extraction via
trafilatura, and a Google weather-card reader driven by Selenium."""

import urllib.parse
import urllib.request

import requests
import trafilatura
from bs4 import BeautifulSoup
from googlesearch import search
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

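# Third-party dependencies (PyPI package names assumed from the imports):
#   pip install selenium webdriver-manager googlesearch-python \
#       beautifulsoup4 trafilatura requests
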
def setup_driver(headless=True):
    """Create a Chrome WebDriver, downloading a matching driver binary if needed."""
    chrome_options = webdriver.ChromeOptions()
    if headless:
        # '--headless' is the documented flag; the bare 'headless' string is
        # not recognized by all Chrome releases.
        chrome_options.add_argument('--headless')
    # Silence the "DevTools listening ..." console noise on Windows.
    chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])

    driver = webdriver.Chrome(
        service=ChromeService(ChromeDriverManager().install()),
        options=chrome_options)
    return driver

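# Example use of setup_driver (hypothetical): run a visible browser while
# debugging selectors, then clean up.
#   driver = setup_driver(headless=False)
#   driver.get("https://example.com")
#   driver.quit()
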
def scrape_article(url):
    """Download a page and return its main text, or None if extraction fails."""
    downloaded = trafilatura.fetch_url(url)
    extracted = trafilatura.extract(downloaded)
    return extracted

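# Note: trafilatura's fetch_url and extract both return None on failure, so
# scrape_article can return None; get_articles below guards against that.
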
# Known "enable JavaScript" walls that trafilatura sometimes returns in
# place of real article text.
JS_BOILERPLATE = (
    "When you have eliminated the JavaScript, whatever remains must be an empty page.",
    "When you have eliminated the\nJavaScript\n, whatever remains must be an empty page.\nEnable JavaScript to see Google Maps.",
    "Something went wrong. Wait a moment and try again.\nTry again\nPlease enable Javascript and refresh the page to continue",
)


def get_articles(urls, deep=False):
    """Scrape up to two usable articles from `urls`.

    With deep=True the whole text is kept; otherwise it is capped at about
    2000 characters. In both cases the trailing (possibly cut-off) sentence
    is dropped and only the text after the final '|' separator is kept.
    """
    articles = []
    for url in urls:
        article_text = scrape_article(url)
        if not article_text or article_text in JS_BOILERPLATE:
            continue
        snippet = article_text[:-1] if deep else article_text[:2000]
        # Drop the unterminated last sentence, then anything before a '|'
        # (site names and bylines often precede one).
        articles.append(".".join(snippet.split('.')[:-1]).split("|")[-1])
        if len(articles) == 2:
            break
    return articles

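# Worked example of the trimming in get_articles (illustrative input only):
#   "Site Name | First sentence. Second sentence. Trailing fragm"
#   -> split('.')[:-1] drops the cut-off fragment
#      -> "Site Name | First sentence. Second sentence"
#   -> split("|")[-1] keeps the text after the byline
#      -> " First sentence. Second sentence"
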
def web_scraper(web_query, deep=False):
    """Answer a query from Google's results page plus up to two scraped articles."""
    driver = setup_driver()
    urls = list(search(web_query, num_results=10, sleep_interval=0.1))

    quoted_query = urllib.parse.quote(web_query)
    url = "https://www.google.com/search?q=" + quoted_query
    driver.get(url)
    # Fetching without a browser user agent returns Google's basic HTML,
    # whose 'BNeawe ...' class names are parsed below.
    source = requests.get(url).text
    soup = BeautifulSoup(source, 'html.parser')
    part_of_speeches = ['noun', 'adjective', 'verb', 'adverb', 'pronoun',
                        'preposition', 'conjunction', 'interjection',
                        'exclamation', 'numeral', 'article', 'determiner']
    snippets = []
    articles = get_articles(urls, deep)

    # Dictionary-style results nest one snippet div inside another.
    for outer in soup.find_all('div', class_='BNeawe s3v9rd AP7Wnd'):
        for inner in outer.find_all('div', class_='BNeawe s3v9rd AP7Wnd'):
            snippets.append(inner.text)

    # Look for an "answer box" in the static HTML first, then fall back to
    # selectors that only render in the live browser session.
    top_result_element = soup.find('div', class_='BNeawe iBp4i AP7Wnd')
    if not top_result_element:
        try:
            top_result_element = driver.find_element(By.CLASS_NAME, "IZ6rdc")
        except Exception:
            pass
    if not top_result_element:
        try:
            # By.CLASS_NAME cannot match compound class names; use CSS instead.
            top_result_element = driver.find_element(
                By.CSS_SELECTOR, ".Z0LcW.CfV8xf")
        except Exception:
            pass
    if not top_result_element:
        try:
            top_result_element = driver.find_element(
                By.CSS_SELECTOR, ".ayqGOc.kno-fb-ctx.KBXm4e")
        except Exception:
            pass
    top_result = top_result_element.text if top_result_element else None

    # If the first snippet looks like a dictionary entry, append its definition.
    try:
        pos = snippets[0].split()[0]
        if pos in part_of_speeches:
            # Pick 'a'/'an' by the part of speech's first letter.
            article_word = "an" if pos[0] in "aeiou" else "a"
            top_result = (top_result or "") + \
                f' As {article_word} {pos} it means {snippets[1]}'
    except IndexError:
        pass

    # Otherwise fall back to a snippet attributed to Wikipedia.
    if not top_result:
        for text in snippets:
            words = text.split()
            if words and words[-1] == 'Wikipedia':
                top_result = f'According to Wikipedia, {" ".join(words[:-1])}'
                break

    # If the best hit is a YouTube video, prefer its transcript when one is open.
    try:
        if "youtube.com" in urls[0]:
            driver.get(urls[0])
            transcript_elements = driver.find_elements(
                By.CLASS_NAME, "ytd-transcript-segment-renderer")
            # Join the segment texts, not the WebElement objects themselves.
            transcript = "\n".join(
                element.text for element in transcript_elements)
            if transcript:
                top_result = transcript
    except Exception:
        pass

    driver.quit()
    article_text = ""
    for index, article in enumerate(articles):
        article_text += f"Article {index + 1}: {article}\n"

    return f"Top Results: {top_result}\n{article_text}"

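# Example call (hypothetical query; needs network access, and Google's class
# names change often enough that the selectors above may need refreshing):
#   print(web_scraper("python programming language"))
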
def get_weather_data():
    """Read Google's weather card and return location, temperature, and conditions."""
    driver = setup_driver()
    driver.get('https://www.google.com/search?q=weather')
    weather_text = driver.find_element(By.CLASS_NAME, 'UQt4rd').text
    data_list = weather_text.split('\n')
    # Strip the trailing two-character unit marker (e.g. '°C') from the temperature.
    data_list[0] = data_list[0][:-2]
    data_list.append(driver.find_element(By.ID, 'wob_dc').text)
    location = driver.find_element(By.CLASS_NAME, "eKPi4").text
    location = location.replace("Results for\n", "")
    weather_icon_link = driver.find_element(
        By.ID, 'wob_tci').get_attribute('src')
    # Download the condition icon; the bytes are currently unused but are
    # fetched to keep parity with the original behavior.
    with urllib.request.urlopen(weather_icon_link) as response:
        icon_bytes = response.read()

    temp = data_list[0]
    weather_details = f'{data_list[1]}, {data_list[2]}, {data_list[3]}'
    weather_name = data_list[-1]

    print(f'Weather in {location} is: {temp}, {weather_details}, {weather_name}')
    driver.quit()
    return {"location": location, "temperature": temp,
            "details": weather_details, "name": weather_name}

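# Illustrative return shape (all values invented):
#   {"location": "London", "temperature": "18",
#    "details": "Precipitation: 10%, Humidity: 71%, Wind: 5 km/h",
#    "name": "Partly cloudy"}
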
if __name__ == "__main__":
    data = get_weather_data()
    location = data["location"]
    temperature = data["temperature"]
    details = data["details"]
    name = data["name"]

    weather = f"{location} is {name} with {temperature} and {details}"
    print(weather)
    print(web_scraper("top news", True))