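"""Web-scraping helpers: summarise Google search results, extract article
text with trafilatura, and read Google's weather widget with Selenium."""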
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from googlesearch import search
from bs4 import BeautifulSoup
import trafilatura
import requests
import urllib.parse
import urllib.request

def setup_driver(headless=True):
    """Create a Chrome WebDriver, downloading a matching driver if needed."""
    chrome_options = webdriver.ChromeOptions()
    if headless:
        chrome_options.add_argument('--headless')
    # Suppress the noisy "DevTools listening" log lines on Windows.
    chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])

    driver = webdriver.Chrome(
        service=ChromeService(ChromeDriverManager().install()),
        options=chrome_options)
    return driver


def scrape_article(url):
    """Fetch a page and extract its main text with trafilatura."""
    downloaded = trafilatura.fetch_url(url)
    if downloaded is None:  # download failed (network error, bad status, ...)
        return None
    return trafilatura.extract(downloaded)


# Placeholder texts that Google properties return when JavaScript is
# disabled; an "article" matching one of these has no real content.
JS_PLACEHOLDERS = (
    "When you have eliminated the JavaScript, whatever remains must be an empty page.",
    "When you have eliminated the\nJavaScript\n, whatever remains must be an "
    "empty page.\nEnable JavaScript to see Google Maps.",
    "Something went wrong. Wait a moment and try again.\nTry again\n"
    "Please enable Javascript and refresh the page to continue",
)


def get_articles(urls, deep=False):
    """Scrape each URL, skipping JavaScript placeholders. With deep=False,
    keep only the first two articles, truncated near 2,000 characters."""
    articles = []
    for url in urls:
        article_text = scrape_article(url)
        if article_text and article_text not in JS_PLACEHOLDERS:
            if deep:
                # Drop the trailing partial sentence and any "site name |" prefix.
                articles.append(
                    ".".join(article_text[:-1].split('.')[:-1]).split("|")[-1])
            else:
                articles.append(
                    ".".join(article_text[:2000].split('.')[:-1]).split("|")[-1])
                if len(articles) == 2:
                    break
    return articles

def web_scraper(web_query, deep=False):
    """Search Google for web_query and return a "Top Results" answer
    (if one can be extracted) followed by scraped article texts."""
    driver = setup_driver()
    urls = list(search(web_query, num_results=10, sleep_interval=0.1))

    url = "https://www.google.com/search?q=" + urllib.parse.quote(web_query)
    driver.get(url)
    # requests receives the simplified (no-JavaScript) results page, whose
    # class names are easy to parse with BeautifulSoup.
    source = requests.get(url).text
    soup = BeautifulSoup(source, 'html.parser')
    part_of_speeches = ['noun', 'adjective', 'verb', 'adverb', 'pronoun',
                        'preposition', 'conjunction', 'interjection',
                        'exclamation', 'numeral', 'article', 'determiner']
    articles = get_articles(urls, deep)

    snippets = []
    for i in soup.find_all('div', class_='BNeawe s3v9rd AP7Wnd'):
        for j in i.find_all('div', class_='BNeawe s3v9rd AP7Wnd'):
            snippets.append(j.text)

    # Try the answer-box selectors in order: first the static HTML, then
    # the JavaScript-rendered page in Selenium. Multi-class locators need
    # CSS selectors; By.CLASS_NAME accepts only a single class name.
    top_result_element = soup.find('div', class_='BNeawe iBp4i AP7Wnd')
    if not top_result_element:
        for by, locator in ((By.CLASS_NAME, "IZ6rdc"),
                            (By.CSS_SELECTOR, ".Z0LcW.CfV8xf"),
                            (By.CSS_SELECTOR, ".ayqGOc.kno-fb-ctx.KBXm4e")):
            try:
                top_result_element = driver.find_element(by, locator)
                break
            except Exception:
                continue
    top_result = top_result_element.text if top_result_element else None

    # Dictionary-style queries: the first snippet starts with a part of
    # speech and the second holds the definition.
    try:
        pos = snippets[0].split()[0]
        if pos in part_of_speeches:
            indefinite = "an" if pos[0] in "aeiou" else "a"
            sense = f'As {indefinite} {pos} it means {snippets[1]}'
            top_result = f'{top_result} {sense}' if top_result else sense
    except IndexError:
        pass

    # Fall back to a Wikipedia snippet if no answer box was found.
    if not top_result:
        for text in snippets:
            words = text.split()
            if words and words[-1] == 'Wikipedia':
                top_result = f'According to Wikipedia, {" ".join(words[:-1])}'

    # If the top hit is a YouTube video, prefer its transcript (only
    # present when the transcript panel is part of the rendered page).
    try:
        if "youtube.com" in urls[0]:
            driver.get(urls[0])
            transcript_elements = driver.find_elements(
                By.CLASS_NAME, "ytd-transcript-segment-renderer")
            transcript = "\n".join(
                element.text for element in transcript_elements)
            if transcript:
                top_result = transcript
    except Exception:
        pass

    driver.quit()
    article_text = ""
    for index, article in enumerate(articles):
        article_text += f"Article {index + 1}: {article}\n"

    return f"Top Results: {top_result}\n{article_text}"


def get_weather_data():
    """Scrape Google's weather widget and return the location, temperature,
    detail lines (precipitation, humidity, wind) and condition name."""
    driver = setup_driver()
    driver.get('https://www.google.com/search?q=weather')
    weather_data = driver.find_element(By.CLASS_NAME, 'UQt4rd').text
    data_list = weather_data.split('\n')
    # Drop the trailing two characters (the "°C"/"°F" unit toggle) from
    # the temperature line.
    data_list[0] = data_list[0][:-2]
    # Condition name, e.g. "Partly cloudy".
    data_list.append(driver.find_element(By.ID, 'wob_dc').text)
    location = driver.find_element(By.CLASS_NAME, "eKPi4").text
    location = location.replace("Results for\n", "")
    weather_icon_link = driver.find_element(
        By.ID, 'wob_tci').get_attribute('src')
    # Download the condition icon; the bytes are not used in the summary
    # below but are kept available for display.
    with urllib.request.urlopen(weather_icon_link) as response:
        weather_icon = response.read()

    temp = data_list[0]
    weather_details = f'{data_list[1]}, {data_list[2]}, {data_list[3]}'
    weather_name = data_list[-1]

    print(
        f'Weather in {location} is: {temp}, {weather_details}, {weather_name}')
    driver.quit()
    return {"location": location, "temperature": temp,
            "details": weather_details, "name": weather_name}


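# Quick manual test: print the current weather and a deep "top news" scrape.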
if __name__ == "__main__":
    data = get_weather_data()
    location = data["location"]
    temperature = data["temperature"]
    details = data["details"]
    name = data["name"]

    weather = f"{location} is {name} with {temperature} and {details}"
    print(weather)
    print(web_scraper("top news", True))