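"""Web-scraping helpers: summarise Google search results, extract article
text with trafilatura, and read Google's weather widget with Selenium."""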
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from googlesearch import search
from bs4 import BeautifulSoup
import trafilatura
import requests
import urllib.parse
import urllib.request

def setup_driver(headless=True):
    """Create a Chrome WebDriver, downloading a matching driver if needed."""
    chrome_options = webdriver.ChromeOptions()
    if headless:
        chrome_options.add_argument('--headless')
    # Suppress the noisy "DevTools listening" log lines on Windows.
    chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])

    driver = webdriver.Chrome(
        service=ChromeService(ChromeDriverManager().install()),
        options=chrome_options)
    return driver


def scrape_article(url):
    """Fetch a page and extract its main text with trafilatura."""
    downloaded = trafilatura.fetch_url(url)
    if downloaded is None:  # download failed (network error, bad status, ...)
        return None
    return trafilatura.extract(downloaded)


# Placeholder texts that Google properties return when JavaScript is
# disabled; an "article" matching one of these has no real content.
JS_PLACEHOLDERS = (
    "When you have eliminated the JavaScript, whatever remains must be an empty page.",
    "When you have eliminated the\nJavaScript\n, whatever remains must be an "
    "empty page.\nEnable JavaScript to see Google Maps.",
    "Something went wrong. Wait a moment and try again.\nTry again\n"
    "Please enable Javascript and refresh the page to continue",
)


def get_articles(urls, deep=False):
    """Scrape each URL, skipping JavaScript placeholders. With deep=False,
    keep only the first two articles, truncated near 2,000 characters."""
    articles = []
    for url in urls:
        article_text = scrape_article(url)
        if article_text and article_text not in JS_PLACEHOLDERS:
            if deep:
                # Drop the trailing partial sentence and any "site name |" prefix.
                articles.append(
                    ".".join(article_text[:-1].split('.')[:-1]).split("|")[-1])
            else:
                articles.append(
                    ".".join(article_text[:2000].split('.')[:-1]).split("|")[-1])
                if len(articles) == 2:
                    break
    return articles

def web_scraper(web_query, deep=False):
    """Search Google for web_query and return a "Top Results" answer
    (if one can be extracted) followed by scraped article texts."""
    driver = setup_driver()
    urls = list(search(web_query, num_results=10, sleep_interval=0.1))

    url = "https://www.google.com/search?q=" + urllib.parse.quote(web_query)
    driver.get(url)
    # requests receives the simplified (no-JavaScript) results page, whose
    # class names are easy to parse with BeautifulSoup.
    source = requests.get(url).text
    soup = BeautifulSoup(source, 'html.parser')
    part_of_speeches = ['noun', 'adjective', 'verb', 'adverb', 'pronoun',
                        'preposition', 'conjunction', 'interjection',
                        'exclamation', 'numeral', 'article', 'determiner']
    articles = get_articles(urls, deep)

    snippets = []
    for i in soup.find_all('div', class_='BNeawe s3v9rd AP7Wnd'):
        for j in i.find_all('div', class_='BNeawe s3v9rd AP7Wnd'):
            snippets.append(j.text)

    # Try the answer-box selectors in order: first the static HTML, then
    # the JavaScript-rendered page in Selenium. Multi-class locators need
    # CSS selectors; By.CLASS_NAME accepts only a single class name.
    top_result_element = soup.find('div', class_='BNeawe iBp4i AP7Wnd')
    if not top_result_element:
        for by, locator in ((By.CLASS_NAME, "IZ6rdc"),
                            (By.CSS_SELECTOR, ".Z0LcW.CfV8xf"),
                            (By.CSS_SELECTOR, ".ayqGOc.kno-fb-ctx.KBXm4e")):
            try:
                top_result_element = driver.find_element(by, locator)
                break
            except Exception:
                continue
    top_result = top_result_element.text if top_result_element else None

    # Dictionary-style queries: the first snippet starts with a part of
    # speech and the second holds the definition.
    try:
        pos = snippets[0].split()[0]
        if pos in part_of_speeches:
            indefinite = "an" if pos[0] in "aeiou" else "a"
            sense = f'As {indefinite} {pos} it means {snippets[1]}'
            top_result = f'{top_result} {sense}' if top_result else sense
    except IndexError:
        pass

    # Fall back to a Wikipedia snippet if no answer box was found.
    if not top_result:
        for text in snippets:
            words = text.split()
            if words and words[-1] == 'Wikipedia':
                top_result = f'According to Wikipedia, {" ".join(words[:-1])}'

    # If the top hit is a YouTube video, prefer its transcript (only
    # present when the transcript panel is part of the rendered page).
    try:
        if "youtube.com" in urls[0]:
            driver.get(urls[0])
            transcript_elements = driver.find_elements(
                By.CLASS_NAME, "ytd-transcript-segment-renderer")
            transcript = "\n".join(
                element.text for element in transcript_elements)
            if transcript:
                top_result = transcript
    except Exception:
        pass

    driver.quit()
    article_text = ""
    for index, article in enumerate(articles):
        article_text += f"Article {index + 1}: {article}\n"

    return f"Top Results: {top_result}\n{article_text}"


def get_weather_data():
    """Scrape Google's weather widget and return the location, temperature,
    detail lines (precipitation, humidity, wind) and condition name."""
    driver = setup_driver()
    driver.get('https://www.google.com/search?q=weather')
    weather_data = driver.find_element(By.CLASS_NAME, 'UQt4rd').text
    data_list = weather_data.split('\n')
    # Drop the trailing two characters (the "°C"/"°F" unit toggle) from
    # the temperature line.
    data_list[0] = data_list[0][:-2]
    # Condition name, e.g. "Partly cloudy".
    data_list.append(driver.find_element(By.ID, 'wob_dc').text)
    location = driver.find_element(By.CLASS_NAME, "eKPi4").text
    location = location.replace("Results for\n", "")
    weather_icon_link = driver.find_element(
        By.ID, 'wob_tci').get_attribute('src')
    # Download the condition icon; the bytes are not used in the summary
    # below but are kept available for display.
    with urllib.request.urlopen(weather_icon_link) as response:
        weather_icon = response.read()

    temp = data_list[0]
    weather_details = f'{data_list[1]}, {data_list[2]}, {data_list[3]}'
    weather_name = data_list[-1]

    print(
        f'Weather in {location} is: {temp}, {weather_details}, {weather_name}')
    driver.quit()
    return {"location": location, "temperature": temp,
            "details": weather_details, "name": weather_name}


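# Quick manual test: print the current weather and a deep "top news" scrape.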
if __name__ == "__main__":
    data = get_weather_data()
    location = data["location"]
    temperature = data["temperature"]
    details = data["details"]
    name = data["name"]

    weather = f"{location} is {name} with {temperature} and {details}"
    print(weather)
    print(web_scraper("top news", True))