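"""Google-scraping helpers: quick answers for a web query plus local weather.

Dependencies (assumed PyPI package names): selenium, webdriver-manager,
googlesearch-python, beautifulsoup4, trafilatura, requests.
"""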
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from googlesearch import search
from selenium import webdriver
from bs4 import BeautifulSoup
import trafilatura
import requests
import urllib.parse
import urllib.request


def setup_driver(headless=True):
    """Create a Chrome WebDriver, installing a matching chromedriver if needed."""
    chrome_options = webdriver.ChromeOptions()
    if headless:
        chrome_options.add_argument('--headless')
    # Suppress the noisy "DevTools listening" log lines from ChromeDriver.
    chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
    driver = webdriver.Chrome(
        service=ChromeService(ChromeDriverManager().install()),
        options=chrome_options)
    return driver


def scrape_article(url):
    """Download a page and extract its main article text with trafilatura.

    Returns None when the download or extraction fails.
    """
    downloaded = trafilatura.fetch_url(url)
    return trafilatura.extract(downloaded)


# Placeholder text served to clients without JavaScript; not real articles.
JS_PLACEHOLDERS = (
    "When you have eliminated the JavaScript, whatever remains must be an empty page.",
    "When you have eliminated the\nJavaScript\n, whatever remains must be an empty page.\nEnable JavaScript to see Google Maps.",
    "Something went wrong. Wait a moment and try again.\nTry again\nPlease enable Javascript and refresh the page to continue",
)


def get_articles(urls, deep=False):
    """Scrape up to two usable articles from `urls`."""
    articles = []
    for url in urls:
        article_text = scrape_article(url)
        if article_text and article_text not in JS_PLACEHOLDERS:
            # Trim to the last complete sentence: `deep` keeps the full text,
            # otherwise only the first 2000 characters are considered.
            base = article_text[:-1] if deep else article_text[:2000]
            articles.append(".".join(base.split('.')[:-1]).split("|")[-1])
        if len(articles) == 2:
            break
    return articles


def web_scraper(web_query, deep=False):
    """Build a text answer for `web_query` from Google results plus articles."""
    driver = setup_driver()
    urls = list(search(web_query, num_results=10, sleep_interval=0.1))
    url = "https://www.google.com/search?q=" + urllib.parse.quote(web_query)
    driver.get(url)
    # Plain requests (no browser User-Agent) gets Google's basic HTML page,
    # which is where the obfuscated class names below come from. They are
    # brittle and may change without notice.
    source = requests.get(url).text
    soup = BeautifulSoup(source, 'html.parser')
    parts_of_speech = ['noun', 'adjective', 'verb', 'adverb', 'pronoun',
                       'preposition', 'conjunction', 'interjection',
                       'exclamation', 'numeral', 'article', 'determiner']
    articles = get_articles(urls, deep)
    # Collect result snippets; the snippet divs nest with the same class.
    snippets = []
    for outer in soup.find_all('div', class_='BNeawe s3v9rd AP7Wnd'):
        for inner in outer.find_all('div', class_='BNeawe s3v9rd AP7Wnd'):
            snippets.append(inner.text)
    # Look for a direct answer box, trying progressively rarer elements.
    top_result_element = soup.find('div', class_='BNeawe iBp4i AP7Wnd')
    if not top_result_element:
        try:
            top_result_element = driver.find_element(By.CLASS_NAME, "IZ6rdc")
        except NoSuchElementException:
            pass
    if not top_result_element:
        try:
            # Compound class names require a CSS selector, not By.CLASS_NAME.
            top_result_element = driver.find_element(
                By.CSS_SELECTOR, ".Z0LcW.CfV8xf")
        except NoSuchElementException:
            pass
    if not top_result_element:
        try:
            top_result_element = driver.find_element(
                By.CSS_SELECTOR, ".ayqGOc.kno-fb-ctx.KBXm4e")
        except NoSuchElementException:
            pass
    top_result = top_result_element.text if top_result_element else None
    # Dictionary-style queries: append the part of speech and definition.
    try:
        if top_result and snippets[0].split()[0] in parts_of_speech:
            pos = snippets[0].split()[0]
            indefinite = "an" if pos[0] == "a" else "a"
            top_result += f'As {indefinite} {pos} it means {snippets[1]}'
    except IndexError:
        pass
    # Fall back to a snippet attributed to Wikipedia.
    if not top_result:
        for text in snippets:
            words = text.split()
            if words and words[-1] == 'Wikipedia':
                top_result = f'According to Wikipedia, {" ".join(words[:-1])}'
    # If the top hit is a YouTube video, prefer its transcript (segments are
    # only present when the transcript panel is already open on the page).
    if urls and "youtube.com" in urls[0]:
        driver.get(urls[0])
        transcript_elements = driver.find_elements(
            By.CLASS_NAME, "ytd-transcript-segment-renderer")
        transcript = "\n".join(
            element.text for element in transcript_elements)
        if transcript:
            top_result = transcript
    driver.quit()
    article_text = ""
    for index, article in enumerate(articles):
        article_text += f"Article {index + 1}: {article}\n"
    return f"Top Results: {top_result}\n{article_text}"


def get_weather_data():
    """Scrape the local weather card from Google's results page."""
    driver = setup_driver()
    driver.get('https://www.google.com/search?q=weather')
    weather_data = driver.find_element(By.CLASS_NAME, 'UQt4rd').text
    data_list = weather_data.split('\n')
    # Strip the trailing unit (e.g. "°C") from the temperature reading.
    data_list[0] = data_list[0][:-2]
    data_list.append(driver.find_element(By.ID, 'wob_dc').text)
    location = driver.find_element(By.CLASS_NAME, "eKPi4").text
    location = location.replace("Results for\n", "")
    # Download the weather icon; the bytes are fetched but not returned,
    # so they are only useful to callers that modify this function.
    weather_icon_link = driver.find_element(
        By.ID, 'wob_tci').get_attribute('src')
    with urllib.request.urlopen(weather_icon_link) as response:
        icon_bytes = response.read()
    temp = data_list[0]
    weather_details = f'{data_list[1]}, {data_list[2]}, {data_list[3]}'
    weather_name = data_list[-1]
    print(
        f'Weather in {location} is: {temp}, {weather_details}, {weather_name}')
    driver.quit()
    return {"location": location, "temperature": temp,
            "details": weather_details, "name": weather_name}


if __name__ == "__main__":
    data = get_weather_data()
    location = data["location"]
    temperature = data["temperature"]
    details = data["details"]
    name = data["name"]
    weather = f"{location} is {name} with {temperature} and {details}"
    print(weather)
    print(web_scraper("top news", True))