# CRYSTAL-R1 / internet.py
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from googlesearch import search
from selenium import webdriver
from bs4 import BeautifulSoup
import trafilatura
import requests
# A bare "import urllib" does not make the submodules available; import them explicitly.
import urllib.parse
import urllib.request
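
# NOTE: this module assumes the following third-party packages are installed
# (a sketch of the expected environment, not pinned versions; "googlesearch"
# is provided by the googlesearch-python package on PyPI):
#
#     pip install selenium webdriver-manager googlesearch-python \
#         beautifulsoup4 trafilatura requests
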
def setup_driver(headless=True):
    """Create a Chrome WebDriver, downloading a matching chromedriver if needed."""
    chrome_options = webdriver.ChromeOptions()
    if headless:
        chrome_options.add_argument('--headless')
    # Suppress the noisy "DevTools listening" log output from chromedriver.
    chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
    driver = webdriver.Chrome(service=ChromeService(
        ChromeDriverManager().install()), options=chrome_options)
    return driver
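
# Minimal usage sketch (assumes Chrome is installed locally; webdriver-manager
# fetches a compatible driver on first run):
#
#     driver = setup_driver(headless=True)
#     driver.get("https://example.com")
#     print(driver.title)
#     driver.quit()
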
def scrape_article(url):
    """Fetch a page and extract its main text with trafilatura."""
    downloaded = trafilatura.fetch_url(url)
    if downloaded is None:
        # The fetch can fail (network error, blocked request); report no text.
        return None
    return trafilatura.extract(downloaded)
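
# Example usage sketch (the URL is an arbitrary illustration, not from the
# original code):
#
#     text = scrape_article("https://en.wikipedia.org/wiki/Web_scraping")
#     if text:
#         print(text[:200])
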
# Placeholder strings returned when a page requires JavaScript; treat these as
# failed scrapes rather than article text.
JS_PLACEHOLDER_TEXTS = (
    "When you have eliminated the JavaScript, whatever remains must be an empty page.",
    "When you have eliminated the\nJavaScript\n, whatever remains must be an empty page.\nEnable JavaScript to see Google Maps.",
    "Something went wrong. Wait a moment and try again.\nTry again\nPlease enable Javascript and refresh the page to continue",
)


def get_articles(urls, deep=False):
    """Scrape the given URLs and return up to two usable article snippets."""
    articles = []
    for url in urls:
        article_text = scrape_article(url)
        if article_text and article_text not in JS_PLACEHOLDER_TEXTS:
            if deep:
                # Keep the full text, dropping the trailing partial sentence.
                snippet = ".".join(article_text[:-1].split('.')[:-1])
            else:
                # Keep roughly the first 2000 characters, cut at a sentence boundary.
                snippet = ".".join(article_text[:2000].split('.')[:-1])
            # Strip any leading "site name |" prefix.
            articles.append(snippet.split("|")[-1])
            if len(articles) == 2:
                break
    return articles
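
# Example usage sketch (URLs are arbitrary illustrations):
#
#     urls = ["https://en.wikipedia.org/wiki/Python_(programming_language)",
#             "https://en.wikipedia.org/wiki/Web_scraping"]
#     for snippet in get_articles(urls):
#         print(snippet[:100])
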
def web_scraper(web_query, deep=False):
    """Search Google for web_query and return a top answer plus article snippets."""
    driver = setup_driver()
    urls = list(search(web_query, num_results=10, sleep_interval=0.1))
    web_query = urllib.parse.quote(web_query)
    url = "https://www.google.com/search?q=" + web_query
    driver.get(url)
    # Fetching without a browser user agent returns Google's basic HTML page,
    # which uses the stable "BNeawe" class names parsed below.
    source = requests.get(url).text
    soup = BeautifulSoup(source, 'html.parser')
    part_of_speeches = ['noun', 'adjective', 'verb', 'adverb', 'pronoun', 'preposition',
                        'conjunction', 'interjection', 'exclamation', 'numeral', 'article', 'determiner']
    list1 = []
    articles = get_articles(urls, deep)
    # Nested result snippets hold dictionary-style definitions when present.
    for i in soup.find_all('div', class_='BNeawe s3v9rd AP7Wnd'):
        for j in i.find_all('div', class_='BNeawe s3v9rd AP7Wnd'):
            list1.append(j.text)
    # Try several known Google answer-box selectors in turn; each may be absent.
    # BeautifulSoup's find() returns None rather than raising, so no try is needed here.
    top_result_element = soup.find('div', class_='BNeawe iBp4i AP7Wnd')
    if not top_result_element:
        try:
            top_result_element = driver.find_element(By.CLASS_NAME, "IZ6rdc")
        except Exception:
            pass
    if not top_result_element:
        try:
            # Compound class names are invalid for By.CLASS_NAME; use a CSS selector.
            top_result_element = driver.find_element(
                By.CSS_SELECTOR, ".Z0LcW.CfV8xf")
        except Exception:
            pass
    if not top_result_element:
        try:
            top_result_element = driver.find_element(
                By.CSS_SELECTOR, ".ayqGOc.kno-fb-ctx.KBXm4e")
        except Exception:
            pass
    top_result = top_result_element.text if top_result_element else None
    # If the first snippet starts with a part of speech, the query was a word
    # definition; append the meaning from the following snippet.
    try:
        if list1[0].split()[0] in part_of_speeches:
            pos = list1[0].split()[0]
            # Pick "a" or "an" based on whether the part of speech starts with a vowel.
            indef_article = "an" if pos[0] in "aeiou" else "a"
            definition = f'As {indef_article} {pos} it means {list1[1]}'
            top_result = (top_result or "") + definition
    except IndexError:
        pass
    # Fall back to a Wikipedia knowledge-panel snippet if nothing else matched.
    if not top_result:
        for text in list1:
            list_text = text.split()
            if len(list_text) != 0 and list_text[-1] == 'Wikipedia':
                top_result = f'According to Wikipedia, {" ".join(list_text[:-1])}'
    # If the top hit is a YouTube video, try to pull its transcript instead.
    try:
        if urls and "youtube.com" in urls[0]:
            driver.get(urls[0])
            transcript_elements = driver.find_elements(
                By.CLASS_NAME, "ytd-transcript-segment-renderer")
            # Join the visible text of each segment, not the WebElement objects.
            transcript = "\n".join(
                [element.text for element in transcript_elements])
            if transcript:
                top_result = transcript
    except Exception:
        pass
    driver.quit()
    article_text = ""
    for index, article in enumerate(articles):
        article_text += f"Article {index+1}: {article}\n"
    return f"Top Results: {top_result}\n{article_text}"
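
# Example usage sketch (query strings are arbitrary; output depends on live
# Google markup, which changes often, so the selectors above may need updating):
#
#     print(web_scraper("python programming language"))
#     print(web_scraper("top news", deep=True))
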
def get_weather_data():
    """Scrape the current weather from Google's weather widget."""
    driver = setup_driver()
    driver.get('https://www.google.com/search?q=weather')
    weather_data = driver.find_element(By.CLASS_NAME, 'UQt4rd').text
    data_list = weather_data.split('\n')
    # Drop the trailing temperature unit (e.g. "72°F" -> "72").
    data_list[0] = data_list[0][0:-2]
    data_list.append(driver.find_element(By.ID, 'wob_dc').text)
    location = driver.find_element(By.CLASS_NAME, "eKPi4").text
    location = location.replace("Results for\n", "")
    weather_icon_link = driver.find_element(
        By.ID, 'wob_tci').get_attribute('src')
    # Download the weather icon; currently unused, but kept in its own variable
    # so it no longer clobbers weather_data.
    with urllib.request.urlopen(weather_icon_link) as response:
        weather_icon = response.read()
    temp = data_list[0]
    weather_details = f'{data_list[1]}, {data_list[2]}, {data_list[3]}'
    weather_name = data_list[-1]
    print(
        f'Weather in {location} is: {temp}, {weather_details}, {weather_name}')
    driver.quit()
    return {"location": location, "temperature": temp, "details": weather_details, "name": weather_name}

if __name__ == "__main__":
    data = get_weather_data()
    location = data["location"]
    temperature = data["temperature"]
    details = data["details"]
    name = data["name"]
    weather = f"{location} is {name} with {temperature} and {details}"
    print(weather)
    print(web_scraper("top news", True))