"""Web-scraping helpers: Google search answers, article extraction via
trafilatura, and a Google weather-card reader driven by Selenium."""

import urllib.parse
import urllib.request

import requests
import trafilatura
from bs4 import BeautifulSoup
from googlesearch import search
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

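# Third-party dependencies (PyPI package names assumed from the imports):
#   pip install selenium webdriver-manager googlesearch-python \
#       beautifulsoup4 trafilatura requests
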
def setup_driver(headless=True):
    """Create a Chrome WebDriver, downloading a matching driver binary if needed."""
    chrome_options = webdriver.ChromeOptions()
    if headless:
        # '--headless' is the documented flag; the bare 'headless' string is
        # not recognized by all Chrome releases.
        chrome_options.add_argument('--headless')
    # Silence the "DevTools listening ..." console noise on Windows.
    chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])

    driver = webdriver.Chrome(
        service=ChromeService(ChromeDriverManager().install()),
        options=chrome_options)
    return driver

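# Example use of setup_driver (hypothetical): run a visible browser while
# debugging selectors, then clean up.
#   driver = setup_driver(headless=False)
#   driver.get("https://example.com")
#   driver.quit()
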
def scrape_article(url):
    """Download a page and return its main text, or None if extraction fails."""
    downloaded = trafilatura.fetch_url(url)
    extracted = trafilatura.extract(downloaded)
    return extracted

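# Note: trafilatura's fetch_url and extract both return None on failure, so
# scrape_article can return None; get_articles below guards against that.
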
# Known "enable JavaScript" walls that trafilatura sometimes returns in
# place of real article text.
JS_BOILERPLATE = (
    "When you have eliminated the JavaScript, whatever remains must be an empty page.",
    "When you have eliminated the\nJavaScript\n, whatever remains must be an empty page.\nEnable JavaScript to see Google Maps.",
    "Something went wrong. Wait a moment and try again.\nTry again\nPlease enable Javascript and refresh the page to continue",
)


def get_articles(urls, deep=False):
    """Scrape up to two usable articles from `urls`.

    With deep=True the whole text is kept; otherwise it is capped at about
    2000 characters. In both cases the trailing (possibly cut-off) sentence
    is dropped and only the text after the final '|' separator is kept.
    """
    articles = []
    for url in urls:
        article_text = scrape_article(url)
        if not article_text or article_text in JS_BOILERPLATE:
            continue
        snippet = article_text[:-1] if deep else article_text[:2000]
        # Drop the unterminated last sentence, then anything before a '|'
        # (site names and bylines often precede one).
        articles.append(".".join(snippet.split('.')[:-1]).split("|")[-1])
        if len(articles) == 2:
            break
    return articles

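# Worked example of the trimming in get_articles (illustrative input only):
#   "Site Name | First sentence. Second sentence. Trailing fragm"
#   -> split('.')[:-1] drops the cut-off fragment
#      -> "Site Name | First sentence. Second sentence"
#   -> split("|")[-1] keeps the text after the byline
#      -> " First sentence. Second sentence"
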
def web_scraper(web_query, deep=False):
    """Answer a query from Google's results page plus up to two scraped articles."""
    driver = setup_driver()
    urls = list(search(web_query, num_results=10, sleep_interval=0.1))

    quoted_query = urllib.parse.quote(web_query)
    url = "https://www.google.com/search?q=" + quoted_query
    driver.get(url)
    # Fetching without a browser user agent returns Google's basic HTML,
    # whose 'BNeawe ...' class names are parsed below.
    source = requests.get(url).text
    soup = BeautifulSoup(source, 'html.parser')
    part_of_speeches = ['noun', 'adjective', 'verb', 'adverb', 'pronoun',
                        'preposition', 'conjunction', 'interjection',
                        'exclamation', 'numeral', 'article', 'determiner']
    snippets = []
    articles = get_articles(urls, deep)

    # Dictionary-style results nest one snippet div inside another.
    for outer in soup.find_all('div', class_='BNeawe s3v9rd AP7Wnd'):
        for inner in outer.find_all('div', class_='BNeawe s3v9rd AP7Wnd'):
            snippets.append(inner.text)

    # Look for an "answer box" in the static HTML first, then fall back to
    # selectors that only render in the live browser session.
    top_result_element = soup.find('div', class_='BNeawe iBp4i AP7Wnd')
    if not top_result_element:
        try:
            top_result_element = driver.find_element(By.CLASS_NAME, "IZ6rdc")
        except Exception:
            pass
    if not top_result_element:
        try:
            # By.CLASS_NAME cannot match compound class names; use CSS instead.
            top_result_element = driver.find_element(
                By.CSS_SELECTOR, ".Z0LcW.CfV8xf")
        except Exception:
            pass
    if not top_result_element:
        try:
            top_result_element = driver.find_element(
                By.CSS_SELECTOR, ".ayqGOc.kno-fb-ctx.KBXm4e")
        except Exception:
            pass
    top_result = top_result_element.text if top_result_element else None

    # If the first snippet looks like a dictionary entry, append its definition.
    try:
        pos = snippets[0].split()[0]
        if pos in part_of_speeches:
            # Pick 'a'/'an' by the part of speech's first letter.
            article_word = "an" if pos[0] in "aeiou" else "a"
            top_result = (top_result or "") + \
                f' As {article_word} {pos} it means {snippets[1]}'
    except IndexError:
        pass

    # Otherwise fall back to a snippet attributed to Wikipedia.
    if not top_result:
        for text in snippets:
            words = text.split()
            if words and words[-1] == 'Wikipedia':
                top_result = f'According to Wikipedia, {" ".join(words[:-1])}'
                break

    # If the best hit is a YouTube video, prefer its transcript when one is open.
    try:
        if "youtube.com" in urls[0]:
            driver.get(urls[0])
            transcript_elements = driver.find_elements(
                By.CLASS_NAME, "ytd-transcript-segment-renderer")
            # Join the segment texts, not the WebElement objects themselves.
            transcript = "\n".join(
                element.text for element in transcript_elements)
            if transcript:
                top_result = transcript
    except Exception:
        pass

    driver.quit()
    article_text = ""
    for index, article in enumerate(articles):
        article_text += f"Article {index + 1}: {article}\n"

    return f"Top Results: {top_result}\n{article_text}"

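# Example call (hypothetical query; needs network access, and Google's class
# names change often enough that the selectors above may need refreshing):
#   print(web_scraper("python programming language"))
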
def get_weather_data():
    """Read Google's weather card and return location, temperature, and conditions."""
    driver = setup_driver()
    driver.get('https://www.google.com/search?q=weather')
    weather_text = driver.find_element(By.CLASS_NAME, 'UQt4rd').text
    data_list = weather_text.split('\n')
    # Strip the trailing two-character unit marker (e.g. '°C') from the temperature.
    data_list[0] = data_list[0][:-2]
    data_list.append(driver.find_element(By.ID, 'wob_dc').text)
    location = driver.find_element(By.CLASS_NAME, "eKPi4").text
    location = location.replace("Results for\n", "")
    weather_icon_link = driver.find_element(
        By.ID, 'wob_tci').get_attribute('src')
    # Download the condition icon; the bytes are currently unused but are
    # fetched to keep parity with the original behavior.
    with urllib.request.urlopen(weather_icon_link) as response:
        icon_bytes = response.read()

    temp = data_list[0]
    weather_details = f'{data_list[1]}, {data_list[2]}, {data_list[3]}'
    weather_name = data_list[-1]

    print(f'Weather in {location} is: {temp}, {weather_details}, {weather_name}')
    driver.quit()
    return {"location": location, "temperature": temp,
            "details": weather_details, "name": weather_name}

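# Illustrative return shape (all values invented):
#   {"location": "London", "temperature": "18",
#    "details": "Precipitation: 10%, Humidity: 71%, Wind: 5 km/h",
#    "name": "Partly cloudy"}
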
if __name__ == "__main__":
    data = get_weather_data()
    location = data["location"]
    temperature = data["temperature"]
    details = data["details"]
    name = data["name"]

    weather = f"{location} is {name} with {temperature} and {details}"
    print(weather)
    print(web_scraper("top news", True))