Spaces: Running on Zero
Commit: Update chatbot.py
Browse files — chatbot.py (+20, −44)
chatbot.py
CHANGED
@@ -236,54 +236,30 @@ def extract_text_from_webpage(html_content):
|
|
236 |
visible_text = soup.get_text(strip=True)
|
237 |
return visible_text
|
238 |
|
|
|
|
|
239 |
# Perform a Google search and return the results
|
240 |
-
def search(term
|
241 |
-
"""Performs a Google search and returns the results."""
|
242 |
-
escaped_term = urllib.parse.quote_plus(term)
|
243 |
-
start = 0
|
244 |
all_results = []
|
245 |
# Limit the number of characters from each webpage to stay under the token limit
|
246 |
-
max_chars_per_page = 8000 # Adjust this value based on your token limit and average webpage length
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"}
|
253 |
-
|
254 |
-
|
255 |
-
"num": num_results - start,
|
256 |
-
"hl": lang,
|
257 |
-
"start": start,
|
258 |
-
"safe": safe,
|
259 |
-
},
|
260 |
-
timeout=timeout,
|
261 |
-
verify=ssl_verify,
|
262 |
-
)
|
263 |
-
resp.raise_for_status()
|
264 |
-
soup = BeautifulSoup(resp.text, "html.parser")
|
265 |
-
result_block = soup.find_all("div", attrs={"class": "g"})
|
266 |
-
if not result_block:
|
267 |
-
start += 1
|
268 |
-
continue
|
269 |
-
for result in result_block:
|
270 |
-
link = result.find("a", href=True)
|
271 |
-
if link:
|
272 |
-
link = link["href"]
|
273 |
-
try:
|
274 |
-
webpage = session.get(link, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"})
|
275 |
-
webpage.raise_for_status()
|
276 |
-
visible_text = extract_text_from_webpage(webpage.text)
|
277 |
# Truncate text if it's too long
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
start += len(result_block)
|
287 |
return all_results
|
288 |
|
289 |
# Format the prompt for the language model
|
|
|
236 |
visible_text = soup.get_text(strip=True)
|
237 |
return visible_text
|
238 |
|
239 |
+
from duckduckgo_search import DDGS
|
240 |
+
|
241 |
# Perform a web search (via DuckDuckGo) and return the results
def search(term):
    """Search the web for *term* and return the visible text of the top results.

    Uses DuckDuckGo (``DDGS().text``) to find result links, fetches each
    linked page, extracts its visible text with ``extract_text_from_webpage``,
    and truncates the text so the combined results stay under the token limit.

    Args:
        term: The search query string.

    Returns:
        A list of ``{"link": ..., "text": ...}`` dicts. ``text`` is ``None``
        when a page could not be fetched; both are ``None`` when a search
        result carried no ``"href"`` key.
    """
    all_results = []
    # Limit the number of characters from each webpage to stay under the token limit
    max_chars_per_page = 8000  # Adjust this value based on your token limit and average webpage length

    result_block = DDGS().text(term, max_results=2)
    for result in result_block:
        if 'href' not in result:
            # Result carried no usable URL; record a placeholder entry.
            all_results.append({"link": None, "text": None})
            continue
        link = result["href"]
        try:
            # Browser-like User-Agent avoids trivial bot blocking; the timeout
            # prevents a single slow site from hanging the whole search.
            webpage = requests.get(
                link,
                headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"},
                timeout=10,
            )
            webpage.raise_for_status()
            visible_text = extract_text_from_webpage(webpage.text)
            # Truncate text if it's too long
            if len(visible_text) > max_chars_per_page:
                visible_text = visible_text[:max_chars_per_page] + "..."
            all_results.append({"link": link, "text": visible_text})
        except requests.exceptions.RequestException as e:
            # Best-effort: log the failure and keep the link with no text
            # so the caller still sees which results existed.
            print(f"Error fetching or processing {link}: {e}")
            all_results.append({"link": link, "text": None})
    return all_results
|
264 |
|
265 |
# Format the prompt for the language model
|