import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from urllib.parse import urlparse
import chainlit as cl
from langchain import PromptTemplate

try:
    from modules.constants import *
except ImportError:
    from constants import *

"""
Ref: https://python.plainenglish.io/scraping-the-subpages-on-a-website-ea2d4e3db113
"""


class WebpageCrawler:
    def __init__(self):
        # Tracks relative hrefs already seen, so get_links() works even when
        # called before get_all_pages() (which resets this dictionary).
        self.dict_href_links = {}

    def getdata(self, url):
        r = requests.get(url)
        return r.text

    def url_exists(self, url):
        try:
            response = requests.head(url)
            return response.status_code == 200
        except requests.ConnectionError:
            return False

    def get_links(self, website_link, base_url=None):
        if base_url is None:
            base_url = website_link
        html_data = self.getdata(website_link)
        soup = BeautifulSoup(html_data, "html.parser")
        list_links = []
        for link in soup.find_all("a", href=True):
            # Clean the link: remove surrounding whitespace
            link["href"] = link["href"].strip()

            # Append to the list if the new link starts with the original link
            if str(link["href"]).startswith(str(website_link)):
                list_links.append(link["href"])

            # Include hrefs that do not start with the website link but with "/"
            if str(link["href"]).startswith("/"):
                if link["href"] not in self.dict_href_links:
                    print(link["href"])
                    self.dict_href_links[link["href"]] = None
                    link_with_www = base_url + link["href"][1:]
                    if self.url_exists(link_with_www):
                        print("adjusted link =", link_with_www)
                        list_links.append(link_with_www)

        # Convert the list of links to a dictionary with the links as keys and "Not-checked" as values
        dict_links = dict.fromkeys(list_links, "Not-checked")
        return dict_links

    def get_subpage_links(self, l, base_url):
        for link in tqdm(l):
            print("checking link:", link)
            if not link.endswith("/"):
                l[link] = "Checked"
                dict_links_subpages = {}
            else:
                # If this page has not been crawled yet, start crawling and get its links
                if l[link] == "Not-checked":
                    dict_links_subpages = self.get_links(link, base_url)
                    # Change the dictionary value of the link to "Checked"
                    l[link] = "Checked"
                else:
                    # Create an empty dictionary in case every link is checked
                    dict_links_subpages = {}
            # Add the new dictionary to the old dictionary
            l = {**dict_links_subpages, **l}
        return l

    def get_all_pages(self, url, base_url):
        dict_links = {url: "Not-checked"}
        self.dict_href_links = {}
        counter, counter2 = None, 0
        while counter != 0:
            counter2 += 1
            dict_links2 = self.get_subpage_links(dict_links, base_url)
            # Count the "Not-checked" values; the loop stops once none remain
            # https://stackoverflow.com/questions/48371856/count-the-number-of-occurrences-of-a-certain-value-in-a-dictionary-in-python
            counter = sum(value == "Not-checked" for value in dict_links2.values())
            dict_links = dict_links2
        checked_urls = [
            url for url, status in dict_links.items() if status == "Checked"
        ]
        return checked_urls


def get_urls_from_file(file_path: str):
    """
    Function to get urls from a file
    """
    with open(file_path, "r") as f:
        urls = f.readlines()
    urls = [url.strip() for url in urls]
    return urls


def get_base_url(url):
    parsed_url = urlparse(url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}/"
    return base_url

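
# Example usage (illustrative sketch, kept as comments so nothing runs on import).
# The start URL below is a placeholder assumption; in the app the URLs usually
# come from a file read with get_urls_from_file().
#
#     crawler = WebpageCrawler()
#     start_url = "https://example.edu/course/"   # hypothetical start page
#     base_url = get_base_url(start_url)          # -> "https://example.edu/"
#     pages = crawler.get_all_pages(start_url, base_url)
#     print(f"found {len(pages)} pages")
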

def get_prompt(config):
    if config["llm_params"]["use_history"]:
        if config["llm_params"]["llm_loader"] == "local_llm":
            custom_prompt_template = tinyllama_prompt_template_with_history
        elif config["llm_params"]["llm_loader"] == "openai":
            custom_prompt_template = openai_prompt_template_with_history
        # else:
        #     custom_prompt_template = tinyllama_prompt_template_with_history  # default
        prompt = PromptTemplate(
            template=custom_prompt_template,
            input_variables=["context", "chat_history", "question"],
        )
    else:
        if config["llm_params"]["llm_loader"] == "local_llm":
            custom_prompt_template = tinyllama_prompt_template
        elif config["llm_params"]["llm_loader"] == "openai":
            custom_prompt_template = openai_prompt_template
        # else:
        #     custom_prompt_template = tinyllama_prompt_template
        prompt = PromptTemplate(
            template=custom_prompt_template,
            input_variables=["context", "question"],
        )
    return prompt


def get_sources(res, answer):
    source_elements = []
    source_dict = {}  # Dictionary to store URL elements

    for idx, source in enumerate(res["source_documents"]):
        source_metadata = source.metadata
        url = source_metadata["source"]
        score = source_metadata.get("score", "N/A")
        page = source_metadata.get("page", 1)
        lecture_tldr = source_metadata.get("tldr", "N/A")
        lecture_recording = source_metadata.get("lecture_recording", "N/A")
        suggested_readings = source_metadata.get("suggested_readings", "N/A")
        date = source_metadata.get("date", "N/A")
        source_type = source_metadata.get("source_type", "N/A")

        url_name = f"{url}_{page}"
        if url_name not in source_dict:
            source_dict[url_name] = {
                "text": source.page_content,
                "url": url,
                "score": score,
                "page": page,
                "lecture_tldr": lecture_tldr,
                "lecture_recording": lecture_recording,
                "suggested_readings": suggested_readings,
                "date": date,
                "source_type": source_type,
            }
        else:
            source_dict[url_name]["text"] += f"\n\n{source.page_content}"

    # First, display the answer
    full_answer = "**Answer:**\n"
    full_answer += answer

    # Then, display the sources
    full_answer += "\n\n**Sources:**\n"
    for idx, (url_name, source_data) in enumerate(source_dict.items()):
        full_answer += f"\nSource {idx + 1} (Score: {source_data['score']}): {source_data['url']}\n"

        name = f"Source {idx + 1} Text\n"
        full_answer += name
        source_elements.append(cl.Text(name=name, content=source_data["text"]))

        # Add a PDF element if the source is a PDF file
        if source_data["url"].lower().endswith(".pdf"):
            name = f"Source {idx + 1} PDF\n"
            full_answer += name
            pdf_url = f"{source_data['url']}#page={source_data['page']+1}"
            source_elements.append(cl.Pdf(name=name, url=pdf_url))

    # Finally, include lecture metadata for each unique source
    # displayed_urls = set()
    # full_answer += "\n**Metadata:**\n"
    # for url_name, source_data in source_dict.items():
    #     if source_data["url"] not in displayed_urls:
    #         full_answer += f"\nSource: {source_data['url']}\n"
    #         full_answer += f"Type: {source_data['source_type']}\n"
    #         full_answer += f"TL;DR: {source_data['lecture_tldr']}\n"
    #         full_answer += f"Lecture Recording: {source_data['lecture_recording']}\n"
    #         full_answer += f"Suggested Readings: {source_data['suggested_readings']}\n"
    #         displayed_urls.add(source_data["url"])

    full_answer += "\n**Metadata:**\n"
    for url_name, source_data in source_dict.items():
        full_answer += f"\nSource: {source_data['url']}\n"
        full_answer += f"Page: {source_data['page']}\n"
        full_answer += f"Type: {source_data['source_type']}\n"
        full_answer += f"Date: {source_data['date']}\n"
        full_answer += f"TL;DR: {source_data['lecture_tldr']}\n"
        full_answer += f"Lecture Recording: {source_data['lecture_recording']}\n"
        full_answer += f"Suggested Readings: {source_data['suggested_readings']}\n"

    return full_answer, source_elements

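
# Illustrative sketch of how get_prompt() and get_sources() are typically wired
# together inside a Chainlit message handler. `config`, `chain`, `message`, and
# the chain's output keys ("result", "source_documents") are assumptions made
# for the example; they are created elsewhere in the app.
#
#     prompt = get_prompt(config)                  # template chosen from config
#     res = chain({"query": message.content})      # chain built with `prompt`
#     answer = res["result"]                       # answer key depends on the chain
#     full_answer, source_elements = get_sources(res, answer)
#     await cl.Message(content=full_answer, elements=source_elements).send()
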
""" lecture_metadata = {} # Get the main lectures page content r_lectures = requests.get(lectures_url) soup_lectures = BeautifulSoup(r_lectures.text, "html.parser") # Get the main schedule page content r_schedule = requests.get(schedule_url) soup_schedule = BeautifulSoup(r_schedule.text, "html.parser") # Find all lecture blocks lecture_blocks = soup_lectures.find_all("div", class_="lecture-container") # Create a mapping from slides link to date date_mapping = {} schedule_rows = soup_schedule.find_all("li", class_="table-row-lecture") for row in schedule_rows: try: date = ( row.find("div", {"data-label": "Date"}).get_text(separator=" ").strip() ) description_div = row.find("div", {"data-label": "Description"}) slides_link_tag = description_div.find("a", title="Download slides") slides_link = slides_link_tag["href"].strip() if slides_link_tag else None slides_link = ( f"https://dl4ds.github.io{slides_link}" if slides_link else None ) if slides_link: date_mapping[slides_link] = date except Exception as e: print(f"Error processing schedule row: {e}") continue for block in lecture_blocks: try: # Extract the lecture title title = block.find("span", style="font-weight: bold;").text.strip() # Extract the TL;DR tldr = block.find("strong", text="tl;dr:").next_sibling.strip() # Extract the link to the slides slides_link_tag = block.find("a", title="Download slides") slides_link = slides_link_tag["href"].strip() if slides_link_tag else None slides_link = ( f"https://dl4ds.github.io{slides_link}" if slides_link else None ) # Extract the link to the lecture recording recording_link_tag = block.find("a", title="Download lecture recording") recording_link = ( recording_link_tag["href"].strip() if recording_link_tag else None ) # Extract suggested readings or summary if available suggested_readings_tag = block.find("p", text="Suggested Readings:") if suggested_readings_tag: suggested_readings = suggested_readings_tag.find_next_sibling("ul") if suggested_readings: suggested_readings = suggested_readings.get_text( separator="\n" ).strip() else: suggested_readings = "No specific readings provided." else: suggested_readings = "No specific readings provided." # Get the date from the schedule date = date_mapping.get(slides_link, "No date available") # Add to the dictionary lecture_metadata[slides_link] = { "date": date, "tldr": tldr, "title": title, "lecture_recording": recording_link, "suggested_readings": suggested_readings, } except Exception as e: print(f"Error processing block: {e}") continue return lecture_metadata