# Scraping utilities for the DL4DS course site: lecture metadata and PDF downloads.
# Standard library
import tempfile
from urllib.parse import urlparse

# Third-party
import requests
from bs4 import BeautifulSoup
def get_urls_from_file(file_path: str):
    """Read a text file of URLs (one per line) and return them with
    surrounding whitespace stripped.

    Args:
        file_path: Path to the file containing URLs.

    Returns:
        list[str]: One entry per line of the file (blank lines become "").
    """
    with open(file_path, "r") as url_file:
        return [line.strip() for line in url_file]
def get_base_url(url):
    """Return the scheme and host portion of *url* with a trailing slash.

    Example: "https://example.com/a/b?q=1" -> "https://example.com/"
    """
    parts = urlparse(url)
    return "".join((parts.scheme, "://", parts.netloc, "/"))
def get_metadata(lectures_url, schedule_url, base_url="https://dl4ds.github.io"):
    """
    Scrape lecture metadata from the course lectures and schedule pages.

    Args:
        lectures_url (str): URL of the page listing lecture blocks.
        schedule_url (str): URL of the schedule page (supplies lecture dates).
        base_url (str): Site root prepended to relative slide links. Defaults
            to the DL4DS site for backward compatibility.

    Returns:
        dict: Maps each absolute slides URL to a dict with keys
        "date", "tldr", "title", "lecture_recording", "suggested_readings".
    """
    lecture_metadata = {}

    # Fetch both pages up front; timeouts keep a dead host from hanging the scrape.
    r_lectures = requests.get(lectures_url, timeout=60)
    soup_lectures = BeautifulSoup(r_lectures.text, "html.parser")
    r_schedule = requests.get(schedule_url, timeout=60)
    soup_schedule = BeautifulSoup(r_schedule.text, "html.parser")

    # Every lecture lives in its own container div on the lectures page.
    lecture_blocks = soup_lectures.find_all("div", class_="lecture-container")

    # Build a mapping from absolute slides URL -> lecture date using the
    # schedule table, so each lecture block below can look up its date.
    date_mapping = {}
    schedule_rows = soup_schedule.find_all("li", class_="table-row-lecture")
    for row in schedule_rows:
        try:
            date = (
                row.find("div", {"data-label": "Date"}).get_text(separator=" ").strip()
            )
            description_div = row.find("div", {"data-label": "Description"})
            slides_link_tag = description_div.find("a", title="Download slides")
            slides_link = slides_link_tag["href"].strip() if slides_link_tag else None
            if slides_link:
                date_mapping[f"{base_url}{slides_link}"] = date
        except Exception as e:
            # Best-effort scrape: log and skip malformed rows.
            print(f"Error processing schedule row: {e}")
            continue

    for block in lecture_blocks:
        try:
            # Lecture title is the bold span inside the block.
            title = block.find("span", style="font-weight: bold;").text.strip()
            # The tl;dr text immediately follows the "tl;dr:" label.
            # NOTE: `string=` replaces the deprecated `text=` keyword (bs4 >= 4.4).
            tldr = block.find("strong", string="tl;dr:").next_sibling.strip()
            # Relative slides link, made absolute against base_url.
            slides_link_tag = block.find("a", title="Download slides")
            slides_link = slides_link_tag["href"].strip() if slides_link_tag else None
            slides_link = f"{base_url}{slides_link}" if slides_link else None
            # Optional recording link.
            recording_link_tag = block.find("a", title="Download lecture recording")
            recording_link = (
                recording_link_tag["href"].strip() if recording_link_tag else None
            )
            # Suggested readings live in a <ul> following the label paragraph.
            suggested_readings = "No specific readings provided."
            suggested_readings_tag = block.find("p", string="Suggested Readings:")
            if suggested_readings_tag:
                readings_list = suggested_readings_tag.find_next_sibling("ul")
                if readings_list:
                    suggested_readings = readings_list.get_text(separator="\n").strip()
            # Date comes from the schedule page, keyed by the slides URL.
            date = date_mapping.get(slides_link, "No date available")
            lecture_metadata[slides_link] = {
                "date": date,
                "tldr": tldr,
                "title": title,
                "lecture_recording": recording_link,
                "suggested_readings": suggested_readings,
            }
        except Exception as e:
            # Best-effort scrape: log and skip malformed blocks.
            print(f"Error processing block: {e}")
            continue

    return lecture_metadata
def download_pdf_from_url(pdf_url):
    """
    Temporarily download a PDF file from a URL and return the local file path.

    Args:
        pdf_url (str): The URL of the PDF file to download.

    Returns:
        str | None: Local path of the downloaded temp file, or None when the
        server responds with a non-200 status.
    """
    # Timeout prevents the request from hanging indefinitely on a dead host.
    response = requests.get(pdf_url, timeout=60)
    if response.status_code != 200:
        return None
    # delete=False so the file survives the context manager; the caller is
    # responsible for removing it when done.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
        temp_file.write(response.content)
        return temp_file.name