Farid Karimli
LLaMa parser fix
638bffe
raw
history blame
4.55 kB
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import tempfile
def get_urls_from_file(file_path: str):
    """
    Read a list of URLs from a text file, one URL per line.

    Args:
        file_path (str): Path of the text file to read.

    Returns:
        list: The non-empty lines of the file, in file order, with
        surrounding whitespace (including the newline) stripped.
    """
    with open(file_path, "r") as f:
        stripped_lines = (line.strip() for line in f)
        # Blank or whitespace-only lines (e.g. a trailing empty line) carry
        # no URL, so drop them instead of handing callers empty strings.
        urls = [url for url in stripped_lines if url]
    return urls
def get_base_url(url):
    """
    Return the ``scheme://netloc/`` prefix of ``url``.

    The path, query string and fragment are discarded; the result always
    ends with a single trailing slash.
    """
    # urlparse returns a 6-tuple; only the first two fields are needed.
    scheme, netloc, *_ = urlparse(url)
    return "{}://{}/".format(scheme, netloc)
def _extract_slides_url(container):
    """
    Return the absolute slides URL found inside ``container``, or None.

    Looks up the first ``<a title="Download slides">`` tag in ``container``
    and prefixes its stripped ``href`` with the hard-coded
    ``https://dl4ds.github.io`` origin (assumes the href is site-relative;
    verify against the scraped pages). Returns None when there is no such
    tag or its href is empty.
    """
    slides_link_tag = container.find("a", title="Download slides")
    if not slides_link_tag:
        return None
    slides_link = slides_link_tag["href"].strip()
    return f"https://dl4ds.github.io{slides_link}" if slides_link else None


def get_metadata(lectures_url, schedule_url, timeout=30):
    """
    Function to get the lecture metadata from the lectures and schedule URLs.

    Both pages are fetched and parsed with BeautifulSoup. The schedule page
    supplies a slides-link -> date mapping; the lectures page supplies the
    per-lecture details, which are joined to the dates via the slides link.

    Args:
        lectures_url (str): URL of the page containing the
            ``div.lecture-container`` blocks.
        schedule_url (str): URL of the page containing the
            ``li.table-row-lecture`` rows.
        timeout (float): Value passed through as the ``timeout`` argument of
            each ``requests.get`` call so a stalled server cannot block the
            caller forever. Defaults to 30.

    Returns:
        dict: Maps each lecture's absolute slides URL to a dict with the keys
        ``date``, ``tldr``, ``title``, ``lecture_recording`` and
        ``suggested_readings``.

        NOTE: a lecture block without a slides link is stored under the key
        ``None``, so several such blocks overwrite one another.

    Raises:
        Exceptions raised by ``requests.get`` (connection errors, timeouts)
        are not caught here and propagate to the caller. Errors while parsing
        an individual schedule row or lecture block are printed and that
        row/block is skipped.
    """
    lecture_metadata = {}

    # Get the main lectures page content
    r_lectures = requests.get(lectures_url, timeout=timeout)
    soup_lectures = BeautifulSoup(r_lectures.text, "html.parser")

    # Get the main schedule page content
    r_schedule = requests.get(schedule_url, timeout=timeout)
    soup_schedule = BeautifulSoup(r_schedule.text, "html.parser")

    # Find all lecture blocks
    lecture_blocks = soup_lectures.find_all("div", class_="lecture-container")

    # Create a mapping from slides link to date
    date_mapping = {}
    schedule_rows = soup_schedule.find_all("li", class_="table-row-lecture")
    for row in schedule_rows:
        try:
            date = (
                row.find("div", {"data-label": "Date"}).get_text(separator=" ").strip()
            )
            description_div = row.find("div", {"data-label": "Description"})
            slides_link = _extract_slides_url(description_div)
            if slides_link:
                date_mapping[slides_link] = date
        except Exception as e:
            # Best effort: a malformed row (e.g. a missing Date or Description
            # div) must not abort the whole scrape.
            print(f"Error processing schedule row: {e}")
            continue

    for block in lecture_blocks:
        try:
            # Extract the lecture title
            title = block.find("span", style="font-weight: bold;").text.strip()

            # Extract the TL;DR (the text node right after the "tl;dr:" label)
            tldr = block.find("strong", text="tl;dr:").next_sibling.strip()

            # Extract the link to the slides
            slides_link = _extract_slides_url(block)

            # Extract the link to the lecture recording
            recording_link_tag = block.find("a", title="Download lecture recording")
            recording_link = (
                recording_link_tag["href"].strip() if recording_link_tag else None
            )

            # Extract suggested readings: the <ul> following the
            # "Suggested Readings:" paragraph, if both are present.
            suggested_readings = "No specific readings provided."
            suggested_readings_tag = block.find("p", text="Suggested Readings:")
            if suggested_readings_tag:
                readings_list = suggested_readings_tag.find_next_sibling("ul")
                if readings_list:
                    suggested_readings = readings_list.get_text(
                        separator="\n"
                    ).strip()

            # Get the date from the schedule
            date = date_mapping.get(slides_link, "No date available")

            # Add to the dictionary
            lecture_metadata[slides_link] = {
                "date": date,
                "tldr": tldr,
                "title": title,
                "lecture_recording": recording_link,
                "suggested_readings": suggested_readings,
            }
        except Exception as e:
            # Best effort: skip blocks that lack the expected title/tl;dr
            # markup rather than failing the whole scrape.
            print(f"Error processing block: {e}")
            continue

    return lecture_metadata
def download_pdf_from_url(pdf_url, timeout=30):
    """
    Function to temporarily download a PDF file from a URL and return the local file path.

    The file is created with ``delete=False``, so it stays on disk after this
    function returns; removing it is left to the caller.

    Args:
        pdf_url (str): The URL of the PDF file to download.
        timeout (float): Value passed through as the ``timeout`` argument of
            ``requests.get`` so a stalled server cannot block the caller
            forever. Defaults to 30.

    Returns:
        str: The local file path of the downloaded PDF file, or None when the
        response status code is not 200.

    Raises:
        Exceptions raised by ``requests.get`` (connection errors, timeouts)
        are not caught here and propagate to the caller.
    """
    response = requests.get(pdf_url, timeout=timeout)
    if response.status_code != 200:
        return None

    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
        temp_file.write(response.content)
        temp_file_path = temp_file.name
    return temp_file_path