import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import tempfile

def get_urls_from_file(file_path: str):
    """
    Function to get urls from a file
    """
    with open(file_path, "r") as f:
        urls = f.readlines()
    urls = [url.strip() for url in urls]
    return urls


def get_base_url(url):
    """
    Return the scheme and host portion of a URL, e.g. "https://dl4ds.github.io/".
    """
    parsed_url = urlparse(url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}/"
    return base_url


def get_metadata(lectures_url, schedule_url):
    """
    Function to get the lecture metadata from the lectures and schedule URLs.
    """
    lecture_metadata = {}

    # Get the main lectures page content
    r_lectures = requests.get(lectures_url)
    soup_lectures = BeautifulSoup(r_lectures.text, "html.parser")

    # Get the main schedule page content
    r_schedule = requests.get(schedule_url)
    soup_schedule = BeautifulSoup(r_schedule.text, "html.parser")

    # Find all lecture blocks
    lecture_blocks = soup_lectures.find_all("div", class_="lecture-container")

    # Create a mapping from slides link to date
    date_mapping = {}
    schedule_rows = soup_schedule.find_all("li", class_="table-row-lecture")
    for row in schedule_rows:
        try:
            date = (
                row.find("div", {"data-label": "Date"}).get_text(separator=" ").strip()
            )
            description_div = row.find("div", {"data-label": "Description"})
            slides_link_tag = description_div.find("a", title="Download slides")
            slides_link = slides_link_tag["href"].strip() if slides_link_tag else None
            slides_link = (
                f"https://dl4ds.github.io{slides_link}" if slides_link else None
            )
            if slides_link:
                date_mapping[slides_link] = date
        except Exception as e:
            print(f"Error processing schedule row: {e}")
            continue

    for block in lecture_blocks:
        try:
            # Extract the lecture title
            title = block.find("span", style="font-weight: bold;").text.strip()

            # Extract the TL;DR
            tldr = block.find("strong", string="tl;dr:").next_sibling.strip()

            # Extract the link to the slides
            slides_link_tag = block.find("a", title="Download slides")
            slides_link = slides_link_tag["href"].strip() if slides_link_tag else None
            slides_link = (
                f"https://dl4ds.github.io{slides_link}" if slides_link else None
            )

            # Extract the link to the lecture recording
            recording_link_tag = block.find("a", title="Download lecture recording")
            recording_link = (
                recording_link_tag["href"].strip() if recording_link_tag else None
            )

            # Extract suggested readings or summary if available
            suggested_readings_tag = block.find("p", string="Suggested Readings:")
            if suggested_readings_tag:
                suggested_readings = suggested_readings_tag.find_next_sibling("ul")
                if suggested_readings:
                    suggested_readings = suggested_readings.get_text(
                        separator="\n"
                    ).strip()
                else:
                    suggested_readings = "No specific readings provided."
            else:
                suggested_readings = "No specific readings provided."

            # Get the date from the schedule
            date = date_mapping.get(slides_link, "No date available")

            # Add to the dictionary
            lecture_metadata[slides_link] = {
                "date": date,
                "tldr": tldr,
                "title": title,
                "lecture_recording": recording_link,
                "suggested_readings": suggested_readings,
            }
        except Exception as e:
            print(f"Error processing block: {e}")
            continue

    return lecture_metadata


def download_pdf_from_url(pdf_url):
    """
    Function to temporarily download a PDF file from a URL and return the local file path.

    Args:
        pdf_url (str): The URL of the PDF file to download.

    Returns:
        str: The local file path of the downloaded PDF file.
    """
    response = requests.get(pdf_url)
    if response.status_code == 200:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            temp_file.write(response.content)
            temp_file_path = temp_file.name
        return temp_file_path
    else:
        return None
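

# Minimal usage sketch, assuming this module is run as a script. The two URLs
# below are placeholders for the course's lectures and schedule pages (the real
# endpoints are not specified in this file), and the cleanup step simply
# illustrates that download_pdf_from_url leaves its temp file on disk.
if __name__ == "__main__":
    import os

    lectures_url = "https://dl4ds.github.io/lectures/"  # placeholder URL
    schedule_url = "https://dl4ds.github.io/schedule/"  # placeholder URL

    metadata = get_metadata(lectures_url, schedule_url)
    for slides_link, info in metadata.items():
        print(f"{info['date']}: {info['title']}")

        # Temporarily fetch the slides, then remove the temp file ourselves,
        # since NamedTemporaryFile(delete=False) does not clean up for us.
        if slides_link:
            pdf_path = download_pdf_from_url(slides_link)
            if pdf_path:
                print(f"  slides downloaded to {pdf_path}")
                os.remove(pdf_path)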