File size: 4,902 Bytes
407c075
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
from pathlib import Path

# Default folder (a relative path) used by download_lectures and
# produce_documents when no docs_folder argument is given.
DOCS_FOLDER = Path("documents")

def download_lectures(docs_folder=DOCS_FOLDER):
    """Download the lecture markdown files into ``docs_folder``.

    One file named ``lecture-NN.md`` (``NN`` is the zero-padded lecture index)
    is fetched with ``wget`` for every entry returned by
    ``list_lecture_md_urls``. Files that already exist are not downloaded
    again.

    Args:
        docs_folder: Destination directory (``str`` or ``Path``). It is
            created, including parents, when missing.

    Raises:
        FileNotFoundError: If the ``wget`` executable cannot be found.
    """
    import subprocess
    from pathlib import Path

    docs_folder = Path(docs_folder)
    docs_folder.mkdir(parents=True, exist_ok=True)

    lecture_md_urls = list_lecture_md_urls(get_lecture_titles())

    for idx, url in lecture_md_urls.items():
        # Build the target inside docs_folder so a non-default folder is honoured.
        target = docs_folder / "lecture-{}.md".format(str(idx).zfill(2))
        if target.exists():
            continue

        result = subprocess.run(
            ["wget", "-O", str(target), url],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )

        # `wget -O` creates the output file even when the download fails; drop
        # the leftover so the next run retries instead of skipping this lecture.
        if result.returncode != 0 and target.exists():
            target.unlink()


def list_lecture_md_urls(lecture_titles):
    """Return a dict mapping each lecture index to its raw ``index.md`` URL.

    Args:
        lecture_titles: Mapping of lecture index to the lecture's URL slug.

    Returns:
        A new dict with the same keys, whose values are the base URL, the
        slug and ``/index.md`` concatenated.
    """
    lecture_md_url_base = "https://raw.githubusercontent.com/full-stack-deep-learning/website/main/docs/course/2022/"

    urls_by_index = {}
    for lecture_idx, slug in lecture_titles.items():
        urls_by_index[lecture_idx] = lecture_md_url_base + slug + "/index.md"

    return urls_by_index


def get_lecture_titles():
    """Return a dict mapping lecture numbers (starting at 1) to URL slugs."""
    # Slugs listed in lecture order; the position in the tuple gives the number.
    slugs_in_order = (
        "lecture-1-course-vision-and-when-to-use-ml",
        "lecture-2-development-infrastructure-and-tooling",
        "lecture-3-troubleshooting-and-testing",
        "lecture-4-data-management",
        "lecture-5-deployment",
        "lecture-6-continual-learning",
        "lecture-7-foundation-models",
        "lecture-8-teams-and-pm",
        "lecture-9-ethics",
    )

    return dict(enumerate(slugs_in_order, start=1))


def produce_documents(docs_folder=DOCS_FOLDER):
    """Build flat lists of text chunks and matching source metadata.

    Assumes the documents are on disk already. Two kinds of files are read
    from ``docs_folder``:

    * lecture markdown files (name contains ``lecture`` and ends in ``.md``),
      which are split into chunks with ``CharacterTextSplitter``; every chunk
      gets the lecture's website URL as its source;
    * ``.srt`` subtitle files, where every subtitle becomes one text whose
      source is the video URL with a start-time parameter appended.

    The lecture / video index is taken from the digits in the filename.

    Args:
        docs_folder: Directory holding the files (``str`` or ``Path``). It is
            created when missing, in which case both result lists are empty.

    Returns:
        A ``(texts, metadatas)`` tuple of equally long lists; ``metadatas[i]``
        is a ``{"source": url}`` dict for ``texts[i]``. Lecture chunks come
        first, followed by subtitle texts.

    Raises:
        ValueError: If a matching filename contains no digits.
        KeyError: If the index parsed from a filename is not a known lecture
            or video index.
    """
    import os
    from pathlib import Path
    import string

    import srt
    from langchain.text_splitter import CharacterTextSplitter

    # Accept plain strings too; the `/` joins below need a Path.
    docs_folder = Path(docs_folder)
    docs_folder.mkdir(parents=True, exist_ok=True)

    def _index_from_filename(filename):
        # "lecture-03.md" -> 3: keep only the digit characters, then parse.
        return int("".join(ch for ch in filename if ch in string.digits))

    # --- lecture markdown -------------------------------------------------
    # Restrict to .md so that other files with "lecture" in their name (for
    # example subtitle files) are not read as lecture text. Sorting makes the
    # chunk order independent of the directory listing order.
    lecture_md_filenames = sorted(
        fn for fn in os.listdir(docs_folder)
        if "lecture" in fn and fn.endswith(".md")
    )

    lecture_texts = {}
    for fn in lecture_md_filenames:
        # NOTE(review): files are assumed to be UTF-8 encoded - confirm.
        with open(docs_folder / fn, encoding="utf-8") as f:
            lecture_texts[_index_from_filename(fn)] = f.read()

    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    lecture_texts_split = {
        idx: text_splitter.split_text(lecture_text)
        for idx, lecture_text in lecture_texts.items()
    }

    website_url_base = "https://fullstackdeeplearning.com/course/2022/"
    source_urls = {
        idx: website_url_base + title
        for idx, title in get_lecture_titles().items()
    }

    lecture_texts_flat, source_urls_flat = [], []
    for idx, splits in lecture_texts_split.items():
        lecture_texts_flat.extend(splits)
        # One independent metadata dict per chunk.
        source_urls_flat.extend({"source": source_urls[idx]} for _ in splits)

    # --- subtitles --------------------------------------------------------
    srt_filenames = sorted(
        fn for fn in os.listdir(docs_folder) if fn.endswith(".srt")
    )
    srt_urls = get_srt_urls()
    srt_texts_flat, srt_metadatas_flat = [], []

    for fn in srt_filenames:
        srt_url = srt_urls[_index_from_filename(fn)]

        # Read the file verbatim. readlines() keeps each line's terminator, so
        # joining those lines with "\n" would double every newline and put a
        # blank line between all lines of the subtitle file before parsing.
        # NOTE(review): files are assumed to be UTF-8 encoded - confirm.
        with open(docs_folder / fn, encoding="utf-8") as f:
            srt_text = f.read()

        subtitles = list(srt.parse(srt_text))

        texts, metadatas = create_srt_texts_and_metadatas(subtitles, srt_url)
        srt_texts_flat += texts
        srt_metadatas_flat += metadatas

    texts_flat = lecture_texts_flat + srt_texts_flat
    metadatas_flat = source_urls_flat + srt_metadatas_flat

    return texts_flat, metadatas_flat


def create_srt_texts_and_metadatas(subtitles, base_url):
    """Turn parsed subtitles into texts plus time-stamped source metadata.

    Args:
        subtitles: Iterable of objects exposing ``content`` (str) and
            ``start`` (an object accepted by ``timestamp_from_timedelta``).
        base_url: URL of the video the subtitles belong to.

    Returns:
        A ``(texts, metadatas)`` tuple of equally long lists. ``texts[i]`` is
        the subtitle content with surrounding whitespace stripped, and
        ``metadatas[i]`` is ``{"source": url}`` where ``url`` is ``base_url``
        with a ``t=<start in whole seconds>s`` query parameter appended.
    """
    # Extend an existing query string with "&"; start one with "?" otherwise,
    # so a base_url without parameters still yields a well-formed URL.
    separator = "&" if "?" in base_url else "?"
    query_params_format = separator + "t={start}s"

    texts, metadatas = [], []

    for subtitle in subtitles:
        start = timestamp_from_timedelta(subtitle.start)
        url = base_url + query_params_format.format(start=start)

        texts.append(subtitle.content.strip())
        metadatas.append({"source": url})

    return texts, metadatas


def timestamp_from_timedelta(timedelta):
    """Return the duration of ``timedelta`` in seconds, truncated to an int."""
    total_seconds = timedelta.total_seconds()
    return int(total_seconds)


def get_srt_urls():
    """Return a dict mapping lecture numbers (starting at 1) to video URLs."""
    # URLs listed in lecture order; the position in the tuple gives the number.
    video_urls_in_order = (
        "https://www.youtube.com/watch?v=-Iob-FW5jVM",
        "https://www.youtube.com/watch?v=BPYOsDCZbno",
        "https://www.youtube.com/watch?v=RLemHNAO5Lw",
        "https://www.youtube.com/watch?v=Jlm4oqW41vY",
        "https://www.youtube.com/watch?v=W3hKjXg7fXM",
        "https://www.youtube.com/watch?v=nra0Tt3a-Oc",
        "https://www.youtube.com/watch?v=Rm11UeGwGgk",
        "https://www.youtube.com/watch?v=a54xH6nT4Sw",
        "https://www.youtube.com/watch?v=7FQpbYTqjAA",
    )

    return dict(enumerate(video_urls_in_order, start=1))


if __name__ == "__main__":
    # Script entry point: fetch the lecture files, build the corpus, then
    # show the last text and the last metadata entry.
    download_lectures()
    texts, metadatas = produce_documents()
    for last_entry in (texts[-1], metadatas[-1]):
        print(last_entry)