Spaces:
Runtime error
Runtime error
File size: 4,902 Bytes
407c075 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 |
from pathlib import Path
# Default value of the `docs_folder` parameter of download_lectures() and
# produce_documents(); a relative path, so it resolves against the current
# working directory.
DOCS_FOLDER = Path("documents")
def download_lectures(docs_folder=DOCS_FOLDER):
    """Download the lecture markdown files into ``docs_folder``.

    Every lecture returned by ``get_lecture_titles`` is saved as
    ``lecture-NN.md`` (``NN`` is the zero-padded lecture number) inside
    ``docs_folder``. Files that already exist are not fetched again.

    Requires the ``wget`` executable to be available on ``PATH``.

    Args:
        docs_folder: Destination directory (``str`` or ``Path``). It is
            created when missing.
    """
    import os
    from pathlib import Path
    import subprocess

    docs_folder = Path(docs_folder)
    os.makedirs(docs_folder, exist_ok=True)

    lecture_titles = get_lecture_titles()
    lecture_md_urls = list_lecture_md_urls(lecture_titles)

    for idx, url in lecture_md_urls.items():
        # Write into the requested folder instead of a hard-coded
        # "documents/" path, so a custom docs_folder is honoured.
        filename = docs_folder / "lecture-{}.md".format(str(idx).zfill(2))
        if filename.exists():
            continue

        result = subprocess.run(
            ["wget", "-O", str(filename), url],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        # `wget -O` creates/truncates the output file before downloading, so
        # a failed download leaves an empty file behind. Remove it so the
        # next run retries instead of treating the lecture as downloaded.
        if result.returncode != 0 and filename.exists():
            filename.unlink()
def list_lecture_md_urls(lecture_titles):
    """Map each lecture index to the raw URL of that lecture's ``index.md``.

    Args:
        lecture_titles: Mapping of lecture index to lecture slug.

    Returns:
        A dict with the same keys whose values are the slug placed between
        the raw-content base URL and ``/index.md``.
    """
    base = "https://raw.githubusercontent.com/full-stack-deep-learning/website/main/docs/course/2022/"
    urls_by_lecture = {}
    for lecture_number, slug in lecture_titles.items():
        urls_by_lecture[lecture_number] = base + slug + "/index.md"
    return urls_by_lecture
def get_lecture_titles():
    """Return the lecture slugs keyed by lecture number, starting at 1."""
    # Listed in lecture order; position in the tuple determines the key.
    slugs_in_order = (
        "lecture-1-course-vision-and-when-to-use-ml",
        "lecture-2-development-infrastructure-and-tooling",
        "lecture-3-troubleshooting-and-testing",
        "lecture-4-data-management",
        "lecture-5-deployment",
        "lecture-6-continual-learning",
        "lecture-7-foundation-models",
        "lecture-8-teams-and-pm",
        "lecture-9-ethics",
    )
    return dict(enumerate(slugs_in_order, start=1))
def produce_documents(docs_folder=DOCS_FOLDER):
    """Build flat lists of text chunks and their source metadata.

    Assumes the documents are on disk already: lecture markdown files
    (names containing "lecture" and ending in ".md") and subtitle files
    (names ending in ".srt") inside ``docs_folder``. The lecture number of a
    file is the integer formed by all digits in its filename.

    Args:
        docs_folder: Directory to read from (``str`` or ``Path``). It is
            created when missing, in which case both lists come back empty.

    Returns:
        A ``(texts, metadatas)`` tuple of equal-length lists. Lecture chunks
        come first, followed by one entry per subtitle. Every metadata entry
        is a dict of the form ``{"source": url}``.

    Raises:
        KeyError: If a file's lecture number has no entry in
            ``get_lecture_titles()`` / ``get_srt_urls()``.
    """
    import os
    from pathlib import Path
    import string

    import srt
    from langchain.text_splitter import CharacterTextSplitter

    docs_folder = Path(docs_folder)
    os.makedirs(docs_folder, exist_ok=True)

    def index_from_filename(filename):
        # Lecture number = all digits of the filename, e.g. "lecture-03.md" -> 3.
        return int("".join(ch for ch in filename if ch in string.digits))

    # Sort both listings so the output order does not depend on the
    # filesystem's directory order.
    filenames = sorted(os.listdir(docs_folder))
    # Only markdown files count as lectures; this keeps a subtitle file whose
    # name happens to contain "lecture" from being read as lecture text.
    lecture_md_filenames = [
        fn for fn in filenames if "lecture" in fn and fn.endswith(".md")
    ]
    srt_filenames = [fn for fn in filenames if fn.endswith(".srt")]

    # --- Lecture notes: split each markdown file into chunks. ---
    lecture_texts = {}
    for fn in lecture_md_filenames:
        with open(docs_folder / fn, encoding="utf-8") as f:
            lecture_texts[index_from_filename(fn)] = f.read()

    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

    website_url_base = "https://fullstackdeeplearning.com/course/2022/"
    source_urls = {
        idx: website_url_base + title
        for idx, title in get_lecture_titles().items()
    }

    lecture_texts_flat, source_urls_flat = [], []
    for idx, lecture_text in lecture_texts.items():
        splits = text_splitter.split_text(lecture_text)
        lecture_texts_flat += splits
        # Every chunk of a lecture points at that lecture's web page.
        source_urls_flat += [{"source": source_urls[idx]} for _ in splits]

    # --- Subtitles: one text per subtitle, linked to its video timestamp. ---
    srt_urls = get_srt_urls()
    srt_texts_flat, srt_metadatas_flat = [], []
    for fn in srt_filenames:
        srt_url = srt_urls[index_from_filename(fn)]
        with open(docs_folder / fn, encoding="utf-8") as f:
            # Read the file verbatim: joining readlines() with "\n" would
            # double every newline, because each line keeps its own.
            srt_text = f.read()
        subtitles = list(srt.parse(srt_text))
        texts, metadatas = create_srt_texts_and_metadatas(subtitles, srt_url)
        srt_texts_flat += texts
        srt_metadatas_flat += metadatas

    texts_flat = lecture_texts_flat + srt_texts_flat
    metadatas_flat = source_urls_flat + srt_metadatas_flat
    return texts_flat, metadatas_flat
def create_srt_texts_and_metadatas(subtitles, base_url):
    """Turn parsed subtitles into texts plus timestamped source links.

    Args:
        subtitles: Iterable of objects exposing a ``content`` string and a
            ``start`` value accepted by ``timestamp_from_timedelta``.
        base_url: URL of the video the subtitles belong to.

    Returns:
        A ``(texts, metadatas)`` tuple of equal-length lists: the stripped
        subtitle contents, and one ``{"source": url}`` dict per subtitle
        where ``url`` is ``base_url`` with a ``t=<seconds>s`` query parameter
        for the subtitle's start time.
    """
    # Add the timestamp as an extra parameter when the URL already carries a
    # query string (e.g. "watch?v=..."); otherwise start the query string.
    separator = "&" if "?" in base_url else "?"
    url_format = "{base}{separator}t={start}s"

    texts, metadatas = [], []
    for subtitle in subtitles:
        start = timestamp_from_timedelta(subtitle.start)
        url = url_format.format(base=base_url, separator=separator, start=start)
        texts.append(subtitle.content.strip())
        metadatas.append({"source": url})
    return texts, metadatas
def timestamp_from_timedelta(timedelta):
    """Convert a duration to whole seconds, discarding the fractional part.

    Args:
        timedelta: Object with a ``total_seconds()`` method, such as a
            ``datetime.timedelta``.

    Returns:
        The total number of seconds as an ``int`` (truncated toward zero).
    """
    elapsed_seconds = timedelta.total_seconds()
    return int(elapsed_seconds)
def get_srt_urls():
    """Return the video URL for each lecture, keyed by lecture number from 1."""
    # Listed in lecture order; position in the tuple determines the key.
    video_urls_in_order = (
        "https://www.youtube.com/watch?v=-Iob-FW5jVM",
        "https://www.youtube.com/watch?v=BPYOsDCZbno",
        "https://www.youtube.com/watch?v=RLemHNAO5Lw",
        "https://www.youtube.com/watch?v=Jlm4oqW41vY",
        "https://www.youtube.com/watch?v=W3hKjXg7fXM",
        "https://www.youtube.com/watch?v=nra0Tt3a-Oc",
        "https://www.youtube.com/watch?v=Rm11UeGwGgk",
        "https://www.youtube.com/watch?v=a54xH6nT4Sw",
        "https://www.youtube.com/watch?v=7FQpbYTqjAA",
    )
    return {
        lecture_number: video_url
        for lecture_number, video_url in enumerate(video_urls_in_order, start=1)
    }
if __name__ == "__main__":
    # Script entry point: fetch the lecture files, build the document lists,
    # and print the last text with its metadata as a quick sanity check.
    download_lectures()
    texts, metadatas = produce_documents()
    print(texts[-1], metadatas[-1], sep="\n")
|