Spaces:
Sleeping
Sleeping
init
Browse files- .gitattributes +2 -0
- app.py +89 -0
- requirements.txt +8 -0
- utils/chunk.py +52 -0
- utils/markdown.py +59 -0
- utils/marp_wrapper.py +65 -0
- utils/ppt.py +17 -0
- utils/subtitles.py +72 -0
- utils/video.py +55 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
|
37 |
+
**__pycache__
|
app.py
ADDED
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# from utils.subtitles import getSubs
|
2 |
+
|
3 |
+
# subs = getSubs("G8gEos8F9R0")
|
4 |
+
# print(subs)
|
5 |
+
|
6 |
+
# import json
|
7 |
+
|
8 |
+
# with open("subs.json", "w") as f:
|
9 |
+
# json.dump(subs, f)
|
10 |
+
|
11 |
+
import datetime
|
12 |
+
import gradio as gr
|
13 |
+
|
14 |
+
def greet(name):
    """Placeholder callback; accepts ``name`` and does nothing."""
    return None
|
16 |
+
|
17 |
+
def run_model(video_id, outname, chunk_size):
    """Summarize a YouTube video into a Marp slide deck and return its path.

    The video is downloaded, its subtitles are split into chunks of at most
    ``chunk_size`` characters, and each chunk becomes one slide holding a
    generated title, a summary and (when it could be extracted) a video frame.

    Args:
        video_id: YouTube video id, used for the URL, subtitles and file names.
        outname: Output file name handed to ``generate_ppt``.
        chunk_size: Maximum number of subtitle characters per slide.

    Returns:
        Absolute path of ``outname``.
    """
    # Heavy / project-local imports stay local so importing this module is cheap.
    from rich.progress import track
    from models.distilbart_cnn_12_6 import summarize
    from models.t5_small_medium_title_generation import t5model as generate_title
    from utils.marp_wrapper import marp
    import utils.markdown as md
    from utils.subtitles import subs as chunker
    from utils.ppt import generate_ppt
    from utils.video import video
    import os

    # Intermediary Markdown file
    print("Creating Markdown file...")
    ppt = marp("summary.md")
    try:
        ppt.add_header(
            theme="uncover",
            background="",
            _class="invert",
        )

        # smaller font size (1.5rem)
        ppt.add_body("<style> section { font-size: 1.5rem; } </style>")

        # Download the video so frames can be extracted per chunk
        vid = video(f"https://youtu.be/{video_id}",
                    f"out/vid-{video_id}")
        vid.download()

        # Get the subtitles from the YouTube video
        print("Getting subtitles...")
        chunks = chunker(video_id).getSubsList(size=chunk_size)
        chunk_len = len(chunks)
        print(f"subtitles divided to {chunk_len} chunks")

        progress = track(chunks, description="Processing chunks")
        for chunk_num, chunk in enumerate(progress, start=1):
            print(f"processing Chunk: {chunk_num}/{chunk_len}")
            # chunk is [text, seconds]; timedelta renders it as H:MM:SS[.ffffff]
            timestamp = str(datetime.timedelta(seconds=chunk[1]))
            # TODO: better file path (must match the name video.getframe writes)
            img_path = f"out/vid-{video_id}_{timestamp}.png"

            summary = summarize(chunk[0])
            vid.getframe(timestamp)
            title = generate_title(summary)

            ppt.add_page(md.h2(title), summary)

            # The frame is optional: only reference it if extraction succeeded.
            if os.path.exists(img_path):
                ppt.add_body(md.image(img_path,
                                      align="left", setAsBackground=True, size="contain"))

            ppt.marp_end()
    finally:
        # Always flush and release the markdown file, even if a step failed.
        ppt.close_file()

    print(f"Generating {OUT_PPT_NAME}.." if False else f"Generating {outname}..")
    generate_ppt("summary.md", outname)

    # return full path to the ppt file
    return os.path.abspath(outname)
|
85 |
+
|
86 |
+
|
87 |
+
# Gradio UI wiring: the three inputs map positionally onto
# run_model(video_id, outname, chunk_size) -- two text boxes and a slider
# created with the arguments 200 and 1000. The value returned by run_model is
# exposed through a "file" output component.
demo = gr.Interface(fn=run_model, inputs=["text", "text", gr.Slider(200, 1000)], outputs="file")

# Launched at import time: running this module starts the app.
demo.launch()
|
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
torch
|
2 |
+
transformers
|
3 |
+
nltk
|
4 |
+
youtube_transcript_api
|
5 |
+
accelerate
|
6 |
+
langchain
|
7 |
+
yt-dlp
|
8 |
+
rich
|
utils/chunk.py
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# divide the subs into chunks for more accurate summarization
|
2 |
+
# TODO: divide the subs into chunks based on the topics
|
3 |
+
# summarize each chunk and add it to the markdown file
|
4 |
+
|
5 |
+
class legacy_chunker:
    """Legacy manual chunker: splits text into word-aligned chunks."""

    def __init__(self, text):
        self.text = text

    def chunker(self, size=1000):
        """Split ``self.text`` into chunks of whole words.

        A word is added to the current chunk while the chunk length (each word
        counted with one trailing space) stays within ``size``. A single word
        longer than ``size`` becomes a chunk of its own.

        Args:
            size: Maximum chunk length in characters, counting one separator
                per word.

        Returns:
            list[str]: The chunks, in order, without surrounding whitespace.
            Empty chunks are never emitted.
        """
        chunks = []
        current_words = []
        current_len = 0  # length of the words so far, one trailing space each

        for word in self.text.split():
            needed = len(word) + 1
            if current_len + needed <= size:
                current_words.append(word)
                current_len += needed
                continue
            # The word does not fit: flush what we have (if anything) and
            # start a new chunk with this word.
            if current_words:
                chunks.append(" ".join(current_words))
            current_words = [word]
            current_len = needed

        if current_words:
            chunks.append(" ".join(current_words))
        return chunks

    def __sizeof__(self) -> int:
        """Return the number of characters in the wrapped text."""
        return len(self.text)
|
29 |
+
|
30 |
+
class LangChainChunker:
    """Chunker that delegates splitting to LangChain's CharacterTextSplitter."""

    def __init__(self, text):
        self.text = text

    def chunker(self, size=1000):
        """Split ``self.text`` on spaces into chunks of about ``size`` characters.

        Args:
            size: Target chunk size passed to the splitter as ``chunk_size``.

        Returns:
            The result of ``CharacterTextSplitter.split_text`` for the text.
        """
        # Imported lazily so langchain is only needed when this chunker is used.
        from langchain.text_splitter import CharacterTextSplitter

        # TODO: attach the duration of the video to the chunk: [[chunk, duration]]

        text_splitter = CharacterTextSplitter(
            separator=" ",
            chunk_size=size,
            # chunk_overlap is a character count, not a ratio; the previous
            # value 0.9 sat below one character and so meant "no overlap".
            chunk_overlap=0,
        )

        return text_splitter.split_text(self.text)

    def __sizeof__(self) -> int:
        """Return the number of characters in the wrapped text."""
        return len(self.text)
|
utils/markdown.py
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
def h1(text):
    """Return ``text`` as a level-1 markdown heading line."""
    return "# {}\n".format(text)
|
3 |
+
|
4 |
+
def h2(text):
    """Return ``text`` as a level-2 markdown heading line."""
    return "## {}\n".format(text)
|
6 |
+
|
7 |
+
def h3(text):
    """Return ``text`` as a level-3 markdown heading line."""
    return "### {}\n".format(text)
|
9 |
+
|
10 |
+
def backtick(text):
    """Wrap ``text`` in single backticks (inline code)."""
    return "`{}`".format(text)
|
12 |
+
|
13 |
+
def code(lang, text):
    """Return ``text`` as a fenced code block tagged with language ``lang``."""
    fence = "```"
    return "{0}{1}\n{2}\n{0}".format(fence, lang, text)
|
15 |
+
|
16 |
+
def link(text, url):
    """Return a markdown link showing ``text`` and pointing at ``url``."""
    return "[{}]({})".format(text, url)
|
18 |
+
|
19 |
+
def image(
    url,
    align=None,
    height=None,
    size=None,
    setAsBackground=False,
):
    """Return markdown image syntax carrying Marp image options.

    The options are placed, space-separated, in the image's alt text in the
    order ``align``, ``height``, ``size``, ``bg``. Options that are not given
    are omitted entirely, so no stray leading or double spaces are produced.

    Args:
        url (str): Image location, written verbatim between the parentheses.
        align (str, optional): Image alignment (left, right, center).
        height (str, optional): Image height (ex: "2in").
        size (str, optional): cover, contain, auto, fit, x%.
        setAsBackground (bool, optional): Add the ``bg`` keyword so the image
            is used as the slide background.

    Returns:
        str: ``![<options>](<url>)``.
    """
    keywords = [str(opt) for opt in (align, height, size) if opt is not None]
    if setAsBackground:
        keywords.append("bg")

    return f"![{' '.join(keywords)}]({url})"
|
51 |
+
|
52 |
+
def quote(text):
    """Return ``text`` prefixed as a markdown block quote."""
    return "> {}".format(text)
|
54 |
+
|
55 |
+
def bold(text):
    """Return ``text`` wrapped in markdown bold markers."""
    marker = "**"
    return "{0}{1}{0}".format(marker, text)
|
57 |
+
|
58 |
+
def italic(text):
    """Return ``text`` wrapped in markdown italic markers."""
    marker = "*"
    return "{0}{1}{0}".format(marker, text)
|
utils/marp_wrapper.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import utils.markdown as md
|
2 |
+
|
3 |
+
class marp:
    """Incremental writer for a Marp markdown slide deck.

    The target file is opened (and emptied) on construction and stays open
    until ``close_file`` is called; every ``add_*`` method appends to it.
    """

    def __init__(self, path):
        self.path = path
        # Mode "w" already truncates the file. UTF-8 is set explicitly so
        # non-ASCII text does not depend on the platform's default encoding.
        self.f = open(path, "w", encoding="utf-8")

    def marp_write(self, text):
        """Append ``text`` verbatim to the deck."""
        self.f.write(text)

    def add_header(
        self,
        theme: str = "default",
        _class: str = "lead",
        paginate: bool = True,
        background: str = "",
        backgroundImage: str = None,
        extra_styles: str = None
    ):
        """Write the front-matter block that opens the deck.

        Produces, for example::

            ---
            marp: true
            theme: gaia
            class: lead
            paginate: true
            backgroundColor: #fff
            backgroundImage: url('https://marp.app/assets/hero-background.svg')
            ---

        Args:
            theme: Value of the ``theme`` key.
            _class: Value of the ``class`` key.
            paginate: Written as ``paginate: true`` / ``paginate: false``.
            background: Value of the ``backgroundColor`` key.
            backgroundImage: Optional background image. Either a bare value
                (``url('...')``) or a complete ``backgroundImage: ...`` line.
            extra_styles: Optional text (e.g. a ``<style>`` block) written
                verbatim right after the front matter.
        """
        self.marp_write("---\n")
        self.marp_write("marp: true\n")
        self.marp_write(f"theme: {theme}\n")
        self.marp_write(f"class: {_class}\n")
        self.marp_write("paginate: true\n" if paginate else "paginate: false\n")
        self.marp_write(f"backgroundColor: {background}\n")
        if backgroundImage is not None:
            line = backgroundImage
            # A bare value is not a valid front-matter entry; add the key
            # unless the caller already supplied the full line.
            if not line.lstrip().startswith("backgroundImage:"):
                line = f"backgroundImage: {line}"
            if not line.endswith("\n"):
                line += "\n"
            self.marp_write(line)
        self.marp_end()
        self.writeifNotNone(extra_styles)  # for extra css styles

    def writeifNotNone(self, var):
        """Append ``var`` to the deck unless it is ``None``."""
        if var is not None:
            self.marp_write(var)

    def add_page(self,
                 title=None,
                 body=None,
                 directives: str = None
                 ):
        """Write the content of one slide; each part is optional.

        Args:
            title: Title text, written verbatim.
            body: Body text, written verbatim.
            directives: Directive text, emitted as an HTML comment before the
                title. Nothing is emitted when it is ``None``.
        """
        # Test the argument itself: a formatted string is never None, which
        # used to put "<!-- None -->" on every slide.
        if directives is not None:
            self.add_directives(directives)
        self.writeifNotNone(title)
        self.writeifNotNone(body)

    def add_directives(self, directives: str):
        """Write ``directives`` wrapped in an HTML comment line."""
        self.marp_write(f"<!-- {directives} -->\n")

    def add_body(self, body: str):
        """Append ``body`` verbatim to the current slide."""
        self.marp_write(body)

    def marp_end(self):
        """Write the ``---`` separator that ends the current slide."""
        self.marp_write("\n\n---\n\n")  # page end

    def close_file(self):
        """Close the deck file, flushing any buffered content."""
        self.f.close()
|
utils/ppt.py
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
def generate_ppt(markdown_source, output_name, chromium_path="./chrome_sandbox") -> None:
    """Convert a Marp markdown file into a presentation with the ``marp`` CLI.

    Args:
        markdown_source: Path of the markdown file to convert.
        output_name: Path of the file to produce (passed to ``marp -o``).
        chromium_path: Value stored in the ``CHROME_PATH`` environment
            variable when running as root on a POSIX system.

    Raises:
        Exception: If ``marp`` is missing, the markdown source does not
            exist, or the conversion exits with a non-zero status.
    """
    # Local imports: this module only imports ``os`` at file level.
    import shutil
    import subprocess

    # check for marp
    marp_bin = shutil.which("marp")
    if marp_bin is None or subprocess.run([marp_bin, "--version"]).returncode != 0:
        raise Exception("Marp is not installed")

    # If the user is root, set CHROME_PATH to chromium_path. os.name is tested
    # first because os.getuid does not exist on non-POSIX platforms.
    if os.name == "posix" and os.getuid() == 0:
        os.environ["CHROME_PATH"] = chromium_path

    # check for markdown source
    if not os.path.exists(markdown_source):
        raise Exception("Markdown source does not exist")

    # Generate the ppt. Arguments go in a list with no shell, so file names
    # with spaces or shell metacharacters cannot alter the command.
    result = subprocess.run(
        [marp_bin, markdown_source, "-o", output_name, "--allow-local-files"]
    )
    if result.returncode != 0:
        raise Exception(f"Marp exited with status {result.returncode}")
|
utils/subtitles.py
ADDED
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from youtube_transcript_api import YouTubeTranscriptApi as ytapi
|
2 |
+
from youtube_transcript_api.formatters import TextFormatter
|
3 |
+
import json
|
4 |
+
|
5 |
+
def getSubsText(video_id="", getGenerated=False):
    """Return a video's subtitles as a single line of plain text.

    The last transcript listed for the video is used, and only that one is
    fetched.

    Args:
        video_id: YouTube video id.
        getGenerated: Currently ignored. TODO: implement getGenerated.

    Returns:
        str: The transcript text with newlines replaced by spaces.

    Raises:
        ValueError: If no transcript is listed for the video.
    """
    transcripts = list(ytapi.list_transcripts(video_id))
    if not transcripts:
        raise ValueError(f"No transcripts available for video {video_id!r}")

    # Fetch only the transcript that is used instead of downloading them all.
    data = transcripts[-1].fetch()

    return (TextFormatter().format_transcript(data)).replace("\n", " ")
|
16 |
+
|
17 |
+
def getSubs(video_id="", getGenerated=False, chunker=None):
    """Return the fetched transcript data for a video.

    The last transcript listed for the video is used, and only that one is
    fetched.

    Args:
        video_id: YouTube video id.
        getGenerated: Currently ignored.
        chunker: Currently ignored; kept so existing callers keep working.

    Returns:
        Whatever ``fetch()`` returns for the selected transcript.

    Raises:
        ValueError: If no transcript is listed for the video.
    """
    transcripts = list(ytapi.list_transcripts(video_id))
    if not transcripts:
        raise ValueError(f"No transcripts available for video {video_id!r}")

    # Fetch only the transcript that is used instead of downloading them all.
    return transcripts[-1].fetch()
|
26 |
+
|
27 |
+
class subs:
    """Subtitles of one YouTube video, with helpers to reformat and chunk them.

    ``self.subs`` holds whatever the module-level ``getSubs`` returned. The
    methods below read each entry's ``text``, ``start`` and ``duration`` keys.
    """

    def __init__(self, video_id="", generated=False):
        self.video_id = video_id
        self.generated = generated
        self.subs = getSubs(video_id, generated)

    def __sizeof__(self) -> int:
        """Return the number of subtitle entries."""
        return sum(1 for _ in self.subs)

    def getText(self):
        """Return all subtitle text as one line (newlines become spaces)."""
        return (TextFormatter().format_transcript(self.subs)).replace("\n", " ")

    def getSubs(self):
        """Return one ``text:::duration`` line per subtitle entry."""
        return '\n'.join(
            f"{entry['text']}:::{entry['duration']}" for entry in self.subs
        )

    def getSubsRaw(self):
        """Return the subtitle data exactly as fetched."""
        return self.subs

    def getSubsList(self, size=100):
        """Group subtitle lines into chunks of at most ``size`` characters.

        Lines are added to the current chunk while its length (each line
        counted with one trailing space) stays within ``size``. A single line
        longer than ``size`` becomes a chunk of its own.

        Args:
            size: Maximum chunk length in characters.

        Returns:
            list[list]: ``[text, timestamp]`` pairs. ``timestamp`` is the
            ``start`` of the subtitle line that closed the chunk, i.e. the
            first line that no longer fit. For the final chunk it is the
            ``start`` of the last subtitle line. Empty chunks are never
            emitted and the trailing chunk is always included.
        """
        chunks = []
        current_chunk = ""  # limited to {size}
        last_start = 0

        for subline in self.subs:
            last_start = subline["start"]
            text = subline["text"]
            if len(current_chunk) + len(text) + 1 <= size:
                current_chunk += f"{text} "
                continue
            # Line does not fit: flush the current chunk (if any) and start
            # a new one with this line.
            if current_chunk:
                chunks.append([current_chunk.strip(), last_start])
            current_chunk = f"{text} "

        # Flush the tail; without this the end of the transcript is lost.
        if current_chunk:
            chunks.append([current_chunk.strip(), last_start])

        return chunks
|
utils/video.py
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import subprocess, os
|
2 |
+
|
3 |
+
def Popen(cmd: list) -> str:
    """Run a command and return its standard output as a string.

    The call blocks until the command exits, so the child process is reaped
    and its output pipe is closed before returning.

    Args:
        cmd (list): The command to run (program followed by its arguments).

    Returns:
        str: The command's stdout, stripped of surrounding whitespace and
        decoded as UTF-8. The exit status is not checked.
    """
    completed = subprocess.run(cmd, shell=False, stdout=subprocess.PIPE)
    return completed.stdout.strip().decode('utf-8')
|
13 |
+
|
14 |
+
class video:
    """A remote video plus the local path prefix used for its files.

    ``path`` is a prefix without extension: the downloaded video is expected
    at ``{path}.webm`` and frames are written to ``{path}_{timestamp}.png``.
    """

    def __init__(self,url, path):
        self.path = path
        self.url = url

        # Make sure the directory that will contain the files exists. This is
        # the parent of ``path``, not its last component.
        parent = os.path.dirname(self.path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    def download(self):
        """Download the video with yt-dlp unless ``{path}.webm`` already exists."""
        if os.path.exists(f"{self.path}.webm"):
            print(f"{self.path}.webm already exists, skipping download")
            return
        print(f"Downloading {self.url}")
        # Argument list with no shell: the URL is partly user-supplied and
        # must not be interpreted by a shell.
        subprocess.run(["yt-dlp", self.url, "-o", self.path])

    def getframe(self, timestamp):
        """Extract one frame at ``timestamp`` into ``{path}_{timestamp}.png``.

        Does nothing if the image already exists.

        Args:
            timestamp: Position passed to ffmpeg's ``-ss`` option; also used
                in the output file name.
        """
        filename = f"{self.path}_{timestamp}.png"

        if os.path.exists(filename):
            print(f"{filename} already exists, skipping download")
            return

        print(f"Getting frame at {timestamp}")
        Popen(
            [
                "ffmpeg",
                "-hide_banner",
                "-loglevel", "panic",
                "-ss", str(timestamp),  # subprocess arguments must be strings
                "-i", f"{self.path}.webm",
                "-vframes", "1",
                filename,
            ]
        )
|