zman1x1 committed on
Commit
9150552
1 Parent(s): a9996a1
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+
37
+ **__pycache__
app.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from utils.subtitles import getSubs
2
+
3
+ # subs = getSubs("G8gEos8F9R0")
4
+ # print(subs)
5
+
6
+ # import json
7
+
8
+ # with open("subs.json", "w") as f:
9
+ # json.dump(subs, f)
10
+
11
+ import datetime
12
+ import gradio as gr
13
+
14
def greet(name):
    """Placeholder callback: accepts ``name`` and does nothing."""
    return None
16
+
17
def run_model(video_id, outname, chunk_size):
    """Summarize a YouTube video into a slide deck rendered by Marp.

    The subtitles are split into chunks; each chunk becomes one slide made of
    a generated title, a generated summary and, when the frame file exists, a
    video frame used as the slide background.

    Args:
        video_id: YouTube video id; used for the video URL, the subtitle
            lookup and the output file names under ``out/``.
        outname: File name of the generated presentation.
        chunk_size: Maximum size passed to the subtitle chunker.

    Returns:
        Absolute path of the generated presentation file.
    """
    # Heavy/optional dependencies are imported lazily, as in the original.
    from rich.progress import track
    from models.distilbart_cnn_12_6 import summarize
    from models.t5_small_medium_title_generation import t5model as generate_title
    from utils.marp_wrapper import marp
    import utils.markdown as md
    # from utils.chunk import LangChainChunker as chunker
    from utils.subtitles import subs as chunker
    from utils.ppt import generate_ppt
    from utils.video import video
    import os

    markdown_path = "summary.md"

    # Intermediary Markdown file
    print("Creating Markdown file...")
    ppt = marp(markdown_path)
    try:
        ppt.add_header(
            theme="uncover",
            background="",
            _class="invert",
        )

        # smaller font size (1.5rem)
        ppt.add_body("<style> section { font-size: 1.5rem; } </style>")

        # Download the video so frames can be extracted from it
        vid = video(f"https://youtu.be/{video_id}", f"out/vid-{video_id}")
        vid.download()

        # Get the subtitles from the YouTube video
        print("Getting subtitles...")
        chunks = chunker(video_id).getSubsList(size=chunk_size)
        chunk_len = len(chunks)
        print(f"subtitles divided to {chunk_len} chunks")

        progress = track(chunks, description="Processing chunks")
        for chunk_num, chunk in enumerate(progress, start=1):
            print(f"processing Chunk: {chunk_num}/{chunk_len}")
            timestamp = str(datetime.timedelta(seconds=chunk[1]))
            # TODO: better file path
            # NOTE: must stay in sync with the file name video.getframe writes.
            img_path = f"out/vid-{video_id}_{timestamp}.png"

            summary = summarize(chunk[0])
            vid.getframe(timestamp)
            title = generate_title(summary)

            ppt.add_page(md.h2(title), summary)

            # The frame is optional: only reference it when it was produced.
            if os.path.exists(img_path):
                ppt.add_body(
                    md.image(
                        img_path,
                        align="left",
                        setAsBackground=True,
                        size="contain",
                    )
                )

            ppt.marp_end()
    finally:
        # Always release the markdown file handle, even when a step fails.
        ppt.close_file()

    print(f"Generating {outname}..")
    generate_ppt(markdown_path, outname)

    # return full path to the ppt file
    return os.path.abspath(outname)
85
+
86
+
87
# Gradio UI: two text boxes (video id, output file name) and a chunk-size
# slider feed run_model; the result is offered as a downloadable file.
demo = gr.Interface(
    fn=run_model,
    inputs=["text", "text", gr.Slider(200, 1000)],
    outputs="file",
)

demo.launch()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ torch
2
+ transformers
3
+ nltk
4
+ youtube_transcript_api
5
+ accelerate
6
+ langchain
7
+ yt-dlp
8
+ rich
utils/chunk.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # divide the subs into chunks for more accurate summarization
2
+ # TODO: divide the subs into chunks based on the topics
3
+ # summarize each chunk and add it to the markdown file
4
+
5
class legacy_chunker:
    """Legacy manual chunker that splits text on whitespace into chunks."""

    def __init__(self, text):
        self.text = text

    def chunker(self, size=1000):
        """Split ``self.text`` into whitespace-joined chunks.

        Words are accumulated while the running buffer (each word followed by
        one space) stays within ``size`` characters. A single word longer
        than ``size`` becomes a chunk of its own.

        Args:
            size: Maximum buffer length, counting the trailing space.

        Returns:
            List of chunk strings with surrounding whitespace stripped. No
            empty chunk is ever emitted.
        """
        chunks = []
        current_chunk = ""
        for word in self.text.split():
            if len(current_chunk) + len(word) + 1 <= size:
                current_chunk += f"{word} "
                continue
            # Flush only a non-empty buffer: an oversized first word used to
            # produce a leading "" chunk.
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = f"{word} "
        if current_chunk:
            chunks.append(current_chunk.strip())
        return chunks

    def __sizeof__(self) -> int:
        """Return the number of characters in the stored text."""
        return len(self.text)
29
+
30
class LangChainChunker:
    """Chunker backed by LangChain's ``CharacterTextSplitter``."""

    def __init__(self, text):
        self.text = text

    def chunker(self, size=1000):
        """Split ``self.text`` on spaces into chunks of at most ``size``.

        Args:
            size: Value passed to the splitter as ``chunk_size``.

        Returns:
            Whatever ``CharacterTextSplitter.split_text`` returns for the text.
        """
        # Imported lazily so the module loads without langchain installed.
        from langchain.text_splitter import CharacterTextSplitter

        # attach the duration of the video to the chunk
        # [[chunk, duration]]

        text_splitter = CharacterTextSplitter(
            separator=" ",
            chunk_size=size,
            # The overlap is a character count; the previous 0.9 was a float
            # that acts like "no overlap" against integer lengths.
            chunk_overlap=0,
        )

        return text_splitter.split_text(self.text)

    def __sizeof__(self) -> int:
        """Return the number of characters in the stored text."""
        return len(self.text)
utils/markdown.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def h1(text):
    """Return ``text`` as a level-1 markdown heading line."""
    return "# {}\n".format(text)
3
+
4
def h2(text):
    """Return ``text`` as a level-2 markdown heading line."""
    return "## {}\n".format(text)
6
+
7
def h3(text):
    """Return ``text`` as a level-3 markdown heading line."""
    return "### {}\n".format(text)
9
+
10
def backtick(text):
    """Wrap ``text`` in single backticks (inline code)."""
    return "`{}`".format(text)
12
+
13
def code(lang, text):
    """Return ``text`` as a fenced code block tagged with ``lang``."""
    fence = "```"
    return "{0}{1}\n{2}\n{0}".format(fence, lang, text)
15
+
16
def link(text, url):
    """Return a markdown link showing ``text`` and pointing at ``url``."""
    return "[{}]({})".format(text, url)
18
+
19
def image(
    url,
    align=None,
    height=None,
    size=None,
    setAsBackground=False,
):
    """Return markdown image syntax with Marp options in the alt text.

    Args:
        url (str): image location placed between the parentheses
        align (str, optional): image alignment (left, right, center)
        height (str, optional): image height (ex:"2in")
        size (str, optional): cover, contain, auto, fix, x%
        setAsBackground (bool, optional): set image as background

    Returns:
        str: ``![<options>](<url>)`` where the options that were given are
        joined by single spaces, in the order align, height, size, ``bg``.
    """
    # Collect only the options that were provided so the alt text never
    # starts with a stray space when ``align`` is omitted.
    options = [str(opt) for opt in (align, height, size) if opt is not None]
    if setAsBackground:
        options.append("bg")

    return "![{}]({})".format(" ".join(options), url)
51
+
52
def quote(text):
    """Return ``text`` prefixed as a markdown block quote."""
    return "> {}".format(text)
54
+
55
def bold(text):
    """Wrap ``text`` in double asterisks (markdown bold)."""
    return "**{}**".format(text)
57
+
58
def italic(text):
    """Wrap ``text`` in single asterisks (markdown italic)."""
    return "*{}*".format(text)
utils/marp_wrapper.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import utils.markdown as md
2
+
3
class marp:
    """Small writer for a Marp markdown slide deck.

    The target file is opened for writing (which empties it) when the object
    is created and stays open until ``close_file`` is called. The object can
    also be used as a context manager, which closes the file on exit.
    """

    def __init__(self, path):
        self.path = path
        # Mode "w" already truncates the file; UTF-8 keeps non-ASCII slide
        # text independent of the platform default encoding.
        self.f = open(path, "w", encoding="utf-8")

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close_file()
        return False

    def marp_write(self, text):
        """Write ``text`` to the deck file as-is."""
        self.f.write(text)

    def add_header(
        self,
        theme: str = "default",
        _class: str = "lead",
        paginate: bool = True,
        background: str = "",
        backgroundImage: str = None,
        extra_styles: str = None
    ):
        """Write the front-matter block that opens the deck.

        Example of the emitted header:

            ---
            marp: true
            theme: gaia
            class: lead
            paginate: true
            backgroundColor: #fff
            backgroundImage: url('https://marp.app/assets/hero-background.svg')

        followed by the page separator written by ``marp_end``.

        Args:
            theme: Value of the ``theme`` key.
            _class: Value of the ``class`` key.
            paginate: Written as ``true``/``false`` under ``paginate``.
            background: Value of the ``backgroundColor`` key.
            backgroundImage: Value of the ``backgroundImage`` key; the key is
                omitted when this is None.
            extra_styles: Raw text (e.g. CSS) written after the header;
                skipped when None.
        """
        self.marp_write("---\n")
        self.marp_write("marp: true\n")
        self.marp_write(f"theme: {theme}\n")
        self.marp_write(f"class: {_class}\n")
        self.marp_write("paginate: true\n" if paginate else "paginate: false\n")
        self.marp_write(f"backgroundColor: {background}\n")
        # Emit a full "key: value" line; the bare value used to be written
        # with neither key nor newline, corrupting the front matter.
        if backgroundImage is not None:
            self.marp_write(f"backgroundImage: {backgroundImage}\n")
        self.marp_end()
        self.writeifNotNone(extra_styles)  # for extra css styles

    def writeifNotNone(self, var):
        """Write ``var`` unless it is None."""
        if var is not None:
            self.marp_write(var)

    def add_page(self,
                 title=None,
                 body=None,
                 directives: str = None
                 ):
        """Write the content of one slide; every part is optional.

        Args:
            title: Text written first (after the directives), if not None.
            body: Text written after the title, if not None.
            directives: Written inside an HTML comment when not None.
        """
        # Build the comment only when directives were given; formatting first
        # and testing afterwards always wrote "<!-- None -->".
        if directives is not None:
            self.add_directives(directives)
        self.writeifNotNone(title)
        self.writeifNotNone(body)

    def add_directives(self, directives: str):
        """Write ``directives`` wrapped in an HTML comment line."""
        self.marp_write(f"<!-- {directives} -->\n")

    def add_body(self, body: str):
        """Write ``body`` text to the deck file as-is."""
        self.marp_write(body)

    def marp_end(self):
        """Write the ``---`` separator that ends the current page."""
        self.marp_write("\n\n---\n\n")  # page end

    def close_file(self):
        """Close the deck file, flushing buffered content."""
        self.f.close()  # close the file and flush the buffer
utils/ppt.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
def generate_ppt(markdown_source, output_name, chromium_path="./chrome_sandbox") -> None:
    """Convert a Marp markdown file into a presentation with the Marp CLI.

    Args:
        markdown_source: Path of the markdown file to convert.
        output_name: Path passed to ``marp -o``.
        chromium_path: Browser path exported as ``CHROME_PATH`` when running
            as root on a POSIX system.

    Raises:
        Exception: If the ``marp`` executable is not found, the markdown
            source does not exist, or the conversion exits with a non-zero
            status.
    """
    import shutil
    import subprocess

    # check for marp
    marp_bin = shutil.which("marp")
    if marp_bin is None:
        raise Exception("Marp is not installed")

    # If the user is root, point CHROME_PATH at chromium_path. os.name is
    # tested first because os.getuid does not exist on non-POSIX platforms.
    if os.name == "posix" and os.getuid() == 0:
        os.environ["CHROME_PATH"] = chromium_path

    # check for markdown source
    if not os.path.exists(markdown_source):
        raise Exception("Markdown source does not exist")

    # Generate the presentation. An argument list (no shell) keeps file names
    # with spaces or shell metacharacters from being interpreted.
    result = subprocess.run(
        [marp_bin, markdown_source, "-o", output_name, "--allow-local-files"]
    )
    if result.returncode != 0:
        raise Exception(f"Marp failed with exit code {result.returncode}")
utils/subtitles.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from youtube_transcript_api import YouTubeTranscriptApi as ytapi
2
+ from youtube_transcript_api.formatters import TextFormatter
3
+ import json
4
+
5
def getSubsText(video_id="", getGenerated=False):
    """Return the subtitles of a video as one line of plain text.

    The last transcript listed for the video is used, as before, but only
    that transcript is fetched instead of fetching every one.

    Args:
        video_id: YouTube video id.
        getGenerated: Currently ignored.

    Returns:
        The transcript formatted as text with newlines replaced by spaces.

    Raises:
        ValueError: If no transcript is listed for the video.
    """
    transcripts = list(ytapi.list_transcripts(video_id))

    if getGenerated:
        # TODO: implement getGenerated
        pass

    if not transcripts:
        raise ValueError(f"No transcripts available for video {video_id!r}")

    data = transcripts[-1].fetch()

    return (TextFormatter().format_transcript(data)).replace("\n", " ")
16
+
17
def getSubs(video_id="", getGenerated=False, chunker=None):
    """Return the fetched transcript data of a video.

    The last transcript listed for the video is used, as before, but only
    that transcript is fetched instead of fetching every one.

    Args:
        video_id: YouTube video id.
        getGenerated: Currently ignored.
        chunker: Currently ignored.

    Returns:
        Whatever ``fetch()`` returns for the selected transcript.

    Raises:
        ValueError: If no transcript is listed for the video.
    """
    transcripts = list(ytapi.list_transcripts(video_id))

    if getGenerated:
        # TODO: implement getGenerated
        pass

    if not transcripts:
        raise ValueError(f"No transcripts available for video {video_id!r}")

    return transcripts[-1].fetch()
26
+
27
class subs:
    """Subtitles of one video plus helpers to format and chunk them."""

    def __init__(self, video_id="", generated=False):
        self.video_id = video_id
        self.generated = generated
        # Module-level getSubs (not the method below) fetches the transcript.
        self.subs = getSubs(video_id, generated)

    def __sizeof__(self) -> int:
        """Return the number of subtitle entries."""
        return sum(1 for _ in self.subs)

    def getText(self):
        """Return the subtitles as one line of text (newlines -> spaces)."""
        return (TextFormatter().format_transcript(self.subs)).replace("\n", " ")

    def getSubs(self):
        """Return one ``text:::duration`` line per subtitle entry."""
        # [chunk, duration]
        return '\n'.join(
            f"{entry['text']}:::{entry['duration']}" for entry in self.subs
        )

    def getSubsRaw(self):
        """Return the subtitle entries exactly as fetched."""
        return self.subs

    def getSubsList(self, size=100):
        """Group subtitle lines into chunks of limited length.

        Lines are appended (each followed by one space) while the buffer
        stays within ``size`` characters. When a line does not fit, the
        buffer is emitted together with the ``start`` value of that line and
        a new buffer begins with it. The remaining buffer is emitted at the
        end with the ``start`` of the last line.

        Args:
            size: Maximum buffer length, counting the trailing space.

        Returns:
            List of ``[text, start]`` pairs; no pair has empty text.
        """
        chunks = []
        current_chunk = ""  # limited to {size}
        current_start = 0

        for subline in self.subs:
            current_start = subline["start"]
            if len(current_chunk) + len(subline["text"]) + 1 <= size:
                current_chunk += f"{subline['text']} "
                continue
            # Skip the flush when nothing is buffered (first line oversized).
            if current_chunk:
                chunks.append([current_chunk.strip(), current_start])
            current_chunk = f"{subline['text']} "

        # The trailing buffer used to be dropped, losing the end of the video.
        if current_chunk:
            chunks.append([current_chunk.strip(), current_start])

        return chunks
utils/video.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import subprocess, os
2
+
3
def Popen(cmd: list) -> str:
    """Run a command and return the output as a string

    The call blocks until the command exits, so the child process is reaped
    and its stdout pipe is closed. The exit status is not checked.

    Args:
        cmd (list): The command to run

    Returns:
        str: The output of the command, stripped and decoded as UTF-8
    """
    completed = subprocess.run(cmd, shell=False, stdout=subprocess.PIPE)
    return completed.stdout.strip().decode('utf-8')
13
+
14
class video:
    """A remote video plus the local path prefix used for its files.

    ``path`` is a prefix without extension: the downloaded video is expected
    at ``<path>.webm`` and frames are written to ``<path>_<timestamp>.png``.
    """

    def __init__(self, url, path):
        self.path = path
        self.url = url

        # Make sure the directory that will hold the files exists. The parent
        # of the prefix is created; the previous code created a directory
        # named after the last path component in the working directory.
        parent = os.path.dirname(self.path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    def download(self):
        """Download the video with yt-dlp unless ``<path>.webm`` exists."""
        if os.path.exists(f"{self.path}.webm"):
            print(f"{self.path}.webm already exists, skipping download")
            return
        print(f"Downloading {self.url}")
        # An argument list (no shell) so the URL cannot inject shell syntax.
        try:
            subprocess.run(["yt-dlp", self.url, "-o", self.path])
        except FileNotFoundError:
            # Best effort, like the original shell call: report and continue.
            print("yt-dlp not found, skipping download")

    def getframe(self, timestamp):
        """Extract one frame at ``timestamp`` into ``<path>_<timestamp>.png``.

        Nothing is done when the frame file already exists.

        Args:
            timestamp: Position passed to ffmpeg's ``-ss`` option; also part
                of the output file name.
        """
        filename = f"{self.path}_{timestamp}.png"

        if os.path.exists(filename):
            print(f"{filename} already exists, skipping download")
            return

        print(f"Getting frame at {timestamp}")
        Popen(
            [
                "ffmpeg",
                "-hide_banner",
                "-loglevel", "panic",
                "-ss", timestamp,
                "-i", f"{self.path}.webm",
                "-vframes", "1",
                f"{filename}",
            ]
        )