Spaces:

zman1x1
/

yt-video-to-summary

Sleeping

App Files Files Community

zman1x1 committed on Jul 9, 2023

Commit

9150552

•

1 Parent(s): a9996a1

init

Browse files

Files changed (9) hide show

.gitattributes +2 -0
app.py +89 -0
requirements.txt +8 -0
utils/chunk.py +52 -0
utils/markdown.py +59 -0
utils/marp_wrapper.py +65 -0
utils/ppt.py +17 -0
utils/subtitles.py +72 -0
utils/video.py +55 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+**__pycache__

app.py ADDED Viewed

	@@ -0,0 +1,89 @@

+# from utils.subtitles import getSubs
+# subs = getSubs("G8gEos8F9R0")
+# print(subs)
+# import json
+# with open("subs.json", "w") as f:
+#     json.dump(subs, f)
+import datetime
+import gradio as gr
+def greet(name):
+    """Placeholder callback: ignores *name* and returns ``None``."""
+    return None
+def run_model(video_id, outname, chunk_size):
+    """Summarize a YouTube video into a slide deck and return the deck's path.
+    The transcript is split into chunks of roughly ``chunk_size`` characters;
+    each chunk becomes one slide holding a generated title, its summary and,
+    when the frame could be extracted, a video frame as background image.
+    Args:
+        video_id: YouTube video id (the part after ``youtu.be/``).
+        outname: file name of the presentation to generate.
+        chunk_size: maximum number of characters per transcript chunk.
+    Returns:
+        str: absolute path of the generated presentation.
+    """
+    # Heavy dependencies are imported lazily so the module itself loads fast.
+    import os
+    from rich.progress import track
+    from models.distilbart_cnn_12_6 import summarize
+    from models.t5_small_medium_title_generation import t5model as generate_title
+    from utils.marp_wrapper import marp
+    import utils.markdown as md
+    # from utils.chunk import LangChainChunker as chunker
+    from utils.subtitles import subs as chunker
+    from utils.ppt import generate_ppt
+    from utils.video import video
+    markdown_path = "summary.md"
+    # Intermediary Markdown file
+    print("Creating Markdown file...")
+    ppt = marp(markdown_path)
+    try:
+        ppt.add_header(
+            theme="uncover",
+            background="",
+            _class="invert",
+        )
+        # smaller font size (1.5rem)
+        ppt.add_body("<style> section { font-size: 1.5rem; } </style>")
+        # Download the video so frames can be extracted for the slides
+        vid = video(f"https://youtu.be/{video_id}",
+                    f"out/vid-{video_id}")
+        vid.download()
+        # Get the Subtitles from the YouTube video
+        print("Getting subtitles...")
+        chunks = chunker(video_id).getSubsList(size=chunk_size)
+        chunk_len = len(chunks)
+        print(f"subtitles divided to {chunk_len} chunks")
+        progress = track(chunks, description="Processing chunks")
+        for chunk_num, chunk in enumerate(progress, start=1):
+            print(f"processing Chunk: {chunk_num}/{chunk_len}")
+            # chunk is [text, start-time-in-seconds]
+            timestamp = str(datetime.timedelta(seconds=chunk[1]))
+            # TODO: better file path
+            # Must match the file name video.getframe() writes to.
+            img_path = f"out/vid-{video_id}_{timestamp}.png"
+            summary = summarize(chunk[0])
+            vid.getframe(timestamp)
+            title = generate_title(summary)
+            ppt.add_page(md.h2(title), summary)
+            # The frame is optional: only reference it if extraction worked.
+            if os.path.exists(img_path):
+                ppt.add_body(md.image(img_path,
+                             align="left", setAsBackground=True, size="contain"))
+            ppt.marp_end()
+    finally:
+        # Always flush and release the markdown file, even when a step fails.
+        ppt.close_file()
+    print(f"Generating {outname}..")
+    generate_ppt(markdown_path, outname)
+    # return full path to the ppt file
+    return os.path.abspath(outname)
+# Gradio UI: two text boxes (video id, output file name) and a chunk-size
+# slider feed run_model; the returned path is offered as a file download.
+demo = gr.Interface(
+    fn=run_model,
+    inputs=["text", "text", gr.Slider(200, 1000)],
+    outputs="file",
+)
+demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+torch
+transformers
+nltk
+youtube_transcript_api
+accelerate
+langchain
+yt-dlp
+rich

utils/chunk.py ADDED Viewed

	@@ -0,0 +1,52 @@

+# divide the subs into chunks for more accurate summarization
+# TODO: divide the subs into chunks based on the topics
+# summarize each chunk and add it to the markdown file
+class legacy_chunker:
+    """Legacy manual chunker: greedily packs whole words into chunks."""
+    def __init__(self, text):
+        self.text = text
+    def chunker(self, size=1000):
+        """Split the text into chunks of whole words.
+        A word is added to the current chunk while the chunk length (each
+        word counted with one trailing space) stays within ``size``. A single
+        word longer than ``size`` becomes a chunk of its own.
+        Args:
+            size (int): character budget per chunk.
+        Returns:
+            list[str]: the chunks, never containing empty strings.
+        """
+        chunks = []
+        current_words = []
+        current_len = 0  # length of the words joined with a trailing space each
+        for word in self.text.split():
+            if current_len + len(word) + 1 <= size:
+                current_words.append(word)
+                current_len += len(word) + 1
+                continue
+            # Only flush a non-empty buffer, so an oversized first word does
+            # not produce an empty chunk.
+            if current_words:
+                chunks.append(" ".join(current_words))
+            current_words = [word]
+            current_len = len(word) + 1
+        if current_words:
+            chunks.append(" ".join(current_words))
+        return chunks
+    def __sizeof__(self) -> int:
+        # NOTE(review): reports the number of characters in the text, not a
+        # memory size; kept as-is because callers may rely on it.
+        return sum(1 for _ in self.text)
+class LangChainChunker:
+    """Chunker backed by LangChain's ``CharacterTextSplitter``."""
+    def __init__(self, text):
+        self.text = text
+    def chunker(self, size=1000):
+        """Split the text on spaces into chunks of at most ``size`` characters.
+        Args:
+            size (int): maximum chunk size passed to the splitter.
+        Returns:
+            The list of text chunks produced by ``split_text``. No duration
+            or timestamp is attached to the chunks.
+        """
+        # Imported lazily so the module loads without langchain installed.
+        from langchain.text_splitter import CharacterTextSplitter
+        text_splitter = CharacterTextSplitter(
+            separator=" ",
+            chunk_size=size,
+            # chunk_overlap is a character count; the previous 0.9 was a
+            # float that meant "no overlap" in practice.
+            chunk_overlap=0,
+        )
+        return text_splitter.split_text(self.text)
+    def __sizeof__(self) -> int:
+        # NOTE(review): reports the number of characters in the text, not a
+        # memory size; kept as-is because callers may rely on it.
+        return sum(1 for _ in self.text)

utils/markdown.py ADDED Viewed

	@@ -0,0 +1,59 @@

+def h1(text):
+    """Return *text* as a level-1 markdown heading line."""
+    return "# {}\n".format(text)
+def h2(text):
+    """Return *text* as a level-2 markdown heading line."""
+    return "## {}\n".format(text)
+def h3(text):
+    """Return *text* as a level-3 markdown heading line."""
+    return "### {}\n".format(text)
+def backtick(text):
+    """Wrap *text* in single backticks (inline code)."""
+    return "`{}`".format(text)
+def code(lang, text):
+    """Return *text* as a fenced code block tagged with language *lang*."""
+    fence = "```"
+    return "{0}{1}\n{2}\n{0}".format(fence, lang, text)
+def link(text, url):
+    """Return a markdown hyperlink showing *text* and pointing at *url*."""
+    return "[{}]({})".format(text, url)
+def image(
+    url,
+    align=None,
+    height=None,
+    size=None,
+    setAsBackground=False,
+    ):
+    """Return markdown image syntax with Marp options in the alt text.
+    The options are emitted in the order align, height, size, ``bg`` and are
+    separated by single spaces, with no leading or trailing space.
+    Args:
+        url (str): image location, placed verbatim inside the parentheses
+        align (str, optional): image alignment (left, right, center)
+        height (str, optional): image height (ex:"2in")
+        size (str, optional): cover, contain, auto, fix, x%
+        setAsBackground (bool, optional): set image as background
+    Returns:
+        str: ``![<options>](<url>)``
+    """
+    options = [f"{opt}" for opt in (align, height, size) if opt is not None]
+    if setAsBackground:
+        options.append("bg")
+    # Joining avoids the stray leading space the options string used to get
+    # whenever align was omitted.
+    return f"![{' '.join(options)}]({url})"
+def quote(text):
+    """Return *text* prefixed as a markdown block quote."""
+    return "> {}".format(text)
+def bold(text):
+    """Wrap *text* in double asterisks (markdown bold)."""
+    marker = "**"
+    return "{0}{1}{0}".format(marker, text)
+def italic(text):
+    """Wrap *text* in single asterisks (markdown italic)."""
+    marker = "*"
+    return "{0}{1}{0}".format(marker, text)

utils/marp_wrapper.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import utils.markdown as md
+class marp:
+    """Incremental writer for a Marp markdown slide deck.
+    The target file is opened (and truncated) on construction and must be
+    released with ``close_file()``, or by using the instance as a context
+    manager.
+    """
+    def __init__(self, path):
+        self.path = path
+        # Mode "w" already truncates. UTF-8 is set explicitly so non-ASCII
+        # slide text does not depend on the platform's default encoding.
+        self.f = open(path, "w", encoding="utf-8")
+    def __enter__(self):
+        return self
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.close_file()
+    def marp_write(self, text):
+        """Write *text* to the deck verbatim."""
+        self.f.write(text)
+    def add_header(
+        self,
+        theme: str = "default",
+        _class: str = "lead",
+        paginate: bool = True,
+        background: str = "",
+        backgroundImage: str = None,
+        extra_styles: str = None
+    ):
+        """Write the front-matter block followed by a page separator.
+        Example of the generated front matter:
+            ---
+            marp: true
+            theme: gaia
+            class: lead
+            paginate: true
+            backgroundColor: #fff
+            ---
+        ``backgroundImage`` and ``extra_styles`` are written verbatim when
+        given: no key and no newline is added, so the caller must pass
+        complete text (e.g. a full ``backgroundImage: url(...)`` line).
+        """
+        self.marp_write("---\n")
+        self.marp_write("marp: true\n")
+        self.marp_write(f"theme: {theme}\n")
+        self.marp_write(f"class: {_class}\n")
+        self.marp_write(f"paginate: {'true' if paginate else 'false'}\n")
+        self.marp_write(f"backgroundColor: {background}\n")
+        self.writeifNotNone(backgroundImage)
+        self.marp_end()
+        self.writeifNotNone(extra_styles) # for extra css styles
+    def writeifNotNone(self, var):
+        """Write *var* verbatim unless it is ``None``."""
+        if var is not None:
+            self.marp_write(var)
+    def add_page(self,
+                title=None,
+                body=None,
+                directives: str = None
+    ):
+        """Write the optional directives comment, title and body of a page.
+        The page is not terminated here; call ``marp_end()`` afterwards.
+        """
+        # Format the directive only when one was given; formatting first
+        # would always produce a string and write "<!-- None -->".
+        if directives is not None:
+            self.add_directives(directives)
+        self.writeifNotNone(title)
+        self.writeifNotNone(body)
+    def add_directives(self, directives: str):
+        """Write *directives* wrapped in an HTML comment line."""
+        self.marp_write(f"<!-- {directives} -->\n")
+    def add_body(self, body: str):
+        """Write *body* verbatim."""
+        self.marp_write(body)
+    def marp_end(self):
+        """Write the page separator."""
+        self.marp_write("\n\n---\n\n") # page end
+    def close_file(self):
+        """Close the file, flushing buffered output."""
+        self.f.close()

utils/ppt.py ADDED Viewed

	@@ -0,0 +1,17 @@

+import os
+def generate_ppt(markdown_source, output_name, chromium_path="./chrome_sandbox") -> None:
+    """Convert a Marp markdown file into a presentation using the marp CLI.
+    Args:
+        markdown_source: path of the markdown file to convert.
+        output_name: path of the file marp should write (passed to ``-o``).
+        chromium_path: value exported as ``CHROME_PATH`` when running as
+            root on a POSIX system.
+    Raises:
+        RuntimeError: if the ``marp`` executable is not on PATH, or if marp
+            exits with a non-zero status.
+        FileNotFoundError: if ``markdown_source`` does not exist.
+    """
+    import shutil
+    import subprocess
+    # check for marp
+    marp_bin = shutil.which("marp")
+    if marp_bin is None:
+        raise RuntimeError("Marp is not installed")
+    # If the user is root, point CHROME_PATH at chromium_path. os.name is
+    # tested first because os.getuid does not exist on non-POSIX platforms.
+    if os.name == "posix" and os.getuid() == 0:
+        os.environ["CHROME_PATH"] = chromium_path
+    # check for markdown source
+    if not os.path.exists(markdown_source):
+        raise FileNotFoundError("Markdown source does not exist")
+    # Generate the presentation. An argument list (no shell) keeps file
+    # names with spaces or shell metacharacters from being interpreted.
+    result = subprocess.run(
+        [marp_bin, markdown_source, "-o", output_name, "--allow-local-files"]
+    )
+    if result.returncode != 0:
+        raise RuntimeError(f"marp exited with status {result.returncode}")

utils/subtitles.py ADDED Viewed

	@@ -0,0 +1,72 @@

+from youtube_transcript_api import YouTubeTranscriptApi as ytapi
+from youtube_transcript_api.formatters import TextFormatter
+import json
+def getSubsText(video_id="", getGenerated=False):
+    """Return the transcript of a video as one line of plain text.
+    The last transcript listed for the video is used; newlines in the
+    formatted text are replaced by spaces.
+    Args:
+        video_id (str): YouTube video id.
+        getGenerated (bool): currently ignored.
+    Raises:
+        ValueError: if no transcript is listed for the video.
+    """
+    # TODO: implement getGenerated
+    transcripts = list(ytapi.list_transcripts(video_id))
+    if not transcripts:
+        raise ValueError(f"No transcripts available for video {video_id!r}")
+    # Fetch only the transcript that is actually used instead of
+    # downloading every listed transcript and keeping the last one.
+    data = transcripts[-1].fetch()
+    return (TextFormatter().format_transcript(data)).replace("\n", " ")
+def getSubs(video_id="", getGenerated=False, chunker=None):
+    """Return the fetched transcript data of a video.
+    The last transcript listed for the video is used.
+    Args:
+        video_id (str): YouTube video id.
+        getGenerated (bool): currently ignored.
+        chunker: currently ignored.
+    Raises:
+        ValueError: if no transcript is listed for the video.
+    """
+    transcripts = list(ytapi.list_transcripts(video_id))
+    if not transcripts:
+        raise ValueError(f"No transcripts available for video {video_id!r}")
+    # Fetch only the transcript that is actually used instead of
+    # downloading every listed transcript and keeping the last one.
+    return transcripts[-1].fetch()
+class subs:
+    """Transcript of one video plus helpers to format or chunk it."""
+    def __init__(self, video_id="", generated=False):
+        self.video_id = video_id
+        self.generated = generated
+        # Entries are indexed by "text", "start" and "duration" below.
+        self.subs = getSubs(video_id, generated)
+    def __sizeof__(self) -> int:
+        # NOTE(review): reports the number of transcript entries, not a
+        # memory size; kept as-is because callers may rely on it.
+        return sum(1 for _ in self.subs)
+    def getText(self):
+        """Return the whole transcript as one line of plain text."""
+        return (TextFormatter().format_transcript(self.subs)).replace("\n", " ")
+    def getSubs(self):
+        """Return one ``text:::duration`` line per transcript entry."""
+        return '\n'.join(
+            f"{line['text']}:::{line['duration']}" for line in self.subs
+        )
+    def getSubsRaw(self):
+        """Return the transcript data exactly as fetched."""
+        return self.subs
+    def getSubsList(self, size=100):
+        """Group transcript lines into chunks of at most ``size`` characters.
+        Lines are appended (each counted with one trailing space) while the
+        chunk stays within ``size``. A single line longer than ``size``
+        becomes a chunk of its own.
+        Returns:
+            list: ``[text, start]`` pairs. ``start`` is the start time of the
+            line that closed the chunk (the first line that no longer fit),
+            or of the last transcript line for the final chunk.
+        """
+        chunks = []
+        current_chunk = ""  # limited to {size}
+        last_start = 0
+        for subline in self.subs:
+            last_start = subline["start"]
+            if len(current_chunk) + len(subline["text"]) + 1 <= size:
+                current_chunk += f"{subline['text']} "
+                continue
+            text = current_chunk.strip()
+            # Skip empty buffers so an oversized first line does not emit
+            # an empty chunk.
+            if text:
+                chunks.append([text, last_start])
+            current_chunk = f"{subline['text']} "
+        # Flush the trailing chunk; without this the end of the transcript
+        # would be silently dropped.
+        text = current_chunk.strip()
+        if text:
+            chunks.append([text, last_start])
+        return chunks

utils/video.py ADDED Viewed

	@@ -0,0 +1,55 @@

+import subprocess, os
+def Popen(cmd: list) -> str:
+    """Run a command and return the output as a string
+    The call blocks until the command has finished. Only stdout is
+    captured; it is stripped of surrounding whitespace and decoded as UTF-8.
+    Args:
+        cmd (list): The command to run
+    Returns:
+        str: The output of the command
+    """
+    # subprocess.run waits for the child and closes the pipe, so no zombie
+    # process or open file descriptor is left behind.
+    completed = subprocess.run(cmd, shell=False, stdout=subprocess.PIPE)
+    return completed.stdout.strip().decode('utf-8')
+class video:
+    """A downloadable video and the still frames extracted from it.
+    ``path`` is the output path without extension: the video is expected at
+    ``<path>.webm`` and frames are written to ``<path>_<timestamp>.png``.
+    """
+    def __init__(self,url, path):
+        self.path = path
+        self.url = url
+        # Create the directory that will contain the files. The previous
+        # check used the last path component (the file's base name) and so
+        # created the wrong directory.
+        parent = os.path.dirname(self.path)
+        if parent:
+            os.makedirs(parent, exist_ok=True)
+    def download(self):
+        """Download the video with yt-dlp unless ``<path>.webm`` exists.
+        Raises:
+            FileNotFoundError: if the ``yt-dlp`` executable cannot be found.
+        """
+        if os.path.exists(f"{self.path}.webm"):
+            print(f"{self.path}.webm already exists, skipping download")
+            return
+        print(f"Downloading {self.url}")
+        # Argument list instead of a shell string: the URL and path derive
+        # from user input and must not be interpreted by a shell.
+        subprocess.run(["yt-dlp", self.url, "-o", self.path])
+    def getframe(self, timestamp):
+        """Extract the frame at *timestamp* into ``<path>_<timestamp>.png``.
+        Does nothing when that file already exists. *timestamp* is passed
+        to ffmpeg's ``-ss`` option as given.
+        """
+        filename = f"{self.path}_{timestamp}.png"
+        if os.path.exists(filename):
+            print(f"{filename} already exists, skipping download")
+            return
+        print(f"Getting frame at {timestamp}")
+        Popen(
+            [
+                "ffmpeg",
+                "-hide_banner",
+                "-loglevel", "panic",
+                "-ss", timestamp,
+                "-i", f"{self.path}.webm",
+                "-vframes", "1",
+                f"{filename}"
+            ]
+        )