File size: 6,844 Bytes
3456a58
9150552
3456a58
9150552
3456a58
 
9150552
3456a58
9150552
3456a58
 
 
 
 
 
 
 
69ebc65
 
 
 
050bb7a
 
 
69ebc65
 
 
 
 
 
 
 
 
 
 
 
3456a58
 
 
 
69ebc65
 
 
3456a58
9150552
3456a58
 
 
 
 
 
 
 
 
 
 
9150552
3456a58
9150552
3456a58
 
9150552
3456a58
9150552
3456a58
52085a2
3456a58
 
 
 
e5ec50a
3456a58
 
 
 
 
 
 
 
 
 
 
9150552
 
3456a58
 
 
 
9150552
3456a58
 
 
9150552
3456a58
9150552
3456a58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
de077cb
 
3456a58
012edff
de077cb
3456a58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9150552
3456a58
 
 
9150552
3456a58
 
 
 
 
 
 
 
 
 
 
 
 
 
9150552
 
 
3456a58
 
 
 
 
9150552
 
 
3456a58
050bb7a
3456a58
 
 
 
a4caf66
3456a58
 
 
 
 
 
 
9150552
3456a58
 
b158abb
 
 
 
 
 
 
3456a58
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
import argparse
import datetime
import os
import gradio as gr
from signal import SIGINT, signal
from utils.log import debug, info, logger, breakPoint as bc

import requests

from constants import *

# Module-level defaults.
# gradio_run() derives its own per-request values from the UI inputs rather
# than reading or updating these names.
CHUNK_SIZE = 512
VIDEO_ID = ""
OUT_PPT_NAME = PPTX_DEST  # PPTX_DEST is provided by `from constants import *`
NO_IMAGES = False
QUESTIONS = 5  # not referenced in the visible part of this file

def init_check():
    """Probe for required command-line tools and log the ones that are missing.

    Each tool is probed by running its version command through ``os.system``;
    a non-zero exit status is reported with ``logger.critical``. Nothing is
    raised or returned, so callers must read the log to see what failed.

    When Chrome is missing and ``scripts/chrome-setup.sh`` exists, that script
    is executed with bash as an install attempt.
    """
    # check for google-chrome
    if os.system("google-chrome --version") != 0:
        logger.critical("Google Chrome is not installed")
        if os.path.exists("scripts/chrome-setup.sh"):
            logger.info("Trying to install chrome..")
            os.system("bash scripts/chrome-setup.sh")

    # ffmpeg only accepts the single-dash `-version`; `--version` is rejected
    # as an unknown option, which made this probe fail even when ffmpeg was
    # installed.
    probes = (
        ("npm", "npm --version"),
        ("npx", "npx --version"),
        ("ffmpeg", "ffmpeg -version"),
    )
    for tool, command in probes:
        if os.system(command) != 0:
            logger.critical(f"{tool} is not installed")

    logger.info("Init check done, look for errors above..")

def gradio_run(
    video_id, chunk_size: int,
    no_images: bool, no_chapters: bool, out_type="pdf"):
    """Summarize a video's subtitles into a Marp slide deck and export it.

    Args:
        video_id: Identifier passed to ``video(...)`` / ``subs(...)`` and used
            in the output and frame file names.
        chunk_size: Chunk size handed to the subtitle chunkers.
        no_images: When true, no per-slide frame is extracted or attached.
        no_chapters: When true, subtitles are split into fixed-size chunks and
            summarized with the model's own templates; otherwise they are
            grouped by the video's chapters and summarized with a LangChain
            "stuff" chain.
        out_type: File extension of the generated deck. A falsy value (for
            example no dropdown selection) falls back to ``"pdf"``.

    Returns:
        Absolute path of the generated file.

    Raises:
        gr.Error: If the video has no subtitles.
    """
    # do init check
    init_check()

    # A falsy out_type would otherwise produce a ".None" file name.
    out_type = out_type or "pdf"
    # Coerce so the chunkers get an int even if the UI delivers a float.
    chunk_size = int(chunk_size)
    out_ppt_name = f"{OUTDIR}/gradio-out{video_id}.{out_type}"

    # Heavy dependencies are imported lazily, on the first request.
    info("Loading modules..")
    from langchain.chains.summarize import load_summarize_chain
    from langchain.docstore.document import Document
    from rich.progress import track

    import utils.markdown as md
    from models.lamini import lamini as model
    from utils.marp_wrapper import marp
    from utils.ppt import generate_ppt
    from utils.subtitles import subs
    from utils.video import video
    from utils.chunk import ChunkByChapters

    # initialize marp
    out = marp(MD_DEST)
    out.add_header(config=MARP_GAIA)

    # initialize video
    vid = video(video_id, f"{OUTDIR}/vid-{video_id}")
    vid.download()

    # initialize model
    llm_model = model
    llm = llm_model.load_model(
            max_length=400,
            temperature=0,
            top_p=0.95,
            repetition_penalty=1.15
    )

    # slice subtitle and chunk them
    # to chunk_size based on chapters
    info(f"Getting subtitles {video_id}..")
    raw_subs = vid.getSubtitles()

    if raw_subs is None:
        logger.critical("No subtitles found, exiting..")
        # exit() would raise SystemExit inside the request handler; gr.Error
        # reports the problem to the user instead.
        raise gr.Error("No subtitles found for this video.")

    info(f"got {len(raw_subs)} length subtitles")

    if no_chapters:
        chunker = subs(video_id)
        chunks = chunker.getSubsList(size=chunk_size)
        model_tmplts = llm_model.templates()
        summarizer = model_tmplts.summarize
        title_gen = model_tmplts.generate_title

        # Title slide: frame taken at the first chunk's offset. chunk[1] is
        # used as a number of seconds, chunk[0] as the text given to the model.
        first_pic = str(datetime.timedelta(seconds=chunks[0][1]))
        img_name = f"vid-{video_id}_{first_pic}.png"
        img_path = f"{PNG_DEST}/{img_name}"
        vid.getframe(first_pic, img_path)
        out.add_page(md.h1(video_id), md.image(url=img_name))
        out.marp_end()

        total = len(chunks)
        progress = track(chunks, description="(processing chunks) Summarizing..")
        for done, chunk in enumerate(progress, start=1):
            logger.info(f"{done}/{total} - {(done/total)*100:.2f}% - PROCESSING CHUNKS.")
            summary = summarizer(chunk[0])[0]["generated_text"].replace("-", "\n-")
            title = title_gen(chunk[0])[0]["generated_text"]

            # Long titles get a smaller heading level.
            heading = md.h2 if len(title) < 40 else md.h3
            out.add_page(heading(title), summary)

            # Only attach a frame when the text is short enough to leave room.
            if not no_images and len(summary+title) < 270:
                timestamp = str(datetime.timedelta(seconds=chunk[1]))
                frame_name = f"vid-{video_id}_{timestamp}.png"
                frame_path = f"{PNG_DEST}/{frame_name}"
                vid.getframe(timestamp, frame_path)
                out.add_body(md.image(frame_name, align="left", setAsBackground=True))

            out.marp_end()
    else:
        raw_chapters = vid.getChapters(f"{YT_CHAPTER_ENDPOINT}{video_id}")
        chunk_dict = ChunkByChapters(raw_chapters, raw_subs, chunk_size)
        chain = load_summarize_chain(llm, chain_type="stuff")
            # TODO: ( use refine chain type to summarize all chapters )
        for title, subchunks in track(chunk_dict.items(), description="(processing chunks) Summarizing.."):
            # Wrap the chapter's sub-chunks as Documents and summarize them
            # together with the "stuff" chain.
            debug(subchunks)
            # NOTE(review): the layout of `subchunks` comes from
            # ChunkByChapters and is not visible here; the indexing below is
            # kept exactly as it was - confirm against utils.chunk.
            docs = [ Document(page_content=t[0]) for t in subchunks[0] ]
            summary = chain.run(docs)

            # Honor "No Images" here too; this branch used to test a flag that
            # was always False and therefore always extracted a frame.
            if not no_images:
                ts = str(datetime.timedelta(seconds=subchunks[0][1][0]))
                img_path = f"{PNG_DEST}/vid-{video_id}_{ts}.png"
                vid.getframe(ts, img_path)
                # if summary is long ignore images for better page and no clipping
                if os.path.exists(img_path) and len(summary+title) < 270:
                    out.add_body(md.image(
                        img_path.replace(f"{OUTEXTRA}/", ""),
                        align="left",
                        setAsBackground=True
                    ))
            out.add_page(md.h2(title), summary)
            out.marp_end()

    info(f"Generating {out_ppt_name}..")
    out.close_file()
    generate_ppt(MD_DEST, out_ppt_name)
    print(f"Done! {out_ppt_name}")

    return os.path.abspath(out_ppt_name)

def gradio_Interface():
    """Run the tool check, then build and launch the Gradio UI for gradio_run.

    The inputs are listed in the same order as gradio_run's parameters:
    video id, chunk size, no-images flag, no-chapters flag, output format.
    """
    init_check()
    app = gr.Interface(
        fn=gradio_run,
        inputs=[
            "text",
            # The third positional argument of gr.Slider is the initial value,
            # not the step, so the old call started the slider at 50, below
            # its own minimum of 300.
            # NOTE(review): 50 was presumably meant as the step; 500 is chosen
            # as an in-range starting point - confirm the intended default.
            gr.Slider(minimum=300, maximum=2000, value=500, step=50, label="Chunk Size", info="More chunk size = longer text & shorter numbber of slides"),
            gr.Checkbox(label="No Images", info="Don't keep images in output ( gives more spaces for larger text)"),
            gr.Checkbox(label="No Chapters", info="Don't use chapter based chunking"),
            # Preselect the same format gradio_run defaults to, so the handler
            # does not receive an empty selection.
            gr.Dropdown(["pptx", "pdf", "html"], value="pdf", label="file format", info="which file format to generte.")
        ],
        outputs="file"
    )
    app.launch()
    
if __name__ == "__main__":
    logger.info("Starting gradio interface..")

    # Check each output directory on its own. The previous version created
    # OUTEXTRA unconditionally whenever OUTDIR was missing, which raises
    # FileExistsError if OUTEXTRA is already there.
    for directory in (OUTDIR, OUTEXTRA):
        if not os.path.exists(directory):
            os.mkdir(directory)
    gradio_Interface()