|
from __future__ import annotations |
|
|
|
import argparse |
|
import os |
|
import tempfile |
|
from typing import Callable |
|
|
|
from gradio_client import Client |
|
|
|
from groq import Groq |
|
from loguru import logger |
|
|
|
from hf import hf_transcript, get_whisper_hf_client |
|
from logs import configure_logging |
|
from rate_limit import rate_limit_bypass |
|
|
|
from settings import app_settings |
|
from transcribe import get_full_transcript, parse_audio |
|
|
|
|
|
@rate_limit_bypass(sleep_time=20)
def summarize_groq(client: Groq, text: str) -> str:
    """Summarize one transcript chunk with a single Groq chat completion.

    The request is non-streaming and takes its model, system prompt,
    temperature and token limit from ``app_settings``.

    NOTE(review): the ``rate_limit_bypass(sleep_time=20)`` decorator
    presumably retries after sleeping when the API rate limit is hit;
    confirm against the ``rate_limit`` module.

    Args:
        client: Groq API client used to issue the chat-completion request.
        text: Transcript fragment interpolated into the user prompt.

    Returns:
        The message content of the first completion choice, or an empty
        string when the response carries no content.
    """
    # Adjacent literals are concatenated verbatim, so every instruction must
    # end with its own ". " separator; otherwise two sentences are glued
    # together in the prompt that is sent to the model.
    user_prompt = (
        "Кратко перескажи видео по транскрипции, "
        "как будто это только часть видео. "
        "Используй оформление и ненумерованные пункты. "
        "Оформи название блока через **Название**. "
        "Не пиши о том, что это краткое изложение. "
        f"Вот транскрипция: {text}"
    )
    completion = client.chat.completions.create(
        model=app_settings.model,
        messages=[
            {"role": "system", "content": app_settings.system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=app_settings.temperature,
        max_tokens=app_settings.max_tokens,
        top_p=1,
        stream=False,
        stop=None,
    )
    # The message content may be absent in a response; normalise it to ""
    # so that callers which concatenate summaries always receive a str.
    return completion.choices[0].message.content or ""
|
|
|
|
|
def summarize(
    texts: list[str],
    client: Client | Groq,
    summarizer: Callable[[Client | Groq, str], str] = summarize_groq,
) -> str:
    """Summarize transcript chunks one by one and combine the results.

    Args:
        texts: Transcript chunks, processed in the given order.
        client: API client passed unchanged to ``summarizer``.
        summarizer: Callable that turns ``(client, chunk)`` into a summary
            string; defaults to ``summarize_groq``.

    Returns:
        The per-chunk summaries in input order, separated by a blank line;
        an empty string when ``texts`` is empty.
    """
    logger.info("Summarizing transcript...")
    summaries = []
    for number, chunk in enumerate(texts, start=1):
        logger.info(f"Summarizing chunk #{number}")
        summaries.append(summarizer(client, chunk))

    # A blank line between chunk summaries keeps the last bullet of one
    # chunk from being glued to the heading that opens the next one.
    return "\n\n".join(summaries)
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: read the video path from argv, build the
    # API clients, then transcribe and summarize inside a temporary
    # directory that is removed when the ``with`` block exits.
    configure_logging()

    cli = argparse.ArgumentParser("Video transcript summarizer")
    cli.add_argument("video_path", help="Path to video file", type=str)
    options = cli.parse_args()

    summarizer_client = Groq(api_key=app_settings.groq_api_key)
    whisper_client = get_whisper_hf_client()

    with tempfile.TemporaryDirectory() as workdir:
        # parse_audio receives the source video path and the target
        # "audio.mp3" path located inside the temporary directory.
        audio_file = os.path.join(workdir, "audio.mp3")
        parse_audio(options.video_path, audio_file)

        chunks = get_full_transcript(
            workdir,
            whisper_client,
            one_file_transcript_func=hf_transcript,
        )
        # Summarize while the temporary directory still exists, then print
        # the combined summary to stdout.
        print(summarize(chunks, summarizer_client))
|
|