Spaces:

xsestech
/

summary

Sleeping

App Files Files Community

xsestech committed on Sep 8, 2024

Commit

d5c679f

•

0 Parent(s):

Created app

Browse files

Files changed (18) hide show

.gitignore +271 -0
.idea/.gitignore +8 -0
.idea/VTT.iml +8 -0
.idea/inspectionProfiles/Project_Default.xml +51 -0
.idea/inspectionProfiles/profiles_settings.xml +6 -0
.idea/misc.xml +7 -0
.idea/modules.xml +8 -0
.idea/vcs.xml +6 -0
app.py +43 -0
hf.py +21 -0
llm.py +20 -0
local_transcript.py +67 -0
logs.py +65 -0
rate_limit.py +37 -0
remote_llm.py +78 -0
requirements.txt +67 -0
settings.py +24 -0
transcribe.py +84 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,271 @@

+### macOS template
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+# Icon must end with two \r
+Icon
+# Thumbnails
+._*
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+### PyCharm+all template
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+# User-specific stuff
+.idea/**/workspace.xml
+.idea/**/tasks.xml
+.idea/**/usage.statistics.xml
+.idea/**/dictionaries
+.idea/**/shelf
+# AWS User-specific
+.idea/**/aws.xml
+# Generated files
+.idea/**/contentModel.xml
+# Sensitive or high-churn files
+.idea/**/dataSources/
+.idea/**/dataSources.ids
+.idea/**/dataSources.local.xml
+.idea/**/sqlDataSources.xml
+.idea/**/dynamic.xml
+.idea/**/uiDesigner.xml
+.idea/**/dbnavigator.xml
+# Gradle
+.idea/**/gradle.xml
+.idea/**/libraries
+# Gradle and Maven with auto-import
+# When using Gradle or Maven with auto-import, you should exclude module files,
+# since they will be recreated, and may cause churn.  Uncomment if using
+# auto-import.
+# .idea/artifacts
+# .idea/compiler.xml
+# .idea/jarRepositories.xml
+# .idea/modules.xml
+# .idea/*.iml
+# .idea/modules
+# *.iml
+# *.ipr
+# CMake
+cmake-build-*/
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+# File-based project format
+*.iws
+# IntelliJ
+out/
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+# JIRA plugin
+atlassian-ide-plugin.xml
+# Cursive Clojure plugin
+.idea/replstate.xml
+# SonarLint plugin
+.idea/sonarlint/
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+# Editor-based Rest Client
+.idea/httpRequests
+# Android studio 3.1+ serialized cache file
+.idea/caches/build_file_checksums.ser
+examples
+flagged

.idea/.gitignore ADDED Viewed

	@@ -0,0 +1,8 @@

+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml

.idea/VTT.iml ADDED Viewed

	@@ -0,0 +1,8 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="jdk" jdkName="$USER_HOME$/miniconda3/envs/vtt-remote" jdkType="Python SDK" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>

.idea/inspectionProfiles/Project_Default.xml ADDED Viewed

	@@ -0,0 +1,51 @@

+<component name="InspectionProjectProfileManager">
+  <profile version="1.0">
+    <option name="myName" value="Project Default" />
+    <inspection_tool class="Eslint" enabled="true" level="WARNING" enabled_by_default="true" />
+    <inspection_tool class="PyCompatibilityInspection" enabled="true" level="WARNING" enabled_by_default="true">
+      <option name="ourVersions">
+        <value>
+          <list size="8">
+            <item index="0" class="java.lang.String" itemvalue="3.12" />
+            <item index="1" class="java.lang.String" itemvalue="3.6" />
+            <item index="2" class="java.lang.String" itemvalue="3.7" />
+            <item index="3" class="java.lang.String" itemvalue="3.8" />
+            <item index="4" class="java.lang.String" itemvalue="3.9" />
+            <item index="5" class="java.lang.String" itemvalue="3.10" />
+            <item index="6" class="java.lang.String" itemvalue="3.11" />
+            <item index="7" class="java.lang.String" itemvalue="3.13" />
+          </list>
+        </value>
+      </option>
+    </inspection_tool>
+    <inspection_tool class="PyPackageRequirementsInspection" enabled="false" level="WARNING" enabled_by_default="false">
+      <option name="ignoredPackages">
+        <value>
+          <list size="6">
+            <item index="0" class="java.lang.String" itemvalue="dnspython" />
+            <item index="1" class="java.lang.String" itemvalue="pydantic" />
+            <item index="2" class="java.lang.String" itemvalue="alembic" />
+            <item index="3" class="java.lang.String" itemvalue="certifi" />
+            <item index="4" class="java.lang.String" itemvalue="pydantic-core" />
+            <item index="5" class="java.lang.String" itemvalue="click" />
+          </list>
+        </value>
+      </option>
+    </inspection_tool>
+    <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
+      <option name="ignoredErrors">
+        <list>
+          <option value="N803" />
+          <option value="N802" />
+        </list>
+      </option>
+    </inspection_tool>
+    <inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
+      <option name="ignoredIdentifiers">
+        <list>
+          <option value="rabbit_backend.util.abc_registry.ABCRegistry.*" />
+        </list>
+      </option>
+    </inspection_tool>
+  </profile>
+</component>

.idea/inspectionProfiles/profiles_settings.xml ADDED Viewed

	@@ -0,0 +1,6 @@

+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>

.idea/misc.xml ADDED Viewed

	@@ -0,0 +1,7 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="Black">
+    <option name="sdkName" value="Python 3.11 (2)" />
+  </component>
+  <component name="ProjectRootManager" version="2" project-jdk-name="$USER_HOME$/miniconda3/envs/vtt-remote" project-jdk-type="Python SDK" />
+</project>

.idea/modules.xml ADDED Viewed

	@@ -0,0 +1,8 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/VTT.iml" filepath="$PROJECT_DIR$/.idea/VTT.iml" />
+    </modules>
+  </component>
+</project>

.idea/vcs.xml ADDED Viewed

	@@ -0,0 +1,6 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="" vcs="Git" />
+  </component>
+</project>

app.py ADDED Viewed

	@@ -0,0 +1,43 @@

+import os
+import tempfile
+import gradio as gr
+from groq import Groq
+from hf import hf_transcript, get_whisper_hf_client
+from logs import configure_logging
+from remote_llm import summarize
+from transcribe import parse_audio, get_full_transcript, groq_transcript
+def gradio_pipeline(video: gr.Video, whisper_inference, groq_api_token):
+    groq_client = Groq(api_key=groq_api_token)
+    hf_client = get_whisper_hf_client()
+    print(video)
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        parse_audio(video, os.path.join(tmpdirname, "audio.mp3"))
+        if whisper_inference == "hf":
+            transcript = get_full_transcript(tmpdirname, hf_client, one_file_transcript_func=hf_transcript)
+        elif whisper_inference == "groq":
+            transcript = get_full_transcript(tmpdirname, groq_client, one_file_transcript_func=groq_transcript)
+        return summarize(transcript, groq_client)
+if __name__ == "__main__":
+    configure_logging()
+    demo = gr.Interface(
+        fn=gradio_pipeline,
+        inputs=[
+            gr.Video(),
+            gr.Radio(choices=["groq", "hf"], value="hf", label="Whisper inference"),
+            gr.Text(max_lines=1, type="password",
+                    placeholder="Enter your groq API key",
+                    label="groq API key")
+        ],
+        outputs=gr.Markdown(
+            value="# Here will be the summary...",
+            label="Summary",
+            show_copy_button=True,
+        ),
+        allow_flagging="never")
+    demo.launch()

hf.py ADDED Viewed

	@@ -0,0 +1,21 @@

+from gradio_client import Client, handle_file
+from loguru import logger
+from rate_limit import rate_limit_bypass
+def get_whisper_hf_client() -> Client:
+    API_URL = "sanchit-gandhi/whisper-jax-spaces"
+    return Client(API_URL)
+@rate_limit_bypass(sleep_time=2)
+def hf_transcript(client: Client, audio_path: str):
+    text, runtime = client.predict(
+        inputs=handle_file(audio_path),
+        task="transcribe",
+        return_timestamps=False,
+        api_name="/predict_1",
+    )
+    logger.info(text)
+    return text

llm.py ADDED Viewed

	@@ -0,0 +1,20 @@

+from llama_cpp import Llama
+def get_llm(model_path: str = "models/Meta-Llama-3.1-8B-Instruct-IQ4_XS.gguf") -> Llama:
+    return Llama(
+        model_path=model_path,
+        n_gpu_layers=-1,
+    )
+def summarize_transcript(llm: Llama, transcript: str) -> str:
+    summary = llm.create_chat_completion(
+        messages=[
+            {
+                "role": "user",
+                "content": f"Summarize the following video transcript: {transcript}",
+            }
+        ]
+    )
+    return summary

local_transcript.py ADDED Viewed

	@@ -0,0 +1,67 @@

+import argparse
+import faulthandler
+import gc
+import os
+import tempfile
+import torch
+import whisperx
+from whisperx.asr import FasterWhisperPipeline
+def get_device():
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    # device = "mps" if torch.backends.mps.is_available() else device
+    return device
+def generate_subtitles_from_audio(
+        audio_file_path: str,
+        model: FasterWhisperPipeline,
+        batch_size: int = 8
+):
+    audio = whisperx.load_audio(audio_file_path)
+    result = model.transcribe(audio, batch_size=batch_size, language="ru", )
+    return result
+def generate_subtitles_from_video(
+        video_path: str,
+        model_name: str = "base",
+        batch_size: int = 8,
+        compute_type: str = "int8",
+):
+    _, audio_file = tempfile.mkstemp()
+    device = get_device()
+    print("Loading model:")
+    model = whisperx.load_model(model_name, device, compute_type=compute_type, language="ru")
+    print("Parsing audio:")
+    parse_audio(video_path, audio_file)
+    print("Generating subtitles:")
+    result = generate_subtitles_from_audio(audio_file, model, batch_size=batch_size)
+    os.remove(audio_file)
+    del model
+    gc.collect()
+    return result
+def add_whisper_args(arg_parser: argparse.ArgumentParser):
+    arg_parser.add_argument("video", help="video file")
+    arg_parser.add_argument("--compute_type", help="Base type for model", default="int8",
+                            choices=["int8", "float16", "float32"])
+    arg_parser.add_argument("--whisper_model", help="model to use", default="large-v2")
+    arg_parser.add_argument("--batch_size", help="Batch size for inference", default=4, type=int)
+if __name__ == "__main__":
+    faulthandler.enable()
+    parser = argparse.ArgumentParser(description="Get video subtitles from a video")
+    add_whisper_args(parser)
+    args = parser.parse_args()
+    print(generate_subtitles_from_video(args.video, args.whisper_model, args.batch_size, args.compute_type))

logs.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import logging
+import sys
+from typing import Union
+from loguru import logger
+from settings import app_settings
+class InterceptHandler(logging.Handler):
+    """
+    Default handler from examples in loguru documentation.
+    This handler intercepts all log requests and
+    passes them to loguru.
+    For more info see:
+    https://loguru.readthedocs.io/en/stable/overview.html#entirely-compatible-with-standard-logging
+    """
+    def emit(self, record: logging.LogRecord) -> None:  # pragma: no cover
+        """Propagates logs to loguru.
+        Parameters
+        ----------
+        record
+            record to log.
+        """
+        try:
+            level: Union[str, int] = logger.level(record.levelname).name
+        except ValueError:
+            level = record.levelno
+        # Find caller from where originated the logged message
+        frame, depth = logging.currentframe(), 2
+        while frame.f_code.co_filename == logging.__file__:
+            frame = frame.f_back  # type: ignore
+            depth += 1
+        logger.opt(depth=depth, exception=record.exc_info).log(
+            level,
+            record.getMessage(),
+        )
+def configure_logging() -> None:  # pragma: no cover
+    """Configures logging."""
+    intercept_handler = InterceptHandler()
+    logging.basicConfig(handlers=[intercept_handler], level=logging.NOTSET)
+    for logger_name in logging.root.manager.loggerDict:
+        if logger_name.startswith("uvicorn."):
+            logging.getLogger(logger_name).handlers = []
+    # change handler for default uvicorn logger
+    logging.getLogger("uvicorn").handlers = [intercept_handler]
+    logging.getLogger("uvicorn.access").handlers = [intercept_handler]
+    # set logs output, level and format
+    logger.remove()
+    logger.add(
+        sys.stdout,
+        level=app_settings.log_level,
+    )

rate_limit.py ADDED Viewed

	@@ -0,0 +1,37 @@

+import functools
+import time
+from groq import RateLimitError
+from httpx import ReadTimeout
+from loguru import logger
+def rate_limit_bypass(sleep_time: int = 1, max_retries: int = 10):
+    """Bypass rate limit for groq
+    Parameters
+    ----------
+    sleep_time : int, optional, default 1
+    max_retries : int, optional, default 10"""
+    def decorate_rate_limit(func):
+        @functools.wraps(func)
+        def wrapper_rate_limit(*args, **kwargs):
+            retries = 0
+            while True:
+                try:
+                    result = func(*args, **kwargs)
+                except (RateLimitError, ReadTimeout) as e:
+                    logger.info(f"Rate limit exceeded, sleeping for {sleep_time} seconds")
+                    logger.debug(repr(e))
+                    time.sleep(sleep_time)
+                    retries += 1
+                    if retries > max_retries:
+                        raise e
+                    continue
+                return result
+        return wrapper_rate_limit
+    return decorate_rate_limit

remote_llm.py ADDED Viewed

	@@ -0,0 +1,78 @@

+from __future__ import annotations
+import argparse
+import os
+import tempfile
+from typing import Callable
+from gradio_client import Client
+# import loguru
+from groq import Groq
+from loguru import logger
+from hf import hf_transcript, get_whisper_hf_client
+from logs import configure_logging
+from rate_limit import rate_limit_bypass
+# from remote_whisper import hf_transcribe_audio
+from settings import app_settings
+from transcribe import get_full_transcript, parse_audio
+SEGMENT_TIME = 1500  # NOTE(review): not referenced in the visible code of this module; settings.py declares segment_time with the same value - confirm whether this constant is still needed
+@rate_limit_bypass(sleep_time=20)
+def summarize_groq(client: Groq, text: str):
+    completion = client.chat.completions.create(
+        model=app_settings.model,
+        messages=[
+            {
+                "role": "system",
+                "content": "Summarize the video transcript excerpt including"
+                           " a concise title that reflects the content. "
+                           "Wrap the title with **markdown bold notation**. "
+                           "Write the summary as if you are continuing a conversation without needing "
+                           "to signal a beginning. Answer only in russian."
+                           "Here is the transcript: "
+            },
+            {
+                "role": "user",
+                "content": text,
+            }
+        ],
+        temperature=app_settings.temperature,
+        max_tokens=1024,
+        top_p=1,
+        stream=False,
+        stop=None,
+    )
+    return completion.choices[0].message.content
+def summarize(
+        texts: list[str],
+        client: Client | Groq,
+        summarizer: Callable[[Client | Groq, str], str] = summarize_groq,
+) -> str:
+    logger.info("Summarizing transcript...")
+    result = ""
+    i = 1
+    for chunk in texts:
+        logger.info(f"Summarizing chunk #{i}")
+        i += 1
+        result += summarizer(client, chunk)
+    return result
+if __name__ == "__main__":
+    configure_logging()
+    parser = argparse.ArgumentParser("Video transcript summarizer")
+    parser.add_argument("video_path", help="Path to video file", type=str)
+    args = parser.parse_args()
+    groq_client = Groq(api_key=app_settings.groq_api_key)
+    hf_client = get_whisper_hf_client()
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        parse_audio(args.video_path, os.path.join(tmpdirname, "audio.mp3"))
+        transcript = get_full_transcript(tmpdirname, hf_client, one_file_transcript_func=hf_transcript)
+        print(summarize(transcript, groq_client))

requirements.txt ADDED Viewed

	@@ -0,0 +1,67 @@

+aiofiles==23.2.1
+annotated-types==0.7.0
+ansible==8.0.0
+ansible-core==2.15.0
+anyio==4.4.0
+argcomplete==3.1.1
+certifi==2024.8.30
+charset-normalizer==3.3.2
+click==8.1.7
+contourpy==1.3.0
+cycler==0.12.1
+distro==1.9.0
+exceptiongroup==1.2.2
+fastapi==0.112.4
+ffmpy==0.4.0
+filelock==3.16.0
+fonttools==4.53.1
+fsspec==2024.9.0
+gradio==4.43.0
+gradio_client==1.3.0
+groq==0.11.0
+h11==0.14.0
+httpcore==1.0.5
+httpx==0.27.2
+huggingface-hub==0.24.6
+idna==3.8
+importlib_resources==6.4.4
+Jinja2==3.1.4
+kiwisolver==1.4.7
+loguru==0.7.2
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+matplotlib==3.9.2
+mdurl==0.1.2
+numpy==2.1.1
+orjson==3.10.7
+packaging==24.1
+pandas==2.2.2
+pillow==10.4.0
+pydantic==2.9.0
+pydantic-settings==2.4.0
+pydantic_core==2.23.2
+pydub==0.25.1
+Pygments==2.18.0
+pyparsing==3.1.4
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+python-multipart==0.0.9
+pytz==2024.1
+PyYAML==6.0
+requests==2.32.3
+resolvelib==1.0.1
+rich==13.8.0
+ruff==0.6.4
+semantic-version==2.10.0
+shellingham==1.5.4
+six==1.16.0
+sniffio==1.3.1
+starlette==0.38.5
+tomlkit==0.12.0
+tqdm==4.66.5
+typer==0.12.5
+typing_extensions==4.12.2
+tzdata==2024.1
+urllib3==2.2.2
+uvicorn==0.30.6
+websockets==12.0

settings.py ADDED Viewed

	@@ -0,0 +1,24 @@

+import enum
+from typing import Optional
+from pydantic_settings import BaseSettings
+class LogLevel(str, enum.Enum):
+    """Log level names accepted by the ``log_level`` setting.
+    Members also subclass ``str``, and each value equals its member name.
+    NOTE(review): the selected member is passed to ``logger.add(level=...)``
+    in logs.py; confirm that every name listed here (NOTSET and FATAL in
+    particular) is a level loguru recognizes.
+    """
+    NOTSET = "NOTSET"
+    DEBUG = "DEBUG"
+    INFO = "INFO"
+    WARNING = "WARNING"
+    ERROR = "ERROR"
+    FATAL = "FATAL"
+class Settings(BaseSettings):
+    temperature: float = 1
+    model: str = "llama3-8b-8192"
+    log_level: LogLevel = LogLevel.INFO
+    segment_time: int = 1500
+app_settings = Settings()  # module-level settings instance imported by logs.py, remote_llm.py and transcribe.py

transcribe.py ADDED Viewed

	@@ -0,0 +1,84 @@

+from __future__ import annotations
+import os
+import time
+from typing import Callable
+from gradio_client import Client
+from groq import Groq
+from loguru import logger
+from rate_limit import rate_limit_bypass
+from settings import app_settings
+def parse_audio(input_file_path: str, output_file_path: str) -> None:
+    """Parse audio file from video file to mp3 format"""
+    os.system(f"ffmpeg -y -i {input_file_path} -f mp3 {output_file_path}")
+def split_audio_file(audio_folder_path: str, segment_time: int = app_settings.segment_time) -> None:
+    """Splits an audio file into multiple segments using ffmpeg.
+    Parameters
+    ----------
+    audio_folder_path : str
+        The path to the audio file.
+    segment_time : int, optional
+        Time in seconds for each segment.
+    """
+    audio_file_path = os.path.join(audio_folder_path, "audio.mp3")
+    output_file_template = os.path.join(audio_folder_path, "audio_%03d.mp3")
+    os.system(
+        f"ffmpeg -y -i {audio_file_path} -f segment -segment_time {segment_time} -c copy {output_file_template}")
+@rate_limit_bypass(sleep_time=10)
+def groq_transcript(client: Groq, audio_file_path: str) -> str:
+    """Get transcript for one file.
+    Parameters
+    ----------
+    client : Groq
+    audio_file_path : str
+        The path to the audio file to transcribe."""
+    with open(audio_file_path, "rb") as file:
+        transcription = client.audio.transcriptions.create(
+            file=(audio_file_path, file.read()),
+            model="whisper-large-v3",
+        )
+        logger.debug(f"Transcription: {transcription.text}")
+        return transcription.text
+def get_full_transcript(
+        audio_folder_path: str,
+        client: Client | Groq,
+        one_file_transcript_func: Callable[[Client | Groq, str], str] = groq_transcript,
+) -> list[str]:
+    """Get full transcript for all audio files in a folder.
+    Parameters
+    ----------
+    audio_folder_path : str
+        folder, where all audio files are located.
+    one_file_transcript_func : Callable[[str], str], optional
+        Function that transcribes a single audio file.
+    client : Client | Groq
+        A client object to pass to transcript function
+    Returns
+    -------
+    list[str]
+        A list of transcripts for all audio files in a folder.
+    ."""
+    logger.info("Getting transcript...")
+    split_audio_file(audio_folder_path)
+    transcript = []
+    for file_name in os.listdir(audio_folder_path):
+        if file_name.startswith("audio_"):
+            audio_file_path = os.path.join(audio_folder_path, file_name)
+            transcript += [one_file_transcript_func(client, audio_file_path)]
+            time.sleep(2)
+    return transcript