import torch
import gradio as gr
from transformers import AutoTokenizer, pipeline
from transformers.pipelines.audio_utils import ffmpeg_read

import tempfile
import os
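
# Demo configuration: MODEL_NAME is a Chichewa fine-tune of Whisper large-v3
# hosted on the Hugging Face Hub.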
MODEL_NAME = "dmatekenya/whisper-large-v3-chichewa"
BATCH_SIZE = 8
FILE_LIMIT_MB = 1000  # upload size limit in MB (not enforced below)
YT_LENGTH_LIMIT_S = 3600  # YouTube length limit in seconds (not enforced below)
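
# Load the tokenizer from the base openai/whisper-large-v3 checkpoint; the
# fine-tuned repository presumably reuses Whisper's vocabulary.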
tokenizer = AutoTokenizer.from_pretrained("openai/whisper-large-v3")

# Use the first GPU when available, otherwise fall back to CPU.
device = 0 if torch.cuda.is_available() else "cpu"
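
# Build the ASR pipeline around the fine-tuned checkpoint. chunk_length_s=30
# splits long recordings into 30-second chunks, which is how the demo handles
# audio of arbitrary length.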
pipe = pipeline(
    task="automatic-speech-recognition",
    tokenizer=tokenizer,
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)

def transcribe(inputs, task="transcribe"):
    # `task` needs a default because the interface below passes only the audio
    # input; Whisper's other supported mode is "translate".
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
    return text
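
# Gradio UI: a single Interface that takes an uploaded audio file and returns
# the transcription as plain text.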
demo = gr.Blocks()

file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources="upload", type="filepath", label="Audio file"),
    ],
    outputs="text",
    title="Whisper Large V3: Transcribe Audio",
    description=(
        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the fine-tuned Whisper"
        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
        " of arbitrary length."
    ),
    allow_flagging="never",
)
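
# Mount the interface as a single tab inside the Blocks app and launch it.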
with demo:
    gr.TabbedInterface([file_transcribe], ["Audio file"])

demo.launch()