# ASR demo Space: transcribes uploaded audio with Whisper (English) or Sunbird MMS (Ugandan languages).
import gradio as gr
from transformers import pipeline
import torch
import librosa
import json
import os
import whisper

# Hugging Face auth token for gated/private model access (set in the Space's secrets).
auth_token = os.environ.get("HF_TOKEN")

# Display name -> ISO 639-3 language code used by the Sunbird MMS adapters.
target_lang_options = {"English": "eng", "Luganda": "lug", "Acholi": "ach", "Runyankole": "nyn", "Lugbara": "lgg"}
languages = list(target_lang_options.keys())
# Helper function to format and group word timestamps
def format_and_group_timestamps(chunks, interval=5.0):
    """Build a readable transcript from word-level timestamp chunks.

    Args:
        chunks: list of dicts like {'timestamp': (start, end), 'text': word},
            as produced by a HF ASR pipeline with return_timestamps="word".
        interval: width (seconds) of each grouping window.

    Returns:
        The full transcript followed by one line of words per time window.
    """
    grouped = {}
    words = []
    for chunk in chunks:
        start, end = chunk['timestamp']
        word = chunk['text']
        words.append(word)
        # Bucket each word by the window its start time falls in.
        interval_start = int(start // interval) * interval
        grouped.setdefault(interval_start, []).append((start, end, word))
    transcript = " ".join(words)
    # NOTE: the original emitted a stray apostrophe after the transcript and
    # hardcoded "5 seconds" regardless of `interval`; both fixed here.
    formatted_output = (
        f"Transcript: {transcript}\n\n-------\n\n"
        f"word-stamped transcripts (every {interval:g} seconds):\n\n"
    )
    for interval_start, group in grouped.items():
        formatted_output += f"({interval_start}, {interval_start + interval}) -- {' '.join(w[2] for w in group)}\n"
    return formatted_output
# Modified transcribe_audio function to use Whisper for English
def transcribe_audio(input_file, language, chunk_length_s=10, stride_length_s=(4, 2), return_timestamps="word"):
    """Transcribe an audio file, routing English to Whisper and all other
    supported languages to the Sunbird MMS model with a per-language adapter.

    Args:
        input_file: filesystem path of the audio file to transcribe.
        language: display name of the language (a key of target_lang_options).
        chunk_length_s, stride_length_s, return_timestamps: forwarded to the
            HF pipeline on the non-English branch.

    Returns:
        Plain transcript text for English; otherwise a formatted,
        word-stamped transcript (see format_and_group_timestamps).

    Raises:
        KeyError: if `language` is not one of the supported display names.
    """
    target_lang_options = {"English": "eng", "Luganda": "lug", "Acholi": "ach", "Runyankole": "nyn", "Lugbara": "lgg"}
    target_lang_code = target_lang_options[language]
    device = "cuda" if torch.cuda.is_available() else "cpu"

    if target_lang_code == "eng":
        # English: use OpenAI Whisper (kept as plain text, no timestamp grouping).
        model = whisper.load_model("small")
        result = model.transcribe(input_file)
        return result["text"]

    # Other languages: Sunbird MMS with a language-specific tokenizer target
    # and adapter weights loaded on demand.
    model_id = "Sunbird/sunbird-mms"
    auth_token = os.environ.get("HF_TOKEN")
    pipe = pipeline(model=model_id, device=device, token=auth_token)
    pipe.tokenizer.set_target_lang(target_lang_code)
    pipe.model.load_adapter(target_lang_code)
    output = pipe(input_file, chunk_length_s=chunk_length_s, stride_length_s=stride_length_s, return_timestamps=return_timestamps)
    return format_and_group_timestamps(output['chunks'])
# Gradio UI: audio upload + language dropdown -> transcription textbox.
description = '''ASR with salt-mms'''
iface = gr.Interface(
    fn=transcribe_audio,
    inputs=[
        gr.Audio(sources="upload", type="filepath", label="upload file to transcribe"),
        gr.Dropdown(choices=list(target_lang_options.keys()), label="Language", value="English"),
    ],
    outputs=gr.Textbox(label="Transcription"),
    description=description,
)

# Launch the interface
iface.launch()