Spaces:
Sleeping
Sleeping
File size: 2,988 Bytes
d7ae26e a39ac0d d7ae26e 6d352f5 e59bf3f d5f6791 6d352f5 a39ac0d 6ce3643 a39ac0d 27b508c a39ac0d 59bf002 27b508c a39ac0d 22fe498 1609670 6d352f5 27b508c 6d352f5 27b508c 6ce3643 d7ae26e 27b508c 6d352f5 a39ac0d 6d352f5 d4afb45 d7ae26e 27b508c a39ac0d d7ae26e 6d352f5 a39ac0d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
import gradio as gr
from transformers import pipeline
import torch
import librosa
import json
import os
import whisper
# Module-level configuration shared by the rest of the app.

# Hugging Face access token taken from the environment (None when HF_TOKEN is unset).
auth_token = os.environ.get("HF_TOKEN")

# Display name of each supported language -> the language code used to select it.
target_lang_options = dict(
    English="eng",
    Luganda="lug",
    Acholi="ach",
    Runyankole="nyn",
    Lugbara="lgg",
)

# Display names only, in declaration order.
languages = [*target_lang_options]
def format_and_group_timestamps(chunks, interval=5.0):
    """Render word-level chunks as a transcript plus time-bucketed lines.

    Args:
        chunks: Iterable of dicts, each holding a ``'text'`` entry (the word)
            and a ``'timestamp'`` entry that unpacks to ``(start, end)``.
        interval: Width of each time bucket in seconds. Must be positive.

    Returns:
        A string made of the full transcript, a separator, and one
        ``(bucket_start, bucket_end) -- words`` line for every bucket that
        contains at least one word, in order of first appearance.

    Raises:
        ValueError: If ``interval`` is zero or negative.
    """
    if interval <= 0:
        # Fail with a clear message instead of a ZeroDivisionError (or
        # nonsensical buckets for a negative width) further down.
        raise ValueError(f"interval must be positive, got {interval!r}")

    words = []
    buckets = {}
    for chunk in chunks:
        start, _end = chunk['timestamp']
        word = chunk['text']
        words.append(word)
        # Snap the word's start time down to the start of its bucket.
        bucket_start = int(start // interval) * interval
        buckets.setdefault(bucket_start, []).append(word)

    transcript = " ".join(words).strip()
    # ``:g`` prints 5.0 as "5" so the default header still reads "every 5 seconds".
    header = (
        f"Transcript: {transcript}\n\n-------\n\n"
        f"word-stamped transcripts (every {interval:g} seconds):\n\n"
    )

    lines = []
    for bucket_start, bucket_words in buckets.items():
        joined = " ".join(bucket_words)
        lines.append(f"({bucket_start}, {bucket_start + interval}) -- {joined}\n")
    return header + "".join(lines)
# Modified transcribe_audio function to use Whisper for English
def transcribe_audio(input_file, language, chunk_length_s=10, stride_length_s=(4, 2), return_timestamps="word"):
target_lang_options = {"English": "eng", "Luganda": "lug", "Acholi": "ach", "Runyankole": "nyn", "Lugbara": "lgg"}
target_lang_code = target_lang_options[language]
device = "cuda" if torch.cuda.is_available() else "cpu"
if target_lang_code == "eng":
# Use Whisper for English
model = whisper.load_model("small")
result = model.transcribe(input_file)
# Assuming you want to keep the formatting function for consistency
return result["text"]
else:
# Use specified model for other languages
model_id = "Sunbird/sunbird-mms"
auth_token = os.environ.get("HF_TOKEN")
pipe = pipeline(model=model_id, device=device, token=auth_token)
pipe.tokenizer.set_target_lang(target_lang_code)
pipe.model.load_adapter(target_lang_code)
output = pipe(input_file, chunk_length_s=chunk_length_s, stride_length_s=stride_length_s, return_timestamps=return_timestamps)
formatted_output = format_and_group_timestamps(output['chunks'])
return formatted_output
# Build the Gradio UI: an audio upload and a language selector go in, and the
# transcription comes back as plain text.
description = '''ASR with salt-mms'''

_audio_input = gr.Audio(sources="upload", type="filepath", label="upload file to transcribe")
_language_input = gr.Dropdown(choices=list(target_lang_options.keys()), label="Language", value="English")
_transcription_output = gr.Textbox(label="Transcription")

iface = gr.Interface(
    fn=transcribe_audio,
    inputs=[_audio_input, _language_input],
    outputs=_transcription_output,
    description=description,
)

# Start serving the app.
iface.launch()
|