File size: 2,988 Bytes
d7ae26e
a39ac0d
d7ae26e
 
6d352f5
e59bf3f
d5f6791
6d352f5
a39ac0d
 
6ce3643
 
 
 
 
a39ac0d
 
 
 
 
 
 
 
 
 
 
 
27b508c
a39ac0d
 
 
 
59bf002
27b508c
a39ac0d
 
22fe498
1609670
 
6d352f5
27b508c
 
 
 
 
6d352f5
27b508c
6ce3643
d7ae26e
27b508c
 
 
 
 
 
 
 
6d352f5
a39ac0d
6d352f5
d4afb45
d7ae26e
27b508c
a39ac0d
 
d7ae26e
6d352f5
 
 
a39ac0d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import gradio as gr
from transformers import pipeline
import torch
import librosa
import json
import os
import whisper

# Assuming other necessary imports and setup are already done

# Hugging Face auth token, read once from the environment (may be None).
auth_token = os.environ.get("HF_TOKEN")

# Display name -> language code used to select the ASR model adapter.
target_lang_options = {
    "English": "eng",
    "Luganda": "lug",
    "Acholi": "ach",
    "Runyankole": "nyn",
    "Lugbara": "lgg",
}
# Iterating a dict yields its keys, so this equals list(...keys()).
languages = list(target_lang_options)


# Helper function to format and group word timestamps
def format_and_group_timestamps(chunks, interval=5.0):
    """Build a full transcript plus word timestamps bucketed into intervals.

    Args:
        chunks: iterable of dicts as emitted by a HF ASR pipeline with
            return_timestamps="word" — each has 'text' (the word) and
            'timestamp' (a (start, end) pair in seconds).
        interval: bucket width in seconds for the grouped section.

    Returns:
        One string: the transcript, a separator, then a line per interval
        listing the words whose start time falls inside that interval.
    """
    grouped = {}
    transcript = ""
    for chunk in chunks:
        start, end = chunk['timestamp']
        word = chunk['text']
        transcript += f"{word} "
        # Bucket by the interval that contains the word's start time.
        interval_start = int(start // interval) * interval
        grouped.setdefault(interval_start, []).append((start, end, word))

    # Fixes: removed a stray "'" that leaked into the output after the
    # transcript, and the header now reflects the actual interval width
    # instead of a hard-coded "5 seconds".
    formatted_output = (
        f"Transcript: {transcript.strip()}\n\n-------\n\n"
        f"word-stamped transcripts (every {interval:g} seconds):\n\n"
    )
    for interval_start, words in grouped.items():
        formatted_output += f"({interval_start}, {interval_start + interval}) -- {' '.join([w[2] for w in words])}\n"
    return formatted_output

# Modified transcribe_audio function to use Whisper for English
def transcribe_audio(input_file, language, chunk_length_s=10, stride_length_s=(4, 2), return_timestamps="word"):
    """Transcribe an audio file, routing English to Whisper and the other
    supported languages to the Sunbird MMS model.

    Args:
        input_file: path to the audio file to transcribe.
        language: display name; must be a key of target_lang_options
            (raises KeyError otherwise).
        chunk_length_s, stride_length_s, return_timestamps: passed through
            to the HF pipeline for the non-English path.

    Returns:
        Plain transcript text for English; otherwise the interval-grouped
        transcript produced by format_and_group_timestamps.
    """
    # Fix: reuse the module-level mapping and auth token instead of
    # shadowing them with duplicate local copies that could drift.
    target_lang_code = target_lang_options[language]
    device = "cuda" if torch.cuda.is_available() else "cpu"

    if target_lang_code == "eng":
        # Use Whisper for English. NOTE(review): the model is reloaded on
        # every call; caching it module-level would speed repeat requests.
        model = whisper.load_model("small")
        result = model.transcribe(input_file)
        return result["text"]

    # Use the Sunbird MMS model for the other languages, selecting the
    # per-language adapter on both the tokenizer and the model.
    model_id = "Sunbird/sunbird-mms"
    pipe = pipeline(model=model_id, device=device, token=auth_token)
    pipe.tokenizer.set_target_lang(target_lang_code)
    pipe.model.load_adapter(target_lang_code)

    output = pipe(input_file, chunk_length_s=chunk_length_s, stride_length_s=stride_length_s, return_timestamps=return_timestamps)
    return format_and_group_timestamps(output['chunks'])

# Interface setup remains the same
description = '''ASR with salt-mms'''

# Build the Gradio UI: an uploaded audio file plus a language picker are
# fed to transcribe_audio, and the transcript is shown in a textbox.
audio_input = gr.Audio(sources="upload", type="filepath", label="upload file to transcribe")
language_input = gr.Dropdown(choices=list(target_lang_options.keys()), label="Language", value="English")
iface = gr.Interface(
    fn=transcribe_audio,
    inputs=[audio_input, language_input],
    outputs=gr.Textbox(label="Transcription"),
    description=description,
)

# Launch the interface
iface.launch()