"""Gradio app: multilingual ASR with per-request language-adapter switching.

Uses Facebook's MMS (Massively Multilingual Speech) model and swaps in the
tokenizer target language + model adapter for the language the user selects.
"""

import os

import gradio as gr
import librosa
import torch
from transformers import pipeline

# Authentication token for Hugging Face (None is fine for public models)
auth_token = os.environ.get("HF_TOKEN")

# Mapping of UI language names to their ISO 639-3 codes used by MMS adapters
target_lang_options = {
    "English": "eng",
    "Luganda": "lug",
    "Acholi": "ach",
    "Runyankole": "nyn",
    "Lugbara": "lgg",
}
languages = list(target_lang_options.keys())

# Determine device based on CUDA availability
device = "cuda" if torch.cuda.is_available() else "cpu"

base_model_id = "facebook/mms-1b-all"
pipe = pipeline(
    "automatic-speech-recognition",
    model=base_model_id,
    device=device,
    token=auth_token,
)

# MMS models expect 16 kHz input; resample at load time so files recorded at
# other rates are not transcribed at the wrong speed.
_MMS_SAMPLE_RATE = 16_000


def transcribe_audio(input_file, language, chunk_length_s=10,
                     stride_length_s=(4, 2), return_timestamps="word"):
    """Transcribe an audio file in the selected language.

    Parameters
    ----------
    input_file : str
        Path to the uploaded audio file.
    language : str
        One of the keys of ``target_lang_options``.
    chunk_length_s : int
        Length in seconds of each chunk fed to the pipeline.
    stride_length_s : tuple[int, int]
        Left/right overlap in seconds between consecutive chunks.
    return_timestamps : str
        Timestamp granularity passed to the pipeline (e.g. ``"word"``).

    Returns
    -------
    dict
        The pipeline output containing the transcription text (and timestamps).

    Raises
    ------
    KeyError
        If ``language`` is not a known language name.
    """
    target_lang_code = target_lang_options[language]

    # Dynamically set the target language and load the corresponding adapter.
    # NOTE(review): this mutates the shared global pipeline, so concurrent
    # requests in different languages could race — acceptable for a demo app.
    pipe.tokenizer.set_target_lang(target_lang_code)
    # Each MMS language code directly corresponds to an adapter name.
    pipe.model.load_adapter(target_lang_code)

    # Resample to the model's expected rate (sr=None would keep the file's
    # native rate, which the pipeline would misinterpret as 16 kHz).
    audio_data, _ = librosa.load(input_file, sr=_MMS_SAMPLE_RATE)
    output = pipe(
        audio_data,
        chunk_length_s=chunk_length_s,
        stride_length_s=stride_length_s,
        return_timestamps=return_timestamps,
    )
    return output


description = "ASR with dynamic language adaptation"

iface = gr.Interface(
    fn=transcribe_audio,
    inputs=[
        gr.Audio(source="upload", type="filepath", label="Upload file to transcribe"),
        gr.Dropdown(choices=languages, label="Language", value="English"),
    ],
    outputs=gr.Textbox(label="Transcription"),
    description=description,
)

# Only start the server when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()