from transformers import pipeline
import gradio as gr
import numpy as np
import librosa

# Initialize the speech recognition pipeline
pipe = pipeline("automatic-speech-recognition", model="oyemade/w2v-bert-2.0-yoruba-CV17.0")

def transcribe(audio):
    if audio is None:
        return "No audio detected. Please try again."
    try:
        # The interface below uses type="filepath", so both microphone and
        # uploaded audio arrive as a file path; the tuple branch is kept as a
        # defensive fallback for raw (sample_rate, data) input.
        if isinstance(audio, str):
            # Load the audio file and resample it to the model's expected 16 kHz
            audio, sr = librosa.load(audio, sr=16000)
        elif isinstance(audio, tuple):
            # Gradio audio components with type="numpy" return a (sample_rate, data) tuple
            sr, audio = audio
            if sr != 16000:
                # librosa >= 0.10 requires keyword arguments for resample
                audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
        else:
            return "Invalid audio format. Please try again."

        # Reject near-silent audio before running the model
        if np.max(np.abs(audio)) < 0.01:
            return "Audio is too quiet. Please speak louder or choose a different file and try again."

        text = pipe(audio)["text"]
        return text
    except Exception as e:
        return f"An error occurred: {str(e)}"

# Create the Gradio interface
iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources=["microphone", "upload"], type="filepath", label="Audio Input"),
    ],
    outputs="text",
    title="Neoform AI: Yoruba Speech Recognition",
    description="Realtime demo for Yoruba speech recognition using a fine-tuned Wav2Vec-BERT model. "
                "You can either use your microphone or upload an MP3 file. "
                "https://neoformai.com",
)

# Launch the interface
iface.launch()
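
# A minimal sketch of exercising transcribe() directly, without the UI,
# assuming a local 16 kHz-compatible audio file named "sample_yoruba.wav"
# (a hypothetical placeholder, not shipped with this app). Since
# iface.launch() above blocks, run a check like this before launching,
# or in a separate session:
#
#     print(transcribe("sample_yoruba.wav"))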