from transformers import pipeline
import gradio as gr
import torchaudio
from pyannote.audio import Pipeline

# Load the speaker-diarization pipeline. This model is gated, so pass your own
# Hugging Face access token (do not commit a real token to source control).
diarization_pipe = Pipeline.from_pretrained(
    "pyannote/speaker-diarization@2.1",
    use_auth_token="YOUR_HF_TOKEN",
)

# Load the speech-to-text model (Whisper).
# Change to "your-username/the-name-you-picked" to use your own fine-tuned checkpoint.
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")


def transcribe_with_diarization(audio_path):
    # Get speaker segments using the diarization model
    diarization_result = diarization_pipe(audio_path)

    # Load the audio once; each speaker segment is sliced from this waveform
    waveform, sample_rate = torchaudio.load(audio_path, normalize=True)

    # Extract speaker segments and transcribe them using Whisper ASR
    transcripts = []
    for segment, _, speaker in diarization_result.itertracks(yield_label=True):
        start_sample = int(segment.start * sample_rate)
        end_sample = int(segment.end * sample_rate)
        interval_audio = waveform[:, start_sample:end_sample]

        # Export the interval audio as a temporary WAV file and transcribe it
        torchaudio.save("interval_audio.wav", interval_audio, sample_rate)
        transcript = asr_pipe("interval_audio.wav")
        transcripts.append(f"{speaker}: {transcript['text'].strip()}")

    # Combine the transcriptions from all speakers
    return "\n".join(transcripts)


iface = gr.Interface(
    fn=transcribe_with_diarization,
    # The function takes a single audio path, so expose one audio input
    # (microphone recording saved to a temporary file path)
    inputs=gr.Audio(source="microphone", type="filepath", label="Audio"),
    outputs="text",
    title="Whisper Large Hindi with Speaker Diarization",
    description="Real-time demo for Hindi speech recognition using a fine-tuned Whisper large model with speaker diarization.",
)

iface.launch()
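
# Quick sanity check without launching the Gradio UI (a minimal sketch; assumes a
# local audio file named "sample.wav" exists in the working directory):
#
#   print(transcribe_with_diarization("sample.wav"))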