"""Gradio demo: speaker diarization (pyannote) followed by Whisper transcription."""

import os

import gradio as gr
import torchaudio
from pyannote.audio import Pipeline
from pyannote.core import Annotation
from pydub import AudioSegment
from transformers import pipeline

# The Hugging Face access token is read from the HF_TOKEN environment variable.
# Never hard-code credentials in source: anyone who can read the file can use them.
diarization_pipe = Pipeline.from_pretrained(
    "pyannote/speaker-diarization@2.1",
    use_auth_token=os.environ.get("HF_TOKEN"),
)

# Load the speech-to-text model (Whisper).
# Change to "your-username/the-name-you-picked" to use your own checkpoint.
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")


def _resolve_audio_path(*candidates):
    """Return the path of the first non-empty input, or None if all are empty.

    Each candidate may be a path string or an object exposing the path through
    a ``name`` attribute (as temp-file wrappers do).
    """
    for candidate in candidates:
        if candidate is None:
            continue
        path = getattr(candidate, "name", candidate)
        if path:
            return path
    return None


def transcribe_with_diarization(audio_path, mic_path=None):
    """Transcribe an audio file one diarized speaker turn at a time.

    The diarization pipeline splits the recording into speaker turns; each
    turn is cut out of the waveform and passed to the Whisper ASR pipeline.

    Args:
        audio_path: Uploaded audio, as a path string or a file-like object
            with a ``name`` attribute. May be None when ``mic_path`` is given.
        mic_path: Optional path to a microphone recording, used only when
            ``audio_path`` is empty.

    Returns:
        The transcripts of all speaker turns, in diarization order, joined
        with single spaces.

    Raises:
        ValueError: If neither input provides audio.
        RuntimeError: If the diarization pipeline is not available.
    """
    source_path = _resolve_audio_path(audio_path, mic_path)
    if source_path is None:
        raise ValueError(
            "No audio provided: upload a file or record from the microphone."
        )
    if diarization_pipe is None:
        raise RuntimeError(
            "Diarization pipeline is not loaded; check the HF_TOKEN "
            "environment variable and access to the pyannote model."
        )

    # Get speaker segments using the diarization model.
    diarization_result = diarization_pipe(source_path)

    # Decode the audio once; every segment below is a slice of this tensor.
    waveform, sample_rate = torchaudio.load(source_path, normalize=True)

    transcripts = []
    # itertracks(yield_label=True) yields (segment, track, label) in that order.
    for segment, _track, _speaker in diarization_result.itertracks(yield_label=True):
        start_sample = int(segment.start * sample_rate)
        end_sample = int(segment.end * sample_rate)
        interval_audio = waveform[:, start_sample:end_sample]
        if interval_audio.shape[1] == 0:
            # Zero-length slice (degenerate or out-of-range turn): nothing to transcribe.
            continue

        # The ASR pipeline takes single-channel raw audio, so down-mix the
        # channels. Passing the array directly avoids a shared temp file that
        # concurrent requests would overwrite.
        mono = interval_audio.mean(dim=0).numpy()
        result = asr_pipe({"raw": mono, "sampling_rate": sample_rate})

        # Strip so that joining with " " does not produce doubled spaces.
        text = result["text"].strip()
        if text:
            transcripts.append(text)

    # Combine the transcriptions from all speaker turns.
    return " ".join(transcripts)


# NOTE(review): the title and description mention "small"/"large" Hindi models,
# but the ASR pipeline above loads "openai/whisper-tiny" -- confirm which is intended.
iface = gr.Interface(
    fn=transcribe_with_diarization,
    inputs=[
        gr.File(label="Audio File"),
        gr.Audio(source="microphone", type="filepath"),
    ],
    outputs="text",
    title="Whisper small Hindi with Speaker Diarization",
    description=(
        "Real-time demo for Hindi speech recognition "
        "using a fine-tuned Whisper large model with speaker diarization."
    ),
)

iface.launch()