Spaces:
Runtime error
Runtime error
import os

import gradio as gr
import torchaudio
from pyannote.audio import Pipeline
from pyannote.core import Annotation
from pydub import AudioSegment
from transformers import pipeline
# Speaker-diarization model. The checkpoint is gated on the Hugging Face Hub,
# so an access token is required. It is read from the HF_TOKEN environment
# variable (configure it as a Space secret) and must never be committed to
# source control.
diarization_pipe = Pipeline.from_pretrained(
    "pyannote/speaker-diarization@2.1",
    use_auth_token=os.environ.get("HF_TOKEN"),
)
if diarization_pipe is None:
    # Fail at start-up with a clear message instead of a later
    # "'NoneType' object is not callable" inside the request handler.
    raise RuntimeError(
        "Could not load 'pyannote/speaker-diarization@2.1'. Set HF_TOKEN to a "
        "token whose account has accepted the model's user conditions."
    )

# Load the speech-to-text model (Whisper, fine-tuned for Hindi).
asr_pipe = pipeline(
    "automatic-speech-recognition",
    model="SyedAunZaidi/whisper-small-hi",
)
def transcribe_with_diarization(audio_path):
    """Transcribe a recording one diarized speaker turn at a time.

    The module-level ``diarization_pipe`` splits the recording into speaker
    turns; each turn is cut out of the decoded waveform and transcribed with
    the module-level ``asr_pipe``. Pieces are joined in the order the
    diarization yields them.

    Args:
        audio_path: Path of the audio file to process.

    Returns:
        The transcripts of all turns joined by single spaces, or an empty
        string when no audio was supplied or no usable turn was found.
    """
    if not audio_path:
        # Nothing was recorded/submitted; there is nothing to transcribe.
        return ""

    diarization_result = diarization_pipe(audio_path)

    # Decode the recording once, outside the loop, from the file that was
    # actually submitted.
    waveform, sample_rate = torchaudio.load(audio_path, normalize=True)
    total_samples = waveform.shape[1]

    transcripts = []
    # itertracks(yield_label=True) yields (segment, track, label) in that
    # order; only the segment's time bounds are needed here.
    for segment, _track, _speaker in diarization_result.itertracks(yield_label=True):
        start_sample = max(0, int(segment.start * sample_rate))
        end_sample = min(total_samples, int(segment.end * sample_rate))
        if end_sample <= start_sample:
            # Turn is shorter than one sample or lies outside the audio.
            continue

        # Down-mix to mono: the ASR pipeline expects a 1-D array when it is
        # given raw samples. Passing samples in memory also avoids writing a
        # shared temporary file that concurrent requests could overwrite.
        clip = waveform[:, start_sample:end_sample].mean(dim=0).numpy()
        result = asr_pipe({"raw": clip, "sampling_rate": sample_rate})
        transcripts.append(result["text"].strip())

    # Combine the transcriptions from all speaker turns.
    return " ".join(transcripts)
# Web UI: record from the microphone, hand the recording to
# transcribe_with_diarization as a file path, and show the returned text.
iface = gr.Interface(
    fn=transcribe_with_diarization,
    # type="filepath" makes Gradio pass the recording's path to the handler.
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs="text",
    # Title/description name the checkpoint that is actually loaded
    # (whisper-small-hi).
    title="Whisper Small Hindi with Speaker Diarization",
    description=(
        "Real-time demo for Hindi speech recognition using a fine-tuned "
        "Whisper small model with speaker diarization."
    ),
)
iface.launch()