"""Speaker-diarized transcription demo.

Splits an uploaded audio file into speaker turns with pyannote, transcribes
each turn with Whisper, writes a transcript to ``my_file.txt``, and serves
the whole flow through a Gradio UI.
"""
import os

import gradio as gr
import pandas as pd
from pydub import AudioSegment
from pyannote.audio import Pipeline
import whisper

# Load the heavyweight models once at startup so every request reuses them.
model = whisper.load_model("medium")
# SECURITY: never hard-code an access token in source (the original file
# leaked a live Hugging Face token). Read it from the environment instead.
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization",
    use_auth_token=os.environ.get("HF_TOKEN"),
)


def diarization(inp_audio):
    """Diarize ``inp_audio`` and transcribe each speaker turn.

    Parameters
    ----------
    inp_audio : str
        Filesystem path to the input audio (Gradio passes a filepath;
        ``generatetext`` assumes it is a WAV file).

    Returns
    -------
    str
        The detected turns (one ``start=..s stop=..s speaker_..`` line each)
        followed by ``speaker: text`` dialog lines, newline-terminated.

    Side effects: writes the dialog transcript to ``my_file.txt`` and echoes
    progress to stdout, as the original script did.
    """
    # Avoid shadowing this function's own name with the pipeline result.
    annotation = pipeline(inp_audio)

    turns = []
    turn_lines = []
    for turn, _, speaker in annotation.itertracks(yield_label=True):
        turns.append((turn.start, turn.end, speaker))
        line = f"start={turn.start:.1f}s stop={turn.end:.1f}s speaker_{speaker}"
        print(line)
        turn_lines.append(line)

    df = pd.DataFrame(turns, columns=["start", "end", "speaker"])
    # Transcribe each turn separately so text can be attributed to a speaker.
    df["text"] = [
        generatetext(inp_audio, row.start, row.end) for row in df.itertuples()
    ]

    dialog_lines = [f"{row.speaker}: {row.text}" for row in df.itertuples()]
    # ``with`` guarantees the handle is closed (the original also leaked a
    # second handle via ``print(open(...).read())``).
    transcript = "\n".join(dialog_lines) + "\n"
    with open("my_file.txt", "w") as my_file:
        my_file.write(transcript)
    print(transcript)

    # str.join instead of repeated ``+=`` avoids quadratic string building.
    return "\n".join(turn_lines + dialog_lines) + "\n"


def generatetext(filename, starttime, endtime):
    """Transcribe the slice of ``filename`` between ``starttime`` and ``endtime``.

    Parameters
    ----------
    filename : str
        Path to a WAV file.
    starttime, endtime : float
        Segment boundaries in seconds (pydub slices in milliseconds).

    Returns
    -------
    str or None
        Whisper's transcript text for the segment (``None`` if the
        transcription result carries no ``"text"`` key).
    """
    # pydub slices by integer milliseconds; truncate the float offsets.
    start_ms = int(starttime * 1000)
    end_ms = int(endtime * 1000)
    segment = AudioSegment.from_wav(filename)[start_ms:end_ms]
    # Whisper's loader wants a file on disk, so round-trip through a temp WAV.
    segment.export("audio.wav", format="wav")
    audio = whisper.load_audio("audio.wav")
    result = model.transcribe(audio)
    print(result.get("text"))
    return result.get("text")


block = gr.Blocks()
with block:
    with gr.Group():
        with gr.Box():
            with gr.Row().style():
                # NOTE(review): the original passed ``mirror_webcam=False``,
                # which is a gr.Video parameter, not gr.Audio — removed.
                inp_audio = gr.Audio(
                    label="Input Audio",
                    type="filepath",
                )
                outputdialogs = gr.Textbox()
                btn = gr.Button("Generate Text")
                btn.click(
                    diarization,
                    inputs=[inp_audio],
                    outputs=[outputdialogs],
                    api_name="view_api",
                )

block.launch(enable_queue=True, debug=True)