# --- Hugging Face file-viewer page header (scrape residue, not program code) ---
# kabita-choudhary's picture
# Update app.py
# 731d7a6
# raw
# history blame contribute delete
# No virus
# 2.25 kB
import os
import tempfile

import gradio as gr
import pandas as pd
import whisper
from pyannote.audio import Pipeline
from pydub import AudioSegment
# Whisper speech-to-text model, loaded once at import time and reused by
# generatetext() for every transcribed segment.
model = whisper.load_model("medium")

# Access token for the pyannote speaker-diarization pipeline. The HF_TOKEN
# environment variable takes precedence when it is set.
# SECURITY NOTE(review): the literal fallback below is a credential committed
# to source control. It is kept only so existing deployments keep loading;
# revoke it and delete the fallback once HF_TOKEN is configured.
_hf_token = os.environ.get("HF_TOKEN", "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")
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization",
    use_auth_token=_hf_token,
)
def diarization(inp_audio):
    """Diarize an audio file and transcribe every detected speaker turn.

    Runs the module-level ``pipeline`` on ``inp_audio`` and calls
    ``generatetext`` once per turn. The speaker-labelled transcript is also
    written to ``my_file.txt`` in the working directory and echoed to stdout.

    Args:
        inp_audio: Path to the audio file to process. ``None`` or an empty
            string means no audio was supplied.

    Returns:
        One ``start=..s stop=..s speaker_<label>`` line per turn, followed by
        one ``<label>: <text>`` line per turn, as a single string. An empty
        string when no audio was supplied.
    """
    if not inp_audio:
        # Nothing to process: return early instead of failing inside the
        # diarization pipeline.
        return ""

    annotation = pipeline(inp_audio)

    turns = []
    timeline_lines = []
    for turn, _, speaker in annotation.itertracks(yield_label=True):
        line = f"start={turn.start:.1f}s stop={turn.end:.1f}s speaker_{speaker}"
        print(line)
        timeline_lines.append(line + "\n")
        turns.append((turn.start, turn.end, speaker))

    # Transcribe each turn separately so the text can be attributed to its
    # speaker.
    transcript = "".join(
        f"{speaker}: {generatetext(inp_audio, start, end)}\n"
        for start, end, speaker in turns
    )

    # Explicit UTF-8 so non-ASCII transcripts do not depend on the platform's
    # default encoding; the context manager guarantees the file is closed.
    with open("my_file.txt", "w", encoding="utf-8") as my_file:
        my_file.write(transcript)
    # Echo exactly what was written, without re-opening the file.
    print(transcript)

    return "".join(timeline_lines) + transcript
def generatetext(filename, starttime, endtime):
    """Transcribe one time slice of an audio file with the Whisper model.

    Args:
        filename: Path to the source audio file.
        starttime: Start of the slice, in seconds.
        endtime: End of the slice, in seconds.

    Returns:
        The ``"text"`` entry of the Whisper transcription result (``None`` if
        the result has no such entry).
    """
    # pydub slices audio by milliseconds.
    start_ms = starttime * 1000
    end_ms = endtime * 1000

    # from_file() detects the input format instead of assuming WAV, so other
    # uploaded formats are accepted as well.
    clip = AudioSegment.from_file(filename)[start_ms:end_ms]

    # A per-call temporary directory keeps the intermediate clip from being
    # overwritten by another call and removes it afterwards.
    with tempfile.TemporaryDirectory() as tmp_dir:
        clip_path = os.path.join(tmp_dir, "audio.wav")
        # export() hands back the open output file; close it explicitly
        # before the clip is read again.
        clip.export(clip_path, format="wav").close()
        # load_audio() reads the whole clip into memory, so the file can be
        # deleted as soon as this returns.
        audio = whisper.load_audio(clip_path)

    result = model.transcribe(audio)
    print(result)
    print(result.get("text"))
    return result.get("text")
# Gradio UI: an audio input, a textbox for the result, and a button that runs
# diarization() on the selected audio file.
# NOTE(review): the original indentation was lost, so the nesting of the
# components below is a reconstruction -- confirm against the intended layout.
block = gr.Blocks()
with block:
    with gr.Group():
        with gr.Box():
            with gr.Row():
                # type="filepath" makes the component pass a file path to the
                # click handler.
                inp_audio = gr.Audio(
                    label="Input Audio",
                    type="filepath",
                )
                outputdialogs = gr.Textbox()
            btn = gr.Button("Generate Text")
    # Event listeners must be registered inside the Blocks context.
    btn.click(
        diarization,
        inputs=[inp_audio],
        outputs=[outputdialogs],
        api_name="view_api",
    )
block.launch(enable_queue=True, debug=True)