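"""Speaker diarization + transcription demo.

Splits an input audio file into speaker turns with pyannote.audio, transcribes
each turn with OpenAI Whisper, and serves the result through a Gradio app.

Assumed setup (not in the original script): the gated
pyannote/speaker-diarization pipeline requires a Hugging Face access token,
read here from the HF_TOKEN environment variable.
"""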
import os

import gradio as gr
import pandas as pd
from pydub import AudioSegment
from pyannote.audio import Pipeline
import whisper

# Load the Whisper ASR model and the diarization pipeline once at startup.
model = whisper.load_model("medium")
# The pyannote pipeline is gated: read the Hugging Face token from the
# HF_TOKEN environment variable instead of hardcoding it in source.
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization",
                                    use_auth_token=os.environ.get("HF_TOKEN"))
def diarization(inp_audio):
    """Diarize the input audio, transcribe each speaker turn, and return a report."""
    diarization_result = pipeline(inp_audio)
    speakertime = []
    output = ""
    # Collect each speaker turn (start, end, label) found by the pipeline.
    for turn, _, speaker in diarization_result.itertracks(yield_label=True):
        speakertime.append([turn.start, turn.end, speaker])
        output += f"start={turn.start:.1f}s stop={turn.end:.1f}s speaker_{speaker}\n"
    df = pd.DataFrame(speakertime, columns=['start', 'end', 'speaker'])
    # Transcribe the audio slice for each speaker turn.
    df['text'] = [generatetext(inp_audio, row.start, row.end) for row in df.itertuples()]
    # Append a "speaker: text" transcript to the report and save a copy to disk.
    with open('my_file.txt', 'w') as my_file:
        for row in df.itertuples():
            line = f"{row.speaker}: {row.text}\n"
            my_file.write(line)
            output += line
    return output


def generatetext(filename, starttime, endtime):
    """Cut [starttime, endtime] (in seconds) out of the audio file and transcribe it."""
    t1 = starttime * 1000  # pydub slices in milliseconds
    t2 = endtime * 1000
    segment = AudioSegment.from_wav(filename)[t1:t2]
    segment.export('audio.wav', format="wav")
    result = model.transcribe(whisper.load_audio('audio.wav'))
    return result.get("text")
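
# Example (hypothetical file): generatetext("meeting.wav", 0.0, 5.0) returns
# the Whisper transcript of the first five seconds of meeting.wav.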

# Build the Gradio UI: an audio upload, a transcript textbox, and a button
# that runs the diarization + transcription pipeline.
block = gr.Blocks()
with block:
    with gr.Group():
        with gr.Box():
            with gr.Row():
                # mirror_webcam is not a gr.Audio parameter, so it was dropped.
                inp_audio = gr.Audio(
                    label="Input Audio",
                    type="filepath",
                )
                outputdialogs = gr.Textbox(label="Transcript")
            btn = gr.Button("Generate Text")
        btn.click(diarization, inputs=[inp_audio], outputs=[outputdialogs],
                  api_name="view_api")
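# enable_queue routes requests through Gradio's queue so that long-running
# diarization jobs are not cut off by the default request timeout.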
block.launch(enable_queue=True, debug=True)