import whisperx
import streamlit as st
import torch
import tempfile
import subprocess


def transcribe(audio_file):
    # Pick the device: WhisperX/torch expect "cuda", not "gpu"
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    batch_size = 16  # reduce if low on GPU memory
    compute_type = "int8"  # change to "float16" if you have spare GPU memory (int8 may reduce accuracy)
    YOUR_HF_TOKEN = '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'  # Hugging Face token for the pyannote diarization models

    # 1. Transcribe with original whisper (batched)
    model = whisperx.load_model("tiny", device=device, compute_type=compute_type)

    # audio_file is the 16 kHz mono WAV written by the submit handler below
    audio = whisperx.load_audio(audio_file)
    result = model.transcribe(audio, batch_size=batch_size)
    st.write("Transcribed! Here's what we have so far:")
    st.write(result["segments"]) # before alignment

    # delete model if low on GPU resources
    # import gc; gc.collect(); torch.cuda.empty_cache(); del model

    # 2. Align whisper output
    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
    result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
    st.write("Aligned! Here's what we have so far:")
    st.write(result["segments"]) # after alignment

    # delete model if low on GPU resources
    # import gc; gc.collect(); torch.cuda.empty_cache(); del model_a

    # 3. Assign speaker labels
    diarize_model = whisperx.DiarizationPipeline(use_auth_token=YOUR_HF_TOKEN, device=device)

    # add min/max number of speakers if known
    diarize_segments = diarize_model(audio_file)
    # diarize_model(audio_file, min_speakers=min_speakers, max_speakers=max_speakers)

    result = whisperx.assign_word_speakers(diarize_segments, result)
    st.write(diarize_segments)
    st.write(result["segments"]) # segments are now assigned speaker IDs


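# A minimal sketch (not called by the app) of how the speaker-labelled segments could be
# rendered as plain text. It assumes the WhisperX output keys "speaker", "start", "end"
# and "text"; segments without a confident speaker assignment may lack the "speaker" key.
def format_transcript(segments):
    lines = []
    for seg in segments:
        speaker = seg.get("speaker", "UNKNOWN")
        lines.append(f"[{seg['start']:.1f}-{seg['end']:.1f}] {speaker}: {seg['text'].strip()}")
    return "\n".join(lines)

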
st.title("Automated Transcription")

form = st.form(key='my_form')
uploaded_file = form.file_uploader("Choose a file")

submit = form.form_submit_button("Transcribe!")


if submit and uploaded_file is not None:
    # temporary files: the raw upload and the converted 16 kHz mono WAV
    tmp_dir = tempfile.TemporaryDirectory()
    input_file = tmp_dir.name + '/input'
    temp_file = tmp_dir.name + '/mono.wav'

    # write the uploaded bytes to disk so ffmpeg can read them
    with open(input_file, 'wb') as f:
        f.write(uploaded_file.getvalue())

    # convert to 16 kHz mono PCM WAV before handing the path to whisperx
    cmd = f"ffmpeg -y -i {input_file} -acodec pcm_s16le -ar 16000 -ac 1 {temp_file}"
    subprocess.Popen(cmd, shell=True).wait()

    transcribe(temp_file)
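
# To try this out, save the script (e.g. as app.py; the filename is arbitrary) and run:
#   streamlit run app.py
# ffmpeg must be available on PATH, and the diarization step needs a valid Hugging Face
# token with access to the pyannote speaker-diarization models.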