File size: 2,618 Bytes
889a5cc
 
 
 
6395f19
 
 
 
 
889a5cc
6395f19
 
889a5cc
14b80ce
889a5cc
b1d5ba9
 
f95617d
 
 
 
 
 
 
 
 
 
 
 
889a5cc
 
 
47af204
f9b643b
3830518
4872dd1
47af204
4872dd1
47af204
 
889a5cc
f95617d
66c8ad2
6395f19
 
3830518
889a5cc
 
6395f19
15ad32c
889a5cc
e8db4c4
14b80ce
e8db4c4
 
 
 
 
 
 
 
 
 
 
05e5cdb
889a5cc
 
47af204
 
 
14b80ce
47af204
59fa2b8
48b5bc5
 
 
 
 
 
 
 
2cb87d0
48b5bc5
 
 
 
 
6395f19
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess
from modelscope import snapshot_download

import datetime
import math
import io
import os
import tempfile
import json
from typing import Optional

import torch
import gradio as gr

from config import model_config

try:
    import spaces
    USING_SPACES = True
except ImportError:
    USING_SPACES = False

def gpu_decorator(func):
    """Return ``func`` wrapped by ``spaces.GPU`` when ``spaces`` is importable.

    ``USING_SPACES`` is set by the guarded ``import spaces`` at module load;
    when that import failed, ``func`` is handed back untouched.
    """
    return spaces.GPU(func) if USING_SPACES else func
        
# Run on the first CUDA device when torch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Resolve the configured model identifier to a local directory.
model_dir = snapshot_download(model_config["model_dir"])

# Single shared model instance, built once at import time and reused by
# every transcription request.
# NOTE(review): `vad_kwargs` is supplied but no `vad_model` is passed here;
# confirm against the FunASR docs whether VAD is actually active.
model = AutoModel(
    model=model_dir,
    vad_kwargs={"max_single_segment_time": 15000},
    ncpu=torch.get_num_threads(),
    batch_size=1,
    hub="hf",
    device=device,
)

@gpu_decorator
def transcribe_audio(file_path, vad_model="fsmn-vad", vad_kwargs='{"max_single_segment_time": 15000}',
                     batch_size=1, language="auto", use_itn=True, batch_size_s=60,
                     merge_vad=True, merge_length_s=15, batch_size_threshold_s=50,
                     hotword=" ", ban_emo_unk=True):
    """Transcribe one audio file with the module-level ``model``.

    This function never raises: every failure is reported as a string so the
    UI can show it in the output box.

    Args:
        file_path: Path of the audio file to transcribe. ``None`` or an empty
            string (no audio supplied) yields an explanatory message.
        vad_model: Accepted for interface compatibility; not used here.
        vad_kwargs: JSON text. It is validated but not forwarded to
            ``model.generate``.
        batch_size: Accepted for interface compatibility; not used here.
        language: Forwarded unchanged to ``model.generate``.
        use_itn: Forwarded unchanged to ``model.generate``.
        batch_size_s: Forwarded unchanged to ``model.generate``.
        merge_vad: Forwarded unchanged to ``model.generate``.
        merge_length_s: Forwarded unchanged to ``model.generate``.
        batch_size_threshold_s: Forwarded unchanged to ``model.generate``.
        hotword: Forwarded unchanged to ``model.generate``.
        ban_emo_unk: Forwarded unchanged to ``model.generate``.

    Returns:
        The first element of the sequence returned by ``model.generate``, or
        a ``str`` describing why no transcription could be produced.
    """
    # No audio was supplied: answer with a clear message instead of letting
    # the model fail on a missing input path.
    if not file_path:
        return "No audio provided. Please upload or record an audio clip."

    # The parsed value is not consumed below, but malformed JSON is still
    # rejected (as before) so the user learns the field is invalid.
    try:
        json.loads(vad_kwargs)
    except (TypeError, ValueError) as e:
        return f"Invalid VAD Kwargs JSON: {e}"

    try:
        res = model.generate(
            input=file_path,
            cache={},
            language=language,
            use_itn=use_itn,
            batch_size_s=batch_size_s,
            merge_vad=merge_vad,
            merge_length_s=merge_length_s,
            batch_size_threshold_s=batch_size_threshold_s,
            hotword=hotword,
            ban_emo_unk=ban_emo_unk,
        )

        # An empty result would otherwise surface as "list index out of range".
        if not res:
            return "No transcription result was produced for this audio."

        # NOTE(review): the raw first result is returned as-is; the imported
        # `rich_transcription_postprocess` is not applied. Confirm whether the
        # UI should show post-processed text instead.
        return res[0]

    except Exception as e:
        # Deliberately broad: this is the UI boundary, and the error text is
        # shown to the user rather than crashing the request.
        return str(e)

# Input widgets, listed in the same order as the parameters of
# `transcribe_audio` so Gradio passes each value positionally.
inputs = [
    gr.Audio(type="filepath"),
    gr.Textbox(label="VAD Model", value="fsmn-vad"),
    gr.Textbox(label="VAD Kwargs", value='{"max_single_segment_time": 15000}'),
    gr.Slider(minimum=1, maximum=10, step=1, value=1, label="Batch Size"),
    gr.Textbox(label="Language", value="auto"),
    gr.Checkbox(label="Use ITN", value=True),
    gr.Slider(minimum=30, maximum=120, step=1, value=60, label="Batch Size (seconds)"),
    gr.Checkbox(label="Merge VAD", value=True),
    gr.Slider(minimum=5, maximum=60, step=1, value=15, label="Merge Length (seconds)"),
    gr.Slider(minimum=10, maximum=100, step=1, value=50, label="Batch Size Threshold (seconds)"),
    gr.Textbox(label="Hotword", value=" "),
    gr.Checkbox(label="Ban Emotional Unknown", value=True),
]

# Single text box that displays whatever `transcribe_audio` returns.
outputs = gr.Textbox(label="Transcription")

# Build the interface and start serving it as soon as this module runs.
gr.Interface(
    title="ASR Transcription with FunASR",
    fn=transcribe_audio,
    inputs=inputs,
    outputs=outputs,
).launch()