funasr-svsmall / app.py
TaiYouWeb's picture
Update app.py
e710876 verified
raw
history blame
2.84 kB
from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess
from modelscope import snapshot_download
import datetime
import math
import io
import os
import tempfile
import json
from typing import Optional
from pyannote.audio import Audio, Pipeline
from pyannote.core import Segment
import torch
import gradio as gr
from config import model_config
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model_dir = snapshot_download(model_config['model_dir'])
model = AutoModel(
model=model_dir,
trust_remote_code=False,
remote_code="./model.py",
vad_model="fsmn-vad",
punc_model="ct-punc",
vad_kwargs={"max_single_segment_time": 30000},
ncpu=torch.get_num_threads(),
batch_size=1,
hub="hf",
device=device,
)
def transcribe_audio(file_path, vad_model="fsmn-vad", punc_model="ct-punc", vad_kwargs='{"max_single_segment_time": 30000}',
batch_size=1, language="auto", use_itn=True, batch_size_s=60,
merge_vad=True, merge_length_s=15, batch_size_threshold_s=50,
hotword=" ", spk_model="cam++", ban_emo_unk=True):
try:
vad_kwargs = json.loads(vad_kwargs)
temp_file_path = file_path
res = model.generate(
input=temp_file_path,
cache={},
language=language,
use_itn=use_itn,
batch_size_s=batch_size_s,
merge_vad=merge_vad,
merge_length_s=merge_length_s,
batch_size_threshold_s=batch_size_threshold_s,
hotword=hotword,
spk_model=spk_model,
ban_emo_unk=ban_emo_unk
)
text = rich_transcription_postprocess(res[0]["text"])
return text
except Exception as e:
return str(e)
inputs = [
gr.Audio(type="filepath"),
gr.Textbox(value="fsmn-vad", label="VAD Model"),
gr.Textbox(value="ct-punc", label="PUNC Model"),
gr.Textbox(value='{"max_single_segment_time": 30000}', label="VAD Kwargs"),
gr.Slider(1, 10, value=1, step=1, label="Batch Size"),
gr.Textbox(value="auto", label="Language"),
gr.Checkbox(value=True, label="Use ITN"),
gr.Slider(30, 120, value=60, step=1, label="Batch Size (seconds)"),
gr.Checkbox(value=True, label="Merge VAD"),
gr.Slider(5, 60, value=15, step=1, label="Merge Length (seconds)"),
gr.Slider(10, 100, value=50, step=1, label="Batch Size Threshold (seconds)"),
gr.Textbox(value=" ", label="Hotword"),
gr.Textbox(value="cam++", label="Speaker Model"),
gr.Checkbox(value=True, label="Ban Emotional Unknown"),
]
outputs = gr.Textbox(label="Transcription")
gr.Interface(
fn=transcribe_audio,
inputs=inputs,
outputs=outputs,
title="ASR Transcription with FunASR"
).launch()