"""Gradio web UI that transcribes an uploaded audio file with a FunASR model."""

import io
import json
import logging
import os
import tempfile
from typing import Optional

import gradio as gr
import torch
from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess
from modelscope import snapshot_download

from config import model_config

logger = logging.getLogger(__name__)

# Use the first CUDA device when one is available, otherwise run on CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Resolve the model id from the project config to a local snapshot directory.
model_dir = snapshot_download(model_config['model_dir'])

# The model is built once at import time and shared by every request.
# NOTE(review): remote_code points at ./model.py while trust_remote_code is
# False -- confirm whether the custom model code is meant to be loaded.
model = AutoModel(
    model=model_dir,
    trust_remote_code=False,
    remote_code="./model.py",
    vad_model="fsmn-vad",
    punc_model="ct-punc",
    vad_kwargs={"max_single_segment_time": 30000},
    ncpu=torch.get_num_threads(),
    batch_size=1,
    hub="hf",
    device=device,
)


def transcribe_audio(
    file_path: Optional[str],
    vad_model: str = "fsmn-vad",
    punc_model: str = "ct-punc",
    vad_kwargs: str = '{"max_single_segment_time": 30000}',
    batch_size: int = 1,
    language: str = "auto",
    use_itn: bool = True,
    batch_size_s: int = 60,
    merge_vad: bool = True,
    merge_length_s: int = 15,
    batch_size_threshold_s: int = 50,
    hotword: str = " ",
    spk_model: str = "cam++",
    ban_emo_unk: bool = True,
) -> str:
    """Transcribe one audio file with the module-level ``model``.

    Args:
        file_path: Path of the audio file to transcribe. A missing/empty
            value yields an explanatory message instead of a model call.
        vad_model: Accepted so the signature matches the UI inputs; it is
            not forwarded to ``model.generate`` (the model is configured
            once at import time).
        punc_model: Same as ``vad_model`` -- accepted but not forwarded.
        vad_kwargs: JSON text. It is only validated here; the parsed value
            is not forwarded to ``model.generate``.
        batch_size: Accepted but not forwarded.
        language: Forwarded to ``model.generate``.
        use_itn: Forwarded to ``model.generate``.
        batch_size_s: Forwarded to ``model.generate``.
        merge_vad: Forwarded to ``model.generate``.
        merge_length_s: Forwarded to ``model.generate``.
        batch_size_threshold_s: Forwarded to ``model.generate``.
        hotword: Forwarded to ``model.generate``.
        spk_model: Forwarded to ``model.generate``.
        ban_emo_unk: Forwarded to ``model.generate``.

    Returns:
        The post-processed text of the first result, an empty string when
        the model returns no results, or an error message. Failures are
        reported as text rather than raised so the UI output box can show
        them.
    """
    if not file_path:
        # Submitting the form without audio hands us None; fail clearly
        # instead of passing it on to the model.
        return "No audio file provided."

    try:
        # Validation only: malformed JSON is reported to the user.
        json.loads(vad_kwargs)
    except (TypeError, ValueError) as e:
        return f"Invalid VAD Kwargs JSON: {e}"

    try:
        res = model.generate(
            input=file_path,
            cache={},  # fresh cache dict for every call
            language=language,
            use_itn=use_itn,
            batch_size_s=batch_size_s,
            merge_vad=merge_vad,
            merge_length_s=merge_length_s,
            batch_size_threshold_s=batch_size_threshold_s,
            hotword=hotword,
            spk_model=spk_model,
            ban_emo_unk=ban_emo_unk,
        )
        if not res:
            # No results: return empty text rather than an IndexError message.
            return ""
        return rich_transcription_postprocess(res[0]["text"])
    except Exception as e:
        # UI boundary: keep the traceback in the log, show the message as text.
        logger.exception("Transcription failed for %r", file_path)
        return str(e)


# Input components, in the same order as the parameters of transcribe_audio.
inputs = [
    gr.Audio(type="filepath"),
    gr.Textbox(value="fsmn-vad", label="VAD Model"),
    gr.Textbox(value="ct-punc", label="PUNC Model"),
    gr.Textbox(value='{"max_single_segment_time": 30000}', label="VAD Kwargs"),
    gr.Slider(1, 10, value=1, step=1, label="Batch Size"),
    gr.Textbox(value="auto", label="Language"),
    gr.Checkbox(value=True, label="Use ITN"),
    gr.Slider(30, 120, value=60, step=1, label="Batch Size (seconds)"),
    gr.Checkbox(value=True, label="Merge VAD"),
    gr.Slider(5, 60, value=15, step=1, label="Merge Length (seconds)"),
    gr.Slider(10, 100, value=50, step=1, label="Batch Size Threshold (seconds)"),
    gr.Textbox(value=" ", label="Hotword"),
    gr.Textbox(value="cam++", label="Speaker Model"),
    gr.Checkbox(value=True, label="Ban Emotional Unknown"),
]

outputs = gr.Textbox(label="Transcription")

demo = gr.Interface(
    fn=transcribe_audio,
    inputs=inputs,
    outputs=outputs,
    title="ASR Transcription with FunASR",
)
demo.launch()