from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess
from modelscope import snapshot_download
import json
import torch
import gradio as gr

from config import model_config

# Use the first GPU when available, otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Download the checkpoint named in config.py and build the FunASR pipeline
# with VAD (voice activity detection) and punctuation restoration models.
model_dir = snapshot_download(model_config['model_dir'])

model = AutoModel(
    model=model_dir,
    trust_remote_code=False,
    remote_code="./model.py",
    vad_model="fsmn-vad",
    punc_model="ct-punc",
    vad_kwargs={"max_single_segment_time": 30000},
    ncpu=torch.get_num_threads(),
    batch_size=1,
    hub="hf",
    device=device,
)


def transcribe_audio(
    file_path,
    vad_model="fsmn-vad",
    punc_model="ct-punc",
    vad_kwargs='{"max_single_segment_time": 30000}',
    batch_size=1,
    language="auto",
    use_itn=True,
    batch_size_s=60,
    merge_vad=True,
    merge_length_s=15,
    batch_size_threshold_s=50,
    hotword=" ",
    spk_model="cam++",
    ban_emo_unk=True,
):
    """Run the ASR model on an audio file and return post-processed text.

    Note: vad_model, punc_model, vad_kwargs and batch_size are exposed in the UI,
    but the pipeline is already constructed above, so changing them here does not
    rebuild the model.
    """
    try:
        vad_kwargs = json.loads(vad_kwargs)
        res = model.generate(
            input=file_path,
            cache={},
            language=language,
            use_itn=use_itn,
            batch_size_s=batch_size_s,
            merge_vad=merge_vad,
            merge_length_s=merge_length_s,
            batch_size_threshold_s=batch_size_threshold_s,
            hotword=hotword,
            spk_model=spk_model,
            ban_emo_unk=ban_emo_unk,
        )
        text = rich_transcription_postprocess(res[0]["text"])
        return text
    except Exception as e:
        # Surface the error message in the UI instead of crashing the app.
        return str(e)


inputs = [
    gr.Audio(type="filepath"),
    gr.Textbox(value="fsmn-vad", label="VAD Model"),
    gr.Textbox(value="ct-punc", label="PUNC Model"),
    gr.Textbox(value='{"max_single_segment_time": 30000}', label="VAD Kwargs"),
    gr.Slider(1, 10, value=1, step=1, label="Batch Size"),
    gr.Textbox(value="auto", label="Language"),
    gr.Checkbox(value=True, label="Use ITN"),
    gr.Slider(30, 120, value=60, step=1, label="Batch Size (seconds)"),
    gr.Checkbox(value=True, label="Merge VAD"),
    gr.Slider(5, 60, value=15, step=1, label="Merge Length (seconds)"),
    gr.Slider(10, 100, value=50, step=1, label="Batch Size Threshold (seconds)"),
    gr.Textbox(value=" ", label="Hotword"),
    gr.Textbox(value="cam++", label="Speaker Model"),
    gr.Checkbox(value=True, label="Ban Emotional Unknown"),
]

outputs = gr.Textbox(label="Transcription")

gr.Interface(
    fn=transcribe_audio,
    inputs=inputs,
    outputs=outputs,
    title="ASR Transcription with FunASR",
).launch()
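
# --- Sketch of the local `config` module assumed above (not part of this file) ---
# The script imports `model_config` from a sibling config.py that is not shown in
# the original source. A minimal version only needs the 'model_dir' key consumed by
# snapshot_download(); the model ID below is a placeholder assumption, not taken
# from the original.
#
# # config.py
# model_config = {
#     "model_dir": "your-namespace/your-asr-model",  # checkpoint to download
# }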