# File: onnx/fsmn_vad_ort_session.py
# ```py
# -*- coding:utf-8 -*-
# @FileName  :fsmn_vad_ort_session.py
# @Time      :2024/8/31 16:45
# @Author    :lovemefan
# @Email     :lovemefan@outlook.com
import logging
import warnings
from pathlib import Path
from typing import List

import numpy as np
from onnxruntime import (
    GraphOptimizationLevel,
    InferenceSession,
    SessionOptions,
    get_available_providers,
    get_device,
)


class VadOrtInferRuntimeSession:
    def __init__(self, config, root_dir: Path):
        sess_opt = SessionOptions()
        sess_opt.log_severity_level = 4
        sess_opt.enable_cpu_mem_arena = False
        sess_opt.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

        cuda_ep = "CUDAExecutionProvider"
        cpu_ep = "CPUExecutionProvider"
        cpu_provider_options = {
            "arena_extend_strategy": "kSameAsRequested",
        }

        EP_list = []
        if (
            config["use_cuda"]
            and get_device() == "GPU"
            and cuda_ep in get_available_providers()
        ):
            EP_list = [(cuda_ep, config[cuda_ep])]
        EP_list.append((cpu_ep, cpu_provider_options))

        config["model_path"] = root_dir / str(config["model_path"])
        self._verify_model(config["model_path"])
        logging.info(f"Loading onnx model at {str(config['model_path'])}")
        self.session = InferenceSession(
            str(config["model_path"]), sess_options=sess_opt, providers=EP_list
        )

        if config["use_cuda"] and cuda_ep not in self.session.get_providers():
            warnings.warn(
                f"{cuda_ep} is not available for current env, "
                f"the inference part is automatically shifted to be "
                f"executed under {cpu_ep}.\n"
                "Please ensure the installed onnxruntime-gpu version"
                " matches your cuda and cudnn version, "
                "you can check their relations from the official web site: "
                "https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html",
                RuntimeWarning,
            )

    def __call__(self, input_content: List[np.ndarray]) -> np.ndarray:
        if isinstance(input_content, list):
            input_dict = {
                "speech": input_content[0],
                "in_cache0": input_content[1],
                "in_cache1": input_content[2],
                "in_cache2": input_content[3],
                "in_cache3": input_content[4],
            }
        else:
            input_dict = {"speech": input_content}

        return self.session.run(None, input_dict)

    def get_input_names(self):
        return [v.name for v in self.session.get_inputs()]

    def get_output_names(self):
        return [v.name for v in self.session.get_outputs()]

    def get_character_list(self, key: str = "character"):
        return self.meta_dict[key].splitlines()

    def have_key(self, key: str = "character") -> bool:
        self.meta_dict = self.session.get_modelmeta().custom_metadata_map
        if key in self.meta_dict.keys():
            return True
        return False

    @staticmethod
    def _verify_model(model_path):
        model_path = Path(model_path)
        if not model_path.exists():
            raise FileNotFoundError(f"{model_path} does not exist.")
        if not model_path.is_file():
            raise FileExistsError(f"{model_path} is not a file.")
# ```
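# Illustrative usage of the VAD session above (not part of the repo). The config
# keys mirror what __init__ reads; the 400-dim features (80 mels stacked by the
# VAD frontend's LFR) and the (1, 128, 19, 1) FSMN cache shape are assumptions
# about the exported model, not guarantees of this file.
# ```py
from pathlib import Path

import numpy as np

config = {
    "model_path": "fsmnvad-offline.onnx",  # resolved against root_dir
    "use_cuda": False,
    "CUDAExecutionProvider": {},  # provider options, read only when use_cuda is true
}
session = VadOrtInferRuntimeSession(config, root_dir=Path("."))

feats = np.zeros((1, 100, 400), dtype=np.float32)  # (batch, frames, feat_dim)
caches = [np.zeros((1, 128, 19, 1), dtype=np.float32) for _ in range(4)]
scores, *out_caches = session([feats] + caches)  # scores: per-frame pdf posteriors
# ```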
# File: onnx/sense_voice_ort_session.py
# ```py
# -*- coding:utf-8 -*-
# @FileName  :sense_voice_onnxruntime.py
# @Time      :2024/7/17 20:53
# @Author    :lovemefan
# @Email     :lovemefan@outlook.com
import logging
import time
import warnings
from pathlib import Path

import numpy as np
import sentencepiece as spm
from onnxruntime import (
    GraphOptimizationLevel,
    InferenceSession,
    SessionOptions,
    get_available_providers,
    get_device,
)
from rknnlite.api.rknn_lite import RKNNLite

RKNN_INPUT_LEN = 171
# The RKNN encoder runs in fp16, so large intermediate values can overflow to
# inf; the input speech features are scaled down to stay in range.
SPEECH_SCALE = 1 / 2

formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
logging.basicConfig(format=formatter, level=logging.INFO)


class OrtInferRuntimeSession:
    def __init__(self, model_file, device_id=-1, intra_op_num_threads=4):
        device_id = str(device_id)
        sess_opt = SessionOptions()
        sess_opt.intra_op_num_threads = intra_op_num_threads
        sess_opt.log_severity_level = 4
        sess_opt.enable_cpu_mem_arena = False
        sess_opt.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

        cuda_ep = "CUDAExecutionProvider"
        cuda_provider_options = {
            "device_id": device_id,
            "arena_extend_strategy": "kNextPowerOfTwo",
            "cudnn_conv_algo_search": "EXHAUSTIVE",
            "do_copy_in_default_stream": "true",
        }
        cpu_ep = "CPUExecutionProvider"
        cpu_provider_options = {
            "arena_extend_strategy": "kSameAsRequested",
        }

        EP_list = []
        if (
            device_id != "-1"
            and get_device() == "GPU"
            and cuda_ep in get_available_providers()
        ):
            EP_list = [(cuda_ep, cuda_provider_options)]
        EP_list.append((cpu_ep, cpu_provider_options))

        self._verify_model(model_file)
        self.session = InferenceSession(
            model_file, sess_options=sess_opt, providers=EP_list
        )
        # drop the reference to the model file content to save memory
        del model_file

        if device_id != "-1" and cuda_ep not in self.session.get_providers():
            warnings.warn(
                f"{cuda_ep} is not available for current env, the inference part is automatically shifted to be executed under {cpu_ep}.\n"
                "Please ensure the installed onnxruntime-gpu version matches your cuda and cudnn version, "
                "you can check their relations from the official web site: "
                "https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html",
                RuntimeWarning,
            )

    def __call__(self, input_content) -> np.ndarray:
        input_dict = dict(zip(self.get_input_names(), input_content))
        try:
            return self.session.run(self.get_output_names(), input_dict)
        except Exception as e:
            raise RuntimeError("ONNXRuntime inference failed.") from e

    def get_input_names(self):
        return [v.name for v in self.session.get_inputs()]

    def get_output_names(self):
        return [v.name for v in self.session.get_outputs()]

    def get_character_list(self, key: str = "character"):
        return self.meta_dict[key].splitlines()

    def have_key(self, key: str = "character") -> bool:
        self.meta_dict = self.session.get_modelmeta().custom_metadata_map
        if key in self.meta_dict.keys():
            return True
        return False

    @staticmethod
    def _verify_model(model_path):
        model_path = Path(model_path)
        if not model_path.exists():
            raise FileNotFoundError(f"{model_path} does not exist.")
        if not model_path.is_file():
            raise FileExistsError(f"{model_path} is not a file.")


def log_softmax(x: np.ndarray) -> np.ndarray:
    # Subtract the maximum value in each row for numerical stability
    x_max = np.max(x, axis=-1, keepdims=True)
    # Calculate the softmax of x
    softmax = np.exp(x - x_max)
    softmax_sum = np.sum(softmax, axis=-1, keepdims=True)
    softmax = softmax / softmax_sum
    # Calculate the log of the softmax values
    return np.log(softmax)
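# Illustrative check (not in the original source): why log_softmax subtracts the
# row max first. With large logits the naive form overflows, while the shifted
# form stays finite.
def _log_softmax_stability_demo() -> np.ndarray:
    x = np.array([[1000.0, 1001.0]])
    # naive np.log(np.exp(x) / np.exp(x).sum()) fails: exp(1000) overflows to inf
    return log_softmax(x)  # -> [[-1.3133, -0.3133]], finite values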

class SenseVoiceInferenceSession:
    def __init__(
        self,
        embedding_model_file,
        encoder_model_file,
        bpe_model_file,
        device_id=-1,
        intra_op_num_threads=4,
    ):
        logging.info(f"Loading model from {embedding_model_file}")
        self.embedding = np.load(embedding_model_file)

        logging.info(f"Loading model {encoder_model_file}")
        start = time.time()
        self.encoder = RKNNLite(verbose=False)
        self.encoder.load_rknn(encoder_model_file)
        self.encoder.init_runtime()
        logging.info(
            f"Loading {encoder_model_file} takes {time.time() - start:.2f} seconds"
        )
        self.blank_id = 0
        self.sp = spm.SentencePieceProcessor()
        self.sp.load(bpe_model_file)

    def __call__(self, speech, language: int, use_itn: bool) -> np.ndarray:
        language_query = self.embedding[[[language]]]
        # 14 means with itn, 15 means without itn
        text_norm_query = self.embedding[[[14 if use_itn else 15]]]
        event_emo_query = self.embedding[[[1, 2]]]

        # scale the speech features to avoid fp16 overflow in the encoder
        speech = speech * SPEECH_SCALE

        input_content = np.concatenate(
            [
                language_query,
                event_emo_query,
                text_norm_query,
                speech,
            ],
            axis=1,
        ).astype(np.float32)
        print(input_content.shape)
        # pad [1, len, ...] to [1, RKNN_INPUT_LEN, ...];
        # assumes len <= RKNN_INPUT_LEN, otherwise np.pad raises
        input_content = np.pad(
            input_content,
            ((0, 0), (0, RKNN_INPUT_LEN - input_content.shape[1]), (0, 0)),
        )
        print("padded shape:", input_content.shape)

        start_time = time.time()
        encoder_out = self.encoder.inference(inputs=[input_content])[0]
        end_time = time.time()
        print(f"encoder inference time: {end_time - start_time:.2f} seconds")

        def unique_consecutive(arr):
            if len(arr) == 0:
                return arr
            # boolean mask, True where the element differs from the previous one
            mask = np.append([True], arr[1:] != arr[:-1])
            out = arr[mask]
            out = out[out != self.blank_id]
            return out.tolist()

        # the RKNN output shape is (1, n_vocab, n_seq), so take the argmax over
        # the vocabulary axis (axis 0 of encoder_out[0]) instead of the last axis
        # hypos = unique_consecutive(encoder_out[0].argmax(axis=-1))
        hypos = unique_consecutive(encoder_out[0].argmax(axis=0))
        text = self.sp.DecodeIds(hypos)
        return text
# ```
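# The decoder above is plain greedy CTC: argmax over the vocabulary, merge
# repeated labels, drop blanks, then let SentencePiece stitch tokens together.
# In miniature, with invented ids and blank_id = 0:
# ```py
import numpy as np

ids = np.array([0, 0, 25, 25, 25, 0, 7, 7, 0])
mask = np.append([True], ids[1:] != ids[:-1])  # True at the start of each run
merged = ids[mask]                             # [0, 25, 0, 7, 0]
tokens = merged[merged != 0].tolist()          # [25, 7] once blanks are dropped
# ```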
# File: utils/frontend.py
# ```py
# -*- coding:utf-8 -*-
# @FileName  :frontend.py
# @Time      :2024/7/18 09:39
# @Author    :lovemefan
# @Email     :lovemefan@outlook.com
from typing import Tuple, Union

import kaldi_native_fbank as knf
import numpy as np
import soundfile as sf


class WavFrontend:
    """Conventional frontend structure for ASR."""

    def __init__(
        self,
        cmvn_file: str = None,
        fs: int = 16000,
        window: str = "hamming",
        n_mels: int = 80,
        frame_length: int = 25,
        frame_shift: int = 10,
        lfr_m: int = 7,
        lfr_n: int = 6,
        dither: float = 0,
        **kwargs,
    ) -> None:
        opts = knf.FbankOptions()
        opts.frame_opts.samp_freq = fs
        opts.frame_opts.dither = dither
        opts.frame_opts.window_type = window
        opts.frame_opts.frame_shift_ms = float(frame_shift)
        opts.frame_opts.frame_length_ms = float(frame_length)
        opts.mel_opts.num_bins = n_mels
        opts.energy_floor = 0
        opts.frame_opts.snip_edges = True
        opts.mel_opts.debug_mel = False

        self.opts = opts
        self.lfr_m = lfr_m
        self.lfr_n = lfr_n
        self.cmvn_file = cmvn_file

        if self.cmvn_file:
            self.cmvn = self.load_cmvn()
        self.fbank_fn = None
        self.fbank_beg_idx = 0
        self.reset_status()

    def reset_status(self):
        self.fbank_fn = knf.OnlineFbank(self.opts)
        self.fbank_beg_idx = 0

    def fbank(self, waveform: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        waveform = waveform * (1 << 15)
        self.fbank_fn = knf.OnlineFbank(self.opts)
        self.fbank_fn.accept_waveform(self.opts.frame_opts.samp_freq, waveform.tolist())
        frames = self.fbank_fn.num_frames_ready
        mat = np.empty([frames, self.opts.mel_opts.num_bins])
        for i in range(frames):
            mat[i, :] = self.fbank_fn.get_frame(i)
        feat = mat.astype(np.float32)
        feat_len = np.array(mat.shape[0]).astype(np.int32)
        return feat, feat_len

    def lfr_cmvn(self, feat: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        if self.lfr_m != 1 or self.lfr_n != 1:
            feat = self.apply_lfr(feat, self.lfr_m, self.lfr_n)
        if self.cmvn_file:
            feat = self.apply_cmvn(feat)
        feat_len = np.array(feat.shape[0]).astype(np.int32)
        return feat, feat_len

    def load_audio(self, filename: str) -> Tuple[np.ndarray, int]:
        data, sample_rate = sf.read(
            filename,
            always_2d=True,
            dtype="float32",
        )
        assert (
            sample_rate == 16000
        ), f"Only 16000 Hz is supported, but got {sample_rate}Hz"
        self.sample_rate = sample_rate
        data = data[:, 0]  # use only the first channel
        samples = np.ascontiguousarray(data)
        return samples, sample_rate

    @staticmethod
    def apply_lfr(inputs: np.ndarray, lfr_m: int, lfr_n: int) -> np.ndarray:
        LFR_inputs = []
        T = inputs.shape[0]
        T_lfr = int(np.ceil(T / lfr_n))
        left_padding = np.tile(inputs[0], ((lfr_m - 1) // 2, 1))
        inputs = np.vstack((left_padding, inputs))
        T = T + (lfr_m - 1) // 2
        for i in range(T_lfr):
            if lfr_m <= T - i * lfr_n:
                LFR_inputs.append(
                    (inputs[i * lfr_n : i * lfr_n + lfr_m]).reshape(1, -1)
                )
            else:
                # process last LFR frame
                num_padding = lfr_m - (T - i * lfr_n)
                frame = inputs[i * lfr_n :].reshape(-1)
                for _ in range(num_padding):
                    frame = np.hstack((frame, inputs[-1]))
                LFR_inputs.append(frame)
        LFR_outputs = np.vstack(LFR_inputs).astype(np.float32)
        return LFR_outputs

    def apply_cmvn(self, inputs: np.ndarray) -> np.ndarray:
        """Apply CMVN with mvn data."""
        frame, dim = inputs.shape
        means = np.tile(self.cmvn[0:1, :dim], (frame, 1))
        vars = np.tile(self.cmvn[1:2, :dim], (frame, 1))
        # the mvn file stores negated means and inverse stds, hence add/multiply
        inputs = (inputs + means) * vars
        return inputs

    def get_features(self, inputs: Union[str, np.ndarray]) -> np.ndarray:
        if isinstance(inputs, str):
            inputs, _ = self.load_audio(inputs)
        fbank, _ = self.fbank(inputs)
        feats = self.apply_cmvn(self.apply_lfr(fbank, self.lfr_m, self.lfr_n))
        return feats

    def load_cmvn(self) -> np.ndarray:
        with open(self.cmvn_file, "r", encoding="utf-8") as f:
            lines = f.readlines()

        means_list = []
        vars_list = []
        for i in range(len(lines)):
            line_item = lines[i].split()
            if line_item[0] == "<AddShift>":
                line_item = lines[i + 1].split()
                if line_item[0] == "<LearnRateCoef>":
                    add_shift_line = line_item[3 : (len(line_item) - 1)]
                    means_list = list(add_shift_line)
                    continue
            elif line_item[0] == "<Rescale>":
                line_item = lines[i + 1].split()
                if line_item[0] == "<LearnRateCoef>":
                    rescale_line = line_item[3 : (len(line_item) - 1)]
                    vars_list = list(rescale_line)
                    continue

        means = np.array(means_list).astype(np.float64)
        vars = np.array(vars_list).astype(np.float64)
        cmvn = np.array([means, vars])
        return cmvn
# ```
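# What apply_lfr does, numerically: with lfr_m=7 and lfr_n=6 each output row
# stacks 7 consecutive 80-dim fbank frames (after padding 3 copies of the first
# frame on the left) and the hop is 6 frames, so T frames become ceil(T/6) rows
# of 7 * 80 = 560 dims. A quick self-check:
# ```py
import numpy as np

feat = np.random.randn(100, 80).astype(np.float32)  # 100 fbank frames
lfr = WavFrontend.apply_lfr(feat, lfr_m=7, lfr_n=6)
assert lfr.shape == (17, 560)  # ceil(100 / 6) = 17 rows of 560 dims
# ```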
# File: utils/fsmn_vad.py
# ```py
# -*- coding:utf-8 -*-
# @FileName  :fsmn_vad.py
# @Time      :2024/8/31 16:50
# @Author    :lovemefan
# @Email     :lovemefan@outlook.com
import logging
import math
import os
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Tuple, Union

import numpy as np
import soundfile as sf
import yaml

from onnx.fsmn_vad_ort_session import VadOrtInferRuntimeSession
from utils.frontend import WavFrontend


def read_yaml(yaml_path: Union[str, Path]) -> Dict:
    if not Path(yaml_path).exists():
        raise FileNotFoundError(f"The {yaml_path} does not exist.")
    with open(str(yaml_path), "rb") as f:
        data = yaml.load(f, Loader=yaml.Loader)
    return data


class VadStateMachine(Enum):
    kVadInStateStartPointNotDetected = 1
    kVadInStateInSpeechSegment = 2
    kVadInStateEndPointDetected = 3


class FrameState(Enum):
    kFrameStateInvalid = -1
    kFrameStateSpeech = 1
    kFrameStateSil = 0


# final voice/unvoice state per frame
class AudioChangeState(Enum):
    kChangeStateSpeech2Speech = 0
    kChangeStateSpeech2Sil = 1
    kChangeStateSil2Sil = 2
    kChangeStateSil2Speech = 3
    kChangeStateNoBegin = 4
    kChangeStateInvalid = 5


class VadDetectMode(Enum):
    kVadSingleUtteranceDetectMode = 0
    kVadMutipleUtteranceDetectMode = 1


class VADXOptions:
    def __init__(
        self,
        sample_rate: int = 16000,
        detect_mode: int = VadDetectMode.kVadMutipleUtteranceDetectMode.value,
        snr_mode: int = 0,
        max_end_silence_time: int = 800,
        max_start_silence_time: int = 3000,
        do_start_point_detection: bool = True,
        do_end_point_detection: bool = True,
        window_size_ms: int = 200,
        sil_to_speech_time_thres: int = 150,
        speech_to_sil_time_thres: int = 150,
        speech_2_noise_ratio: float = 1.0,
        do_extend: int = 1,
        lookback_time_start_point: int = 200,
        lookahead_time_end_point: int = 100,
        max_single_segment_time: int = 60000,
        nn_eval_block_size: int = 8,
        dcd_block_size: int = 4,
        snr_thres: float = -100.0,
        noise_frame_num_used_for_snr: int = 100,
        decibel_thres: float = -100.0,
        speech_noise_thres: float = 0.6,
        fe_prior_thres: float = 1e-4,
        silence_pdf_num: int = 1,
        sil_pdf_ids: List[int] = [0],
        speech_noise_thresh_low: float = -0.1,
        speech_noise_thresh_high: float = 0.3,
        output_frame_probs: bool = False,
        frame_in_ms: int = 10,
        frame_length_ms: int = 25,
    ):
        self.sample_rate = sample_rate
        self.detect_mode = detect_mode
        self.snr_mode = snr_mode
        self.max_end_silence_time = max_end_silence_time
        self.max_start_silence_time = max_start_silence_time
        self.do_start_point_detection = do_start_point_detection
        self.do_end_point_detection = do_end_point_detection
        self.window_size_ms = window_size_ms
        self.sil_to_speech_time_thres = sil_to_speech_time_thres
        self.speech_to_sil_time_thres = speech_to_sil_time_thres
        self.speech_2_noise_ratio = speech_2_noise_ratio
        self.do_extend = do_extend
        self.lookback_time_start_point = lookback_time_start_point
        self.lookahead_time_end_point = lookahead_time_end_point
        self.max_single_segment_time = max_single_segment_time
        self.nn_eval_block_size = nn_eval_block_size
        self.dcd_block_size = dcd_block_size
        self.snr_thres = snr_thres
        self.noise_frame_num_used_for_snr = noise_frame_num_used_for_snr
        self.decibel_thres = decibel_thres
        self.speech_noise_thres = speech_noise_thres
        self.fe_prior_thres = fe_prior_thres
        self.silence_pdf_num = silence_pdf_num
        self.sil_pdf_ids = sil_pdf_ids
        self.speech_noise_thresh_low = speech_noise_thresh_low
        self.speech_noise_thresh_high = speech_noise_thresh_high
        self.output_frame_probs = output_frame_probs
        self.frame_in_ms = frame_in_ms
        self.frame_length_ms = frame_length_ms


class E2EVadSpeechBufWithDoa(object):
    def __init__(self):
        self.start_ms = 0
        self.end_ms = 0
        self.buffer = []
        self.contain_seg_start_point = False
        self.contain_seg_end_point = False
        self.doa = 0

    def reset(self):
        self.start_ms = 0
        self.end_ms = 0
        self.buffer = []
        self.contain_seg_start_point = False
        self.contain_seg_end_point = False
        self.doa = 0


class E2EVadFrameProb(object):
    def __init__(self):
        self.noise_prob = 0.0
        self.speech_prob = 0.0
        self.score = 0.0
        self.frame_id = 0
        self.frm_state = 0


class WindowDetector(object):
    def __init__(
        self,
        window_size_ms: int,
        sil_to_speech_time: int,
        speech_to_sil_time: int,
        frame_size_ms: int,
    ):
        self.window_size_ms = window_size_ms
        self.sil_to_speech_time = sil_to_speech_time
        self.speech_to_sil_time = speech_to_sil_time
        self.frame_size_ms = frame_size_ms

        self.win_size_frame = int(window_size_ms / frame_size_ms)
        self.win_sum = 0
        self.win_state = [0] * self.win_size_frame  # initialize the window
        self.cur_win_pos = 0
        self.pre_frame_state = FrameState.kFrameStateSil
        self.cur_frame_state = FrameState.kFrameStateSil
        self.sil_to_speech_frmcnt_thres = int(sil_to_speech_time / frame_size_ms)
        self.speech_to_sil_frmcnt_thres = int(speech_to_sil_time / frame_size_ms)

        self.voice_last_frame_count = 0
        self.noise_last_frame_count = 0
        self.hydre_frame_count = 0

    def reset(self) -> None:
        self.cur_win_pos = 0
        self.win_sum = 0
        self.win_state = [0] * self.win_size_frame
        self.pre_frame_state = FrameState.kFrameStateSil
        self.cur_frame_state = FrameState.kFrameStateSil
        self.voice_last_frame_count = 0
        self.noise_last_frame_count = 0
        self.hydre_frame_count = 0

    def get_win_size(self) -> int:
        return int(self.win_size_frame)

    def detect_one_frame(
        self, frameState: FrameState, frame_count: int
    ) -> AudioChangeState:
        cur_frame_state = FrameState.kFrameStateSil
        if frameState == FrameState.kFrameStateSpeech:
            cur_frame_state = 1
        elif frameState == FrameState.kFrameStateSil:
            cur_frame_state = 0
        else:
            return AudioChangeState.kChangeStateInvalid
        self.win_sum -= self.win_state[self.cur_win_pos]
        self.win_sum += cur_frame_state
        self.win_state[self.cur_win_pos] = cur_frame_state
        self.cur_win_pos = (self.cur_win_pos + 1) % self.win_size_frame

        if (
            self.pre_frame_state == FrameState.kFrameStateSil
            and self.win_sum >= self.sil_to_speech_frmcnt_thres
        ):
            self.pre_frame_state = FrameState.kFrameStateSpeech
            return AudioChangeState.kChangeStateSil2Speech

        if (
            self.pre_frame_state == FrameState.kFrameStateSpeech
            and self.win_sum <= self.speech_to_sil_frmcnt_thres
        ):
            self.pre_frame_state = FrameState.kFrameStateSil
            return AudioChangeState.kChangeStateSpeech2Sil

        if self.pre_frame_state == FrameState.kFrameStateSil:
            return AudioChangeState.kChangeStateSil2Sil
        if self.pre_frame_state == FrameState.kFrameStateSpeech:
            return AudioChangeState.kChangeStateSpeech2Speech
        return AudioChangeState.kChangeStateInvalid

    def frame_size_ms(self) -> int:
        return int(self.frame_size_ms)
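# Illustrative check (not in the original source): with the defaults used below
# (200 ms window, 150 ms thresholds, 10 ms frames) the detector keeps a 20-slot
# ring buffer of 0/1 frame states and reports sil -> speech once 15 of the last
# 20 frames are speech, i.e. on the 15th consecutive speech frame.
def _window_detector_demo() -> int:
    det = WindowDetector(
        window_size_ms=200,
        sil_to_speech_time=150,
        speech_to_sil_time=150,
        frame_size_ms=10,
    )
    for t in range(20):
        state = det.detect_one_frame(FrameState.kFrameStateSpeech, t)
        if state == AudioChangeState.kChangeStateSil2Speech:
            return t  # returns 14: the 15th speech frame flips the state
    return -1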

class E2EVadModel:
    def __init__(self, config, vad_post_args: Dict[str, Any], root_dir: Path):
        super(E2EVadModel, self).__init__()
        self.vad_opts = VADXOptions(**vad_post_args)
        self.windows_detector = WindowDetector(
            self.vad_opts.window_size_ms,
            self.vad_opts.sil_to_speech_time_thres,
            self.vad_opts.speech_to_sil_time_thres,
            self.vad_opts.frame_in_ms,
        )
        self.model = VadOrtInferRuntimeSession(config, root_dir)
        self.all_reset_detection()

    def all_reset_detection(self):
        # init variables
        self.is_final = False
        self.data_buf_start_frame = 0
        self.frm_cnt = 0
        self.latest_confirmed_speech_frame = 0
        self.lastest_confirmed_silence_frame = -1
        self.continous_silence_frame_count = 0
        self.vad_state_machine = VadStateMachine.kVadInStateStartPointNotDetected
        self.confirmed_start_frame = -1
        self.confirmed_end_frame = -1
        self.number_end_time_detected = 0
        self.sil_frame = 0
        self.sil_pdf_ids = self.vad_opts.sil_pdf_ids
        self.noise_average_decibel = -100.0
        self.pre_end_silence_detected = False
        self.next_seg = True

        self.output_data_buf = []
        self.output_data_buf_offset = 0
        self.frame_probs = []
        self.max_end_sil_frame_cnt_thresh = (
            self.vad_opts.max_end_silence_time - self.vad_opts.speech_to_sil_time_thres
        )
        self.speech_noise_thres = self.vad_opts.speech_noise_thres
        self.scores = None
        self.scores_offset = 0
        self.max_time_out = False
        self.decibel = []
        self.decibel_offset = 0
        self.data_buf_size = 0
        self.data_buf_all_size = 0
        self.waveform = None
        self.reset_detection()

    def reset_detection(self):
        self.continous_silence_frame_count = 0
        self.latest_confirmed_speech_frame = 0
        self.lastest_confirmed_silence_frame = -1
        self.confirmed_start_frame = -1
        self.confirmed_end_frame = -1
        self.vad_state_machine = VadStateMachine.kVadInStateStartPointNotDetected
        self.windows_detector.reset()
        self.sil_frame = 0
        self.frame_probs = []

    def compute_decibel(self) -> None:
        frame_sample_length = int(
            self.vad_opts.frame_length_ms * self.vad_opts.sample_rate / 1000
        )
        frame_shift_length = int(
            self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000
        )
        if self.data_buf_all_size == 0:
            self.data_buf_all_size = len(self.waveform[0])
            self.data_buf_size = self.data_buf_all_size
        else:
            self.data_buf_all_size += len(self.waveform[0])

        for offset in range(
            0, self.waveform.shape[1] - frame_sample_length + 1, frame_shift_length
        ):
            self.decibel.append(
                10
                * np.log10(
                    np.square(
                        self.waveform[0][offset : offset + frame_sample_length]
                    ).sum()
                    + 1e-6
                )
            )

    def compute_scores(self, feats: np.ndarray):
        scores = self.model(feats)
        self.vad_opts.nn_eval_block_size = scores[0].shape[1]
        self.frm_cnt += scores[0].shape[1]  # count total frames
        if isinstance(feats, list):
            # return B * T * D
            feats = feats[0]
        assert (
            scores[0].shape[1] == feats.shape[1]
        ), "The shape between feats and scores does not match"
        self.scores = scores[0]  # the first calculation
        self.scores_offset += self.scores.shape[1]
        return scores[1:]
    def pop_data_buf_till_frame(self, frame_idx: int) -> None:  # need check again
        while self.data_buf_start_frame < frame_idx:
            if self.data_buf_size >= int(
                self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000
            ):
                self.data_buf_start_frame += 1
                self.data_buf_size = (
                    self.data_buf_all_size
                    - self.data_buf_start_frame
                    * int(self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000)
                )

    def pop_data_to_output_buf(
        self,
        start_frm: int,
        frm_cnt: int,
        first_frm_is_start_point: bool,
        last_frm_is_end_point: bool,
        end_point_is_sent_end: bool,
    ) -> None:
        self.pop_data_buf_till_frame(start_frm)
        expected_sample_number = int(
            frm_cnt * self.vad_opts.sample_rate * self.vad_opts.frame_in_ms / 1000
        )
        if last_frm_is_end_point:
            extra_sample = max(
                0,
                int(
                    self.vad_opts.frame_length_ms * self.vad_opts.sample_rate / 1000
                    - self.vad_opts.sample_rate * self.vad_opts.frame_in_ms / 1000
                ),
            )
            expected_sample_number += int(extra_sample)
        if end_point_is_sent_end:
            expected_sample_number = max(expected_sample_number, self.data_buf_size)
        if self.data_buf_size < expected_sample_number:
            logging.error("error in calling pop data_buf\n")

        if len(self.output_data_buf) == 0 or first_frm_is_start_point:
            self.output_data_buf.append(E2EVadSpeechBufWithDoa())
            self.output_data_buf[-1].reset()
            self.output_data_buf[-1].start_ms = start_frm * self.vad_opts.frame_in_ms
            self.output_data_buf[-1].end_ms = self.output_data_buf[-1].start_ms
            self.output_data_buf[-1].doa = 0
        cur_seg = self.output_data_buf[-1]
        if cur_seg.end_ms != start_frm * self.vad_opts.frame_in_ms:
            logging.error("warning\n")
        out_pos = len(cur_seg.buffer)  # cur_seg.buffer is not actually filled here
        data_to_pop = 0
        if end_point_is_sent_end:
            data_to_pop = expected_sample_number
        else:
            data_to_pop = int(
                frm_cnt * self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000
            )
        if data_to_pop > self.data_buf_size:
            logging.error("VAD data_to_pop is bigger than self.data_buf.size()!!!\n")
            data_to_pop = self.data_buf_size
            expected_sample_number = self.data_buf_size

        cur_seg.doa = 0
        for sample_cpy_out in range(0, data_to_pop):
            # cur_seg.buffer[out_pos ++] = data_buf_.back();
            out_pos += 1
        for sample_cpy_out in range(data_to_pop, expected_sample_number):
            # cur_seg.buffer[out_pos++] = data_buf_.back()
            out_pos += 1
        if cur_seg.end_ms != start_frm * self.vad_opts.frame_in_ms:
            logging.error("Something wrong with the VAD algorithm\n")
        self.data_buf_start_frame += frm_cnt
        cur_seg.end_ms = (start_frm + frm_cnt) * self.vad_opts.frame_in_ms
        if first_frm_is_start_point:
            cur_seg.contain_seg_start_point = True
        if last_frm_is_end_point:
            cur_seg.contain_seg_end_point = True

    def on_silence_detected(self, valid_frame: int):
        self.lastest_confirmed_silence_frame = valid_frame
        if self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
            self.pop_data_buf_till_frame(valid_frame)
        # silence_detected_callback_
        # pass

    def on_voice_detected(self, valid_frame: int) -> None:
        self.latest_confirmed_speech_frame = valid_frame
        self.pop_data_to_output_buf(valid_frame, 1, False, False, False)

    def on_voice_start(self, start_frame: int, fake_result: bool = False) -> None:
        if self.vad_opts.do_start_point_detection:
            pass
        if self.confirmed_start_frame != -1:
            logging.error("not reset vad properly\n")
        else:
            self.confirmed_start_frame = start_frame

        if (
            not fake_result
            and self.vad_state_machine
            == VadStateMachine.kVadInStateStartPointNotDetected
        ):
            self.pop_data_to_output_buf(
                self.confirmed_start_frame, 1, True, False, False
            )

    def on_voice_end(
        self, end_frame: int, fake_result: bool, is_last_frame: bool
    ) -> None:
        for t in range(self.latest_confirmed_speech_frame + 1, end_frame):
            self.on_voice_detected(t)
        if self.vad_opts.do_end_point_detection:
            pass
        if self.confirmed_end_frame != -1:
            logging.error("not reset vad properly\n")
        else:
            self.confirmed_end_frame = end_frame
        if not fake_result:
            self.sil_frame = 0
            self.pop_data_to_output_buf(
                self.confirmed_end_frame, 1, False, True, is_last_frame
            )
        self.number_end_time_detected += 1
    def maybe_on_voice_end_last_frame(
        self, is_final_frame: bool, cur_frm_idx: int
    ) -> None:
        if is_final_frame:
            self.on_voice_end(cur_frm_idx, False, True)
            self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected

    def get_latency(self) -> int:
        return int(self.latency_frm_num_at_start_point() * self.vad_opts.frame_in_ms)

    def latency_frm_num_at_start_point(self) -> int:
        vad_latency = self.windows_detector.get_win_size()
        if self.vad_opts.do_extend:
            vad_latency += int(
                self.vad_opts.lookback_time_start_point / self.vad_opts.frame_in_ms
            )
        return vad_latency

    def get_frame_state(self, t: int) -> FrameState:
        frame_state = FrameState.kFrameStateInvalid
        cur_decibel = self.decibel[t - self.decibel_offset]
        cur_snr = cur_decibel - self.noise_average_decibel
        # for each frame, calc log posterior probability of each state
        if cur_decibel < self.vad_opts.decibel_thres:
            frame_state = FrameState.kFrameStateSil
            self.detect_one_frame(frame_state, t, False)
            return frame_state

        sum_score = 0.0
        noise_prob = 0.0
        assert len(self.sil_pdf_ids) == self.vad_opts.silence_pdf_num
        if len(self.sil_pdf_ids) > 0:
            assert len(self.scores) == 1  # only batch_size = 1 is supported
            sil_pdf_scores = [
                self.scores[0][t - self.scores_offset][sil_pdf_id]
                for sil_pdf_id in self.sil_pdf_ids
            ]
            sum_score = sum(sil_pdf_scores)
            noise_prob = math.log(sum_score) * self.vad_opts.speech_2_noise_ratio
            total_score = 1.0
            sum_score = total_score - sum_score
        speech_prob = math.log(sum_score)
        if self.vad_opts.output_frame_probs:
            frame_prob = E2EVadFrameProb()
            frame_prob.noise_prob = noise_prob
            frame_prob.speech_prob = speech_prob
            frame_prob.score = sum_score
            frame_prob.frame_id = t
            self.frame_probs.append(frame_prob)
        if math.exp(speech_prob) >= math.exp(noise_prob) + self.speech_noise_thres:
            if (
                cur_snr >= self.vad_opts.snr_thres
                and cur_decibel >= self.vad_opts.decibel_thres
            ):
                frame_state = FrameState.kFrameStateSpeech
            else:
                frame_state = FrameState.kFrameStateSil
        else:
            frame_state = FrameState.kFrameStateSil
            if self.noise_average_decibel < -99.9:
                self.noise_average_decibel = cur_decibel
            else:
                self.noise_average_decibel = (
                    cur_decibel
                    + self.noise_average_decibel
                    * (self.vad_opts.noise_frame_num_used_for_snr - 1)
                ) / self.vad_opts.noise_frame_num_used_for_snr

        return frame_state
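    # Illustrative helper (not in the original source): a worked instance of the
    # speech/noise decision in get_frame_state, with invented numbers and the
    # default speech_2_noise_ratio = 1.0 and speech_noise_thres = 0.6.
    @staticmethod
    def _frame_decision_example() -> bool:
        sil_score = 0.2  # summed posterior of the silence pdf ids
        noise_prob = math.log(sil_score) * 1.0  # speech_2_noise_ratio
        speech_prob = math.log(1.0 - sil_score)
        # 0.8 >= 0.2 + 0.6 -> True: the frame counts as speech, subject to the
        # SNR and decibel gates that follow in get_frame_state
        return math.exp(speech_prob) >= math.exp(noise_prob) + 0.6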
    def infer_offline(
        self,
        feats: np.ndarray,
        waveform: np.ndarray,
        in_cache: Dict[str, np.ndarray] = dict(),
        is_final: bool = False,
    ) -> Tuple[List[List[List[int]]], Dict[str, np.ndarray]]:
        self.waveform = waveform
        self.compute_decibel()
        self.compute_scores(feats)
        if not is_final:
            self.detect_common_frames()
        else:
            self.detect_last_frames()
        segments = []
        for batch_num in range(0, feats.shape[0]):  # only support batch_size = 1 now
            segment_batch = []
            if len(self.output_data_buf) > 0:
                for i in range(self.output_data_buf_offset, len(self.output_data_buf)):
                    if (
                        not self.output_data_buf[i].contain_seg_start_point
                        or not self.output_data_buf[i].contain_seg_end_point
                    ):
                        continue
                    segment = [
                        self.output_data_buf[i].start_ms,
                        self.output_data_buf[i].end_ms,
                    ]
                    segment_batch.append(segment)
                    self.output_data_buf_offset += 1  # need to update this parameter
            if segment_batch:
                segments.append(segment_batch)

        if is_final:
            # reset class variables and clear the dict for the next query
            self.all_reset_detection()
        return segments, in_cache

    def infer_online(
        self,
        feats: np.ndarray,
        waveform: np.ndarray,
        in_cache: list = None,
        is_final: bool = False,
        max_end_sil: int = 800,
    ) -> Tuple[List[List[List[int]]], Dict[str, np.ndarray]]:
        feats = [feats]
        if in_cache is None:
            in_cache = []

        self.max_end_sil_frame_cnt_thresh = (
            max_end_sil - self.vad_opts.speech_to_sil_time_thres
        )
        self.waveform = waveform  # compute decibel for each frame
        feats.extend(in_cache)
        in_cache = self.compute_scores(feats)
        self.compute_decibel()

        if is_final:
            self.detect_last_frames()
        else:
            self.detect_common_frames()

        segments = []
        # only support batch_size = 1 now
        for batch_num in range(0, feats[0].shape[0]):
            if len(self.output_data_buf) > 0:
                for i in range(self.output_data_buf_offset, len(self.output_data_buf)):
                    if not self.output_data_buf[i].contain_seg_start_point:
                        continue
                    if (
                        not self.next_seg
                        and not self.output_data_buf[i].contain_seg_end_point
                    ):
                        continue
                    start_ms = self.output_data_buf[i].start_ms if self.next_seg else -1
                    if self.output_data_buf[i].contain_seg_end_point:
                        end_ms = self.output_data_buf[i].end_ms
                        self.next_seg = True
                        self.output_data_buf_offset += 1
                    else:
                        end_ms = -1
                        self.next_seg = False
                    segments.append([start_ms, end_ms])

        return segments, in_cache
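    # Illustrative sketch (not in the original source): how infer_online is
    # meant to be driven chunk by chunk. `feature_chunks` is a hypothetical
    # list of (feats, waveform) pairs produced by a streaming frontend.
    def _streaming_example(self, feature_chunks) -> list:
        in_cache = []
        all_segments = []
        n = len(feature_chunks)
        for i, (chunk_feats, chunk_wave) in enumerate(feature_chunks):
            segments, in_cache = self.infer_online(
                chunk_feats, chunk_wave, in_cache, is_final=(i == n - 1)
            )
            # items are [start_ms, end_ms]; -1 marks a boundary not yet confirmed
            all_segments.extend(segments)
        return all_segments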
    def get_frames_state(
        self,
        feats: np.ndarray,
        waveform: np.ndarray,
        in_cache: list = None,
        is_final: bool = False,
        max_end_sil: int = 800,
    ):
        feats = [feats]
        states = []
        if in_cache is None:
            in_cache = []

        self.max_end_sil_frame_cnt_thresh = (
            max_end_sil - self.vad_opts.speech_to_sil_time_thres
        )
        self.waveform = waveform  # compute decibel for each frame
        feats.extend(in_cache)
        in_cache = self.compute_scores(feats)
        self.compute_decibel()

        if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected:
            return states
        for i in range(self.vad_opts.nn_eval_block_size - 1, -1, -1):
            frame_state = FrameState.kFrameStateInvalid
            frame_state = self.get_frame_state(self.frm_cnt - 1 - i)
            states.append(frame_state)
            if i == 0 and is_final:
                logging.info("last frame detected")
                self.detect_one_frame(frame_state, self.frm_cnt - 1, True)
            else:
                self.detect_one_frame(frame_state, self.frm_cnt - 1 - i, False)

        return states

    def detect_common_frames(self) -> int:
        if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected:
            return 0
        for i in range(self.vad_opts.nn_eval_block_size - 1, -1, -1):
            frame_state = FrameState.kFrameStateInvalid
            frame_state = self.get_frame_state(self.frm_cnt - 1 - i)
            # print(f"cur frame: {self.frm_cnt - 1 - i}, state is {frame_state}")
            self.detect_one_frame(frame_state, self.frm_cnt - 1 - i, False)
        self.decibel = self.decibel[self.vad_opts.nn_eval_block_size - 1 :]
        self.decibel_offset = self.frm_cnt - 1 - i

        return 0

    def detect_last_frames(self) -> int:
        if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected:
            return 0
        for i in range(self.vad_opts.nn_eval_block_size - 1, -1, -1):
            frame_state = FrameState.kFrameStateInvalid
            frame_state = self.get_frame_state(self.frm_cnt - 1 - i)
            if i != 0:
                self.detect_one_frame(frame_state, self.frm_cnt - 1 - i, False)
            else:
                self.detect_one_frame(frame_state, self.frm_cnt - 1, True)

        return 0

    def detect_one_frame(
        self, cur_frm_state: FrameState, cur_frm_idx: int, is_final_frame: bool
    ) -> None:
        tmp_cur_frm_state = FrameState.kFrameStateInvalid
        if cur_frm_state == FrameState.kFrameStateSpeech:
            if math.fabs(1.0) > float(self.vad_opts.fe_prior_thres):
                tmp_cur_frm_state = FrameState.kFrameStateSpeech
            else:
                tmp_cur_frm_state = FrameState.kFrameStateSil
        elif cur_frm_state == FrameState.kFrameStateSil:
            tmp_cur_frm_state = FrameState.kFrameStateSil
        state_change = self.windows_detector.detect_one_frame(
            tmp_cur_frm_state, cur_frm_idx
        )
        frm_shift_in_ms = self.vad_opts.frame_in_ms
        if AudioChangeState.kChangeStateSil2Speech == state_change:
            self.continous_silence_frame_count = 0
            self.pre_end_silence_detected = False
            if (
                self.vad_state_machine
                == VadStateMachine.kVadInStateStartPointNotDetected
            ):
                start_frame = max(
                    self.data_buf_start_frame,
                    cur_frm_idx - self.latency_frm_num_at_start_point(),
                )
                self.on_voice_start(start_frame)
                self.vad_state_machine = VadStateMachine.kVadInStateInSpeechSegment
                for t in range(start_frame + 1, cur_frm_idx + 1):
                    self.on_voice_detected(t)
            elif self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
                for t in range(self.latest_confirmed_speech_frame + 1, cur_frm_idx):
                    self.on_voice_detected(t)
                if (
                    cur_frm_idx - self.confirmed_start_frame + 1
                    > self.vad_opts.max_single_segment_time / frm_shift_in_ms
                ):
                    self.on_voice_end(cur_frm_idx, False, False)
                    self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
                elif not is_final_frame:
                    self.on_voice_detected(cur_frm_idx)
                else:
                    self.maybe_on_voice_end_last_frame(is_final_frame, cur_frm_idx)
            else:
                pass
        elif AudioChangeState.kChangeStateSpeech2Sil == state_change:
            self.continous_silence_frame_count = 0
            if (
                self.vad_state_machine
                == VadStateMachine.kVadInStateStartPointNotDetected
            ):
                pass
            elif self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
                if (
                    cur_frm_idx - self.confirmed_start_frame + 1
                    > self.vad_opts.max_single_segment_time / frm_shift_in_ms
                ):
                    self.on_voice_end(cur_frm_idx, False, False)
                    self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
                elif not is_final_frame:
                    self.on_voice_detected(cur_frm_idx)
                else:
                    self.maybe_on_voice_end_last_frame(is_final_frame, cur_frm_idx)
            else:
                pass
        elif AudioChangeState.kChangeStateSpeech2Speech == state_change:
            self.continous_silence_frame_count = 0
            if self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
                if (
                    cur_frm_idx - self.confirmed_start_frame + 1
                    > self.vad_opts.max_single_segment_time / frm_shift_in_ms
                ):
                    self.max_time_out = True
                    self.on_voice_end(cur_frm_idx, False, False)
                    self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
                elif not is_final_frame:
                    self.on_voice_detected(cur_frm_idx)
                else:
                    self.maybe_on_voice_end_last_frame(is_final_frame, cur_frm_idx)
            else:
                pass
        elif AudioChangeState.kChangeStateSil2Sil == state_change:
            self.continous_silence_frame_count += 1
            if (
                self.vad_state_machine
                == VadStateMachine.kVadInStateStartPointNotDetected
            ):
                # silence timeout, return zero length decision
                if (
                    (
                        self.vad_opts.detect_mode
                        == VadDetectMode.kVadSingleUtteranceDetectMode.value
                    )
                    and (
                        self.continous_silence_frame_count * frm_shift_in_ms
                        > self.vad_opts.max_start_silence_time
                    )
                ) or (is_final_frame and self.number_end_time_detected == 0):
                    for t in range(
                        self.lastest_confirmed_silence_frame + 1, cur_frm_idx
                    ):
                        self.on_silence_detected(t)
                    self.on_voice_start(0, True)
                    self.on_voice_end(0, True, False)
                    self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
                else:
                    if cur_frm_idx >= self.latency_frm_num_at_start_point():
                        self.on_silence_detected(
                            cur_frm_idx - self.latency_frm_num_at_start_point()
                        )
            elif self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
                if (
                    self.continous_silence_frame_count * frm_shift_in_ms
                    >= self.max_end_sil_frame_cnt_thresh
                ):
                    lookback_frame = int(
                        self.max_end_sil_frame_cnt_thresh / frm_shift_in_ms
                    )
                    if self.vad_opts.do_extend:
                        lookback_frame -= int(
                            self.vad_opts.lookahead_time_end_point / frm_shift_in_ms
                        )
                        lookback_frame -= 1
                        lookback_frame = max(0, lookback_frame)
                    self.on_voice_end(cur_frm_idx - lookback_frame, False, False)
                    self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
                elif (
                    cur_frm_idx - self.confirmed_start_frame + 1
                    > self.vad_opts.max_single_segment_time / frm_shift_in_ms
                ):
                    self.on_voice_end(cur_frm_idx, False, False)
                    self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
                elif self.vad_opts.do_extend and not is_final_frame:
                    if self.continous_silence_frame_count <= int(
                        self.vad_opts.lookahead_time_end_point / frm_shift_in_ms
                    ):
                        self.on_voice_detected(cur_frm_idx)
                else:
                    self.maybe_on_voice_end_last_frame(is_final_frame, cur_frm_idx)
            else:
                pass

        if (
            self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected
            and self.vad_opts.detect_mode
            == VadDetectMode.kVadMutipleUtteranceDetectMode.value
        ):
            self.reset_detection()

class FSMNVad(object):
    def __init__(self, config_dir: str):
        config_dir = Path(config_dir)
        self.config = read_yaml(config_dir / "fsmn-config.yaml")
        self.frontend = WavFrontend(
            cmvn_file=config_dir / "fsmn-am.mvn",
            **self.config["WavFrontend"]["frontend_conf"],
        )
        self.config["FSMN"]["model_path"] = config_dir / "fsmnvad-offline.onnx"

        self.vad = E2EVadModel(
            self.config["FSMN"], self.config["vadPostArgs"], config_dir
        )

    def set_parameters(self, mode):
        pass

    def extract_feature(self, waveform):
        fbank, _ = self.frontend.fbank(waveform)
        feats, feats_len = self.frontend.lfr_cmvn(fbank)
        return feats.astype(np.float32), feats_len

    def is_speech(self, buf, sample_rate=16000):
        assert sample_rate == 16000, "only support 16k sample rate"

    def segments_offline(self, waveform_path: Union[str, Path, np.ndarray]):
        """Get speech segments of an audio file or array."""
        if isinstance(waveform_path, np.ndarray):
            waveform = waveform_path
        else:
            if not os.path.exists(waveform_path):
                raise FileNotFoundError(f"{waveform_path} does not exist.")
            if os.path.isfile(waveform_path):
                logging.info(f"load audio {waveform_path}")
                waveform, _sample_rate = sf.read(
                    waveform_path,
                    dtype="float32",
                )
            else:
                raise FileNotFoundError(str(waveform_path))
            assert (
                _sample_rate == 16000
            ), f"only support 16k sample rate, current sample rate is {_sample_rate}"

        feats, feats_len = self.extract_feature(waveform)
        waveform = waveform[None, ...]
        segments_part, in_cache = self.vad.infer_offline(
            feats[None, ...], waveform, is_final=True
        )
        return segments_part[0]
# ```
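# End-to-end offline VAD in three lines (the config directory must contain
# fsmn-config.yaml, fsmn-am.mvn and fsmnvad-offline.onnx, as read by __init__;
# the wav path is illustrative):
# ```py
vad = FSMNVad("path/to/resource_dir")
segments = vad.segments_offline("speech_16k.wav")  # 16 kHz mono audio
for start_ms, end_ms in segments:
    print(f"speech from {start_ms} ms to {end_ms} ms")
# ```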
# File: sense_voice.py
# ```py
# -*- coding:utf-8 -*-
# @FileName  :sense_voice.py
# @Time      :2024/7/18 15:40
# @Author    :lovemefan
# @Email     :lovemefan@outlook.com
import argparse
import logging
import os
import time

import soundfile as sf

from onnx.sense_voice_ort_session import SenseVoiceInferenceSession
from utils.frontend import WavFrontend
from utils.fsmn_vad import FSMNVad

languages = {"auto": 0, "zh": 3, "en": 4, "yue": 7, "ja": 11, "ko": 12, "nospeech": 13}

formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
logging.basicConfig(format=formatter, level=logging.INFO)


def main():
    arg_parser = argparse.ArgumentParser(description="Sense Voice")
    arg_parser.add_argument(
        "-a", "--audio_file", required=True, type=str, help="path of the audio file"
    )
    download_model_path = os.path.dirname(__file__)
    arg_parser.add_argument(
        "-dp",
        "--download_path",
        default=download_model_path,
        type=str,
        help="dir path of resource downloaded",
    )
    arg_parser.add_argument("-d", "--device", default=-1, type=int, help="Device")
    arg_parser.add_argument(
        "-n", "--num_threads", default=4, type=int, help="Num threads"
    )
    arg_parser.add_argument(
        "-l",
        "--language",
        choices=languages.keys(),
        default="auto",
        type=str,
        help="Language",
    )
    arg_parser.add_argument("--use_itn", action="store_true", help="Use ITN")

    args = arg_parser.parse_args()
    front = WavFrontend(os.path.join(args.download_path, "am.mvn"))
    model = SenseVoiceInferenceSession(
        os.path.join(args.download_path, "embedding.npy"),
        os.path.join(
            args.download_path,
            "sense-voice-encoder.rknn",
        ),
        os.path.join(args.download_path, "chn_jpn_yue_eng_ko_spectok.bpe.model"),
        args.device,
        args.num_threads,
    )
    waveform, _sample_rate = sf.read(
        args.audio_file,
        dtype="float32",
        always_2d=True,
    )

    logging.info(
        f"Audio {args.audio_file} is {len(waveform) / _sample_rate} seconds, "
        f"{waveform.shape[1]} channel(s)"
    )
    # load vad model
    start = time.time()
    vad = FSMNVad(args.download_path)
    for channel_id, channel_data in enumerate(waveform.T):
        segments = vad.segments_offline(channel_data)
        for part in segments:
            # segment boundaries are in ms; 16 samples per ms at 16 kHz
            audio_feats = front.get_features(channel_data[part[0] * 16 : part[1] * 16])
            asr_result = model(
                audio_feats[None, ...],
                language=languages[args.language],
                use_itn=args.use_itn,
            )
            logging.info(
                f"[Channel {channel_id}] [{part[0] / 1000}s - {part[1] / 1000}s] {asr_result}"
            )
        vad.vad.all_reset_detection()
    decoding_time = time.time() - start
    logging.info(f"Decoding audio takes {decoding_time} seconds")
    logging.info(
        f"The RTF is {decoding_time / (waveform.shape[1] * len(waveform) / _sample_rate)}."
    )


if __name__ == "__main__":
    main()
# ```
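# Example invocation, matching the argparse flags above (the resource files
# embedding.npy, sense-voice-encoder.rknn, chn_jpn_yue_eng_ko_spectok.bpe.model,
# am.mvn and the fsmn-vad assets must sit in the download path):
#
#     python sense_voice.py --audio_file speech_16k.wav --language auto --use_itn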