Update app.py
app.py CHANGED
@@ -2,12 +2,22 @@ from funasr import AutoModel
 from funasr.utils.postprocess_utils import rich_transcription_postprocess
 from modelscope import snapshot_download
 
+import datetime
+import math
+import io
+import os
+import tempfile
 import json
+from typing import Optional
+
 import torch
 import gradio as gr
 
 from config import model_config
 
+
+
+
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 model_dir = snapshot_download(model_config['model_dir'])
 
@@ -16,8 +26,7 @@ model = AutoModel(
     trust_remote_code=False,
     remote_code="./model.py",
     vad_model="fsmn-vad",
-    punc_model="ct-punc",
-    spk_model="cam++",
+    punc_model="ct-punc",
     vad_kwargs={"max_single_segment_time": 15000},
     ncpu=torch.get_num_threads(),
     batch_size=1,
@@ -25,12 +34,13 @@ model = AutoModel(
     device=device,
 )
 
-def transcribe_audio(file_path, vad_model="fsmn-vad", punc_model="ct-punc",
-                     batch_size=1, language="auto", use_itn=True, batch_size_s=60,
-                     merge_vad=True, merge_length_s=15, batch_size_threshold_s=50,
+def transcribe_audio(file_path, vad_model="fsmn-vad", punc_model="ct-punc", vad_kwargs='{"max_single_segment_time": 15000}',
+                     batch_size=1, language="auto", use_itn=True, batch_size_s=60,
+                     merge_vad=True, merge_length_s=15, batch_size_threshold_s=50,
                      hotword=" ", ban_emo_unk=True):
     try:
         vad_kwargs = json.loads(vad_kwargs)
+
         temp_file_path = file_path
 
         res = model.generate(
@@ -46,18 +56,9 @@ def transcribe_audio(file_path, vad_model="fsmn-vad", punc_model="ct-punc", spk_
             ban_emo_unk=ban_emo_unk
         )
 
-
-
-
-        for segment in segments:
-            start_time = segment["start"]
-            end_time = segment["end"]
-            speaker = segment.get("speaker", "unknown")
-            text = segment["text"]
-
-            transcription += f"[{start_time:.2f}s - {end_time:.2f}s] Speaker {speaker}: {text}\n"
-
-        return transcription
+        text = rich_transcription_postprocess(res[0]["text"])
+
+        return text
 
     except Exception as e:
         return str(e)
@@ -66,7 +67,6 @@ inputs = [
     gr.Audio(type="filepath"),
     gr.Textbox(value="fsmn-vad", label="VAD Model"),
     gr.Textbox(value="ct-punc", label="PUNC Model"),
-    gr.Textbox(value="cam++", label="SPK Model"),
     gr.Textbox(value='{"max_single_segment_time": 15000}', label="VAD Kwargs"),
     gr.Slider(1, 10, value=1, step=1, label="Batch Size"),
     gr.Textbox(value="auto", label="Language"),
@@ -82,8 +82,8 @@ inputs = [
 outputs = gr.Textbox(label="Transcription")
 
 gr.Interface(
-    fn=transcribe_audio,
-    inputs=inputs,
-    outputs=outputs,
-    title="ASR Transcription with
-).launch()
+    fn=transcribe_audio,
+    inputs=inputs,
+    outputs=outputs,
+    title="ASR Transcription with FunASR"
+).launch()
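
As a reading aid for the new return path (which replaces the old per-segment speaker loop), a minimal sketch of the post-processing pattern the updated transcribe_audio now follows could look like this; the input file name "sample.wav" and the reuse of the module-level model are illustrative assumptions, not part of this commit:

    # Hypothetical sketch of the step this commit switches to: take the first
    # result from model.generate and clean it with rich_transcription_postprocess
    # instead of assembling per-speaker segment lines.
    res = model.generate(
        input="sample.wav",   # assumed example file for illustration
        language="auto",
        use_itn=True,
        batch_size_s=60,
        merge_vad=True,
        merge_length_s=15,
    )
    text = rich_transcription_postprocess(res[0]["text"])
    print(text)

transcribe_audio wraps this pattern in a try/except and returns the exception text instead of raising when generation fails.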