TaiYouWeb committed on
Commit
6395f19
1 Parent(s): b1d5ba9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -23
app.py CHANGED
@@ -2,12 +2,22 @@ from funasr import AutoModel
2
  from funasr.utils.postprocess_utils import rich_transcription_postprocess
3
  from modelscope import snapshot_download
4
 
 
 
 
 
 
5
  import json
 
 
6
  import torch
7
  import gradio as gr
8
 
9
  from config import model_config
10
 
 
 
 
11
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
12
  model_dir = snapshot_download(model_config['model_dir'])
13
 
@@ -16,8 +26,7 @@ model = AutoModel(
16
  trust_remote_code=False,
17
  remote_code="./model.py",
18
  vad_model="fsmn-vad",
19
- punc_model="ct-punc",
20
- spk_model="cam++",
21
  vad_kwargs={"max_single_segment_time": 15000},
22
  ncpu=torch.get_num_threads(),
23
  batch_size=1,
@@ -25,12 +34,13 @@ model = AutoModel(
25
  device=device,
26
  )
27
 
28
- def transcribe_audio(file_path, vad_model="fsmn-vad", punc_model="ct-punc", spk_model="cam++", vad_kwargs='{"max_single_segment_time": 15000}',
29
- batch_size=1, language="auto", use_itn=True, batch_size_s=60,
30
- merge_vad=True, merge_length_s=15, batch_size_threshold_s=50,
31
  hotword=" ", ban_emo_unk=True):
32
  try:
33
  vad_kwargs = json.loads(vad_kwargs)
 
34
  temp_file_path = file_path
35
 
36
  res = model.generate(
@@ -46,18 +56,9 @@ def transcribe_audio(file_path, vad_model="fsmn-vad", punc_model="ct-punc", spk_
46
  ban_emo_unk=ban_emo_unk
47
  )
48
 
49
- segments = res[0]["segments"]
50
- transcription = ""
51
-
52
- for segment in segments:
53
- start_time = segment["start"]
54
- end_time = segment["end"]
55
- speaker = segment.get("speaker", "unknown")
56
- text = segment["text"]
57
-
58
- transcription += f"[{start_time:.2f}s - {end_time:.2f}s] Speaker {speaker}: {text}\n"
59
-
60
- return transcription
61
 
62
  except Exception as e:
63
  return str(e)
@@ -66,7 +67,6 @@ inputs = [
66
  gr.Audio(type="filepath"),
67
  gr.Textbox(value="fsmn-vad", label="VAD Model"),
68
  gr.Textbox(value="ct-punc", label="PUNC Model"),
69
- gr.Textbox(value="cam++", label="SPK Model"),
70
  gr.Textbox(value='{"max_single_segment_time": 15000}', label="VAD Kwargs"),
71
  gr.Slider(1, 10, value=1, step=1, label="Batch Size"),
72
  gr.Textbox(value="auto", label="Language"),
@@ -82,8 +82,8 @@ inputs = [
82
  outputs = gr.Textbox(label="Transcription")
83
 
84
  gr.Interface(
85
- fn=transcribe_audio,
86
- inputs=inputs,
87
- outputs=outputs,
88
- title="ASR Transcription with Speaker Diarization and Timestamps"
89
- ).launch()
 
2
  from funasr.utils.postprocess_utils import rich_transcription_postprocess
3
  from modelscope import snapshot_download
4
 
5
+ import datetime
6
+ import math
7
+ import io
8
+ import os
9
+ import tempfile
10
  import json
11
+ from typing import Optional
12
+
13
  import torch
14
  import gradio as gr
15
 
16
  from config import model_config
17
 
18
+
19
+
20
+
21
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
22
  model_dir = snapshot_download(model_config['model_dir'])
23
 
 
26
  trust_remote_code=False,
27
  remote_code="./model.py",
28
  vad_model="fsmn-vad",
29
+ punc_model="ct-punc",
 
30
  vad_kwargs={"max_single_segment_time": 15000},
31
  ncpu=torch.get_num_threads(),
32
  batch_size=1,
 
34
  device=device,
35
  )
36
 
37
+ def transcribe_audio(file_path, vad_model="fsmn-vad", punc_model="ct-punc", vad_kwargs='{"max_single_segment_time": 15000}',
38
+ batch_size=1, language="auto", use_itn=True, batch_size_s=60,
39
+ merge_vad=True, merge_length_s=15, batch_size_threshold_s=50,
40
  hotword=" ", ban_emo_unk=True):
41
  try:
42
  vad_kwargs = json.loads(vad_kwargs)
43
+
44
  temp_file_path = file_path
45
 
46
  res = model.generate(
 
56
  ban_emo_unk=ban_emo_unk
57
  )
58
 
59
+ text = rich_transcription_postprocess(res[0]["text"])
60
+
61
+ return text
 
 
 
 
 
 
 
 
 
62
 
63
  except Exception as e:
64
  return str(e)
 
67
  gr.Audio(type="filepath"),
68
  gr.Textbox(value="fsmn-vad", label="VAD Model"),
69
  gr.Textbox(value="ct-punc", label="PUNC Model"),
 
70
  gr.Textbox(value='{"max_single_segment_time": 15000}', label="VAD Kwargs"),
71
  gr.Slider(1, 10, value=1, step=1, label="Batch Size"),
72
  gr.Textbox(value="auto", label="Language"),
 
82
  outputs = gr.Textbox(label="Transcription")
83
 
84
  gr.Interface(
85
+ fn=transcribe_audio,
86
+ inputs=inputs,
87
+ outputs=outputs,
88
+ title="ASR Transcription with FunASR"
89
+ ).launch()