camparchimedes committed
Commit 869e885
Parent: 1863902

Update app.py

Files changed (1): app.py +17 -14
app.py CHANGED
@@ -1,29 +1,35 @@
+# app.py
+
 import gradio as gr
 import warnings
 import torch
 from transformers import WhisperTokenizer, WhisperForConditionalGeneration, WhisperProcessor
 import soundfile as sf
-
-
+import ffmpeg
 
 warnings.filterwarnings("ignore")
 
-# Load tokenizer + model
+# Load tokenizer, model, and processor
 tokenizer = WhisperTokenizer.from_pretrained("NbAiLabBeta/nb-whisper-medium")
 model = WhisperForConditionalGeneration.from_pretrained("NbAiLabBeta/nb-whisper-medium")
 processor = WhisperProcessor.from_pretrained("NbAiLabBeta/nb-whisper-medium")
 
-# set up device
+# Set up device
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 torch_dtype = torch.float32
 
-# move model to device
+# Move model to device
 model.to(device)
 
+def convert_audio_format(audio_path):
+    output_path = "converted_audio.wav"
+    ffmpeg.input(audio_path).output(output_path, format='wav', ar='16000').run(overwrite_output=True)
+    return output_path
 
 def transcribe_audio(audio_file, batch_size=4):
-    audio_input, sample_rate = sf.read(audio_file)
-    chunk_size = 16000 * 28 # 28 seconds chunks, seems to work best
+    audio_path = convert_audio_format(audio_file)
+    audio_input, sample_rate = sf.read(audio_path)
+    chunk_size = 16000 * 28 # 28 seconds chunks
     chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)]
 
     transcription = ""
@@ -35,18 +41,15 @@ def transcribe_audio(audio_file, batch_size=4):
         with torch.no_grad():
             output = model.generate(
                 inputs.input_features,
-                max_length=1024, # Increase max_length for longer outputs
+                max_length=1024,
                 num_beams=7,
-                task="transcribe",
-                attention_mask=attention_mask,
-                forced_decoder_ids=None, # forced_decoder_ids must not be set
-                language="no"
+                attention_mask=attention_mask
             )
         transcription += " ".join(processor.batch_decode(output, skip_special_tokens=True)) + " "
 
     return transcription.strip()
 
-# HTML |banner image
+# HTML | Banner image
 banner_html = """
 <div style="text-align: center;">
     <img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/Olas%20AudioSwitch%20Shop.png" alt="Banner" width="87%" height="auto">
@@ -70,4 +73,4 @@ with iface:
     transcribe_button.click(fn=transcribe_audio, inputs=[audio_input, batch_size_input], outputs=transcription_output)
 
 # Launch interface
-iface.launch(share=True, debug=True)
+iface.launch(share=True, debug=True)
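A note on the new dependency: `import ffmpeg` refers to the ffmpeg-python bindings (installed with `pip install ffmpeg-python` and requiring an `ffmpeg` binary on PATH), not the unrelated `ffmpeg` package on PyPI. A minimal standalone sketch of the same conversion call, assuming a hypothetical input file `sample.mp3`:

import ffmpeg
import soundfile as sf

# Re-encode an arbitrary input container to 16 kHz WAV, as convert_audio_format does
ffmpeg.input("sample.mp3").output("converted_audio.wav", format="wav", ar="16000").run(overwrite_output=True)

audio, sr = sf.read("converted_audio.wav")
print(sr, audio.shape)  # expect sr == 16000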
 
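The hunks skip the loop body that defines `inputs` and `attention_mask` (new-file lines 36-40 fall between them), so the diff alone does not show where those names come from. A minimal sketch of what that batching loop could look like, assuming the processor is asked to return an attention mask; the actual lines in app.py may differ:

transcription = ""
for i in range(0, len(chunks), batch_size):
    batch = chunks[i:i + batch_size]
    # Hypothetical reconstruction: pad each chunk batch to Whisper's fixed
    # 30-second log-mel window and request the matching attention mask
    inputs = processor(batch, sampling_rate=16000, return_tensors="pt",
                       return_attention_mask=True).to(device)
    attention_mask = inputs.attention_mask
    with torch.no_grad():
        output = model.generate(
            inputs.input_features,
            max_length=1024,
            num_beams=7,
            attention_mask=attention_mask
        )
    transcription += " ".join(processor.batch_decode(output, skip_special_tokens=True)) + " "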