Chris Bracegirdle committed on
Commit
38db600
1 Parent(s): 9fbfc52
Files changed (1) hide show
  1. app.py +7 -14
app.py CHANGED
@@ -4,10 +4,9 @@ import torch
4
  import librosa
5
  import json
6
  # Load model directly
7
- from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
8
 
9
- processor = AutoProcessor.from_pretrained("dmatekenya/whisper-large-v3-chichewa")
10
- model = AutoModelForSpeechSeq2Seq.from_pretrained("dmatekenya/whisper-large-v3-chichewa")
11
 
12
  def transcribe(audio_file_mic=None, audio_file_upload=None, language="English (eng)"):
13
  if audio_file_mic:
@@ -18,23 +17,17 @@ def transcribe(audio_file_mic=None, audio_file_upload=None, language="English (e
18
  return "Please upload an audio file or record one"
19
 
20
  # Make sure audio is 16kHz
21
- speech, sample_rate = librosa.load(audio_file)
22
- if sample_rate != 16000:
23
- speech = librosa.resample(speech, orig_sr=sample_rate, target_sr=16000)
24
 
25
  # Keep the same model in memory and simply switch out the language adapters by calling load_adapter() for the model and set_target_lang() for the tokenizer
26
  # language_code = iso_codes[language]
27
  # processor.tokenizer.set_target_lang(language_code)
28
  # model.load_adapter(language_code)
29
 
30
- inputs = processor(speech, sampling_rate=16_000, return_tensors="pt")
31
-
32
- with torch.no_grad():
33
- outputs = model(**inputs).logits
34
-
35
- ids = torch.argmax(outputs, dim=-1)[0]
36
- transcription = processor.decode(ids)
37
- return transcription
38
 
39
 
40
  description = ''''''
 
4
  import librosa
5
  import json
6
  # Load model directly
7
+ from transformers import pipeline, AutoProcessor, AutoModelForSpeechSeq2Seq
8
 
9
+ pipe = pipeline("automatic-speech-recognition", model="dmatekenya/whisper-large-v3-chichewa")
 
10
 
11
  def transcribe(audio_file_mic=None, audio_file_upload=None, language="English (eng)"):
12
  if audio_file_mic:
 
17
  return "Please upload an audio file or record one"
18
 
19
  # Make sure audio is 16kHz
20
+ # speech, sample_rate = librosa.load(audio_file)
21
+ # if sample_rate != 16000:
22
+ # speech = librosa.resample(speech, orig_sr=sample_rate, target_sr=16000)
23
 
24
  # Keep the same model in memory and simply switch out the language adapters by calling load_adapter() for the model and set_target_lang() for the tokenizer
25
  # language_code = iso_codes[language]
26
  # processor.tokenizer.set_target_lang(language_code)
27
  # model.load_adapter(language_code)
28
 
29
+ result = pipe(audio_file)
30
+ return result["text"]
 
 
 
 
 
 
31
 
32
 
33
  description = ''''''