Baghdad99 committed on
Commit
3077d90
1 Parent(s): 2de6d50

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -6
app.py CHANGED
@@ -3,13 +3,13 @@ from transformers import pipeline, AutoTokenizer
3
  import numpy as np
4
  from pydub import AudioSegment
5
  import librosa
 
 
 
 
 
 
6
 
7
- # Load the pipeline for speech recognition and translation
8
- pipe = pipeline(
9
- "automatic-speech-recognition",
10
- model="Akashpb13/Hausa_xlsr",
11
- tokenizer="Akashpb13/Hausa_xlsr"
12
- )
13
  translator = pipeline("text2text-generation", model="Baghdad99/saad-hausa-text-to-english-text")
14
  tts = pipeline("text-to-speech", model="Baghdad99/english_voice_tts")
15
 
@@ -17,6 +17,18 @@ def translate_speech(audio_input):
17
  # Load the audio file as a floating point time series
18
  audio_data, sample_rate = librosa.load(audio_input, sr=None)
19
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  # Use the speech recognition pipeline to transcribe the audio
21
  output = pipe(audio_data)
22
 
 
3
  import numpy as np
4
  from pydub import AudioSegment
5
  import librosa
6
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
7
+
8
+ # Load the model and processor
9
+ model = Wav2Vec2ForCTC.from_pretrained("Akashpb13/Hausa_xlsr")
10
+ processor = Wav2Vec2Processor.from_pretrained("Akashpb13/Hausa_xlsr")
11
+
12
 
 
 
 
 
 
 
13
  translator = pipeline("text2text-generation", model="Baghdad99/saad-hausa-text-to-english-text")
14
  tts = pipeline("text-to-speech", model="Baghdad99/english_voice_tts")
15
 
 
17
  # Load the audio file as a floating point time series
18
  audio_data, sample_rate = librosa.load(audio_input, sr=None)
19
 
20
+ # Prepare the input dictionary
21
+ input_dict = processor(audio_data, return_tensors="pt", padding=True)
22
+
23
+ # Use the model to get the logits
24
+ logits = model(input_dict.input_values.to("cuda")).logits
25
+
26
+ # Get the predicted IDs
27
+ pred_ids = torch.argmax(logits, dim=-1)[0]
28
+
29
+ # Decode the predicted IDs to get the transcription
30
+ transcription = processor.decode(pred_ids)
31
+
32
  # Use the speech recognition pipeline to transcribe the audio
33
  output = pipe(audio_data)
34