Spaces:

Baghdad99
/

ha-en

Sleeping

Baghdad99 committed on Dec 21, 2023

Commit

3077d90

•

1 Parent(s): 2de6d50

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -3,13 +3,13 @@ from transformers import pipeline, AutoTokenizer
 import numpy as np
 from pydub import AudioSegment
 import librosa
-# Load the pipeline for speech recognition and translation
-pipe = pipeline(
-    "automatic-speech-recognition",
-    model="Akashpb13/Hausa_xlsr",
-    tokenizer="Akashpb13/Hausa_xlsr"
-)
 translator = pipeline("text2text-generation", model="Baghdad99/saad-hausa-text-to-english-text")
 tts = pipeline("text-to-speech", model="Baghdad99/english_voice_tts")
@@ -17,6 +17,18 @@ def translate_speech(audio_input):
     # Load the audio file as a floating point time series
     audio_data, sample_rate = librosa.load(audio_input, sr=None)
     # Use the speech recognition pipeline to transcribe the audio
     output = pipe(audio_data)

 import numpy as np
 from pydub import AudioSegment
 import librosa
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+# Load the model and processor
+model = Wav2Vec2ForCTC.from_pretrained("Akashpb13/Hausa_xlsr")
+processor = Wav2Vec2Processor.from_pretrained("Akashpb13/Hausa_xlsr")
 translator = pipeline("text2text-generation", model="Baghdad99/saad-hausa-text-to-english-text")
 tts = pipeline("text-to-speech", model="Baghdad99/english_voice_tts")
     # Load the audio file as a floating point time series
     audio_data, sample_rate = librosa.load(audio_input, sr=None)
+    # Prepare the input dictionary
+    input_dict = processor(audio_data, return_tensors="pt", padding=True)
+    # Use the model to get the logits
+    logits = model(input_dict.input_values.to("cuda")).logits
+    # Get the predicted IDs
+    pred_ids = torch.argmax(logits, dim=-1)[0]
+    # Decode the predicted IDs to get the transcription
+    transcription = processor.decode(pred_ids)
     # Use the speech recognition pipeline to transcribe the audio
     output = pipe(audio_data)