test / app.py
Aryan Wadhawan
lotta changes
a7fd32e
raw
history blame
994 Bytes
import gradio as gr
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch
import phonemizer
import librosa
import io
import base64
def lark(audioAsB64):
# base64 to wav data conversion
wav_data = base64.b64decode(audioAsB64.encode("utf-8"))
# processing
processor = Wav2Vec2Processor.from_pretrained(
"facebook/wav2vec2-xlsr-53-espeak-cv-ft"
)
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
waveform, sample_rate = librosa.load(
io.BytesIO(wav_data), sr=16000
) # Downsample 44.1kHz to 8kHz
input_values = processor(
waveform, sampling_rate=sample_rate, return_tensors="pt"
).input_values
with torch.no_grad():
logits = model(input_values).logits
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)
return transcription
# Expose `lark` through a text-in / text-out Gradio interface and start serving.
iface = gr.Interface(
    fn=lark,
    inputs="text",
    outputs="text",
)
iface.launch()