Spaces:
Sleeping
Sleeping
File size: 2,734 Bytes
5473c42 e59b0bd a7fd32e 5473c42 e59b0bd 5473c42 a7fd32e 5473c42 e59b0bd 5473c42 e59b0bd 5473c42 e59b0bd 5473c42 e59b0bd 5473c42 e59b0bd 5473c42 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
import gradio as gr
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch
import phonemizer
import librosa
import math
import io
import base64
from strsimpy.jaro_winkler import JaroWinkler
# Pipeline checklist:
# base64 to audio ✓
# audio to transcription ✓
# audio to text ✓
# text to phoneme ✓
# accuracy = jarowinkler(transcription, phoneme) ✓
# band = getBandFromAccuracy(accuracy) ✓
# return accuracy, band ✓
def lark(audioAsB64):
    """Score the pronunciation of a base64-encoded WAV recording.

    Pipeline:
      1. Decode the base64 payload into raw WAV bytes.
      2. Transcribe the audio straight to phonemes (wav2vec2 espeak model):
         what the speaker actually said, phonetically.
      3. Transcribe the audio to English text (wav2vec2 960h model), then
         phonemize that text: the "expected" pronunciation of those words.
      4. Compare the two phoneme strings with Jaro-Winkler similarity.
      5. Map the similarity onto an approximate IELTS pronunciation band.

    Args:
        audioAsB64: str, base64-encoded WAV audio (16 kHz is not required;
            librosa resamples to 16 kHz, which both wav2vec2 models expect).

    Returns:
        [similarity_score, IELTSband]: float in [0, 1] and int band 1-9.

    NOTE: both models are re-downloaded/loaded on every call; acceptable for
    a demo Space, but hoist them to module level for real traffic.
    """
    # base64 to wav data conversion
    wav_data = base64.b64decode(audioAsB64.encode("utf-8"))
    # audio to phoneme transcription
    processor = Wav2Vec2Processor.from_pretrained(
        "facebook/wav2vec2-xlsr-53-espeak-cv-ft"
    )
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")
    waveform, sample_rate = librosa.load(io.BytesIO(wav_data), sr=16000)
    input_values = processor(
        waveform, sampling_rate=sample_rate, return_tensors="pt"
    ).input_values
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    speechToPhonemeTranscription = processor.batch_decode(predicted_ids)[0]
    # audio to text
    processorSTT = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
    input_values = processorSTT(
        waveform, sampling_rate=sample_rate, return_tensors="pt"
    ).input_values
    # FIX: run inference without autograd, matching the first pass above.
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    # FIX: decode with processorSTT — the original used the espeak
    # processor, whose vocabulary does not match the 960h model's ids
    # and therefore produced a garbled transcription.
    speechToTextTranscription = processorSTT.batch_decode(predicted_ids)[0]
    # text to phoneme
    graphemeToPhonemeTranscription = phonemizer.phonemize(speechToTextTranscription)
    # accuracy = jaroWinkler(transcription, phoneme)
    jarowinkler = JaroWinkler()
    similarity_score = jarowinkler.similarity(
        speechToPhonemeTranscription, graphemeToPhonemeTranscription
    )

    # ielts pronunciation band estimation
    def getBandFromSimilarityScore(similarity_score):
        """Map a Jaro-Winkler similarity in [0, 1] to an IELTS band 1-9."""
        if similarity_score >= 0.91:
            return 9
        elif similarity_score >= 0.81:
            return 8
        elif similarity_score >= 0.73:
            return 7
        elif similarity_score >= 0.65:
            return 6
        elif similarity_score >= 0.60:
            return 5
        elif similarity_score >= 0.46:
            return 4
        elif similarity_score >= 0.35:
            return 3
        elif similarity_score >= 0.1:
            return 2
        else:
            return 1

    IELTSband = getBandFromSimilarityScore(similarity_score)
    return [similarity_score, IELTSband]
# Gradio UI: a single text box takes the base64-encoded WAV; the two text
# outputs show the Jaro-Winkler similarity score and the estimated IELTS band.
iface = gr.Interface(fn=lark, inputs="text", outputs=["text", "text"])
iface.launch()
|