File size: 2,734 Bytes
5473c42
 
 
 
 
e59b0bd
a7fd32e
5473c42
e59b0bd
 
 
 
 
 
 
 
 
5473c42
 
 
a7fd32e
 
5473c42
e59b0bd
5473c42
 
 
 
 
e59b0bd
5473c42
 
 
 
 
 
 
 
 
e59b0bd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5473c42
e59b0bd
5473c42
 
e59b0bd
5473c42
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import gradio as gr
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch
import phonemizer
import librosa
import math
import io
import base64
from strsimpy.jaro_winkler import JaroWinkler

# base64 to audio βœ…
# audio to transcription βœ…
# audio to text βœ…
# text to phoneme βœ…
# accuracy = jarowinkler(transcription, phoneme) βœ…
# band = getBandFromAccuracy(accuracy)  βœ…
# return accuracy, band βœ…


def lark(audioAsB64):
    # base64 to wav data conversion
    wav_data = base64.b64decode(audioAsB64.encode("utf-8"))

    # audio to transcription
    processor = Wav2Vec2Processor.from_pretrained(
        "facebook/wav2vec2-xlsr-53-espeak-cv-ft"
    )
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-xlsr-53-espeak-cv-ft")

    waveform, sample_rate = librosa.load(io.BytesIO(wav_data), sr=16000)

    input_values = processor(
        waveform, sampling_rate=sample_rate, return_tensors="pt"
    ).input_values

    with torch.no_grad():
        logits = model(input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    speechToPhonemeTranscription = processor.batch_decode(predicted_ids)[0]

    # audio to text
    processorSTT = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

    input_values = processorSTT(
        waveform, sampling_rate=sample_rate, return_tensors="pt"
    ).input_values

    logits = model(input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    speechToTextTranscripition = processor.batch_decode(predicted_ids)[0]

    # text to phoneme
    graphemeToPhonemeTranscription = phonemizer.phonemize(speechToTextTranscripition)

    # accuracy = jaroWinkler(transcription, phoneme)

    jarowinkler = JaroWinkler()
    similarity_score = jarowinkler.similarity(
        speechToPhonemeTranscription, graphemeToPhonemeTranscription
    )

    # ielts pronunciation band estimation
    def getBandFromSimilarityScore(similarity_score):
        if similarity_score >= 0.91:
            return 9
        elif similarity_score >= 0.81:
            return 8
        elif similarity_score >= 0.73:
            return 7
        elif similarity_score >= 0.65:
            return 6
        elif similarity_score >= 0.60:
            return 5
        elif similarity_score >= 0.46:
            return 4
        elif similarity_score >= 0.35:
            return 3
        elif similarity_score >= 0.1:
            return 2
        else:
            return 1

    IELTSband = getBandFromSimilarityScore(similarity_score)

    return [similarity_score, IELTSband]


# Minimal Gradio UI around lark: one text box takes the base64 audio
# payload; two text boxes show the similarity score and the IELTS band.
demo = gr.Interface(fn=lark, inputs="text", outputs=["text", "text"])
demo.launch()