File size: 2,130 Bytes
7f53721
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import os

import gradio as gr
import librosa
import numpy as np
import torch
from transformers import Wav2Vec2ForClassification, Wav2Vec2Processor


# Checkpoint identifier handed to both from_pretrained calls below.
model_id = "AescF/hubert-base-ls960-finetuned-common_language"
# The processor and the model are created once, at import time, and are then
# reused by every predict_language call.
# NOTE(review): the checkpoint name says "hubert" while Wav2Vec2* classes are
# used to load it — confirm these classes are compatible with the checkpoint.
# NOTE(review): verify that `Wav2Vec2ForClassification` exists in the installed
# transformers release; the class I know of for this task is
# `Wav2Vec2ForSequenceClassification` — TODO confirm.
processor = Wav2Vec2Processor.from_pretrained(model_id)
model = Wav2Vec2ForClassification.from_pretrained(model_id)
# Lookup table from class index to language name; predict_language indexes it
# with the position of the highest-probability logit. The position of each
# name in the tuple below IS its class index, so the order must not change.
# NOTE(review): "Ukranian" is kept exactly as spelled because it is emitted at
# runtime; presumably it mirrors the training labels — verify before changing.
language_classes = dict(
    enumerate(
        (
            "Arabic",
            "Basque",
            "Breton",
            "Catalan",
            "Chinese_China",
            "Chinese_Hongkong",
            "Chinese_Taiwan",
            "Chuvash",
            "Czech",
            "Dhivehi",
            "Dutch",
            "English",
            "Esperanto",
            "Estonian",
            "French",
            "Frisian",
            "Georgian",
            "German",
            "Greek",
            "Hakha_Chin",
            "Indonesian",
            "Interlingua",
            "Italian",
            "Japanese",
            "Kabyle",
            "Kinyarwanda",
            "Kyrgyz",
            "Latvian",
            "Maltese",
            "Mongolian",
            "Persian",
            "Polish",
            "Portuguese",
            "Romanian",
            "Romansh_Sursilvan",
            "Russian",
            "Sakha",
            "Slovenian",
            "Spanish",
            "Swedish",
            "Tamil",
            "Tatar",
            "Turkish",
            "Ukranian",
            "Welsh",
        )
    )
)


def predict_language(audio):
    """Classify the spoken language of an audio file.

    Args:
        audio: Path of the audio file to classify, as accepted by
            ``librosa.load``. ``None`` (no file supplied yet, e.g. after the
            input widget is cleared in live mode) is accepted.

    Returns:
        A one-entry dict mapping the top-scoring language name (looked up in
        ``language_classes``) to its softmax probability as a ``float``, or
        ``None`` when ``audio`` is ``None``.
    """
    # Nothing to classify: return early instead of crashing in librosa.load.
    if audio is None:
        return None

    # Decode the file, resampled to 16 kHz.
    waveform, sr = librosa.load(audio, sr=16000)

    # Turn the raw waveform into model inputs. The sampling rate is passed
    # explicitly so the processor can check it rather than assume it.
    input_values = processor(
        waveform, sampling_rate=sr, return_tensors="pt", padding=True
    ).input_values

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        logits = model(input_values).logits

    # Only one clip is processed per call, so take row 0 of the batch.
    probabilities = torch.softmax(logits, dim=1)[0]
    top_idx = int(torch.argmax(probabilities))

    return {language_classes[top_idx]: float(probabilities[top_idx])}

# Web UI: a single audio-file input wired to predict_language, whose returned
# {label: confidence} mapping is shown by a Label output component.
# NOTE(review): the gr.inputs / gr.outputs namespaces are legacy in newer
# Gradio releases — confirm against the pinned Gradio version.
iface = gr.Interface(
    fn=predict_language,
    inputs=gr.inputs.Audio(label="Upload Language Audio file", type="filepath"),
    outputs=gr.outputs.Label(),
    live=True,  # live mode: no explicit submit step is needed
    title="Language Classifier",
)

# Absolute path of the parent of the current working directory. Nothing in
# the visible part of this file reads it.
script_dir = os.path.abspath(os.path.join(os.path.abspath(""), os.pardir))

# Start serving the interface.
iface.launch()