File size: 2,605 Bytes
7f53721
 
 
fc74109
 
7f53721
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fc74109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc090d1
573776a
fc74109
 
 
 
573776a
fc74109
 
 
1c21e30
db14524
 
 
 
 
 
 
cc090d1
84cc55e
fc74109
 
 
573776a
fc74109
 
 
7f53721
fc74109
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import gradio as gr
import librosa
import numpy as np
import torch
from transformers import pipeline


# Index -> language-name mapping for the 45 classes of the Common Language
# dataset the model was fine-tuned on. Kept as a dict (not a list) to
# preserve the original public interface. NOTE: currently unused by the
# rest of the script — the pipeline returns label strings directly.
language_classes = {
    0: "Arabic",
    1: "Basque",
    2: "Breton",
    3: "Catalan",
    4: "Chinese_China",
    5: "Chinese_Hongkong",
    6: "Chinese_Taiwan",
    7: "Chuvash",
    8: "Czech",
    9: "Dhivehi",
    10: "Dutch",
    11: "English",
    12: "Esperanto",
    13: "Estonian",
    14: "French",
    15: "Frisian",
    16: "Georgian",
    17: "German",
    18: "Greek",
    19: "Hakha_Chin",
    20: "Indonesian",
    21: "Interlingua",
    22: "Italian",
    23: "Japanese",
    24: "Kabyle",
    25: "Kinyarwanda",
    26: "Kyrgyz",
    27: "Latvian",
    28: "Maltese",
    29: "Mongolian",
    30: "Persian",
    31: "Polish",
    32: "Portuguese",
    33: "Romanian",
    34: "Romansh_Sursilvan",
    35: "Russian",
    36: "Sakha",
    37: "Slovenian",
    38: "Spanish",
    39: "Swedish",
    40: "Tamil",
    41: "Tatar",
    42: "Turkish",
    43: "Ukrainian",  # fixed typo: was "Ukranian"
    44: "Welsh",
}




# Hugging Face Hub identity and checkpoint to load. The model is a HuBERT
# base (LS-960) fine-tuned on the `common_language` dataset for spoken
# language identification.
username = "AescF"  ## Complete your username
model_id = "AescF/hubert-base-ls960-finetuned-common_language"
# Prefer the first CUDA device when available; otherwise run on CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Module-level side effect: downloads/loads the checkpoint at import time.
pipe = pipeline("audio-classification", model=model_id, device=device)

# def predict_trunc(filepath):
#     preprocessed = pipe.preprocess(filepath)
#     truncated = pipe.feature_extractor.pad(preprocessed,truncation=True, max_length = 16_000*30)
#     model_outputs = pipe.forward(truncated)
#     outputs = pipe.postprocess(model_outputs)

#     return outputs


def classify_audio(filepath):
    """Classify the spoken language of an audio file.

    Args:
        filepath: Path to the audio file selected/recorded in the Gradio
            ``Audio`` component (``type="filepath"``).

    Returns:
        Dict mapping each predicted language label to its confidence
        score, in the shape ``gr.Label`` expects.
    """
    preds = pipe(filepath)
    # The pipeline yields [{"label": ..., "score": ...}, ...]; flatten it
    # into a label -> score mapping.
    return {p["label"]: p["score"] for p in preds}


# --- Gradio UI definition -------------------------------------------------
title = "Language Classification Model"
description = (
    "This model has been fine-tuned on a dataset containing various languages\n"
    "including Arabic, Basque, Catalan, Chinese, English, French, German, Japanese, Russian, and more.\n"
    "It is designed for audio classification, allowing it to predict the language spoken in a given audio clip.\n"
    "Try it out by uploading an audio sample and see how accurately it can identify the language being spoken!\n"
    "For more info, check out [GITHUB](https://github.com/AEscF)"
)
# Example clips shipped alongside the app; each example must be wrapped in a
# list because gr.Interface expects one entry per input component.
# NOTE(review): assumes these .wav files exist next to this script — verify.
filenames = ['EN_0049.wav', "FR_0098.wav", "JP_0222.wav",]
filenames = [[f"./{f}"] for f in filenames]
demo = gr.Interface(
    fn=classify_audio,
    inputs=gr.Audio(type="filepath"),  # hands classify_audio a file path
    outputs=[gr.Label(label="Predictions")],  # renders the label->score dict
    title=title,
    description=description,
    examples=filenames,
)
# Module-level side effect: starts the Gradio server when the file is run.
demo.launch()