"""Gradio demo: identify the spoken language of an uploaded audio file.

Loads a fine-tuned audio-classification checkpoint from the Hugging Face Hub
at import time, then serves a small web UI whose single input is an audio
file and whose output is the most likely language label with its probability.
"""

import os
from typing import Dict, Optional

import gradio as gr
import librosa
import numpy as np
import torch
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification

model_id = "AescF/hubert-base-ls960-finetuned-common_language"

# Sampling rate the audio is resampled to before it is handed to the model.
SAMPLING_RATE = 16000

# The Auto* classes resolve the concrete architecture from the checkpoint's
# own config, so no architecture-specific class name is hard-coded here.
processor = AutoFeatureExtractor.from_pretrained(model_id)
model = AutoModelForAudioClassification.from_pretrained(model_id)

# Maps the classifier's output index to a human-readable language name.
# NOTE(review): assumed to match the checkpoint's label order - confirm
# against `model.config.id2label`.
language_classes = {
    0: "Arabic",
    1: "Basque",
    2: "Breton",
    3: "Catalan",
    4: "Chinese_China",
    5: "Chinese_Hongkong",
    6: "Chinese_Taiwan",
    7: "Chuvash",
    8: "Czech",
    9: "Dhivehi",
    10: "Dutch",
    11: "English",
    12: "Esperanto",
    13: "Estonian",
    14: "French",
    15: "Frisian",
    16: "Georgian",
    17: "German",
    18: "Greek",
    19: "Hakha_Chin",
    20: "Indonesian",
    21: "Interlingua",
    22: "Italian",
    23: "Japanese",
    24: "Kabyle",
    25: "Kinyarwanda",
    26: "Kyrgyz",
    27: "Latvian",
    28: "Maltese",
    29: "Mongolian",
    30: "Persian",
    31: "Polish",
    32: "Portuguese",
    33: "Romanian",
    34: "Romansh_Sursilvan",
    35: "Russian",
    36: "Sakha",
    37: "Slovenian",
    38: "Spanish",
    39: "Swedish",
    40: "Tamil",
    41: "Tatar",
    42: "Turkish",
    43: "Ukranian",
    44: "Welsh",
}


def predict_language(audio: Optional[str]) -> Optional[Dict[str, float]]:
    """Classify the language spoken in an audio file.

    Args:
        audio: Path to the audio file, or ``None`` when no file is selected.

    Returns:
        A one-entry mapping ``{language_name: probability}`` for the most
        likely class, or ``None`` when no audio was supplied.
    """
    # The interface re-runs on every input change, including when the audio
    # widget is cleared; without this guard `librosa.load(None)` would raise.
    if not audio:
        return None

    # Decode the file and resample it to the rate passed to the extractor.
    audio_input, _ = librosa.load(audio, sr=SAMPLING_RATE)

    # Passing the sampling rate explicitly lets the feature extractor check
    # it against the rate it was configured with.
    input_values = processor(
        audio_input,
        sampling_rate=SAMPLING_RATE,
        return_tensors="pt",
        padding=True,
    ).input_values

    # Inference only: skip gradient bookkeeping.
    with torch.no_grad():
        logits = model(input_values).logits

    # Only one clip is processed per call, so take the first (only) row.
    probabilities = torch.softmax(logits, dim=-1)[0]
    predicted_language_idx = int(torch.argmax(probabilities).item())

    return {
        language_classes[predicted_language_idx]: float(
            probabilities[predicted_language_idx]
        )
    }


iface = gr.Interface(
    predict_language,
    # Top-level component classes replace the deprecated `gr.inputs` /
    # `gr.outputs` namespaces.
    inputs=gr.Audio(type="filepath", label="Upload Language Audio file"),
    outputs=gr.Label(),
    title="Language Classifier",
    live=True,
)

# Parent of the current working directory; not referenced elsewhere in this
# script but kept as a module-level name.
script_dir = os.path.abspath(os.path.join(os.path.abspath(''), os.pardir))

iface.launch()