# ASR-Arabic / app.py
# muzammil-eds's picture
# Update app.py
# f36d376 verified
import streamlit as st
import requests
import Levenshtein
import time
from io import BytesIO
import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from audio_recorder_streamlit import audio_recorder
@st.cache_resource
def load_model():
    """Load the Arabic wav2vec2 processor and CTC model.

    Both objects are fetched from the same pretrained checkpoint. The
    function is wrapped in ``st.cache_resource``.

    Returns:
        A ``(processor, model)`` tuple.
    """
    checkpoint = "jonatasgrosman/wav2vec2-large-xlsr-53-arabic"
    # Tuple elements are evaluated left to right: processor first, then model.
    return (
        Wav2Vec2Processor.from_pretrained(checkpoint),
        Wav2Vec2ForCTC.from_pretrained(checkpoint),
    )
# Module-level handles used by transcribe_audio_hf(). load_model() is wrapped
# in st.cache_resource, so this call is made on every script run.
processor, model = load_model()
def transcribe_audio_hf(audio_bytes):
    """Transcribe one audio clip with the module-level wav2vec2 model.

    Args:
        audio_bytes: Encoded audio as a bytes object; it is wrapped in a
            ``BytesIO`` and decoded by ``librosa.load`` at 16 kHz.

    Returns:
        The decoded text of the clip with surrounding whitespace removed.
    """
    waveform, rate = librosa.load(BytesIO(audio_bytes), sr=16000)
    features = processor(
        waveform,
        sampling_rate=rate,
        return_tensors="pt",
        padding=True,
    )
    # Inference only: skip gradient tracking, then pick the highest-scoring
    # token at every frame.
    with torch.no_grad():
        frame_scores = model(features.input_values).logits
    token_ids = frame_scores.argmax(dim=-1)
    decoded = processor.batch_decode(token_ids)
    return decoded[0].strip()
def levenshtein_similarity(transcription1, transcription2):
    """Return a length-normalized Levenshtein similarity of two strings.

    The edit distance is divided by the length of the longer string and
    subtracted from 1, so identical strings score 1.0.

    Args:
        transcription1: First string.
        transcription2: Second string.

    Returns:
        ``1 - distance / max(len(transcription1), len(transcription2))``,
        or ``1.0`` when both strings are empty.
    """
    max_len = max(len(transcription1), len(transcription2))
    if max_len == 0:
        # Two empty strings are identical; without this guard the
        # normalization below would divide by zero.
        return 1.0
    distance = Levenshtein.distance(transcription1, transcription2)
    return 1 - distance / max_len  # Normalize to get similarity score
def evaluate_audio_similarity(original_audio_bytes, user_audio_bytes):
    """Transcribe both clips and score how closely the transcriptions match.

    Args:
        original_audio_bytes: Audio bytes of the reference clip.
        user_audio_bytes: Audio bytes of the clip being compared.

    Returns:
        A 3-tuple ``(original transcription, user transcription,
        similarity score)``, where the score comes from
        ``levenshtein_similarity``.
    """
    # The reference clip is transcribed first, then the user's clip.
    reference_text, attempt_text = [
        transcribe_audio_hf(clip)
        for clip in (original_audio_bytes, user_audio_bytes)
    ]
    score = levenshtein_similarity(reference_text, attempt_text)
    return reference_text, attempt_text, score
st.title("Audio Transcription and Similarity Checker")

# The 'initialized' flag in the session state selects which view is drawn.
# It is created as False when it is not already present.
view_state = st.session_state
if 'initialized' not in view_state:
    view_state['initialized'] = False

# While the flag is unset, show a single "loader" recorder; any truthy result
# from it flips the flag so the real recorders are shown.
if not view_state['initialized']:
    st.write("Click the Loader below to initialize the audio recorders.")
    loader_settings = dict(
        text="Click to Initialize",
        recording_color="#e8b62c",
        neutral_color="#6aa36f",
        pause_threshold=0.2,
        icon_name="play-circle",  # A play icon to signify starting the initialization
        icon_size="4x",
        auto_start=False,
    )
    loader_clip = audio_recorder(**loader_settings)
    if loader_clip:
        view_state['initialized'] = True
def _capture_audio(upload_label, spinner_text, *, recording_color, neutral_color, icon_name):
    """Render a recorder with an upload fallback and a playback preview.

    The file uploader is only drawn when the recorder returned nothing, and
    the preview player is only drawn when some audio is available.

    Args:
        upload_label: Label for the fallback file uploader.
        spinner_text: Message shown while the preview player is rendered.
        recording_color: Recorder icon colour while recording.
        neutral_color: Recorder icon colour while idle.
        icon_name: Font Awesome solid icon name for the recorder button.

    Returns:
        The recorded or uploaded audio bytes, or a falsy value when the user
        has provided neither.
    """
    audio_bytes = audio_recorder(
        text="Click to Record Audio",
        recording_color=recording_color,
        neutral_color=neutral_color,
        pause_threshold=30,
        icon_name=icon_name,
        icon_size="4x",
    )
    # Recorder output was always previewed as WAV; keep that as the default.
    mime_type = "audio/wav"

    if not audio_bytes:
        uploaded = st.file_uploader(upload_label, type=["wav", "mp3"])
        if uploaded:
            # getvalue() returns the whole buffer regardless of the current
            # read position, unlike read().
            audio_bytes = uploaded.getvalue()
            # Uploads may be MP3, so use the MIME type reported for the file
            # instead of labelling everything as WAV.
            mime_type = uploaded.type or mime_type

    if audio_bytes:
        with st.spinner(spinner_text):
            st.audio(audio_bytes, format=mime_type)

    return audio_bytes


# If initialized, display the recorders
if st.session_state['initialized']:
    st.subheader("Record or Upload Original Audio")
    original_audio_bytes = _capture_audio(
        "Or Upload Original Audio",
        "Processing original audio...",
        recording_color="#e8b62c",
        neutral_color="#6aa36f",
        icon_name="microphone",
    )

    st.subheader("Record or Upload User Audio")
    st.write("")
    user_audio_bytes = _capture_audio(
        "Or Upload User Audio",
        "Processing user audio...",
        recording_color="#e86f6f",
        neutral_color="#6a6faf",
        icon_name="user",
    )

    # The test button is only offered once both clips are available.
    if original_audio_bytes and user_audio_bytes and st.button("Perform Testing"):
        with st.spinner("Performing transcription and similarity testing..."):
            transcription_original, transcription_user, similarity_score = evaluate_audio_similarity(
                original_audio_bytes, user_audio_bytes
            )

        # Display results
        st.markdown("---")
        st.subheader("Transcriptions and Similarity Score")
        st.write(f"**Original Transcription:** {transcription_original}")
        st.write(f"**User Transcription:** {transcription_user}")
        st.write(f"**Levenshtein Similarity Score:** {similarity_score:.2f}")

        if similarity_score > 0.8:  # Adjust the threshold as needed
            st.success("The pronunciation is likely correct based on transcription similarity.")
        else:
            st.error("The pronunciation may be incorrect based on transcription similarity.")