Spaces:

romas-458
/

acr

Sleeping

App Files Files Community

roman committed on May 29

Commit

d242d3a

•

1 Parent(s): 7bd33ad

try new approach

Browse files

Files changed (2) hide show

app.py +19 -47
app3.py +76 -0

app.py CHANGED Viewed

@@ -1,40 +1,22 @@
 import streamlit as st
-# from transformers import AutoModelForSpeechSeq2Seq, Wav2Vec2Processor
-from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
-import torch
 import tempfile
 from pydub import AudioSegment
 import numpy as np
-# Define available models
-# available_models = [
-#     "facebook/s2t-small-mustc-en-fr-st",
-#     "facebook/s2t-medium-mustc-en-fr-st",
-#     "facebook/s2t-large-mustc-en-fr-st"
-# ]
-available_models = ["Yehor/whisper-small-ukrainian"]
-st.title("Voice Recognition App using SpeechSeq2Seq")
-st.write("Upload an audio file and choose a model to transcribe it to text.")
-# Model selection dropdown
-model_choice = st.selectbox("Choose a SpeechSeq2Seq model", available_models)
-# Load the selected model and processor
-@st.cache_resource
-def load_model_and_processor(model_name):
-    # model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name)
-    # processor = Wav2Vec2Processor.from_pretrained(model_name)
-    model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name)
-    processor = AutoProcessor.from_pretrained(model_name)
-    return model, processor
-st.write(f"Loading {model_choice} model...")
-model, processor = load_model_and_processor(model_choice)
-st.write(f"{model_choice} model loaded successfully.")
 # File uploader for audio file
 uploaded_file = st.file_uploader("Choose an audio file", type=["wav", "mp3", "m4a"])
@@ -45,7 +27,7 @@ if uploaded_file is not None:
         temp_file.write(uploaded_file.read())
         temp_file_path = temp_file.name
-    # Convert audio file to a format supported by the processor (if necessary)
     audio = AudioSegment.from_file(temp_file_path)
     temp_wav_path = tempfile.mktemp(suffix=".wav")
     audio.export(temp_wav_path, format="wav")
@@ -54,23 +36,13 @@ if uploaded_file is not None:
     st.write("Transcribing audio...")
-    # # Load audio
-    # audio_input = AudioSegment.from_file(temp_wav_path).set_frame_rate(16000).set_channels(1)
-    # audio_input = np.array(audio_input.get_array_of_samples())
-    #
-    # # Normalize audio
-    # audio_input = (audio_input - np.mean(audio_input)) / np.std(audio_input)
-    #
-    # # Process the audio
-    # input_features = processor(audio_input, return_tensors="pt", sampling_rate=16000).input_values
-    #
-    # # Generate transcription
-    # with torch.no_grad():
-    #     predicted_ids = model.generate(input_features)
-    #
-    # transcription = processor.batch_decode(predicted_ids)[0]
-    transcription = model.transcribe(temp_wav_path)
     st.write("Transcription:")
-    st.write(transcription)

 import streamlit as st
+from transformers import pipeline
 import tempfile
 from pydub import AudioSegment
 import numpy as np
+# Load the ASR pipeline
+@st.cache_resource
+def load_asr_pipeline():
+    asr_pipeline = pipeline("automatic-speech-recognition", model="Yehor/whisper-small-ukrainian")
+    return asr_pipeline
+# Page header and a short usage hint for the user
+st.title("Voice Recognition App using Whisper")
+st.write("Upload an audio file and the Whisper model will transcribe it to text.")
+# Obtain the ASR pipeline from the st.cache_resource-decorated loader
+asr_pipeline = load_asr_pipeline()
+st.write("Model loaded successfully.")
 # File uploader for audio file
 uploaded_file = st.file_uploader("Choose an audio file", type=["wav", "mp3", "m4a"])
         temp_file.write(uploaded_file.read())
         temp_file_path = temp_file.name
+    # Convert audio file to WAV format if necessary
     audio = AudioSegment.from_file(temp_file_path)
     temp_wav_path = tempfile.mktemp(suffix=".wav")
     audio.export(temp_wav_path, format="wav")
     st.write("Transcribing audio...")
+    # Read the audio file
+    audio_input = AudioSegment.from_file(temp_wav_path).set_frame_rate(16000).set_channels(1)
+    audio_input = np.array(audio_input.get_array_of_samples(), dtype=np.float32)
+    # Perform transcription
+    result = asr_pipeline(audio_input)
+    # Display transcription
     st.write("Transcription:")
+    st.write(result['text'])

app3.py ADDED Viewed

	@@ -0,0 +1,76 @@

+import streamlit as st
+# from transformers import AutoModelForSpeechSeq2Seq, Wav2Vec2Processor
+from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
+import torch
+import tempfile
+from pydub import AudioSegment
+import numpy as np
+# Checkpoints offered in the model dropdown. The facebook/s2t-* entries below
+# are kept for reference but commented out, so only one choice is available.
+# available_models = [
+#     "facebook/s2t-small-mustc-en-fr-st",
+#     "facebook/s2t-medium-mustc-en-fr-st",
+#     "facebook/s2t-large-mustc-en-fr-st"
+# ]
+available_models = ["Yehor/whisper-small-ukrainian"]
+# Page header and a short usage hint for the user
+st.title("Voice Recognition App using SpeechSeq2Seq")
+st.write("Upload an audio file and choose a model to transcribe it to text.")
+# Model selection dropdown; the chosen name is passed to the loader below
+model_choice = st.selectbox("Choose a SpeechSeq2Seq model", available_models)
+# Load the selected model and processor
+@st.cache_resource
+def load_model_and_processor(model_name):
+    # model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name)
+    # processor = Wav2Vec2Processor.from_pretrained(model_name)
+    model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name)
+    processor = AutoProcessor.from_pretrained(model_name)
+    return model, processor
+# Report progress around loading the checkpoint picked in the dropdown; the
+# loader is decorated with st.cache_resource
+st.write(f"Loading {model_choice} model...")
+model, processor = load_model_and_processor(model_choice)
+st.write(f"{model_choice} model loaded successfully.")
+# File uploader for audio file
+uploaded_file = st.file_uploader("Choose an audio file", type=["wav", "mp3", "m4a"])
+if uploaded_file is not None:
+    # Save the uploaded file temporarily
+    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
+        temp_file.write(uploaded_file.read())
+        temp_file_path = temp_file.name
+    # Convert audio file to a format supported by the processor (if necessary)
+    audio = AudioSegment.from_file(temp_file_path)
+    temp_wav_path = tempfile.mktemp(suffix=".wav")
+    audio.export(temp_wav_path, format="wav")
+    st.audio(uploaded_file, format="audio/wav")
+    st.write("Transcribing audio...")
+    # # Load audio
+    # audio_input = AudioSegment.from_file(temp_wav_path).set_frame_rate(16000).set_channels(1)
+    # audio_input = np.array(audio_input.get_array_of_samples())
+    #
+    # # Normalize audio
+    # audio_input = (audio_input - np.mean(audio_input)) / np.std(audio_input)
+    #
+    # # Process the audio
+    # input_features = processor(audio_input, return_tensors="pt", sampling_rate=16000).input_values
+    #
+    # # Generate transcription
+    # with torch.no_grad():
+    #     predicted_ids = model.generate(input_features)
+    #
+    # transcription = processor.batch_decode(predicted_ids)[0]
+    transcription = model.transcribe(temp_wav_path)
+    st.write("Transcription:")
+    st.write(transcription)