roman committed on
Commit
d242d3a
1 Parent(s): 7bd33ad

try new approach

Files changed (2)
  1. app.py +19 -47
  2. app3.py +76 -0
app.py CHANGED
@@ -1,40 +1,22 @@
 import streamlit as st
-# from transformers import AutoModelForSpeechSeq2Seq, Wav2Vec2Processor
-from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
-import torch
+from transformers import pipeline
 import tempfile
 from pydub import AudioSegment
 import numpy as np
 
-# Define available models
-# available_models = [
-#     "facebook/s2t-small-mustc-en-fr-st",
-#     "facebook/s2t-medium-mustc-en-fr-st",
-#     "facebook/s2t-large-mustc-en-fr-st"
-# ]
-
-available_models = ["Yehor/whisper-small-ukrainian"]
-
-st.title("Voice Recognition App using SpeechSeq2Seq")
-
-st.write("Upload an audio file and choose a model to transcribe it to text.")
-
-# Model selection dropdown
-model_choice = st.selectbox("Choose a SpeechSeq2Seq model", available_models)
-
-
-# Load the selected model and processor
-@st.cache_resource
-def load_model_and_processor(model_name):
-    # model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name)
-    # processor = Wav2Vec2Processor.from_pretrained(model_name)
-    model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name)
-    processor = AutoProcessor.from_pretrained(model_name)
-    return model, processor
-
-st.write(f"Loading {model_choice} model...")
-model, processor = load_model_and_processor(model_choice)
-st.write(f"{model_choice} model loaded successfully.")
+# Load the ASR pipeline
+@st.cache_resource
+def load_asr_pipeline():
+    asr_pipeline = pipeline("automatic-speech-recognition", model="Yehor/whisper-small-ukrainian")
+    return asr_pipeline
+
+st.title("Voice Recognition App using Whisper")
+
+st.write("Upload an audio file and the Whisper model will transcribe it to text.")
+
+# Load the ASR pipeline
+asr_pipeline = load_asr_pipeline()
+st.write("Model loaded successfully.")
 
 # File uploader for audio file
 uploaded_file = st.file_uploader("Choose an audio file", type=["wav", "mp3", "m4a"])
@@ -45,7 +27,7 @@ if uploaded_file is not None:
         temp_file.write(uploaded_file.read())
         temp_file_path = temp_file.name
 
-    # Convert audio file to a format supported by the processor (if necessary)
+    # Convert audio file to WAV format if necessary
     audio = AudioSegment.from_file(temp_file_path)
     temp_wav_path = tempfile.mktemp(suffix=".wav")
    audio.export(temp_wav_path, format="wav")
@@ -54,23 +36,13 @@ if uploaded_file is not None:
 
     st.write("Transcribing audio...")
 
-    # # Load audio
-    # audio_input = AudioSegment.from_file(temp_wav_path).set_frame_rate(16000).set_channels(1)
-    # audio_input = np.array(audio_input.get_array_of_samples())
-    #
-    # # Normalize audio
-    # audio_input = (audio_input - np.mean(audio_input)) / np.std(audio_input)
-    #
-    # # Process the audio
-    # input_features = processor(audio_input, return_tensors="pt", sampling_rate=16000).input_values
-    #
-    # # Generate transcription
-    # with torch.no_grad():
-    #     predicted_ids = model.generate(input_features)
-    #
-    # transcription = processor.batch_decode(predicted_ids)[0]
+    # Read the audio file
+    audio_input = AudioSegment.from_file(temp_wav_path).set_frame_rate(16000).set_channels(1)
+    audio_input = np.array(audio_input.get_array_of_samples(), dtype=np.float32)
 
-    transcription = model.transcribe(temp_wav_path)
+    # Perform transcription
+    result = asr_pipeline(audio_input)
 
+    # Display transcription
     st.write("Transcription:")
-    st.write(transcription)
+    st.write(result['text'])
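
A note on the new approach in app.py: the array handed to asr_pipeline() is built from pydub's raw 16-bit samples, so it is neither scaled to [-1, 1] nor tagged with a sampling rate. The transformers ASR pipeline can also take a file path directly, or a dict carrying the samples and their rate, either of which avoids both pitfalls. A minimal sketch of the two call styles, assuming a local file speech.wav (placeholder name) and 16-bit source audio (hence the 32768.0 divisor):

import numpy as np
from pydub import AudioSegment
from transformers import pipeline

asr = pipeline("automatic-speech-recognition", model="Yehor/whisper-small-ukrainian")

# Style 1: hand the pipeline a path and let it decode and resample (needs ffmpeg).
result = asr("speech.wav")  # "speech.wav" is a placeholder path

# Style 2: pass samples explicitly; scale int16 to [-1, 1] and declare the rate.
audio = AudioSegment.from_file("speech.wav").set_frame_rate(16000).set_channels(1)
samples = np.array(audio.get_array_of_samples(), dtype=np.float32) / 32768.0
result = asr({"raw": samples, "sampling_rate": 16000})

print(result["text"])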
app3.py ADDED
@@ -0,0 +1,76 @@
+import streamlit as st
+# from transformers import AutoModelForSpeechSeq2Seq, Wav2Vec2Processor
+from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
+import torch
+import tempfile
+from pydub import AudioSegment
+import numpy as np
+
+# Define available models
+# available_models = [
+#     "facebook/s2t-small-mustc-en-fr-st",
+#     "facebook/s2t-medium-mustc-en-fr-st",
+#     "facebook/s2t-large-mustc-en-fr-st"
+# ]
+
+available_models = ["Yehor/whisper-small-ukrainian"]
+
+st.title("Voice Recognition App using SpeechSeq2Seq")
+
+st.write("Upload an audio file and choose a model to transcribe it to text.")
+
+# Model selection dropdown
+model_choice = st.selectbox("Choose a SpeechSeq2Seq model", available_models)
+
+
+# Load the selected model and processor
+@st.cache_resource
+def load_model_and_processor(model_name):
+    # model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name)
+    # processor = Wav2Vec2Processor.from_pretrained(model_name)
+    model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name)
+    processor = AutoProcessor.from_pretrained(model_name)
+    return model, processor
+
+st.write(f"Loading {model_choice} model...")
+model, processor = load_model_and_processor(model_choice)
+st.write(f"{model_choice} model loaded successfully.")
+
+# File uploader for audio file
+uploaded_file = st.file_uploader("Choose an audio file", type=["wav", "mp3", "m4a"])
+
+if uploaded_file is not None:
+    # Save the uploaded file temporarily
+    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
+        temp_file.write(uploaded_file.read())
+        temp_file_path = temp_file.name
+
+    # Convert audio file to a format supported by the processor (if necessary)
+    audio = AudioSegment.from_file(temp_file_path)
+    temp_wav_path = tempfile.mktemp(suffix=".wav")
+    audio.export(temp_wav_path, format="wav")
+
+    st.audio(uploaded_file, format="audio/wav")
+
+    st.write("Transcribing audio...")
+
+    # # Load audio
+    # audio_input = AudioSegment.from_file(temp_wav_path).set_frame_rate(16000).set_channels(1)
+    # audio_input = np.array(audio_input.get_array_of_samples())
+    #
+    # # Normalize audio
+    # audio_input = (audio_input - np.mean(audio_input)) / np.std(audio_input)
+    #
+    # # Process the audio
+    # input_features = processor(audio_input, return_tensors="pt", sampling_rate=16000).input_values
+    #
+    # # Generate transcription
+    # with torch.no_grad():
+    #     predicted_ids = model.generate(input_features)
+    #
+    # transcription = processor.batch_decode(predicted_ids)[0]
+
+    transcription = model.transcribe(temp_wav_path)
+
+    st.write("Transcription:")
+    st.write(transcription)
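
A note on app3.py: the final call, transcription = model.transcribe(temp_wav_path), is the API of the standalone openai-whisper package; a transformers AutoModelForSpeechSeq2Seq model has no transcribe() method, so this script fails at that line, which is presumably what prompted the pipeline rewrite in app.py. For reference, the processor-and-generate route that the commented-out block was circling looks roughly like the sketch below; note that Whisper processors return input_features (log-mel spectrograms) rather than input_values, and expect float audio in [-1, 1] at 16 kHz. The file name speech.wav is again a placeholder.

import numpy as np
import torch
from pydub import AudioSegment
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

model_name = "Yehor/whisper-small-ukrainian"
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name)

# Decode to 16 kHz mono and scale 16-bit samples into [-1, 1].
audio = AudioSegment.from_file("speech.wav").set_frame_rate(16000).set_channels(1)
samples = np.array(audio.get_array_of_samples(), dtype=np.float32) / 32768.0

# Whisper feature extractors produce input_features, not input_values.
inputs = processor(samples, sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    predicted_ids = model.generate(inputs.input_features)

transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
print(transcription)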