File size: 3,025 Bytes
993f0db
 
 
 
 
 
e6a9b5c
993f0db
 
 
 
 
 
 
 
e6a9b5c
 
993f0db
e6a9b5c
 
 
 
 
 
 
 
993f0db
 
 
 
 
 
 
 
 
 
e6a9b5c
993f0db
 
e6a9b5c
993f0db
 
 
e6a9b5c
 
 
 
 
993f0db
 
e6a9b5c
993f0db
 
 
 
e6a9b5c
 
 
 
993f0db
 
 
 
 
 
 
 
 
e6a9b5c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import streamlit as st
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch
import numpy as np
import soundfile as sf
import io
import librosa

# Page header: the app title followed by a one-line usage hint.
st.title("Syllables per Second Calculator")
st.write("Upload an audio file to calculate the number of 'p', 't', and 'k' syllables per second.")

def get_syllables_per_second(audio_file):
    """Estimate how many 'p', 't' and 'k' tokens per second a recording contains.

    The audio is decoded with soundfile, downmixed to mono, resampled to the
    sampling rate the processor's feature extractor reports, and transcribed
    with the ``facebook/wav2vec2-xlsr-53-espeak-cv-ft`` CTC model. Decoded
    tokens whose ``char`` is exactly ``'p'``, ``'t'`` or ``'k'`` are counted,
    and the count is divided by the time between the start of the first
    match and the end of the last match.

    Args:
        audio_file: A binary file-like object whose ``read()`` returns the
            bytes of an audio file that soundfile can decode.

    Returns:
        The number of matched tokens per second, or ``0`` when nothing
        matched or the matched span has no positive duration.
    """
    model_name = "facebook/wav2vec2-xlsr-53-espeak-cv-ft"
    # NOTE(review): the processor and model are re-created on every call;
    # consider caching them at the application level if uploads are frequent.
    processor = Wav2Vec2Processor.from_pretrained(model_name)
    model = Wav2Vec2ForCTC.from_pretrained(model_name)

    audio_input, original_sample_rate = sf.read(io.BytesIO(audio_file.read()))
    target_sample_rate = processor.feature_extractor.sampling_rate

    # soundfile returns (frames, channels) for multi-channel files. Downmix
    # every such layout, not just stereo, so the processor always receives a
    # single 1-D waveform instead of a 2-D array it would read as a batch.
    if audio_input.ndim > 1:
        audio_input = np.mean(audio_input, axis=1)

    # Resample the mono signal once if the file is not at the model's rate.
    if original_sample_rate != target_sample_rate:
        audio_input = librosa.resample(
            audio_input,
            orig_sr=original_sample_rate,
            target_sr=target_sample_rate,
        )

    # Passing the sampling rate lets the feature extractor validate it.
    input_values = processor(
        audio_input,
        sampling_rate=target_sample_rate,
        return_tensors="pt",
    ).input_values

    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids, output_char_offsets=True)
    offsets = transcription['char_offsets']
    print("the offets are: ", offsets)

    # Offsets are expressed in model output frames; derive the duration of one
    # frame from the model configuration rather than hard-coding it.
    seconds_per_frame = model.config.inputs_to_logits_ratio / target_sample_rate

    # Keep only the decoded tokens that count as syllables for this task.
    syllable_offsets = [item for item in offsets[0] if item['char'] in ('p', 't', 'k')]

    if syllable_offsets:  # if any syllable is found
        first_syllable_offset = syllable_offsets[0]['start_offset'] * seconds_per_frame
        last_syllable_offset = syllable_offsets[-1]['end_offset'] * seconds_per_frame

        print("the first syllable offset is: ", first_syllable_offset)
        print("the last syllable offset is: ", last_syllable_offset)
        # Duration from the first to the last syllable
        syllable_duration = last_syllable_offset - first_syllable_offset
        print("the syllable duration is: ", syllable_duration)
    else:
        syllable_duration = 0

    syllable_count = len(syllable_offsets)
    audio_duration = len(audio_input) / target_sample_rate
    print("the audio duration is: ", audio_duration)
    print("the syllable count is: ", syllable_count)

    # The rate is measured over the span covered by the syllables, not over
    # the whole recording, so leading and trailing silence does not dilute it.
    syllables_per_second = syllable_count / syllable_duration if syllable_duration > 0 else 0

    return syllables_per_second

# File picker restricted to the "wav" type.
uploaded_file = st.file_uploader("Choose an audio file", type=["wav"])

# Only run the analysis once the uploader has produced a file object.
if uploaded_file is not None:
    # Show a spinner while the file is being analysed, then display the rate.
    with st.spinner("Processing the audio file..."):
        result = get_syllables_per_second(uploaded_file)
        st.write("Syllables per second: ", result)