# app.py

import gradio as gr
import warnings
import torch
from transformers import pipeline, WhisperTokenizer, WhisperForConditionalGeneration, WhisperProcessor

warnings.filterwarnings("ignore")

# Load tokenizer and model
tokenizer = WhisperTokenizer.from_pretrained("NbAiLabBeta/nb-whisper-medium")
model = WhisperForConditionalGeneration.from_pretrained("NbAiLabBeta/nb-whisper-medium")
processor = WhisperProcessor.from_pretrained("NbAiLabBeta/nb-whisper-medium")

# Set up the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch_dtype = torch.float32

# Initialize pipeline
asr = pipeline("automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, device=device, torch_dtype=torch_dtype)

def transcribe_audio(audio_file):
    # Perform transcription
    with torch.no_grad():
        output = asr(audio_file, chunk_length_s=28, generate_kwargs={"num_beams": 5, "task": "transcribe", "language": "no"})
    return output["text"]

# Create Gradio interface (note: `iface` is reassigned to the Blocks UI further down, and only that UI is launched)
iface = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Audio Transcription App",
    description="Upload an audio file to get the transcription",
    theme="default",
    live=False
)

    
# --- Second (current) version of the app: transcription + summarization; the Blocks UI below is the one launched ---
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, AutoTokenizer, AutoModelForSeq2SeqLM
from pydub import AudioSegment
import soundfile as sf
import numpy as np
import os
import nltk
from fpdf import FPDF
import time

nltk.download('punkt')

HF_AUTH_TOKEN = os.getenv('HF_AUTH_TOKEN')

# transcription
processor = AutoProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
transcription_model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")

# summarization
summarization_tokenizer = AutoTokenizer.from_pretrained("NbAiLab/norbert-summarization")
summarization_model = AutoModelForSeq2SeqLM.from_pretrained("NbAiLab/norbert-summarization")

# setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch_dtype = torch.float32

# Move the models to the selected device
transcription_model.to(device)
summarization_model.to(device)  # NOTE: the summarizer may need to be configured for a graph-based summary method (Lexname?)

def convert_to_wav(audio_file):
    audio = AudioSegment.from_file(audio_file, format="m4a")
    wav_file = "temp.wav"
    audio.export(wav_file, format="wav")
    return wav_file
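
# Hedged sketch (illustrative, not part of the original pipeline): transcribe_audio
# below chunks the signal assuming 16 kHz mono input. A helper like this, built on the
# already-imported pydub, could normalize arbitrary uploads first; the function name
# and temporary filename are assumptions.
def ensure_16k_mono(audio_file):
    audio = AudioSegment.from_file(audio_file)
    audio = audio.set_frame_rate(16000).set_channels(1)
    normalized_file = "temp_16k.wav"
    audio.export(normalized_file, format="wav")
    return normalized_file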

def transcribe_audio(audio_file, batch_size=4):
    start_time = time.time()
    # Convert .m4a to .wav
    if audio_file.endswith(".m4a"):
        audio_file = convert_to_wav(audio_file)
    
    audio_input, sample_rate = sf.read(audio_file)
    # NOTE: the chunking below assumes 16 kHz audio; files at other sample rates
    # should be resampled first (e.g. with the helper sketched above).
    chunk_size = 16000 * 30  # 30-second chunks at 16 kHz
    chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)]

    transcription = ""
    for i in range(0, len(chunks), batch_size):
        batch_chunks = chunks[i:i + batch_size]
        inputs = processor(batch_chunks, sampling_rate=16000, return_tensors="pt", padding=True)
        inputs = inputs.to(device)
        attention_mask = inputs.attention_mask.to(device) if 'attention_mask' in inputs else None
        with torch.no_grad():
            output = transcription_model.generate(
                inputs.input_features,
                max_length=448,  # Whisper's decoder supports at most 448 output tokens per 30 s chunk
                num_beams=7,
                task="transcribe",
                attention_mask=attention_mask,
                # forced_decoder_ids is intentionally left unset; the language/task arguments above handle decoder prompting
                language="no"
            )
        transcription += " ".join(processor.batch_decode(output, skip_special_tokens=True)) + " "

    end_time = time.time()
    transcription_time = end_time - start_time
    word_count = len(transcription.split())

    result = f"Transcription: {transcription.strip()}\n\nTime taken: {transcription_time:.2f} seconds\nNumber of words: {word_count}"
    
    return transcription.strip(), result

def summarize_text(text):
    inputs = summarization_tokenizer([text], max_length=1024, return_tensors="pt", truncation=True)
    inputs = inputs.to(device)
    summary_ids = summarization_model.generate(inputs.input_ids, num_beams=4, max_length=150, early_stopping=True)
    summary = summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary
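
# Hedged sketch (illustrative, not part of the original app): summarize_text above
# truncates its input at 1024 tokens, so very long transcripts lose content. One option
# is to split the transcript into sentence chunks with the punkt model downloaded above
# and summarize chunk by chunk; the chunk size and joining strategy are assumptions.
def summarize_long_text(text, max_chars=3000):
    sentences = nltk.sent_tokenize(text, language="norwegian")
    chunks, current = [], ""
    for sentence in sentences:
        if current and len(current) + len(sentence) > max_chars:
            chunks.append(current.strip())
            current = ""
        current += " " + sentence
    if current.strip():
        chunks.append(current.strip())
    # Summarize each chunk independently and join the partial summaries.
    return " ".join(summarize_text(chunk) for chunk in chunks)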

# HTML syntax for imagery
image_html = """
<div style="text-align: center;">
    <img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/Olas%20AudioSwitch%20Shop.png" alt="Banner" width="87%" height="auto">
</div>
<div style="text-align: center; margin-top: 20px;">
    <img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/picture.jpg" alt="Additional Image" width="50%" height="auto">
</div>
"""

# Gradio UI
iface = gr.Blocks()

with iface:
    gr.HTML(image_html)
    gr.Markdown("# Switch Work Audio Transcription App\nUpload an audio file to get the transcription")
    audio_input = gr.Audio(type="filepath")
    batch_size_input = gr.Slider(minimum=1, maximum=16, step=1, value=4, label="Batch Size")
    transcription_output = gr.Textbox(label="Transcription")
    summary_output = gr.Textbox(label="Summary")
    transcribe_button = gr.Button("Transcribe and Summarize")

    def transcribe_and_summarize(audio_file, batch_size):
        transcription, result = transcribe_audio(audio_file, batch_size)
        summary = summarize_text(transcription)
        return result, summary

    transcribe_button.click(fn=transcribe_and_summarize, inputs=[audio_input, batch_size_input], outputs=[transcription_output, summary_output])
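
    # Hedged sketch (assumption, not part of the original wiring): expose the save_to_pdf
    # helper defined further down as a download button. The lambda defers the name lookup,
    # so the later definition is resolved at click time; component names are illustrative.
    # Note that transcription_output carries the formatted result string, which is what
    # would end up in the PDF.
    pdf_output = gr.File(label="Download PDF")
    pdf_button = gr.Button("Export to PDF")
    pdf_button.click(
        fn=lambda transcription, summary: save_to_pdf(transcription, summary),
        inputs=[transcription_output, summary_output],
        outputs=pdf_output,
    )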
    

def save_to_pdf(transcription, summary):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    
    # include transcription
    pdf.multi_cell(0, 10, "Transcription:\n" + transcription)
    
    # paragraph space
    pdf.ln(10)
    
    # include summary
    pdf.multi_cell(0, 10, "Summary:\n" + summary)
    
    pdf_output_path = "transcription_summary.pdf"
    pdf.output(pdf_output_path)
    return pdf_output_path


    
# Launch the Blocks UI defined above
iface.launch(share=True, debug=True)