# app.py

import gradio as gr
import warnings
import torch
#from transformers import WhisperTokenizer, WhisperForConditionalGeneration, WhisperProcessor
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import soundfile as sf
import ffmpeg
import os
from fpdf import FPDF
import time
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import re

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pandas as pd

warnings.filterwarnings("ignore")

nltk.download('punkt')      # tokenizer models for nltk.tokenize
nltk.download('stopwords')  # includes the Norwegian stopword list used in summarize_text()

#tokenizer = WhisperTokenizer.from_pretrained("NbAiLabBeta/nb-whisper-large")
#model = WhisperForConditionalGeneration.from_pretrained("NbAiLabBeta/nb-whisper-large")
#processor = WhisperProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large")

generation_config = {
    "temperature": 0.8,
    "top_p": 0.9,
    "top_k": 50,  # top_k counts candidate tokens, so it must be a positive integer
    "max_output_tokens": 2048
}
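# Note: this dict is not currently passed to model.generate() below, which uses
# beam search (num_beams=7); temperature/top_p/top_k only take effect with
# do_sample=True, and the transformers equivalent of "max_output_tokens" is
# max_new_tokens.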


processor = AutoProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch_dtype = torch.float32

model.to(device, dtype=torch_dtype)

def convert_audio_format(audio_path):
    # Resample to 16 kHz mono WAV, the input format Whisper expects.
    # ac=1 downmixes stereo so sf.read() returns a 1-D array.
    output_path = "converted_audio.wav"
    ffmpeg.input(audio_path).output(output_path, format='wav', ar='16000', ac=1).run(overwrite_output=True)
    return output_path

def transcribe_audio(audio_file, batch_size=4):
    start_time = time.time()
    audio_path = convert_audio_format(audio_file)
    audio_input, sample_rate = sf.read(audio_path)
    chunk_size = 16000 * 30  # 30-second chunks at 16 kHz, the window Whisper was trained on
    chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)]

    transcription = ""
    for i in range(0, len(chunks), batch_size):
        batch_chunks = chunks[i:i + batch_size]
        inputs = processor(batch_chunks, sampling_rate=16000, return_tensors="pt", padding=True)
        inputs = inputs.to(device)
        attention_mask = inputs.attention_mask.to(device) if 'attention_mask' in inputs else None
        with torch.no_grad():
            output = model.generate(
                inputs.input_features,
                max_length=448,  # Whisper's decoder supports at most 448 tokens per window
                num_beams=7,
                task="transcribe",
                attention_mask=attention_mask
            )
        transcription += " ".join(processor.batch_decode(output, skip_special_tokens=True)) + " "

    end_time = time.time()
    transcription_time = end_time - start_time
    word_count = len(transcription.split())

    result = f"Transcription: {transcription.strip()}\n\nTime taken: {transcription_time:.2f} seconds\nNumber of words: {word_count}"

    return transcription.strip(), result
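# A simpler alternative (a sketch, not used here): the transformers ASR pipeline
# handles chunking and batching internally:
#   from transformers import pipeline
#   asr = pipeline("automatic-speech-recognition",
#                  model="NbAiLabBeta/nb-whisper-large-semantic",
#                  chunk_length_s=30,
#                  device=0 if torch.cuda.is_available() else -1)
#   text = asr("audio.wav")["text"]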

def save_to_pdf(transcription):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    pdf.multi_cell(0, 10, transcription)
    pdf_output_path = "transcription.pdf"
    pdf.output(pdf_output_path)
    return pdf_output_path
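# Note on save_to_pdf: FPDF's built-in Arial font is latin-1 only. Norwegian
# æ/ΓΈ/Γ₯ are covered, but characters outside latin-1 would require registering
# a Unicode TTF via pdf.add_font() first.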

def summarize_text(transcription):
    # Extractive one-sentence summary: pick the sentence whose TF-IDF vector
    # lies closest to the centroid of all sentences.
    sentences = transcription.split(". ")
    norwegian_stopwords = stopwords.words('norwegian')  # sklearn only ships an English list
    vectorizer = TfidfVectorizer(stop_words=norwegian_stopwords)
    X = vectorizer.fit_transform(sentences)

    kmeans = KMeans(n_clusters=1, n_init=10)
    kmeans.fit(X)
    centroid = kmeans.cluster_centers_[0]
    distances = np.linalg.norm(X.toarray() - centroid, axis=1)
    summary = sentences[int(np.argmin(distances))]

    return summary if summary.endswith(".") else summary + "."

# HTML snippets for the page banner and illustration
banner_html = """
<div style="text-align: center;">
    <img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/Olas%20AudioSwitch%20Shop.png" alt="Banner" width="87%" height="auto">
</div>
"""
image_html = """
<div style="text-align: center; margin-top: 20px;">
    <img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/500x_picture.png" alt="picture" width="50%" height="auto">
</div>
"""

# Gradio interface
iface = gr.Blocks()

with iface:
    gr.HTML(banner_html)
    gr.Markdown("# 𝐍𝐯𝐒𝐝𝐒𝐚 π€πŸπŸŽπŸŽ πŸ‘‹πŸΌπŸ‘ΎπŸ¦Ύβš‘ @{NbAiLab/whisper-norwegian-medium}\nUpload audio file: β˜•")
    audio_input = gr.Audio(type="filepath")
    batch_size_input = gr.Slider(minimum=1, maximum=16, step=1, value=4, label="Batch Size")
    transcription_output = gr.Textbox(label="Transcription")
    pdf_output = gr.File(label="Download Transcription as PDF")
    summary_output = gr.Textbox(label="Summary")
    transcribe_button = gr.Button("Transcribe")

    def process_audio(audio_file, batch_size):
        transcription, result = transcribe_audio(audio_file, batch_size)
        pdf_path = save_to_pdf(transcription)
        summary = summarize_text(transcription)
        return result, pdf_path, summary

    transcribe_button.click(fn=process_audio, inputs=[audio_input, batch_size_input], outputs=[transcription_output, pdf_output, summary_output])

# Launch interface; share=True requests a public gradio.live link (ignored when running on Hugging Face Spaces)
iface.launch(share=True, debug=True)