# app.py
import gradio as gr
import warnings
import torch
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import soundfile as sf
import ffmpeg
import time
from fpdf import FPDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import nltk
from nltk.corpus import stopwords

warnings.filterwarnings("ignore")
nltk.download('stopwords')  # Norwegian stop words, used in summarize_text below
# NOTE: this dict is not currently passed to model.generate() below.
# top_k must be an integer count of candidate tokens, not a float.
generation_config = {
    "temperature": 0.8,
    "top_p": 0.9,
    "top_k": 50,
    "max_output_tokens": 2048
}
# Run on GPU when available; fall back to CPU otherwise
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch_dtype = torch.float32  # float32 runs everywhere; float16 saves memory on GPU

processor = AutoProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic", torch_dtype=torch_dtype)
model.to(device)
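
# Quick sanity-check sketch (not executed here; the `np` import and the
# one-second silent clip are illustrative assumptions, not part of the app):
#   import numpy as np
#   feats = processor(np.zeros(16000), sampling_rate=16000, return_tensors="pt").input_features
#   ids = model.generate(feats.to(device), task="transcribe", max_length=448)
#   print(processor.batch_decode(ids, skip_special_tokens=True))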
def convert_audio_format(audio_path):
    # Resample to 16 kHz mono WAV, the input format Whisper expects
    # (ac=1 forces mono; stereo input would otherwise give a 2-D array
    # that the feature extractor cannot handle)
    output_path = "converted_audio.wav"
    ffmpeg.input(audio_path).output(output_path, format='wav', ar='16000', ac=1).run(overwrite_output=True)
    return output_path
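
# The helper above is roughly equivalent to the CLI call (input path is a
# placeholder): ffmpeg -y -i <input> -f wav -ar 16000 -ac 1 converted_audio.wav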
def transcribe_audio(audio_file, batch_size=4):
    start_time = time.time()
    audio_path = convert_audio_format(audio_file)
    audio_input, sample_rate = sf.read(audio_path)
    # Split the waveform into 30-second chunks (16000 samples/s * 30 s),
    # matching Whisper's fixed input window
    chunk_size = 16000 * 30
    chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)]
    transcription = ""
    for i in range(0, len(chunks), batch_size):
        batch_chunks = chunks[i:i + batch_size]
        # The feature extractor pads every chunk to the full 30 s window by
        # default; padding=True (pad-to-longest) would produce fewer than the
        # 3000 mel frames Whisper expects for the final, shorter chunk
        inputs = processor(batch_chunks, sampling_rate=16000, return_tensors="pt")
        inputs = inputs.to(device)
        attention_mask = inputs.attention_mask if 'attention_mask' in inputs else None
        with torch.no_grad():
            output = model.generate(
                inputs.input_features,
                max_length=448,  # Whisper's decoder supports at most 448 target positions
                num_beams=7,
                task="transcribe",
                attention_mask=attention_mask
            )
        transcription += " ".join(processor.batch_decode(output, skip_special_tokens=True)) + " "
    end_time = time.time()
    transcription_time = end_time - start_time
    word_count = len(transcription.split())
    result = f"Transcription: {transcription.strip()}\n\nTime taken: {transcription_time:.2f} seconds\nNumber of words: {word_count}"
    return transcription.strip(), result
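
# Standalone usage sketch (assumes a local file "sample.wav"; the filename
# is a placeholder, not a file shipped with this Space):
#   text, stats = transcribe_audio("sample.wav", batch_size=2)
#   print(stats)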
def save_to_pdf(transcription):
    pdf = FPDF()
    pdf.add_page()
    # FPDF's built-in fonts are latin-1 encoded, which covers Norwegian æ/ø/å
    pdf.set_font("Arial", size=12)
    pdf.multi_cell(0, 10, transcription)
    pdf_output_path = "transcription.pdf"
    pdf.output(pdf_output_path)
    return pdf_output_path
def summarize_text(transcription, num_sentences=3):
    sentences = [s for s in transcription.split(". ") if s.strip()]
    if len(sentences) <= num_sentences:
        return transcription
    # scikit-learn only ships an English stop-word list, so stop_words='norwegian'
    # would raise a ValueError; use NLTK's Norwegian list instead
    vectorizer = TfidfVectorizer(stop_words=stopwords.words('norwegian'))
    X = vectorizer.fit_transform(sentences)
    kmeans = KMeans(n_clusters=1, n_init=10)
    kmeans.fit(X)
    # kmeans.transform gives each sentence's distance to the centroid;
    # the closest sentences are the most representative ones
    distances = kmeans.transform(X).ravel()
    top_indices = sorted(distances.argsort()[:num_sentences])
    summary = [sentences[i] for i in top_indices]
    return ". ".join(summary) + "."
# HTML
banner_html = """
<div style="text-align: center;">
<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/Olas%20AudioSwitch%20Shop.png" alt="Banner" width="87%" height="auto">
</div>
"""
image_html = """
<div style="text-align: center; margin-top: 20px;">
<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/500x_picture.png" alt="picture" width="50%" height="auto">
</div>
"""
# Gradio interface
iface = gr.Blocks()

with iface:
    gr.HTML(banner_html)
    gr.Markdown("# Audio Transcription 🦾⚡ @{NbAiLabBeta/nb-whisper-large-semantic}\nUpload audio file: ⬇️")
    audio_input = gr.Audio(type="filepath")
    batch_size_input = gr.Slider(minimum=1, maximum=16, step=1, label="Batch Size")
    transcription_output = gr.Textbox(label="Transcription")
    pdf_output = gr.File(label="Download Transcription as PDF")
    summary_output = gr.Textbox(label="Summary")
    transcribe_button = gr.Button("Transcribe")

    def process_audio(audio_file, batch_size):
        transcription, result = transcribe_audio(audio_file, batch_size)
        pdf_path = save_to_pdf(transcription)
        summary = summarize_text(transcription)
        return result, pdf_path, summary

    transcribe_button.click(fn=process_audio, inputs=[audio_input, batch_size_input], outputs=[transcription_output, pdf_output, summary_output])
# Launch interface
iface.launch(share=True, debug=True)