camparchimedes committed
Commit 2d9e081 • 1 Parent(s): bde795e

Update app.py

Files changed (1)
  1. app.py +63 -18
app.py CHANGED
@@ -3,28 +3,45 @@
 import gradio as gr
 import warnings
 import torch
-from transformers import WhisperTokenizer, WhisperForConditionalGeneration, WhisperProcessor
+#from transformers import WhisperTokenizer, WhisperForConditionalGeneration, WhisperProcessor
+from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
 import soundfile as sf
 import ffmpeg
 import os
-from PIL import Image
-from huggingface_hub import InferenceClient
-from gradio_client import Client, file
-import spaces
+from fpdf import FPDF
 import time
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.cluster import KMeans
+import re
+
+import nltk
+from nltk.tokenize import word_tokenize
+from nltk.corpus import stopwords
+import pandas as pd
 
 warnings.filterwarnings("ignore")
 
-# Load tokenizer, model, and processor
-tokenizer = WhisperTokenizer.from_pretrained("NbAiLabBeta/nb-whisper-medium")
-model = WhisperForConditionalGeneration.from_pretrained("NbAiLabBeta/nb-whisper-medium")
-processor = WhisperProcessor.from_pretrained("NbAiLabBeta/nb-whisper-medium")
+nltk.download('punkt')
+nltk.download('stopwords')
+
+#tokenizer = WhisperTokenizer.from_pretrained("NbAiLabBeta/nb-whisper-large")
+#model = WhisperForConditionalGeneration.from_pretrained("NbAiLabBeta/nb-whisper-large")
+#processor = WhisperProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large")
+
+generation_config = {
+    "temperature": 0.8,
+    "top_p": 0.9,
+    "top_k": 0.5,
+    "max_output_tokens": 2048
+}
+
+
+processor = AutoProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
+model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
 
-# Set up device
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 torch_dtype = torch.float32
 
-# Move model to device
 model.to(device)
 
 def convert_audio_format(audio_path):
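A review note on this hunk: the new `generation_config` dict is defined but never passed to `model.generate`, and two keys don't match the transformers API (`top_k` must be an integer, and the length cap is spelled `max_new_tokens`, not `max_output_tokens`; `temperature`/`top_p` only take effect with `do_sample=True`). A hedged sketch of how the intended settings could be wired into the generate call in `transcribe_audio` (the `inputs` and `attention_mask` variables come from that function), keeping the diff's values where possible:

# Sketch only: sampling knobs require do_sample=True, and this would
# replace the hunk's beam search (num_beams=7) with sampling.
output = model.generate(
    inputs.input_features,
    attention_mask=attention_mask,
    task="transcribe",
    do_sample=True,
    temperature=0.8,
    top_p=0.9,
    top_k=50,            # transformers expects an int here, not 0.5
    max_new_tokens=448,  # transformers' name for the cap; Whisper's decoder
                         # has only 448 positions, so 2048 is unreachable
)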
@@ -32,12 +49,11 @@ def convert_audio_format(audio_path):
     ffmpeg.input(audio_path).output(output_path, format='wav', ar='16000').run(overwrite_output=True)
     return output_path
 
-# @spaces.GPU(duration=120, queue=False)
 def transcribe_audio(audio_file, batch_size=4):
     start_time = time.time()
     audio_path = convert_audio_format(audio_file)
     audio_input, sample_rate = sf.read(audio_path)
-    chunk_size = 16000 * 28  # 28 seconds chunks
+    chunk_size = 16000 * 30
     chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)]
 
     transcription = ""
@@ -51,6 +67,7 @@ def transcribe_audio(audio_file, batch_size=4):
             inputs.input_features,
             max_length=2048,
             num_beams=7,
+            task="transcribe",
             attention_mask=attention_mask
         )
         transcription += " ".join(processor.batch_decode(output, skip_special_tokens=True)) + " "
@@ -61,8 +78,28 @@ def transcribe_audio(audio_file, batch_size=4):
 
     result = f"Transcription: {transcription.strip()}\n\nTime taken: {transcription_time:.2f} seconds\nNumber of words: {word_count}"
 
-    return result
-
+    return transcription.strip(), result
+
+def save_to_pdf(transcription):
+    pdf = FPDF()
+    pdf.add_page()
+    pdf.set_font("Arial", size=12)
+    pdf.multi_cell(0, 10, transcription)
+    pdf_output_path = "transcription.pdf"
+    pdf.output(pdf_output_path)
+    return pdf_output_path
+
+def summarize_text(transcription):
+    sentences = transcription.split(". ")
+    vectorizer = TfidfVectorizer(stop_words='norwegian')
+    X = vectorizer.fit_transform(sentences)
+
+    kmeans = KMeans(n_clusters=1)
+    kmeans.fit(X)
+    avg = X.mean(axis=0)
+    summary = [sentences[i] for i in kmeans.predict(avg)]
+
+    return ". ".join(summary) + "."
 
 # HTML
 banner_html = """
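Two things in the new `summarize_text` are worth flagging. scikit-learn only bundles English stop words, so `TfidfVectorizer(stop_words='norwegian')` raises a `ValueError` at fit time; the NLTK stop-word list already downloaded above could be passed instead. And with `n_clusters=1`, `kmeans.predict(avg)` always returns cluster label 0, so the comprehension always selects the first sentence rather than the most representative one. A hedged sketch that keeps the TF-IDF idea but picks the sentence closest to the centroid directly:

import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def summarize_text(transcription):
    # same naive sentence split as the diff
    sentences = transcription.split(". ")
    # sklearn has no built-in Norwegian list; reuse NLTK's (downloaded above)
    vectorizer = TfidfVectorizer(stop_words=stopwords.words('norwegian'))
    X = vectorizer.fit_transform(sentences)
    # rank sentences by similarity to the centroid of the whole transcript
    centroid = np.asarray(X.mean(axis=0))
    scores = cosine_similarity(X, centroid).ravel()
    # the most central sentence serves as a one-line extractive summary
    return sentences[int(scores.argmax())] + "."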
@@ -72,7 +109,7 @@ banner_html = """
 """
 image_html = """
 <div style="text-align: center; margin-top: 20px;">
-    <img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/blob/main/500x_picture.png" alt="picture" width="50%" height="auto">
+    <img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/500x_picture.png" alt="picture" width="50%" height="auto">
 </div>
 """
 
@@ -84,10 +121,18 @@ with iface:
     gr.Markdown("# 𝐍𝐯𝐢𝐝𝐢𝐚 𝐀𝟏𝟎𝟎 👋🏼👾🦾⚡ @{NbAiLab/whisper-norwegian-medium}\nUpload audio file: ☕")
     audio_input = gr.Audio(type="filepath")
     batch_size_input = gr.Slider(minimum=1, maximum=16, step=1, label="Batch Size")
-    transcription_output = gr.Textbox()
+    transcription_output = gr.Textbox(label="Transcription")
+    pdf_output = gr.File(label="Download Transcription as PDF")
+    summary_output = gr.Textbox(label="Summary")
     transcribe_button = gr.Button("Transcribe")
 
-    transcribe_button.click(fn=transcribe_audio, inputs=[audio_input, batch_size_input], outputs=transcription_output)
+    def process_audio(audio_file, batch_size):
+        transcription, result = transcribe_audio(audio_file, batch_size)
+        pdf_path = save_to_pdf(transcription)
+        summary = summarize_text(transcription)
+        return result, pdf_path, summary
+
+    transcribe_button.click(fn=process_audio, inputs=[audio_input, batch_size_input], outputs=[transcription_output, pdf_output, summary_output])
 
 # Launch interface
 iface.launch(share=True, debug=True)
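One more hedged note, on the new `save_to_pdf`: FPDF's built-in core fonts (including the "Arial" alias used here) cover only latin-1. Norwegian æ/ø/å are inside latin-1, but any character outside it in a transcription would make the PDF write fail, so a defensive variant could degrade unmappable characters rather than crash:

from fpdf import FPDF

def save_to_pdf(transcription):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    # Core fonts are latin-1 only; replace anything unmappable instead of raising
    safe_text = transcription.encode("latin-1", errors="replace").decode("latin-1")
    pdf.multi_cell(0, 10, safe_text)
    pdf_output_path = "transcription.pdf"
    pdf.output(pdf_output_path)
    return pdf_output_path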
 