camparchimedes committed on
Commit 3d3ff49
1 Parent(s): d18b751

Update app.py

Files changed (1):
  app.py +21 -37
app.py CHANGED
@@ -1,11 +1,20 @@
-# app.py
-
+import os
 import gradio as gr
 import warnings
 import torch
-from transformers import pipeline, WhisperTokenizer, WhisperForConditionalGeneration, WhisperProcessor
+from transformers import pipeline, WhisperTokenizer, WhisperForConditionalGeneration, WhisperProcessor, AutoTokenizer, AutoModelForSeq2SeqLM
+from pydub import AudioSegment
+import soundfile as sf
+import numpy as np
+import nltk
+from fpdf import FPDF
+import time
 
 warnings.filterwarnings("ignore")
+nltk.download('punkt')
+
+# Load environment variable
+HF_AUTH_TOKEN = os.getenv('HF_AUTH_TOKEN')
 
 # Load tokenizer and model
 tokenizer = WhisperTokenizer.from_pretrained("NbAiLabBeta/nb-whisper-medium")
@@ -15,9 +24,10 @@ processor = WhisperProcessor.from_pretrained("NbAiLabBeta/nb-whisper-medium")
 # Set up the device
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 torch_dtype = torch.float32
+model.to(device)
 
 # Initialize pipeline
-asr = pipeline("automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, device=device, torch_dtype=torch_dtype)
+asr = pipeline("automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, device=device, torch_dtype=torch.float32)
 
 def transcribe_audio(audio_file):
     # Perform transcription
@@ -33,38 +43,16 @@ iface = gr.Interface(
     title="Audio Transcription App",
     description="Upload an audio file to get the transcription",
     theme="default",
+    layout="vertical",
     live=False
 )
 
-
-from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, AutoTokenizer, AutoModelForSeq2SeqLM
-from pydub import AudioSegment
-import soundfile as sf
-import numpy as np
-import os
-import nltk
-from fpdf import FPDF
-import time
-
-nltk.download('punkt')
-
-HF_AUTH_TOKEN = os.getenv('HF_AUTH_TOKEN')
-
-# transcription
-processor = AutoProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
-transcription_model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
+# Load summarization model and tokenizer
+summarization_tokenizer = AutoTokenizer.from_pretrained("t5-base")
+summarization_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
 
-# summarization
-summarization_tokenizer = AutoTokenizer.from_pretrained("NbAiLab/norbert-summarization")
-summarization_model = AutoModelForSeq2SeqLM.from_pretrained("NbAiLab/norbert-summarization")
-
-# setup
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-torch_dtype = torch.float32
-
-# move models to device
-transcription_model.to(device)
-summarization_model.to(device)  # NB: model needs to be told to use a graph-based summary method (LexRank?)
+# Move summarization model to device
+summarization_model.to(device)
 
 def convert_to_wav(audio_file):
     audio = AudioSegment.from_file(audio_file, format="m4a")
@@ -95,7 +83,6 @@ def transcribe_audio(audio_file, batch_size=4):
             num_beams=7,
             task="transcribe",
             attention_mask=attention_mask,
-            # forced_decoder_ids=None,  # NB: forced_decoder_ids must not be set; commented out just in case.
             language="no"
         )
         transcription += " ".join(processor.batch_decode(output, skip_special_tokens=True)) + " "
@@ -109,7 +96,7 @@ def transcribe_audio(audio_file, batch_size=4):
     return transcription.strip(), result
 
 def summarize_text(text):
-    inputs = summarization_tokenizer([text], max_length=1024, return_tensors="pt", truncation=True)
+    inputs = summarization_tokenizer("summarize: " + text, return_tensors="pt", max_length=1024, truncation=True)
     inputs = inputs.to(device)
     summary_ids = summarization_model.generate(inputs.input_ids, num_beams=4, max_length=150, early_stopping=True)
     summary = summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
@@ -144,7 +131,6 @@ with iface:
 
     transcribe_button.click(fn=transcribe_and_summarize, inputs=[audio_input, batch_size_input], outputs=[transcription_output, summary_output])
 
-
def save_to_pdf(transcription, summary):
     pdf = FPDF()
     pdf.add_page()
@@ -163,7 +149,5 @@ def save_to_pdf(transcription, summary):
     pdf.output(pdf_output_path)
     return pdf_output_path
 
-
-
 # run
 iface.launch(share=True, debug=True)
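
For quick verification of the pipeline wiring this commit sets up, a minimal standalone smoke-test sketch; the sample filename, chunk_length_s value, and generate_kwargs are illustrative assumptions, not part of the commit:

import torch
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration

# Same model and processor as app.py, loaded here so the sketch is self-contained.
processor = WhisperProcessor.from_pretrained("NbAiLabBeta/nb-whisper-medium")
model = WhisperForConditionalGeneration.from_pretrained("NbAiLabBeta/nb-whisper-medium")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

asr = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=device,
    torch_dtype=torch.float32,
)

# "sample.wav" is a placeholder; chunk_length_s=30 matches Whisper's 30-second input window.
result = asr("sample.wav", chunk_length_s=30, generate_kwargs={"task": "transcribe", "language": "no"})
print(result["text"])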
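
The swap from NbAiLab/norbert-summarization to t5-base is why summarize_text now prepends "summarize: ": T5 is a multi-task model that selects its task from a text prefix. A standalone sketch of that convention follows (the input text is a placeholder); note that t5-base was trained primarily on English, so summaries of Norwegian transcripts may be rough:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

summarization_tokenizer = AutoTokenizer.from_pretrained("t5-base")
summarization_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

text = "Long transcription text goes here."  # placeholder input
# The "summarize: " prefix tells T5 which of its tasks to perform.
inputs = summarization_tokenizer("summarize: " + text, return_tensors="pt", max_length=1024, truncation=True)
summary_ids = summarization_model.generate(inputs.input_ids, num_beams=4, max_length=150, early_stopping=True)
print(summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True))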