camparchimedes committed on
Commit
f7e87b9
1 Parent(s): 9e722fb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -38
app.py CHANGED
@@ -12,64 +12,50 @@ def convert_to_wav(audio_file):
12
  audio.export(wav_file, format="wav")
13
  return wav_file
14
 
15
-
16
-
17
  import torch
18
  from transformers import pipeline # AutoProcessor, AutoModelForSpeechSeq2Seq
19
 
20
-
21
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
22
  torch_dtype = torch.float32
23
 
24
- # ASR pipeline
25
  asr = pipeline("automatic-speech-recognition", model="NbAiLabBeta/nb-whisper-large", device=device, torch_dtype=torch_dtype)
26
 
27
- # ASR
28
  def transcribe_audio(audio_file):
29
  if audio_file.endswith(".m4a"):
30
  audio_file = convert_to_wav(audio_file)
31
 
32
  start_time = time.time()
33
 
34
-
35
  with torch.no_grad():
36
- output = asr(audio_file, chunk_length_s=30, generate_kwargs={"num_beams": 8}) # "task": "transcribe", "language": "no" + ..batch_size=10 ?
37
 
38
  transcription = output["text"]
39
  end_time = time.time()
40
-
41
  output_time = end_time - start_time
42
  word_count = len(transcription.split())
43
 
44
- result = f"Time taken: {transcription_time:.2f} seconds\nNumber of words: {word_count}"
45
 
46
  return transcription.strip(), result
47
 
48
-
49
-
50
-
51
-
52
-
53
-
54
-
55
- # -------------------------------------[VERSION 3: full-on t5-base + NLTK + 3 styles for summarization]-------------------------------------
56
  import nltk
57
  from nltk.tokenize import word_tokenize, sent_tokenize
58
  from nltk.corpus import stopwords
59
  import networkx as nx
60
- from sklearn.feature_extraction.text import TfidfVectorizer # from before
61
- from sklearn.metrics.pairwise import cosine_similarity # from before
62
  import pandas as pd
63
  import numpy as np
64
- from rouge_score import rouge_scorer
65
 
66
- nltk.download('punkt') # from before
67
  nltk.download('stopwords')
68
 
69
-
70
  WHITESPACE_HANDLER = lambda k: re.sub('\s+', ' ', re.sub('\n+', ' ', k.strip()))
71
 
72
- # Clean text using Norwegian-specific replacements
 
73
  def clean_text(text):
74
  text = re.sub(r'https?:\/\/.*[\r\n]*', '', str(text), flags=re.MULTILINE)
75
  text = re.sub(r'\<a href', ' ', str(text))
@@ -98,15 +84,11 @@ def preprocess_text(text):
98
  except Exception as e:
99
  st.error(f"Error during text preprocessing: {e}")
100
  return None
101
- # ROUGE metric
102
- scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=False)
103
-
104
 
105
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
106
 
107
  summarization_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base", return_dict=True, torch_dtype=torch.float16)
108
  summarization_tokenizer = AutoTokenizer.from_pretrained("t5-base")
109
-
110
  summarization_model.to(device)
111
 
112
  def summarize_text(text):
@@ -128,7 +110,6 @@ def build_similarity_matrix(sentences, stop_words):
128
  similarity_matrix.add_edge(i, j, weight=len(common_words))
129
  return similarity_matrix
130
 
131
-
132
  def graph_based_summary(text, num_paragraphs=3):
133
  sentences = text.strip().split(".")
134
  if len(sentences) < num_paragraphs:
@@ -142,7 +123,7 @@ def graph_based_summary(text, num_paragraphs=3):
142
  summary = [sent for _, sent in ranked_sentences[:num_paragraphs]]
143
  return summary
144
 
145
- def lex_rank_summary(text, num_paragraphs=3):
146
  sentences = nltk.sent_tokenize(text)
147
  if len(sentences) < num_paragraphs:
148
  return sentences
@@ -150,26 +131,37 @@ def lex_rank_summary(text, num_paragraphs=3):
150
  vectorizer = TfidfVectorizer(stop_words=list(stop_words))
151
  X = vectorizer.fit_transform(sentences)
152
  similarity_matrix = cosine_similarity(X, X)
 
 
 
 
 
 
 
153
  nx_graph = nx.from_numpy_array(similarity_matrix)
154
- scores = nx.pagerank_numpy(nx_graph)
155
  ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
156
  summary = [ranked_sentences[i][1] for i in range(num_paragraphs)]
157
  return summary
158
 
 
159
  def text_rank_summary(text, num_paragraphs=3):
160
  sentences = nltk.sent_tokenize(text)
161
  if len(sentences) < num_paragraphs:
162
  return sentences
 
163
  stop_words = set(stopwords.words('norwegian'))
164
  vectorizer = TfidfVectorizer(stop_words=list(stop_words))
165
  X = vectorizer.fit_transform(sentences)
166
  similarity_matrix = cosine_similarity(X, X)
167
- nx_graph = nx.from_numpy_array(similarity_matrix)
168
- scores = nx.pagerank(nx_graph)
169
- ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
170
- summary = [ranked_sentences[i][1] for i in range(num_paragraphs)]
171
- return summary
172
 
 
 
 
 
 
 
 
173
 
174
 
175
  import gradio as gr
@@ -194,7 +186,6 @@ def save_to_pdf(transcription, summary):
194
  pdf.output(pdf_output_path)
195
  return pdf_output_path
196
 
197
-
198
  banner_html = """
199
  <div style="text-align: center;">
200
  <img src="https://huggingface.co/spaces/camparchimedes/transcription_app/raw/main/picture.png" alt="Banner" width="100%" height="auto">
@@ -267,7 +258,6 @@ with iface:
267
  outputs=summary_output
268
  )
269
 
270
-
271
  with gr.TabItem("Summary_t3"):
272
  summary_output = gr.Textbox(label="Summary | TextRank")
273
  summarize_button = gr.Button("Summarize")
@@ -284,7 +274,6 @@ with iface:
284
  outputs=summary_output
285
  )
286
 
287
-
288
  with gr.TabItem("Download PDF"):
289
  pdf_transcription_only = gr.Button("Download PDF with Transcription Only")
290
  pdf_summary_only = gr.Button("Download PDF with Summary Only")
 
12
  audio.export(wav_file, format="wav")
13
  return wav_file
14
 
 
 
15
  import torch
16
  from transformers import pipeline # AutoProcessor, AutoModelForSpeechSeq2Seq
17
 
 
18
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
19
  torch_dtype = torch.float32
20
 
 
21
  asr = pipeline("automatic-speech-recognition", model="NbAiLabBeta/nb-whisper-large", device=device, torch_dtype=torch_dtype)
22
 
 
23
  def transcribe_audio(audio_file):
24
  if audio_file.endswith(".m4a"):
25
  audio_file = convert_to_wav(audio_file)
26
 
27
  start_time = time.time()
28
 
 
29
  with torch.no_grad():
30
+ output = asr(audio_file, chunk_length_s=30, generate_kwargs={"num_beams": 8, "batch_size": 10}) # "task": "transcribe", "language": "no" + ..batch_size=10 ?
31
 
32
  transcription = output["text"]
33
  end_time = time.time()
 
34
  output_time = end_time - start_time
35
  word_count = len(transcription.split())
36
 
37
+ result = f"Time taken: {output_time:.2f} seconds\nNumber of words: {word_count}"
38
 
39
  return transcription.strip(), result
40
 
41
+ # [VERSION 3: full-on w/ 3 styles for summarization]
 
 
 
 
 
 
 
42
  import nltk
43
  from nltk.tokenize import word_tokenize, sent_tokenize
44
  from nltk.corpus import stopwords
45
  import networkx as nx
46
+ from sklearn.feature_extraction.text import TfidfVectorizer
47
+ from sklearn.metrics.pairwise import cosine_similarity
48
  import pandas as pd
49
  import numpy as np
50
+ import re
51
 
52
+ nltk.download('punkt')
53
  nltk.download('stopwords')
54
 
 
55
  WHITESPACE_HANDLER = lambda k: re.sub('\s+', ' ', re.sub('\n+', ' ', k.strip()))
56
 
57
+ text = transcription
58
+
59
  def clean_text(text):
60
  text = re.sub(r'https?:\/\/.*[\r\n]*', '', str(text), flags=re.MULTILINE)
61
  text = re.sub(r'\<a href', ' ', str(text))
 
84
  except Exception as e:
85
  st.error(f"Error during text preprocessing: {e}")
86
  return None
 
 
 
87
 
88
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
89
 
90
  summarization_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base", return_dict=True, torch_dtype=torch.float16)
91
  summarization_tokenizer = AutoTokenizer.from_pretrained("t5-base")
 
92
  summarization_model.to(device)
93
 
94
  def summarize_text(text):
 
110
  similarity_matrix.add_edge(i, j, weight=len(common_words))
111
  return similarity_matrix
112
 
 
113
  def graph_based_summary(text, num_paragraphs=3):
114
  sentences = text.strip().split(".")
115
  if len(sentences) < num_paragraphs:
 
123
  summary = [sent for _, sent in ranked_sentences[:num_paragraphs]]
124
  return summary
125
 
126
+ def lex_rank_summary(text, num_paragraphs=3, threshold=0.1):
127
  sentences = nltk.sent_tokenize(text)
128
  if len(sentences) < num_paragraphs:
129
  return sentences
 
131
  vectorizer = TfidfVectorizer(stop_words=list(stop_words))
132
  X = vectorizer.fit_transform(sentences)
133
  similarity_matrix = cosine_similarity(X, X)
134
+
135
+
136
+ for i in range(len(similarity_matrix)): # threshold
137
+ for j in range(len(similarity_matrix[i])):
138
+ if similarity_matrix[i][j] < threshold:
139
+ similarity_matrix[i][j] = 0.0
140
+
141
  nx_graph = nx.from_numpy_array(similarity_matrix)
142
+ scores = nx.pagerank(nx_graph)
143
  ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
144
  summary = [ranked_sentences[i][1] for i in range(num_paragraphs)]
145
  return summary
146
 
147
+
148
  def text_rank_summary(text, num_paragraphs=3):
149
  sentences = nltk.sent_tokenize(text)
150
  if len(sentences) < num_paragraphs:
151
  return sentences
152
+
153
  stop_words = set(stopwords.words('norwegian'))
154
  vectorizer = TfidfVectorizer(stop_words=list(stop_words))
155
  X = vectorizer.fit_transform(sentences)
156
  similarity_matrix = cosine_similarity(X, X)
 
 
 
 
 
157
 
158
+ nx_graph = nx.from_numpy_array(similarity_matrix) # graph, nodes (i.e sentences) & edges are similarity scores (is cool)
159
+ scores = nx.pagerank(nx_graph) # PageRank algorithm, scoring sentences
160
+ ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True) # rank by PageRank scores
161
+
162
+ summary = [ranked_sentences[i][1] for i in range(num_paragraphs)] # top sentences for summary
163
+
164
+ return ' '.join(summary)
165
 
166
 
167
  import gradio as gr
 
186
  pdf.output(pdf_output_path)
187
  return pdf_output_path
188
 
 
189
  banner_html = """
190
  <div style="text-align: center;">
191
  <img src="https://huggingface.co/spaces/camparchimedes/transcription_app/raw/main/picture.png" alt="Banner" width="100%" height="auto">
 
258
  outputs=summary_output
259
  )
260
 
 
261
  with gr.TabItem("Summary_t3"):
262
  summary_output = gr.Textbox(label="Summary | TextRank")
263
  summarize_button = gr.Button("Summarize")
 
274
  outputs=summary_output
275
  )
276
 
 
277
  with gr.TabItem("Download PDF"):
278
  pdf_transcription_only = gr.Button("Download PDF with Transcription Only")
279
  pdf_summary_only = gr.Button("Download PDF with Summary Only")