codeteach committed on
Commit
5773fec
1 Parent(s): d9b5a75

Update app.py

Files changed (1)
  1. app.py +21 -16
app.py CHANGED
@@ -1,6 +1,5 @@
 import gradio as gr
 from transformers import pipeline, AutoTokenizer
-from sentence_transformers import SentenceTransformer, util
 import nltk
 from nltk.tokenize import sent_tokenize
 
@@ -25,7 +24,7 @@ summarization_models = {
 tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
 
 # Initialize summarization pipelines
-summarizers = {model: pipeline("summarization", model=model) for model in summarization_models.values()}
+summarizers = {name: pipeline("summarization", model=model) for name, model in summarization_models.items()}
 
 # Initialize translation pipeline
 def get_translator(language):
@@ -36,23 +35,28 @@ def get_translator(language):
 
 # Helper function to split text into chunks
 def split_text(text, max_tokens=1024):
-    inputs = tokenizer(text, return_tensors='pt', truncation=False)
-    input_ids = inputs['input_ids'][0]
-    total_tokens = len(input_ids)
-
+    sentences = sent_tokenize(text)
     chunks = []
-    start = 0
-    while start < total_tokens:
-        end = min(start + max_tokens, total_tokens)
-        chunk_ids = input_ids[start:end]
-        chunk_text = tokenizer.decode(chunk_ids, skip_special_tokens=True)
-        chunks.append(chunk_text)
-        start = end
-
+    current_chunk = []
+    current_length = 0
+
+    for sentence in sentences:
+        sentence_length = len(tokenizer.tokenize(sentence))
+        if current_length + sentence_length <= max_tokens:
+            current_chunk.append(sentence)
+            current_length += sentence_length
+        else:
+            chunks.append(" ".join(current_chunk))
+            current_chunk = [sentence]
+            current_length = sentence_length
+
+    if current_chunk:
+        chunks.append(" ".join(current_chunk))
+
     return chunks
 
 # Helper function to summarize text
-def summarize_text(text, model):
+def summarize_text(text, model_name):
     if len(text) < 200: # Adjust the threshold as needed
         print("Input text is too short for summarization. Please provide longer text.")
         return ""
@@ -60,7 +64,7 @@ def summarize_text(text, model):
     summaries = []
     for chunk in chunks:
         try:
-            summary = summarizers[model](chunk, max_length=150, min_length=20, do_sample=False)[0]['summary_text']
+            summary = summarizers[model_name](chunk, max_length=150, min_length=20, do_sample=False)[0]['summary_text']
            summaries.append(summary)
         except Exception as e:
             print(f"Error summarizing chunk: {chunk}\nError: {e}")
@@ -144,3 +148,4 @@ iface.launch()
 
 
 
+
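Review note: the rewritten split_text packs whole sentences into each chunk instead of slicing the token-id sequence at fixed offsets, so chunks no longer cut off mid-sentence. Below is a minimal, standalone sketch of that behavior; the whitespace token count is a stand-in for len(tokenizer.tokenize(...)) so it runs without downloading the BART checkpoint, and max_tokens is shrunk to make the splits visible.

```python
import nltk
from nltk.tokenize import sent_tokenize

nltk.download("punkt", quiet=True)      # sentence tokenizer data for sent_tokenize
nltk.download("punkt_tab", quiet=True)  # required by newer nltk releases

def split_text(text, max_tokens=8, count_tokens=lambda s: len(s.split())):
    # Greedily pack whole sentences until the next one would exceed the budget.
    sentences = sent_tokenize(text)
    chunks, current_chunk, current_length = [], [], 0
    for sentence in sentences:
        sentence_length = count_tokens(sentence)  # stand-in for len(tokenizer.tokenize(sentence))
        if current_length + sentence_length <= max_tokens:
            current_chunk.append(sentence)
            current_length += sentence_length
        else:
            chunks.append(" ".join(current_chunk))
            current_chunk = [sentence]
            current_length = sentence_length
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

text = "The first sentence is short. The second one runs a little longer. A third sentence ends the sample."
print(split_text(text))
# ['The first sentence is short.', 'The second one runs a little longer.', 'A third sentence ends the sample.']
```

Two edge cases worth a follow-up commit: a single sentence longer than max_tokens still becomes an oversized chunk (and, if it comes first, the else branch appends an empty string before it), and BART's 1024-token limit also counts special tokens, so a budget slightly below 1024 would be safer.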
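The comprehension change at the top is the companion fix: summarizers was previously keyed by model id (summarization_models.values()) while the interface presumably passes the human-readable key, so summarizers[model_name] would raise KeyError. A toy sketch of the mismatch follows; the dict entries are assumptions, since only facebook/bart-large-cnn is visible in this diff.

```python
# Hypothetical shape of summarization_models; only bart-large-cnn is confirmed by the diff.
summarization_models = {
    "BART": "facebook/bart-large-cnn",
    "Pegasus": "google/pegasus-xsum",
}

# Before: keyed by model id, so lookups by display name fail.
by_id = {model: f"pipeline({model!r})" for model in summarization_models.values()}
print("BART" in by_id)    # False -> the old summarizers["BART"] lookup raised KeyError

# After: keyed by display name, matching summarize_text(text, model_name).
by_name = {name: f"pipeline({model!r})" for name, model in summarization_models.items()}
print("BART" in by_name)  # True
```

Renaming the parameter to model_name makes the same intent explicit at the call site, and the dropped sentence_transformers import was apparently unused, trimming a heavyweight dependency.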