# smaller-pegasus / app.py
import os

import torch
import nltk
from nltk.tokenize import sent_tokenize
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

nltk.download('punkt')

# Load the summarization model and its tokenizer
model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model2 = PegasusForConditionalGeneration.from_pretrained(model_name)
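
# Illustrative sketch (not part of the app flow): summarizing a single short
# passage with the model loaded above, assuming it fits in one input window.
# The sample string is hypothetical; uncomment to try it standalone:
# sample = "The company reported quarterly revenue of $10.2 billion, up 8% year over year."
# input_ids = tokenizer(sample, truncation=True, return_tensors="pt").input_ids
# summary_ids = model2.generate(input_ids, num_beams=5, max_length=32, early_stopping=True)
# print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))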
def pegasus(text):
    '''Summarize each chunk of the tokenized input text and
    return the summarized document as output.'''
    # Persist the input to a temp file, then read it back
    data_path = "/tmp/"
    if not os.path.exists(data_path):
        os.makedirs(data_path)
    input_ = "/tmp/input.txt"
    with open(input_, "w") as file:
        file.write(text)
    with open(input_, 'r') as f:
        text_ = f.read()
    def tokenized_sentences(file):
        '''Split the text into chunks of whole sentences whose combined
        length stays under the model's input budget. Returns a list of
        sentence lists, one per chunk.'''
        tokenized_sentences = []
        sentences = []
        length = 0
        for sentence in sent_tokenize(file):
            length += len(sentence)
            # 512 tokens is the maximum input length for this Pegasus model;
            # character count is used here as a cheap proxy
            if length < 512:
                sentences.append(sentence)
            else:
                tokenized_sentences.append(sentences)
                sentences = [sentence]
                length = len(sentence)
        sentences = [sentence.strip() for sentence in sentences]
        # Append the final, partially filled chunk
        if sentences:
            tokenized_sentences.append(sentences)
        return tokenized_sentences
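
    # Illustration (hypothetical numbers): a ~1,200-character excerpt would come
    # back as roughly three chunks, e.g. [[s1, s2, ...], [s5, ...], [s9, ...]],
    # where each inner list joins to fewer than 512 characters.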
    tokenized = tokenized_sentences(text_)

    # Total number of sentences across all chunks; used to size the summaries
    size = sum(len(chunk) for chunk in tokenized)

    # Use GPU if available
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Collect one summary per chunk
    summary = []
    if size <= 4:
        max_length = size
    else:
        max_length = size // 4
    # Encode each chunk, generate an abstractive summary, then decode it
    for token in tokenized:
        # Encoding
        inputs = tokenizer.encode(' '.join(token), truncation=True, return_tensors='pt')
        # Use CPU or GPU
        inputs = inputs.to(device)
        # Beam-sampling: do_sample with top-k/top-p filtering on top of beam
        # search. Other knobs worth trying: num_return_sequences,
        # length_penalty, no_repeat_ngram_size, min_length.
        all_summary = model2.to(device).generate(inputs,
                                                 do_sample=True,
                                                 max_length=max_length,
                                                 top_k=50,
                                                 top_p=0.95,
                                                 num_beams=5,
                                                 early_stopping=True)
        # Decoding
        output = [tokenizer.decode(each_summary, skip_special_tokens=True,
                                   clean_up_tokenization_spaces=False)
                  for each_summary in all_summary]
        # Append each chunk's summaries to the running list
        summary.append(output)

    # Flatten the per-chunk summaries and join them into the final document
    summary = [sentence for each in summary for sentence in each]
    final = " ".join(summary)
    return final
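
# Illustrative local check (hypothetical file path, not part of the app):
# with open("some_10k_section.txt") as f:
#     print(pegasus(f.read()))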
import gradio as gr

interface1 = gr.Interface(
    fn=pegasus,
    inputs=gr.Textbox(lines=15, placeholder="Enter your text !!", label='Input-10k Sections'),
    outputs=gr.Textbox(label='Output- Pegasus'))
interface1.launch()
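
# When running outside a hosted Space, a temporary public URL can be requested
# with interface1.launch(share=True) instead of the plain launch() above.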