# smaller-pegasus / app.py
import os

import torch
import nltk
from nltk.tokenize import sent_tokenize
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

nltk.download('punkt')

# Load the summarization model and its tokenizer
model_name = "human-centered-summarization/financial-summarization-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model2 = PegasusForConditionalGeneration.from_pretrained(model_name)
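
# Illustrative sketch (not part of the app flow): summarizing a single short
# passage with the model loaded above, assuming it fits in one input window.
# The sample string is hypothetical; uncomment to try it standalone:
# sample = "The company reported quarterly revenue of $10.2 billion, up 8% year over year."
# input_ids = tokenizer(sample, truncation=True, return_tensors="pt").input_ids
# summary_ids = model2.generate(input_ids, num_beams=5, max_length=32, early_stopping=True)
# print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))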
def pegasus(text):
    '''Summarize each chunk of the tokenized input text and
    return the summarized document as output.'''
    # Persist the input to a temp file, then read it back
    data_path = "/tmp/"
    if not os.path.exists(data_path):
        os.makedirs(data_path)
    input_ = "/tmp/input.txt"
    with open(input_, "w") as file:
        file.write(text)
    with open(input_, 'r') as f:
        text_ = f.read()
    def tokenized_sentences(file):
        '''Split the text into chunks of whole sentences whose combined
        length stays under the model's input budget. Returns a list of
        sentence lists, one per chunk.'''
        tokenized_sentences = []
        sentences = []
        length = 0
        for sentence in sent_tokenize(file):
            length += len(sentence)
            # 512 tokens is the maximum input length for this Pegasus model;
            # character count is used here as a cheap proxy
            if length < 512:
                sentences.append(sentence)
            else:
                tokenized_sentences.append(sentences)
                sentences = [sentence]
                length = len(sentence)
        sentences = [sentence.strip() for sentence in sentences]
        # Append the final, partially filled chunk
        if sentences:
            tokenized_sentences.append(sentences)
        return tokenized_sentences
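
    # Illustration (hypothetical numbers): a ~1,200-character excerpt would come
    # back as roughly three chunks, e.g. [[s1, s2, ...], [s5, ...], [s9, ...]],
    # where each inner list joins to fewer than 512 characters.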
    tokenized = tokenized_sentences(text_)

    # Total number of sentences across all chunks; used to size the summaries
    size = sum(len(chunk) for chunk in tokenized)

    # Use GPU if available
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Collect one summary per chunk
    summary = []
    if size <= 4:
        max_length = size
    else:
        max_length = size // 4
    # Encode each chunk, generate an abstractive summary, then decode it
    for token in tokenized:
        # Encoding
        inputs = tokenizer.encode(' '.join(token), truncation=True, return_tensors='pt')
        # Use CPU or GPU
        inputs = inputs.to(device)
        # Beam-sampling: do_sample with top-k/top-p filtering on top of beam
        # search. Other knobs worth trying: num_return_sequences,
        # length_penalty, no_repeat_ngram_size, min_length.
        all_summary = model2.to(device).generate(inputs,
                                                 do_sample=True,
                                                 max_length=max_length,
                                                 top_k=50,
                                                 top_p=0.95,
                                                 num_beams=5,
                                                 early_stopping=True)
        # Decoding
        output = [tokenizer.decode(each_summary, skip_special_tokens=True,
                                   clean_up_tokenization_spaces=False)
                  for each_summary in all_summary]
        # Append each chunk's summaries to the running list
        summary.append(output)

    # Flatten the per-chunk summaries and join them into the final document
    summary = [sentence for each in summary for sentence in each]
    final = " ".join(summary)
    return final
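
# Illustrative local check (hypothetical file path, not part of the app):
# with open("some_10k_section.txt") as f:
#     print(pegasus(f.read()))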
import gradio as gr

interface1 = gr.Interface(
    fn=pegasus,
    inputs=gr.Textbox(lines=15, placeholder="Enter your text !!", label='Input-10k Sections'),
    outputs=gr.Textbox(label='Output- Pegasus'))
interface1.launch()
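
# When running outside a hosted Space, a temporary public URL can be requested
# with interface1.launch(share=True) instead of the plain launch() above.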