import io
import pickle

import torch
import nltk
import numpy as np
import pandas as pd
import gradio as gr
from nltk.cluster import KMeansClusterer
from scipy.spatial import distance_matrix
from finbert_embedding.embedding import FinbertEmbedding  # class of the pickled model
# The model was pickled on a GPU machine; on CPU-only hardware a plain
# pickle.load(f) fails, so it becomes CPU_Unpickler(f).load() instead.
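# Minimal sketch of the CPU_Unpickler mentioned above (a common pattern,
# reconstructed here since the class is not defined in this file): it
# overrides pickle's find_class hook so torch storages are remapped to CPU.
class CPU_Unpickler(pickle.Unpickler):
    def find_class(self, module, name):
        if module == 'torch.storage' and name == '_load_from_bytes':
            # Re-route torch's byte-level loader through map_location='cpu'
            return lambda b: torch.load(io.BytesIO(b), map_location='cpu')
        return super().find_class(module, name)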
model_path = 'finbert.sav'
# Load the pickled FinbertEmbedding model, remapping any GPU tensors to CPU
with open(model_path, 'rb') as f:
    model = CPU_Unpickler(f).load()
# Download the sentence tokenizer data used by nltk.sent_tokenize
nltk.download('punkt')
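# For reference, sent_tokenize splits raw text into one string per sentence
# (hypothetical sample, not from the original file):
#   nltk.sent_tokenize('Rates rose. Bonds fell.')
#   -> ['Rates rose.', 'Bonds fell.']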
def make_summary(word):
    # Split the input text into sentence tokens
    tokens = nltk.sent_tokenize(word)
    # Strip leading and trailing whitespace from each sentence
    sentences = [sentence.strip() for sentence in tokens]
    # Create a DataFrame from the tokens
    data = pd.DataFrame(sentences)
    # Name the column containing the text tokens
    data.columns = ['Sentences']
    # Create a numerical embedding for each sentence token in the DataFrame
    def get_sentence_embeddings():
        # Collect one FinBERT embedding per sentence
        sentence_list = []
        for i in tokens:
            sentence_embedding = model.sentence_vector(i)
            sentence_list.append(sentence_embedding)
        # Convert each embedding from a torch tensor to a NumPy array
        sentence_array = []
        for i in sentence_list:
            sentence_array.append(i.numpy())
        # Return the sentence embeddings as a list of arrays
        return sentence_array

    # Apply get_sentence_embeddings to create the Embeddings column
    data['Embeddings'] = get_sentence_embeddings()
    # Target roughly one summary sentence per four input sentences,
    # with a single cluster for very short texts
    if len(tokens) <= 4:
        NUM_CLUSTERS = 1
    else:
        NUM_CLUSTERS = len(tokens) // 4
    iterations = 25
    # Stack the embeddings into an array for clustering
    X = np.array(data['Embeddings'].to_list())
    # Build the k-means clusterer over the sentence embeddings
    Kclusterer = KMeansClusterer(
        NUM_CLUSTERS,
        distance=nltk.cluster.util.cosine_distance,
        repeats=iterations, avoid_empty_clusters=True)
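    # For reference, NLTK's cosine_distance(u, v) = 1 - dot(u, v) / (|u| |v|),
    # e.g. parallel embeddings -> 0.0, orthogonal embeddings -> 1.0, so
    # sentences cluster by direction in embedding space, not by magnitude.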
    # If the text is too short, k-means raises a ValueError;
    # in that case return the input text unchanged as its own summary.
    try:
        assigned_clusters = Kclusterer.cluster(X, assign_clusters=True)
        # Record each sentence's cluster and that cluster's centroid
        data['Cluster'] = pd.Series(assigned_clusters, index=data.index)
        data['Centroid'] = data['Cluster'].apply(lambda x: Kclusterer.means()[x])
    except ValueError:
        return word
    # Compute the distance of each embedding from its cluster centroid
    def distance_from_centroid(row):
        return distance_matrix([row['Embeddings']], [row['Centroid'].tolist()])[0][0]

    data['Distance_From_Centroid'] = data.apply(distance_from_centroid, axis=1)
    # Final summary: keep the sentence closest to each centroid,
    # then restore the sentences' original order
    summary = " ".join(data.sort_values(
        'Distance_From_Centroid',
        ascending=True).groupby('Cluster').head(1).sort_index()['Sentences'].tolist())
    return summary
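# Quick offline sanity check (hypothetical sample text, not from the original
# Space); uncomment to try the summarizer without the Gradio UI:
# sample = ('Revenue grew 10% year over year. Operating costs declined. '
#           'Gross margins improved materially. Full-year guidance was raised. '
#           'Supply-chain risks remain a concern.')
# print(make_summary(sample))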
# Gradio UI: paste a 10-K section, get the FinBERT extractive summary.
# The deprecated gr.inputs/gr.outputs wrappers are replaced by gr.Textbox.
interface1 = gr.Interface(
    fn=make_summary,
    inputs=gr.Textbox(lines=15, placeholder='Enter your text!', label='Input - 10-K Sections'),
    outputs=gr.Textbox(label='Output - FinBERT'))
interface1.launch()