import io
import pickle

import torch
import nltk
import numpy as np
import pandas as pd
import gradio as gr
from nltk.cluster import KMeansClusterer
from scipy.spatial import distance_matrix
from finbert_embedding.embedding import FinbertEmbedding  # class of the pickled model
# The model was pickled on a GPU machine; on CPU-only hardware a plain
# pickle.load(f) fails, so it becomes CPU_Unpickler(f).load() instead.
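# Minimal sketch of the CPU_Unpickler mentioned above (a common pattern,
# reconstructed here since the class is not defined in this file): it
# overrides pickle's find_class hook so torch storages are remapped to CPU.
class CPU_Unpickler(pickle.Unpickler):
    def find_class(self, module, name):
        if module == 'torch.storage' and name == '_load_from_bytes':
            # Re-route torch's byte-level loader through map_location='cpu'
            return lambda b: torch.load(io.BytesIO(b), map_location='cpu')
        return super().find_class(module, name)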
model_path = 'finbert.sav'
# Load the pickled FinbertEmbedding model, remapping any GPU tensors to CPU
with open(model_path, 'rb') as f:
    model = CPU_Unpickler(f).load()
# Download the sentence tokenizer data used by nltk.sent_tokenize
nltk.download('punkt')
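# For reference, sent_tokenize splits raw text into one string per sentence
# (hypothetical sample, not from the original file):
#   nltk.sent_tokenize('Rates rose. Bonds fell.')
#   -> ['Rates rose.', 'Bonds fell.']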
def make_summary(word):
    # Split the input text into sentence tokens
    tokens = nltk.sent_tokenize(word)
    # Strip leading and trailing whitespace from each sentence
    sentences = [sentence.strip() for sentence in tokens]
    # Create a DataFrame from the tokens
    data = pd.DataFrame(sentences)
    # Name the column containing the text tokens
    data.columns = ['Sentences']
    # Create a numerical embedding for each sentence token in the DataFrame
    def get_sentence_embeddings():
        # Collect one FinBERT embedding per sentence
        sentence_list = []
        for i in tokens:
            sentence_embedding = model.sentence_vector(i)
            sentence_list.append(sentence_embedding)
        # Convert each embedding from a torch tensor to a NumPy array
        sentence_array = []
        for i in sentence_list:
            sentence_array.append(i.numpy())
        # Return the sentence embeddings as a list of arrays
        return sentence_array

    # Apply get_sentence_embeddings to create the Embeddings column
    data['Embeddings'] = get_sentence_embeddings()
    # Target roughly one summary sentence per four input sentences,
    # with a single cluster for very short texts
    if len(tokens) <= 4:
        NUM_CLUSTERS = 1
    else:
        NUM_CLUSTERS = len(tokens) // 4
    iterations = 25
    # Stack the embeddings into an array for clustering
    X = np.array(data['Embeddings'].to_list())
    # Build the k-means clusterer over the sentence embeddings
    Kclusterer = KMeansClusterer(
        NUM_CLUSTERS,
        distance=nltk.cluster.util.cosine_distance,
        repeats=iterations, avoid_empty_clusters=True)
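    # For reference, NLTK's cosine_distance(u, v) = 1 - dot(u, v) / (|u| |v|),
    # e.g. parallel embeddings -> 0.0, orthogonal embeddings -> 1.0, so
    # sentences cluster by direction in embedding space, not by magnitude.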
    # If the text is too short, k-means raises a ValueError;
    # in that case return the input text unchanged as its own summary.
    try:
        assigned_clusters = Kclusterer.cluster(X, assign_clusters=True)
        # Record each sentence's cluster and that cluster's centroid
        data['Cluster'] = pd.Series(assigned_clusters, index=data.index)
        data['Centroid'] = data['Cluster'].apply(lambda x: Kclusterer.means()[x])
    except ValueError:
        return word
    # Compute the distance of each embedding from its cluster centroid
    def distance_from_centroid(row):
        return distance_matrix([row['Embeddings']], [row['Centroid'].tolist()])[0][0]

    data['Distance_From_Centroid'] = data.apply(distance_from_centroid, axis=1)
    # Final summary: keep the sentence closest to each centroid,
    # then restore the sentences' original order
    summary = " ".join(data.sort_values(
        'Distance_From_Centroid',
        ascending=True).groupby('Cluster').head(1).sort_index()['Sentences'].tolist())
    return summary
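# Quick offline sanity check (hypothetical sample text, not from the original
# Space); uncomment to try the summarizer without the Gradio UI:
# sample = ('Revenue grew 10% year over year. Operating costs declined. '
#           'Gross margins improved materially. Full-year guidance was raised. '
#           'Supply-chain risks remain a concern.')
# print(make_summary(sample))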
# Gradio UI: paste a 10-K section, get the FinBERT extractive summary.
# The deprecated gr.inputs/gr.outputs wrappers are replaced by gr.Textbox.
interface1 = gr.Interface(
    fn=make_summary,
    inputs=gr.Textbox(lines=15, placeholder='Enter your text!', label='Input - 10-K Sections'),
    outputs=gr.Textbox(label='Output - FinBERT'))
interface1.launch()