# botchagalupe's picture
# First Commit
# 933c7ad
import gradio as gr
import pandas as pd
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# Constants for default values.
# These pre-fill the "Chunk Size", "Chunk Overlap" and
# "Number of Chunks to Display" inputs of the Gradio interface.
# Chunk sizes are measured with len(), i.e. in characters.
DEFAULT_CHUNK_SIZE = 100
DEFAULT_CHUNK_OVERLAP = 0
DEFAULT_NUM_CHUNKS = 10
# Initialize the sentence transformer model for embeddings.
# The model is created once at import time and shared by
# calculate_embeddings() and search_similar_chunks().
model = SentenceTransformer('all-MiniLM-L6-v2')
def tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks):
    """
    Split the input text into chunks and describe each chunk in a table.

    Args:
        method: Name of the splitting strategy. Only
            "RecursiveCharacterTextSplitter" is recognised; any other value
            (including None) yields an empty table.
        text: Text to split. None, empty or whitespace-only text yields an
            empty table.
        chunk_size: Maximum chunk length in characters (coerced to int).
        chunk_overlap: Overlap between consecutive chunks in characters
            (coerced to int).
        num_chunks: Maximum number of chunks to keep (coerced to int;
            negative values are treated as 0).

    Returns:
        A pandas DataFrame that always has the columns 'Chunk #',
        'Text Chunk', 'Character Count' and 'Token Count', one row per chunk.
        'Token Count' is the number of whitespace-separated words.
    """
    columns = ['Chunk #', 'Text Chunk', 'Character Count', 'Token Count']
    # Clamp so a negative value cannot turn the slice below into
    # "everything except the last N chunks".
    num_chunks = max(int(num_chunks), 0)

    # Ensure text is provided
    if not text or not text.strip():
        return pd.DataFrame(columns=columns)

    chunks = []
    if method == "RecursiveCharacterTextSplitter":
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=int(chunk_size),
            chunk_overlap=int(chunk_overlap),
            length_function=len,
            is_separator_regex=False,
        )
        chunks = text_splitter.split_text(text)[:num_chunks]

    rows = [
        {
            'Chunk #': i,
            'Text Chunk': chunk,
            'Character Count': len(chunk),
            'Token Count': len(chunk.split()),
        }
        for i, chunk in enumerate(chunks)
    ]
    # Passing columns explicitly keeps the schema stable even when no chunk
    # was produced, so downstream column selection never fails.
    return pd.DataFrame(rows, columns=columns)
def calculate_embeddings(df):
    """
    Calculates embeddings for each text chunk in the dataframe.

    Args:
        df: DataFrame with a 'Text Chunk' column (may be empty).

    Returns:
        A copy of ``df`` with an added 'Embeddings' column holding one
        embedding (as a plain list) per row. The input frame is left
        untouched. For an empty frame the 'Embeddings' column is still
        added so that callers can rely on a fixed set of columns.
    """
    result = df.copy()
    if result.empty:
        # Nothing to encode, but keep the schema consistent with the
        # non-empty case.
        result['Embeddings'] = pd.Series(dtype=object)
        return result
    chunks = result['Text Chunk'].tolist()
    embeddings = model.encode(chunks)
    result['Embeddings'] = embeddings.tolist()
    return result
def search_similar_chunks(query, df_with_embeddings):
    """
    Search for chunks similar to the query embedding.

    Args:
        query: Query string to embed and compare against each chunk.
        df_with_embeddings: DataFrame with an 'Embeddings' column holding
            one embedding per row (may have zero rows).

    Returns:
        A new DataFrame with a 'Similarity' column (cosine similarity between
        the query and each chunk) inserted as the second column, sorted by
        similarity in descending order. The input frame is not modified.
    """
    # drop() returns a new frame, so the caller's dataframe is never mutated,
    # and a 'Similarity' column left over from an earlier search cannot make
    # insert() raise "already exists".
    result = df_with_embeddings.drop(columns=['Similarity'], errors='ignore')
    # Insert after 'Chunk #' (position 1), or first if there are no columns.
    insert_at = min(1, len(result.columns))

    if len(result.index) == 0:
        # np.vstack cannot stack an empty sequence; there is nothing to score.
        result.insert(insert_at, 'Similarity', np.empty(0, dtype=float))
        return result

    # Compute the query embedding
    query_embedding = model.encode([query])[0]
    # Calculate similarity scores
    chunk_embeddings = np.vstack(result['Embeddings'].tolist())
    similarity_scores = cosine_similarity([query_embedding], chunk_embeddings)[0]
    result.insert(insert_at, 'Similarity', similarity_scores)
    # Return the dataframe sorted by similarity scores in descending order
    return result.sort_values(by='Similarity', ascending=False)
def process_and_embed(method, text, chunk_size, chunk_overlap, num_chunks):
    """
    Chunk the text, then attach an embedding to every chunk.

    All arguments are forwarded unchanged to tokenize_text(); the resulting
    table is passed to calculate_embeddings() and its result is returned.
    """
    return calculate_embeddings(
        tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks)
    )
def update_output(method, text, chunk_size, chunk_overlap, num_chunks, query):
    """
    Gradio callback: chunk the text, embed the chunks and optionally rank
    them against a query.

    Args:
        method: Name of the splitting strategy selected in the UI.
        text: Text to split into chunks.
        chunk_size: Maximum chunk length in characters.
        chunk_overlap: Overlap between consecutive chunks in characters.
        num_chunks: Maximum number of chunks to display.
        query: Optional query string. None, empty or whitespace-only means
            "no similarity search".

    Returns:
        A DataFrame with the columns 'Chunk #', 'Text Chunk',
        'Character Count', 'Token Count' and 'Embeddings'. When a query is
        given, a 'Similarity' column follows 'Chunk #' and the rows are
        ordered by descending similarity.
    """
    df_with_embeddings = process_and_embed(method, text, chunk_size, chunk_overlap, num_chunks)
    columns = ['Chunk #', 'Text Chunk', 'Character Count', 'Token Count', 'Embeddings']

    if query and query.strip():
        # Update the headers to reflect the new column order after similarity search
        columns.insert(1, 'Similarity')
        # With no chunks there is nothing to rank; skipping the search avoids
        # stacking an empty list of embeddings.
        if not df_with_embeddings.empty:
            df_with_embeddings = search_similar_chunks(query, df_with_embeddings)

    # reindex() selects the columns like [[...]] does, but creates any column
    # that is missing (e.g. on an empty result) instead of raising KeyError.
    return df_with_embeddings.reindex(columns=columns)
# Build the Gradio UI. The six inputs map positionally onto the parameters of
# update_output(): method, text, chunk_size, chunk_overlap, num_chunks, query.
iface = gr.Interface(
    fn=update_output,
    inputs=[
        # Pre-select the only available method; without a default value the
        # callback receives None until the user picks an entry.
        gr.Dropdown(
            label="Select Tokenization Method",
            choices=["RecursiveCharacterTextSplitter"],
            value="RecursiveCharacterTextSplitter",
        ),
        gr.Textbox(label="Enter Text", lines=10, placeholder="Type or paste text here."),
        # precision=0 makes the numeric inputs deliver integers, which is
        # what character counts and chunk counts are.
        gr.Number(label="Chunk Size", value=DEFAULT_CHUNK_SIZE, precision=0),
        gr.Number(label="Chunk Overlap", value=DEFAULT_CHUNK_OVERLAP, precision=0),
        gr.Number(label="Number of Chunks to Display", value=DEFAULT_NUM_CHUNKS, precision=0),
        gr.Textbox(label="Enter Query for Similarity Search", lines=2, placeholder="Type your query here."),
    ],
    outputs=gr.Dataframe(height=900),
    title="Text Tokenization and Embedding Tool",
    description="A tool for tokenizing text and calculating embeddings. Now with similarity search feature.",
)

# Start the web server only when the file is run as a script, not on import.
if __name__ == "__main__":
    iface.launch()