import gradio as gr
import pandas as pd
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
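# Note: this import targets classic LangChain; newer releases also ship the
# splitter in the standalone langchain-text-splitters package
# (from langchain_text_splitters import RecursiveCharacterTextSplitter).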

# Constants for default values
DEFAULT_CHUNK_SIZE = 100
DEFAULT_CHUNK_OVERLAP = 0
DEFAULT_NUM_CHUNKS = 10

# Initialize the sentence transformer model for embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')
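# all-MiniLM-L6-v2 maps each input string to a 384-dimensional dense vector
# and is small enough to run comfortably on CPU.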


def tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks):
    """
    Splits the input text into chunks using the selected method and parameters,
    returning one row per chunk.
    """
    # Gradio Number inputs arrive as floats; the splitter expects ints.
    chunk_size = int(chunk_size)
    chunk_overlap = int(chunk_overlap)
    num_chunks = int(num_chunks)
    output = []

    # Return an empty frame with the full schema so downstream column
    # selection does not fail when no text is provided.
    if not text.strip():
        return pd.DataFrame(columns=['Chunk #', 'Text Chunk', 'Character Count', 'Token Count', 'Embeddings'])

    if method == "RecursiveCharacterTextSplitter":
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=False,
        )
        tokenized_texts = text_splitter.split_text(text)[:num_chunks]
        for i, chunk in enumerate(tokenized_texts):
            output.append({
                'Chunk #': i,
                'Text Chunk': chunk,
                'Character Count': len(chunk),
                # Whitespace word count; an approximation, not model tokens.
                'Token Count': len(chunk.split())
            })

    # Preserve the schema even when no chunks were produced (e.g. an
    # unrecognized method), so later column selection still works.
    df = pd.DataFrame(output, columns=['Chunk #', 'Text Chunk', 'Character Count', 'Token Count'])
    return df
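# Example: with the splitter's default separators ["\n\n", "\n", " ", ""],
# chunk_size=100, and chunk_overlap=0, a 250-character paragraph typically
# splits into three chunks, breaking at paragraph or word boundaries before
# falling back to single characters.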


def calculate_embeddings(df):
    """
    Calculates embeddings for each text chunk in the dataframe.
    """
    if df.empty:
        # Keep the schema consistent for downstream column selection.
        if 'Embeddings' not in df.columns:
            df['Embeddings'] = []
        return df
    chunks = df['Text Chunk'].tolist()
    # model.encode returns an (n_chunks, 384) numpy array.
    embeddings = model.encode(chunks)
    df['Embeddings'] = embeddings.tolist()
    return df
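# Embeddings are stored as plain Python lists so each dataframe cell holds
# one vector and renders cleanly in the Gradio Dataframe output.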


def search_similar_chunks(query, df_with_embeddings):
    """
    Ranks chunks by cosine similarity to the query embedding.
    """
    if df_with_embeddings.empty:
        return df_with_embeddings
    # Compute the query embedding
    query_embedding = model.encode([query])[0]
    # Calculate similarity scores between the query and every chunk
    chunk_embeddings = np.vstack(df_with_embeddings['Embeddings'])
    similarity_scores = cosine_similarity([query_embedding], chunk_embeddings)[0]
    # Insert similarity scores into the dataframe after 'Chunk #'
    df_with_embeddings.insert(1, 'Similarity', similarity_scores)
    # Return the dataframe sorted by similarity scores in descending order
    return df_with_embeddings.sort_values(by='Similarity', ascending=False)
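# Cosine similarity is cos(a, b) = (a . b) / (|a| * |b|); scores fall in
# [-1, 1], with 1 meaning the chunk points in the same direction as the query.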


def process_and_embed(method, text, chunk_size, chunk_overlap, num_chunks):
    """
    Tokenizes the text and calculates embeddings.
    """
    df = tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks)
    df_with_embeddings = calculate_embeddings(df)
    return df_with_embeddings


def update_output(method, text, chunk_size, chunk_overlap, num_chunks, query):
    df_with_embeddings = process_and_embed(method, text, chunk_size, chunk_overlap, num_chunks)
    if df_with_embeddings.empty:
        return df_with_embeddings
    if query and query.strip():
        df_with_embeddings = search_similar_chunks(query, df_with_embeddings)
        # Column order reflects the inserted 'Similarity' column
        return df_with_embeddings[['Chunk #', 'Similarity', 'Text Chunk', 'Character Count', 'Token Count', 'Embeddings']]
    return df_with_embeddings[['Chunk #', 'Text Chunk', 'Character Count', 'Token Count', 'Embeddings']]


iface = gr.Interface(
    fn=update_output,
    inputs=[
        # Default the dropdown to the only available method so the app
        # works without an explicit selection.
        gr.Dropdown(
            label="Select Tokenization Method",
            choices=["RecursiveCharacterTextSplitter"],
            value="RecursiveCharacterTextSplitter",
        ),
        gr.Textbox(label="Enter Text", lines=10, placeholder="Type or paste text here."),
        # precision=0 makes these fields return integers.
        gr.Number(label="Chunk Size", value=DEFAULT_CHUNK_SIZE, precision=0),
        gr.Number(label="Chunk Overlap", value=DEFAULT_CHUNK_OVERLAP, precision=0),
        gr.Number(label="Number of Chunks to Display", value=DEFAULT_NUM_CHUNKS, precision=0),
        gr.Textbox(label="Enter Query for Similarity Search", lines=2, placeholder="Type your query here.")
    ],
    outputs=gr.Dataframe(height=900),
    title="Text Tokenization and Embedding Tool",
    description="Split text into chunks, compute a sentence embedding for each chunk, and optionally rank chunks by cosine similarity to a query."
)

if __name__ == "__main__":
    iface.launch()
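# To expose the app on a temporary public URL, launch with share=True:
#     iface.launch(share=True)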