import gradio as gr
import pandas as pd
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Constants for default values
DEFAULT_CHUNK_SIZE = 100
DEFAULT_CHUNK_OVERLAP = 0
DEFAULT_NUM_CHUNKS = 10

# Initialize the sentence transformer model for embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')


def tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks):
    """
    Tokenizes the input text based on the selected method and provided parameters.
    """
    num_chunks = int(num_chunks)
    output = []

    # Ensure text is provided
    if not text.strip():
        return pd.DataFrame(columns=['Chunk #', 'Text Chunk', 'Character Count', 'Token Count'])

    if method == "RecursiveCharacterTextSplitter":
        # Gradio Number inputs may arrive as floats, so cast to int for the splitter
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=int(chunk_size),
            chunk_overlap=int(chunk_overlap),
            length_function=len,
            is_separator_regex=False,
        )
        tokenized_texts = text_splitter.split_text(text)[:num_chunks]
        for i, chunk in enumerate(tokenized_texts):
            output.append({
                'Chunk #': i,
                'Text Chunk': chunk,
                'Character Count': len(chunk),
                'Token Count': len(chunk.split())
            })

    return pd.DataFrame(output)


def calculate_embeddings(df):
    """
    Calculates embeddings for each text chunk in the dataframe.
    """
    if df.empty:
        return df
    chunks = df['Text Chunk'].tolist()
    embeddings = model.encode(chunks)
    df['Embeddings'] = embeddings.tolist()
    return df


def search_similar_chunks(query, df_with_embeddings):
    """
    Ranks chunks by cosine similarity between the query embedding and each chunk embedding.
    """
    # Compute the query embedding
    query_embedding = model.encode([query])[0]

    # Calculate similarity scores against all chunk embeddings at once
    chunk_embeddings = np.vstack(df_with_embeddings['Embeddings'])
    similarity_scores = cosine_similarity([query_embedding], chunk_embeddings)[0]

    # Insert similarity scores into the dataframe after 'Chunk #'
    df_with_embeddings.insert(1, 'Similarity', similarity_scores)

    # Return the dataframe sorted by similarity scores in descending order
    return df_with_embeddings.sort_values(by='Similarity', ascending=False)


def process_and_embed(method, text, chunk_size, chunk_overlap, num_chunks):
    """
    Tokenizes the text and calculates embeddings.
    """
    df = tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks)
    return calculate_embeddings(df)


def update_output(method, text, chunk_size, chunk_overlap, num_chunks, query):
    df_with_embeddings = process_and_embed(method, text, chunk_size, chunk_overlap, num_chunks)
    # An empty dataframe has no 'Embeddings' column, so return it before selecting columns
    if df_with_embeddings.empty:
        return df_with_embeddings
    if query:
        df_with_embeddings = search_similar_chunks(query, df_with_embeddings)
        # Column order reflects the 'Similarity' column inserted by the search
        return df_with_embeddings[['Chunk #', 'Similarity', 'Text Chunk', 'Character Count', 'Token Count', 'Embeddings']]
    return df_with_embeddings[['Chunk #', 'Text Chunk', 'Character Count', 'Token Count', 'Embeddings']]


iface = gr.Interface(
    fn=update_output,
    inputs=[
        gr.Dropdown(label="Select Tokenization Method",
                    choices=["RecursiveCharacterTextSplitter"],
                    value="RecursiveCharacterTextSplitter"),
        gr.Textbox(label="Enter Text", lines=10, placeholder="Type or paste text here."),
        gr.Number(label="Chunk Size", value=DEFAULT_CHUNK_SIZE),
        gr.Number(label="Chunk Overlap", value=DEFAULT_CHUNK_OVERLAP),
        gr.Number(label="Number of Chunks to Display", value=DEFAULT_NUM_CHUNKS),
        gr.Textbox(label="Enter Query for Similarity Search", lines=2, placeholder="Type your query here.")
    ],
    outputs=gr.Dataframe(height=900),
    title="Text Tokenization and Embedding Tool",
    description="A tool for tokenizing text and calculating embeddings, with a similarity search feature."
)

if __name__ == "__main__":
    iface.launch()