File size: 4,080 Bytes
933c7ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import gradio as gr
import pandas as pd
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Constants for default values
DEFAULT_CHUNK_SIZE = 100
DEFAULT_CHUNK_OVERLAP = 0
DEFAULT_NUM_CHUNKS = 10

# Initialize the sentence transformer model for embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

def tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks):
    """
    Tokenizes the input text based on the selected method and provided parameters.
    """
    num_chunks = int(num_chunks)
    output = []

    # Ensure text is provided
    if not text.strip():
        return pd.DataFrame(columns=['Chunk #', 'Text Chunk', 'Character Count', 'Token Count'])

    if method == "RecursiveCharacterTextSplitter":
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len, is_separator_regex=False)
        tokenized_texts = text_splitter.split_text(text)[:num_chunks]
        for i, chunk in enumerate(tokenized_texts):
            output.append({
                'Chunk #': i,
                'Text Chunk': chunk,
                'Character Count': len(chunk),
                'Token Count': len(chunk.split())
            })

    df = pd.DataFrame(output)
    return df

def calculate_embeddings(df):
    """
    Calculates embeddings for each text chunk in the dataframe.
    """
    if df.empty:
        return df

    chunks = df['Text Chunk'].tolist()
    embeddings = model.encode(chunks)
    df['Embeddings'] = embeddings.tolist()
    return df

def search_similar_chunks(query, df_with_embeddings):
    """
    Search for chunks similar to the query embedding.
    """
    # Compute the query embedding
    query_embedding = model.encode([query])[0]
    
    # Calculate similarity scores
    chunk_embeddings = np.vstack(df_with_embeddings['Embeddings'])
    similarity_scores = cosine_similarity([query_embedding], chunk_embeddings)[0]
    
    # Insert similarity scores into the dataframe after 'Chunk #'
    df_with_embeddings.insert(1, 'Similarity', similarity_scores)
    
    # Return the dataframe sorted by similarity scores in descending order
    return df_with_embeddings.sort_values(by='Similarity', ascending=False)

def process_and_embed(method, text, chunk_size, chunk_overlap, num_chunks):
    """
    Tokenizes the text and calculates embeddings.
    """
    df = tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks)
    df_with_embeddings = calculate_embeddings(df)
    return df_with_embeddings

def update_output(method, text, chunk_size, chunk_overlap, num_chunks, query):
    df_with_embeddings = process_and_embed(method, text, chunk_size, chunk_overlap, num_chunks)
    if query:
        df_with_embeddings = search_similar_chunks(query, df_with_embeddings)
        # Update the headers to reflect the new column order after similarity search
        return df_with_embeddings[['Chunk #', 'Similarity', 'Text Chunk', 'Character Count', 'Token Count', 'Embeddings']]
    return df_with_embeddings[['Chunk #', 'Text Chunk', 'Character Count', 'Token Count', 'Embeddings']]

iface = gr.Interface(
    fn=update_output,
    inputs=[
        gr.Dropdown(label="Select Tokenization Method", choices=["RecursiveCharacterTextSplitter"]),
        gr.Textbox(label="Enter Text", lines=10, placeholder="Type or paste text here."),
        gr.Number(label="Chunk Size", value=DEFAULT_CHUNK_SIZE),
        gr.Number(label="Chunk Overlap", value=DEFAULT_CHUNK_OVERLAP),
        gr.Number(label="Number of Chunks to Display", value=DEFAULT_NUM_CHUNKS),
        gr.Textbox(label="Enter Query for Similarity Search", lines=2, placeholder="Type your query here.")
    ],
    outputs=gr.Dataframe(height=900),
    title="Text Tokenization and Embedding Tool",
    description="A tool for tokenizing text and calculating embeddings. Now with similarity search feature."
)

if __name__ == "__main__":
    iface.launch()