Commit 933c7ad
botchagalupe committed
Parent(s): d89fe7b

First Commit
Files changed:
- README.md +4 -4
- app.py +102 -0
- requirements.txt +6 -0
README.md
CHANGED
@@ -1,8 +1,8 @@
 ---
-title: GAI Workshop
-emoji:
-colorFrom:
-colorTo:
+title: GAI Workshop
+emoji: 💻
+colorFrom: gray
+colorTo: indigo
 sdk: gradio
 sdk_version: 4.16.0
 app_file: app.py
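These YAML fields are the Hugging Face Spaces configuration block at the top of README.md: title, emoji, colorFrom, and colorTo control how the Space card is displayed, while sdk, sdk_version, and app_file tell Spaces to run app.py under Gradio 4.16.0.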
app.py
ADDED
@@ -0,0 +1,102 @@
import gradio as gr
import pandas as pd
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Constants for default values
DEFAULT_CHUNK_SIZE = 100
DEFAULT_CHUNK_OVERLAP = 0
DEFAULT_NUM_CHUNKS = 10

# Initialize the sentence transformer model for embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

def tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks):
    """
    Tokenizes the input text based on the selected method and provided parameters.
    """
    num_chunks = int(num_chunks)
    output = []

    # Ensure text is provided
    if not text.strip():
        return pd.DataFrame(columns=['Chunk #', 'Text Chunk', 'Character Count', 'Token Count'])

    if method == "RecursiveCharacterTextSplitter":
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len, is_separator_regex=False)
        tokenized_texts = text_splitter.split_text(text)[:num_chunks]
        for i, chunk in enumerate(tokenized_texts):
            output.append({
                'Chunk #': i,
                'Text Chunk': chunk,
                'Character Count': len(chunk),
                'Token Count': len(chunk.split())
            })

    df = pd.DataFrame(output)
    return df

def calculate_embeddings(df):
    """
    Calculates embeddings for each text chunk in the dataframe.
    """
    if df.empty:
        return df

    chunks = df['Text Chunk'].tolist()
    embeddings = model.encode(chunks)
    df['Embeddings'] = embeddings.tolist()
    return df

def search_similar_chunks(query, df_with_embeddings):
    """
    Search for chunks similar to the query embedding.
    """
    # Compute the query embedding
    query_embedding = model.encode([query])[0]

    # Calculate similarity scores
    chunk_embeddings = np.vstack(df_with_embeddings['Embeddings'])
    similarity_scores = cosine_similarity([query_embedding], chunk_embeddings)[0]

    # Insert similarity scores into the dataframe after 'Chunk #'
    df_with_embeddings.insert(1, 'Similarity', similarity_scores)

    # Return the dataframe sorted by similarity scores in descending order
    return df_with_embeddings.sort_values(by='Similarity', ascending=False)

def process_and_embed(method, text, chunk_size, chunk_overlap, num_chunks):
    """
    Tokenizes the text and calculates embeddings.
    """
    df = tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks)
    df_with_embeddings = calculate_embeddings(df)
    return df_with_embeddings

def update_output(method, text, chunk_size, chunk_overlap, num_chunks, query):
    df_with_embeddings = process_and_embed(method, text, chunk_size, chunk_overlap, num_chunks)
    if query:
        df_with_embeddings = search_similar_chunks(query, df_with_embeddings)
        # Update the headers to reflect the new column order after similarity search
        return df_with_embeddings[['Chunk #', 'Similarity', 'Text Chunk', 'Character Count', 'Token Count', 'Embeddings']]
    return df_with_embeddings[['Chunk #', 'Text Chunk', 'Character Count', 'Token Count', 'Embeddings']]

iface = gr.Interface(
    fn=update_output,
    inputs=[
        gr.Dropdown(label="Select Tokenization Method", choices=["RecursiveCharacterTextSplitter"]),
        gr.Textbox(label="Enter Text", lines=10, placeholder="Type or paste text here."),
        gr.Number(label="Chunk Size", value=DEFAULT_CHUNK_SIZE),
        gr.Number(label="Chunk Overlap", value=DEFAULT_CHUNK_OVERLAP),
        gr.Number(label="Number of Chunks to Display", value=DEFAULT_NUM_CHUNKS),
        gr.Textbox(label="Enter Query for Similarity Search", lines=2, placeholder="Type your query here.")
    ],
    outputs=gr.Dataframe(height=900),
    title="Text Tokenization and Embedding Tool",
    description="A tool for tokenizing text and calculating embeddings. Now with similarity search feature."
)

if __name__ == "__main__":
    iface.launch()
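The same chunk-embed-search pipeline can be exercised without launching the Gradio UI. A minimal sketch, assuming app.py sits in the working directory and the all-MiniLM-L6-v2 weights can be downloaded; the sample text, query, and parameter values below are illustrative, not part of the commit:

    # Headless sketch: drive update_output() directly instead of via gr.Interface.
    # Importing app loads the SentenceTransformer model but does not launch the UI,
    # because iface.launch() is guarded by __main__.
    from app import update_output

    sample_text = (
        "Retrieval-augmented generation splits documents into chunks, "
        "embeds each chunk, and retrieves the chunks most similar to a query."
    )

    df = update_output(
        method="RecursiveCharacterTextSplitter",
        text=sample_text,
        chunk_size=50,      # measured in characters, since length_function=len
        chunk_overlap=10,   # characters shared between consecutive chunks
        num_chunks=10,
        query="How does retrieval work?",
    )

    # With a query supplied, rows are sorted by cosine similarity, best first.
    print(df[['Chunk #', 'Similarity', 'Text Chunk']].head())

Because chunk_overlap carries the tail of one chunk into the head of the next, neighboring rows will share text; setting it back to 0 (the app's default) produces disjoint chunks.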
requirements.txt
ADDED
@@ -0,0 +1,6 @@
gradio
pandas
sentence-transformers
scikit-learn
numpy
langchain
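To reproduce the Space locally, the usual install-and-run steps apply; a minimal sketch, assuming a recent Python with pip available:

    pip install -r requirements.txt
    python app.py

Gradio then serves the interface on a local URL (http://127.0.0.1:7860 by default).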