botchagalupe committed
Commit 933c7ad
1 Parent(s): d89fe7b

First Commit
Files changed (3)
  1. README.md +4 -4
  2. app.py +102 -0
  3. requirements.txt +6 -0
README.md CHANGED
@@ -1,8 +1,8 @@
 ---
-title: GAI Workshop-201
-emoji: 🐢
-colorFrom: blue
-colorTo: blue
+title: GAI Workshop
+emoji: 💻
+colorFrom: gray
+colorTo: indigo
 sdk: gradio
 sdk_version: 4.16.0
 app_file: app.py
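For reference, this front matter drives the Hugging Face Space card and runtime: title and emoji label the Space, colorFrom and colorTo set the thumbnail gradient, and sdk, sdk_version, and app_file tell Spaces how to launch the app.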
app.py ADDED
@@ -0,0 +1,102 @@
+import gradio as gr
+import pandas as pd
+import numpy as np
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
+
+# Constants for default values
+DEFAULT_CHUNK_SIZE = 100
+DEFAULT_CHUNK_OVERLAP = 0
+DEFAULT_NUM_CHUNKS = 10
+
+# Initialize the sentence transformer model used for embeddings
+model = SentenceTransformer('all-MiniLM-L6-v2')
+
+def tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks):
+    """
+    Splits the input text into chunks using the selected method and parameters.
+    """
+    # Gradio Number inputs arrive as floats; the splitter expects integers
+    chunk_size = int(chunk_size)
+    chunk_overlap = int(chunk_overlap)
+    num_chunks = int(num_chunks)
+    output = []
+
+    # Return an empty frame with the output schema if no text was provided
+    if not text.strip():
+        return pd.DataFrame(columns=['Chunk #', 'Text Chunk', 'Character Count', 'Token Count'])
+
+    if method == "RecursiveCharacterTextSplitter":
+        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len, is_separator_regex=False)
+        tokenized_texts = text_splitter.split_text(text)[:num_chunks]
+        for i, chunk in enumerate(tokenized_texts):
+            output.append({
+                'Chunk #': i,
+                'Text Chunk': chunk,
+                'Character Count': len(chunk),
+                # Whitespace split is an approximation, not a model tokenizer
+                'Token Count': len(chunk.split())
+            })
+
+    return pd.DataFrame(output)
+
+def calculate_embeddings(df):
+    """
+    Calculates an embedding for each text chunk in the dataframe.
+    """
+    if df.empty:
+        return df
+
+    chunks = df['Text Chunk'].tolist()
+    embeddings = model.encode(chunks)
+    df['Embeddings'] = embeddings.tolist()
+    return df
+
+def search_similar_chunks(query, df_with_embeddings):
+    """
+    Ranks the chunks by cosine similarity to the query embedding.
+    """
+    # Compute the query embedding
+    query_embedding = model.encode([query])[0]
+
+    # Calculate similarity scores against every chunk embedding
+    chunk_embeddings = np.vstack(df_with_embeddings['Embeddings'])
+    similarity_scores = cosine_similarity([query_embedding], chunk_embeddings)[0]
+
+    # Insert similarity scores into the dataframe after 'Chunk #'
+    df_with_embeddings.insert(1, 'Similarity', similarity_scores)
+
+    # Return the dataframe sorted by similarity score, highest first
+    return df_with_embeddings.sort_values(by='Similarity', ascending=False)
+
+def process_and_embed(method, text, chunk_size, chunk_overlap, num_chunks):
+    """
+    Tokenizes the text and calculates embeddings for each chunk.
+    """
+    df = tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks)
+    return calculate_embeddings(df)
+
+def update_output(method, text, chunk_size, chunk_overlap, num_chunks, query):
+    df_with_embeddings = process_and_embed(method, text, chunk_size, chunk_overlap, num_chunks)
+    # An empty frame has no 'Embeddings' column, so return it unmodified
+    if df_with_embeddings.empty:
+        return df_with_embeddings
+    if query:
+        df_with_embeddings = search_similar_chunks(query, df_with_embeddings)
+        # Column order changes once the similarity scores are inserted
+        return df_with_embeddings[['Chunk #', 'Similarity', 'Text Chunk', 'Character Count', 'Token Count', 'Embeddings']]
+    return df_with_embeddings[['Chunk #', 'Text Chunk', 'Character Count', 'Token Count', 'Embeddings']]
+
+iface = gr.Interface(
+    fn=update_output,
+    inputs=[
+        gr.Dropdown(label="Select Tokenization Method", choices=["RecursiveCharacterTextSplitter"], value="RecursiveCharacterTextSplitter"),
+        gr.Textbox(label="Enter Text", lines=10, placeholder="Type or paste text here."),
+        gr.Number(label="Chunk Size", value=DEFAULT_CHUNK_SIZE),
+        gr.Number(label="Chunk Overlap", value=DEFAULT_CHUNK_OVERLAP),
+        gr.Number(label="Number of Chunks to Display", value=DEFAULT_NUM_CHUNKS),
+        gr.Textbox(label="Enter Query for Similarity Search", lines=2, placeholder="Type your query here.")
+    ],
+    outputs=gr.Dataframe(height=900),
+    title="Text Tokenization and Embedding Tool",
+    description="A tool for chunking text, calculating embeddings, and ranking chunks against a query by cosine similarity."
+)
+
+if __name__ == "__main__":
+    iface.launch()
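Because the functions above are plain Python, the chunk/embed/search pipeline can be exercised without launching the Gradio interface. A minimal smoke-test sketch, assuming app.py is importable from the working directory; the sample text, chunk parameters, and query are illustrative:

from app import process_and_embed, search_similar_chunks

sample = ("Gradio builds web UIs from Python functions. "
          "Sentence transformers map text to dense vectors. "
          "Cosine similarity ranks vectors by angle rather than magnitude.")

# Chunk into pieces of up to 60 characters with 10 characters of overlap,
# keep at most 5 chunks, and embed each chunk
df = process_and_embed("RecursiveCharacterTextSplitter", sample, 60, 10, 5)
print(df[['Chunk #', 'Text Chunk', 'Character Count']])

# Rank the chunks against a free-text query, highest similarity first
ranked = search_similar_chunks("how are embeddings compared?", df)
print(ranked[['Similarity', 'Text Chunk']].head(3))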
requirements.txt ADDED
@@ -0,0 +1,6 @@
+gradio
+pandas
+sentence-transformers
+scikit-learn
+numpy
+langchain
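Every package here is unpinned, so a rebuild can silently pick up breaking releases. A minimal hardening sketch: pin gradio to the sdk_version already declared in the README front matter (pins for the remaining packages would need to be captured from a known-good environment, so they are left floating here):

gradio==4.16.0
pandas
sentence-transformers
scikit-learn
numpy
langchain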