Create demo.py
demo.py
ADDED
@@ -0,0 +1,140 @@
import streamlit as st
import tensorflow as tf
import sentencepiece as spm
import numpy as np
from scipy.spatial.distance import cosine
import pandas as pd
from openTSNE import TSNE
import plotly.express as px

# Set Streamlit layout to wide mode
st.set_page_config(layout="wide")

# Load the TFLite model and SentencePiece model
tflite_model_path = "model.tflite"
spm_model_path = "sentencepiece.model"

sp = spm.SentencePieceProcessor()
sp.load(spm_model_path)

interpreter = tf.lite.Interpreter(model_path=tflite_model_path)
interpreter.allocate_tensors()

input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()
required_input_length = 64  # Fixed length of 64 tokens

# Function to preprocess text input
def preprocess_text(text, sp, required_length):
    input_ids = sp.encode(text, out_type=int)
    input_ids = input_ids[:required_length] + [0] * (required_length - len(input_ids))
    return np.array(input_ids, dtype=np.int32).reshape(1, -1)

# Function to generate embeddings
def generate_embeddings(text):
    input_data = preprocess_text(text, sp, required_input_length)
    interpreter.set_tensor(input_details[0]['index'], input_data)
    interpreter.invoke()
    embedding = interpreter.get_tensor(output_details[0]['index'])
    return embedding.flatten()

# Predefined sentence sets
preset_sentences_a = [
    "Dan Petrovic predicted conversational search in 2013.",
    "Understanding user intent is key to effective SEO.",
    "Dejan SEO has been a leader in data-driven SEO.",
    "Machine learning is transforming search engines.",
    "The future of search is AI-driven and personalized.",
    "Search algorithms are evolving to better match user intent.",
    "AI technologies enhance digital marketing strategies."
]

preset_sentences_b = [
    "Advances in machine learning reshape how search engines operate.",
    "Personalized content is becoming more prevalent with AI.",
    "Customer behavior insights are crucial for marketing strategies.",
    "Dan Petrovic anticipated the rise of chat-based search interactions.",
    "Dejan SEO is recognized for innovative SEO research and analysis.",
    "Quantum computing is advancing rapidly in the tech world.",
    "Studying user behavior can improve the effectiveness of online ads."
]

# Initialize session state for input fields if not already set
if "input_text_a" not in st.session_state:
    st.session_state["input_text_a"] = "\n".join(preset_sentences_a)
if "input_text_b" not in st.session_state:
    st.session_state["input_text_b"] = "\n".join(preset_sentences_b)

# Clear button to reset text areas
if st.button("Clear Fields"):
    st.session_state["input_text_a"] = ""
    st.session_state["input_text_b"] = ""

# Side-by-side layout for Set A and Set B inputs
col1, col2 = st.columns(2)

with col1:
    st.subheader("Set A Sentences")
    input_text_a = st.text_area("Set A", value=st.session_state["input_text_a"], height=200)

with col2:
    st.subheader("Set B Sentences")
    input_text_b = st.text_area("Set B", value=st.session_state["input_text_b"], height=200)

# Slider to control t-SNE iteration steps
iterations = st.slider("Number of t-SNE Iterations (Higher values = more refined clusters)", 250, 1000, step=250)

# Submit button
if st.button("Calculate Similarity"):
    sentences_a = [line.strip() for line in input_text_a.split("\n") if line.strip()]
    sentences_b = [line.strip() for line in input_text_b.split("\n") if line.strip()]

    if len(sentences_a) > 0 and len(sentences_b) > 0:
        # Generate embeddings for both sets
        embeddings_a = [generate_embeddings(sentence) for sentence in sentences_a]
        embeddings_b = [generate_embeddings(sentence) for sentence in sentences_b]

        # Combine sentences and embeddings for both sets
        all_sentences = sentences_a + sentences_b
        all_embeddings = np.array(embeddings_a + embeddings_b)  # Convert to NumPy array
        labels = ["Set A"] * len(sentences_a) + ["Set B"] * len(sentences_b)

        # Set perplexity dynamically based on number of samples
        perplexity_value = min(5, len(all_sentences) - 1)

        # Perform 3D t-SNE with OpenTSNE, limiting the number of iterations
        tsne = TSNE(n_components=3, perplexity=perplexity_value, n_iter=iterations, initialization="pca", random_state=42)
        tsne_results = tsne.fit(all_embeddings)

        # Prepare DataFrame for Plotly
        df_tsne = pd.DataFrame({
            "Sentence": all_sentences,
            "Set": labels,
            "X": tsne_results[:, 0],
            "Y": tsne_results[:, 1],
            "Z": tsne_results[:, 2]
        })

        # Plot 3D t-SNE results with Plotly
        fig = px.scatter_3d(df_tsne, x="X", y="Y", z="Z", color="Set", hover_data={"Sentence": True},
                            title="Incremental 3D t-SNE Visualization of Sentence Similarity",
                            labels={"X": "t-SNE Dimension 1", "Y": "t-SNE Dimension 2", "Z": "t-SNE Dimension 3"},
                            width=1200, height=800)  # Increased chart width and height
        fig.update_traces(marker=dict(size=5, opacity=0.8))

        # Display interactive Plotly plot
        st.plotly_chart(fig)

        # Display expandable embeddings
        st.subheader("Embeddings for each sentence in Set A")
        for i, (sentence, embedding) in enumerate(zip(sentences_a, embeddings_a)):
            with st.expander(f"Embedding for Sentence A{i+1}: {sentence}"):
                st.write(", ".join([f"{x:.4f}" for x in embedding]))  # Comma-separated values

        st.subheader("Embeddings for each sentence in Set B")
        for i, (sentence, embedding) in enumerate(zip(sentences_b, embeddings_b)):
            with st.expander(f"Embedding for Sentence B{i+1}: {sentence}"):
                st.write(", ".join([f"{x:.4f}" for x in embedding]))  # Comma-separated values

    else:
        st.warning("Please enter sentences in both Set A and Set B.")
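
Note: the script expects model.tflite and sentencepiece.model in its working directory and, with the imported dependencies installed, is launched with "streamlit run demo.py".

demo.py imports cosine from scipy.spatial.distance but never calls it; a direct pairwise similarity readout between the two sentence sets would be the obvious use. Below is a minimal, hypothetical sketch of that step, kept outside demo.py and using random placeholder vectors because the model's embedding dimension is not stated here (512 is only a stand-in).

# Hypothetical sketch (not part of demo.py): pairwise cosine similarity between
# two embedding sets. Swap the random placeholders for the vectors returned by
# generate_embeddings(); 512 is just a stand-in for the model's output dimension.
import numpy as np
from scipy.spatial.distance import cosine

rng = np.random.default_rng(42)
embeddings_a = [rng.normal(size=512) for _ in range(3)]  # placeholder Set A vectors
embeddings_b = [rng.normal(size=512) for _ in range(3)]  # placeholder Set B vectors

# similarity[i][j] = cosine similarity of Set A sentence i vs. Set B sentence j
similarity = np.array([[1.0 - cosine(a, b) for b in embeddings_b] for a in embeddings_a])
print(np.round(similarity, 4))

Unlike the 3D t-SNE projection, which only approximates neighbourhood structure, such a matrix reports the raw similarity scores the embeddings actually encode.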