dejanseo committed on
Commit
eca2e65
1 Parent(s): 3b59afb

Create demo.py

Browse files
Files changed (1) hide show
  1. demo.py +140 -0
demo.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import tensorflow as tf
3
+ import sentencepiece as spm
4
+ import numpy as np
5
+ from scipy.spatial.distance import cosine
6
+ import pandas as pd
7
+ from openTSNE import TSNE
8
+ import plotly.express as px
9
+
10
# Set Streamlit layout to wide mode
st.set_page_config(layout="wide")

# Relative paths of the TFLite model and the SentencePiece tokenizer model
tflite_model_path = "model.tflite"
spm_model_path = "sentencepiece.model"

# NOTE(review): Streamlit re-executes the script on each user interaction, so
# the tokenizer and interpreter below are presumably reloaded every time;
# consider caching them (e.g. st.cache_resource) if the installed Streamlit
# version provides it -- confirm before changing.
sp = spm.SentencePieceProcessor()
sp.load(spm_model_path)

interpreter = tf.lite.Interpreter(model_path=tflite_model_path)
interpreter.allocate_tensors()

# Tensor metadata used later to feed the input and read the output by index
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()
required_input_length = 64  # Fixed length of 64 tokens
26
+
27
+ # Function to preprocess text input
28
def preprocess_text(text, sp, required_length, pad_id=0):
    """Tokenize *text* into a fixed-length batch of token ids.

    Args:
        text: The sentence to encode.
        sp: Tokenizer object exposing ``encode(text, out_type=int)`` that
            returns a sequence of integer token ids.
        required_length: Number of token ids in the result. Longer encodings
            are truncated; shorter ones are right-padded.
        pad_id: Token id used for padding. Defaults to 0, the value the
            original code hard-coded.

    Returns:
        A ``numpy.int32`` array of shape ``(1, required_length)``.
    """
    # Coerce to a list so slicing and concatenation work for any sequence type
    token_ids = list(sp.encode(text, out_type=int))[:required_length]
    # A non-positive multiplier yields an empty list, so no padding is added
    # when the encoding already fills the required length.
    padding = [pad_id] * (required_length - len(token_ids))
    return np.array(token_ids + padding, dtype=np.int32).reshape(1, -1)
32
+
33
+ # Function to generate embeddings
34
def generate_embeddings(text):
    """Run the TFLite model on *text* and return its output as a 1-D array.

    Relies on the module-level ``sp``, ``interpreter``, ``input_details``,
    ``output_details`` and ``required_input_length`` objects.

    Args:
        text: The sentence to embed.

    Returns:
        The first output tensor of the interpreter, flattened to one dimension.
    """
    input_data = preprocess_text(text, sp, required_input_length)
    # Feed the first input tensor, run inference, then read the first output
    interpreter.set_tensor(input_details[0]['index'], input_data)
    interpreter.invoke()
    embedding = interpreter.get_tensor(output_details[0]['index'])
    return embedding.flatten()
40
+
41
# Predefined sentence sets shown in the text areas on first load
preset_sentences_a = [
    "Dan Petrovic predicted conversational search in 2013.",
    "Understanding user intent is key to effective SEO.",
    "Dejan SEO has been a leader in data-driven SEO.",
    "Machine learning is transforming search engines.",
    "The future of search is AI-driven and personalized.",
    "Search algorithms are evolving to better match user intent.",
    "AI technologies enhance digital marketing strategies.",
]

preset_sentences_b = [
    "Advances in machine learning reshape how search engines operate.",
    "Personalized content is becoming more prevalent with AI.",
    "Customer behavior insights are crucial for marketing strategies.",
    "Dan Petrovic anticipated the rise of chat-based search interactions.",
    "Dejan SEO is recognized for innovative SEO research and analysis.",
    "Quantum computing is advancing rapidly in the tech world.",
    "Studying user behavior can improve the effectiveness of online ads.",
]

# Initialize session state for input fields if not already set
# (one sentence per line, matching how the inputs are parsed later)
if "input_text_a" not in st.session_state:
    st.session_state["input_text_a"] = "\n".join(preset_sentences_a)
if "input_text_b" not in st.session_state:
    st.session_state["input_text_b"] = "\n".join(preset_sentences_b)

# Clear button to reset text areas: blank both stored values so the
# text areas created afterwards start empty
if st.button("Clear Fields"):
    st.session_state["input_text_a"] = ""
    st.session_state["input_text_b"] = ""
72
+
73
def _parse_sentences(raw_text):
    """Return the non-blank lines of *raw_text*, each stripped of whitespace."""
    return [line.strip() for line in raw_text.split("\n") if line.strip()]


def _show_embeddings(set_name, sentences, embeddings):
    """Render a subheader plus one expander per sentence with its embedding."""
    st.subheader(f"Embeddings for each sentence in Set {set_name}")
    for idx, (sentence, embedding) in enumerate(zip(sentences, embeddings), start=1):
        with st.expander(f"Embedding for Sentence {set_name}{idx}: {sentence}"):
            # Comma-separated values, four decimal places each
            st.write(", ".join(f"{x:.4f}" for x in embedding))


# Side-by-side layout for Set A and Set B inputs
col1, col2 = st.columns(2)

with col1:
    st.subheader("Set A Sentences")
    input_text_a = st.text_area("Set A", value=st.session_state["input_text_a"], height=200)

with col2:
    st.subheader("Set B Sentences")
    input_text_b = st.text_area("Set B", value=st.session_state["input_text_b"], height=200)

# Slider to control t-SNE iteration steps (250 to 1000 in steps of 250)
iterations = st.slider("Number of t-SNE Iterations (Higher values = more refined clusters)", 250, 1000, step=250)

# Submit button
if st.button("Calculate Similarity"):
    sentences_a = _parse_sentences(input_text_a)
    sentences_b = _parse_sentences(input_text_b)

    if sentences_a and sentences_b:
        # Generate embeddings for both sets
        embeddings_a = [generate_embeddings(sentence) for sentence in sentences_a]
        embeddings_b = [generate_embeddings(sentence) for sentence in sentences_b]

        # Combine sentences and embeddings for both sets
        all_sentences = sentences_a + sentences_b
        all_embeddings = np.array(embeddings_a + embeddings_b)  # Convert to NumPy array
        labels = ["Set A"] * len(sentences_a) + ["Set B"] * len(sentences_b)

        # Cap perplexity at 5 and keep it below the number of samples.
        # NOTE(review): with very few sentences (e.g. one per set) the 3-D
        # PCA-initialised t-SNE below may reject the input -- confirm against
        # the openTSNE documentation and add a minimum-size check if needed.
        perplexity_value = min(5, len(all_sentences) - 1)

        # Perform 3D t-SNE with OpenTSNE, limiting the number of iterations
        tsne = TSNE(
            n_components=3,
            perplexity=perplexity_value,
            n_iter=iterations,
            initialization="pca",
            random_state=42,
        )
        tsne_results = tsne.fit(all_embeddings)

        # Prepare DataFrame for Plotly: one row per sentence
        df_tsne = pd.DataFrame({
            "Sentence": all_sentences,
            "Set": labels,
            "X": tsne_results[:, 0],
            "Y": tsne_results[:, 1],
            "Z": tsne_results[:, 2],
        })

        # Plot 3D t-SNE results with Plotly, coloured by originating set
        fig = px.scatter_3d(
            df_tsne,
            x="X",
            y="Y",
            z="Z",
            color="Set",
            hover_data={"Sentence": True},
            title="Incremental 3D t-SNE Visualization of Sentence Similarity",
            labels={"X": "t-SNE Dimension 1", "Y": "t-SNE Dimension 2", "Z": "t-SNE Dimension 3"},
            width=1200,
            height=800,
        )
        fig.update_traces(marker=dict(size=5, opacity=0.8))

        # Display interactive Plotly plot
        st.plotly_chart(fig)

        # Display expandable embeddings for each set
        _show_embeddings("A", sentences_a, embeddings_a)
        _show_embeddings("B", sentences_b, embeddings_b)

    else:
        st.warning("Please enter sentences in both Set A and Set B.")