dejanseo
/

PassageEmbeddings

TF Lite

Model card Files Files and versions Community

dejanseo committed 24 days ago

Commit

eca2e65

•

1 Parent(s): 3b59afb

Create demo.py

Browse files

Files changed (1) hide show

demo.py +140 -0

demo.py ADDED Viewed

	@@ -0,0 +1,140 @@

+import streamlit as st
+import tensorflow as tf
+import sentencepiece as spm
+import numpy as np
+from scipy.spatial.distance import cosine
+import pandas as pd
+from openTSNE import TSNE
+import plotly.express as px
+# Set Streamlit layout to wide mode
+# NOTE(review): Streamlit documents set_page_config as a call that should come
+# before other st.* commands — keep it ahead of any widget code; confirm for
+# the Streamlit version in use.
+st.set_page_config(layout="wide")
+# Load the TFLite model and SentencePiece model
+# Both paths are relative, so the files are looked up from the working
+# directory the app is started in.
+tflite_model_path = "model.tflite"
+spm_model_path = "sentencepiece.model"
+# Tokenizer used by preprocess_text() to turn sentences into integer ids.
+sp = spm.SentencePieceProcessor()
+sp.load(spm_model_path)
+# Interpreter shared by every generate_embeddings() call; tensors are allocated
+# once here, before the input/output details are queried.
+interpreter = tf.lite.Interpreter(model_path=tflite_model_path)
+interpreter.allocate_tensors()
+# Only element [0] of each details list is used later (first input / first output).
+input_details = interpreter.get_input_details()
+output_details = interpreter.get_output_details()
+# presumably matches the model's input tensor length — TODO confirm against input_details
+required_input_length = 64  # Fixed length of 64 tokens
+# Function to preprocess text input
+def preprocess_text(text, sp, required_length):
+    input_ids = sp.encode(text, out_type=int)
+    input_ids = input_ids[:required_length] + [0] * (required_length - len(input_ids))
+    return np.array(input_ids, dtype=np.int32).reshape(1, -1)
+# Function to generate embeddings
+def generate_embeddings(text):
+    input_data = preprocess_text(text, sp, required_input_length)
+    interpreter.set_tensor(input_details[0]['index'], input_data)
+    interpreter.invoke()
+    embedding = interpreter.get_tensor(output_details[0]['index'])
+    return embedding.flatten()
+# Predefined sentence sets
+preset_sentences_a = [
+    "Dan Petrovic predicted conversational search in 2013.",
+    "Understanding user intent is key to effective SEO.",
+    "Dejan SEO has been a leader in data-driven SEO.",
+    "Machine learning is transforming search engines.",
+    "The future of search is AI-driven and personalized.",
+    "Search algorithms are evolving to better match user intent.",
+    "AI technologies enhance digital marketing strategies."
+]
+preset_sentences_b = [
+    "Advances in machine learning reshape how search engines operate.",
+    "Personalized content is becoming more prevalent with AI.",
+    "Customer behavior insights are crucial for marketing strategies.",
+    "Dan Petrovic anticipated the rise of chat-based search interactions.",
+    "Dejan SEO is recognized for innovative SEO research and analysis.",
+    "Quantum computing is advancing rapidly in the tech world.",
+    "Studying user behavior can improve the effectiveness of online ads."
+]
+# Initialize session state for input fields if not already set
+if "input_text_a" not in st.session_state:
+    st.session_state["input_text_a"] = "\n".join(preset_sentences_a)
+if "input_text_b" not in st.session_state:
+    st.session_state["input_text_b"] = "\n".join(preset_sentences_b)
+# Clear button to reset text areas
+if st.button("Clear Fields"):
+    st.session_state["input_text_a"] = ""
+    st.session_state["input_text_b"] = ""
+# Side-by-side layout for Set A and Set B inputs
+col1, col2 = st.columns(2)
+with col1:
+    st.subheader("Set A Sentences")
+    input_text_a = st.text_area("Set A", value=st.session_state["input_text_a"], height=200)
+with col2:
+    st.subheader("Set B Sentences")
+    input_text_b = st.text_area("Set B", value=st.session_state["input_text_b"], height=200)
+# Slider to control t-SNE iteration steps
+iterations = st.slider("Number of t-SNE Iterations (Higher values = more refined clusters)", 250, 1000, step=250)
+# Submit button
+if st.button("Calculate Similarity"):
+    sentences_a = [line.strip() for line in input_text_a.split("\n") if line.strip()]
+    sentences_b = [line.strip() for line in input_text_b.split("\n") if line.strip()]
+    if len(sentences_a) > 0 and len(sentences_b) > 0:
+        # Generate embeddings for both sets
+        embeddings_a = [generate_embeddings(sentence) for sentence in sentences_a]
+        embeddings_b = [generate_embeddings(sentence) for sentence in sentences_b]
+        # Combine sentences and embeddings for both sets
+        all_sentences = sentences_a + sentences_b
+        all_embeddings = np.array(embeddings_a + embeddings_b)  # Convert to NumPy array
+        labels = ["Set A"] * len(sentences_a) + ["Set B"] * len(sentences_b)
+        # Set perplexity dynamically based on number of samples
+        perplexity_value = min(5, len(all_sentences) - 1)
+        # Perform 3D t-SNE with OpenTSNE, limiting the number of iterations
+        tsne = TSNE(n_components=3, perplexity=perplexity_value, n_iter=iterations, initialization="pca", random_state=42)
+        tsne_results = tsne.fit(all_embeddings)
+        # Prepare DataFrame for Plotly
+        df_tsne = pd.DataFrame({
+            "Sentence": all_sentences,
+            "Set": labels,
+            "X": tsne_results[:, 0],
+            "Y": tsne_results[:, 1],
+            "Z": tsne_results[:, 2]
+        })
+        # Plot 3D t-SNE results with Plotly
+        fig = px.scatter_3d(df_tsne, x="X", y="Y", z="Z", color="Set", hover_data={"Sentence": True},
+                            title="Incremental 3D t-SNE Visualization of Sentence Similarity",
+                            labels={"X": "t-SNE Dimension 1", "Y": "t-SNE Dimension 2", "Z": "t-SNE Dimension 3"},
+                            width=1200, height=800)  # Increased chart width and height
+        fig.update_traces(marker=dict(size=5, opacity=0.8))
+        # Display interactive Plotly plot
+        st.plotly_chart(fig)
+        # Display expandable embeddings
+        st.subheader("Embeddings for each sentence in Set A")
+        for i, (sentence, embedding) in enumerate(zip(sentences_a, embeddings_a)):
+            with st.expander(f"Embedding for Sentence A{i+1}: {sentence}"):
+                st.write(", ".join([f"{x:.4f}" for x in embedding]))  # Comma-separated values
+        st.subheader("Embeddings for each sentence in Set B")
+        for i, (sentence, embedding) in enumerate(zip(sentences_b, embeddings_b)):
+            with st.expander(f"Embedding for Sentence B{i+1}: {sentence}"):
+                st.write(", ".join([f"{x:.4f}" for x in embedding]))  # Comma-separated values
+    else:
+        st.warning("Please enter sentences in both Set A and Set B.")