Adjust template for embeddings

Files changed:
- app.py: +22 -2
- utils/notebook_utils.py: +107 -4
app.py
@@ -15,6 +15,8 @@ from dotenv import load_dotenv
 import os
 
 # TODOS:
+# Validate dataset type before generating the notebook
+# Add template for training
 # Add template for RAG and embeddings
 
 load_dotenv()
@@ -91,6 +93,19 @@ def generate_rag_cells(dataset_id):
     yield from generate_cells(dataset_id, rag_cells, "rag")
 
 
+def longest_string_column(df):
+    longest_col = None
+    max_length = 0
+
+    for col in df.select_dtypes(include=["object", "string"]):
+        max_col_length = df[col].str.len().max()
+        if max_col_length > max_length:
+            max_length = max_col_length
+            longest_col = col
+
+    return longest_col
+
+
 def generate_embedding_cells(dataset_id):
     yield from generate_cells(dataset_id, embeggins_cells, "embeddings")
 
@@ -143,9 +158,10 @@ def generate_cells(dataset_id, cells, notebook_type="eda"):
     first_split = list(first_config_loading_code["arguments"]["splits"].keys())[0]
     features, df = get_first_rows_as_df(dataset_id, first_config, first_split, 3)
 
+    longest_col = longest_string_column(df)
     html_code = f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' width='80%' height='560px'></iframe>"
-    wildcards = ["{dataset_name}", "{first_code}", "{html_code}"]
-    replacements = [dataset_id, first_code, html_code]
+    wildcards = ["{dataset_name}", "{first_code}", "{html_code}", "{longest_col}"]
+    replacements = [dataset_id, first_code, html_code, longest_col]
     has_numeric_columns = len(df.select_dtypes(include=["number"]).columns) > 0
     has_categoric_columns = len(df.select_dtypes(include=["object"]).columns) > 0
     cells = replace_wildcards(
@@ -248,4 +264,8 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
         outputs=[code_component, go_to_notebook],
     )
 
+    gr.Markdown(
+        "🚧 Note: Some code may not be compatible with datasets that contain binary data or complex structures. 🚧"
+    )
+
 demo.launch()
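The app.py change above adds longest_string_column to pick which text column feeds the new embeddings template. Below is a minimal, self-contained sketch of that behavior; the DataFrame, column names, and values are invented purely for illustration and are not part of the change.

# Minimal sketch: the helper as added in app.py, run on a made-up DataFrame.
import pandas as pd


def longest_string_column(df):
    longest_col = None
    max_length = 0

    for col in df.select_dtypes(include=["object", "string"]):
        max_col_length = df[col].str.len().max()
        if max_col_length > max_length:
            max_length = max_col_length
            longest_col = col

    return longest_col


# Toy data (illustrative only): "review" holds the longest strings, so it is
# the column whose name fills the {longest_col} wildcard in the template.
df = pd.DataFrame(
    {
        "label": ["pos", "neg"],
        "review": ["a fairly long piece of review text", "another long review"],
        "stars": [5, 1],
    }
)

print(longest_string_column(df))  # -> "review"

Note that only object/string columns are inspected, so numeric columns such as "stars" in this toy example are ignored.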
utils/notebook_utils.py
@@ -31,9 +31,112 @@ rag_cells = [
 embeggins_cells = [
     {
         "cell_type": "markdown",
-        "source": "",
+        "source": """
+---
+# **Embeddings Notebook for {dataset_name} dataset**
+---
+""",
+    },
+    {
+        "cell_type": "markdown",
+        "source": "## 1. Setup necessary libraries and load the dataset",
+    },
+    {
+        "cell_type": "code",
+        "source": """
+# Install and import necessary libraries.
+!pip install pandas sentence-transformers faiss-cpu
+""",
+    },
+    {
+        "cell_type": "code",
+        "source": """
+import pandas as pd
+from sentence_transformers import SentenceTransformer
+import faiss
+""",
+    },
+    {
+        "cell_type": "code",
+        "source": """
+# Load the dataset as a DataFrame
+{first_code}
+""",
+    },
+    {
+        "cell_type": "code",
+        "source": """
+# Specify the column name that contains the text data to generate embeddings
+column_to_generate_embeddings = '{longest_col}'
+""",
+    },
+    {
+        "cell_type": "markdown",
+        "source": "## 2. Loading embedding model and creating FAISS index",
+    },
+    {
+        "cell_type": "code",
+        "source": """
+# Remove duplicate entries based on the specified column
+df = df.drop_duplicates(subset=column_to_generate_embeddings)
+""",
+    },
+    {
+        "cell_type": "code",
+        "source": """
+# Convert the column data to a list of text entries
+text_list = df[column_to_generate_embeddings].tolist()
+""",
+    },
+    {
+        "cell_type": "code",
+        "source": """
+# Specify the embedding model you want to use
+model = SentenceTransformer('distiluse-base-multilingual-cased')
+""",
+    },
+    {
+        "cell_type": "code",
+        "source": """
+vectors = model.encode(text_list)
+vector_dimension = vectors.shape[1]
+
+# Initialize the FAISS index with the embedding dimension
+index = faiss.IndexFlatL2(vector_dimension)
+
+# Add the embeddings to the FAISS index
+index.add(vectors)
+""",
+    },
+    {
+        "cell_type": "markdown",
+        "source": "## 3. Perform a text search",
+    },
+    {
+        "cell_type": "code",
+        "source": """
+# Specify the text you want to search for in the list
+text_to_search = text_list[0]
+print(f"Text to search: {text_to_search}")
+""",
+    },
+    {
+        "cell_type": "code",
+        "source": """
+# Generate the embedding for the search query
+query_embedding = model.encode([text_to_search])
+""",
+    },
+    {
+        "cell_type": "code",
+        "source": """
+# Perform the search to find the 'k' nearest neighbors (adjust 'k' as needed)
+D, I = index.search(query_embedding, k=10)
+
+# Print the similar documents found
+print(f"Similar documents: {[text_list[i] for i in I[0]]}")
+""",
     },
-    {"cell_type": "code", "source": ""},
 ]
 
 eda_cells = [
@@ -52,7 +155,7 @@ eda_cells = [
     {
         "cell_type": "code",
         "source": """
-#
+# Install and import necessary libraries.
 !pip install pandas matplotlib seaborn
 """,
     },
@@ -67,7 +170,7 @@ import seaborn as sns
     {
         "cell_type": "code",
        "source": """
-#
+# Load the dataset as a DataFrame
 {first_code}
 """,
     },
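The notebook_utils.py change above defines the embeddings template cells; the {dataset_name}, {first_code}, {html_code}, and {longest_col} placeholders are filled later by replace_wildcards in app.py. replace_wildcards itself is not part of this diff, so the sketch below uses a plain string replacement as a stand-in, purely to show how the new {longest_col} wildcard ends up in a generated cell; the dataset id, loader snippet, and column name are invented.

# Hypothetical sketch only: replace_wildcards is not shown in this diff, so a
# simple str.replace loop stands in for whatever it actually does.
wildcards = ["{dataset_name}", "{first_code}", "{html_code}", "{longest_col}"]
replacements = [
    "user/example-dataset",          # invented dataset id
    "df = pd.DataFrame(...)",        # invented loader snippet
    "<iframe src='...'></iframe>",   # abbreviated viewer embed
    "review",                        # column picked by longest_string_column
]

cell_source = """
# Specify the column name that contains the text data to generate embeddings
column_to_generate_embeddings = '{longest_col}'
"""

for wildcard, replacement in zip(wildcards, replacements):
    cell_source = cell_source.replace(wildcard, replacement)

print(cell_source)
# The rendered cell now reads:
#   # Specify the column name that contains the text data to generate embeddings
#   column_to_generate_embeddings = 'review'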