bibliotecadebabel committed
Commit 37c2a8d
1 Parent(s): 475dbf8

first commit

README.md CHANGED
@@ -1,13 +1,13 @@
 ---
 title: Search Demo
-emoji: 🐠
-colorFrom: green
-colorTo: green
+emoji:
+colorFrom: indigo
+colorTo: purple
 sdk: streamlit
-sdk_version: 1.33.0
+sdk_version: 1.32.2
 app_file: app.py
 pinned: false
-license: cc
+license: isc
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,117 @@
import torch
import src.constants.config as configurations
from sentence_transformers import SentenceTransformer
from sentence_transformers import CrossEncoder
from src.constants.credentials import cohere_trial_key
import streamlit as st
from src.reader import Reader
from src.utils_search import UtilsSearch
from copy import deepcopy
import numpy as np
import cohere


configurations = configurations.service_mxbai_msc_direct_config
api_key = cohere_trial_key
co = cohere.Client(api_key)
semantic_column_names = configurations["semantic_column_names"]

# Check CUDA availability and set the device
if torch.cuda.is_available():
    torch.cuda.set_device(0)  # Use the first GPU
else:
    st.write("CUDA is not available. Using CPU instead.")


@st.cache_data
def init():
    config = configurations
    search_utils = UtilsSearch(config)
    reader = Reader(config=config["reader_config"])
    model = SentenceTransformer(config['sentence_transformer_name'], device='cuda:0')
    cross_encoder = CrossEncoder(config['cross_encoder_name'], device='cuda:0')
    df = reader.read()
    index = search_utils.dataframe_to_index(df)
    return df, model, cross_encoder, index, search_utils


def get_possible_values_for_column(column_name, search_utils, df):
    if column_name not in st.session_state:
        setattr(st.session_state, column_name, search_utils.top_10_common_values(df, column_name))
    return getattr(st.session_state, column_name)


# Initialize once and keep the results in the session state
if 'init_results' not in st.session_state:
    st.session_state.init_results = init()

# Access the initialized objects directly from the session state
df, model, cross_encoder, index, search_utils = st.session_state.init_results

# Streamlit app layout
st.title('Search Demo')

# Input fields
query = st.text_input('Enter your search query here')
use_cohere = st.checkbox('Use Cohere', value=False)  # Default to unchecked

programmatic_search_config = deepcopy(configurations['programmatic_search_config'])

dynamic_programmatic_search_config = {
    "scalar_columns": [],
    "discrete_columns": []
}

for column in programmatic_search_config['scalar_columns']:
    # Number inputs for scalar (range) filters
    col_name = column["column_name"]
    min_val = float(column["min_value"])
    max_val = float(column["max_value"])
    user_min = st.number_input(f'Minimum {col_name.capitalize()}', min_value=min_val, max_value=max_val, value=min_val)
    user_max = st.number_input(f'Maximum {col_name.capitalize()}', min_value=min_val, max_value=max_val, value=max_val)
    dynamic_programmatic_search_config['scalar_columns'].append(
        {"column_name": col_name, "min_value": user_min, "max_value": user_max})

for column in programmatic_search_config['discrete_columns']:
    # Multiselect for discrete (categorical) filters
    col_name = column["column_name"]
    default_values = column["default_values"]
    # Possible values are the ten most common values of the column, cached in the session state
    possible_values = get_possible_values_for_column(col_name, search_utils, df)
    selected_values = st.multiselect(f'Select {col_name.capitalize()}', options=possible_values, default=default_values)
    dynamic_programmatic_search_config['discrete_columns'].append(
        {"column_name": col_name, "default_values": selected_values})

programmatic_search_config['scalar_columns'] = dynamic_programmatic_search_config['scalar_columns']
programmatic_search_config['discrete_columns'] = dynamic_programmatic_search_config['discrete_columns']

# Search button
if st.button('Search'):
    if query:  # Check whether a query was entered
        df_filtered = search_utils.filter_dataframe(df, programmatic_search_config)
        if len(df_filtered) == 0:
            st.write('No results found')
        else:
            index = search_utils.dataframe_to_index(df_filtered)
            if not use_cohere:
                # Bi-encoder retrieval followed by local cross-encoder reranking
                results_df = search_utils.search(query, df_filtered, model, cross_encoder, index)
                results_df = search_utils.drop_columns(results_df, programmatic_search_config)
            else:
                # Bi-encoder retrieval followed by Cohere reranking
                df_retrieved = search_utils.retrieve(query, df_filtered, model, index)
                df_retrieved = search_utils.drop_columns(df_retrieved, programmatic_search_config)
                df_retrieved.fillna(value="", inplace=True)
                docs = df_retrieved.to_dict('records')
                column_names = semantic_column_names
                docs = [{name: str(doc[name]) for name in column_names} for doc in docs]
                rank_fields = list(docs[0].keys())
                results = co.rerank(query=query, documents=docs, top_n=10, model='rerank-english-v3.0',
                                    rank_fields=rank_fields)
                top_ids = [hit.index for hit in results.results]
                # Build the results DataFrame from the rerank order
                results_df = df_retrieved.iloc[top_ids].copy()
                results_df['rank'] = np.arange(len(results_df)) + 1

            st.write(results_df)
    else:
        st.write("Please enter a query to search.")
requirements.txt ADDED
@@ -0,0 +1,11 @@
torch
transformers
datasets
accelerate>=0.21.0
pandas
fastparquet
s3fs
numpy
faiss-gpu
sentence_transformers
cohere
src/constants/__init__.py ADDED
File without changes
src/constants/config.py ADDED
@@ -0,0 +1,69 @@
import src.constants.credentials as cred
import os

service_mxbai_made_in_china_config = {
    "reader_config": {
        "input_path": os.environ['made_in_china_s3_path'],
        "credentials": cred.credentials_backblaze,
        "format": "parquet"
    },
    "sample_size": 32,
    "sentence_transformer_name": "mixedbread-ai/mxbai-embed-large-v1",
    "cross_encoder_name": "mixedbread-ai/mxbai-rerank-large-v1",
    "batch_size": 4,
    "dataset_size": 32,
    "seq_len": 256,
    "top_k": 1000,
    "programmatic_search_config": {
        "scalar_columns": [{"column_name": "price", "min_value": 0, "max_value": "10000"}],
        "discrete_columns": [
            {"column_name": "supplierName",
             # "default_values": ['Zhongshan Norye Hardware Co., Ltd.']
             "default_values": []
             },
            {"column_name": "warranty",
             "default_values": []
             }
        ],
        "columns_to_drop": ["similarities", "embeddings"]
    }
}


service_mxbai_msc_direct_sample_config = {
    "reader_config": {
        "input_path": os.environ['msc_direct_s3_path'],
        "credentials": cred.credentials_backblaze,
        "format": "parquet"
    },
    "sample_size": 32,
    "sentence_transformer_name": "mixedbread-ai/mxbai-embed-large-v1",
    "cross_encoder_name": "mixedbread-ai/mxbai-rerank-large-v1",
    "batch_size": 4,
    "dataset_size": 32,
    "seq_len": 256,
    "top_k": 50,
    "semantic_column_names": ['name', 'price', 'brand', 'keyword', 'description', 'specifications'],
    "programmatic_search_config": {
        "scalar_columns": [{"column_name": "price", "min_value": 0, "max_value": "10000"}],
        "discrete_columns": [{"column_name": "brand", "default_values": []}],
        "columns_to_drop": ["similarities", "embeddings", "index"]
    }
}


service_mxbai_msc_direct_config = {
    "reader_config": {
        "input_path": os.environ['msc_direct_s3_path'],
        "credentials": cred.credentials_backblaze,
        "format": "parquet"
    },
    "sample_size": 32,
    "sentence_transformer_name": "mixedbread-ai/mxbai-embed-large-v1",
    "cross_encoder_name": "mixedbread-ai/mxbai-rerank-large-v1",
    "batch_size": 4,
    "dataset_size": 32,
    "seq_len": 256,
    "top_k": 50,
    "semantic_column_names": ['name', 'price', 'brand', 'keyword', 'description', 'specifications'],
    "programmatic_search_config": {
        "scalar_columns": [{"column_name": "price", "min_value": 0, "max_value": "10000"}],
        "discrete_columns": [{"column_name": "brand", "default_values": []}],
        "columns_to_drop": ["similarities", "embeddings", "index"]
    }
}
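
For orientation, a small sketch (not part of the commit) of how these config dicts are consumed; it assumes the environment variables referenced above are already set, since config.py reads them at import time:

import src.constants.config as configurations

# app.py selects one of the dicts defined above; the reader, retriever and
# reranker all pull their settings from this single object.
config = configurations.service_mxbai_msc_direct_config
print(config["sentence_transformer_name"])   # mixedbread-ai/mxbai-embed-large-v1
print(config["reader_config"]["format"])     # parquet
print(config["semantic_column_names"])       # columns sent to the Cohere reranker
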
src/constants/credentials.py ADDED
@@ -0,0 +1,11 @@
import os

credentials_backblaze = {
    "access_key_id": os.environ['credentials_backblaze_access_key_id'],
    "secret_access_key": os.environ['credentials_backblaze_secret_access_key'],
    "bucket_name": os.environ['credentials_backblaze_bucket_name'],
    "endpoint_url": os.environ['credentials_backblaze_endpoint_url'],
    "region_name": "us-east-1"
}


cohere_trial_key = os.environ["cohere_trial_key"]
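
Every credential is pulled from the environment at import time, so a missing variable fails with a bare KeyError. A minimal pre-flight check (not part of the commit; variable names taken from credentials.py and config.py) could look like this:

import os

REQUIRED_ENV_VARS = [
    "credentials_backblaze_access_key_id",
    "credentials_backblaze_secret_access_key",
    "credentials_backblaze_bucket_name",
    "credentials_backblaze_endpoint_url",
    "cohere_trial_key",
    "msc_direct_s3_path",      # used by the msc_direct configs
    "made_in_china_s3_path",   # used by the made_in_china config
]

missing = [name for name in REQUIRED_ENV_VARS if name not in os.environ]
if missing:
    raise RuntimeError(f"Missing environment variables: {missing}")
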
src/decorators/decorators.py ADDED
@@ -0,0 +1,11 @@
import time


def timeit_decorator(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        print(f"Function {func.__name__} took {end_time - start_time:.4f} seconds to execute")
        return result
    return wrapper
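
The decorator is not used elsewhere in this commit; a minimal usage sketch (the function `slow_square` is a hypothetical example):

from src.decorators.decorators import timeit_decorator

@timeit_decorator
def slow_square(n):
    # deliberately naive work so the timing is visible
    return [i * i for i in range(n)]

slow_square(1_000_000)
# prints: Function slow_square took 0.xxxx seconds to execute
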
src/pytorch_modules/datasets/schema_string_dataset.py ADDED
@@ -0,0 +1,40 @@
import torch
from torch.utils.data import Dataset
import numpy as np


class SchemaStringDataset(Dataset):
    def __init__(self, data, config):
        self.data = data
        self.config = config

    def __len__(self):
        # Return the dataset size specified in the configuration
        return self.config["dataset_size"]

    def transform_entry(self, entry):
        # Filter out None and NaN values
        filtered_entry = {k: v for k, v in entry.items() if v is not np.nan and v is not None}

        # If nothing survives the filtering, return an empty string
        if not filtered_entry:
            return ''

        # Serialize the remaining fields as "key:value" pairs
        inputs = [f"{k}:{v}" for k, v in filtered_entry.items()]

        return ' '.join(inputs)

    def __getitem__(self, idx):
        transformed_data = {
            'inputs': []
        }

        item = self.data[idx]
        input_data = {k: v for k, v in item.items()}
        inputs = self.transform_entry(input_data)
        transformed_data['inputs'] = inputs

        transformed_data['idx'] = idx

        # Return the transformed item for the current idx
        return transformed_data
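
A quick sketch with toy data (not part of the commit) showing the "key:value" string the dataset produces; None values are dropped by transform_entry:

from src.pytorch_modules.datasets.schema_string_dataset import SchemaStringDataset

records = [{"name": "cordless drill", "price": 129.99, "brand": None}]
dataset = SchemaStringDataset(records, {"dataset_size": 1})
print(dataset[0])
# {'inputs': 'name:cordless drill price:129.99', 'idx': 0}
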
src/pytorch_modules/datasets/tokenized_dataset.py ADDED
@@ -0,0 +1,62 @@
from torch.utils.data import Dataset
from transformers import AutoTokenizer
import torch


class TokenizedDataset(Dataset):
    def __init__(self, custom_dataset, tokenizer, max_seq_len):
        """
        custom_dataset: the wrapped dataset instance (e.g. SchemaStringDataset)
        tokenizer: an instance of the tokenizer
        max_seq_len: maximum sequence length used for padding
        """
        self.dataset = custom_dataset
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def __len__(self):
        # The length is inherited from the wrapped dataset
        return len(self.dataset)

    def tokenize_and_pad(self, text_list):
        """
        Tokenize and pad a list of text strings.
        """
        tokens = self.tokenizer(text_list, padding='max_length', max_length=self.max_seq_len,
                                truncation=True, return_tensors="pt")
        return tokens

    def __getitem__(self, idx):
        # Fetch the transformed sample from the wrapped dataset
        transformed_data = self.dataset[idx]

        # Containers for inputs and (optionally) labels
        tokenized_inputs = {}
        tokenized_labels = {}

        # Dynamically process each field in the sample
        for key, value in transformed_data.items():
            if isinstance(value, int):
                # Convert integers to tensors and route them to inputs or labels based on the key prefix
                if key.startswith('label'):
                    tokenized_labels[key] = torch.tensor(value)
                else:
                    tokenized_inputs[key] = torch.tensor(value)

            if isinstance(value, str):
                tokenized_data = self.tokenize_and_pad(value)
                if key.startswith('label'):
                    tokenized_labels[key] = tokenized_data['input_ids']
                    tokenized_labels['attention_mask_' + key] = tokenized_data['attention_mask']
                else:
                    tokenized_inputs[key] = tokenized_data['input_ids']
                    tokenized_inputs['attention_mask_' + key] = tokenized_data['attention_mask']

        # Include 'label' in the output only when labels are present
        output = {"inputs": tokenized_inputs}
        if tokenized_labels:
            output["label"] = tokenized_labels

        return output
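
A sketch of how the wrapper might be combined with SchemaStringDataset (not part of the commit; the tokenizer name is an assumption based on the embedding model used elsewhere in the repo):

from transformers import AutoTokenizer
from src.pytorch_modules.datasets.schema_string_dataset import SchemaStringDataset
from src.pytorch_modules.datasets.tokenized_dataset import TokenizedDataset

records = [{"name": "cordless drill", "price": 129.99}]
base = SchemaStringDataset(records, {"dataset_size": 1})
tokenizer = AutoTokenizer.from_pretrained("mixedbread-ai/mxbai-embed-large-v1")
wrapped = TokenizedDataset(base, tokenizer, max_seq_len=256)

sample = wrapped[0]
print(sample["inputs"]["inputs"].shape)                 # torch.Size([1, 256])
print(sample["inputs"]["attention_mask_inputs"].shape)  # torch.Size([1, 256])
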
src/pytorch_modules/models/utils_models.py ADDED
@@ -0,0 +1,21 @@
import os
import pandas as pd


class UtilsModels:
    @staticmethod
    def compute_embeddings(sentence_transformer, tokenized_sentences, attention_mask):
        # Flatten the batch and num_sentences dimensions
        batch_size, num_sentences, seq_len = tokenized_sentences.size()
        flat_input_ids = tokenized_sentences.view(-1, seq_len)
        flat_attention_mask = attention_mask.view(-1, seq_len) if attention_mask is not None else None

        # Process the sentences through the sentence_transformer
        outputs = sentence_transformer(input_ids=flat_input_ids, attention_mask=flat_attention_mask)
        embeddings = outputs.last_hidden_state

        # Pool the token embeddings into a single vector per sentence
        # (here, simply the mean across the sequence-length dimension)
        sentence_embeddings = embeddings.mean(dim=1)

        # Reshape back to [batch_size, num_sentences, embedding_dim]
        return sentence_embeddings.view(batch_size, num_sentences, -1)
src/reader.py ADDED
@@ -0,0 +1,92 @@
import os
import pandas as pd
import numpy as np
import json
from src.utils import Utils


class Reader:

    def __init__(self, config):
        self.config = config
        self.utils = Utils()
        self.cache_dir = config.get("cache_dir", "./cache")  # default cache directory

    def read(self, input_path=None, reader_config=None):
        # If reader_config is None, use the class-level config
        if reader_config is None:
            reader_config = self.config

        file_format = reader_config.get("format", None)
        input_path = input_path or reader_config.get("input_path", "")

        # Decide which method to use based on the file format
        if file_format == "parquet":
            return self._read_dataframe_from_parquet(input_path, reader_config)
        elif file_format == "csv":
            return self._read_dataframe_from_csv(input_path)
        elif file_format == "s3_csv":
            return self._read_dataframe_from_csv_s3(input_path, reader_config)
        elif file_format == "json_folder":
            return self._read_json_files_to_dataframe(input_path)
        else:
            raise ValueError(f"Unsupported file format: {file_format}")

    def _read_dataframe_from_parquet(self, input_path=None, reader_config=None):
        if reader_config is None:
            reader_config = self.config

        input_path = input_path or reader_config.get("input_path", "")

        if input_path.startswith("s3://"):
            # Check whether the file is already cached locally
            local_cache_path = os.path.join(self.cache_dir, os.path.basename(input_path))

            if os.path.exists(local_cache_path):
                print("reading from cache")
                print(local_cache_path)
                return pd.read_parquet(local_cache_path)

            print("reading from s3")

            credentials = reader_config.get("credentials", {})
            storage_options = {
                'key': credentials.get("access_key_id", ""),
                'secret': credentials.get("secret_access_key", ""),
                'client_kwargs': {'endpoint_url': credentials.get("endpoint_url", "")}
            }

            # Read from S3 and cache locally
            df = pd.read_parquet(input_path, storage_options=storage_options)
            os.makedirs(self.cache_dir, exist_ok=True)  # Create the cache directory if it does not exist
            df.to_parquet(local_cache_path)  # Save to cache
            return df
        else:
            return pd.read_parquet(input_path)

    def _read_dataframe_from_csv(self, file_path):
        return self.utils.read_dataframe_from_csv(file_path)

    def _read_json_files_to_dataframe(self, folder_path):
        return self.utils.read_json_files_to_dataframe(folder_path)

    def _read_dataframe_from_csv_s3(self, input_path, reader_config):
        credentials = reader_config.get("credentials", {})
        endpoint_url = credentials.get("endpoint_url", "")
        access_key_id = credentials.get("access_key_id", "")
        secret_access_key = credentials.get("secret_access_key", "")

        # Construct the storage options for s3fs
        storage_options = {
            'key': access_key_id,
            'secret': secret_access_key,
            'client_kwargs': {'endpoint_url': endpoint_url}
        }

        # Use pandas to read the CSV file directly from S3
        try:
            df = pd.read_csv(input_path, storage_options=storage_options)
            return df
        except Exception as e:
            print(f"An error occurred while reading the CSV file from S3: {e}")
            return None
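
A minimal local usage sketch (not part of the commit; the parquet path is hypothetical). S3 paths go through the caching branch instead and need the credentials dict from the reader_config:

from src.reader import Reader

reader = Reader(config={"format": "parquet", "input_path": "./data/products.parquet"})
df = reader.read()
print(df.shape)
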
src/utils.py ADDED
@@ -0,0 +1,90 @@
import os
import pandas as pd
import numpy as np
import json


class Utils:
    @staticmethod
    def read_dataframe_from_csv(file_path):
        """
        Reads a DataFrame from a CSV file if the file exists.

        Parameters:
        - file_path: The full path to the CSV file.

        Returns:
        - A pandas DataFrame if the file exists and is read successfully; None otherwise.
        """
        # Check if the file exists
        if os.path.isfile(file_path):
            try:
                # Attempt to read the CSV file into a DataFrame
                df = pd.read_csv(file_path)
                return df
            except Exception as e:
                # If an error occurs during reading, report it
                print(f"An error occurred while reading the file: {e}")
                return None
        else:
            # If the file does not exist, report it
            print(f"File does not exist: {file_path}")
            return None

    @staticmethod
    def read_json_files_to_dataframe(folder_path):
        """
        Reads JSON files from a specified folder, automatically infers columns from the JSON files,
        and returns the data as a pandas DataFrame.

        :param folder_path: Path to the folder containing JSON files.
        :return: A pandas DataFrame containing data from all JSON files in the folder.
        """
        data = []

        for filename in os.listdir(folder_path):
            if filename.endswith('.json'):
                file_path = os.path.join(folder_path, filename)

                with open(file_path, 'r') as file:
                    # First attempt to load the JSON
                    json_data = json.load(file)

                    # If json_data is a string instead of a dict, decode it again
                    if isinstance(json_data, str):
                        json_data = json.loads(json_data)

                    data.append(json_data)

        # Create a DataFrame from the list of dictionaries
        df = pd.DataFrame(data)

        return df

    @staticmethod
    def write_pandas_to_local(df, output_path):
        """
        Writes a pandas DataFrame to a CSV file at the specified output path.

        :param df: The pandas DataFrame to be saved.
        :param output_path: The file path where the DataFrame should be saved as a CSV.
        """
        # Create the directory if it does not exist
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        # Save the DataFrame to a CSV file without the index
        df.to_csv(output_path, index=False)

    @staticmethod
    def convert_iterables_to_strings(df):
        """
        Convert columns with iterable types (excluding strings) to string representations.
        This includes handling numpy arrays or lists within DataFrame cells.
        """
        for col in df.columns:
            # Apply the conversion if the value is a list or a numpy array
            df[col] = df[col].apply(lambda x: str(x) if isinstance(x, (np.ndarray, list)) else x)
        return df
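
A small sketch of convert_iterables_to_strings (not part of the commit), which can be useful before writing a DataFrame with array-valued cells such as embeddings to CSV:

import numpy as np
import pandas as pd
from src.utils import Utils

df = pd.DataFrame({"name": ["drill"], "embeddings": [np.array([0.1, 0.2])]})
df = Utils.convert_iterables_to_strings(df)
print(df["embeddings"].iloc[0], type(df["embeddings"].iloc[0]))
# [0.1 0.2] <class 'str'>
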
src/utils_search.py ADDED
@@ -0,0 +1,153 @@
from src.pytorch_modules.datasets.schema_string_dataset import SchemaStringDataset
import os
import pandas as pd
import numpy as np
import json
import faiss
import torch


class UtilsSearch:
    def __init__(self, config):
        self.config = config

    @staticmethod
    def dataframe_to_index(df):
        embeddings = np.stack(df['embeddings'].to_numpy())
        norm_embeddings = np.ascontiguousarray(embeddings / np.linalg.norm(embeddings, axis=1)[:, None])
        # Build a FAISS index over the normalized embeddings
        dimension = norm_embeddings.shape[1]
        index = faiss.IndexFlatL2(dimension)
        index.add(norm_embeddings)
        return index

    @staticmethod
    def retrieve(query, df, model, index, top_k=100):
        """
        Search the DataFrame for the given query and return a DataFrame sorted by similarity.

        :param query: The search query string.
        :param df: The input DataFrame containing embeddings.
        :param model: The model used to encode the query.
        :param index: The FAISS index to search.
        :param top_k: The number of top results to return.
        :return: A new DataFrame sorted by similarity to the query, with a 'similarities' column.
        """
        # The mxbai embedding model expects this prefix in front of retrieval queries
        query = "Represent this sentence for searching relevant passages: " + query

        # Check if CUDA is available and set the device accordingly
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)

        # Compute the query embedding
        query_vector = model.encode(query, convert_to_tensor=True, device=device).cpu().numpy()

        # Normalize the query vector
        query_vector /= np.linalg.norm(query_vector)

        # Perform the search
        distances, indices = index.search(np.array([query_vector]), top_k)

        # Retrieve the rows from the DataFrame corresponding to the indices
        retrieved_df = df.iloc[indices[0]]

        # Attach the distances as a new column named 'similarities'
        retrieved_df = retrieved_df.assign(similarities=distances[0])

        # FAISS returns squared L2 distances; with normalized vectors, smaller means more similar
        if 'similarities' in retrieved_df.columns:
            retrieved_df = retrieved_df.sort_values(by='similarities', ascending=True)

        # Reset the index so downstream positional indexing stays consistent
        retrieved_df = retrieved_df.reset_index(drop=True)

        return retrieved_df

    def rerank(self, query, df_top_100, cross_encoder, index):
        # Convert the retrieved records to a list of dictionaries for processing
        config = self.config
        df_copy = df_top_100.copy().reset_index(drop=True)
        records = df_copy.to_dict(orient='records')[:100]

        # Serialize each record into a "key:value" schema string
        dataset_str = SchemaStringDataset(records, config)

        # Extract the documents (truncated to 256 characters each)
        documents = [batch["inputs"][:256] for batch in dataset_str]

        # Rank the documents against the query with the cross-encoder
        ids = [item["corpus_id"] for item in cross_encoder.rank(query, documents, top_k=10)]

        # Use the ids to filter and reorder the original DataFrame
        df_sorted_by_relevance = df_copy.loc[ids]
        return df_sorted_by_relevance

    def search(self, query, df, model, cross_encoder, index):
        sorted_df = self.retrieve(query, df, model, index)
        return self.rerank(query, sorted_df, cross_encoder, index)

    @staticmethod
    def top_10_common_values(df, column_name):
        """
        Takes a pandas DataFrame and a column name and returns the top 10 most common
        non-null values of that column as a list.
        """
        # Drop null values, count occurrences of each value, and return the ten most common values
        value_counts_list = df[column_name].dropna().value_counts().head(10).index.tolist()

        return value_counts_list

    @staticmethod
    def filter_dataframe(df, config, top_k_programmatic=100):
        """
        Filters a DataFrame based on scalar and discrete column configurations,
        with type handling and null filtering.

        Parameters:
        - df: pandas.DataFrame to filter.
        - config: Dictionary containing 'scalar_columns' and 'discrete_columns' configurations.

        Returns:
        - Filtered pandas.DataFrame.
        """
        scalar_columns = config.get('scalar_columns', [])
        discrete_columns = config.get('discrete_columns', [])

        # Combine all column names to check for nulls
        all_columns = [col["column_name"] for col in scalar_columns] + \
                      [col["column_name"] for col in discrete_columns]

        # Drop rows where any of the specified columns have null values
        df = df.dropna(subset=all_columns)

        # Filter on scalar (range) columns
        for col in scalar_columns:
            column_name = col["column_name"]
            # Ensure min_value and max_value are numeric
            min_value = float(col["min_value"])
            max_value = float(col["max_value"])
            # Convert the DataFrame column to a numeric type to avoid comparison issues
            df[column_name] = pd.to_numeric(df[column_name], errors='coerce')
            df = df[df[column_name].between(min_value, max_value)]

        # Filter on discrete (categorical) columns
        for col in discrete_columns:
            column_name = col["column_name"]
            default_values = col["default_values"]
            if len(default_values) > 0:
                df = df[df[column_name].isin(default_values)]

        if 'similarities' in df.columns:
            df = df.sort_values(by='similarities', ascending=False)

        return df

    @staticmethod
    def drop_columns(df, config):
        columns_to_drop = config.get('columns_to_drop', [])
        df_dropped = df.drop(columns_to_drop, axis=1)
        return df_dropped
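
A minimal end-to-end retrieval sketch (not part of the commit; the toy catalogue and query are made up, and in the Space the DataFrame already carries a precomputed 'embeddings' column from Reader.read()):

import pandas as pd
from sentence_transformers import SentenceTransformer
from src.utils_search import UtilsSearch

model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
df = pd.DataFrame({
    "name": ["cordless drill", "safety gloves", "laser level"],
    "price": [129.99, 9.99, 59.99],
})
df["embeddings"] = list(model.encode(df["name"].tolist()))

search_utils = UtilsSearch(config={})
index = search_utils.dataframe_to_index(df)
results = search_utils.retrieve("power tools", df, model, index, top_k=3)
print(results[["name", "price", "similarities"]])
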