def replace_wildcards(templates, wildcards, replacements):
    """Return a copy of the notebook cell templates with every wildcard
    string replaced by its corresponding replacement value."""
    if len(wildcards) != len(replacements):
        raise ValueError(
            "The number of wildcards must match the number of replacements."
        )
    new_templates = []
    for tmp in templates:
        tmp_text = tmp["source"]
        for wildcard, replacement in zip(wildcards, replacements):
            tmp_text = tmp_text.replace(wildcard, replacement)
        new_templates.append({"cell_type": tmp["cell_type"], "source": tmp_text})
    return new_templates


rag_cells = [
    {
        "cell_type": "markdown",
        "source": "# Retrieval-Augmented Generation (RAG) System Notebook",
    },
    {"cell_type": "code", "source": ""},
]

embeddings_cells = [
    {
        "cell_type": "markdown",
        "source": "# Embeddings Generation Notebook",
    },
    {"cell_type": "code", "source": ""},
]

eda_cells = [
    {
        "cell_type": "markdown",
        "source": "# Exploratory Data Analysis (EDA) Notebook for {dataset_name} dataset",
    },
    {
        "cell_type": "code",
        "source": """
from IPython.display import HTML
display(HTML("{html_code}"))
""",
    },
    {
        "cell_type": "code",
        "source": """
# 1. Install and import necessary libraries.
!pip install pandas matplotlib seaborn
""",
    },
    {
        "cell_type": "code",
        "source": """
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
""",
    },
    {
        "cell_type": "code",
        "source": """
# 2. Load the dataset as a DataFrame using the provided code
{first_code}
""",
    },
    {
        "cell_type": "code",
        "source": """
# 3. Understand the dataset structure
print(df.head())
print(df.info())
print(df.describe())
""",
    },
    {
        "cell_type": "code",
        "source": """
# 4. Check for missing values
print(df.isnull().sum())
""",
    },
    {
        "cell_type": "code",
        "source": """
# 5. Identify data types of each column
print(df.dtypes)
""",
    },
    {
        "cell_type": "code",
        "source": """
# 6. Detect duplicated rows
print(df.duplicated().sum())
""",
    },
    {
        "cell_type": "code",
        "source": """
# 7. Generate descriptive statistics
print(df.describe())
""",
    },
    {
        "cell_type": "code",
        "source": """
# 8. Visualize the distribution of each column.
# TODO: Add code to visualize the distribution of each column.

# 9. Explore relationships between columns.
# TODO: Add code to explore relationships between columns.

# 10. Perform correlation analysis.
# TODO: Add code to perform correlation analysis.
""",
    },
]


def generate_embedding_system_prompt():
    """Return the system prompt used to generate an embeddings notebook."""
    return """You are an expert data scientist tasked with creating a Jupyter notebook to generate embeddings for a specific dataset.
Use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model, and 'faiss-cpu' to create the index.

The notebook should include:

1. Install necessary libraries with !pip install.
2. Import libraries.
3. Load the dataset as a DataFrame using the provided code.
4. Select the column for generating embeddings.
5. Remove duplicate data.
6. Convert the selected column to a list.
7. Load the sentence-transformers model.
8. Create a FAISS index.
9. Encode a query sample.
10. Search for similar documents using the FAISS index.

Ensure the notebook is well-organized, with explanations for each step.
The output should be Markdown content with Python code snippets enclosed in "```python" and "```".

The user will provide dataset information in the following format:

## Columns and Data Types
## Sample Data
## Loading Data code

Use the provided code to load the dataset; do not use any other method.
"""
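
# As a rough illustration of the embeddings workflow the prompt above asks
# the model to generate (steps 3-10), a minimal sketch follows. It is not
# part of the notebook generator itself; the model name "all-MiniLM-L6-v2",
# the default column name, and the query string are placeholder assumptions.
def _example_embeddings_workflow(df, column="text", query="sample query"):
    import faiss
    import numpy as np
    from sentence_transformers import SentenceTransformer

    # Steps 4-6: select the column, drop duplicates, convert to a list.
    texts = df[column].drop_duplicates().tolist()

    # Step 7: load a sentence-transformers model (placeholder choice).
    model = SentenceTransformer("all-MiniLM-L6-v2")

    # Step 8: encode the texts and add them to a FAISS index.
    embeddings = np.asarray(model.encode(texts), dtype="float32")
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    # Steps 9-10: encode a query and retrieve the most similar documents.
    query_embedding = np.asarray(model.encode([query]), dtype="float32")
    distances, indices = index.search(query_embedding, 5)
    return [texts[i] for i in indices[0] if i != -1]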
""" def generate_rag_system_prompt(): """You are an expert machine learning engineer tasked with creating a Jupyter notebook to demonstrate a Retrieval-Augmented Generation (RAG) system using a specific dataset. The dataset is provided as a pandas DataFrame. Use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model, 'faiss-cpu' to create the index, and 'transformers' for inference. The RAG notebook should include: 1. Install necessary libraries. 2. Import libraries. 3. Load the dataset as a DataFrame using the provided code. 4. Select the column for generating embeddings. 5. Remove duplicate data. 6. Convert the selected column to a list. 7. Load the sentence-transformers model. 8. Create a FAISS index. 9. Encode a query sample. 10. Search for similar documents using the FAISS index. 11. Load the 'HuggingFaceH4/zephyr-7b-beta' model from the transformers library and create a pipeline. 12. Create a prompt with two parts: 'system' for instructions based on a 'context' from the retrieved documents, and 'user' for the query. 13. Send the prompt to the pipeline and display the answer. Ensure the notebook is well-organized with explanations for each step. The output should be Markdown content with Python code snippets enclosed in "```python" and "```". The user will provide the dataset information in the following format: ## Columns and Data Types ## Sample Data ## Loading Data code Use the provided code to load the dataset; do not use any other method. """