asoria HF staff committed on
Commit
939f6ae
1 Parent(s): 4dc6cd8

Adding filter for numeric and categoric datasets

Browse files
Files changed (2) hide show
  1. app.py +6 -3
  2. utils/notebook_utils.py +12 -2
app.py CHANGED
@@ -15,8 +15,7 @@ from dotenv import load_dotenv
15
  import os
16
 
17
  # TODOS:
18
- # 1. Add cells by data types in EDA notebook
19
- # 2. Add template for RAG and embeddings
20
 
21
  load_dotenv()
22
 
@@ -147,7 +146,11 @@ def generate_cells(dataset_id, cells, notebook_type="eda"):
147
  html_code = f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' width='80%' height='560px'></iframe>"
148
  wildcards = ["{dataset_name}", "{first_code}", "{html_code}"]
149
  replacements = [dataset_id, first_code, html_code]
150
- cells = replace_wildcards(cells, wildcards, replacements)
 
 
 
 
151
  generated_text = ""
152
  # Show only the first 40 lines, would like to have a scroll in gr.Code https://github.com/gradio-app/gradio/issues/9192
153
  viewer_lines = 0
 
15
  import os
16
 
17
  # TODOS:
18
+ # Add template for RAG and embeddings
 
19
 
20
  load_dotenv()
21
 
 
146
  html_code = f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' width='80%' height='560px'></iframe>"
147
  wildcards = ["{dataset_name}", "{first_code}", "{html_code}"]
148
  replacements = [dataset_id, first_code, html_code]
149
+ has_numeric_columns = len(df.select_dtypes(include=["number"]).columns) > 0
150
+ has_categoric_columns = len(df.select_dtypes(include=["object"]).columns) > 0
151
+ cells = replace_wildcards(
152
+ cells, wildcards, replacements, has_numeric_columns, has_categoric_columns
153
+ )
154
  generated_text = ""
155
  # Show only the first 40 lines, would like to have a scroll in gr.Code https://github.com/gradio-app/gradio/issues/9192
156
  viewer_lines = 0
utils/notebook_utils.py CHANGED
@@ -1,4 +1,6 @@
1
- def replace_wildcards(templates, wildcards, replacements):
 
 
2
  if len(wildcards) != len(replacements):
3
  raise ValueError(
4
  "The number of wildcards must match the number of replacements."
@@ -6,6 +8,10 @@ def replace_wildcards(templates, wildcards, replacements):
6
 
7
  new_templates = []
8
  for tmp in templates:
 
 
 
 
9
  tmp_text = tmp["source"]
10
  for wildcard, replacement in zip(wildcards, replacements):
11
  tmp_text = tmp_text.replace(wildcard, replacement)
@@ -75,7 +81,6 @@ import seaborn as sns
75
  # First rows of the dataset and info
76
  print(df.head())
77
  print(df.info())
78
- print(df.describe())
79
  """,
80
  },
81
  {
@@ -107,6 +112,7 @@ print(df.describe())
107
  """,
108
  },
109
  {
 
110
  "cell_type": "code",
111
  "source": """
112
  # Unique values in categorical columns
@@ -118,6 +124,7 @@ df.select_dtypes(include=['object']).nunique()
118
  "source": "## 3. Data Visualization",
119
  },
120
  {
 
121
  "cell_type": "code",
122
  "source": """
123
  # Correlation matrix for numerical columns
@@ -129,6 +136,7 @@ plt.show()
129
  """,
130
  },
131
  {
 
132
  "cell_type": "code",
133
  "source": """
134
  # Distribution plots for numerical columns
@@ -142,6 +150,7 @@ for column in df.select_dtypes(include=['int64', 'float64']).columns:
142
  """,
143
  },
144
  {
 
145
  "cell_type": "code",
146
  "source": """
147
  # Count plots for categorical columns
@@ -155,6 +164,7 @@ for column in df.select_dtypes(include=['object']).columns:
155
  """,
156
  },
157
  {
 
158
  "cell_type": "code",
159
  "source": """
160
  # Box plots for detecting outliers in numerical columns
 
1
+ def replace_wildcards(
2
+ templates, wildcards, replacements, has_numeric_columns, has_categoric_columns
3
+ ):
4
  if len(wildcards) != len(replacements):
5
  raise ValueError(
6
  "The number of wildcards must match the number of replacements."
 
8
 
9
  new_templates = []
10
  for tmp in templates:
11
+ if "type" in tmp and tmp["type"] == "numeric" and not has_numeric_columns:
12
+ continue
13
+ if "type" in tmp and tmp["type"] == "categoric" and not has_categoric_columns:
14
+ continue
15
  tmp_text = tmp["source"]
16
  for wildcard, replacement in zip(wildcards, replacements):
17
  tmp_text = tmp_text.replace(wildcard, replacement)
 
81
  # First rows of the dataset and info
82
  print(df.head())
83
  print(df.info())
 
84
  """,
85
  },
86
  {
 
112
  """,
113
  },
114
  {
115
+ "type": "categoric",
116
  "cell_type": "code",
117
  "source": """
118
  # Unique values in categorical columns
 
124
  "source": "## 3. Data Visualization",
125
  },
126
  {
127
+ "type": "numeric",
128
  "cell_type": "code",
129
  "source": """
130
  # Correlation matrix for numerical columns
 
136
  """,
137
  },
138
  {
139
+ "type": "numeric",
140
  "cell_type": "code",
141
  "source": """
142
  # Distribution plots for numerical columns
 
150
  """,
151
  },
152
  {
153
+ "type": "categoric",
154
  "cell_type": "code",
155
  "source": """
156
  # Count plots for categorical columns
 
164
  """,
165
  },
166
  {
167
+ "type": "numeric",
168
  "cell_type": "code",
169
  "source": """
170
  # Box plots for detecting outliers in numerical columns