Spaces:
Sleeping
Sleeping
Update app.py
#1
by
lhoestq
HF staff
- opened
- analyze.py +3 -5
- app.py +11 -4
analyze.py
CHANGED
@@ -11,7 +11,8 @@ Row = dict[str, Any]
|
|
11 |
T = TypeVar("T")
|
12 |
BATCH_SIZE = 1
|
13 |
MAX_TEXT_LENGTH = 500
|
14 |
-
|
|
|
15 |
|
16 |
|
17 |
class PresidioEntity(TypedDict):
|
@@ -121,16 +122,13 @@ def analyze(
|
|
121 |
def presidio_scan_entities(
|
122 |
rows: Iterable[Row], scanned_columns: list[str], columns_descriptions: list[str]
|
123 |
) -> Iterable[PresidioEntity]:
|
124 |
-
global batch_analyzer
|
125 |
cache: dict[str, list[RecognizerResult]] = {}
|
126 |
-
if batch_analyzer is None:
|
127 |
-
batch_analyser = BatchAnalyzerEngine(AnalyzerEngine())
|
128 |
rows_with_scanned_columns_only = (
|
129 |
{column_name: get_strings(row[column_name])[:MAX_TEXT_LENGTH] for column_name in scanned_columns} for row in rows
|
130 |
)
|
131 |
for indices, batch in batched(rows_with_scanned_columns_only, BATCH_SIZE, with_indices=True):
|
132 |
yield from analyze(
|
133 |
-
batch_analyzer=
|
134 |
batch=batch,
|
135 |
indices=indices,
|
136 |
scanned_columns=scanned_columns,
|
|
|
11 |
T = TypeVar("T")
|
12 |
BATCH_SIZE = 1
|
13 |
MAX_TEXT_LENGTH = 500
|
14 |
+
analyzer = AnalyzerEngine()
|
15 |
+
batch_analyzer = BatchAnalyzerEngine(analyzer)
|
16 |
|
17 |
|
18 |
class PresidioEntity(TypedDict):
|
|
|
122 |
def presidio_scan_entities(
|
123 |
rows: Iterable[Row], scanned_columns: list[str], columns_descriptions: list[str]
|
124 |
) -> Iterable[PresidioEntity]:
|
|
|
125 |
cache: dict[str, list[RecognizerResult]] = {}
|
|
|
|
|
126 |
rows_with_scanned_columns_only = (
|
127 |
{column_name: get_strings(row[column_name])[:MAX_TEXT_LENGTH] for column_name in scanned_columns} for row in rows
|
128 |
)
|
129 |
for indices, batch in batched(rows_with_scanned_columns_only, BATCH_SIZE, with_indices=True):
|
130 |
yield from analyze(
|
131 |
+
batch_analyzer=batch_analyzer,
|
132 |
batch=batch,
|
133 |
indices=indices,
|
134 |
scanned_columns=scanned_columns,
|
app.py
CHANGED
@@ -7,7 +7,7 @@ import pandas as pd
|
|
7 |
from datasets import Features
|
8 |
from gradio_huggingfacehub_search import HuggingfaceHubSearch
|
9 |
|
10 |
-
from analyze import get_column_description, get_columns_with_strings, presidio_scan_entities
|
11 |
|
12 |
MAX_ROWS = 100
|
13 |
T = TypeVar("T")
|
@@ -34,7 +34,7 @@ class track_iter:
|
|
34 |
self.next_idx += 1
|
35 |
yield item
|
36 |
|
37 |
-
def analyze_dataset(dataset: str) -> pd.DataFrame:
|
38 |
info_resp = requests.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
|
39 |
if "error" in info_resp:
|
40 |
yield "❌ " + info_resp["error"], pd.DataFrame()
|
@@ -52,8 +52,9 @@ def analyze_dataset(dataset: str) -> pd.DataFrame:
|
|
52 |
for presidio_entity in presidio_scan_entities(
|
53 |
rows, scanned_columns=scanned_columns, columns_descriptions=columns_descriptions
|
54 |
):
|
55 |
-
|
56 |
-
|
|
|
57 |
yield f"✅ Scanning {dataset} [{rows.next_idx}/{num_rows} rows]:", pd.DataFrame(presidio_entities)
|
58 |
|
59 |
with gr.Blocks() as demo:
|
@@ -65,6 +66,12 @@ with gr.Blocks() as demo:
|
|
65 |
placeholder="Search for dataset id on Huggingface",
|
66 |
search_type="dataset",
|
67 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
]
|
69 |
button = gr.Button("Run Presidio Scan")
|
70 |
outputs = [
|
|
|
7 |
from datasets import Features
|
8 |
from gradio_huggingfacehub_search import HuggingfaceHubSearch
|
9 |
|
10 |
+
from analyze import analyzer, get_column_description, get_columns_with_strings, presidio_scan_entities
|
11 |
|
12 |
MAX_ROWS = 100
|
13 |
T = TypeVar("T")
|
|
|
34 |
self.next_idx += 1
|
35 |
yield item
|
36 |
|
37 |
+
def analyze_dataset(dataset: str, enabled_presidio_entities: str) -> pd.DataFrame:
|
38 |
info_resp = requests.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
|
39 |
if "error" in info_resp:
|
40 |
yield "❌ " + info_resp["error"], pd.DataFrame()
|
|
|
52 |
for presidio_entity in presidio_scan_entities(
|
53 |
rows, scanned_columns=scanned_columns, columns_descriptions=columns_descriptions
|
54 |
):
|
55 |
+
if presidio_entity.type in enabled_presidio_entities:
|
56 |
+
presidio_entities.append(presidio_entity)
|
57 |
+
yield f"⚙️ Scanning {dataset} [{rows.next_idx}/{num_rows} rows]:", pd.DataFrame(presidio_entities)
|
58 |
yield f"✅ Scanning {dataset} [{rows.next_idx}/{num_rows} rows]:", pd.DataFrame(presidio_entities)
|
59 |
|
60 |
with gr.Blocks() as demo:
|
|
|
66 |
placeholder="Search for dataset id on Huggingface",
|
67 |
search_type="dataset",
|
68 |
),
|
69 |
+
gr.CheckBoxGroup(
|
70 |
+
label="Presidio entities",
|
71 |
+
choices=analyzer.get_supported_entities(),
|
72 |
+
value=["PERSON", "CREDIT_CARD", "US_SSN", "PHONE_NUMBER", "EMAIL_ADDRESS", "IP_ADDRESS", "US_BANK_NUMBER", "EMAIL", "IBAN_CODE"],
|
73 |
+
interative=True,
|
74 |
+
),
|
75 |
]
|
76 |
button = gr.Button("Run Presidio Scan")
|
77 |
outputs = [
|