Update app.py

#1
by lhoestq HF staff - opened
Files changed (2) hide show
  1. analyze.py +3 -5
  2. app.py +11 -4
analyze.py CHANGED
@@ -11,7 +11,8 @@ Row = dict[str, Any]
11
  T = TypeVar("T")
12
  BATCH_SIZE = 1
13
  MAX_TEXT_LENGTH = 500
14
- batch_analyzer: Optional[BatchAnalyzerEngine] = None
 
15
 
16
 
17
  class PresidioEntity(TypedDict):
@@ -121,16 +122,13 @@ def analyze(
121
  def presidio_scan_entities(
122
  rows: Iterable[Row], scanned_columns: list[str], columns_descriptions: list[str]
123
  ) -> Iterable[PresidioEntity]:
124
- global batch_analyzer
125
  cache: dict[str, list[RecognizerResult]] = {}
126
- if batch_analyzer is None:
127
- batch_analyser = BatchAnalyzerEngine(AnalyzerEngine())
128
  rows_with_scanned_columns_only = (
129
  {column_name: get_strings(row[column_name])[:MAX_TEXT_LENGTH] for column_name in scanned_columns} for row in rows
130
  )
131
  for indices, batch in batched(rows_with_scanned_columns_only, BATCH_SIZE, with_indices=True):
132
  yield from analyze(
133
- batch_analyzer=batch_analyser,
134
  batch=batch,
135
  indices=indices,
136
  scanned_columns=scanned_columns,
 
11
  T = TypeVar("T")
12
  BATCH_SIZE = 1
13
  MAX_TEXT_LENGTH = 500
14
+ analyzer = AnalyzerEngine()
15
+ batch_analyzer = BatchAnalyzerEngine(analyzer)
16
 
17
 
18
  class PresidioEntity(TypedDict):
 
122
  def presidio_scan_entities(
123
  rows: Iterable[Row], scanned_columns: list[str], columns_descriptions: list[str]
124
  ) -> Iterable[PresidioEntity]:
 
125
  cache: dict[str, list[RecognizerResult]] = {}
 
 
126
  rows_with_scanned_columns_only = (
127
  {column_name: get_strings(row[column_name])[:MAX_TEXT_LENGTH] for column_name in scanned_columns} for row in rows
128
  )
129
  for indices, batch in batched(rows_with_scanned_columns_only, BATCH_SIZE, with_indices=True):
130
  yield from analyze(
131
+ batch_analyzer=batch_analyzer,
132
  batch=batch,
133
  indices=indices,
134
  scanned_columns=scanned_columns,
app.py CHANGED
@@ -7,7 +7,7 @@ import pandas as pd
7
  from datasets import Features
8
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
9
 
10
- from analyze import get_column_description, get_columns_with_strings, presidio_scan_entities
11
 
12
  MAX_ROWS = 100
13
  T = TypeVar("T")
@@ -34,7 +34,7 @@ class track_iter:
34
  self.next_idx += 1
35
  yield item
36
 
37
- def analyze_dataset(dataset: str) -> pd.DataFrame:
38
  info_resp = requests.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
39
  if "error" in info_resp:
40
  yield "❌ " + info_resp["error"], pd.DataFrame()
@@ -52,8 +52,9 @@ def analyze_dataset(dataset: str) -> pd.DataFrame:
52
  for presidio_entity in presidio_scan_entities(
53
  rows, scanned_columns=scanned_columns, columns_descriptions=columns_descriptions
54
  ):
55
- presidio_entities.append(presidio_entity)
56
- yield f"⚙️ Scanning {dataset} [{rows.next_idx}/{num_rows} rows]:", pd.DataFrame(presidio_entities)
 
57
  yield f"✅ Scanning {dataset} [{rows.next_idx}/{num_rows} rows]:", pd.DataFrame(presidio_entities)
58
 
59
  with gr.Blocks() as demo:
@@ -65,6 +66,12 @@ with gr.Blocks() as demo:
65
  placeholder="Search for dataset id on Huggingface",
66
  search_type="dataset",
67
  ),
 
 
 
 
 
 
68
  ]
69
  button = gr.Button("Run Presidio Scan")
70
  outputs = [
 
7
  from datasets import Features
8
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
9
 
10
+ from analyze import analyzer, get_column_description, get_columns_with_strings, presidio_scan_entities
11
 
12
  MAX_ROWS = 100
13
  T = TypeVar("T")
 
34
  self.next_idx += 1
35
  yield item
36
 
37
+ def analyze_dataset(dataset: str, enabled_presidio_entities: str) -> pd.DataFrame:
38
  info_resp = requests.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
39
  if "error" in info_resp:
40
  yield "❌ " + info_resp["error"], pd.DataFrame()
 
52
  for presidio_entity in presidio_scan_entities(
53
  rows, scanned_columns=scanned_columns, columns_descriptions=columns_descriptions
54
  ):
55
+ if presidio_entity.type in enabled_presidio_entities:
56
+ presidio_entities.append(presidio_entity)
57
+ yield f"⚙️ Scanning {dataset} [{rows.next_idx}/{num_rows} rows]:", pd.DataFrame(presidio_entities)
58
  yield f"✅ Scanning {dataset} [{rows.next_idx}/{num_rows} rows]:", pd.DataFrame(presidio_entities)
59
 
60
  with gr.Blocks() as demo:
 
66
  placeholder="Search for dataset id on Huggingface",
67
  search_type="dataset",
68
  ),
69
+ gr.CheckBoxGroup(
70
+ label="Presidio entities",
71
+ choices=analyzer.get_supported_entities(),
72
+ value=["PERSON", "CREDIT_CARD", "US_SSN", "PHONE_NUMBER", "EMAIL_ADDRESS", "IP_ADDRESS", "US_BANK_NUMBER", "EMAIL", "IBAN_CODE"],
73
+ interative=True,
74
+ ),
75
  ]
76
  button = gr.Button("Run Presidio Scan")
77
  outputs = [