Zamanonymize3

Sleeping

App Files Files Community

kcelia committed on Mar 22

Commit

174cd37

•

1 Parent(s): bbc133a

chore: update the space with layout

Browse files

Files changed (17) hide show

README.md +1 -1
anonymize_file_clear.py +70 -39
app.py +284 -82
demo_text.txt +0 -1
encrypted_anonymization_diagram.jpg +0 -0
fhe_anonymizer.py +93 -43
files/anonymized_document.txt +6 -6
files/chatgpt_prompt.txt +1 -0
models/embedded_model.model → files/mapping_clear_to_anonymized.pkl +2 -2
files/original_document.txt +1 -1
files/original_document_uuid_mapping.json +1 -0
models/embedded_model.model.wv.vectors_ngrams.npy +0 -3
models/without_pronoun_cml_xgboost.model +0 -3
models/without_pronoun_embedded_model.model +0 -3
models/without_pronoun_embedded_model.model.wv.vectors_ngrams.npy +0 -3
original_document_uuid_mapping.json +0 -1
utils_demo.py +124 -13

README.md CHANGED Viewed

@@ -15,7 +15,7 @@ tags:
   - data anonymization
   - homomorphic encryption
   - security
-python_version: 3.10
 ---
 # Data Anonymization using FHE

   - data anonymization
   - homomorphic encryption
   - security
+python_version: 3.10.12
 ---
 # Data Anonymization using FHE

anonymize_file_clear.py CHANGED Viewed

@@ -1,25 +1,28 @@
 import argparse
-import json
 import re
 import uuid
-from pathlib import Path
-import gensim
 from concrete.ml.common.serialization.loaders import load
-from transformers import AutoTokenizer, AutoModel
-from utils_demo import get_batch_text_representation
 def load_models():
-    base_dir = Path(__file__).parent / "models"
-    # Load tokenizer and model
-    tokenizer = AutoTokenizer.from_pretrained("obi/deid_roberta_i2b2")
-    embeddings_model = AutoModel.from_pretrained("obi/deid_roberta_i2b2")
-    with open(base_dir / "cml_logreg.model", "r") as model_file:
-        fhe_ner_detection = load(file=model_file)
-    return embeddings_model, tokenizer, fhe_ner_detection
-def anonymize_text(text, embeddings_model, tokenizer, fhe_ner_detection):
     token_pattern = r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)"
     tokens = re.findall(token_pattern, text)
     uuid_map = {}
@@ -28,9 +31,9 @@ def anonymize_text(text, embeddings_model, tokenizer, fhe_ner_detection):
     for token in tokens:
         if token.strip() and re.match(r"\w+", token):  # If the token is a word
             x = get_batch_text_representation([token], embeddings_model, tokenizer)
-            prediction_proba = fhe_ner_detection.predict_proba(x)
             probability = prediction_proba[0][1]
-            prediction = probability >= 0.5
             if prediction:
                 if token not in uuid_map:
                     uuid_map[token] = str(uuid.uuid4())[:8]
@@ -40,41 +43,69 @@ def anonymize_text(text, embeddings_model, tokenizer, fhe_ner_detection):
         else:
             processed_tokens.append(token)  # Preserve punctuation and spaces as is
-    anonymized_text = ''.join(processed_tokens)
     return anonymized_text, uuid_map
-def main():
-    parser = argparse.ArgumentParser(description="Anonymize named entities in a text file and save the mapping to a JSON file.")
-    parser.add_argument("file_path", type=str, help="The path to the file to be processed.")
-    args = parser.parse_args()
-    embeddings_model, tokenizer, fhe_ner_detection = load_models()
-    # Read the input file
-    with open(args.file_path, 'r', encoding='utf-8') as file:
-        text = file.read()
     # Save the original text to its specified file
-    original_file_path = Path(__file__).parent / "files" / "original_document.txt"
-    with open(original_file_path, 'w', encoding='utf-8') as original_file:
-        original_file.write(text)
     # Anonymize the text
-    anonymized_text, uuid_map = anonymize_text(text, embeddings_model, tokenizer, fhe_ner_detection)
     # Save the anonymized text to its specified file
-    anonymized_file_path = Path(__file__).parent / "files" / "anonymized_document.txt"
-    with open(anonymized_file_path, 'w', encoding='utf-8') as anonymized_file:
-        anonymized_file.write(anonymized_text)
     # Save the UUID mapping to a JSON file
-    mapping_path = Path(args.file_path).stem + "_uuid_mapping.json"
-    with open(mapping_path, 'w', encoding='utf-8') as file:
-        json.dump(uuid_map, file, indent=4, sort_keys=True)
-    print(f"Original text saved to {original_file_path}")
-    print(f"Anonymized text saved to {anonymized_file_path}")
-    print(f"UUID mapping saved to {mapping_path}")
 if __name__ == "__main__":
-    main()

 import argparse
 import re
 import uuid
+from transformers import AutoModel, AutoTokenizer
 from concrete.ml.common.serialization.loaders import load
+from utils_demo import *
 def load_models():
+    # Load the tokenizer and the embedding model
+    try:
+        tokenizer = AutoTokenizer.from_pretrained("obi/deid_roberta_i2b2")
+        embeddings_model = AutoModel.from_pretrained("obi/deid_roberta_i2b2")
+    except:
+        print("Error while loading Roberta")
+    # Load the CML trained model
+    with open(LOGREG_MODEL_PATH, "r") as model_file:
+        cml_ner_model = load(file=model_file)
+    return embeddings_model, tokenizer, cml_ner_model
+def anonymize_with_cml(text, embeddings_model, tokenizer, cml_ner_model):
     token_pattern = r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)"
     tokens = re.findall(token_pattern, text)
     uuid_map = {}
     for token in tokens:
         if token.strip() and re.match(r"\w+", token):  # If the token is a word
             x = get_batch_text_representation([token], embeddings_model, tokenizer)
+            prediction_proba = cml_ner_model.predict_proba(x, fhe="disable")
             probability = prediction_proba[0][1]
+            prediction = probability >= 0.77
             if prediction:
                 if token not in uuid_map:
                     uuid_map[token] = str(uuid.uuid4())[:8]
         else:
             processed_tokens.append(token)  # Preserve punctuation and spaces as is
+    anonymized_text = "".join(processed_tokens)
     return anonymized_text, uuid_map
+def anonymize_text(text, verbose=False, save=False):
+    # Load models
+    if verbose:
+        print("Loading models..")
+    embeddings_model, tokenizer, cml_ner_model = load_models()
+    if verbose:
+        print(f"\nText to process:--------------------\n{text}\n--------------------\n")
     # Save the original text to its specified file
+    if save:
+        write_txt(ORIGINAL_FILE_PATH, text)
     # Anonymize the text
+    anonymized_text, uuid_map = anonymize_with_cml(text, embeddings_model, tokenizer, cml_ner_model)
     # Save the anonymized text to its specified file
+    if save:
+        mapping = {o: (i, a) for i, (o, a) in enumerate(zip(text.split("\n\n"), anonymized_text.split("\n\n")))}
+        write_txt(ANONYMIZED_FILE_PATH, anonymized_text)
+        write_pickle(MAPPING_SENTENCES_PATH, mapping)
+    if verbose:
+        print(f"\nAnonymized text:--------------------\n{anonymized_text}\n--------------------\n")
     # Save the UUID mapping to a JSON file
+    if save:
+        write_json(MAPPING_UUID_PATH, uuid_map)
+    if verbose and save:
+        print(f"Original text saved to    :{ORIGINAL_FILE_PATH}")
+        print(f"Anonymized text saved to  :{ANONYMIZED_FILE_PATH}")
+        print(f"UUID mapping saved to     :{MAPPING_UUID_PATH}")
+        print(f"Sentence mapping saved to :{MAPPING_SENTENCES_PATH}")
+    return anonymized_text
 if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Anonymize named entities in a text file and save the mapping to a JSON file."
+    )
+    parser.add_argument(
+        "--file_path",
+        type=str,
+        default="files/original_document.txt",
+        help="The path to the file to be processed.",
+    )
+    parser.add_argument(
+        "--verbose",
+        type=bool,
+        default=True,
+        help="This provides additional details about the program's execution.",
+    )
+    parser.add_argument("--save", type=bool, default=True, help="Save the files.")
+    args = parser.parse_args()
+    text = read_txt(args.file_path)
+    anonymize_text(text, verbose=args.verbose, save=args.save)

app.py CHANGED Viewed

@@ -1,35 +1,102 @@
 """A Gradio app for anonymizing text data using FHE."""
 import gradio as gr
-from fhe_anonymizer import FHEAnonymizer
 import pandas as pd
 from openai import OpenAI
-import os
-import json
-import re
 from utils_demo import *
-from typing import List, Dict, Tuple
 anonymizer = FHEAnonymizer()
-client = OpenAI(
-    api_key=os.environ.get("openaikey"),
-)
-def check_user_query_fn(user_query: str) -> Dict:
-    if is_user_query_valid(user_query):
-        # TODO: check if the query is related to our context
-        error_msg = ("Unable to process ❌: The request exceeds the length limit or falls "
-                    "outside the scope of this document. Please refine your query.")
-        print(error_msg)
-        return {input_text: gr.update(value=error_msg)}
     else:
-        # Collapsing Multiple Spaces
-        return {input_text: gr.update(value=re.sub(" +", " ", user_query))}
-def deidentify_text(input_text):
-    anonymized_text, identified_words_with_prob = anonymizer(input_text)
     # Convert the list of identified words and probabilities into a DataFrame
     if identified_words_with_prob:
@@ -41,18 +108,35 @@ def deidentify_text(input_text):
     return anonymized_text, identified_df
-def query_chatgpt(anonymized_query):
-    with open("files/anonymized_document.txt", "r") as file:
-        anonymized_document = file.read()
-    with open("files/chatgpt_prompt.txt", "r") as file:
-        prompt = file.read()
     # Prepare prompt
-    full_prompt = (
-        prompt + "\n"
     )
-    query = "Document content:\n```\n" + anonymized_document + "\n\n```" + "Query:\n```\n" + anonymized_query + "\n```"
     print(full_prompt)
     completion = client.chat.completions.create(
@@ -63,16 +147,16 @@ def query_chatgpt(anonymized_query):
         ],
     )
     anonymized_response = completion.choices[0].message.content
-    with open("original_document_uuid_mapping.json", "r") as file:
-        uuid_map = json.load(file)
-    inverse_uuid_map = {v: k for k, v in uuid_map.items()}  # TODO load the inverse mapping from disk for efficiency
     # Pattern to identify words and non-words (including punctuation, spaces, etc.)
-    token_pattern = r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)"
-    tokens = re.findall(token_pattern, anonymized_response)
     processed_tokens = []
     for token in tokens:
         # Directly append non-word tokens or whitespace to processed_tokens
         if not token.strip() or not re.match(r"\w+", token):
@@ -87,12 +171,6 @@ def query_chatgpt(anonymized_query):
     return anonymized_response, deanonymized_response
-with open("files/original_document.txt", "r") as file:
-    original_document = file.read()
-with open("files/anonymized_document.txt", "r") as file:
-    anonymized_document = file.read()
 demo = gr.Blocks(css=".markdown-body { font-size: 18px; }")
 with demo:
@@ -108,80 +186,204 @@ with demo:
             —
             <a href="https://docs.zama.ai/concrete-ml"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/documentation.png">Documentation</a>
             —
-            <a href="https://zama.ai/community"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/community.png">Community</a>
             —
             <a href="https://twitter.com/zama_fhe"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/x.png">@zama_fhe</a>
         </p>
         """
     )
-    gr.Markdown(
         """
-        <p align="center">
-            <img width="30%" height="25%" src="./encrypted_anonymization_diagram.jpg">
-        </p>
         """
     )
-    with gr.Accordion("What is Encrypted Anonymization?", open=False):
-        gr.Markdown(
-            """
-            Encrypted Anonymization leverages Fully Homomorphic Encryption (FHE) to
-            protect sensitive information during data processing. This approach allows for the
-            anonymization of text data, such as personal identifiers, while ensuring that the data
-            remains encrypted throughout the entire process.
-            """
-        )
     ########################## Main document Part ##########################
     with gr.Row():
         with gr.Column():
-            original_doc_box = gr.Textbox(label="Original Document:", value=original_document, interactive=True)
         with gr.Column():
-            anonymized_doc_box = gr.Textbox(label="Anonymized Document:", value=anonymized_document, interactive=False)
     ########################## User Query Part ##########################
     with gr.Row():
-        input_text = gr.Textbox(value="Who lives in Maine?", label="User query", interactive=True)
-        default_query_box = gr.Radio(choices=list(DEFAULT_QUERIES.keys()), label="Example Queries")
-        default_query_box.change(
-            fn=lambda default_query_box: DEFAULT_QUERIES[default_query_box],
-            inputs=[default_query_box],
-            outputs=[input_text]
-        )
-        input_text.change(
-            check_user_query_fn,
-            inputs=[input_text],
-            outputs=[input_text],
-        )
-    anonymized_text_output = gr.Textbox(label="Anonymized Text with FHE", lines=1, interactive=True)
-    identified_words_output = gr.Dataframe(label="Identified Words", visible=False)
-    submit_button = gr.Button("Anonymize with FHE")
-    submit_button.click(
-        deidentify_text,
-        inputs=[input_text],
         outputs=[anonymized_text_output, identified_words_output],
     )
-    with gr.Row():
-        chatgpt_response_anonymized = gr.Textbox(label="ChatGPT Anonymized Response", lines=13)
-        chatgpt_response_deanonymized = gr.Textbox(label="ChatGPT Deanonymized Response", lines=13)
     chatgpt_button = gr.Button("Query ChatGPT")
     chatgpt_button.click(
-        query_chatgpt,
-        inputs=[anonymized_text_output],
         outputs=[chatgpt_response_anonymized, chatgpt_response_deanonymized],
     )
 # Launch the app
 demo.launch(share=False)

 """A Gradio app for anonymizing text data using FHE."""
+import os
+import re
+from typing import Dict, List
 import gradio as gr
 import pandas as pd
+from fhe_anonymizer import FHEAnonymizer
 from openai import OpenAI
 from utils_demo import *
+ORIGINAL_DOCUMENT = read_txt(ORIGINAL_FILE_PATH).split("\n\n")
+ANONYMIZED_DOCUMENT = read_txt(ANONYMIZED_FILE_PATH)
+MAPPING_SENTENCES = read_pickle(MAPPING_SENTENCES_PATH)
+clean_directory()
 anonymizer = FHEAnonymizer()
+client = OpenAI(api_key=os.environ.get("openaikey"))
+def select_static_sentences_fn(selected_sentences: List):
+    selected_sentences = [MAPPING_SENTENCES[sentence] for sentence in selected_sentences]
+    anonymized_selected_sentence = sorted(selected_sentences, key=lambda x: x[0])
+    anonymized_selected_sentence = [sentence for _, sentence in anonymized_selected_sentence]
+    return {anonymized_doc_box: gr.update(value="\n\n".join(anonymized_selected_sentence))}
+def key_gen_fn() -> Dict:
+    """Generate keys for a given user.
+    Returns:
+        dict: A dictionary containing the generated keys and related information.
+    """
+    print("Key Gen..")
+    anonymizer.generate_key()
+    evaluation_key_path = KEYS_DIR / "evaluation_key"
+    if not evaluation_key_path.is_file():
+        error_message = (
+            f"Error Encountered While generating the evaluation {evaluation_key_path.is_file()=}"
+        )
+        print(error_message)
+        return {gen_key_btn: gr.update(value=error_message)}
     else:
+        return {gen_key_btn: gr.update(value="Keys have been generated ✅")}
+def encrypt_query_fn(query):
+    print(f"Query: {query}")
+    evaluation_key_path = KEYS_DIR / "evaluation_key"
+    if not evaluation_key_path.is_file():
+        error_message = "Error ❌: Please generate the key first!"
+        return {output_encrypted_box: gr.update(value=error_message)}
+    if is_user_query_valid(query):
+        # TODO: check if the query is related to our context
+        error_msg = (
+            "Unable to process ❌: The request exceeds the length limit or falls "
+            "outside the scope of this document. Please refine your query."
+        )
+        print(error_msg)
+        return {query_box: gr.update(value=error_msg)}
+    anonymizer.encrypt_query(query)
+    encrypted_tokens = read_pickle(KEYS_DIR / "encrypted_quantized_query")
+    encrypted_quant_tokens_hex = [token.hex()[500:510] for token in encrypted_tokens]
+    return {output_encrypted_box: gr.update(value=" ".join(encrypted_quant_tokens_hex))}
+def run_fhe_fn(query_box):
+    evaluation_key_path = KEYS_DIR / "evaluation_key"
+    if not evaluation_key_path.is_file():
+        error_message = "Error ❌: Please generate the key first!"
+        return {anonymized_text_output: gr.update(value=error_message)}
+    encryted_query_path = KEYS_DIR / "encrypted_quantized_query"
+    if not encryted_query_path.is_file():
+        error_message = "Error ❌: Please encrypt your query first!"
+        return {anonymized_text_output: gr.update(value=error_message)}
+    anonymizer.run_server_and_decrypt_output(query_box)
+    anonymized_text = read_pickle(KEYS_DIR / "reconstructed_sentence")
+    identified_words_with_prob = read_pickle(KEYS_DIR / "identified_words_with_prob")
     # Convert the list of identified words and probabilities into a DataFrame
     if identified_words_with_prob:
     return anonymized_text, identified_df
+def query_chatgpt_fn(anonymized_query, anonymized_document):
+    evaluation_key_path = KEYS_DIR / "evaluation_key"
+    if not evaluation_key_path.is_file():
+        error_message = "Error ❌: Please generate the key first!"
+        return {anonymized_text_output: gr.update(value=error_message)}
+    encryted_query_path = KEYS_DIR / "encrypted_quantized_query"
+    if not encryted_query_path.is_file():
+        error_message = "Error ❌: Please encrypt your query first!"
+        return {anonymized_text_output: gr.update(value=error_message)}
+    decrypted_query_path = KEYS_DIR / "reconstructed_sentence"
+    if not decrypted_query_path.is_file():
+        error_message = "Error ❌: Please run the FHE computation first!"
+        return {anonymized_text_output: gr.update(value=error_message)}
+    prompt = read_txt(PROMPT_PATH)
     # Prepare prompt
+    full_prompt = prompt + "\n"
+    query = (
+        "Document content:\n```\n"
+        + anonymized_document
+        + "\n\n```"
+        + "Query:\n```\n"
+        + anonymized_query
+        + "\n```"
     )
     print(full_prompt)
     completion = client.chat.completions.create(
         ],
     )
     anonymized_response = completion.choices[0].message.content
+    uuid_map = read_json(MAPPING_UUID_PATH)
+    inverse_uuid_map = {
+        v: k for k, v in uuid_map.items()
+    }  # TODO load the inverse mapping from disk for efficiency
     # Pattern to identify words and non-words (including punctuation, spaces, etc.)
+    tokens = re.findall(r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)", anonymized_response)
     processed_tokens = []
     for token in tokens:
         # Directly append non-word tokens or whitespace to processed_tokens
         if not token.strip() or not re.match(r"\w+", token):
     return anonymized_response, deanonymized_response
 demo = gr.Blocks(css=".markdown-body { font-size: 18px; }")
 with demo:
             —
             <a href="https://docs.zama.ai/concrete-ml"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/documentation.png">Documentation</a>
             —
+            <a href=" https://community.zama.ai/c/concrete-ml/8"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/community.png">Community</a>
             —
             <a href="https://twitter.com/zama_fhe"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/x.png">@zama_fhe</a>
         </p>
         """
     )
+    # gr.Markdown(
+    #     """
+    #     <p align="center">
+    #         <img width="15%" height="15%" src="./encrypted_anonymization_diagram.jpg">
+    #     </p>
+    #     """
+    # )
+    with gr.Accordion("What is encrypted anonymization?", open=False):
+        gr.Markdown(
+<<<<<<< HEAD
         """
+        Anonymization is the process of removing personally identifiable information (PII)
+=======
+            """Anonymization is the process of removing personally identifiable information (PII)
+>>>>>>> 053bec9 (chore: update with marketing remarks)
+        from data to protect individual privacy.
+        To resolve trust issues when deploying anonymization as a cloud service, Fully Homomorphic
+        Encryption (FHE) can be used to preserve the privacy of the original data using
+        encryption.
+        The data remains encrypted throughout the anonymization process, eliminating the need for
+        third-party access to the raw data. Once the data is anonymized, it can safely be sent
+        to GenAI services such as ChatGPT.
         """
+        )
+    ########################## Key Gen Part ##########################
+    gr.Markdown(
+        "### Key generation\n\n"
+        """In FHE schemes, two sets of keys are generated. First, secret keys are used for
+        encrypting and decrypting data owned by the client. Second, evaluation keys allow a server
+        to blindly process the encrypted data. """
     )
+    gen_key_btn = gr.Button("Generate the private and evaluation keys")
+    gen_key_btn.click(
+        key_gen_fn,
+        inputs=[],
+        outputs=[gen_key_btn],
+    )
     ########################## Main document Part ##########################
+    gr.Markdown("## Private document")
     with gr.Row():
         with gr.Column():
+            gr.Markdown(
+                """This document was retrieved from the [Microsoft Presidio](https://huggingface.co/spaces/presidio/presidio_demo) demo.\n\n
+                You can select and deselect sentences to customize the document that will be used
+                as the initial prompt for ChatGPT in this space's final stage.\n\n
+                """
+            )
+        with gr.Column():
+            gr.Markdown(
+                """You can see the anonymized document that is sent to ChatGPT here.
+                ChatGPT will answer any queries that you have about the document below.
+                The anonymized information is replaced with hexadecimal strings.
+                """
+            )
+    with gr.Row():
         with gr.Column():
+            original_sentences_box = gr.CheckboxGroup(
+                ORIGINAL_DOCUMENT, value=ORIGINAL_DOCUMENT, label="Original document:"
+            )
+        with gr.Column():
+            anonymized_doc_box = gr.Textbox(
+                label="Anonymized document:", value=ANONYMIZED_DOCUMENT, interactive=False, lines=11
+            )
+    original_sentences_box.change(
+        fn=select_static_sentences_fn,
+        inputs=[original_sentences_box],
+        outputs=[anonymized_doc_box],
+    )
     ########################## User Query Part ##########################
+    gr.Markdown("<hr />")
+    gr.Markdown("## Private query")
+    gr.Markdown(
+        """Now, formulate a query regarding the selected document.\n\n
+                Choose from predefined options in 'Example Queries' or craft a custom query
+                in the 'User Query' box. Keep your question concise and relevant to the text's
+                context. Any off-topic question will not be processed.
+                """
+    )
     with gr.Row():
+        with gr.Column(scale=5):
+            with gr.Column(scale=5):
+                default_query_box = gr.Dropdown(
+                    list(DEFAULT_QUERIES.values()), label="Example queries"
+                )
+            query_box = gr.Textbox(
+                value="Who lives in Maine?", label="User query", interactive=True
+            )
+            default_query_box.change(
+                fn=lambda default_query_box: default_query_box,
+                inputs=[default_query_box],
+                outputs=[query_box],
+            )
+        with gr.Column(scale=1, min_width=6):
+            gr.HTML("<div style='height: 25px;'></div>")
+            gr.Markdown(
+                """
+                <p align="center">
+                Encrypt data locally with FHE 💻 ⚙️
+                </p>
+                """
+            )
+            encrypt_btn = gr.Button("Encrypt data")
+            gr.HTML("<div style='height: 25px;'></div>")
+        with gr.Column(scale=5):
+            output_encrypted_box = gr.Textbox(
+                label="Encrypted anonymized query that is sent to the anonymization server", lines=6
+            )
+    encrypt_btn.click(
+        fn=encrypt_query_fn, inputs=[query_box], outputs=[query_box, output_encrypted_box]
+    )
+    gr.Markdown("<hr />")
+    gr.Markdown("## Secure anonymization with FHE")
+    gr.Markdown(
+        """
+        Once the client encrypts the private query locally,
+        the client transmits it to a remote server to perform the
+        anonymization on encrypted data. When the computation is finished, the server returns
+        the result to the client for decryption.
+        """
+    )
+    run_fhe_btn = gr.Button("Anonymize with FHE")
+    anonymized_text_output = gr.Textbox(
+        label="Decrypted anonymized query that will be sent to ChatGPT", lines=1, interactive=True
+    )
+    identified_words_output = gr.Dataframe(label="Identified words", visible=False)
+    run_fhe_btn.click(
+        run_fhe_fn,
+        inputs=[query_box],
         outputs=[anonymized_text_output, identified_words_output],
     )
+    gr.Markdown("<hr />")
+    gr.Markdown("## Secure your communication on ChatGPT with anonymized queries")
+    gr.Markdown(
+        """After securely anonymizing the query with FHE,
+                you can forward it to ChatGPT without any concern for information leakage."""
+    )
     chatgpt_button = gr.Button("Query ChatGPT")
+    with gr.Row():
+        chatgpt_response_anonymized = gr.Textbox(label="ChatGPT anonymized response", lines=13)
+        chatgpt_response_deanonymized = gr.Textbox(
+            label="ChatGPT non-anonymized response", lines=13
+        )
     chatgpt_button.click(
+        query_chatgpt_fn,
+        inputs=[anonymized_text_output, anonymized_doc_box],
         outputs=[chatgpt_response_anonymized, chatgpt_response_deanonymized],
     )
+    gr.Markdown(
+        """**Please Note**: As this space is intended solely for demonstration purposes, some
+        private information may be missed by the anonymization algorithm. Please validate the
+        following query before sending it to ChatGPT."""
+    )
+<<<<<<< HEAD
+=======
+>>>>>>> 053bec9 (chore: update with marketing remarks)
 # Launch the app
 demo.launch(share=False)

demo_text.txt DELETED Viewed

	@@ -1 +0,0 @@
1	- who lives in Maine?

encrypted_anonymization_diagram.jpg DELETED Viewed

Binary file (94.7 kB)

fhe_anonymizer.py CHANGED Viewed

@@ -1,73 +1,123 @@
-import gensim
 import re
-from concrete.ml.deployment import FHEModelClient, FHEModelServer
 from pathlib import Path
 from concrete.ml.common.serialization.loaders import load
-import uuid
-import json
-from transformers import AutoTokenizer, AutoModel
-from utils_demo import get_batch_text_representation
-base_dir = Path(__file__).parent
 class FHEAnonymizer:
-    def __init__(self, punctuation_list=".,!?:;"):
         # Load tokenizer and model
         self.tokenizer = AutoTokenizer.from_pretrained("obi/deid_roberta_i2b2")
         self.embeddings_model = AutoModel.from_pretrained("obi/deid_roberta_i2b2")
-        self.punctuation_list = punctuation_list
-        with open(base_dir / "original_document_uuid_mapping.json", 'r') as file:
-            self.uuid_map = json.load(file)
-        path_to_model = (base_dir / "deployment").resolve()
-        self.client = FHEModelClient(path_to_model)
-        self.server = FHEModelServer(path_to_model)
         self.client.generate_private_and_evaluation_keys()
         self.evaluation_key = self.client.get_serialized_evaluation_keys()
-    def fhe_inference(self, x):
-        enc_x = self.client.quantize_encrypt_serialize(x)
-        enc_y = self.server.run(enc_x, self.evaluation_key)
-        y = self.client.deserialize_decrypt_dequantize(enc_y)
-        return y
-    def __call__(self, text: str):
         # Pattern to identify words and non-words (including punctuation, spaces, etc.)
-        token_pattern = r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)"
-        tokens = re.findall(token_pattern, text)
-        identified_words_with_prob = []
-        processed_tokens = []
         for token in tokens:
-            # Directly append non-word tokens or whitespace to processed_tokens
-            if not token.strip() or not re.match(r"\w+", token):
-                processed_tokens.append(token)
                 continue
             # Prediction for each word
-            x = get_batch_text_representation([token], self.embeddings_model, self.tokenizer)
-            prediction_proba = self.fhe_inference(x)
-            probability = prediction_proba[0][1]
-            if probability >= 0.5:
-                identified_words_with_prob.append((token, probability))
-                # Use the existing UUID if available, otherwise generate a new one
-                tmp_uuid = self.uuid_map.get(token, str(uuid.uuid4())[:8])
-                processed_tokens.append(tmp_uuid)
-                self.uuid_map[token] = tmp_uuid
             else:
-                processed_tokens.append(token)
-        # Update the UUID map with query.
-        with open(base_dir / "original_document_uuid_mapping.json", 'w') as file:
-            json.dump(self.uuid_map, file)
-        # Reconstruct the sentence
-        reconstructed_sentence = ''.join(processed_tokens)
-        return reconstructed_sentence, identified_words_with_prob

+import json
 import re
+import time
+import uuid
 from pathlib import Path
+from transformers import AutoModel, AutoTokenizer
+from utils_demo import *
 from concrete.ml.common.serialization.loaders import load
+from concrete.ml.deployment import FHEModelClient, FHEModelServer
+TOLERANCE_PROBA = 0.77
+CURRENT_DIR = Path(__file__).parent
+DEPLOYMENT_DIR = CURRENT_DIR / "deployment"
+KEYS_DIR = DEPLOYMENT_DIR / ".fhe_keys"
 class FHEAnonymizer:
+    def __init__(self):
         # Load tokenizer and model
         self.tokenizer = AutoTokenizer.from_pretrained("obi/deid_roberta_i2b2")
         self.embeddings_model = AutoModel.from_pretrained("obi/deid_roberta_i2b2")
+        self.punctuation_list = PUNCTUATION_LIST
+        self.uuid_map = read_json(MAPPING_UUID_PATH)
+        self.client = FHEModelClient(DEPLOYMENT_DIR, key_dir=KEYS_DIR)
+        self.server = FHEModelServer(DEPLOYMENT_DIR)
+    def generate_key(self):
+        clean_directory()
+        # Creates the private and evaluation keys on the client side
         self.client.generate_private_and_evaluation_keys()
+        # Get the serialized evaluation keys
         self.evaluation_key = self.client.get_serialized_evaluation_keys()
+        assert isinstance(self.evaluation_key, bytes)
+        evaluation_key_path = KEYS_DIR / "evaluation_key"
+        with evaluation_key_path.open("wb") as f:
+            f.write(self.evaluation_key)
+    def encrypt_query(self, text: str):
         # Pattern to identify words and non-words (including punctuation, spaces, etc.)
+        tokens = re.findall(r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)", text)
+        encrypted_tokens = []
         for token in tokens:
+            if bool(re.match(r"^\s+$", token)):
                 continue
+            # Directly append non-word tokens or whitespace to processed_tokens
             # Prediction for each word
+            emb_x = get_batch_text_representation([token], self.embeddings_model, self.tokenizer)
+            encrypted_x = self.client.quantize_encrypt_serialize(emb_x)
+            assert isinstance(encrypted_x, bytes)
+            encrypted_tokens.append(encrypted_x)
+        write_pickle(KEYS_DIR / f"encrypted_quantized_query", encrypted_tokens)
+    def run_server(self):
+        encrypted_tokens = read_pickle(KEYS_DIR / f"encrypted_quantized_query")
+        encrypted_output, timing = [], []
+        for enc_x in encrypted_tokens:
+            start_time = time.time()
+            enc_y = self.server.run(enc_x, self.evaluation_key)
+            timing.append((time.time() - start_time) / 60.0)
+            encrypted_output.append(enc_y)
+        write_pickle(KEYS_DIR / f"encrypted_output", encrypted_output)
+        write_pickle(KEYS_DIR / f"encrypted_timing", timing)
+        return encrypted_output, timing
+    def decrypt_output(self, text):
+        encrypted_output = read_pickle(KEYS_DIR / f"encrypted_output")
+        tokens = re.findall(r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)", text)
+        decrypted_output, identified_words_with_prob = [], []
+        i = 0
+        for token in tokens:
+            # Directly append non-word tokens or whitespace to processed_tokens
+            if bool(re.match(r"^\s+$", token)):
+                continue
             else:
+                encrypted_token = encrypted_output[i]
+                prediction_proba = self.client.deserialize_decrypt_dequantize(encrypted_token)
+                probability = prediction_proba[0][1]
+                i += 1
+                if probability >= TOLERANCE_PROBA:
+                    identified_words_with_prob.append((token, probability))
+                    # Use the existing UUID if available, otherwise generate a new one
+                    tmp_uuid = self.uuid_map.get(token, str(uuid.uuid4())[:8])
+                    decrypted_output.append(tmp_uuid)
+                    self.uuid_map[token] = tmp_uuid
+                else:
+                    decrypted_output.append(token)
+            # Update the UUID map with query.
+            with open(MAPPING_UUID_PATH, "w") as file:
+                json.dump(self.uuid_map, file)
+        write_pickle(KEYS_DIR / f"reconstructed_sentence", " ".join(decrypted_output))
+        write_pickle(KEYS_DIR / f"identified_words_with_prob", identified_words_with_prob)
+    def run_server_and_decrypt_output(self, text):
+        self.run_server()
+        self.decrypt_output(text)

files/anonymized_document.txt CHANGED Viewed

@@ -1,10 +1,10 @@
-84381322, my name is 8b9ec610 8c6d3442 and I live in 269b9686.
-My credit card number is c075beec and my crypto wallet id is 54344fd4.
-On 9d6193ab 57c4ba7a I visited ea9cc7db and sent an email to d2934e4f,  from the IP 1a26727d.
-My 694a9044: 8d6f2b87 and my phone number: 6491a9cd 2a61cfbc.
-This is a valid a1cc4c7e 46e4a44b Account Number: de6fd087 . Can you please check the status on bank account 9277229c?
-4571d08d's social security number is 095fa9c8.  290451c3 driver license? it is 778679d7.

+Hello, my name is ebe99761 53a9291d and I live in 6337f12f.
+My credit card number is e5b499b0 and my crypto wallet id is ac41d58b.
+On September 18 I visited 0d574451 and sent an email to 1f78e797,  from the IP 116fe81e.
+My passport: 59a83e41 and my phone number: 144a2acc d9e5704e.
+This is a valid 71d0f51c Bank Account Number: 5ca977a4. Can you please check the status on bank account 9eb07461?
+b474d794's social security number is d8da62f1.  Her driver license? it is 5e63c327.

files/chatgpt_prompt.txt CHANGED Viewed

@@ -5,5 +5,6 @@ Details:
 - Sensitive information includes: names, locations, credit card numbers, email addresses, IP addresses, passport details, phone numbers, bank accounts, social security numbers, and driver's licenses.
 - Each piece of information is represented by a unique identifier, maintaining privacy while discussing document content.
 - Your role is to interpret the document's anonymized content and accurately respond to queries using the identifiers.
 - Consistency in identifiers is crucial for connecting the text with the queries correctly.
 - You must not discuss the anonymized nature of the text and use the identifiers as if they were real words for a smooth chat with users.

 - Sensitive information includes: names, locations, credit card numbers, email addresses, IP addresses, passport details, phone numbers, bank accounts, social security numbers, and driver's licenses.
 - Each piece of information is represented by a unique identifier, maintaining privacy while discussing document content.
 - Your role is to interpret the document's anonymized content and accurately respond to queries using the identifiers.
+- Any question outside the content of the document is forbidden: reply that it is out of scope, do not answer it, and ask the user to try another question.
 - Consistency in identifiers is crucial for connecting the text with the queries correctly.
 - You must not discuss the anonymized nature of the text and use the identifiers as if they were real words for a smooth chat with users.

models/embedded_model.model → files/mapping_clear_to_anonymized.pkl RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:28fcf483356bf2bef29b8220b84803acf9518f19fbc9342e76cac06b30803f28
-size 73056

 version https://git-lfs.github.com/spec/v1
+oid sha256:944e5c32bd04e955194c513d35b91467615c08973c767745a1756d015b3e6ebb
+size 1085

files/original_document.txt CHANGED Viewed

@@ -5,6 +5,6 @@ On September 18 I visited microsoft.com and sent an email to test@presidio.site,
 My passport: 191280342 and my phone number: (212) 555-1234.
-This is a valid International Bank Account Number: IL150120690000003111111 . Can you please check the status on bank account 954567876544?
 Kate's social security number is 078-05-1126.  Her driver license? it is 1234567A.

 My passport: 191280342 and my phone number: (212) 555-1234.
+This is a valid International Bank Account Number: IL150120690000003111111. Can you please check the status on bank account 954567876544?
 Kate's social security number is 078-05-1126.  Her driver license? it is 1234567A.

files/original_document_uuid_mapping.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"078-05-1126": "d8da62f1", "1234567A": "5e63c327", "16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ": "ac41d58b", "191280342": "59a83e41", "192.168.0.1": "116fe81e", "212": "144a2acc", "4095-2609-9393-4932": "e5b499b0", "555-1234": "d9e5704e", "954567876544": "9eb07461", "David": "ebe99761", "IL150120690000003111111": "5ca977a4", "International": "71d0f51c", "Johnson": "53a9291d", "Kate": "b474d794", "Maine": "6337f12f", "microsoft.com": "0d574451", "test@presidio.site": "1f78e797"}

models/embedded_model.model.wv.vectors_ngrams.npy DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:faf08ed9c3bc29cf71c16f5d2b311f3bfb730a92f12c2e52d742bc6b59bf9e5f
-size 800000128

models/without_pronoun_cml_xgboost.model DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:933d1d5c5f83c30211dd9a497482c517a822df809c0498fed164de72bd7bf910
-size 1085795

models/without_pronoun_embedded_model.model DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:762240ca4040c68e44c403f16abce5683a0c4a005ec10f3dd0135a0e429a66c1
-size 1189196

models/without_pronoun_embedded_model.model.wv.vectors_ngrams.npy DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:5cf06fe78185b373c97ee0616f599ce6b1aceb6445b8f666fac6cd4cd307fe46
-size 400000128

original_document_uuid_mapping.json DELETED Viewed

@@ -1 +0,0 @@

- {"078-05-1126": "095fa9c8", "1234567A": "778679d7", "16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ": "54344fd4", "18": "57c4ba7a", "191280342": "8d6f2b87", "192.168.0.1": "1a26727d", "212": "6491a9cd", "4095-2609-9393-4932": "c075beec", "555-1234": "2a61cfbc", "954567876544": "9277229c", "Bank": "46e4a44b", "David": "8b9ec610", "Hello": "84381322", "Her": "290451c3", "IL150120690000003111111": "de6fd087", "International": "a1cc4c7e", "Johnson": "8c6d3442", "Kate": "4571d08d", "Maine": "269b9686", "September": "9d6193ab", "microsoft.com": "ea9cc7db", "passport": "694a9044", "test@presidio.site": "d2934e4f"}

utils_demo.py CHANGED Viewed

@@ -1,28 +1,68 @@
-import torch
-import numpy as np
-MAX_USER_QUERY_LEN = 35
 # List of example queries for easy access
 DEFAULT_QUERIES = {
     "Example Query 1": "Who visited microsoft.com on September 18?",
-    "Example Query 2": "Does Kate has drive ?",
-    "Example Query 3": "What phone number can be used to contact David Johnson?",
 }
 def get_batch_text_representation(texts, model, tokenizer, batch_size=1):
-    """
-    Get mean-pooled representations of given texts in batches.
-    """
     mean_pooled_batch = []
     for i in range(0, len(texts), batch_size):
-        batch_texts = texts[i:i+batch_size]
         inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True)
         with torch.no_grad():
             outputs = model(**inputs, output_hidden_states=False)
         last_hidden_states = outputs.last_hidden_state
-        input_mask_expanded = inputs['attention_mask'].unsqueeze(-1).expand(last_hidden_states.size()).float()
         sum_embeddings = torch.sum(last_hidden_states * input_mask_expanded, 1)
         sum_mask = input_mask_expanded.sum(1)
         mean_pooled = sum_embeddings / sum_mask
@@ -39,11 +79,82 @@ def is_user_query_valid(user_query: str) -> bool:
         bool: True if the `user_query` is None or empty, False otherwise.
     """
     # If the query is not part of the default queries
-    is_default_query = user_query in DEFAULT_QUERIES.values()
     # Check if the query exceeds the length limit
     is_exceeded_max_length = user_query is not None and len(user_query) <= MAX_USER_QUERY_LEN
     return not is_default_query and not is_exceeded_max_length

+import json
+import os
+import pickle as pkl
+import re
+import shutil
+import string
+from collections import Counter
+from pathlib import Path
+import numpy as np
+import torch
+# Length threshold (in characters) that `is_user_query_valid` compares `len(user_query)` against
+MAX_USER_QUERY_LEN = 80
 # List of example queries for easy access
 DEFAULT_QUERIES = {
     "Example Query 1": "Who visited microsoft.com on September 18?",
+    "Example Query 2": "Does Kate have a driving licence?",
+    "Example Query 3": "What's David Johnson's phone number?",
 }
+# Directory containing this module; every other path below is built from it
+CURRENT_DIR = Path(__file__).parent
+DATA_PATH = CURRENT_DIR / "files"
+LOGREG_MODEL_PATH = CURRENT_DIR / "models" / "cml_logreg.model"
+DEPLOYMENT_DIR = CURRENT_DIR / "deployment"
+KEYS_DIR = DEPLOYMENT_DIR / ".fhe_keys"
+# Demo documents and the files derived from them, all stored under `files/`
+ORIGINAL_FILE_PATH = DATA_PATH / "original_document.txt"
+ANONYMIZED_FILE_PATH = DATA_PATH / "anonymized_document.txt"
+MAPPING_UUID_PATH = DATA_PATH / "original_document_uuid_mapping.json"
+MAPPING_SENTENCES_PATH = DATA_PATH / "mapping_clear_to_anonymized.pkl"
+PROMPT_PATH = DATA_PATH / "chatgpt_prompt.txt"
+# Directories that `clean_directory` deletes and recreates
+ALL_DIRS = [KEYS_DIR]
+# All of `string.punctuation` except "%" and "$". Despite the name, the final value is a
+# single `str` (the remaining characters joined together), not a list.
+PUNCTUATION_LIST = list(string.punctuation)
+PUNCTUATION_LIST.remove("%")
+PUNCTUATION_LIST.remove("$")
+PUNCTUATION_LIST = "".join(PUNCTUATION_LIST)
+def clean_directory() -> None:
+    """Delete and recreate every directory listed in `ALL_DIRS`."""
+    print("Cleaning...\n")
+    for target_dir in ALL_DIRS:
+        # Remove the directory and everything in it, if it currently exists
+        if os.path.exists(target_dir) and os.path.isdir(target_dir):
+            shutil.rmtree(target_dir)
+        # Recreate it empty, creating missing parent directories as needed
+        target_dir.mkdir(exist_ok=True, parents=True)
 def get_batch_text_representation(texts, model, tokenizer, batch_size=1):
+    """Get mean-pooled representations of given texts in batches."""
     mean_pooled_batch = []
     for i in range(0, len(texts), batch_size):
+        batch_texts = texts[i : i + batch_size]
         inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True)
         with torch.no_grad():
             outputs = model(**inputs, output_hidden_states=False)
         last_hidden_states = outputs.last_hidden_state
+        input_mask_expanded = (
+            inputs["attention_mask"].unsqueeze(-1).expand(last_hidden_states.size()).float()
+        )
         sum_embeddings = torch.sum(last_hidden_states * input_mask_expanded, 1)
         sum_mask = input_mask_expanded.sum(1)
         mean_pooled = sum_embeddings / sum_mask
         bool: True if the `user_query` is None or empty, False otherwise.
     """
     # If the query is not part of the default queries
+    is_default_query = user_query in DEFAULT_QUERIES.values()
     # Check if the query exceeds the length limit
     is_exceeded_max_length = user_query is not None and len(user_query) <= MAX_USER_QUERY_LEN
     return not is_default_query and not is_exceeded_max_length
+def compare_texts_ignoring_extra_spaces(original_text, modified_text):
+    """Check if the modified_text is identical to the original_text up to whitespace differences.
+
+    Both texts are normalized by collapsing every run of whitespace (spaces, tabs, newlines)
+    into a single space and dropping leading/trailing whitespace before they are compared.
+
+    Args:
+        original_text (str): The original text for comparison.
+        modified_text (str): The modified text to compare against the original.
+
+    Returns:
+        (bool): True if both texts are equal once whitespace is normalized; False otherwise.
+    """
+    # `str.split()` without arguments splits on any whitespace run and ignores the edges
+    normalized_original = " ".join(original_text.split())
+    normalized_modified = " ".join(modified_text.split())
+    return normalized_original == normalized_modified
+def is_strict_deletion_only(original_text, modified_text):
+    """Check that modified_text only uses tokens that are present in original_text.
+
+    Both texts are split into lower-cased tokens, with punctuation separated from adjacent
+    word characters, and the token counts are compared. Token order is not checked.
+
+    Args:
+        original_text (str): The reference text.
+        modified_text (str): The text expected to be the original with tokens removed.
+
+    Returns:
+        (bool): True if every token of modified_text appears in original_text at least as many
+            times as it appears in modified_text; False otherwise.
+    """
+    # Define a regex pattern that matches a word character next to a punctuation
+    # or a punctuation next to a word character, without a space between them.
+    pattern = r"(?<=[\w])(?=[^\w\s])|(?<=[^\w\s])(?=[\w])"
+    # Replace instances found by the pattern with a space
+    original_text = re.sub(pattern, " ", original_text)
+    modified_text = re.sub(pattern, " ", modified_text)
+    # Tokenize the texts into words, considering also punctuation
+    original_words = Counter(original_text.lower().split())
+    modified_words = Counter(modified_text.lower().split())
+    # No token of the modified text may be absent from the original text
+    base_words = all(item in original_words.keys() for item in modified_words.keys())
+    # No token may occur more often in the modified text than in the original text
+    base_count = all(original_words[k] >= v for k, v in modified_words.items())
+    return base_words and base_count
+def read_txt(file_path):
+    """Read text from a file.
+
+    Args:
+        file_path: Path of the file to read, decoded as UTF-8.
+
+    Returns:
+        (str): The whole content of the file.
+    """
+    with open(file_path, "r", encoding="utf-8") as file:
+        return file.read()
+def write_txt(file_path, data):
+    """Write text to a file.
+
+    Args:
+        file_path: Path of the file to write; it is opened in "w" mode and encoded as UTF-8.
+        data (str): The text to write.
+    """
+    with open(file_path, "w", encoding="utf-8") as file:
+        file.write(data)
+def write_pickle(file_path, data):
+    """Save data to a pickle file.
+
+    Args:
+        file_path: Path of the file to write; it is opened in binary "wb" mode.
+        data: The object to serialize with `pickle.dump`.
+    """
+    with open(file_path, "wb") as f:
+        pkl.dump(data, f)
+def read_pickle(file_name):
+    """Load data from a pickle file.
+
+    Args:
+        file_name: Path of the pickle file to read.
+
+    Returns:
+        The object deserialized by `pickle.load`.
+    """
+    # NOTE(review): unpickling can execute arbitrary code; only use this on files written by
+    # this application, never on untrusted input.
+    with open(file_name, "rb") as file:
+        return pkl.load(file)
+def read_json(file_name):
+    """Load data from a json file.
+
+    Args:
+        file_name: Path of the JSON file to read.
+
+    Returns:
+        The Python object decoded by `json.load`.
+    """
+    # NOTE(review): no explicit encoding is passed here, unlike `write_json`, which uses UTF-8
+    with open(file_name, "r") as file:
+        return json.load(file)
+def write_json(file_name, data):
+    """Save data to a json file.
+
+    Args:
+        file_name: Path of the file to write; it is opened in "w" mode and encoded as UTF-8.
+        data: The JSON-serializable object to dump.
+    """
+    with open(file_name, "w", encoding="utf-8") as file:
+        # Indented output with sorted keys
+        json.dump(data, file, indent=4, sort_keys=True)