import random
from difflib import Differ

from findfile import find_files
from flask import Flask
from textattack import Attacker
from textattack.attack_recipes import BAEGarg2019
from textattack.datasets import Dataset
from textattack.models.wrappers import HuggingFaceModelWrapper


class ModelWrapper(HuggingFaceModelWrapper):
    """Adapts a classifier exposing ``infer`` (returning a dict with a "probs" entry)
    to the batched-probabilities interface expected by TextAttack."""

    def __init__(self, model):
        self.model = model

    def __call__(self, text_inputs, **kwargs):
        # TextAttack passes a batch of texts and expects one probability vector per text.
        outputs = []
        for text_input in text_inputs:
            raw_outputs = self.model.infer(text_input, print_result=False, **kwargs)
            outputs.append(raw_outputs["probs"])
        return outputs


class SentAttacker:
    """Builds a TextAttack ``Attacker`` from an attack recipe and the wrapped model."""

    def __init__(self, model, recipe_class=BAEGarg2019):
        model_wrapper = ModelWrapper(model)
        recipe = recipe_class.build(model_wrapper)
        # WordNet-based transformations default to English; set another language here
        # if needed, e.g.:
        # recipe.transformation.language = "en"

        # The Attacker requires a dataset; a single placeholder example suffices because
        # attacks are launched on user-provided sentences rather than on this dataset.
        _dataset = [("", 0)]
        _dataset = Dataset(_dataset)
        self.attacker = Attacker(recipe, _dataset)
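
# A minimal usage sketch (an assumption, not taken from the original app): ``my_model``
# stands for any classifier exposing the ``infer`` interface consumed by ModelWrapper
# above, and the call below relies on TextAttack's Attack.attack(example, ground_truth_output):
#
#     sent_attacker = SentAttacker(my_model, BAEGarg2019)
#     result = sent_attacker.attacker.attack.attack("the movie was great", 1)
#     print(result.perturbed_result.attacked_text.text)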


def diff_texts(text1, text2):
    """Word-level diff of two texts as (token, change) pairs, where change is
    "+" for insertions, "-" for deletions, and None for unchanged tokens."""
    d = Differ()
    text1_words = text1.split()
    text2_words = text2.split()
    return [
        (token[2:], token[0] if token[0] != " " else None)
        for token in d.compare(text1_words, text2_words)
    ]
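
# Illustrative output (hypothetical inputs): diff_texts("a fine film", "a dull film")
# yields [("a", None), ("fine", "-"), ("dull", "+"), ("film", None)], i.e. the
# (token, label) pairs accepted by Gradio's HighlightedText component.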


def get_ensembled_tad_results(results):
    """Majority vote over the labels predicted by the ensembled defense models."""
    target_dict = {}
    for r in results:
        target_dict[r["label"]] = target_dict.get(r["label"], 0) + 1
    # Return the label with the highest vote count.
    return max(target_dict, key=target_dict.get)
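
# Illustrative call (hypothetical inputs):
#     get_ensembled_tad_results([{"label": 1}, {"label": 0}, {"label": 1}])  # -> 1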


def get_sst2_example():
    """Samples a random (text, label) pair from the local SST-2 text_defense test set."""
    # File-name fragments that should never match a dataset file.
    filter_key_words = [
        ".py",
        ".md",
        "readme",
        "log",
        "result",
        "zip",
        ".state_dict",
        ".model",
        ".png",
        "acc_",
        "f1_",
        ".origin",
        ".adv",
        ".csv",
    ]

    dataset_file = {"train": [], "test": [], "valid": []}
    dataset = "sst2"
    search_path = "./"
    task = "text_defense"
    # Locate the test split of the dataset under the working directory.
    dataset_file["test"] += find_files(
        search_path,
        [dataset, "test", task],
        exclude_key=[".adv", ".org", ".defense", ".inference", "train."]
        + filter_key_words,
    )

    for dat_type in ["test"]:
        data = []
        label_set = set()
        for data_file in dataset_file[dat_type]:
            with open(data_file, mode="r", encoding="utf8") as fin:
                lines = fin.readlines()
                for line in lines:
                    # Each line has the form "<text>$LABEL$<label>".
                    text, label = line.split("$LABEL$")
                    text = text.strip()
                    label = int(label.strip())
                    data.append((text, label))
                    label_set.add(label)
    return random.choice(data)


def get_agnews_example():
    """Samples a random (text, label) pair from the local AGNews text_defense test set."""
    filter_key_words = [
        ".py",
        ".md",
        "readme",
        "log",
        "result",
        "zip",
        ".state_dict",
        ".model",
        ".png",
        "acc_",
        "f1_",
        ".origin",
        ".adv",
        ".csv",
    ]

    dataset_file = {"train": [], "test": [], "valid": []}
    dataset = "agnews"
    search_path = "./"
    task = "text_defense"
    dataset_file["test"] += find_files(
        search_path,
        [dataset, "test", task],
        exclude_key=[".adv", ".org", ".defense", ".inference", "train."]
        + filter_key_words,
    )

    for dat_type in ["test"]:
        data = []
        label_set = set()
        for data_file in dataset_file[dat_type]:
            with open(data_file, mode="r", encoding="utf8") as fin:
                lines = fin.readlines()
                for line in lines:
                    text, label = line.split("$LABEL$")
                    text = text.strip()
                    label = int(label.strip())
                    data.append((text, label))
                    label_set.add(label)
    return random.choice(data)


def get_amazon_example():
    """Samples a random (text, label) pair from the local Amazon text_defense test set."""
    filter_key_words = [
        ".py",
        ".md",
        "readme",
        "log",
        "result",
        "zip",
        ".state_dict",
        ".model",
        ".png",
        "acc_",
        "f1_",
        ".origin",
        ".adv",
        ".csv",
    ]

    dataset_file = {"train": [], "test": [], "valid": []}
    dataset = "amazon"
    search_path = "./"
    task = "text_defense"
    dataset_file["test"] += find_files(
        search_path,
        [dataset, "test", task],
        exclude_key=[".adv", ".org", ".defense", ".inference", "train."]
        + filter_key_words,
    )

    for dat_type in ["test"]:
        data = []
        label_set = set()
        for data_file in dataset_file[dat_type]:
            with open(data_file, mode="r", encoding="utf8") as fin:
                lines = fin.readlines()
                for line in lines:
                    text, label = line.split("$LABEL$")
                    text = text.strip()
                    label = int(label.strip())
                    data.append((text, label))
                    label_set.add(label)
    return random.choice(data)


def get_imdb_example():
    """Samples a random (text, label) pair from the local IMDB text_defense test set."""
    filter_key_words = [
        ".py",
        ".md",
        "readme",
        "log",
        "result",
        "zip",
        ".state_dict",
        ".model",
        ".png",
        "acc_",
        "f1_",
        ".origin",
        ".adv",
        ".csv",
    ]

    dataset_file = {"train": [], "test": [], "valid": []}
    dataset = "imdb"
    search_path = "./"
    task = "text_defense"
    dataset_file["test"] += find_files(
        search_path,
        [dataset, "test", task],
        exclude_key=[".adv", ".org", ".defense", ".inference", "train."]
        + filter_key_words,
    )

    for dat_type in ["test"]:
        data = []
        label_set = set()
        for data_file in dataset_file[dat_type]:
            with open(data_file, mode="r", encoding="utf8") as fin:
                lines = fin.readlines()
                for line in lines:
                    text, label = line.split("$LABEL$")
                    text = text.strip()
                    label = int(label.strip())
                    data.append((text, label))
                    label_set.add(label)
    return random.choice(data)
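

if __name__ == "__main__":
    # Smoke-test sketch, not part of the original app: it assumes the text_defense
    # dataset files can be found under the working directory; with no matching files,
    # random.choice() inside get_sst2_example() would fail on an empty list.
    text, label = get_sst2_example()
    print(f"label={label}  text={text}")
    print(diff_texts(text, text.replace("the", "a")))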