fnavales committed
Commit 1ac2cab
Parent: f26c37e

Trained model to detect Hate Speech

Files changed (2)
  1. app.py +96 -5
  2. requirements.txt +3 -1
app.py CHANGED
@@ -1,5 +1,97 @@
  import gradio as gr
- from detoxify import Detoxify
+ import torch.nn as nn
+ import torch
+ from transformers import BertTokenizerFast as BertTokenizer, BertModel
+ import pytorch_lightning as pl
+
+
+ BERT_MODEL_NAME = 'bert-base-cased'
+ tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
+ LABEL_COLUMNS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
+
+
+ class ToxicCommentTagger(pl.LightningModule):
+
+     def __init__(self, n_classes: int, n_training_steps=None, n_warmup_steps=None):
+         super().__init__()
+         self.bert = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
+         self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)
+         self.n_training_steps = n_training_steps
+         self.n_warmup_steps = n_warmup_steps
+         self.criterion = nn.BCELoss()
+
+
+ def predict(model, tokenizer, sentence):
+
+     encoding = tokenizer.encode_plus(
+         sentence,
+         add_special_tokens=False,
+         max_length=510,
+         return_token_type_ids=False,
+         padding="max_length",
+         return_attention_mask=True,
+         return_tensors='pt'
+     )
+
+     # define target chunksize
+     chunksize = 512
+
+     # split into chunks of 510 tokens, we also convert to list (default is tuple which is immutable)
+     input_id_chunks = list(encoding['input_ids'][0].split(chunksize - 2))
+     mask_chunks = list(encoding['attention_mask'][0].split(chunksize - 2))
+
+     # loop through each chunk
+     for i in range(len(input_id_chunks)):
+         # add CLS and SEP tokens to input IDs
+         input_id_chunks[i] = torch.cat([
+             torch.tensor([101]), input_id_chunks[i], torch.tensor([102])
+         ])
+         # add attention tokens to attention mask
+         mask_chunks[i] = torch.cat([
+             torch.tensor([1]), mask_chunks[i], torch.tensor([1])
+         ])
+         # get required padding length
+         pad_len = chunksize - input_id_chunks[i].shape[0]
+         # check if tensor length satisfies required chunk size
+         if pad_len > 0:
+             # if padding length is more than 0, we must add padding
+             input_id_chunks[i] = torch.cat([
+                 input_id_chunks[i], torch.Tensor([0] * pad_len)
+             ])
+             mask_chunks[i] = torch.cat([
+                 mask_chunks[i], torch.Tensor([0] * pad_len)
+             ])
+
+     input_ids = torch.stack(input_id_chunks)
+     attention_mask = torch.stack(mask_chunks)
+
+     input_dict = {
+         'input_ids': input_ids.long(),
+         'attention_mask': attention_mask.int()
+     }
+
+     _, test_prediction = model(**input_dict)
+     test_prediction = test_prediction.numpy()
+
+     output = {}
+     for chunk in test_prediction:
+         for label, prediction in zip(LABEL_COLUMNS, chunk):
+             if label in output:
+                 output[label] = max(prediction, output[label])
+             else:
+                 output[label] = prediction
+
+     return output
+
+
+ model = ToxicCommentTagger.load_from_checkpoint(
+     '/content/drive/MyDrive/checkpoints/best-checkpoint.ckpt',
+     n_classes=len(LABEL_COLUMNS)
+ )
+
+ model.eval()
+ model.freeze()
+

  all_categories = {'all_categories': [
      'toxicity',
@@ -25,11 +117,8 @@ examples = [
  ]


- model = Detoxify('multilingual')
-
-
  def toxicity(sentence, threshold):
-     predicts = model.predict(sentence)
+     predicts = predict(model, tokenizer, sentence)
      return [ x for x in predicts if predicts[x] > threshold/100 ], all_categories

  gr.Interface(fn=toxicity,
@@ -42,3 +131,5 @@ gr.Interface(fn=toxicity,
      gr.JSON(all_categories)
      ],
      examples=examples).launch()
+
+
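Note: predict() above unpacks _, test_prediction = model(**input_dict), so the checkpointed ToxicCommentTagger must also define a forward() that returns a (loss, probabilities) pair, and that method is not shown in this hunk. Below is a minimal sketch of the kind of forward() this call pattern assumes; the argument names and the sigmoid/BCELoss pairing are inferred from the class definition, not taken from the commit.

    # Hypothetical sketch, not part of this commit: a forward() on
    # ToxicCommentTagger that matches the unpacking in predict() above.
    def forward(self, input_ids, attention_mask, labels=None):
        # pooled [CLS] representation from BERT -> one logit per label
        output = self.bert(input_ids, attention_mask=attention_mask)
        output = self.classifier(output.pooler_output)
        # sigmoid gives independent per-label probabilities, matching the
        # nn.BCELoss criterion defined in __init__
        output = torch.sigmoid(output)
        loss = 0
        if labels is not None:
            loss = self.criterion(output, labels)
        return loss, output

With a forward() shaped like this, the dictionary returned by predict() holds, for each label, the maximum probability over all 512-token chunks of the input.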
requirements.txt CHANGED
@@ -1 +1,3 @@
- detoxify==0.5.0
+ transformers==4.23.1
+ torch @ https://download.pytorch.org/whl/cu113/torch-1.12.1%2Bcu113-cp37-cp37m-linux_x86_64.whl
+ pytorch-lightning==1.7.7
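One caveat on the new pins: the torch line points at a CUDA 11.3 wheel built for CPython 3.7 on Linux x86_64, so installation fails on any other Python version or platform. If that exact wheel is not required, a plain version pin (a hypothetical alternative, not what the commit uses) is more portable:

    transformers==4.23.1
    torch==1.12.1
    pytorch-lightning==1.7.7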