fnavales committed
Commit 1ac2cab
Parent: f26c37e

Trained model to detect Hate Speech

Files changed (2)
  1. app.py +96 -5
  2. requirements.txt +3 -1
app.py CHANGED
@@ -1,5 +1,97 @@
  import gradio as gr
- from detoxify import Detoxify
+ import torch.nn as nn
+ import torch
+ from transformers import BertTokenizerFast as BertTokenizer, BertModel
+ import pytorch_lightning as pl
+
+
+ BERT_MODEL_NAME = 'bert-base-cased'
+ tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
+ LABEL_COLUMNS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
+
+
+ class ToxicCommentTagger(pl.LightningModule):
+
+     def __init__(self, n_classes: int, n_training_steps=None, n_warmup_steps=None):
+         super().__init__()
+         self.bert = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
+         self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)
+         self.n_training_steps = n_training_steps
+         self.n_warmup_steps = n_warmup_steps
+         self.criterion = nn.BCELoss()
+
+
+ def predict(model, tokenizer, sentence):
+
+     encoding = tokenizer.encode_plus(
+         sentence,
+         add_special_tokens=False,
+         max_length=510,
+         return_token_type_ids=False,
+         padding="max_length",
+         return_attention_mask=True,
+         return_tensors='pt'
+     )
+
+     # define target chunksize
+     chunksize = 512
+
+     # split into chunks of 510 tokens, we also convert to list (default is tuple which is immutable)
+     input_id_chunks = list(encoding['input_ids'][0].split(chunksize - 2))
+     mask_chunks = list(encoding['attention_mask'][0].split(chunksize - 2))
+
+     # loop through each chunk
+     for i in range(len(input_id_chunks)):
+         # add CLS and SEP tokens to input IDs
+         input_id_chunks[i] = torch.cat([
+             torch.tensor([101]), input_id_chunks[i], torch.tensor([102])
+         ])
+         # add attention tokens to attention mask
+         mask_chunks[i] = torch.cat([
+             torch.tensor([1]), mask_chunks[i], torch.tensor([1])
+         ])
+         # get required padding length
+         pad_len = chunksize - input_id_chunks[i].shape[0]
+         # check if tensor length satisfies required chunk size
+         if pad_len > 0:
+             # if padding length is more than 0, we must add padding
+             input_id_chunks[i] = torch.cat([
+                 input_id_chunks[i], torch.Tensor([0] * pad_len)
+             ])
+             mask_chunks[i] = torch.cat([
+                 mask_chunks[i], torch.Tensor([0] * pad_len)
+             ])
+
+     input_ids = torch.stack(input_id_chunks)
+     attention_mask = torch.stack(mask_chunks)
+
+     input_dict = {
+         'input_ids': input_ids.long(),
+         'attention_mask': attention_mask.int()
+     }
+
+     _, test_prediction = model(**input_dict)
+     test_prediction = test_prediction.numpy()
+
+     output = {}
+     for chunk in test_prediction:
+         for label, prediction in zip(LABEL_COLUMNS, chunk):
+             if label in output:
+                 output[label] = max(prediction, output[label])
+             else:
+                 output[label] = prediction
+
+     return output
+
+
+ model = ToxicCommentTagger.load_from_checkpoint(
+     '/content/drive/MyDrive/checkpoints/best-checkpoint.ckpt',
+     n_classes=len(LABEL_COLUMNS)
+ )
+
+ model.eval()
+ model.freeze()
+

  all_categories = {'all_categories': [
      'toxicity',
@@ -25,11 +117,8 @@ examples = [
  ]


- model = Detoxify('multilingual')
-
-
  def toxicity(sentence, threshold):
-     predicts = model.predict(sentence)
+     predicts = predict(model, tokenizer, sentence)
      return [ x for x in predicts if predicts[x] > threshold/100 ], all_categories

  gr.Interface(fn=toxicity,
@@ -42,3 +131,5 @@ gr.Interface(fn=toxicity,
      gr.JSON(all_categories)
      ],
      examples=examples).launch()
+
+
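Note: predict() above unpacks _, test_prediction = model(**input_dict), so the checkpointed ToxicCommentTagger must also define a forward() that returns a (loss, probabilities) pair, and that method is not shown in this hunk. Below is a minimal sketch of the kind of forward() this call pattern assumes; the argument names and the sigmoid/BCELoss pairing are inferred from the class definition, not taken from the commit.

    # Hypothetical sketch, not part of this commit: a forward() on
    # ToxicCommentTagger that matches the unpacking in predict() above.
    def forward(self, input_ids, attention_mask, labels=None):
        # pooled [CLS] representation from BERT -> one logit per label
        output = self.bert(input_ids, attention_mask=attention_mask)
        output = self.classifier(output.pooler_output)
        # sigmoid gives independent per-label probabilities, matching the
        # nn.BCELoss criterion defined in __init__
        output = torch.sigmoid(output)
        loss = 0
        if labels is not None:
            loss = self.criterion(output, labels)
        return loss, output

With a forward() shaped like this, the dictionary returned by predict() holds, for each label, the maximum probability over all 512-token chunks of the input.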
requirements.txt CHANGED
@@ -1 +1,3 @@
- detoxify==0.5.0
+ transformers==4.23.1
+ torch @ https://download.pytorch.org/whl/cu113/torch-1.12.1%2Bcu113-cp37-cp37m-linux_x86_64.whl
+ pytorch-lightning==1.7.7
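One caveat on the new pins: the torch line points at a CUDA 11.3 wheel built for CPython 3.7 on Linux x86_64, so installation fails on any other Python version or platform. If that exact wheel is not required, a plain version pin (a hypothetical alternative, not what the commit uses) is more portable:

    transformers==4.23.1
    torch==1.12.1
    pytorch-lightning==1.7.7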