sashdev committed
Commit e2f13a4
1 Parent(s): 6ecbb4f

Update app.py

Files changed (1)
  1. app.py +95 -94
app.py CHANGED
@@ -1,107 +1,108 @@
- # Import dependencies
- import gradio as gr
- from transformers import AutoTokenizer, AutoModelForSequenceClassification, T5Tokenizer, T5ForConditionalGeneration
- import torch
- import nltk
  import spacy
- from nltk.corpus import wordnet
- import subprocess
-
- # Download NLTK data (if not already downloaded)
- nltk.download('punkt')
- nltk.download('stopwords')
- nltk.download('wordnet')  # Download WordNet
 
- # Download spaCy model if not already installed
- try:
-     nlp = spacy.load("en_core_web_sm")
- except OSError:
-     subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
-     nlp = spacy.load("en_core_web_sm")
-
- # Check for GPU and set the device accordingly
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
- # Load AI Detector model and tokenizer from Hugging Face (DistilBERT)
- tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
- model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english").to(device)
-
- # Load SRDdev Paraphrase model and tokenizer for humanizing text
- paraphrase_tokenizer = T5Tokenizer.from_pretrained("SRDdev/Paraphrase")
- paraphrase_model = T5ForConditionalGeneration.from_pretrained("SRDdev/Paraphrase").to(device)
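- # NOTE: this checkpoint is an SST-2 sentiment classifier rather than a purpose-built
- # AI-text detector; its class-1 (positive) score is used below only as a rough proxy.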
 
- # Function to find synonyms using WordNet via NLTK
- def get_synonyms(word):
-     synonyms = set()
-     for syn in wordnet.synsets(word):
-         for lemma in syn.lemmas():
-             synonyms.add(lemma.name())
-     return list(synonyms)
 
- # Replace words with synonyms using spaCy and WordNet
- def replace_with_synonyms(text):
-     doc = nlp(text)
-     processed_text = []
-     for token in doc:
-         synonyms = get_synonyms(token.text.lower())
-         if synonyms and token.pos_ in {"NOUN", "VERB", "ADJ", "ADV"}:  # Only replace certain types of words
-             replacement = synonyms[0]  # Replace with the first synonym
-             if token.is_title:
-                 replacement = replacement.capitalize()
-             processed_text.append(replacement)
-         else:
-             processed_text.append(token.text)
-     return " ".join(processed_text)
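- # NOTE: get_synonyms() builds its list from a set, so synonyms[0] is an arbitrary candidate
- # rather than the best sense for the context; replacements can therefore read oddly.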
 
- # AI detection scoring using the DistilBERT classifier
- def detect_ai_generated(text):
-     inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
-     with torch.no_grad():
-         outputs = model(**inputs)
-     probabilities = torch.softmax(outputs.logits, dim=1)
-     ai_probability = probabilities[0][1].item()  # Class-1 probability, treated here as the AI-generated score
-     return ai_probability
 
- # Humanize the AI-detected text using the SRDdev Paraphrase model
- def humanize_text(AI_text):
-     paragraphs = AI_text.split("\n")
-     paraphrased_paragraphs = []
-     for paragraph in paragraphs:
-         if paragraph.strip():
-             inputs = paraphrase_tokenizer(paragraph, return_tensors="pt", max_length=512, truncation=True).to(device)
-             paraphrased_ids = paraphrase_model.generate(
-                 inputs['input_ids'],
-                 max_length=inputs['input_ids'].shape[-1] + 20,  # Slightly more than the original input length
-                 num_beams=4,
-                 early_stopping=True,
-                 length_penalty=1.0,
-                 no_repeat_ngram_size=3,
-             )
-             paraphrased_text = paraphrase_tokenizer.decode(paraphrased_ids[0], skip_special_tokens=True)
-             paraphrased_paragraphs.append(paraphrased_text)
-     return "\n\n".join(paraphrased_paragraphs)
 
- # Main function to handle the overall process
- def main_function(AI_text):
-     # Replace words with synonyms
-     text_with_synonyms = replace_with_synonyms(AI_text)
-
-     # Detect AI-generated content
-     ai_probability = detect_ai_generated(text_with_synonyms)
-
-     # Humanize AI text
-     humanized_text = humanize_text(text_with_synonyms)
-
-     return f"AI-Generated Content: {ai_probability * 100:.2f}%\n\nHumanized Text:\n{humanized_text}"
 
- # Gradio interface definition
- interface = gr.Interface(
-     fn=main_function,
-     inputs="textbox",
-     outputs="textbox",
-     title="AI Text Humanizer with Synonym Replacement",
-     description="Enter AI-generated text and get a human-written version, with synonyms replaced for more natural output. This space uses models from Hugging Face directly."
- )
-
- # Launch the Gradio app
- interface.launch(debug=True)
 
+ from nltk.tokenize import word_tokenize
+ from nltk.tag import pos_tag
+ from nltk.corpus import wordnet as wn
+ from sematch.semantic.similarity import WordNetSimilarity
+ from vocabulary.vocabulary import Vocabulary as vb
+ import json
+ from random import randint
  import spacy
+ import os.path
+
+ nlp = spacy.load('en_core_web_sm')
 
+ # Function to tag a sentence with part-of-speech labels
+ def tag(sentence):
+     words = word_tokenize(sentence)
+     words = pos_tag(words)
+     return words
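+ # e.g., tag("Dogs run fast") typically yields [('Dogs', 'NNS'), ('run', 'VBP'), ('fast', 'RB')]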
 
+ # Determine the POS to paraphrase
+ def paraphraseable(tag):
+     return tag.startswith('NN') or tag == 'VB' or tag.startswith('JJ')
 
+ # Map a Penn Treebank tag to a WordNet POS constant
+ def pos(tag):
+     if tag.startswith('NN'):
+         return wn.NOUN
+     elif tag.startswith('V'):
+         return wn.VERB
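+ # NOTE: adjectives ('JJ') fall through and return None; wn.synsets(word, None) then searches every POS.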
 
+ # Function to collect synonyms for a word using WordNet via NLTK
+ def synonyms(word, tag):
+     listOfLemmas = [baseWord.lemmas() for baseWord in wn.synsets(word, pos(tag))]
+     if len(listOfLemmas) > 0:
+         listOfLemmas = listOfLemmas[0]  # Keep only the first (most common) synset
+         lemmas = [lemma.name() for lemma in listOfLemmas]
+         return set(lemmas)
+     else:
+         return set()
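+ # e.g., synonyms('help', 'VB') typically returns {'help', 'assist', 'aid'} from the first verb synset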
 
+ # Collect dictionary synonyms via the vocabulary package
+ def dictonarySynonums(word):
+     synJSON = vb.synonym(word)
+     if synJSON != False:
+         synonyms_lists = [dictSyno["text"] for dictSyno in json.loads(synJSON)]
+         return set(synonyms_lists)
+     else:
+         return set()
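+ # NOTE: vb.synonym() queries an online dictionary API and returns a JSON string, or False on
+ # failure, so this lookup needs network access and adds latency.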
 
+ # Control set: keep only synonyms semantically similar to the base word, scored with spaCy
+ def controlledSetSpacy(word, similarWords):
+     baseDoc = nlp(word)
+     for similarWord in similarWords.copy():
+         similarDoc = nlp(similarWord)
+         if baseDoc.similarity(similarDoc) < .76:  # Threshold controlling how strict the control set is
+             similarWords.discard(similarWord)
+     return similarWords
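+ # NOTE: en_core_web_sm ships without static word vectors, so .similarity() warns and its scores
+ # are rough; a model with vectors (en_core_web_md/lg) would give more reliable numbers.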
 
+ # Control set: keep only synonyms semantically similar to the base word, using WordNetSimilarity
+ def controlledSetWordNetSimilarity(word, similarWords):
+     wns = WordNetSimilarity()
+     for similarWord in similarWords.copy():
+         if wns.word_similarity(word, similarWord, 'li') < 0.9996:  # Threshold controlling how strict the control set is
+             similarWords.discard(similarWord)
+     return similarWords
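+ # NOTE: 'li' selects sematch's Li et al. WordNet measure; a cutoff as tight as 0.9996 keeps
+ # only near-identical senses.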
 
+ # Gather synonyms for each word from WordNet (NLTK) as well as from the dictionary synonyms
+ def synonymIfExists(sentence):
+     for (word, t) in tag(sentence):
+         if paraphraseable(t) and word not in ["i", "I"]:
+             syns = synonyms(word, t)
+             syns.update(dictonarySynonums(word))
+             if syns:
+                 syns = controlledSetWordNetSimilarity(word, syns)  # Or use the commented control set below
+                 #syns = controlledSetSpacy(word, syns)
+                 if len(syns) > 1:
+                     yield [word, list(syns)]
+                     continue
+         yield [word, []]
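+ # Each word yields [word, [synonym, ...]] when filtered synonyms exist, otherwise [word, []].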
 
+ # Build a bag of candidate words per token and count the sentence variants they allow
+ def paraphrase(sentence):
+     bagOfWords = []
+     counter = 1
+     for tempArray in synonymIfExists(sentence):
+         eachBoW = []
+         eachBoW.append(tempArray[0])
+         eachBoW.extend(tempArray[1])
+         eachBoW = list(set(eachBoW))
+         counter *= len(eachBoW)
+         bagOfWords.append(eachBoW)
+     return bagOfWords, counter
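+ # NOTE: counter is the product of the per-word candidate counts, i.e. the number of distinct
+ # sentence variants that are possible.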
 
+ # Re-create the sentence with synonyms chosen in random order
+ def paraPhraseThisSentence(sentence):
+     ppList = []
+     vList, count = paraphrase(sentence)
+     allWordsCount = len(vList)
+     for y in range(count):
+         wordChoices = []
+         for w in range(allWordsCount):
+             wordChoices.append(vList[w][randint(0, len(vList[w]) - 1)].replace("_", " "))
+         ppList.append(" ".join(wordChoices))
+     ppList = list(set(ppList))  # Random draws can repeat, so deduplicate
+     print(ppList)
+     return ppList
+
+ paraPhraseThisSentence("Financial Institutes have always helped the society to become a better version of itself.")