add example
Browse files- .idea/.gitignore +3 -0
- .idea/AcroBERT.iml +11 -0
- .idea/inspectionProfiles/Project_Default.xml +23 -0
- .idea/inspectionProfiles/profiles_settings.xml +6 -0
- .idea/misc.xml +4 -0
- .idea/modules.xml +8 -0
- .idea/vcs.xml +6 -0
- __pycache__/constant.cpython-38.pyc +0 -0
- __pycache__/maddog.cpython-38.pyc +0 -0
- __pycache__/utils.cpython-38.pyc +0 -0
- acrobert.py +13 -8
- app.py +11 -0
.idea/.gitignore
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
# Default ignored files
|
2 |
+
/shelf/
|
3 |
+
/workspace.xml
|
.idea/AcroBERT.iml
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<module type="PYTHON_MODULE" version="4">
|
3 |
+
<component name="NewModuleRootManager">
|
4 |
+
<content url="file://$MODULE_DIR$" />
|
5 |
+
<orderEntry type="inheritedJdk" />
|
6 |
+
<orderEntry type="sourceFolder" forTests="false" />
|
7 |
+
</component>
|
8 |
+
<component name="TestRunnerService">
|
9 |
+
<option name="PROJECT_TEST_RUNNER" value="pytest" />
|
10 |
+
</component>
|
11 |
+
</module>
|
.idea/inspectionProfiles/Project_Default.xml
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<component name="InspectionProjectProfileManager">
|
2 |
+
<profile version="1.0">
|
3 |
+
<option name="myName" value="Project Default" />
|
4 |
+
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
|
5 |
+
<option name="ignoredPackages">
|
6 |
+
<value>
|
7 |
+
<list size="10">
|
8 |
+
<item index="0" class="java.lang.String" itemvalue="scipy" />
|
9 |
+
<item index="1" class="java.lang.String" itemvalue="tensorflow" />
|
10 |
+
<item index="2" class="java.lang.String" itemvalue="tensorflow-estimator" />
|
11 |
+
<item index="3" class="java.lang.String" itemvalue="tensorboard" />
|
12 |
+
<item index="4" class="java.lang.String" itemvalue="Keras" />
|
13 |
+
<item index="5" class="java.lang.String" itemvalue="numpy" />
|
14 |
+
<item index="6" class="java.lang.String" itemvalue="t" />
|
15 |
+
<item index="7" class="java.lang.String" itemvalue="torch" />
|
16 |
+
<item index="8" class="java.lang.String" itemvalue="python-Levenshtein" />
|
17 |
+
<item index="9" class="java.lang.String" itemvalue="pytorch-metric-learning" />
|
18 |
+
</list>
|
19 |
+
</value>
|
20 |
+
</option>
|
21 |
+
</inspection_tool>
|
22 |
+
</profile>
|
23 |
+
</component>
|
.idea/inspectionProfiles/profiles_settings.xml
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<component name="InspectionProjectProfileManager">
|
2 |
+
<settings>
|
3 |
+
<option name="USE_PROJECT_PROFILE" value="false" />
|
4 |
+
<version value="1.0" />
|
5 |
+
</settings>
|
6 |
+
</component>
|
.idea/misc.xml
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8" project-jdk-type="Python SDK" />
|
4 |
+
</project>
|
.idea/modules.xml
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="ProjectModuleManager">
|
4 |
+
<modules>
|
5 |
+
<module fileurl="file://$PROJECT_DIR$/.idea/AcroBERT.iml" filepath="$PROJECT_DIR$/.idea/AcroBERT.iml" />
|
6 |
+
</modules>
|
7 |
+
</component>
|
8 |
+
</project>
|
.idea/vcs.xml
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="VcsDirectoryMappings">
|
4 |
+
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
5 |
+
</component>
|
6 |
+
</project>
|
__pycache__/constant.cpython-38.pyc
ADDED
Binary file (49.2 kB). View file
|
|
__pycache__/maddog.cpython-38.pyc
ADDED
Binary file (25.1 kB). View file
|
|
__pycache__/utils.cpython-38.pyc
ADDED
Binary file (7.67 kB). View file
|
|
acrobert.py
CHANGED
@@ -8,8 +8,7 @@ from maddog import Extractor
|
|
8 |
import spacy
|
9 |
import constant
|
10 |
|
11 |
-
|
12 |
-
spacy.cli.download("en_core_web_sm")
|
13 |
nlp = spacy.load("en_core_web_sm")
|
14 |
ruleExtractor = Extractor()
|
15 |
kb = utils.load_acronym_kb('acronym_kb.json')
|
@@ -40,14 +39,15 @@ def softmax(elements):
|
|
40 |
|
41 |
|
42 |
def predict(topk, model, short_form, context, batch_size, acronym_kb, device):
|
43 |
-
ori_candidate = utils.get_candidate(acronym_kb, short_form, can_num=
|
44 |
long_terms = [str.lower(can) for can in ori_candidate]
|
45 |
scores = cal_score(model.model, model.tokenizer, long_terms, context, batch_size, device)
|
46 |
#indexes = [np.argmax(scores)]
|
47 |
topk = min(len(scores), topk)
|
48 |
indexes = np.array(scores).argsort()[::-1][:topk]
|
49 |
names = [ori_candidate[i] for i in indexes]
|
50 |
-
|
|
|
51 |
|
52 |
|
53 |
def cal_score(model, tokenizer, long_forms, contexts, batch_size, device):
|
@@ -79,13 +79,16 @@ def acrobert(sentence, model, device):
|
|
79 |
tokens = [t.text for t in nlp(sentence) if len(t.text.strip()) > 0]
|
80 |
rulebased_pairs = ruleExtractor.extract(tokens, constant.RULES)
|
81 |
|
82 |
-
results =
|
83 |
for acronym in rulebased_pairs.keys():
|
84 |
if rulebased_pairs[acronym][0] != '':
|
85 |
results.append((acronym, rulebased_pairs[acronym][0]))
|
86 |
else:
|
87 |
-
pred = predict(
|
88 |
-
|
|
|
|
|
|
|
89 |
return results
|
90 |
|
91 |
|
@@ -124,6 +127,8 @@ if __name__ == '__main__':
|
|
124 |
# be discredited and diminished in the public ’s eye. More often than not, PR is
|
125 |
# a preemptive process. Celebrity publicists are paid lots of money to keep certain
|
126 |
# stories out of the news."""
|
127 |
-
sentence = "
|
|
|
|
|
128 |
results = acronym_linker(sentence)
|
129 |
print(results)
|
|
|
8 |
import spacy
|
9 |
import constant
|
10 |
|
11 |
+
|
|
|
12 |
nlp = spacy.load("en_core_web_sm")
|
13 |
ruleExtractor = Extractor()
|
14 |
kb = utils.load_acronym_kb('acronym_kb.json')
|
|
|
39 |
|
40 |
|
41 |
def predict(topk, model, short_form, context, batch_size, acronym_kb, device):
|
42 |
+
ori_candidate = utils.get_candidate(acronym_kb, short_form, can_num=20)
|
43 |
long_terms = [str.lower(can) for can in ori_candidate]
|
44 |
scores = cal_score(model.model, model.tokenizer, long_terms, context, batch_size, device)
|
45 |
#indexes = [np.argmax(scores)]
|
46 |
topk = min(len(scores), topk)
|
47 |
indexes = np.array(scores).argsort()[::-1][:topk]
|
48 |
names = [ori_candidate[i] for i in indexes]
|
49 |
+
confidences = [round(scores[i], 3) for i in indexes]
|
50 |
+
return names, confidences
|
51 |
|
52 |
|
53 |
def cal_score(model, tokenizer, long_forms, contexts, batch_size, device):
|
|
|
79 |
tokens = [t.text for t in nlp(sentence) if len(t.text.strip()) > 0]
|
80 |
rulebased_pairs = ruleExtractor.extract(tokens, constant.RULES)
|
81 |
|
82 |
+
results = dict()
|
83 |
for acronym in rulebased_pairs.keys():
|
84 |
if rulebased_pairs[acronym][0] != '':
|
85 |
results.append((acronym, rulebased_pairs[acronym][0]))
|
86 |
else:
|
87 |
+
pred, scores = predict(5, model, acronym, sentence, batch_size=10, acronym_kb=kb, device=device)
|
88 |
+
output = list(zip(pred, scores))
|
89 |
+
#print(output)
|
90 |
+
results[acronym] = output
|
91 |
+
#results.append((acronym, pred[0], scores[0]))
|
92 |
return results
|
93 |
|
94 |
|
|
|
127 |
# be discredited and diminished in the public ’s eye. More often than not, PR is
|
128 |
# a preemptive process. Celebrity publicists are paid lots of money to keep certain
|
129 |
# stories out of the news."""
|
130 |
+
sentence = """
|
131 |
+
AI is a wide-ranging branch of computer science concerned with building smart machines capable of performing tasks that typically require human intelligence.
|
132 |
+
"""
|
133 |
results = acronym_linker(sentence)
|
134 |
print(results)
|
app.py
CHANGED
@@ -6,5 +6,16 @@ def greet(sentence):
|
|
6 |
results = acronym_linker(sentence, mode='acrobert')
|
7 |
return results
|
8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
iface = gr.Interface(fn=greet, inputs="text", outputs="text")
|
10 |
iface.launch()
|
|
|
6 |
results = acronym_linker(sentence, mode='acrobert')
|
7 |
return results
|
8 |
|
9 |
+
|
10 |
+
sample_list = [
|
11 |
+
"AI is a wide-ranging branch of computer science concerned with building smart machines capable of performing tasks that typically require human intelligence. ",
|
12 |
+
"""A whistleblower like monologist Mike Daisey gets targeted as a scapegoat who must
|
13 |
+
be discredited and diminished in the public eyes. More often than not, PR is
|
14 |
+
a preemptive process. Celebrity publicists are paid lots of money to keep certain
|
15 |
+
stories out of the news.""",
|
16 |
+
"This new genome assembly and the annotation are tagged as a RefSeq genome by NCBI and thus provide substantially enhanced genomic resources for future research involving S. scovelli.",
|
17 |
+
"In this study , we found that miR-34a demonstrated greater expression in the lungs of patients with IPF and in mice with experimental pulmonary fibrosis , with its primary localization in lung fibroblasts.",
|
18 |
+
]
|
19 |
+
|
20 |
iface = gr.Interface(fn=greet, inputs="text", outputs="text")
|
21 |
iface.launch()
|