- Biweekly Feedback Summary.md +19 -0
- scripts/inference/inference.py +17 -0
- scripts/inference/lrt.ipynb +310 -0
- scripts/queryAPI/API_Summary.ipynb +0 -0
- scripts/readme.md +5 -0
- scripts/tests/lrt_test_run.py +65 -0
- scripts/tests/model_test.py +103 -0
- scripts/train/KeyBartAdapter_train.ipynb +0 -0
- scripts/train/train.py +171 -0
- setup.py +23 -0
Biweekly Feedback Summary.md
ADDED
@@ -0,0 +1,19 @@
# Biweekly Feedback Summary

## 21.10.2022
1. [new dataset](https://huggingface.co/datasets/Adapting/abstract-keyphrases)
2. new model architecture: [KeyBartAdapter](https://github.com/Mondkuchen/idp_LiteratureResearch_Tool/blob/main/lrt/clustering/models/keyBartPlus.py)
   - [train script](https://github.com/Mondkuchen/idp_LiteratureResearch_Tool/blob/main/scripts/train/train.py)
   - [training result](https://huggingface.co/Adapting/KeyBartAdapter)
3. [comparison of literature analysis platforms](https://leoxiang66.github.io/LRT-Doc/4-%E6%96%87%E7%8C%AE%E5%88%86%E6%9E%90%E5%B9%B3%E5%8F%B0%E6%AF%94%E8%BE%83/)

## TODOs
- [x] a new keywords generation model
- [x] compare with other literature tools
- [x] start investigating the query API
- [ ] visualization
- [ ] compare results against other literature tools
- [ ] collect more training data / fine-tune
- [ ] add clustering: try other clustering algorithms such as the Gaussian Mixture Model (GMM)
- [ ] add dimension reduction
- [ ] [better PLM](https://huggingface.co/spaces/mteb/leaderboard)

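One of the TODOs above proposes swapping a Gaussian Mixture Model in for k-means clustering. A minimal sketch of what that could look like with scikit-learn, assuming the sentence embeddings are already available as a NumPy array (the function name cluster_with_gmm and all shapes are illustrative, not part of the repository):

    import numpy as np
    from sklearn.mixture import GaussianMixture

    def cluster_with_gmm(embeddings: np.ndarray, n_clusters: int, seed: int = 0):
        # fit a full-covariance GMM on the embedding matrix of shape (N, D)
        gmm = GaussianMixture(n_components=n_clusters, covariance_type='full', random_state=seed)
        gmm.fit(embeddings)
        labels = gmm.predict(embeddings)       # hard assignments, comparable to k-means labels
        probs = gmm.predict_proba(embeddings)  # soft assignments of shape (N, n_clusters)
        return labels, probs

    # e.g.: labels, probs = cluster_with_gmm(np.random.randn(100, 768), n_clusters=5)

Unlike k-means, a GMM also yields per-cluster membership probabilities, which could feed the planned visualization work.
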
scripts/inference/inference.py
ADDED
@@ -0,0 +1,17 @@
if __name__ == '__main__':
    import sys
    from pathlib import Path

    # make the project root importable, e.g. /home/adapting/git/leoxiang66/idp_LiteratureResearch_Tool
    project_root = Path(__file__).parent.parent.parent.absolute()
    sys.path.append(str(project_root))

    from transformers import Text2TextGenerationPipeline
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

    tokenizer = AutoTokenizer.from_pretrained("Adapting/KeyBartAdapter")
    model = AutoModelForSeq2SeqLM.from_pretrained("Adapting/KeyBartAdapter")

    pipe = Text2TextGenerationPipeline(model=model, tokenizer=tokenizer)

    abstract = '''Non-referential face image quality assessment methods have gained popularity as a pre-filtering step on face recognition systems. In most of them, the quality score is usually designed with face matching in mind. However, a small amount of work has been done on measuring their impact and usefulness on Presentation Attack Detection (PAD). In this paper, we study the effect of quality assessment methods on filtering bona fide and attack samples, their impact on PAD systems, and how the performance of such systems is improved when training on a filtered (by quality) dataset. On a Vision Transformer PAD algorithm, a reduction of 20% of the training dataset by removing lower quality samples allowed us to improve the BPCER by 3% in a cross-dataset test.'''

    print(pipe(abstract))

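For reference, a transformers Text2TextGenerationPipeline returns a list of dicts with a generated_text key, and KeyBART-style models emit keyphrases as a single ';'-separated string (the same convention the training script below splits on). A small post-processing sketch, assuming that output format:

    result = pipe(abstract)  # e.g. [{'generated_text': 'keyphrase one;keyphrase two;...'}]
    keyphrases = [kp.strip() for kp in result[0]['generated_text'].split(';') if kp.strip()]
    print(keyphrases)
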
scripts/inference/lrt.ipynb
ADDED
@@ -0,0 +1,310 @@
A Colab notebook (Python 3 kernel, GPU runtime) that runs the LRT pipeline end to end. Cell sources, with stored outputs condensed:

Cell 1 (markdown): an "Open In Colab" badge linking to https://colab.research.google.com/github/Mondkuchen/idp_LiteratureResearch_Tool/blob/main/scripts/inference/lrt.ipynb

Cell 2 (code):
    from google.colab import drive
    drive.mount('/content/drive')
Output: Mounted at /content/drive

Cell 3 (code):
    %cd /content/drive/MyDrive/git/idp_LiteratureResearch_Tool/
Output: /content/drive/MyDrive/git/idp_LiteratureResearch_Tool

Cell 4 (code):
    !ls
Output: example_run.py  examples  literature  lrt  README.md  reports  requirements.txt  setup.py

Cell 5 (code):
    !pip install -r requirements.txt
Output (abridged): installs datasets-2.5.1, evaluate-0.2.2, huggingface-hub-0.10.0, kmeans-pytorch-0.3, multiprocess-0.70.13, responses-0.18.0, sentence-transformers-2.2.2, sentencepiece-0.1.97, setuptools-63.4.1, textdistance-4.5.0, tokenizers-0.12.1, transformers-4.22.1, urllib3-1.25.11, xxhash-3.0.0; pip warns that numba 0.56.2 requires setuptools<60.

Cell 6 (code):
    !python example_run.py
Output (model download progress omitted):
    >>> pipeline starts...
    >>> start generating word embeddings...
    >>> successfully generated word embeddings...
    >>> start clustering...
    >>> The best K is 2.
    >>> finished clustering...
    >>> start keywords extraction
    >>> finished keywords extraction
    >>> pipeline finished!

    ['machine translation/similar language translation/news translation', 'natural language processing/nlp/natural language inference', 'model pretraining/pretraining/pre-training', 'wmt 2020', 'model architecture']
    ['deep learning/bayesian deep learning/machine learning', 'scene reconstruction/face recognition', 'convolutional networks', 'ilsvr', 'classification']

scripts/queryAPI/API_Summary.ipynb
ADDED
(The diff for this file is too large to render; see the raw diff.)

scripts/readme.md
ADDED
@@ -0,0 +1,5 @@
# Scripts
This folder contains scripts for
- model training and evaluation
- model inference
- tests and debugging

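For instance, the training script below can be launched from the project root with `python scripts/train/train.py --epoch 30 --train_batch_size 16 --eval_batch_size 16 --push` (the values shown match the script's own argparse defaults; omit `--push` to skip uploading to the hub).
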
scripts/tests/lrt_test_run.py
ADDED
@@ -0,0 +1,65 @@
if __name__ == '__main__':
    import sys
    from pathlib import Path

    # /home/adapting/git/leoxiang66/idp_LiteratureResearch_Tool
    project_root = Path(__file__).parent.parent.parent.absolute()
    sys.path.append(str(project_root))

    from lrt import LiteratureResearchTool, Configuration
    from lrt.utils import ArticleList

    config = Configuration(
        plm='all-mpnet-base-v2',
        dimension_reduction='none',
        clustering='kmeans-euclidean',
        # keywords_extraction='KeyBartAdapter'
        keywords_extraction='keyphrase-transformer'
    )
    # import evaluate
    # import numpy as np

    # accuracy_metric = evaluate.load("accuracy")
    # # minimal_config = BaselineConfig

    # sentences = [
    #     "This paper presents the results of the news translation task and the similar language translation task, both organised alongside the Conference on Machine Translation (WMT) 2020. In the news task, participants were asked to build machine translation systems for any of 11 language pairs, to be evaluated on test sets consisting mainly of news stories. The task was also opened up to additional test suites to probe specific aspects of translation. In the similar language translation task, participants built machine translation systems for translating between closely related pairs of languages.",
    #     "Recent progress in natural language processing has been driven by advances in both model architecture and model pretraining. Transformer architectures have facilitated building higher-capacity models and pretraining has made it possible to effectively utilize this capacity for a wide variety of tasks. Transformers is an open-source library with the goal of opening up these advances to the wider machine learning community. The library consists of carefully engineered state-of-the-art Transformer architectures under a unified API. Backing this library is a curated collection of pretrained models made by and available for the community. Transformers is designed to be extensible by researchers, simple for practitioners, and fast and robust in industrial deployments. The library is available at https://github.com/huggingface/transformers.",
    #     'Convolutional networks are at the core of most state-of-the-art computer vision solutions for a wide variety of tasks. Since 2014 very deep convolutional networks started to become mainstream, yielding substantial gains in various benchmarks. Although increased model size and computational cost tend to translate to immediate quality gains for most tasks (as long as enough labeled data is provided for training), computational efficiency and low parameter count are still enabling factors for various use cases such as mobile vision and big-data scenarios. Here we are exploring ways to scale up networks in ways that aim at utilizing the added computation as efficiently as possible by suitably factorized convolutions and aggressive regularization. We benchmark our methods on the ILSVRC 2012 classification challenge validation set demonstrate substantial gains over the state of the art: 21.2% top-1 and 5.6% top-5 error for single frame evaluation using a network with a computational cost of 5 billion multiply-adds per inference and with using less than 25 million parameters. With an ensemble of 4 models and multi-crop evaluation, we report 3.5% top-5 error and 17.3% top-1 error on the validation set and 3.6% top-5 error on the official test set.',
    #     'Deep learning is at the heart of the current rise of artificial intelligence. In the field of computer vision, it has become the workhorse for applications ranging from self-driving cars to surveillance and security. Whereas, deep neural networks have demonstrated phenomenal success (often beyond human capabilities) in solving complex problems, recent studies show that they are vulnerable to adversarial attacks in the form of subtle perturbations to inputs that lead a model to predict incorrect outputs. For images, such perturbations are often too small to be perceptible, yet they completely fool the deep learning models. Adversarial attacks pose a serious threat to the success of deep learning in practice. This fact has recently led to a large influx of contributions in this direction. This paper presents the first comprehensive survey on adversarial attacks on deep learning in computer vision. We review the works that design adversarial attacks, analyze the existence of such attacks and propose defenses against them. To emphasize that adversarial attacks are possible in practical conditions, we separately review the contributions that evaluate adversarial attacks in the real-world scenarios. Finally, drawing on the reviewed literature, we provide a broader outlook of this research direction.',
    #     '''Feed-forward layers constitute two-thirds of a transformer model's parameters, yet their role in the network remains under-explored. We show that feed-forward layers in transformer-based language models operate as key-value memories, where each key correlates with textual patterns in the training examples, and each value induces a distribution over the output vocabulary. Our experiments show that the learned patterns are human-interpretable, and that lower layers tend to capture shallow patterns, while upper layers learn more semantic ones. The values complement the keys' input patterns by inducing output distributions that concentrate probability mass on tokens likely to appear immediately after each pattern, particularly in the upper layers. Finally, we demonstrate that the output of a feed-forward layer is a composition of its memories, which is subsequently refined throughout the model's layers via residual connections to produce the final output distribution.''',
    #     '''Bidirectional Encoder Representations from Transformers (BERT) has shown marvelous improvements across various NLP tasks, and consecutive variants have been proposed to further improve the performance of the pre-trained language models. In this paper, we target on revisiting Chinese pre-trained language models to examine their effectiveness in a non-English language and release the Chinese pre-trained language model series to the community. We also propose a simple but effective model called MacBERT, which improves upon RoBERTa in several ways, especially the masking strategy that adopts MLM as correction (Mac). We carried out extensive experiments on eight Chinese NLP tasks to revisit the existing pre-trained language models as well as the proposed MacBERT. Experimental results show that MacBERT could achieve state-of-the-art performances on many NLP tasks, and we also ablate details with several findings that may help future research. https://github.com/ymcui/MacBERT''',
    #     '''From the Publisher: A basic problem in computer vision is to understand the structure of a real world scene given several images of it. Recent major developments in the theory and practice of scene reconstruction are described in detail in a unified framework. The book covers the geometric principles and how to represent objects algebraically so they can be computed and applied. The authors provide comprehensive background material and explain how to apply the methods and implement the algorithms directly.''',
    #     '''There are two major types of uncertainty one can model. Aleatoric uncertainty captures noise inherent in the observations. On the other hand, epistemic uncertainty accounts for uncertainty in the model -- uncertainty which can be explained away given enough data. Traditionally it has been difficult to model epistemic uncertainty in computer vision, but with new Bayesian deep learning tools this is now possible. We study the benefits of modeling epistemic vs. aleatoric uncertainty in Bayesian deep learning models for vision tasks. For this we present a Bayesian deep learning framework combining input-dependent aleatoric uncertainty together with epistemic uncertainty. We study models under the framework with per-pixel semantic segmentation and depth regression tasks. Further, our explicit uncertainty formulation leads to new loss functions for these tasks, which can be interpreted as learned attenuation. This makes the loss more robust to noisy data, also giving new state-of-the-art results on segmentation and depth regression benchmarks.''',
    #     '''Language model pre-training, such as BERT, has significantly improved the performances of many natural language processing tasks. However, pre-trained language models are usually computationally expensive, so it is difficult to efficiently execute them on resource-restricted devices. To accelerate inference and reduce model size while maintaining accuracy, we first propose a novel Transformer distillation method that is specially designed for knowledge distillation (KD) of the Transformer-based models. By leveraging this new KD method, the plenty of knowledge encoded in a large “teacher” BERT can be effectively transferred to a small “student” TinyBERT. Then, we introduce a new two-stage learning framework for TinyBERT, which performs Transformer distillation at both the pre-training and task-specific learning stages. This framework ensures that TinyBERT can capture the general-domain as well as the task-specific knowledge in BERT. TinyBERT4 with 4 layers is empirically effective and achieves more than 96.8% the performance of its teacher BERT-Base on GLUE benchmark, while being 7.5x smaller and 9.4x faster on inference. TinyBERT4 is also significantly better than 4-layer state-of-the-art baselines on BERT distillation, with only ~28% parameters and ~31% inference time of them. Moreover, TinyBERT6 with 6 layers performs on-par with its teacher BERT-Base.''',
    #     '''This paper presents SimCSE, a simple contrastive learning framework that greatly advances the state-of-the-art sentence embeddings. We first describe an unsupervised approach, which takes an input sentence and predicts itself in a contrastive objective, with only standard dropout used as noise. This simple method works surprisingly well, performing on par with previous supervised counterparts. We hypothesize that dropout acts as minimal data augmentation and removing it leads to a representation collapse. Then, we draw inspiration from the recent success of learning sentence embeddings from natural language inference (NLI) datasets and incorporate annotated pairs from NLI datasets into contrastive learning by using entailment pairs as positives and contradiction pairs as hard negatives. We evaluate SimCSE on standard semantic textual similarity (STS) tasks, and our unsupervised and supervised models using BERT-base achieve an average of 74.5% and 81.6% Spearman's correlation respectively, a 7.9 and 4.6 points improvement compared to previous best results. We also show that contrastive learning theoretically regularizes pre-trained embeddings' anisotropic space to be more uniform, and it better aligns positive pairs when supervised signals are available.''',
    #     '''Over the last years deep learning methods have been shown to outperform previous state-of-the-art machine learning techniques in several fields, with computer vision being one of the most prominent cases. This review paper provides a brief overview of some of the most significant deep learning schemes used in computer vision problems, that is, Convolutional Neural Networks, Deep Boltzmann Machines and Deep Belief Networks, and Stacked Denoising Autoencoders. A brief account of their history, structure, advantages, and limitations is given, followed by a description of their applications in various computer vision tasks, such as object detection, face recognition, action and activity recognition, and human pose estimation. Finally, a brief overview is given of future directions in designing deep learning schemes for computer vision problems and the challenges involved therein.''',
    #     '''Computer vision is an interdisciplinary scientific field that deals with how computers can gain high-level understanding from digital images or videos. From the perspective of engineering, it seeks to understand and automate tasks that the human visual system can do. Computer vision tasks include methods for acquiring, processing, analyzing and understanding digital images, and extraction of high-dimensional data from the real world in order to produce numerical or symbolic information, e.g. in the forms of decisions.[3][4][5][6] Understanding in this context means the transformation of visual images (the input of the retina) into descriptions of the world that make sense to thought processes and can elicit appropriate action. This image understanding can be seen as the disentangling of symbolic information from image data using models constructed with the aid of geometry, physics, statistics, and learning theory.''',
    #
    # ]

    lrt = LiteratureResearchTool(config)
    platforms = [
        'IEEE',
        # 'Arxiv',
        # 'Paper with Code'
    ]
    ret = lrt('machine learning', 100, 2020, 2022, platforms, best_k=5)
    for plat in platforms:
        clusters, articles = next(ret)
        print(plat)
        print(clusters)
        print('keyphrases:')
        for c in clusters:
            print(c.top_5_keyphrases)

            # print the articles contained in each cluster
            # ids = c.elements()
            # articles_in_cluster = ArticleList([articles[i] for i in ids])
            # print(articles_in_cluster)
        print()

scripts/tests/model_test.py
ADDED
@@ -0,0 +1,103 @@
if __name__ == '__main__':
    import sys
    from pathlib import Path

    # /home/adapting/git/leoxiang66/idp_LiteratureResearch_Tool
    project_root = Path(__file__).parent.parent.parent.absolute()
    sys.path.append(str(project_root))

    import torch
    from lrt.clustering.models.keyBartPlus import *
    from lrt.clustering.models.adapter import *
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
    import os

    ####################### Adapter Test #############################
    input_dim = 1024
    adapter_hid_dim = 256
    adapter = Adapter(input_dim, adapter_hid_dim)

    data = torch.randn(10, 20, input_dim)

    tmp = adapter(data)

    # the adapter must preserve the input shape
    assert data.size() == tmp.size()
    ####################### Adapter Test #############################

    ####################### BartDecoderPlus Test #############################
    keyBart = AutoModelForSeq2SeqLM.from_pretrained("bloomberg/KeyBART")
    bartDecoderP = BartDecoderPlus(keyBart, 100)
    tmp = bartDecoderP(inputs_embeds=data,
                       output_attentions=True,
                       output_hidden_states=True,
                       encoder_hidden_states=data
                       )
    print(type(tmp))
    # print(tmp.__dict__)
    print(dir(tmp))
    last_hid_states = tmp.last_hidden_state
    hidden_states = tmp.hidden_states
    attentions = tmp.attentions
    cross_attention = tmp.cross_attentions
    print(last_hid_states.shape)
    print(len(hidden_states))
    print(len(attentions))
    print(len(cross_attention))
    # print(cross_attention[0])
    print(cross_attention[0].shape)

    ####################### BartDecoderPlus Test #############################

    ####################### BartPlus Test #############################
    bartP = BartPlus(keyBart, 100)
    tmp = bartP(
        inputs_embeds=data,
        decoder_inputs_embeds=data,
        output_attentions=True,
        output_hidden_states=True,
    )
    print(type(tmp))
    # print(tmp.__dict__)
    print(dir(tmp))
    last_hid_states = tmp.last_hidden_state
    hidden_states = tmp.decoder_hidden_states
    attentions = tmp.decoder_attentions
    cross_attention = tmp.cross_attentions
    print(last_hid_states.shape)
    print(len(hidden_states))
    print(len(attentions))
    print(len(cross_attention))
    # print(cross_attention[0])
    print(cross_attention[0].shape)
    ####################### BartPlus Test #############################

    ####################### Summary #############################
    from torchinfo import summary

    summary(bartP)
    # summary(bartDecoderP)
    ####################### Summary #############################

    ####################### KeyBartAdapter Test #############################
    keybart_adapter = KeyBartAdapter(100)
    tmp = keybart_adapter(
        inputs_embeds=data,
        decoder_inputs_embeds=data,
        output_attentions=True,
        output_hidden_states=True,
    )
    print(type(tmp))
    # print(tmp.__dict__)
    print(dir(tmp))
    last_hid_states = tmp.encoder_last_hidden_state
    hidden_states = tmp.decoder_hidden_states
    attentions = tmp.decoder_attentions
    cross_attention = tmp.cross_attentions
    print(last_hid_states.shape)
    print(len(hidden_states))
    print(len(attentions))
    print(len(cross_attention))
    # print(cross_attention[0])
    print(cross_attention[0].shape)
    summary(keybart_adapter)
    ####################### KeyBartAdapter Test #############################

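The shape-preserving assertion in the Adapter test above matches the usual bottleneck-adapter design: project down to a small hidden size, apply a nonlinearity, project back up, and add a residual connection. A minimal sketch of such a module, written independently of the repository (the actual lrt.clustering.models.adapter implementation may differ in its details):

    import torch
    from torch import nn

    class BottleneckAdapter(nn.Module):
        # down-project -> nonlinearity -> up-project, plus a residual add,
        # so the output shape always equals the input shape
        def __init__(self, input_dim: int, hidden_dim: int):
            super().__init__()
            self.down = nn.Linear(input_dim, hidden_dim)
            self.up = nn.Linear(hidden_dim, input_dim)
            self.act = nn.ReLU()

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            return x + self.up(self.act(self.down(x)))

    # mirrors the test above: (10, 20, 1024) in, (10, 20, 1024) out
    adapter = BottleneckAdapter(1024, 256)
    assert adapter(torch.randn(10, 20, 1024)).shape == (10, 20, 1024)
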
scripts/train/KeyBartAdapter_train.ipynb
ADDED
(The diff for this file is too large to render; see the raw diff.)

scripts/train/train.py
ADDED
@@ -0,0 +1,171 @@
def train(
        push_to_hub: bool,
        num_epoch: int,
        train_batch_size: int,
        eval_batch_size: int,
):
    import torch
    import numpy as np

    # 1. Dataset
    from datasets import load_dataset
    dataset = load_dataset("Adapting/abstract-keyphrases")

    # 2. Model
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
    from lrt.clustering.models import KeyBartAdapter
    tokenizer = AutoTokenizer.from_pretrained("Adapting/KeyBartAdapter")

    '''
    Or you can just use the initial model weights from Huggingface:
    model = AutoModelForSeq2SeqLM.from_pretrained("Adapting/KeyBartAdapter",
                                                  revision='9c3ed39c6ed5c7e141363e892d77cf8f589d5999')
    '''

    model = KeyBartAdapter(256)

    # 3. preprocess dataset
    dataset = dataset.shuffle()

    def preprocess_function(examples):
        inputs = examples['Abstract']
        targets = examples['Keywords']
        model_inputs = tokenizer(inputs, truncation=True)

        # Set up the tokenizer for targets
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(targets, truncation=True)

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized_dataset = dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=dataset["train"].column_names,
    )

    # 4. evaluation metrics
    def compute_metrics(eval_preds):
        preds = eval_preds.predictions
        labels = eval_preds.label_ids
        if isinstance(preds, tuple):
            preds = preds[0]
        print(preds.shape)
        if len(preds.shape) == 3:
            preds = preds.argmax(axis=-1)

        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        # Replace -100 in the labels as we can't decode them.
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # Some simple post-processing: keyphrases are ';'-separated
        decoded_preds = [a.strip().split(';') for a in decoded_preds]
        decoded_labels = [a.strip().split(';') for a in decoded_labels]

        precs, recalls, f_scores = [], [], []
        num_match, num_pred, num_gold = [], [], []
        for pred, label in zip(decoded_preds, decoded_labels):
            pred_set = set(pred)
            label_set = set(label)
            match_set = label_set.intersection(pred_set)
            p = float(len(match_set)) / float(len(pred_set)) if len(pred_set) > 0 else 0.0
            r = float(len(match_set)) / float(len(label_set)) if len(label_set) > 0 else 0.0
            f1 = float(2 * (p * r)) / (p + r) if (p + r) > 0 else 0.0
            precs.append(p)
            recalls.append(r)
            f_scores.append(f1)
            num_match.append(len(match_set))
            num_pred.append(len(pred_set))
            num_gold.append(len(label_set))

            # print(f'raw_PRED: {raw_pred}')
            print(f'PRED: num={len(pred_set)} - {pred_set}')
            print(f'GT: num={len(label_set)} - {label_set}')
            print(f'p={p}, r={r}, f1={f1}')
            print('-' * 20)

        result = {
            'precision@M': np.mean(precs) * 100.0,
            'recall@M': np.mean(recalls) * 100.0,
            'fscore@M': np.mean(f_scores) * 100.0,
            'num_match': np.mean(num_match),
            'num_pred': np.mean(num_pred),
            'num_gold': np.mean(num_gold),
        }

        result = {k: round(v, 2) for k, v in result.items()}
        return result

    # 5. train
    from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    model_name = 'KeyBartAdapter'

    args = Seq2SeqTrainingArguments(
        model_name,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=train_batch_size,
        per_device_eval_batch_size=eval_batch_size,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=num_epoch,
        logging_steps=4,
        load_best_model_at_end=True,
        metric_for_best_model='fscore@M',
        predict_with_generate=True,
        fp16=torch.cuda.is_available(),  # speeds up training on modern GPUs
        # eval_accumulation_steps=10,
    )

    trainer = Seq2SeqTrainer(
        model,
        args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["train"],  # note: evaluation runs on the training split
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()

    # 6. push
    if push_to_hub:
        commit_msg = f'{model_name}_{num_epoch}'
        tokenizer.push_to_hub(commit_message=commit_msg, repo_id=model_name)
        model.push_to_hub(commit_message=commit_msg, repo_id=model_name)

    return model, tokenizer


if __name__ == '__main__':
    import sys
    from pathlib import Path
    project_root = Path(__file__).parent.parent.parent.absolute()
    sys.path.append(str(project_root))

    # code
    import argparse
    parser = argparse.ArgumentParser()

    parser.add_argument("--epoch", help="number of epochs", default=30)
    parser.add_argument("--train_batch_size", help="training batch size", default=16)
    parser.add_argument("--eval_batch_size", help="evaluation batch size", default=16)
    parser.add_argument("--push", help="whether to push the model to the hub", action='store_true')

    args = parser.parse_args()
    print(args)

    model, tokenizer = train(
        push_to_hub=bool(args.push),
        num_epoch=int(args.epoch),
        train_batch_size=int(args.train_batch_size),
        eval_batch_size=int(args.eval_batch_size)
    )

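To make compute_metrics concrete: each example's predicted and gold keyphrases are compared as sets, and exact-match precision, recall, and F1 are averaged over examples (then scaled by 100). A hand-worked instance with hypothetical keyphrases:

    pred_set = {'machine translation', 'wmt 2020', 'nlp'}
    label_set = {'machine translation', 'news translation', 'wmt 2020'}
    match_set = pred_set & label_set  # {'machine translation', 'wmt 2020'}

    p = len(match_set) / len(pred_set)    # 2/3 ~ 0.667
    r = len(match_set) / len(label_set)   # 2/3 ~ 0.667
    f1 = 2 * p * r / (p + r)              # 2/3 ~ 0.667
    print(round(p, 3), round(r, 3), round(f1, 3))  # 0.667 0.667 0.667

Because matching is by exact string equality after splitting on ';', near-miss keyphrases (e.g. singular vs. plural) count as misses; this is the usual strict "macro" keyphrase metric.
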
setup.py
ADDED
@@ -0,0 +1,23 @@
from setuptools import setup, find_packages

with open("README.md", "r") as readme_file:
    readme = readme_file.read()

requirements = ["sentence_transformers"]

setup(
    name="LiteratureResearchTool",
    version="0.4.0",
    author="Tao Xiang",
    author_email="tao.xiang@tum.de",
    description="A tool for literature research and analysis",
    long_description=readme,
    long_description_content_type="text/markdown",
    url="https://github.com/Mondkuchen/idp_LiteratureResearch_Tool",
    packages=find_packages(),
    install_requires=requirements,
    classifiers=[
        "Programming Language :: Python :: 3.7",
        "License :: OSI Approved :: MIT License",
    ],
)
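With this setup.py in place, the package installs in the usual ways, e.g. `pip install -e .` from a local checkout for development, or `pip install git+https://github.com/Mondkuchen/idp_LiteratureResearch_Tool.git` straight from GitHub.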