add GLM code
- LICENSE.txt +201 -0
- added_tokens.json +10 -0
- config.json +34 -0
- configuration_glm.py +136 -0
- generation_config.json +4 -0
- merges.txt +0 -0
- modeling_glm.py +975 -0
- tokenization_glm.py +362 -0
- tokenizer_config.json +18 -0
- vocab.json +0 -0
LICENSE.txt
ADDED
@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.

"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:

(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and

(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and

(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and

(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.

You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright Zhengxiao Du

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
added_tokens.json
ADDED
@@ -0,0 +1,10 @@
{
  "<|startofpiece|>": 50257,
  "<|endofpiece|>": 50258,
  "[CLS]": 50259,
  "[MASK]": 50260,
  "[SEP]": 50261,
  "[UNUSED]": 50262,
  "[gMASK]": 50263,
  "[sMASK]": 50264
}
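Not part of the diff above, just context: these entries extend GPT-2's 50,257 BPE tokens with GLM's control tokens, so ids 50257-50264 are reserved for them. A minimal sketch of checking that mapping, assuming a local clone of this repository at a placeholder path and assuming the tokenizer_config.json added in this commit registers the custom GLMTokenizer so `trust_remote_code=True` can resolve it:

```python
from transformers import AutoTokenizer

# Placeholder path: a local clone of this repository (hypothetical location).
tokenizer = AutoTokenizer.from_pretrained("./WebGLM", trust_remote_code=True)

# The added tokens sit directly after the 50,257 GPT-2 BPE entries.
for token in ["<|startofpiece|>", "<|endofpiece|>", "[MASK]", "[gMASK]", "[sMASK]"]:
    print(token, tokenizer.convert_tokens_to_ids(token))
# Expected ids per added_tokens.json: 50257, 50258, 50260, 50263, 50264
```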
config.json
ADDED
@@ -0,0 +1,34 @@
{
  "_name_or_path": "/workspace/hanyu/hanyu/WebGLM-HGF/WebGLM",
  "architectures": [
    "GLMForConditionalGeneration"
  ],
  "attention_dropout_prob": 0.1,
  "attention_scale": 1.0,
  "auto_map": {
    "AutoConfig": "configuration_glm.GLMConfig",
    "AutoModel": "modeling_glm.GLMModel",
    "AutoModelForMultipleChoice": "modeling_glm.GLMForMultipleChoice",
    "AutoModelForSeq2SeqLM": "modeling_glm.GLMForConditionalGeneration"
  },
  "block_position_encoding": true,
  "checkpoint_activations": false,
  "checkpoint_num_layers": 1,
  "embedding_dropout_prob": 0.1,
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "max_sequence_length": 1024,
  "model_type": "glm",
  "num_attention_heads": 64,
  "num_layers": 48,
  "output_dropout_prob": 0.1,
  "output_predict": true,
  "parallel_output": true,
  "pool_token": "cls",
  "relative_encoding": false,
  "spell_func": "lstm",
  "spell_length": null,
  "torch_dtype": "float32",
  "transformers_version": "4.27.4",
  "vocab_size": 50304
}
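Not part of the diff above: the `auto_map` block is what lets a stock `transformers` install resolve the custom classes shipped in this commit. A minimal loading sketch, assuming a local clone of the repository at a placeholder path; `trust_remote_code=True` is required so that `configuration_glm.py` and `modeling_glm.py` are actually imported:

```python
import torch
from transformers import AutoConfig, AutoModelForSeq2SeqLM

model_path = "./WebGLM"  # placeholder: local clone of this repository

# AutoConfig resolves to configuration_glm.GLMConfig via auto_map.
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
print(config.num_layers, config.hidden_size, config.num_attention_heads)  # 48 4096 64

# AutoModelForSeq2SeqLM resolves to modeling_glm.GLMForConditionalGeneration via auto_map.
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    torch_dtype=torch.float32,  # matches "torch_dtype" in config.json
)
```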
configuration_glm.py
ADDED
@@ -0,0 +1,136 @@
# coding=utf-8
# Copyright 2022 shunxing1234 and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" GLM model configuration """

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)

GLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "shunxing1234/GLM": "https://huggingface.co/shunxing1234/GLM/resolve/main/config.json",
    # See all GLM models at https://huggingface.co/models?filter=glm
}


class GLMConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`~GLMModel`].
    It is used to instantiate a GLM model according to the specified arguments, defining the model
    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
    the GLM [shunxing1234/GLM-base-cased](https://huggingface.co/shunxing1234/GLM-base-cased) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used
    to control the model outputs. Read the documentation from [`PretrainedConfig`]
    for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 30522):
            Vocabulary size of the GLM model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`~GLMModel`] or
            [`~TFGLMModel`].
        hidden_size (`int`, *optional*, defaults to 768):
            Dimension of the encoder layers and the pooler layer.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler.
            If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        max_position_embeddings (`int`, *optional*, defaults to 512):
            The maximum sequence length that this model might ever be used with.
            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
        type_vocab_size (`int`, *optional*, defaults to 2):
            The vocabulary size of the `token_type_ids` passed when calling [`~GLMModel`] or
            [`~TFGLMModel`].
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
    Example:

    ```python
    >>> from transformers import GLMModel, GLMConfig

    >>> # Initializing a GLM shunxing1234/GLM-base-cased style configuration
    >>> configuration = GLMConfig()

    >>> # Initializing a model from the shunxing1234/GLM-base-cased style configuration
    >>> model = GLMModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """
    model_type = "glm"
    attribute_map = {
        "num_hidden_layers": "num_layers"
    }

    def __init__(
            self,
            num_layers=24,
            vocab_size=30592,
            hidden_size=1024,
            num_attention_heads=16,
            embedding_dropout_prob=0.1,
            attention_dropout_prob=0.1,
            output_dropout_prob=0.1,
            max_sequence_length=512,
            checkpoint_activations=False,
            checkpoint_num_layers=1,
            parallel_output=True,
            relative_encoding=False,
            block_position_encoding=True,
            output_predict=False,
            spell_length=None,
            spell_func="lstm",
            attention_scale=1.0,
            initializer_range=0.02,
            pool_token="cls",
            **kwargs
    ):
        self.num_layers = num_layers
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.embedding_dropout_prob = embedding_dropout_prob
        self.attention_dropout_prob = attention_dropout_prob
        self.output_dropout_prob = output_dropout_prob
        self.max_sequence_length = max_sequence_length
        self.checkpoint_activations = checkpoint_activations
        self.checkpoint_num_layers = checkpoint_num_layers
        self.parallel_output = parallel_output
        self.relative_encoding = relative_encoding
        self.block_position_encoding = block_position_encoding
        self.output_predict = output_predict
        self.spell_length = spell_length
        self.spell_func = spell_func
        self.attention_scale = attention_scale
        self.initializer_range = initializer_range
        self.pool_token = pool_token

        super().__init__(**kwargs)
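Not part of the file above: a short sketch of how `GLMConfig` lines up with the checkpoint's `config.json`. The keyword arguments mirror the JSON keys one-for-one, and `attribute_map` lets the standard `num_hidden_layers` name resolve to GLM's own `num_layers` field. Run it from a local clone of this repository so `configuration_glm.py` is importable:

```python
from configuration_glm import GLMConfig  # the file added in this commit

# Mirror the values stored in config.json for this checkpoint.
config = GLMConfig(
    num_layers=48,
    vocab_size=50304,
    hidden_size=4096,
    num_attention_heads=64,
    max_sequence_length=1024,
    block_position_encoding=True,
    output_predict=True,
)

# attribute_map aliases the standard transformers name onto GLM's field.
assert config.num_hidden_layers == config.num_layers == 48
```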
generation_config.json
ADDED
@@ -0,0 +1,4 @@
{
  "_from_model_config": true,
  "transformers_version": "4.27.4"
}
merges.txt
ADDED
The diff for this file is too large to render.
modeling_glm.py
ADDED
@@ -0,0 +1,975 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding=utf-8
|
2 |
+
# Copyright 2022 shunxing1234 The HuggingFace Inc. team. All rights reserved.
|
3 |
+
#
|
4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
# you may not use this file except in compliance with the License.
|
6 |
+
# You may obtain a copy of the License at
|
7 |
+
#
|
8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
#
|
10 |
+
# Unless required by applicable law or agreed to in writing, software
|
11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
# See the License for the specific language governing permissions and
|
14 |
+
# limitations under the License.
|
15 |
+
""" PyTorch GLM model. """
|
16 |
+
|
17 |
+
import math
|
18 |
+
|
19 |
+
import torch
|
20 |
+
import torch.utils.checkpoint
|
21 |
+
import torch.nn.functional as F
|
22 |
+
from torch.nn import init, LayerNorm, Linear, CrossEntropyLoss
|
23 |
+
|
24 |
+
from transformers.activations import gelu
|
25 |
+
from transformers.utils import (
|
26 |
+
add_code_sample_docstrings,
|
27 |
+
add_start_docstrings,
|
28 |
+
add_start_docstrings_to_model_forward,
|
29 |
+
)
|
30 |
+
from transformers.modeling_outputs import (
|
31 |
+
BaseModelOutputWithPastAndCrossAttentions,
|
32 |
+
ModelOutput,
|
33 |
+
SequenceClassifierOutput,
|
34 |
+
)
|
35 |
+
|
36 |
+
from transformers.modeling_utils import (
|
37 |
+
PreTrainedModel,
|
38 |
+
)
|
39 |
+
from .configuration_glm import GLMConfig
|
40 |
+
from torch.nn.parameter import Parameter
|
41 |
+
|
42 |
+
_CHECKPOINT_FOR_DOC = "shunxing1234/GLM"
|
43 |
+
_CONFIG_FOR_DOC = "GLMConfig"
|
44 |
+
_TOKENIZER_FOR_DOC = "GLMTokenizer"
|
45 |
+
|
46 |
+
GLM_PRETRAINED_MODEL_ARCHIVE_LIST = [
|
47 |
+
"shunxing1234/GLM",
|
48 |
+
# See all GLM models at https://huggingface.co/models?filter=glm
|
49 |
+
]
|
50 |
+
|
51 |
+
|
52 |
+
def unscaled_init_method(sigma):
|
53 |
+
"""Init method based on N(0, sigma)."""
|
54 |
+
|
55 |
+
def init_(tensor):
|
56 |
+
return torch.nn.init.normal_(tensor, mean=0.0, std=sigma)
|
57 |
+
|
58 |
+
return init_
|
59 |
+
|
60 |
+
|
61 |
+
def scaled_init_method(mean, std, num_layers):
|
62 |
+
"""Init method based on N(0, sigma/sqrt(2*num_layers)."""
|
63 |
+
std = std / math.sqrt(2.0 * num_layers)
|
64 |
+
|
65 |
+
def init_(tensor):
|
66 |
+
return torch.nn.init.normal_(tensor, mean=mean, std=std)
|
67 |
+
|
68 |
+
return init_
|
69 |
+
|
70 |
+
|
71 |
+
def ensure_divisibility(numerator, denominator):
|
72 |
+
"""Ensure that numerator is divisible by the denominator."""
|
73 |
+
assert numerator % denominator == 0, '{} is not divisible by {}'.format(
|
74 |
+
numerator, denominator)
|
75 |
+
|
76 |
+
|
77 |
+
def divide(numerator, denominator):
|
78 |
+
"""Ensure that numerator is divisible by the denominator and return
|
79 |
+
the division value."""
|
80 |
+
ensure_divisibility(numerator, denominator)
|
81 |
+
return numerator // denominator
|
82 |
+
|
83 |
+
|
84 |
+
def split_tensor_along_last_dim(tensor, num_partitions,
|
85 |
+
contiguous_split_chunks=False):
|
86 |
+
"""Split a tensor along its last dimension.
|
87 |
+
Arguments:
|
88 |
+
tensor: input tensor.
|
89 |
+
num_partitions: number of partitions to split the tensor
|
90 |
+
contiguous_split_chunks: If True, make each chunk contiguous
|
91 |
+
in memory.
|
92 |
+
"""
|
93 |
+
# Get the size and dimension.
|
94 |
+
last_dim = tensor.dim() - 1
|
95 |
+
last_dim_size = divide(tensor.size()[last_dim], num_partitions)
|
96 |
+
# Split.
|
97 |
+
tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
|
98 |
+
# Note: torch.split does not create contiguous tensors by default.
|
99 |
+
if contiguous_split_chunks:
|
100 |
+
return tuple(chunk.contiguous() for chunk in tensor_list)
|
101 |
+
|
102 |
+
return tensor_list
|
103 |
+
|
104 |
+
|
105 |
+
class MLP(torch.nn.Module):
|
106 |
+
"""MLP for GPT2.
|
107 |
+
|
108 |
+
MLP will take the input with h hidden state, project it to 4*h
|
109 |
+
hidden dimension, perform gelu transformation, and project the
|
110 |
+
state back into h hidden dimension. At the end, dropout is also
|
111 |
+
applied.
|
112 |
+
|
113 |
+
Arguments:
|
114 |
+
hidden_size: The hidden size of the self attention.
|
115 |
+
output_dropout_prob: dropout probability for the outputs
|
116 |
+
after self attention and final output.
|
117 |
+
init_method: initialization method used for the weights. Note
|
118 |
+
that all biases are initialized to zero and
|
119 |
+
layernorm weight are initialized to one.
|
120 |
+
output_layer_init_method: output layer initialization. If None,
|
121 |
+
use `init_method`.
|
122 |
+
"""
|
123 |
+
|
124 |
+
def __init__(self, hidden_size, output_dropout_prob, init_method,
|
125 |
+
output_layer_init_method=None):
|
126 |
+
super(MLP, self).__init__()
|
127 |
+
# Set output layer initialization if not provided.
|
128 |
+
if output_layer_init_method is None:
|
129 |
+
output_layer_init_method = init_method
|
130 |
+
# Project to 4h.
|
131 |
+
self.dense_h_to_4h = Linear(hidden_size, 4 * hidden_size)
|
132 |
+
|
133 |
+
# Project back to h.
|
134 |
+
self.dense_4h_to_h = Linear(
|
135 |
+
4 * hidden_size,
|
136 |
+
hidden_size)
|
137 |
+
|
138 |
+
self.dropout = torch.nn.Dropout(output_dropout_prob)
|
139 |
+
|
140 |
+
def forward(self, hidden_states):
|
141 |
+
# [b, s, 4hp]
|
142 |
+
intermediate_parallel = self.dense_h_to_4h(hidden_states)
|
143 |
+
intermediate_parallel = gelu(intermediate_parallel)
|
144 |
+
|
145 |
+
# [b, s, h]
|
146 |
+
output = self.dense_4h_to_h(intermediate_parallel)
|
147 |
+
output = self.dropout(output)
|
148 |
+
return output
|
149 |
+
|
150 |
+
|
151 |
+
class VocabEmbedding(torch.nn.Module):
|
152 |
+
"""Embedding parallelized in the vocabulary dimension.
|
153 |
+
|
154 |
+
This is mainly adapted from torch.nn.Embedding and all the default
|
155 |
+
values are kept.
|
156 |
+
Arguments:
|
157 |
+
num_embeddings: vocabulary size.
|
158 |
+
embedding_dim: size of hidden state.
|
159 |
+
init_method: method to initialize weights.
|
160 |
+
"""
|
161 |
+
|
162 |
+
def __init__(self, config):
|
163 |
+
super(VocabEmbedding, self).__init__()
|
164 |
+
# Keep the input dimensions.
|
165 |
+
self.num_embeddings = config.vocab_size
|
166 |
+
self.embedding_dim = config.hidden_size
|
167 |
+
# Set the detauls for compatibility.
|
168 |
+
self.padding_idx = None
|
169 |
+
self.max_norm = None
|
170 |
+
self.norm_type = 2.
|
171 |
+
self.scale_grad_by_freq = False
|
172 |
+
self.sparse = False
|
173 |
+
self._weight = None
|
174 |
+
|
175 |
+
self.vocab_start_index = 0
|
176 |
+
self.vocab_end_index = self.num_embeddings
|
177 |
+
|
178 |
+
# Allocate weights.
|
179 |
+
self.weight = Parameter(torch.Tensor(self.num_embeddings,
|
180 |
+
self.embedding_dim))
|
181 |
+
# And initialize.
|
182 |
+
init.xavier_normal_(self.weight)
|
183 |
+
|
184 |
+
def forward(self, input_):
|
185 |
+
# Get the embeddings.
|
186 |
+
output = F.embedding(input_, self.weight,
|
187 |
+
self.padding_idx, self.max_norm,
|
188 |
+
self.norm_type, self.scale_grad_by_freq,
|
189 |
+
self.sparse)
|
190 |
+
return output
|
191 |
+
|
192 |
+
|
193 |
+
class PositionalEmbedding(torch.nn.Module):
|
194 |
+
|
195 |
+
def __init__(self, hidden_size):
|
196 |
+
super(PositionalEmbedding, self).__init__()
|
197 |
+
|
198 |
+
self.hidden_size = hidden_size
|
199 |
+
|
200 |
+
inv_freq = 1 / (10000 ** (torch.arange(0.0, hidden_size, 2.0) / hidden_size))
|
201 |
+
self.register_buffer('inv_freq', inv_freq)
|
202 |
+
|
203 |
+
def forward(self, pos_seq, bsz=None):
|
204 |
+
sinusoid_inp = torch.ger(pos_seq, self.inv_freq)
|
205 |
+
pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1)
|
206 |
+
|
207 |
+
if bsz is not None:
|
208 |
+
return pos_emb[None, :, :].expand(bsz, -1, -1)
|
209 |
+
else:
|
210 |
+
return pos_emb[None, :, :]
|
211 |
+
|
212 |
+
|
213 |
+
class SelfAttention(torch.nn.Module):
|
214 |
+
"""self-attention layer for GLM.
|
215 |
+
|
216 |
+
Self-attention layer takes input with size [b, s, h] where b is
|
217 |
+
the batch size, s is the sequence lenght, and h is the hidden size
|
218 |
+
and creates output of the same size.
|
219 |
+
Arguments:
|
220 |
+
hidden_size: total hidden size of the layer (h).
|
221 |
+
num_attention_heads: number of attention heads (n). Note that we
|
222 |
+
require n to be divisible by number of GPUs
|
223 |
+
used to parallelize the model. Also, we
|
224 |
+
require hidden size to be divisible by n.
|
225 |
+
attention_dropout_prob: dropout probability for the attention scores.
|
226 |
+
init_method: weight initialization.
|
227 |
+
output_layer_init_method: output layer initialization. If None, use
|
228 |
+
`init_method`.
|
229 |
+
We use the following notation:
|
230 |
+
h: hidden_size
|
231 |
+
n: num_attention_heads
|
232 |
+
p: number of partitions
|
233 |
+
np: n/p
|
234 |
+
hp: h/p
|
235 |
+
hn: h/n
|
236 |
+
b: batch size
|
237 |
+
s: sequence length
|
238 |
+
"""
|
239 |
+
|
240 |
+
def __init__(self, hidden_size, num_attention_heads,
|
241 |
+
attention_dropout_prob, output_dropout_prob,
|
242 |
+
init_method, output_layer_init_method=None,
|
243 |
+
attention_scale=1.0):
|
244 |
+
super(SelfAttention, self).__init__()
|
245 |
+
# Set output layer initialization if not provided.
|
246 |
+
if output_layer_init_method is None:
|
247 |
+
output_layer_init_method = init_method
|
248 |
+
# Per attention head and per partition values.
|
249 |
+
self.hidden_size = hidden_size
|
250 |
+
self.hidden_size_per_attention_head = divide(hidden_size,
|
251 |
+
num_attention_heads)
|
252 |
+
|
253 |
+
self.num_attention_heads = num_attention_heads
|
254 |
+
self.attention_scale = attention_scale
|
255 |
+
# Strided linear layer.
|
256 |
+
self.query_key_value = Linear(hidden_size, 3 * hidden_size)
|
257 |
+
|
258 |
+
# Dropout. Note that for a single iteration, this layer will generate
|
259 |
+
# different outputs on different number of parallel partitions but
|
260 |
+
# on average it should not be partition dependent.
|
261 |
+
self.attention_dropout = torch.nn.Dropout(attention_dropout_prob)
|
262 |
+
|
263 |
+
# Output.
|
264 |
+
self.dense = Linear(hidden_size,
|
265 |
+
hidden_size)
|
266 |
+
self.output_dropout = torch.nn.Dropout(output_dropout_prob)
|
267 |
+
|
268 |
+
def _transpose_for_scores(self, tensor):
|
269 |
+
"""Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with
|
270 |
+
size [b, np, s, hn].
|
271 |
+
"""
|
272 |
+
new_tensor_shape = tensor.size()[:-1] + \
|
273 |
+
(self.num_attention_heads,
|
274 |
+
self.hidden_size_per_attention_head)
|
275 |
+
tensor = tensor.view(*new_tensor_shape)
|
276 |
+
return tensor.permute(0, 2, 1, 3)
|
277 |
+
|
278 |
+
def forward(self, hidden_states, ltor_mask, mem=None):
|
279 |
+
# hidden_states: [b, s, h]
|
280 |
+
# ltor_mask: [b,1,s,s]
|
281 |
+
|
282 |
+
# Attention heads. [b, s, hp]
|
283 |
+
query_length = hidden_states.size(1)
|
284 |
+
# self attention
|
285 |
+
if mem is None:
|
286 |
+
mixed_x_layer = self.query_key_value(hidden_states)
|
287 |
+
(mixed_query_layer,
|
288 |
+
mixed_key_layer,
|
289 |
+
mixed_value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)
|
290 |
+
else:
|
291 |
+
cat = torch.cat((mem, hidden_states), 1)
|
292 |
+
mixed_x_layer = self.query_key_value(cat)
|
293 |
+
(mixed_query_layer,
|
294 |
+
mixed_key_layer,
|
295 |
+
mixed_value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)
|
296 |
+
mixed_query_layer = mixed_query_layer[:, -query_length:]
|
297 |
+
|
298 |
+
# Reshape and transpose [b, np, s, hn]
|
299 |
+
query_layer = self._transpose_for_scores(mixed_query_layer)
|
300 |
+
key_layer = self._transpose_for_scores(mixed_key_layer)
|
301 |
+
value_layer = self._transpose_for_scores(mixed_value_layer)
|
302 |
+
|
303 |
+
if self.attention_scale > 1.0:
|
304 |
+
# Raw attention scores. [b, np, s, s]
|
305 |
+
attention_scores = torch.matmul(query_layer / math.sqrt(self.attention_scale),
|
306 |
+
key_layer.transpose(-1, -2) / math.sqrt(
|
307 |
+
self.hidden_size_per_attention_head * self.attention_scale))
|
308 |
+
else:
|
309 |
+
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2) / math.sqrt(
|
310 |
+
self.hidden_size_per_attention_head))
|
311 |
+
|
312 |
+
# Apply the left to right attention mask.
|
313 |
+
ltor_mask = ltor_mask.type_as(attention_scores)
|
314 |
+
attention_scores = torch.mul(attention_scores, ltor_mask)
|
315 |
+
if self.attention_scale > 1.0:
|
316 |
+
max_attention_scores = attention_scores.max(dim=-1, keepdim=True)[0]
|
317 |
+
attention_scores -= max_attention_scores
|
318 |
+
attention_scores *= self.attention_scale
|
319 |
+
|
320 |
+
attention_scores = attention_scores + (-65504.0) * (1.0 - ltor_mask)
|
321 |
+
# Attention probabilities. [b, np, s, s]
|
322 |
+
attention_probs = torch.nn.Softmax(dim=-1)(attention_scores)
|
323 |
+
# This is actually dropping out entire tokens to attend to, which might
|
324 |
+
# seem a bit unusual, but is taken from the original Transformer paper.
|
325 |
+
# with get_cuda_rng_tracker().fork():
|
326 |
+
attention_probs = self.attention_dropout(attention_probs)
|
327 |
+
|
328 |
+
# Context layer.
|
329 |
+
# [b, np, s, hn]
|
330 |
+
context_layer = torch.matmul(attention_probs, value_layer)
|
331 |
+
# [b, s, np, hn]
|
332 |
+
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
|
333 |
+
new_context_layer_shape = context_layer.size()[:-2] + \
|
334 |
+
(self.hidden_size,)
|
335 |
+
# [b, s, hp]
|
336 |
+
context_layer = context_layer.view(*new_context_layer_shape)
|
337 |
+
|
338 |
+
# Output. [b, s, h]
|
339 |
+
output = self.dense(context_layer)
|
340 |
+
output = self.output_dropout(output)
|
341 |
+
|
342 |
+
return output
|
343 |
+
|
344 |
+
|
345 |
+
class GLMBlock(torch.nn.Module):
|
346 |
+
"""A single layer transformer for GLM.
|
347 |
+
|
348 |
+
We use the following notation:
|
349 |
+
h: hidden size
|
350 |
+
n: number of attention heads
|
351 |
+
b: batch size
|
352 |
+
s: sequence length
|
353 |
+
Transformore layer takes input with size [b, s, h] and returns an
|
354 |
+
output of the same size.
|
355 |
+
|
356 |
+
Arguments:
|
357 |
+
hidden_size: The hidden size of the self attention.
|
358 |
+
num_attention_heads: number of attention head in the self
|
359 |
+
attention.
|
360 |
+
attention_dropout_prob: dropout probability of the attention
|
361 |
+
score in self attention.
|
362 |
+
output_dropout_prob: dropout probability for the outputs
|
363 |
+
after self attention and final output.
|
364 |
+
layernorm_epsilon: epsilon used in layernorm to avoid
|
365 |
+
division by zero.
|
366 |
+
init_method: initialization method used for the weights. Note
|
367 |
+
that all biases are initialized to zero and
|
368 |
+
layernorm weight are initialized to one.
|
369 |
+
output_layer_init_method: output layers (attention output and
|
370 |
+
mlp output) initialization. If None,
|
371 |
+
use `init_method`.
|
372 |
+
"""
|
373 |
+
|
374 |
+
def __init__(self,
|
375 |
+
hidden_size,
|
376 |
+
num_attention_heads,
|
377 |
+
attention_dropout_prob,
|
378 |
+
output_dropout_prob,
|
379 |
+
layernorm_epsilon,
|
380 |
+
init_method,
|
381 |
+
output_layer_init_method=None,
|
382 |
+
attention_scale=1.0):
|
383 |
+
super(GLMBlock, self).__init__()
|
384 |
+
# Set output layer initialization if not provided.
|
385 |
+
if output_layer_init_method is None:
|
386 |
+
output_layer_init_method = init_method
|
387 |
+
|
388 |
+
# Layernorm on the input data.
|
389 |
+
self.input_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)
|
390 |
+
|
391 |
+
# Self attention.
|
392 |
+
self.attention = SelfAttention(
|
393 |
+
hidden_size,
|
394 |
+
num_attention_heads,
|
395 |
+
attention_dropout_prob,
|
396 |
+
output_dropout_prob,
|
397 |
+
init_method,
|
398 |
+
output_layer_init_method=output_layer_init_method,
|
399 |
+
attention_scale=attention_scale)
|
400 |
+
|
401 |
+
# Layernorm on the input data.
|
402 |
+
self.post_attention_layernorm = LayerNorm(hidden_size,
|
403 |
+
eps=layernorm_epsilon)
|
404 |
+
|
405 |
+
# MLP
|
406 |
+
self.mlp = MLP(
|
407 |
+
hidden_size,
|
408 |
+
output_dropout_prob,
|
409 |
+
init_method,
|
410 |
+
output_layer_init_method=output_layer_init_method)
|
411 |
+
|
412 |
+
def forward(self, hidden_states, ltor_mask, mem=None):
|
413 |
+
# hidden_states: [b, s, h]
|
414 |
+
# ltor_mask: [b,1, s,s]
|
415 |
+
|
416 |
+
# Layer norm at the begining of the transformer layer.
|
417 |
+
layernorm_output = self.input_layernorm(hidden_states)
|
418 |
+
mem = self.input_layernorm(mem) if mem is not None else None
|
419 |
+
# Self attention.
|
420 |
+
attention_output = self.attention(layernorm_output, ltor_mask, mem)
|
421 |
+
# Residual connection.
|
422 |
+
layernorm_input = hidden_states + attention_output
|
423 |
+
# Layer norm post the self attention.
|
424 |
+
layernorm_output = self.post_attention_layernorm(layernorm_input)
|
425 |
+
# MLP.
|
426 |
+
mlp_output = self.mlp(layernorm_output)
|
427 |
+
# Second residual connection.
|
428 |
+
output = layernorm_input + mlp_output
|
429 |
+
|
430 |
+
return output
|
431 |
+
|
432 |
+
|
433 |
+
class GLMStack(torch.nn.Module):
|
434 |
+
"""GLM transformer.
|
435 |
+
|
436 |
+
This module takes input from embedding layer and it's output can
|
437 |
+
be used directly by a logit layer. It consists of L (num-layers)
|
438 |
+
blocks of:
|
439 |
+
layer norm
|
440 |
+
self attention
|
441 |
+
residual connection
|
442 |
+
layer norm
|
443 |
+
mlp
|
444 |
+
residual connection
|
445 |
+
followed by a final layer norm.
|
446 |
+
|
447 |
+
Arguments:
|
448 |
+
num_layers: Number of transformer layers.
|
449 |
+
hidden_size: The hidden size of the self attention.
|
450 |
+
num_attention_heads: number of attention head in the self
|
451 |
+
attention.
|
452 |
+
attention_dropout_prob: dropout probability of the attention
|
453 |
+
score in self attention.
|
454 |
+
output_dropout_prob: dropout probability for the outputs
|
455 |
+
after self attention and final output.
|
456 |
+
checkpoint_activations: if True, checkpoint activations.
|
457 |
+
checkpoint_num_layers: number of layers to checkpoint. This
|
458 |
+
is basically the chunk size in checkpoitning.
|
459 |
+
layernorm_epsilon: epsilon used in layernorm to avoid
|
460 |
+
division by zero.
|
461 |
+
init_method_std: standard deviation of the init method which has
|
462 |
+
the form N(0, std).
|
463 |
+
use_scaled_init_for_output_weights: If Ture use 1/sqrt(2*num_layers)
|
464 |
+
scaling for the output weights (
|
465 |
+
output of self attention and mlp).
|
466 |
+
"""
|
467 |
+
|
468 |
+
def __init__(self,
|
469 |
+
num_layers,
|
470 |
+
hidden_size,
|
471 |
+
num_attention_heads,
|
472 |
+
max_sequence_length,
|
473 |
+
embedding_dropout_prob,
|
474 |
+
attention_dropout_prob,
|
475 |
+
output_dropout_prob,
|
476 |
+
checkpoint_activations,
|
477 |
+
checkpoint_num_layers=1,
|
478 |
+
layernorm_epsilon=1.0e-5,
|
479 |
+
init_method_std=0.02,
|
480 |
+
use_scaled_init_for_output_weights=True,
|
481 |
+
block_position_encoding=False,
|
482 |
+
attention_scale=1.0,
|
483 |
+
):
|
484 |
+
super(GLMStack, self).__init__()
|
485 |
+
self.hidden_size = hidden_size
|
486 |
+
# Store activation checkpoiting flag.
|
487 |
+
self.checkpoint_activations = checkpoint_activations
|
488 |
+
self.checkpoint_num_layers = checkpoint_num_layers
|
489 |
+
|
490 |
+
output_layer_init_method = None
|
491 |
+
if use_scaled_init_for_output_weights:
|
492 |
+
output_layer_init_method = scaled_init_method(0.0, init_method_std,
|
493 |
+
num_layers)
|
494 |
+
# Embeddings dropout
|
495 |
+
self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)
|
496 |
+
self.block_position_encoding = block_position_encoding
|
497 |
+
|
498 |
+
# Position embedding (serial).
|
499 |
+
if block_position_encoding:
|
500 |
+
self.position_embeddings = torch.nn.Embedding(max_sequence_length + 1, hidden_size)
|
501 |
+
self.block_position_embeddings = torch.nn.Embedding(max_sequence_length + 1, hidden_size)
|
502 |
+
torch.nn.init.normal_(self.block_position_embeddings.weight, mean=0.0, std=init_method_std)
|
503 |
+
else:
|
504 |
+
self.position_embeddings = torch.nn.Embedding(max_sequence_length, hidden_size)
|
505 |
+
# Initialize the position embeddings.
|
506 |
+
torch.nn.init.normal_(self.position_embeddings.weight, mean=0.0, std=init_method_std)
|
507 |
+
|
508 |
+
def get_layer():
|
509 |
+
|
510 |
+
return GLMBlock(
|
511 |
+
hidden_size,
|
512 |
+
num_attention_heads,
|
513 |
+
attention_dropout_prob,
|
514 |
+
output_dropout_prob,
|
515 |
+
layernorm_epsilon,
|
516 |
+
unscaled_init_method(init_method_std),
|
517 |
+
output_layer_init_method=output_layer_init_method,
|
518 |
+
attention_scale=attention_scale)
|
519 |
+
|
520 |
+
# Transformer layers.
|
521 |
+
self.layers = torch.nn.ModuleList(
|
522 |
+
[get_layer() for _ in range(num_layers)])
|
523 |
+
|
524 |
+
# Final layer norm before output.
|
525 |
+
self.final_layernorm = LayerNorm(hidden_size, eps=layernorm_epsilon)
|
526 |
+
|
527 |
+
def forward(self, hidden_states, position_ids, attention_mask, memory_states=None):
|
528 |
+
|
529 |
+
batch_size, query_length = hidden_states.size()[:2]
|
530 |
+
memory_length = memory_states[0].size(1) if memory_states else 0
|
531 |
+
# attention mask is the beginning postion of B region, \in [0, query_len)
|
532 |
+
is_scalar = torch.numel(attention_mask) == 1
|
533 |
+
is_sep = is_scalar or torch.numel(attention_mask) == batch_size
|
534 |
+
if is_sep:
|
535 |
+
sep = attention_mask.item() if is_scalar else attention_mask
|
536 |
+
|
537 |
+
# conventional transformer
|
538 |
+
def build_mask_matrix(seq_length, sep, memory_length=0):
|
539 |
+
m = hidden_states.new_ones((1, seq_length, seq_length))
|
540 |
+
m = torch.tril(m)
|
541 |
+
if is_scalar:
|
542 |
+
m[0, :, :int(sep)] = 1
|
543 |
+
else:
|
544 |
+
m = m.expand(batch_size, -1, -1)
|
545 |
+
ids = torch.arange(seq_length, device=sep.device, dtype=sep.dtype).view(1, -1)
|
546 |
+
mask = ids < sep.view(-1, 1)
|
547 |
+
m = m.masked_fill(mask.unsqueeze(1).expand_as(m), 1)
|
548 |
+
if memory_length > 0:
|
549 |
+
m = m.expand(batch_size, -1, -1)
|
550 |
+
m = torch.cat((hidden_states.new_ones((batch_size, seq_length, memory_length)), m), dim=2)
|
551 |
+
m = m.unsqueeze(1)
|
552 |
+
return m
|
553 |
+
|
554 |
+
attention_mask = build_mask_matrix(query_length, sep, memory_length=memory_length)
|
555 |
+
else:
|
556 |
+
if attention_mask.dim() == 2:
|
557 |
+
attention_mask = attention_mask.unsqueeze(1).unsqueeze(1)
|
558 |
+
attention_mask = attention_mask[:, :, :, -query_length - memory_length:]
|
559 |
+
|
560 |
+
if self.block_position_encoding:
|
561 |
+
position_ids, block_position_ids = position_ids[:, 0], position_ids[:, 1]
|
562 |
+
position_embeddings = self.position_embeddings(position_ids)
|
563 |
+
|
564 |
+
hidden_states = hidden_states + position_embeddings
|
565 |
+
if self.block_position_encoding:
|
566 |
+
block_position_embeddings = self.block_position_embeddings(block_position_ids)
|
567 |
+
hidden_states = hidden_states + block_position_embeddings
|
568 |
+
hidden_states = self.embedding_dropout(hidden_states)
|
569 |
+
|
570 |
+
def check_detach(_hidden_states):
|
571 |
+
return _hidden_states.detach()
|
572 |
+
|
573 |
+
mem_layers = [check_detach(hidden_states)]
|
574 |
+
|
575 |
+
for i, layer in enumerate(self.layers):
|
576 |
+
|
577 |
+
args = [hidden_states, attention_mask]
|
578 |
+
|
579 |
+
def create_custom_forward(module):
|
580 |
+
def custom_forward(*inputs):
|
581 |
+
# None for past_key_value
|
582 |
+
return module(*inputs)
|
583 |
+
|
584 |
+
return custom_forward
|
585 |
+
|
586 |
+
mem_i = memory_states[i] if memory_states else None
|
587 |
+
|
588 |
+
if self.checkpoint_activations:
|
589 |
+
hidden_states = torch.utils.checkpoint.checkpoint(
|
590 |
+
create_custom_forward(layer),
|
591 |
+
hidden_states,
|
592 |
+
mem=mem_i,
|
593 |
+
)
|
594 |
+
else:
|
595 |
+
hidden_states = layer(*args, mem=mem_i)
|
596 |
+
mem_layers.append(check_detach(hidden_states))
|
597 |
+
|
598 |
+
# Final layer norm.
|
599 |
+
output = self.final_layernorm(hidden_states)
|
600 |
+
mem_layers = self.update_mems(mem_layers, memory_states)
|
601 |
+
return (output, mem_layers)
|
602 |
+
|
603 |
+
def update_mems(self, hiddens, mems):
|
604 |
+
memory_length = mems[0].size(1) if mems else 0
|
605 |
+
query_length = hiddens[0].size(1)
|
606 |
+
new_memory_length = memory_length + query_length
|
607 |
+
|
608 |
+
new_mems = []
|
609 |
+
# with torch.no_grad():
|
610 |
+
for i in range(len(hiddens)):
|
611 |
+
if new_memory_length <= query_length:
|
612 |
+
new_mems.append(hiddens[i][:, -new_memory_length:])
|
613 |
+
else:
|
614 |
+
new_mems.append(torch.cat((mems[i][:, -new_memory_length + query_length:], hiddens[i]), dim=1))
|
615 |
+
return new_mems
|
616 |
+
|
617 |
+
|
618 |
+
class GLMPreTrainedModel(PreTrainedModel):
|
619 |
+
"""
|
620 |
+
An abstract class to handle weights initialization and
|
621 |
+
a simple interface for downloading and loading pretrained models.
|
622 |
+
"""
|
623 |
+
|
624 |
+
config_class = GLMConfig
|
625 |
+
base_model_prefix = "glm"
|
626 |
+
supports_gradient_checkpointing = True
|
627 |
+
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
628 |
+
|
629 |
+
def _init_weights(self, module):
|
630 |
+
""" Initialize the weights """
|
631 |
+
if isinstance(module, torch.nn.Linear):
|
632 |
+
# Slightly different from the TF version which uses truncated_normal for initialization
|
633 |
+
# cf https://github.com/pytorch/pytorch/pull/5617
|
634 |
+
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
|
635 |
+
if module.bias is not None:
|
636 |
+
module.bias.data.zero_()
|
637 |
+
elif isinstance(module, torch.nn.Embedding):
|
638 |
+
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
|
639 |
+
if module.padding_idx is not None:
|
640 |
+
module.weight.data[module.padding_idx].zero_()
|
641 |
+
elif isinstance(module, torch.nn.LayerNorm):
|
642 |
+
module.bias.data.zero_()
|
643 |
+
module.weight.data.fill_(1.0)
|
644 |
+
|
645 |
+
def _set_gradient_checkpointing(self, module, value=False):
|
646 |
+
if isinstance(module, GLMModel):
|
647 |
+
module.gradient_checkpointing = value
|
648 |
+
|
649 |
+
|
650 |
+
GLM_START_DOCSTRING = r"""
|
651 |
+
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class.
|
652 |
+
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
|
653 |
+
usage and behavior.
|
654 |
+
|
655 |
+
Parameters:
|
656 |
+
config ([`~GLMConfig`]): Model configuration class with all the parameters of the model.
|
657 |
+
Initializing with a config file does not load the weights associated with the model, only the configuration.
|
658 |
+
Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
|
659 |
+
"""
|
660 |
+
|
661 |
+
GLM_INPUTS_DOCSTRING = r"""
|
662 |
+
Args:
|
663 |
+
input_ids (`torch.LongTensor` of shape `({0})`):
|
664 |
+
Indices of input sequence tokens in the vocabulary.
|
665 |
+
|
666 |
+
Indices can be obtained using [`GLMTokenizer`].
|
667 |
+
See [`PreTrainedTokenizer.encode`] and
|
668 |
+
[`PreTrainedTokenizer.__call__`] for details.
|
669 |
+
|
670 |
+
[What are input IDs?](../glossary#input-ids)
|
671 |
+
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
|
672 |
+
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
|
673 |
+
|
674 |
+
- 1 for tokens that are **not masked**,
|
675 |
+
- 0 for tokens that are **masked**.
|
676 |
+
|
677 |
+
[What are attention masks?](../glossary#attention-mask)
|
678 |
+
token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
|
679 |
+
Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
|
680 |
+
|
681 |
+
- 0 corresponds to a *sentence A* token,
|
682 |
+
- 1 corresponds to a *sentence B* token.
|
683 |
+
|
684 |
+
[What are token type IDs?](../glossary#token-type-ids)
|
685 |
+
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
|
686 |
+
Indices of positions of each input sequence tokens in the position embeddings.
|
687 |
+
Selected in the range `[0, config.max_position_embeddings - 1]`.
|
688 |
+
|
689 |
+
[What are position IDs?](../glossary#position-ids)
|
690 |
+
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
|
691 |
+
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
|
692 |
+
|
693 |
+
- 1 indicates the head is **not masked**,
|
694 |
+
- 0 indicates the head is **masked**.
|
695 |
+
|
696 |
+
inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
|
697 |
+
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
|
698 |
+
This is useful if you want more control over how to convert *input_ids* indices into associated vectors
|
699 |
+
than the model's internal embedding lookup matrix.
|
700 |
+
output_attentions (`bool`, *optional*):
|
701 |
+
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
|
702 |
+
tensors for more detail.
|
703 |
+
output_hidden_states (`bool`, *optional*):
|
704 |
+
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
|
705 |
+
more detail.
|
706 |
+
return_dict (`bool`, *optional*):
|
707 |
+
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
708 |
+
"""
|
709 |
+
|
710 |
+
@add_start_docstrings(
    "The bare GLM Model transformer outputting raw hidden-states without any specific head on top.",
    GLM_START_DOCSTRING,
)
class GLMModel(GLMPreTrainedModel):
    """
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in
    [Attention is all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar,
    Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration
    set to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both the `is_decoder`
    argument and `add_cross_attention` set to `True`; `encoder_hidden_states` is then expected as an input to the
    forward pass.
    """

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.output_predict = config.output_predict
        # Word embeddings (parallel).
        self.word_embeddings = VocabEmbedding(config)

        # Transformer
        self.transformer = GLMStack(config.num_layers,
                                    config.hidden_size,
                                    config.num_attention_heads,
                                    config.max_sequence_length,
                                    config.embedding_dropout_prob,
                                    config.attention_dropout_prob,
                                    config.output_dropout_prob,
                                    config.checkpoint_activations,
                                    config.checkpoint_num_layers,
                                    attention_scale=config.attention_scale,
                                    block_position_encoding=config.block_position_encoding)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(GLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        processor_class=_TOKENIZER_FOR_DOC,
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPastAndCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
            self,
            input_ids=None,
            position_ids=None,
            attention_mask=None,
            mems=None,
            **kwargs
    ):
        batch_size = input_ids.size(0)
        words_embeddings = self.word_embeddings(input_ids)
        embeddings = words_embeddings

        device = input_ids.device
        input_shape = input_ids.size()

        if position_ids is None:
            # Default to the two-channel position encoding: absolute positions plus all-zero block positions.
            position_ids = torch.arange(0, input_shape[-1], dtype=torch.long, device=device)
            block_position_ids = torch.zeros(input_shape[-1], dtype=torch.long, device=device)
            position_ids = torch.stack((position_ids, block_position_ids), dim=0).unsqueeze(0)
        if attention_mask is None:
            attention_mask = torch.zeros(batch_size)
        # Transformer.
        transformer_output = self.transformer(embeddings, position_ids, attention_mask, mems)
        last_hidden_states, mems = transformer_output
        logits = None
        if self.output_predict:
            # Output projection is tied to the input word embeddings.
            logits = F.linear(last_hidden_states, self.word_embeddings.weight)

        return ModelOutput(
            last_hidden_states=last_hidden_states,
            logits=logits,
            mems=mems,
        )


@add_start_docstrings(
    """GLM Model transformer for multiple choice classification""",
    GLM_START_DOCSTRING
)
class GLMForMultipleChoice(GLMPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.glm = GLMModel(config)
        self.post_init()

    def forward(
            self,
            input_ids=None,
            position_ids=None,
            attention_mask=None,
            choice_ids=None,
            choice_indices=None,
            labels=None,
            mems=None,
            **kwargs
    ):
        model_output = self.glm(input_ids, position_ids, attention_mask, mems=mems, **kwargs)
        lm_logits = model_output.logits
        # Score each choice by summing the log-probabilities of its tokens at the corresponding positions.
        log_probs = []
        for output, choices, choice_index in zip(F.log_softmax(lm_logits, dim=-1), choice_ids, choice_indices):
            log_probs_single = []
            for choice, choice_target_id in zip(choices, choice_index):
                tmp = output[choice_target_id, choice]
                log_probs_single.append(tmp.sum())
            log_probs.append(torch.stack(log_probs_single))
        log_probs = torch.stack(log_probs)
        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(log_probs, labels)
        return ModelOutput(
            loss=loss,
            logits=log_probs,
            lm_logits=lm_logits,
            mems=model_output.mems
        )

@add_start_docstrings(
    """GLM Model transformer with a `language modeling` head on top""",
    GLM_START_DOCSTRING,
)
class GLMForConditionalGeneration(GLMPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.glm = GLMModel(config)
        self.post_init()

    def _reorder_cache(self, past, beam_idx):
        # if decoder past is not included in output
        # speedy decoding is disabled and no need to reorder
        if past is None:
            return past
        reordered_decoder_past = ()
        for layer_past_states in past:
            # get the correct batch idx from layer past batch dim
            reordered_decoder_past = reordered_decoder_past + (
                layer_past_states.index_select(0, beam_idx.to(layer_past_states.device)),)
        return reordered_decoder_past

    def prepare_inputs_for_generation(self, input_ids, past=None, position_ids=None, generation_attention_mask=None,
                                      **kwargs):
        # only last token for input_ids if past is defined in kwargs
        attention_mask = generation_attention_mask
        seq_length = input_ids.shape[1]
        if past:
            if position_ids is not None:
                position_ids = position_ids[:, :, seq_length - 1].unsqueeze(-1)
            if attention_mask is not None:
                attention_mask = attention_mask[:, :, seq_length - 1, :seq_length].unsqueeze(-2)
            input_ids = input_ids[:, -1].unsqueeze(-1)
        else:
            if position_ids is not None:
                position_ids = position_ids[:, :, :seq_length]
            if attention_mask is not None:
                attention_mask = attention_mask[:, :, :seq_length, :seq_length]
        # Expand position ids and attention mask across beams when beam search enlarges the batch.
        if position_ids is not None and input_ids.size(0) > position_ids.size(0):
            batch_size = position_ids.size(0)
            num_beams = input_ids.size(0) // batch_size
            position_ids = position_ids.unsqueeze(1).expand(-1, num_beams, -1, -1)
            position_ids = position_ids.reshape(batch_size * num_beams, *position_ids.shape[-2:])
        if attention_mask is not None and input_ids.size(0) > attention_mask.size(0):
            batch_size = attention_mask.size(0)
            num_beams = input_ids.size(0) // batch_size
            attention_mask = attention_mask.unsqueeze(1).expand(-1, num_beams, -1, -1, -1)
            attention_mask = attention_mask.reshape(batch_size * num_beams, *attention_mask.shape[-3:])
        return {
            "input_ids": input_ids,
            "position_ids": position_ids,
            "attention_mask": attention_mask,
            "mems": past,
        }

    def forward(
            self,
            input_ids=None,
            position_ids=None,
            attention_mask=None,
            labels=None,
            mems=None,
            **kwargs
    ):
        model_output = self.glm(input_ids, position_ids, attention_mask, mems=mems, **kwargs)
        lm_logits = model_output.logits
        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss(ignore_index=-100)
            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))
        return ModelOutput(
            loss=loss,
            logits=lm_logits,
            mems=model_output.mems
        )


@add_start_docstrings(
    """GLM Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output), e.g. for GLUE tasks.""",
    GLM_START_DOCSTRING,
)
class GLMForSequenceClassification(GLMPreTrainedModel):
    def __init__(self, config: GLMConfig, hidden_dropout=None, num_class=1):
        super().__init__(config)
        self.pool_token = config.pool_token
        self.glm = GLMModel(config)
        self.glm.output_predict = False
        self.num_class = num_class
        # Classification head.
        self.dense = torch.nn.Linear(config.hidden_size, config.hidden_size)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.output_dropout_prob
        )
        self.dropout = torch.nn.Dropout(classifier_dropout)
        self.out_proj = torch.nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(GLM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        processor_class=_TOKENIZER_FOR_DOC,
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(self,
                input_ids=None,
                position_ids=None,
                attention_mask=None,
                labels=None):

        num_choices = None

        if len(input_ids.shape) == 3:
            # Flatten (batch_size, num_choices, seq_length) inputs into a single batch dimension.
            batch_size, num_choices = input_ids.shape[:2]
            input_ids = input_ids.reshape(-1, input_ids.size(-1))
            attention_mask = attention_mask.reshape(-1, *attention_mask.size()[2:])
            position_ids = position_ids.reshape(-1, *position_ids.size()[2:])
        model_out = self.glm(input_ids, position_ids, attention_mask)
        outputs, mems = model_out.last_hidden_states, model_out.mems

        # Pool the hidden state of the first token and apply the classification head.
        output = outputs[:, 0, :]
        output = self.dropout(output)
        output = torch.tanh(self.dense(output))
        output = self.dropout(output)
        logits = self.out_proj(output)
        if num_choices is not None:
            logits = logits.view(-1, num_choices)
        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits, labels)
            # loss = F.cross_entropy(logits.contiguous().float(), labels.long())
        return SequenceClassifierOutput(loss=loss,
                                        logits=logits,
                                        hidden_states=outputs)
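A minimal usage sketch for the conditional-generation head above (an illustration, not part of the committed files). It assumes the modeling and tokenization modules from this commit are importable locally and that the THUDM/glm-10b checkpoint named in tokenizer_config.json is available; the tokenizer's build_inputs_for_generation (defined in tokenization_glm.py below) supplies the two-channel position ids and 4-D generation attention mask that prepare_inputs_for_generation expects.

    # Hedged sketch: blank filling with the classes defined in this commit.
    import torch
    from tokenization_glm import GLMTokenizer
    from modeling_glm import GLMForConditionalGeneration

    tokenizer = GLMTokenizer.from_pretrained("THUDM/glm-10b")  # resolves to GLMGPT2Tokenizer
    model = GLMForConditionalGeneration.from_pretrained("THUDM/glm-10b")
    model.eval()

    # The [MASK] span is what the model fills in after <|startofpiece|>.
    inputs = tokenizer("Tsinghua University is located in [MASK].", return_tensors="pt")
    inputs = tokenizer.build_inputs_for_generation(inputs, max_gen_length=32)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=256, eos_token_id=tokenizer.eop_token_id)
    print(tokenizer.decode(outputs[0].tolist()))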
tokenization_glm.py
ADDED
@@ -0,0 +1,362 @@
import os
from typing import Optional, Tuple, List, Union
from shutil import copyfile
import torch

from transformers import PreTrainedTokenizer, RobertaTokenizer, GPT2Tokenizer, BertTokenizer
from transformers.utils import logging
from transformers.tokenization_utils_base import BatchEncoding
from transformers.models.auto.tokenization_auto import get_tokenizer_config
from transformers.utils.generic import _is_torch_device
import sentencepiece as spm

logger = logging.get_logger(__name__)


class GLMBatchEncoding(BatchEncoding):
    def to(self, device: Union[str, "torch.device"]) -> "BatchEncoding":
        """
        Send all values to device by calling `v.to(device)` (PyTorch only).

        Args:
            device (`str` or `torch.device`): The device to put the tensors on.

        Returns:
            [`BatchEncoding`]: The same instance after modification.
        """

        # This check catches things like APEX blindly calling "to" on all inputs to a module
        # Otherwise it passes the casts down and casts the LongTensor containing the token idxs
        # into a HalfTensor
        if isinstance(device, str) or _is_torch_device(device) or isinstance(device, int):
            self.data = {k: v.to(device=device) if torch.is_tensor(v) else v for k, v in self.data.items()}
        else:
            logger.warning(f"Attempting to cast a BatchEncoding to type {str(device)}. This is not supported.")
        return self

class GLMTokenizerMixin:
    @property
    def sop_token(self) -> Optional[str]:
        return "<|startofpiece|>"

    @property
    def sop_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: Id of the start token in the vocabulary, used when training a model with autoregressive blank filling.
        """
        return self.convert_tokens_to_ids(self.sop_token)

    @property
    def eop_token(self) -> Optional[str]:
        return "<|endofpiece|>"

    @property
    def eop_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: Id of the end token in the vocabulary, used when training a model with autoregressive blank filling.
        """
        return self.convert_tokens_to_ids(self.eop_token)

    @property
    def gmask_token_id(self) -> int:
        return self.convert_tokens_to_ids("[gMASK]")

    @property
    def smask_token_id(self) -> int:
        return self.convert_tokens_to_ids("[sMASK]")

    @property
    def mask_token_ids(self):
        return [self.mask_token_id, self.smask_token_id, self.gmask_token_id]

    def _build_input_for_multiple_choice(self, context, choices):
        context_id = context["input_ids"]
        if torch.is_tensor(context_id):
            context_id = context_id.tolist()

        division = len(context_id)
        mask_position = context_id.index(self.mask_token_id)

        token = torch.tensor(context_id, dtype=torch.long)
        attention_mask = [context["attention_mask"].expand(division, -1)]
        position_id = torch.arange(division, dtype=torch.long)
        block_position_id = torch.zeros(division, dtype=torch.long)

        choice_ids, choice_indices = [], []

        for choice_str in choices:
            # Each choice is appended after a <|startofpiece|> token; choices attend to the full
            # context and causally to themselves.
            choice = torch.tensor(self(choice_str, add_special_tokens=False, padding=False)['input_ids'],
                                  dtype=torch.long)
            choice_ids.append(choice)
            choice_indices.append(torch.arange(len(token), len(token) + len(choice), dtype=torch.long))
            attention_mask.append(torch.tril(torch.ones((len(choice), len(choice)), dtype=torch.long)))

            token = torch.cat((token, torch.tensor([self.sop_token_id], dtype=torch.long), choice[:-1]))
            position_id = torch.cat((position_id, torch.tensor([mask_position] * len(choice), dtype=torch.long)))
            block_position_id = torch.cat((block_position_id, torch.arange(1, 1 + len(choice), dtype=torch.long)))

        attention_mask = torch.block_diag(*attention_mask)
        attention_mask[division:, :division] = context["attention_mask"].unsqueeze(0)

        return {
            "input_ids": token,
            "position_ids": torch.stack((position_id, block_position_id)),
            "attention_mask": attention_mask,
            "choice_ids": choice_ids,
            "choice_indices": choice_indices
        }

    def _pad_batch(self, tokens, position_ids, attention_mask, max_seq_length):
        pad_length = max_seq_length - len(tokens)
        attention_mask = torch.nn.functional.pad(
            attention_mask,
            (0, pad_length, 0, pad_length),
            mode="constant",
            value=0,
        )
        tokens = torch.cat((tokens, torch.zeros(pad_length, dtype=torch.long)))
        position_ids = torch.cat((position_ids, position_ids[..., -1:].expand(-1, pad_length)), dim=-1)
        return tokens, position_ids, attention_mask

    def _collate(self, samples):
        TILE = 1
        length_to_pad = (max(map(lambda spl: len(spl["input_ids"]), samples)) + TILE - 1) // TILE * TILE

        token_batch, position_id_batch, attention_mask_batch = [], [], []
        choices_batch, choice_target_ids_batch = [], []

        for sample in samples:
            token, position_id, attention_mask = self._pad_batch(
                sample["input_ids"], sample["position_ids"], sample["attention_mask"], length_to_pad
            )
            token_batch.append(token)
            position_id_batch.append(position_id)
            attention_mask_batch.append(attention_mask)
            choices_batch.append(sample["choice_ids"])
            choice_target_ids_batch.append(sample["choice_indices"])
        return {
            "input_ids": torch.stack(token_batch),
            "position_ids": torch.stack(position_id_batch),
            "attention_mask": torch.stack(attention_mask_batch).unsqueeze(1),
            "choice_ids": choices_batch,
            "choice_indices": choice_target_ids_batch,
        }

    def build_inputs_for_multiple_choice(self, model_input: BatchEncoding, choices, max_length=None):
        samples = [{key: value[i] for key, value in model_input.items()} for i in range(len(model_input["input_ids"]))]
        samples = [self._build_input_for_multiple_choice(sample, choice) for sample, choice in
                   zip(samples, choices)]
        inputs = self._collate(samples)
        return GLMBatchEncoding(inputs)

    def build_inputs_for_generation(self, model_input: BatchEncoding, max_gen_length=512, targets=None, padding=False):
        mask_ids = self.mask_token_ids
        input_ids = model_input.input_ids
        batch_size, seq_length = input_ids.shape[:2]
        position_id, block_position_id = list(range(seq_length)), [0 for _ in range(seq_length)]
        position_ids, block_position_ids = [], []
        labels = None
        if targets is not None:
            is_batched = isinstance(targets, (list, tuple))
            targets = self(targets, add_special_tokens=False, padding=False).input_ids
            if not is_batched:
                targets = [targets]
            assert len(targets) == len(input_ids)
            targets = [(target + [self.eop_token_id])[:max_gen_length] for target in targets]
            if not padding:
                max_gen_length = max(map(len, targets))
            targets = [[self.sop_token_id] + target for target in targets]
            labels = [target[1:] for target in targets]
            targets = [target + [self.pad_token_id] * (max_gen_length + 1 - len(target)) for target in targets]
            labels = [label + [-100] * (max_gen_length - len(label)) for label in labels]
            targets = torch.tensor(targets, dtype=input_ids.dtype, device=input_ids.device)
            labels = torch.tensor(labels, dtype=input_ids.dtype, device=input_ids.device)
            labels = torch.cat((input_ids.new_full((batch_size, seq_length), -100), labels), dim=1)
        for i in range(batch_size):
            mask_positions = []
            for mask_id in mask_ids:
                mask_positions += (input_ids[i] == mask_id).nonzero(as_tuple=True)[0].tolist()
            if not mask_positions:
                raise ValueError("Cannot find mask token in the input")
            mask_positions.sort()
            mask_pos = mask_positions[0]
            # Generated tokens reuse the position of the first mask token and count block positions from 1.
            position_ids.append(position_id + [mask_pos] * max_gen_length)
            block_position_ids.append(block_position_id + list(range(1, max_gen_length + 1)))
        position_ids = torch.tensor(position_ids, dtype=input_ids.dtype, device=input_ids.device)
        block_position_ids = torch.tensor(block_position_ids, dtype=input_ids.dtype, device=input_ids.device)
        position_ids = torch.stack((position_ids, block_position_ids), dim=1)
        attention_mask = model_input.attention_mask
        attention_mask = attention_mask.unsqueeze(1).expand(-1, seq_length + max_gen_length, -1)
        generation_attention_mask = torch.cat([attention_mask.new_zeros((seq_length, max_gen_length)),
                                               torch.tril(attention_mask.new_ones((max_gen_length, max_gen_length)))],
                                              dim=0).unsqueeze(0).expand(batch_size, -1, -1)
        attention_mask = torch.cat((attention_mask, generation_attention_mask), dim=2)
        attention_mask = attention_mask.unsqueeze(1)
        if targets is None:
            input_ids = torch.cat((input_ids, input_ids.new_full((batch_size, 1), self.sop_token_id)), dim=-1)
        else:
            input_ids = torch.cat((input_ids, targets[:, :-1]), dim=1)
        batch = {"input_ids": input_ids, "position_ids": position_ids}
        if labels is None:
            batch["generation_attention_mask"] = attention_mask
        else:
            batch["attention_mask"] = attention_mask
            batch["labels"] = labels
        return BatchEncoding(batch)

class GLMRobertaTokenizer(RobertaTokenizer, GLMTokenizerMixin):
    model_input_names = ["input_ids", "position_ids", "attention_mask"]
    truncation_side: str = "left"

    @property
    def gmask_token_id(self) -> int:
        raise NotImplementedError("The model doesn't support gMASK")

    @property
    def smask_token_id(self) -> int:
        raise NotImplementedError("The model doesn't support sMASK")

    @property
    def mask_token_ids(self):
        return [self.mask_token_id]


class GLMChineseTokenizer(PreTrainedTokenizer, GLMTokenizerMixin):
    vocab_files_names = {"vocab_file": "cog-pretrain.model"}
    truncation_side: str = "left"

    def __init__(self, vocab_file, **kwargs):
        super().__init__(**kwargs)
        self.vocab_file = vocab_file
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(vocab_file)

    @property
    def vocab_size(self):
        return len(self.sp_model)

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text, **kwargs):
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.sp_model.PieceToId(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.sp_model.IdToPiece(index)

    def convert_tokens_to_string(self, tokens):
        return self.sp_model.decode(tokens)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)

    def build_inputs_with_special_tokens(
            self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A BERT sequence has the following format:

        - single sequence: ``[CLS] X [SEP]``
        - pair of sequences: ``[CLS] A [SEP] B [SEP]``

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
        """
        assert token_ids_1 is None
        cls = [self.cls_token_id]
        eos = [self.eos_token_id]
        return cls + token_ids_0 + eos


class GLMGPT2Tokenizer(GPT2Tokenizer, GLMTokenizerMixin):
    model_input_names = ["input_ids", "position_ids", "attention_mask"]
    truncation_side: str = "left"

    def build_inputs_with_special_tokens(
            self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A BERT sequence has the following format:

        - single sequence: ``[CLS] X [SEP]``
        - pair of sequences: ``[CLS] A [SEP] B [SEP]``

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
        """
        assert token_ids_1 is None
        cls = [self.cls_token_id]
        eos = [self.eos_token_id]
        return cls + token_ids_0 + eos


class GLMBertTokenizer(BertTokenizer, GLMTokenizerMixin):
    model_input_names = ["input_ids", "position_ids", "attention_mask"]
    truncation_side: str = "left"

    @property
    def gmask_token_id(self) -> int:
        raise NotImplementedError("The model doesn't support gMASK")

    @property
    def smask_token_id(self) -> int:
        raise NotImplementedError("The model doesn't support sMASK")

    @property
    def mask_token_ids(self):
        return [self.mask_token_id]


class GLMTokenizer:
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
        tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)
        config_tokenizer_class = tokenizer_config.get("tokenizer_class")
        if config_tokenizer_class == "GLMRobertaTokenizer":
            tokenizer_class = GLMRobertaTokenizer
        elif config_tokenizer_class == "GLMChineseTokenizer":
            tokenizer_class = GLMChineseTokenizer
        elif config_tokenizer_class == "GLMGPT2Tokenizer":
            tokenizer_class = GLMGPT2Tokenizer
        elif config_tokenizer_class == "GLMBertTokenizer":
            tokenizer_class = GLMBertTokenizer
        else:
            raise NotImplementedError("Not implemented tokenizer type:", config_tokenizer_class)
        return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
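build_inputs_for_multiple_choice above packs each candidate answer behind a <|startofpiece|> token and returns the choice_ids and choice_indices that GLMForMultipleChoice.forward scores. A hedged sketch of that round trip (not part of the committed files), again assuming local imports of the modules from this commit and the THUDM/glm-10b checkpoint:

    from tokenization_glm import GLMTokenizer
    from modeling_glm import GLMForMultipleChoice

    tokenizer = GLMTokenizer.from_pretrained("THUDM/glm-10b")
    model = GLMForMultipleChoice.from_pretrained("THUDM/glm-10b")

    contexts = ["One minus one equals zero, is it correct? Answer: [MASK]"]
    choices = [["Yes", "No"]]

    encoded = tokenizer(contexts, return_tensors="pt", padding=True)
    batch = tokenizer.build_inputs_for_multiple_choice(encoded, choices)
    outputs = model(**batch)
    # outputs.logits holds one summed log-probability per choice; argmax picks the answer.
    print(outputs.logits.argmax(dim=-1))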
tokenizer_config.json
ADDED
@@ -0,0 +1,18 @@
{
  "name_or_path": "THUDM/glm-10b",
  "eos_token": "<|endoftext|>",
  "pad_token": "<|endoftext|>",
  "cls_token": "[CLS]",
  "mask_token": "[MASK]",
  "unk_token": "[UNK]",
  "additional_special_tokens": ["<|startofpiece|>", "<|endofpiece|>", "[gMASK]", "[sMASK]"],
  "add_prefix_space": false,
  "tokenizer_class": "GLMGPT2Tokenizer",
  "use_fast": false,
  "auto_map": {
    "AutoTokenizer": [
      "tokenization_glm.GLMGPT2Tokenizer",
      null
    ]
  }
}
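The auto_map entry above registers only a slow tokenizer class (hence the null fast entry and "use_fast": false), so AutoTokenizer should resolve to tokenization_glm.GLMGPT2Tokenizer when the user opts into custom code. A short sketch, assuming a standard transformers install:

    from transformers import AutoTokenizer

    # trust_remote_code lets transformers import tokenization_glm.py from this repository.
    tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-10b", trust_remote_code=True)
    print(type(tokenizer).__name__)  # expected: GLMGPT2Tokenizer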
vocab.json
ADDED
The diff for this file is too large to render.
See raw diff