svanlin-tencent
committed on
Commit
•
77e2af4
1
Parent(s):
cc1965e
fix
Browse files
Hunyuan-A50B-Pretrain/configuration_hunyuan.py
CHANGED
@@ -1,16 +1,10 @@
|
|
1 |
-
#
|
2 |
-
# Copyright 2024 Tencent Inc. All Rights Reserved.
|
3 |
#
|
4 |
-
#
|
5 |
-
# and OPT implementations in this library. It has been modified from its
|
6 |
-
# original forms to accommodate minor architectural differences compared
|
7 |
-
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
8 |
-
#
|
9 |
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
10 |
# you may not use this file except in compliance with the License.
|
11 |
# You may obtain a copy of the License at
|
12 |
#
|
13 |
-
#
|
14 |
#
|
15 |
# Unless required by applicable law or agreed to in writing, software
|
16 |
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
1 |
+
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
|
|
|
2 |
#
|
3 |
+
# Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT (the "License");
|
|
|
|
|
|
|
|
|
|
|
4 |
# you may not use this file except in compliance with the License.
|
5 |
# You may obtain a copy of the License at
|
6 |
#
|
7 |
+
# https://github.com/Tencent/Tencent-Hunyuan-Large/blob/main/License.docx
|
8 |
#
|
9 |
# Unless required by applicable law or agreed to in writing, software
|
10 |
# distributed under the License is distributed on an "AS IS" BASIS,
|
Hunyuan-A50B-Pretrain/modeling_hunyuan.py
CHANGED
@@ -1,5 +1,16 @@
|
|
1 |
-
#
|
2 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
#
|
4 |
""" PyTorch HunYuan model."""
|
5 |
|
|
|
1 |
+
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# https://github.com/Tencent/Tencent-Hunyuan-Large/blob/main/License.docx
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
#
|
15 |
""" PyTorch HunYuan model."""
|
16 |
|
Hunyuan-A50B-Pretrain/test.py
CHANGED
@@ -1,3 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
from tokenizers import ByteLevelBPETokenizer
|
2 |
from transformers import AutoTokenizer
|
3 |
|
@@ -32,4 +46,4 @@ print('messages:', messages)
|
|
32 |
ids = auto_tokenizer.apply_chat_template(messages)
|
33 |
print(f"input_ids:\t{ids}")
|
34 |
text = auto_tokenizer.decode(ids)
|
35 |
-
print(f"input_text:\t[{text}]")
|
|
|
1 |
+
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# https://github.com/Tencent/Tencent-Hunyuan-Large/blob/main/License.docx
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
from tokenizers import ByteLevelBPETokenizer
|
16 |
from transformers import AutoTokenizer
|
17 |
|
|
|
46 |
ids = auto_tokenizer.apply_chat_template(messages)
|
47 |
print(f"input_ids:\t{ids}")
|
48 |
text = auto_tokenizer.decode(ids)
|
49 |
+
print(f"input_text:\t[{text}]")
|
Hunyuan-A50B-Pretrain/test4consistent.py
CHANGED
@@ -1,9 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
|
2 |
# test tokenizer encode & decode consistency
|
3 |
from transformers import AutoTokenizer
|
4 |
-
tokenizer = AutoTokenizer.from_pretrained('/
|
5 |
|
6 |
-
test_data = [line.strip() for line in open('/
|
7 |
|
8 |
num_origi_len = 0
|
9 |
num_token_len = 0
|
|
|
1 |
+
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# https://github.com/Tencent/Tencent-Hunyuan-Large/blob/main/License.docx
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
|
15 |
# test tokenizer encode & decode consistency
|
16 |
from transformers import AutoTokenizer
|
17 |
+
tokenizer = AutoTokenizer.from_pretrained('/tokenizer_exp/other_tokenizer_vocab/hy', local_files_only=True, trust_remote_code=True)
|
18 |
|
19 |
+
test_data = [line.strip() for line in open('/tokenizer_exp/data/test.txt', 'r').readlines()]
|
20 |
|
21 |
num_origi_len = 0
|
22 |
num_token_len = 0
|
Hunyuan-A50B-Pretrain/tokenization_hy.py
CHANGED
@@ -1,17 +1,17 @@
|
|
1 |
-
#
|
2 |
-
# Copyright 2024 The Tencent Inc. HunYuan Team.
|
3 |
#
|
4 |
-
# Licensed under the
|
5 |
# you may not use this file except in compliance with the License.
|
6 |
# You may obtain a copy of the License at
|
7 |
#
|
8 |
-
#
|
9 |
#
|
10 |
# Unless required by applicable law or agreed to in writing, software
|
11 |
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
# See the License for the specific language governing permissions and
|
14 |
# limitations under the License.
|
|
|
15 |
import os
|
16 |
import base64
|
17 |
import logging
|
|
|
1 |
+
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
|
|
|
2 |
#
|
3 |
+
# Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT (the "License");
|
4 |
# you may not use this file except in compliance with the License.
|
5 |
# You may obtain a copy of the License at
|
6 |
#
|
7 |
+
# https://github.com/Tencent/Tencent-Hunyuan-Large/blob/main/License.docx
|
8 |
#
|
9 |
# Unless required by applicable law or agreed to in writing, software
|
10 |
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
# See the License for the specific language governing permissions and
|
13 |
# limitations under the License.
|
14 |
+
|
15 |
import os
|
16 |
import base64
|
17 |
import logging
|