svanlin-tencent committed on
Commit
77e2af4
1 Parent(s): cc1965e
Hunyuan-A50B-Pretrain/configuration_hunyuan.py CHANGED
@@ -1,16 +1,10 @@
1
- # coding=utf-8
2
- # Copyright 2024 Tencent Inc. All Rights Reserved.
3
  #
4
- # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
- # and OPT implementations in this library. It has been modified from its
6
- # original forms to accommodate minor architectural differences compared
7
- # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
- #
9
- # Licensed under the Apache License, Version 2.0 (the "License");
10
  # you may not use this file except in compliance with the License.
11
  # You may obtain a copy of the License at
12
  #
13
- # http://www.apache.org/licenses/LICENSE-2.0
14
  #
15
  # Unless required by applicable law or agreed to in writing, software
16
  # distributed under the License is distributed on an "AS IS" BASIS,
 
1
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
 
2
  #
3
+ # Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT (the "License");
 
 
 
 
 
4
  # you may not use this file except in compliance with the License.
5
  # You may obtain a copy of the License at
6
  #
7
+ # https://github.com/Tencent/Tencent-Hunyuan-Large/blob/main/License.docx
8
  #
9
  # Unless required by applicable law or agreed to in writing, software
10
  # distributed under the License is distributed on an "AS IS" BASIS,
Hunyuan-A50B-Pretrain/modeling_hunyuan.py CHANGED
@@ -1,5 +1,16 @@
1
- # coding=utf-8
2
- # Copyright 2024 Tencent Inc. All Rights Reserved.
 
 
 
 
 
 
 
 
 
 
 
3
  #
4
  """ PyTorch HunYuan model."""
5
 
 
1
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
2
+ #
3
+ # Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://github.com/Tencent/Tencent-Hunyuan-Large/blob/main/License.docx
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
  #
15
  """ PyTorch HunYuan model."""
16
 
Hunyuan-A50B-Pretrain/test.py CHANGED
@@ -1,3 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from tokenizers import ByteLevelBPETokenizer
2
  from transformers import AutoTokenizer
3
 
@@ -32,4 +46,4 @@ print('messages:', messages)
32
  ids = auto_tokenizer.apply_chat_template(messages)
33
  print(f"input_ids:\t{ids}")
34
  text = auto_tokenizer.decode(ids)
35
- print(f"input_text:\t[{text}]")
 
1
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
2
+ #
3
+ # Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://github.com/Tencent/Tencent-Hunyuan-Large/blob/main/License.docx
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
  from tokenizers import ByteLevelBPETokenizer
16
  from transformers import AutoTokenizer
17
 
 
46
  ids = auto_tokenizer.apply_chat_template(messages)
47
  print(f"input_ids:\t{ids}")
48
  text = auto_tokenizer.decode(ids)
49
+ print(f"input_text:\t[{text}]")
Hunyuan-A50B-Pretrain/test4consistent.py CHANGED
@@ -1,9 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
 
2
  # test tokenizer encode & decode consistency
3
  from transformers import AutoTokenizer
4
- tokenizer = AutoTokenizer.from_pretrained('/apdcephfs/share_1502809/shaneshu/tokenizer_exp/other_tokenizer_vocab/hy', local_files_only=True, trust_remote_code=True)
5
 
6
- test_data = [line.strip() for line in open('/apdcephfs/share_1502809/shaneshu/tokenizer_exp/data/test.txt', 'r').readlines()]
7
 
8
  num_origi_len = 0
9
  num_token_len = 0
 
1
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
2
+ #
3
+ # Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://github.com/Tencent/Tencent-Hunyuan-Large/blob/main/License.docx
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
 
15
  # test tokenizer encode & decode consistency
16
  from transformers import AutoTokenizer
17
+ tokenizer = AutoTokenizer.from_pretrained('/tokenizer_exp/other_tokenizer_vocab/hy', local_files_only=True, trust_remote_code=True)
18
 
19
+ test_data = [line.strip() for line in open('/tokenizer_exp/data/test.txt', 'r').readlines()]
20
 
21
  num_origi_len = 0
22
  num_token_len = 0
Hunyuan-A50B-Pretrain/tokenization_hy.py CHANGED
@@ -1,17 +1,17 @@
1
- # coding=utf-8
2
- # Copyright 2024 The Tencent Inc. HunYuan Team.
3
  #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
  # you may not use this file except in compliance with the License.
6
  # You may obtain a copy of the License at
7
  #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
  #
10
  # Unless required by applicable law or agreed to in writing, software
11
  # distributed under the License is distributed on an "AS IS" BASIS,
12
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License.
 
15
  import os
16
  import base64
17
  import logging
 
1
+ # Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
 
2
  #
3
+ # Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT (the "License");
4
  # you may not use this file except in compliance with the License.
5
  # You may obtain a copy of the License at
6
  #
7
+ # https://github.com/Tencent/Tencent-Hunyuan-Large/blob/main/License.docx
8
  #
9
  # Unless required by applicable law or agreed to in writing, software
10
  # distributed under the License is distributed on an "AS IS" BASIS,
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
  # See the License for the specific language governing permissions and
13
  # limitations under the License.
14
+
15
  import os
16
  import base64
17
  import logging