# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script generates a NeMo-Megatron compatible `.nemo` file for a Huggingface T5-v1_1 model.
List of Huggingface models that this script can convert:
1. google/t5-v1_1-small
2. google/t5-v1_1-base
3. google/t5-v1_1-large
4. google/t5-v1_1-xl
5. google/t5-v1_1-xxl
6. google/mt5-small
7. google/mt5-base
8. google/mt5-large
9. google/mt5-xl
10. google/mt5-xxl
11. google/ul2
12. bigscience/T0pp
13. google/t5-small-lm-adapt
14. google/t5-base-lm-adapt
15. google/t5-large-lm-adapt
16. google/t5-xl-lm-adapt
17. google/t5-xxl-lm-adapt
18. google/flan-t5-small
19. google/flan-t5-base
20. google/flan-t5-large
21. google/flan-t5-xl
22. google/flan-t5-xxl
Usage:
python hf_t5-v1_1_to_nemo.py \
    --hf_model_name bigscience/T0pp \
    --nemo_state_dict_path /path/to/nemo_state_dict.pt \
    --nemo_file_path /path/to/nemo_file.nemo
"""
import collections
import os
import tempfile
from argparse import ArgumentParser
import torch
from omegaconf.omegaconf import OmegaConf, open_dict
from pytorch_lightning import Trainer
from transformers import AutoTokenizer, T5ForConditionalGeneration
from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model
from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector
try:
import accelerate
except ImportError:
raise ImportError("Please install accelerate package via `pip install accelerate` to use this script.")
def convert_weights(hf_model, nemo_state_dict_path):
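    """Convert a Huggingface T5-style checkpoint into a NeMo-Megatron state dict.

    Loads `hf_model` with T5ForConditionalGeneration, remaps every weight to its
    NeMo-Megatron name, saves the result to `nemo_state_dict_path`, and returns
    the Huggingface model config for use when building the NeMo config.
    """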
if hf_model == "google/ul2":
torch_dtype = torch.bfloat16
else:
torch_dtype = torch.float32
hf_model = T5ForConditionalGeneration.from_pretrained(hf_model, low_cpu_mem_usage=True, torch_dtype=torch_dtype)
hf_model_config = hf_model.config
with tempfile.TemporaryDirectory() as tmp:
torch.save(hf_model.state_dict(), os.path.join(tmp, "model.pt"))
hf_weights = torch.load(os.path.join(tmp, "model.pt"))
nemo_weights = collections.OrderedDict()
print(f"Found {len(hf_weights.keys())} keys in the checkpoint")
def _get_model_type_block_layer(k):
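        # HF checkpoint keys look like `encoder.block.3.layer.0.SelfAttention.q.weight`:
        # split index 2 is the HF block number (a NeMo layer) and index 4 is the
        # HF sub-layer index within the block.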
if k.startswith("encoder"):
model_type = "encoder"
elif k.startswith("decoder"):
model_type = "decoder"
else:
raise ValueError(f"Unknown model type for {k}")
return model_type, int(k.split(".")[2]), int(k.split(".")[4])
for k, v in hf_weights.items():
#################################################
###### Enc-Dec Embeddings and Output Layer ######
#################################################
        # `shared.weight` duplicates the encoder/decoder `embed_tokens` weights,
        # which are mapped from their own keys below, so it can be skipped here.
        if k == "shared.weight":
            pass
        elif k == "lm_head.weight":
            nemo_weights["enc_dec_model.tokens_head.weight"] = v
            print(f"Mapped {k} to enc_dec_model.tokens_head.weight")
        # Decoder embeddings
        elif k == "decoder.embed_tokens.weight":
            nemo_weights["enc_dec_model.decoder_embedding.word_embeddings.weight"] = v
            print(f"Mapped {k} to enc_dec_model.decoder_embedding.word_embeddings.weight")
        # Encoder embeddings
        elif k == "encoder.embed_tokens.weight":
            nemo_weights["enc_dec_model.encoder_embedding.word_embeddings.weight"] = v
            print(f"Mapped {k} to enc_dec_model.encoder_embedding.word_embeddings.weight")
#################################################
################# RPE Weights ###################
#################################################
elif k == "encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight":
nemo_weights["enc_dec_model.encoder_relative_position_embedding.relative_position_embedding.weight"] = v
print(
f"Mapped {k} to enc_dec_model.encoder_relative_position_embedding.relative_position_embedding.weight"
)
elif k == "decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight":
nemo_weights["enc_dec_model.decoder_relative_position_embedding.relative_position_embedding.weight"] = v
print(
f"Mapped {k} to enc_dec_model.decoder_relative_position_embedding.relative_position_embedding.weight"
)
        # A "block" in HF corresponds to a "layer" in NeMo.
        # A "layer" in HF is a sub-layer index within the block and has no direct NeMo
        # counterpart: 0 is self-attention, 1 is cross-attention (decoder) or FFN (encoder).
#################################################
############### Attention Layers ################
#################################################
# Self-Attention
        # Q, K, V in NeMo-Megatron are bundled into a single matrix.
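        # Each of Q, K, V has shape [num_heads * d_kv, d_model]; the fused NeMo
        # matrix stacks them along dim 0 into [3 * num_heads * d_kv, d_model].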
elif "SelfAttention.q.weight" in k:
model_type, block_number, layer_number = _get_model_type_block_layer(k)
k_weight = hf_weights[k.replace("q.weight", "k.weight")]
v_weight = hf_weights[k.replace("q.weight", "v.weight")]
concat_weights = torch.cat([v, k_weight, v_weight], dim=0)
nemo_weights[
f"enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.self_attention.query_key_value.weight"
] = concat_weights
print(
f"Mapped {k} to enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.self_attention.query_key_value.weight"
)
# We can skip processing of k, v weights since we already concat them into qkv above.
elif "SelfAttention.k.weight" in k or "SelfAttention.v.weight" in k:
pass
# Output self-attn matrix.
elif "SelfAttention.o.weight" in k:
model_type, block_number, layer_number = _get_model_type_block_layer(k)
nemo_weights[
f"enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.self_attention.dense.weight"
] = v
print(
f"Mapped {k} to enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.self_attention.dense.weight"
)
# Cross-Attention projection matrices are merged into K, V matrices in NeMo-Megatron
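        # K and V each have shape [num_heads * d_kv, d_model]; stacked along dim 0 they
        # form the fused key_value matrix of shape [2 * num_heads * d_kv, d_model].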
elif "EncDecAttention.k.weight" in k:
model_type, block_number, layer_number = _get_model_type_block_layer(k)
v_weight = hf_weights[k.replace("k.weight", "v.weight")]
concat_weights = torch.cat([v, v_weight], dim=0)
nemo_weights[
f"enc_dec_model.enc_dec_model.decoder.model.layers.{block_number}.inter_attention.key_value.weight"
] = concat_weights
print(
f"Mapped {k} to enc_dec_model.enc_dec_model.decoder.model.layers.{block_number}.inter_attention.key_value.weight"
)
# We can skip processing of v weights since we already concat them with k above.
elif "EncDecAttention.v.weight" in k:
pass
# Cross-Attention Q matrix is separate in NeMo-Megatron
elif "EncDecAttention.q.weight" in k:
model_type, block_number, layer_number = _get_model_type_block_layer(k)
nemo_weights[
f"enc_dec_model.enc_dec_model.decoder.model.layers.{block_number}.inter_attention.query.weight"
] = v
print(
f"Mapped {k} to enc_dec_model.enc_dec_model.decoder.model.layers.{block_number}.inter_attention.query.weight"
)
        # Output cross-attention matrix.
elif "EncDecAttention.o.weight" in k:
model_type, block_number, layer_number = _get_model_type_block_layer(k)
nemo_weights[
f"enc_dec_model.enc_dec_model.decoder.model.layers.{block_number}.inter_attention.dense.weight"
] = v
print(
f"Mapped {k} to enc_dec_model.enc_dec_model.decoder.model.layers.{block_number}.inter_attention.dense.weight"
)
#################################################
        ################## FFN Layers ###################
#################################################
elif "DenseReluDense.wi_0.weight" in k:
model_type, block_number, layer_number = _get_model_type_block_layer(k)
nemo_weights[
f"enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.mlp.dense_h_to_4h.weight"
] = v
print(
f"Mapped {k} to enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.mlp.dense_h_to_4h.weight"
)
elif "DenseReluDense.wi_1.weight" in k:
model_type, block_number, layer_number = _get_model_type_block_layer(k)
nemo_weights[
f"enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.mlp.dense_h_to_4h_2.weight"
] = v
print(
f"Mapped {k} to enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.mlp.dense_h_to_4h_2.weight"
)
elif "DenseReluDense.wo.weight" in k:
model_type, block_number, layer_number = _get_model_type_block_layer(k)
nemo_weights[
f"enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.mlp.dense_4h_to_h.weight"
] = v
print(
f"Mapped {k} to enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.mlp.dense_4h_to_h.weight"
)
#################################################
        ################## LayerNorm ###################
#################################################
elif "layer_norm" in k:
if "final" in k:
model_type = "encoder" if k.startswith("encoder") else "decoder"
nemo_weights[f"enc_dec_model.enc_dec_model.{model_type}.model.final_layernorm.weight"] = v
print(f"Mapped {k} to enc_dec_model.enc_dec_model.{model_type}.model.final_layernorm.weight")
else:
model_type, block_number, layer_number = _get_model_type_block_layer(k)
                if layer_number == 0:
                    nemo_weights[
                        f"enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.input_layernorm.weight"
                    ] = v
                    print(
                        f"Mapped {k} to enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.input_layernorm.weight"
                    )
                elif layer_number == 1:
                    nemo_weights[
                        f"enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.post_attention_layernorm.weight"
                    ] = v
                    print(
                        f"Mapped {k} to enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.post_attention_layernorm.weight"
                    )
                elif layer_number == 2 and model_type == "decoder":
                    nemo_weights[
                        f"enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.post_inter_attention_layernorm.weight"
                    ] = v
                    print(
                        f"Mapped {k} to enc_dec_model.enc_dec_model.{model_type}.model.layers.{block_number}.post_inter_attention_layernorm.weight"
                    )
                else:
                    raise ValueError(f"Unknown layer_norm key: {k}")
else:
raise ValueError(f"Unknown key: {k}")
torch.save(nemo_weights, nemo_state_dict_path)
print("Saved weights to {}".format(nemo_state_dict_path))
return hf_model_config
def package_into_nemo_file(
state_dict_path, base_yaml_config, hf_model_config, nemo_file_path, hf_model_name, megatron_amp_O2
):
"""
Packages the state dict, config file and tokenizer into a `.nemo` file.
"""
trainer = Trainer(devices=1, strategy=NLPDDPStrategy(), accelerator="cpu", precision=32)
base_cfg = OmegaConf.load(base_yaml_config)
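    # Map the HF activation name onto NeMo's gated-activation names. All models this
    # script supports use gated FFNs (wi_0/wi_1), hence swiglu/geglu rather than silu/gelu.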
if hf_model_config.dense_act_fn == "silu":
act_fn = "swiglu"
elif hf_model_config.dense_act_fn == "gelu_new":
act_fn = "geglu"
    # FLAN-T5 checkpoints report "gelu" with `is_gated_act=True`, which is gated GELU.
elif hf_model_config.dense_act_fn == "gelu" and hf_model_config.is_gated_act:
act_fn = "geglu"
else:
raise ValueError(f"Unknown dense_act_fn: {hf_model_config.dense_act_fn}")
with open_dict(base_cfg):
base_cfg.encoder.num_layers = hf_model_config.num_layers
base_cfg.encoder.hidden_size = hf_model_config.d_model
base_cfg.encoder.ffn_hidden_size = hf_model_config.d_ff
base_cfg.encoder.kv_channels = hf_model_config.d_kv
base_cfg.encoder.num_attention_heads = hf_model_config.num_heads
base_cfg.encoder.activation = act_fn
base_cfg.encoder.relative_attention_num_buckets = hf_model_config.relative_attention_num_buckets
base_cfg.decoder.num_layers = hf_model_config.num_decoder_layers
base_cfg.decoder.hidden_size = hf_model_config.d_model
base_cfg.decoder.ffn_hidden_size = hf_model_config.d_ff
base_cfg.decoder.kv_channels = hf_model_config.d_kv
base_cfg.decoder.num_attention_heads = hf_model_config.num_heads
base_cfg.decoder.activation = act_fn
base_cfg.decoder.relative_attention_num_buckets = hf_model_config.relative_attention_num_buckets
base_cfg.megatron_amp_O2 = megatron_amp_O2
with tempfile.TemporaryDirectory() as tmp:
tokenizer = AutoTokenizer.from_pretrained(hf_model_name)
tokenizer_path = tokenizer.save_vocabulary(tmp)[0]
base_cfg.tokenizer.model = tokenizer_path
model = MegatronT5Model(base_cfg, trainer).to("cpu")
model._save_restore_connector = NLPSaveRestoreConnector()
state_dict = torch.load(state_dict_path)
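        # With megatron_amp_O2, NeMo wraps the transformer in a half-precision module
        # wrapper, so parameter names gain an extra `module.` segment after the first
        # `model.` (e.g. `enc_dec_model.module.enc_dec_model...`).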
if megatron_amp_O2:
new_state_dict = {}
for key in state_dict.keys():
new_key = key.replace("model.", "model.module.", 1)
new_state_dict[new_key] = state_dict[key]
state_dict = new_state_dict
model.load_state_dict(state_dict)
model.save_to(nemo_file_path)
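
# A quick sanity check after conversion (hypothetical path): the resulting file
# should load back through NeMo's standard restore path, e.g.
#   MegatronT5Model.restore_from("/path/to/nemo_file.nemo")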
if __name__ == "__main__":
parser = ArgumentParser()
parser.add_argument(
"--hf_model_name",
type=str,
required=True,
help="Valid Huggingface T5v1_1 model name ex: google/t5-v1_1-large or google/ul2. Example something that can be loaded with T5ForConditionalGeneration.from_pretrained()",
)
parser.add_argument(
"--nemo_state_dict_path",
type=str,
required=True,
help="Path to write the intermediate nemo state dict file ex: /path/to/nemo_state_dict.pt",
)
parser.add_argument(
"--nemo_file_path",
type=str,
required=True,
help="Path to write the converted .nemo file ex: /path/to/t5_base_converted_to_nemo.nemo",
)
parser.add_argument(
"--base_yaml_config",
type=str,
default="hf_t5v1_1_base_config.yaml",
help="Path to a base yaml config that we edit based on the provided model.",
)
parser.add_argument(
"--megatron_amp_O2",
action="store_true",
help="Whether to store O2 weights. This may be useful for models like ul2 where only pre-trained half precision weights were released.",
)
args = parser.parse_args()
if not os.path.exists(args.base_yaml_config):
raise FileNotFoundError(f"Base yaml config file {args.base_yaml_config} does not exist.")
hf_model_config = convert_weights(args.hf_model_name, args.nemo_state_dict_path)
package_into_nemo_file(
state_dict_path=args.nemo_state_dict_path,
base_yaml_config=args.base_yaml_config,
hf_model_config=hf_model_config,
nemo_file_path=args.nemo_file_path,
hf_model_name=args.hf_model_name,
megatron_amp_O2=args.megatron_amp_O2,
)
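
    # Example invocation for UL2, which ships bf16 weights only (hence --megatron_amp_O2);
    # the paths are placeholders:
    #   python hf_t5-v1_1_to_nemo.py \
    #       --hf_model_name google/ul2 \
    #       --nemo_state_dict_path /path/to/ul2_state_dict.pt \
    #       --nemo_file_path /path/to/ul2.nemo \
    #       --megatron_amp_O2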