import argparse
import json
import os
import re
import zipfile

import torch


####################################################################################################
# This code is modified from https:

def recursive_print(name, val, spaces=0):
    # Format the message.
    if name is None:
        msg = None
    else:
        fmt = "." * max(0, spaces - 2) + "# {:" + str(50 - spaces) + "s}"
        msg = fmt.format(name)

    # Print and recurse (if needed).
    if isinstance(val, dict):
        if msg is not None:
            print(msg)
        for k in val.keys():
            recursive_print(k, val[k], spaces + 2)
    elif isinstance(val, torch.Tensor):
        print(msg, ":", val.size())
    else:
        print(msg, ":", val)


def convert_megatron_checkpoint(input_state_dict, head_model=True):
    # The converted output model.
    output_state_dict = {}

    # The model.
    model = input_state_dict["model"]
    # The language model.
    lm = model["language_model"]
    # The embeddings.
    embeddings = lm["embedding"]

    # The word embeddings.
    word_embeddings = embeddings["word_embeddings"]["weight"]
    # Store the word embeddings.
    output_state_dict["bert.embeddings.word_embeddings.weight"] = word_embeddings

    # The position embeddings.
    pos_embeddings = embeddings["position_embeddings"]["weight"]
    # Trained for 512 x 1024.
    assert pos_embeddings.size(0) == 512 and pos_embeddings.size(1) == 1024
    # Store the position embeddings.
    output_state_dict["bert.embeddings.position_embeddings.weight"] = pos_embeddings

    # The token-type embeddings.
    tokentype_embeddings = embeddings["tokentype_embeddings"]["weight"]
    # Store the token-type embeddings.
    output_state_dict["bert.embeddings.token_type_embeddings.weight"] = tokentype_embeddings

    # The transformer.
    transformer = lm["transformer"]

    # The regex to extract layer names.
    layer_re = re.compile(r"layers\.(\d+)\.([a-z0-9_.]+)\.([a-z]+)")
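    # For example, the key "layers.0.attention.dense.weight" yields the groups
    # ("0", "attention.dense", "weight"): layer index, operation name, and weight/bias suffix.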

    # The simple map of names for "automated" rules.
    megatron_to_transformers = {
        "attention.dense": ".attention.output.dense.",
        "mlp.dense_h_to_4h": ".intermediate.dense.",
        "mlp.dense_4h_to_h": ".output.dense.",
    }
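    # Combined with the layer prefix built below, a key such as "layers.3.mlp.dense_h_to_4h.weight"
    # becomes "bert.encoder.layer.3.intermediate.dense.weight".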

    # Keep track of the attention/query/value tensor.
    attention_qkv_weight = None

    # Extract the layers.
    for key, val in transformer.items():
        # Match the name.
        m = layer_re.match(key)

        # Stop if that's not a layer.
        if m is None:
            break

        # The index of the layer.
        layer_idx = int(m.group(1))
        # The name of the operation.
        op_name = m.group(2)
        # Is it a weight or a bias?
        weight_or_bias = m.group(3)

        # The name of the layer.
        layer_name = f"bert.encoder.layer.{layer_idx}"

        # For layernorm(s), simply store the layer norm.
        if op_name.endswith("layernorm"):
            ln_name = "attention.ln" if op_name.startswith("input") else "ln"
            output_state_dict[layer_name + "." + ln_name + "." + weight_or_bias] = val

        # Transpose the QKV matrix.
        elif op_name == "attention.query_key_value" and weight_or_bias == "weight":
            # Make sure the QKV pointer is nil.
            assert attention_qkv_weight is None, "Unexpected second QKV weight before the matching bias was seen."

            # Store the tensor as we need the bias as well to interleave QKV and biases.
            attention_qkv_weight = val

        # Transpose the bias.
        elif op_name == "attention.query_key_value" and weight_or_bias == "bias":
            # Make sure we read the weight tensor.
            assert attention_qkv_weight is not None, "Found a QKV bias before the corresponding QKV weight."

            # Split the fused QKV matrix into Q, K and V. Megatron stores the three projections
            # in a single tensor; this split assumes they are stacked contiguously in blocks of
            # the 1024-dim hidden size.
            q = attention_qkv_weight[0 * 1024 : 1 * 1024, :]
            k = attention_qkv_weight[1 * 1024 : 2 * 1024, :]
            v = attention_qkv_weight[2 * 1024 : 3 * 1024, :]

            # Split the bias.
            q_bias = val[0 * 1024 : 1 * 1024]
            k_bias = val[1 * 1024 : 2 * 1024]
            v_bias = val[2 * 1024 : 3 * 1024]

            # Store.
            output_state_dict[f"{layer_name}.attention.self.query.weight"] = q
            output_state_dict[f"{layer_name}.attention.self.query.bias"] = q_bias
            output_state_dict[f"{layer_name}.attention.self.key.weight"] = k
            output_state_dict[f"{layer_name}.attention.self.key.bias"] = k_bias
            output_state_dict[f"{layer_name}.attention.self.value.weight"] = v
            output_state_dict[f"{layer_name}.attention.self.value.bias"] = v_bias

            # Clear the stored tensor.
            attention_qkv_weight = None
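            # Each projection now has a [1024, 1024] weight and a [1024] bias, matching the
            # separate query/key/value linear layers expected on the Hugging Face side.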

        # Copy weights and biases as is.
        elif weight_or_bias in ["weight", "bias"]:
            out_name = megatron_to_transformers[op_name]
            output_state_dict[layer_name + out_name + weight_or_bias] = val

    # The final layernorm.
    output_state_dict["bert.encoder.ln.weight"] = transformer["final_layernorm.weight"]
    output_state_dict["bert.encoder.ln.bias"] = transformer["final_layernorm.bias"]

    # The config.
    output_config = {
        "vocab_size": word_embeddings.size(0),
        "hidden_size": 1024,
        "num_hidden_layers": 24,
        "num_attention_heads": 16,
        "hidden_act": "gelu_new",
        "intermediate_size": 4096,
        "hidden_dropout_prob": 0.1,
        "attention_probs_dropout_prob": 0.1,
        "max_position_embeddings": 512,
        "type_vocab_size": 2,
        "initializer_range": 0.2,
        "layer_norm_eps": 1e-12,
        "position_embedding_type": "absolute",
        "use_cache": False,
        "model_type": "megatron-bert",
    }
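    # Note: these hyperparameters are hard-coded (as are the 512/1024 assert and the QKV split
    # above) and appear to match the Megatron BERT-345M configuration: 24 layers, hidden size
    # 1024, 16 attention heads, 512 positions. Converting a differently sized checkpoint would
    # require adjusting them.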

    if head_model:
        # The pooler.
        pooler = lm["pooler"]

        # Store the matrix and the bias.
        output_state_dict["bert.pooler.dense.weight"] = pooler["dense.weight"]
        output_state_dict["bert.pooler.dense.bias"] = pooler["dense.bias"]

        # The LM head from Megatron (for RACE).
        lm_head = model["lm_head"]

        # The transform matrix.
        output_state_dict["cls.predictions.transform.dense.weight"] = lm_head["dense.weight"]
        output_state_dict["cls.predictions.transform.dense.bias"] = lm_head["dense.bias"]

        # The transform LN.
        output_state_dict["cls.predictions.transform.LayerNorm.weight"] = lm_head["layernorm.weight"]
        output_state_dict["cls.predictions.transform.LayerNorm.bias"] = lm_head["layernorm.bias"]

        # For the decoder, we replicate the weights.
        output_state_dict["cls.predictions.decoder.weight"] = word_embeddings
        output_state_dict["cls.predictions.bias"] = lm_head["bias"]
        # The classifier from Megatron (for MNLI).
        binary_head = model["binary_head"]

        # Store the classifier.
        output_state_dict["cls.seq_relationship.weight"] = binary_head["weight"]
        output_state_dict["cls.seq_relationship.bias"] = binary_head["bias"]

    # It should be done!
    return output_state_dict, output_config
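

####################################################################################################


# A minimal usage sketch, not part of the original script: it assumes the Megatron checkpoint was
# produced by torch.save with the "model"/"language_model" layout expected above, and that the
# converted files should be written next to it. Paths and flag names here are illustrative only.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Convert a Megatron BERT checkpoint to the Hugging Face format.")
    parser.add_argument("path_to_checkpoint", type=str, help="Path to the Megatron checkpoint file (.pt).")
    parser.add_argument("--print-checkpoint-structure", action="store_true", help="Dump the checkpoint layout.")
    args = parser.parse_args()

    # Load the Megatron checkpoint on CPU.
    input_state_dict = torch.load(args.path_to_checkpoint, map_location="cpu")

    # Optionally inspect the nested structure with the helper defined above.
    if args.print_checkpoint_structure:
        recursive_print(None, input_state_dict)

    # Convert to the Hugging Face naming scheme.
    output_state_dict, output_config = convert_megatron_checkpoint(input_state_dict)

    # Write the config and the weights next to the input checkpoint.
    basename = os.path.dirname(args.path_to_checkpoint)
    with open(os.path.join(basename, "config.json"), "w") as config_file:
        json.dump(output_config, config_file)
    torch.save(output_state_dict, os.path.join(basename, "pytorch_model.bin"))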