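# Encoder transformer stack: 8 layers, hidden size 512, 6 heads, relative position
# embeddings, RMSNorm, GEGLU activation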
encoder:
  num_layers: 8
  hidden_size: 512
  ffn_hidden_size: 1024
  num_attention_heads: 6
  init_method_std: 0.02
  hidden_dropout: 0.0
  attention_dropout: 0.0
  ffn_dropout: 0.0
  position_embedding_type: relative
  relative_attention_num_buckets: 32
  relative_attention_max_distance: 128
  relative_position_bias_self_attention_only: true
  kv_channels: 64
  apply_query_key_layer_scaling: false
  layernorm_epsilon: 1.0e-06
  persist_layer_norm: true
  bias_activation_fusion: false
  grad_div_ar_fusion: true
  masked_softmax_fusion: false
  bias_dropout_add_fusion: false
  bias: false
  normalization: rmsnorm
  arch: transformer
  activation: geglu
  headscale: false
  transformer_block_type: pre_ln
  hidden_steps: 32
  num_self_attention_per_cross_attention: 1
  openai_gelu: true
  onnx_safe: false
  fp32_residual_connection: false
  activations_checkpoint_method: null
  activations_checkpoint_num_layers: 1
  megatron_legacy: true
  normalize_attention_scores: false
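# Decoder transformer stack (same settings as the encoder)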
decoder:
  num_layers: 8
  hidden_size: 512
  ffn_hidden_size: 1024
  num_attention_heads: 6
  init_method_std: 0.02
  hidden_dropout: 0.0
  attention_dropout: 0.0
  ffn_dropout: 0.0
  position_embedding_type: relative
  relative_attention_num_buckets: 32
  relative_attention_max_distance: 128
  relative_position_bias_self_attention_only: true
  kv_channels: 64
  apply_query_key_layer_scaling: false
  layernorm_epsilon: 1.0e-06
  persist_layer_norm: true
  bias_activation_fusion: false
  grad_div_ar_fusion: true
  masked_softmax_fusion: false
  bias_dropout_add_fusion: false
  bias: false
  normalization: rmsnorm
  arch: transformer
  activation: geglu
  headscale: false
  transformer_block_type: pre_ln
  hidden_steps: 32
  num_self_attention_per_cross_attention: 1
  openai_gelu: true
  onnx_safe: false
  fp32_residual_connection: false
  activations_checkpoint_method: null
  activations_checkpoint_num_layers: 1
  megatron_legacy: true
  normalize_attention_scores: false
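# Batch sizes and model-parallel layout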
micro_batch_size: 4
global_batch_size: 8
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
resume_from_checkpoint: null
pipeline_model_parallel_split_rank: 0
make_vocab_size_divisible_by: 128
megatron_amp_O2: false
grad_allreduce_chunk_size_mb: 125
grad_div_ar_fusion: true
gradient_as_bucket_view: true
seq_length: 512
max_position_embeddings: 512
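# SentencePiece tokenizer with 100 T5 sentinel tokens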
tokenizer:
  library: sentencepiece
  type: null
  model: nemo:ce65b6d8f4fb4975955e935db699cba3_t5_small_tokenizer.model
  vocab_file: null
  merge_file: null
  num_sentinel_tokens: 100
  sentencepiece_legacy: true
  add_sentinel_tokens_in_reverse_order: true
  add_sentinel_tokens_first: true
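# Embedding initialization, weight tying, and mixed-precision / runtime settings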
embedding_init_method_std: 0.02
embedding_dropout: 0.1
share_token_embeddings: true
share_decoder_tokens_head_embeddings: false
tokens_head_bias: false
native_amp_init_scale: 4294967296
native_amp_growth_interval: 1000
fp16_lm_cross_entropy: false
seed: 1234
use_cpu_initialization: false
apex_transformer_log_level: 30
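# T5 span-corruption pretraining data settings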
data:
  data_prefix: null
  index_mapping_dir: null
  data_impl: mmap
  splits_string: 949,45,5
  seq_length: 512
  seq_length_dec: 128
  skip_warmup: true
  num_workers: 0
  dataloader_type: single
  masked_lm_prob: 0.15
  dataset_type: t5
  short_seq_prob: 0.0
  max_ngram_size: 10
  mean_ngram_size: null
  geometric_dist: true
  permutation: false
  whole_word_masking: false
  favor_longer_ngrams: false
  respect_document_boundaries: true
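# Fused Adam optimizer with warmup-annealing LR schedule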
optim:
  name: fused_adam
  lr: 0.0001
  betas:
    - 0.9
    - 0.999
  eps: 1.0e-08
  weight_decay: 0.01
  sched:
    name: WarmupAnnealing
    min_lr: 1.0e-05
    last_epoch: -1
    warmup_ratio: 0.01
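# Training precision and NeMo checkpoint metadata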
precision: bf16
target: nemo.collections.nlp.models.language_modeling.megatron_t5_model.MegatronT5Model
nemo_version: 1.11.0rc0
library: huggingface-t5v1_1 # options ['huggingface-t5v1_1', 'nemo-megatron']
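# Minimal restore sketch (assumes a packaged .nemo checkpoint and an already
# configured PyTorch Lightning Trainer; the path below is a placeholder):
#   from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model
#   model = MegatronT5Model.restore_from('<path/to/checkpoint.nemo>', trainer=trainer)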