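# NeMo Megatron T5 model configuration (nemo_version 1.11.0rc0). The
# encoder/decoder hyperparameters below match t5-v1_1-small: 8 layers,
# d_model 512, d_ff 1024, 6 attention heads of 64 channels each, gated-GELU
# (geglu) activation, RMSNorm, relative position embeddings, and no bias terms.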
encoder:
  num_layers: 8
  hidden_size: 512
  ffn_hidden_size: 1024
  num_attention_heads: 6
  init_method_std: 0.02
  hidden_dropout: 0.0
  attention_dropout: 0.0
  ffn_dropout: 0.0
  position_embedding_type: relative
  relative_attention_num_buckets: 32
  relative_attention_max_distance: 128
  relative_position_bias_self_attention_only: true
  kv_channels: 64
  apply_query_key_layer_scaling: false
  layernorm_epsilon: 1.0e-06
  persist_layer_norm: true
  bias_activation_fusion: false
  grad_div_ar_fusion: true
  masked_softmax_fusion: false
  bias_dropout_add_fusion: false
  bias: false
  normalization: rmsnorm
  arch: transformer
  activation: geglu
  headscale: false
  transformer_block_type: pre_ln
  hidden_steps: 32
  num_self_attention_per_cross_attention: 1
  openai_gelu: true
  onnx_safe: false
  fp32_residual_connection: false
  activations_checkpoint_method: null
  activations_checkpoint_num_layers: 1
  megatron_legacy: true
  normalize_attention_scores: false
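# Decoder: mirrors the encoder block-for-block (same sizes, activation,
# normalization, and fusion flags).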
decoder:
  num_layers: 8
  hidden_size: 512
  ffn_hidden_size: 1024
  num_attention_heads: 6
  init_method_std: 0.02
  hidden_dropout: 0.0
  attention_dropout: 0.0
  ffn_dropout: 0.0
  position_embedding_type: relative
  relative_attention_num_buckets: 32
  relative_attention_max_distance: 128
  relative_position_bias_self_attention_only: true
  kv_channels: 64
  apply_query_key_layer_scaling: false
  layernorm_epsilon: 1.0e-06
  persist_layer_norm: true
  bias_activation_fusion: false
  grad_div_ar_fusion: true
  masked_softmax_fusion: false
  bias_dropout_add_fusion: false
  bias: false
  normalization: rmsnorm
  arch: transformer
  activation: geglu
  headscale: false
  transformer_block_type: pre_ln
  hidden_steps: 32
  num_self_attention_per_cross_attention: 1
  openai_gelu: true
  onnx_safe: false
  fp32_residual_connection: false
  activations_checkpoint_method: null
  activations_checkpoint_num_layers: 1
  megatron_legacy: true
  normalize_attention_scores: false
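# Batching and parallelism. global_batch_size / micro_batch_size = 2, so the
# global batch is covered by data parallelism and/or gradient accumulation;
# tensor and pipeline model parallelism are both disabled (size 1).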
micro_batch_size: 4
global_batch_size: 8
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
resume_from_checkpoint: null
pipeline_model_parallel_split_rank: 0
make_vocab_size_divisible_by: 128
megatron_amp_O2: false
grad_allreduce_chunk_size_mb: 125
grad_div_ar_fusion: true
gradient_as_bucket_view: true
seq_length: 512
max_position_embeddings: 512
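# Tokenizer: SentencePiece model with 100 T5 sentinel tokens appended in
# reverse order (<extra_id_99> first, so <extra_id_0> gets the highest id),
# matching the Hugging Face T5 vocabulary layout.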
tokenizer:
  library: sentencepiece
  type: null
  model: nemo:ce65b6d8f4fb4975955e935db699cba3_t5_small_tokenizer.model
  vocab_file: null
  merge_file: null
  num_sentinel_tokens: 100
  sentencepiece_legacy: true
  add_sentinel_tokens_in_reverse_order: true
  add_sentinel_tokens_first: true
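# Embedding and weight-sharing settings. As in T5 v1.1, the decoder output
# head is not tied to the token embeddings, and the head carries no bias.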
embedding_init_method_std: 0.02
embedding_dropout: 0.1
share_token_embeddings: true
share_decoder_tokens_head_embeddings: false
tokens_head_bias: false
native_amp_init_scale: 4294967296
native_amp_growth_interval: 1000
fp16_lm_cross_entropy: false
seed: 1234
use_cpu_initialization: false
apex_transformer_log_level: 30
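# Data: T5 span-corruption pretraining over a memory-mapped (mmap) indexed
# dataset. splits_string 949,45,5 weights train/validation/test at roughly
# 95%/4.5%/0.5%; 15% of tokens are masked in spans whose lengths are drawn
# from a geometric distribution capped at 10-grams. Encoder sequences are
# 512 tokens, decoder targets 128.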
data:
  data_prefix: null
  index_mapping_dir: null
  data_impl: mmap
  splits_string: 949,45,5
  seq_length: 512
  seq_length_dec: 128
  skip_warmup: true
  num_workers: 0
  dataloader_type: single
  masked_lm_prob: 0.15
  dataset_type: t5
  short_seq_prob: 0.0
  max_ngram_size: 10
  mean_ngram_size: null
  geometric_dist: true
  permutation: false
  whole_word_masking: false
  favor_longer_ngrams: false
  respect_document_boundaries: true
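# Optimizer: fused Adam (lr 1e-4, betas 0.9/0.999, weight decay 0.01) with a
# WarmupAnnealing schedule: warmup over the first 1% of steps, then annealing
# down to min_lr 1e-5.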
optim:
  name: fused_adam
  lr: 0.0001
  betas:
  - 0.9
  - 0.999
  eps: 1.0e-08
  weight_decay: 0.01
  sched:
    name: WarmupAnnealing
    min_lr: 1.0e-05
    last_epoch: -1
    warmup_ratio: 0.01
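# Precision and restore metadata: bf16 training; `target` names the class
# NeMo instantiates when this configuration is restored.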
precision: bf16
target: nemo.collections.nlp.models.language_modeling.megatron_t5_model.MegatronT5Model
nemo_version: 1.11.0rc0
library: huggingface-t5v1_1 # options ['huggingface-t5v1_1', 'nemo-megatron']
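# Usage sketch (assumes this config was exported from a .nemo checkpoint; the
# archive name below is hypothetical):
#   from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model
#   model = MegatronT5Model.restore_from("megatron_t5_small.nemo")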