defaults:
- .@model.encoder: megatron_model_ul2base_config
- .@model.decoder: megatron_model_ul2base_config
name: megatron_ul2
restore_from_path: null # used when starting from a .nemo file
trainer:
devices: 1
num_nodes: 1
accelerator: gpu
precision: 16
logger: False # logger provided by exp_manager
enable_checkpointing: False
replace_sampler_ddp: False
max_epochs: -1 # PTL default. In practice, max_steps will be reached first.
max_steps: 524288 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
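  # Rough scale check (our own arithmetic, assuming the effective batch per step equals
  # model.global_batch_size = 2080): 524288 steps * 2080 samples ~ 1.09B samples,
  # i.e. ~558B encoder tokens at model.seq_length 512.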
log_every_n_steps: 100
val_check_interval: 1000
limit_val_batches: 30
limit_test_batches: 500
accumulate_grad_batches: 1
gradient_clip_val: 1.0
exp_manager:
explicit_log_dir: null
exp_dir: /project/scratch/p200097/nemo_experiments/
name: megatron.ul2-base-nl36.unigram-64k-pretok-small_data.all-clean
create_wandb_logger: False
wandb_logger_kwargs:
project: null
name: null
resume_if_exists: True
resume_ignore_no_checkpoint: True
create_checkpoint_callback: True
checkpoint_callback_params:
monitor: val_loss
save_top_k: 10
mode: min
always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
filename: '${name}--{val_loss:.2f}-{step}-{consumed_samples}'
model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}
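    # Illustrative checkpoint name produced by the filename template above, assuming ${name}
    # resolves to the top-level name (megatron_ul2) and using made-up metric values:
    #   megatron_ul2--1.95-250000-520000000.ckpt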
model:
# model parallelism
micro_batch_size: 10
  # Example batch-size arithmetic (does not match the micro_batch_size/global_batch_size set here):
  # 4 GPUs * 24 nodes = 96 GPUs
  # 96 GPUs * 7 micro_batch_size = 672 sequences per step
  # 672 * 3 accumulated micro-batches = 2016 global_batch_size
global_batch_size: 2080 # will use more micro batches to reach global batch size
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
resume_from_checkpoint: null # manually set the checkpoint file to load from
pipeline_model_parallel_split_rank: 0 # rank at which decoder starts.
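  # Batch/parallelism bookkeeping (our summary of the Megatron constraints):
  # data_parallel_size = world_size / (tensor_model_parallel_size * pipeline_model_parallel_size),
  # and global_batch_size must be divisible by micro_batch_size * data_parallel_size; the quotient
  # is the number of accumulated micro-batches per step. Hypothetical example: with 208
  # data-parallel ranks, 2080 / (10 * 208) = 1 micro-batch per rank per step.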
# model architecture
make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency.
megatron_amp_O2: False # use AMP with O2 style mixed precision instead of native amp on-the-fly weight autocasting.
grad_allreduce_chunk_size_mb: 125
grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce
gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
seq_length: 512
max_position_embeddings: ${.seq_length}
tokenizer:
library: 'huggingface'
type: 'KBLab/unigram-64k-pretok-small_data-tokenizer'
model: null
vocab_file: null
merge_file: null
num_sentinel_tokens: 256
sentencepiece_legacy: True # Legacy=True allows you to add special tokens to sentencepiece tokenizers.
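    # Note: num_sentinel_tokens adds the mask/sentinel tokens used as span-corruption targets.
    # By the T5/HuggingFace convention these look like <extra_id_0> ... <extra_id_255>; the exact
    # token strings depend on the tokenizer, so treat the naming here as an assumption.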
# tokenizer:
# library: 'megatron'
# type: 'BertWordPieceCase'
# model: null
# vocab_file: null
# merge_file: null
# num_sentinel_tokens: 100
# sentencepiece_legacy: True # Legacy=True allows you to add special tokens to sentencepiece tokenizers.
# weight init
  embedding_init_method_std: 0.02 # Standard deviation of the zero mean normal distribution used for weight initialization.
# embedding dropout
embedding_dropout: 0.1
# embedding sharing
share_token_embeddings: True # If True share encoder/decoder embeddings
share_decoder_tokens_head_embeddings: True # If True share decoder embeddings and decoder projection to logits
# token head
tokens_head_bias: False
# precision
native_amp_init_scale: 4294967296 # 2 ** 32
native_amp_growth_interval: 1000
fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16
# miscellaneous
seed: 1234
use_cpu_initialization: False # Init weights on the CPU (slow for large models)
apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this
data:
# Path to data must be specified by the user.
# can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-t5_00_text_document,.5,/raid/data/pile/my-t5_01_text_document]",
# Or see example below:
# data_prefix:
# - .5
# - /raid/data/pile/my-t5_00_text_document
# - .5
# - /raid/data/pile/my-t5_01_text_document
data_prefix:
- 0.005
- /project/scratch/p200097/data/unigram-64k-pretok-small_data/wikipedia-unigram-64k-pretok-small_data_text_sentence
- 0.035
- /project/scratch/p200097/data/unigram-64k-pretok-small_data/edepos_html-unigram-64k-pretok-small_data_text_sentence
- 0.030
- /project/scratch/p200097/data/unigram-64k-pretok-small_data/oscar-unigram-64k-pretok-small_data_text_sentence
- 0.105
- /project/scratch/p200097/data/unigram-64k-pretok-small_data/kw3-2017-unigram-64k-pretok-small_data_text_sentence
- 0.177
- /project/scratch/p200097/data/unigram-64k-pretok-small_data/issues-unigram-64k-pretok-small_data_text_sentence
- 0.648
- /project/scratch/p200097/data/unigram-64k-pretok-small_data/mc4-unigram-64k-pretok-small_data_text_sentence
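    # The sampling weights above sum to 1.000 (0.005 + 0.035 + 0.030 + 0.105 + 0.177 + 0.648),
    # so e.g. ~64.8% of sampled sequences are drawn from the mc4 shard and ~0.5% from wikipedia.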
index_mapping_dir: /project/scratch/p200097/data/unigram-64k-pretok-small_data/npy_files_ul2/ # path to save index mapping .npy files, by default will save in the same location as data_prefix
data_impl: mmap
# data_impl_kwargs: # currently used only for text_mmap, csv_mmap (should be data_impl dependant)
# # defaults for text_memmap
# newline_int: 10 # byte-value of newline (Use ord('\n') to get value)
# header_lines: 0 # skip first N header lines
# workers: null # number of workers when creating missing index files (null defaults to cpu_num // 2)
# sort_dataset_paths: False # if True datasets will be sorted by name
# # defaults for csv_memmap
# newline_int: 10 # byte-value of newline
# header_lines: 1 # skip first N header lines
# workers: null # number of workers when creating missing index files (null defaults to cpu_num // 2)
# sort_dataset_paths: False # if True datasets will be sorted by name
# data_col: 1 # column to use for data
# data_sep: ',' # string to split text into columns
    splits_string: 996,2,2 # train/validation/test split weights: 99.6% / 0.2% / 0.2%
seq_length: ${model.seq_length}
seq_length_dec: ${model.seq_length}
skip_warmup: True
num_workers: 32
    dataloader_type: single # options: single, cyclic
masked_lm_prob: 0.15
extreme_masked_lm_prob: 0.5
dataset_type: 'ul2'
short_seq_prob: 0.0
max_ngram_size: 10
extreme_max_ngram_size: 128
extreme_min_ngram_size: 32
extreme_mean_ngram_size: 64
ngram_span_length_distribution: 'geometric'
extreme_ngram_span_length_distribution: 'truncated_normal'
prefix_lm_pivot_mean: 0.25
mean_ngram_size: 3
permutation: False
whole_word_masking: True
favor_longer_ngrams: False
    respect_document_boundaries: True # If true, a single training example cannot cross document boundaries, increasing the fraction of <pad> tokens within a batch.
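    # How we read the UL2 mixture above (interpretation, not authoritative): masked_lm_prob 0.15
    # with mean_ngram_size 3 ~ the regular R-denoiser; extreme_masked_lm_prob 0.5 with span sizes
    # 32-128 (mean 64) ~ the X-denoiser; prefix_lm_pivot_mean 0.25 sets the pivot for the
    # S-denoiser (prefix-LM) objective.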
optim:
name: fused_adam
lr: 0.001
weight_decay: 0.01
betas:
- 0.9
- 0.999
eps: 1e-8
sched:
name: CosineAnnealing
warmup_steps: 1600
constant_steps: 30000 #40000
min_lr: 5e-6
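      # Intended schedule, as we read NeMo's CosineAnnealing (exact behavior may differ by
      # version): linear warmup from 0 to lr 0.001 over 1600 steps, cosine decay towards
      # min_lr 5e-6, with constant_steps 30000 held at min_lr at the end of training.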
# optim:
# name: fused_adam
# lr: 0.0001
# betas:
# - 0.9
# - 0.999
# eps: 1e-8
# weight_decay: 0.01
# sched:
# name: WarmupAnnealing
# min_lr: 0.00001
# last_epoch: -1
# warmup_ratio: 0.005