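# Fine-tuning config for eluzhnica/mpt-7b-instruct-peft-compatible.
# Key names and ${...} interpolation follow MosaicML LLM Foundry / Composer
# conventions; these top-level variables are referenced throughout.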
tokenizer_name: eluzhnica/mpt-7b-instruct-peft-compatible
max_seq_len: 8192
global_seed: 17

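# Model: load pretrained weights from the Hugging Face Hub. init_device: meta
# defers parameter allocation so FSDP can shard the model without first
# materializing it in host memory. attn_uses_sequence_id: false means packed
# examples are not attention-masked from one another (see packing_ratio below).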
model:
  name: hf_causal_lm
  pretrained: true
  pretrained_model_name_or_path: eluzhnica/mpt-7b-instruct-peft-compatible
  init_device: meta
  config_overrides:
    max_seq_len: ${max_seq_len}
    attn_config:
      attn_uses_sequence_id: false

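# Tokenizer: reuse the model's tokenizer, capped at the training context length.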
tokenizer:
  name: ${tokenizer_name}
  kwargs:
    model_max_length: ${max_seq_len}

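# Training data: local JSON files under finetune-data/, read through the
# Hugging Face `json` dataset builder. packing_ratio: 9 packs roughly nine
# raw examples into each max_seq_len sequence to reduce padding waste.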
train_loader:
  name: finetuning
  dataset:
    hf_name: json
    hf_kwargs:
      data_dir: finetune-data
    split: train
    max_seq_len: ${max_seq_len}
    allow_pad_trimming: false
    decoder_only_format: true
    packing_ratio: 9
    shuffle: true
  drop_last: true
  num_workers: 8
  pin_memory: false
  prefetch_factor: 2
  persistent_workers: true
  timeout: 0

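# LR schedule: linear warmup for 100 batches (`100ba`), then cosine decay
# down to alpha_f * lr (10% of the peak learning rate).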
scheduler:
  name: cosine_with_warmup
  t_warmup: 100ba
  alpha_f: 0.1

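# Optimizer: AdamW with weight decay decoupled from the learning rate
# (Composer's decoupled_adamw); weight_decay: 0.0 disables regularization here.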
optimizer:
  name: decoupled_adamw
  lr: 6.0e-4
  betas:
  - 0.9
  - 0.95
  eps: 1.0e-08
  weight_decay: 0.0

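# Clip the global gradient norm at 1.0 each step.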
algorithms:
  gradient_clipping:
    clipping_type: norm
    clipping_threshold: 1.0

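# Durations use Composer notation: `ep` = epochs, `ba` = batches; a bare
# integer eval_interval counts epochs. global_train_batch_size is the total
# batch size summed across all devices.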
max_duration: 1ep
eval_interval: 1
eval_first: false
global_train_batch_size: 2

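# With global_train_batch_size: 2 and device_train_microbatch_size: 1,
# Composer derives per-device gradient accumulation automatically.
# precision: fp32 trains in full float32 (no autocast).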
seed: ${global_seed}
device_eval_batch_size: 4
device_train_microbatch_size: 1
precision: fp32

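# FSDP: fully shard parameters, gradients, and optimizer state across ranks.
# mixed_precision: PURE keeps compute in the dtype set by `precision` above,
# so this run is effectively pure fp32.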
fsdp_config:
  sharding_strategy: FULL_SHARD
  mixed_precision: PURE
  activation_checkpointing: true
  activation_checkpointing_reentrant: false
  activation_cpu_offload: false
  limit_all_gathers: true
  sync_module_states: true
  verbose: false

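# Console logging every 20 batches.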
progress_bar: true
log_to_console: true
console_log_interval: 20ba

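# Monitoring: throughput averaged over a 10-batch window, estimated time to
# completion, and the current learning rate.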
callbacks:
  speed_monitor:
    window_size: 10
  runtime_estimator: {}
  lr_monitor: {}

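# No auto-resume from a prior checkpoint; if a load_path were given, the full
# training state (not just weights) would be restored. Debug-level Python logs.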
autoresume: false
load_weights_only: false
python_log_level: debug

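# Max sequence length for in-context-learning (ICL) eval tasks, if any are
# configured.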
icl_max_seq_len: 2048