tokenizer_name: eluzhnica/mpt-7b-instruct-peft-compatible # Change to 30b when training is working
max_seq_len: 8192
global_seed: 17
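# The ${...} references below are OmegaConf-style interpolations; llm-foundry's train
# script resolves them against these top-level values when it loads this YAML.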

model:
  name: hf_causal_lm
  pretrained: true
  pretrained_model_name_or_path: eluzhnica/mpt-7b-instruct-peft-compatible # Change to 30b when training is working
  init_device: meta
  config_overrides:
    max_seq_len: ${max_seq_len}
    attn_config:
      attn_uses_sequence_id: false
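# Notes on the model block (assuming llm-foundry defaults): init_device: meta defers
# weight allocation so the pretrained weights can be loaded once and broadcast/sharded
# by FSDP; attn_uses_sequence_id is left false, so examples packed into the same
# sequence can attend to each other.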

tokenizer:
  name: ${tokenizer_name}
  kwargs:
    model_max_length: ${max_seq_len}
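  # model_max_length keeps the tokenizer's truncation limit in sync with the model's max_seq_len.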

train_loader:
  name: finetuning
  dataset:
    hf_name: json
    hf_kwargs:
      data_dir: finetune-data
    split: train
    max_seq_len: ${max_seq_len}
    allow_pad_trimming: false
    decoder_only_format: true
    packing_ratio: 9
    shuffle: true
  drop_last: true
  num_workers: 8
  pin_memory: false
  prefetch_factor: 2
  persistent_workers: true
  timeout: 0
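# Notes on the finetuning loader (assuming llm-foundry conventions): hf_name: json uses
# the Hugging Face `json` builder to read JSONL files from finetune-data/, and the
# finetuning collator expects prompt/response fields by default. decoder_only_format
# concatenates prompt + response into a single sequence with the prompt masked out of
# the loss, and packing_ratio: 9 packs roughly nine examples per max_seq_len sequence
# (examples that don't fit a bin are dropped, so tune this ratio to your data).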


####### This part is copy-pasted from the CPU example. Change it back to match the other examples
scheduler:
  name: cosine_with_warmup
  t_warmup: 100ba
  alpha_f: 0.1
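  # cosine_with_warmup: linear warmup for 100 batches ("ba"), then cosine decay to
  # alpha_f * lr (0.1 * 6.0e-4 here) over the rest of max_duration.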

optimizer:
  name: decoupled_adamw
  lr: 6.0e-4
  betas:
  - 0.9
  - 0.95
  eps: 1.0e-08
  weight_decay: 0.0
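  # decoupled_adamw is Composer's AdamW with weight decay decoupled from the learning
  # rate. These values come from the copy-pasted pretraining/CPU example (see note above);
  # finetuning usually calls for a much smaller lr.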

algorithms:
  gradient_clipping:
    clipping_type: norm
    clipping_threshold: 1.0
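    # Composer's gradient_clipping algorithm: "norm" rescales gradients so their global
    # L2 norm stays at or below clipping_threshold.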

############

max_duration: 1ep # Adjust as needed. I set this to 1ep because the CPU example did; the default was 8ep
eval_interval: 1 # Same here; the default was 1ep
eval_first: false # Same here again; the default was true
global_train_batch_size: 2 # Increase once the training data is complete

seed: ${global_seed}
device_eval_batch_size: 4
device_train_microbatch_size: 1
precision: fp32 # Change to amp_bf16 on a supported GPU; use fp32 on CPU
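# Batch-size bookkeeping: Composer derives gradient accumulation as
# global_train_batch_size / (device_train_microbatch_size * num_devices); with the values
# above on a single device that is 2 / 1 = 2 microbatches per optimizer step.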

fsdp_config:
  sharding_strategy: FULL_SHARD
  mixed_precision: PURE
  activation_checkpointing: true
  activation_checkpointing_reentrant: false
  activation_cpu_offload: false
  limit_all_gathers: true
  sync_module_states: true
  verbose: false
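# FSDP notes (assumptions): FULL_SHARD shards parameters, gradients, and optimizer state
# across ranks (ZeRO-3 style); mixed_precision: PURE keeps everything in the training
# precision; sync_module_states broadcasts rank 0's loaded weights to the meta-initialized
# ranks. llm-foundry typically drops this block when running on a single process/CPU.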

progress_bar: true
log_to_console: true
console_log_interval: 20ba

callbacks:
  speed_monitor:
    window_size: 10
  runtime_estimator: {}
  lr_monitor: {}
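  # speed_monitor logs throughput averaged over a 10-batch window, runtime_estimator logs
  # an estimated time to completion, and lr_monitor logs the current learning rate.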

# loggers:
#   wandb: {}

# save_folder:
# save_interval: 3ep
# save_num_checkpoints_to_keep: 1

# need to use converted checkpoint with llm-foundry code
# load_path:
autoresume: false
load_weights_only: false
python_log_level: debug


icl_max_seq_len: 2048
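# Only used by in-context-learning (ICL) eval tasks; presumably ignored unless an
# icl_tasks section is added.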