|
{ |
|
"async_checkpointing": false, |
|
"async_eval_ngpus": -1, |
|
"batch_size": 4, |
|
"data": "", |
|
"disable_logging": false, |
|
"disable_workers_print": false, |
|
"dtype": "bf16", |
|
"dump_after_steps": 0, |
|
"dump_dir": "/fsx-onellm/rpasunuru/SFT/v2.1_textpp_30b_730k_sftv1.4_exp1/v2.1_textpp_30b_730k_sftv1.4_exp1_run000", |
|
"dump_freq": 400, |
|
"dump_profile_traces": false, |
|
"enable_loss_tracker": false, |
|
"epochs": -1, |
|
"eval_freq": 100000, |
|
"exp_id": "", |
|
"exp_name": "", |
|
"finetuning_dir": "/fsx-onellm/shared/from_rsc//v2.1_30b_qk_zloss_snorm_Nov_26_3_run000_checkpoint_0730000", |
|
"fp32_reduce_scatter": "all", |
|
"gpu_check_level": 3, |
|
"image_loss_weight": 1.0, |
|
"image_text_rotation_prob": 0.0, |
|
"instruct": { |
|
"no_loss_prompt": true, |
|
"no_loss_truncated": false, |
|
"use_eot": true |
|
}, |
|
"instruct_data": "/fsx-onellm/shared/finetuning/sft_v1.4_data/splits/train/long_caption:2.92,/fsx-onellm/shared/finetuning/sft_v1.4_data/splits/train/vqa:4.59,/fsx-onellm/shared/finetuning/sft_v1.4_data/splits/train/text2image:10.44,/fsx-onellm/shared/finetuning/sft_v1.4_data/splits/train/llama2_rjv6_helpful:43.27,/fsx-onellm/shared/finetuning/sft_v1.4_data/splits/train/code_llama:0.51,/fsx-onellm/shared/finetuning/sft_v1.4_data/splits/train/interleaved_batch1-17:27.45,/fsx-onellm/shared/finetuning/sft_v1.4_data/splits/train/image_dialogue:7.46,/fsx-onellm/shared/finetuning/sft_v1.4_data/splits/train/llama2_rjv6_harmless:0.97,/fsx-onellm/shared/finetuning/sft_v1.4_data/splits/train/cybersec_safety:0.33,/fsx-onellm/shared/finetuning/sft_v1.4_data/splits/train/onellm_multimodal_safety:0.86,/fsx-onellm/shared/finetuning/sft_v1.4_data/splits/train/autosafety:0.51,/fsx-onellm/shared/finetuning/sft_v1.4_data/splits/train/rainbow_safety:0.10,/fsx-onellm/shared/finetuning/sft_v1.4_data/splits/train/genai_safety:0.58", |
|
"iter_gopher": { |
|
"buffer_size": 16, |
|
"max_precompute": 10, |
|
"n_chars_by_tok": 15, |
|
"n_seqs_to_concat": 10, |
|
"num_processes": 1 |
|
}, |
|
"iter_jsonl": { |
|
"buffer_size": 64, |
|
"same_data": false |
|
}, |
|
"iter_multi": { |
|
"buffer_size": 512, |
|
"ignore_extra_chunks": true, |
|
"max_precompute": 20, |
|
"multiprocess": true |
|
}, |
|
"iter_type": "multi", |
|
"keep_checkpoints_every_steps": 400, |
|
"keep_eval_checkpoints": true, |
|
"keep_n_last_checkpoints": 2, |
|
"log_all_steps": false, |
|
"log_freq": 10, |
|
"log_updates": true, |
|
"log_wandb": false, |
|
"loss_rescaling": false, |
|
"model": { |
|
"add_extra_toks": "0", |
|
"alpha_depth": "disabled", |
|
"attn_dropout": 0, |
|
"attn_to_keep": "all", |
|
"custom_bwd": false, |
|
"dim": 8192, |
|
"dropout": 0.05, |
|
"efficient_attn": "flash", |
|
"emb_dropout": 0, |
|
"ffn_dim_multiplier": 1.0, |
|
"ffn_dropout": 0, |
|
"full_logging_n_layers": 4, |
|
"fuse_sequence_parallel": false, |
|
"init": { |
|
"coeff_std": null, |
|
"depth_last": false, |
|
"fixed_std": null, |
|
"no_init": false, |
|
"pos_init_scalar": null, |
|
"use_depth": "current", |
|
"use_gaussian": true |
|
}, |
|
"layer_ckpt": "0::2", |
|
"linear_residual_dropout": false, |
|
"loss_parallel": true, |
|
"max_length": 2048, |
|
"multiple_of": 256, |
|
"n_heads": 64, |
|
"n_kv_heads": 8, |
|
"n_layers": 48, |
|
"non_linearity": "swiglu", |
|
"norm_affine": true, |
|
"norm_eps": 1e-05, |
|
"norm_type": "rmsnorm", |
|
"output_dropout": 0, |
|
"output_size": -1, |
|
"pre_norm": true, |
|
"qk_normalization": true, |
|
"recompute_attn": true, |
|
"recompute_fc1_out": true, |
|
"recompute_fc3_out": true, |
|
"residual_dropout": 0.0, |
|
"rope_theta": 10000.0, |
|
"sequence_parallel": false, |
|
"swin_norm": true, |
|
"turn_eos_token": "<eos>", |
|
"use_rope": true, |
|
"vocab_size": 65536 |
|
}, |
|
"model_parallel_size": 4, |
|
"no_final_ckpt": false, |
|
"num_retrieved_docs": 0, |
|
"old_mp": -1, |
|
"old_world_size": -1, |
|
"optim": { |
|
"beta1": 0.9, |
|
"beta2": 0.95, |
|
"clip": 1.0, |
|
"cosine_theta": 1.0, |
|
"cycle_length": 1.0, |
|
"epsilon": 1e-08, |
|
"exp_factor": 0.5, |
|
"lr": 1e-05, |
|
"lr_min_ratio": 0.1, |
|
"scheduler": "cosine", |
|
"use_deprecated_optim": false, |
|
"warmup": 100, |
|
"weight_decay": 0.1 |
|
}, |
|
"periodic_gpu_check": true, |
|
"profile_freq": -1, |
|
"reshard_after_forward": true, |
|
"restore_dataloader_position": false, |
|
"retrieval_prob": 0.0, |
|
"rlhf": null, |
|
"root_dump_dir": "", |
|
"save_optimizer_states": true, |
|
"seq_len": 4096, |
|
"slurm": { |
|
"global_rank": 0, |
|
"is_slurm_job": true, |
|
"world_size": 128 |
|
}, |
|
"steps": 1200, |
|
"tokenizer": "/fsx-onellm/rpasunuru/models/cm3z/cm3v2_7b_placeholder/gpt2-unified-image-sentinel.json", |
|
"tokenizer_dir": "/fsx/guismay/data/large_experiments/fair_llm/datasets/tokenizers", |
|
"torch_seed": -1, |
|
"unlimited_steps": false, |
|
"use_hf_tokenizer": true, |
|
"valid": { |
|
"batch_size": 1, |
|
"debug": false, |
|
"majority_voting": 0, |
|
"n_batches": 100, |
|
"onellm_eval": false, |
|
"onellm_eval_media_storage": "", |
|
"ppl_files_str": "", |
|
"prompt_path": "", |
|
"prompt_templates": "{}", |
|
"random_fewshots": false, |
|
"seq_len": 4096, |
|
"tasks_root_dir": "", |
|
"tasks_str": "", |
|
"temperature": 1.0, |
|
"top_k": 0, |
|
"top_p": 0.0, |
|
"use_sampling": false, |
|
"write_eval": false |
|
}, |
|
"wandb_entity": "violet-zct", |
|
"wandb_project": "instruct_sft", |
|
"water_marking_codes_str": null, |
|
"z_loss_weight": 0.0001 |
|
} |