# Spaces: Running on Zero  (page banner text captured together with this script; not shell code)
# Sampling settings and dataset locations for this run.
# Trace every command as it is executed.
set -x

lpips_lambda=0.8
image_size=128 # final rendered resolution
image_size_encoder=256
patch_size=14

# ! 29GB -> 37GB
batch_size=4 # BS=256 is enough
microbatch=${batch_size}
# Integer division, rounded down: 50 / 4 = 12 with the batch size above.
num_samples=$((50 / batch_size)) # follow ssdnerf and functa
cfg_dropout_prob=0.1 # SD config
unconditional_guidance_scale=6.5
num_workers=0

# "NONE" is passed through verbatim as the value of --eval_data_dir / --data_dir.
eval_data_dir="NONE"
shards_lst=/cpfs01/user/lanyushi.p/Repo/diffusion-3d/shell_scripts/baselines/reconstruction/sr/final_mv/diff_shards_lst_ani.txt
eval_shards_lst="/cpfs01/user/lanyushi.p/Repo/diffusion-3d/shell_scripts/baselines/reconstruction/sr/final_mv/shards_animals_lst.txt"
data_dir="NONE"

# Dataset arguments, stored as a single whitespace-separated string. The launch
# command expands it unquoted so the shell splits it back into separate words;
# the backslash-newline pairs inside the quotes are line continuations.
DATASET_FLAGS="
--data_dir ${data_dir} \
--eval_shards_lst ${eval_shards_lst} \
--shards_lst ${shards_lst} \
"
# Learning rates, loss weights and the text prompt for this run.
lr=2e-5 # for official DiT, lr=1e-4 for BS=256
kl_lambda=0
vit_lr=1e-5 # for improved-diffusion unet
# Used for both --ce_lambda and --negative_entropy_lambda in the launch command.
ce_lambda=0.5 # ?
conv_lr=5e-5
alpha_lambda=1
scale_clip_encoding=1
triplane_scaling_divider=0.88

# prompt="A blue plastic chair."
prompt="A sailboat with mast."

# * above the best lr config
# Optimizer / loss-weight arguments as one whitespace-separated string, meant to
# be expanded unquoted by the launch command. vit_lr feeds both the encoder and
# the ViT decoder; conv_lr feeds both the triplane decoder and super-resolution.
# NOTE(review): the spelling `--bg_lamdba` is kept untouched; presumably it
# matches the argument name declared by the Python entry point — confirm.
LR_FLAGS="--encoder_lr $vit_lr \
--vit_decoder_lr $vit_lr \
--lpips_lambda $lpips_lambda \
--triplane_decoder_lr $conv_lr \
--super_resolution_lr $conv_lr \
--lr $lr \
--kl_lambda ${kl_lambda} \
--bg_lamdba 0.01 \
--alpha_lambda ${alpha_lambda} \
"
# Model / run arguments as one whitespace-separated string, meant to be expanded
# unquoted by the launch command. The launch command also passes --iterations
# and --save_interval after this string.
# NOTE(review): presumably the later occurrence wins in the Python argument
# parser — confirm.
# --resume_checkpoint points at an absolute path on the original author's
# machine; adjust it before running elsewhere.
TRAIN_FLAGS="--iterations 10001 --anneal_lr False \
--batch_size $batch_size --save_interval 10000 \
--microbatch ${microbatch} \
--image_size_encoder $image_size_encoder \
--image_size $image_size \
--dino_version mv-sd-dit \
--sr_training False \
--encoder_cls_token False \
--decoder_cls_token False \
--cls_token False \
--weight_decay 0.05 \
--no_dim_up_mlp True \
--uvit_skip_encoder True \
--decoder_load_pretrained False \
--fg_mse False \
--vae_p 2 \
--plucker_embedding True \
--encoder_in_channels 9 \
--arch_dit_decoder DiT2-B/2 \
--sd_E_ch 64 \
--sd_E_num_res_blocks 1 \
--lrm_decoder False \
--resume_checkpoint /home/yslan/Repo/open-source/data/model_joint_denoise_rec_model2310000.pt \
"
# Denoiser architecture arguments as one whitespace-separated string, meant to
# be expanded unquoted by the launch command.
# The value 4,2,1 is written without inner double quotes: nested quotes would
# close and reopen the surrounding string, which only produced the same text by
# accident.
DDPM_MODEL_FLAGS="
--learn_sigma False \
--num_heads 8 \
--num_res_blocks 2 \
--num_channels 320 \
--attention_resolutions 4,2,1 \
--use_spatial_transformer True \
--transformer_depth 1 \
--context_dim 768 \
"
# --pred_type x0 \
# --iw_sample_p drop_all_uniform \
# --loss_type x0 \
# ! diffusion steps and noise schedule not used, since the continuous diffusion is adopted.
# Diffusion / trainer arguments as one whitespace-separated string, meant to be
# expanded unquoted by the launch command. `--train_vae False` used to be listed
# twice with the same value; it now appears once.
DIFFUSION_FLAGS="--diffusion_steps 1000 --noise_schedule linear \
--use_kl False \
--use_amp False \
--triplane_scaling_divider ${triplane_scaling_divider} \
--trainer_name vpsde_crossattn_objv \
--mixed_prediction False \
--train_vae False \
--denoise_in_channels 4 \
--denoise_out_channels 4 \
--diffusion_input_size 32 \
--diffusion_ce_anneal True \
--create_controlnet False \
--p_rendering_loss False \
--pred_type v \
--predict_v True \
--create_dit False \
--use_eos_feature False \
--roll_out True \
"
# Sampler arguments as one whitespace-separated string, meant to be expanded
# unquoted by the launch command. The guidance scale is taken from
# unconditional_guidance_scale, which must be set before this assignment runs.
DDIM_FLAGS="
--timestep_respacing ddim250 \
--use_ddim True \
--unconditional_guidance_scale ${unconditional_guidance_scale} \
"
# Output directory for this run; the guidance scale is embedded in the path.
# The same directory is passed via --logdir both inside the flag string below
# and directly in the launch command.
logdir=./logs/LSGM/inference/t23d/Objaverse/cfg=${unconditional_guidance_scale}/fixing-DDIM/231w/mast3

# Autoencoder arguments as one whitespace-separated string, meant to be
# expanded unquoted by the launch command.
SR_TRAIN_FLAGS_v1_2XC="
--decoder_in_chans 32 \
--out_chans 96 \
--ae_classname vit.vit_triplane.RodinSR_256_fusionv6_ConvQuant_liteSR_dinoInit3DAttn_SD_B_3L_C_withrollout_withSD_D_ditDecoder \
--logdir $logdir \
--arch_encoder vits \
--arch_decoder vitb \
--vit_decoder_wd 0.001 \
--encoder_weight_decay 0.001 \
--color_criterion mse \
--triplane_in_chans 32 \
--decoder_output_dim 3 \
"
# --resume_checkpoint /mnt/lustre/yslan/logs/nips23/LSGM/ssd/chair/scaling/entropy/kl0_ema0.9999_vpsde_TrainLoop3DDiffusionLSGM_cvD_scaling_lsgm_unfreezeD_weightingv0_lsgm_unfreezeD_0.01_gradclip_nocesquare_clipH@0_noallAMP_dataset500/model_joint_denoise_rec_model0910000.pt \

# Select the flag set that the launch command actually uses.
SR_TRAIN_FLAGS=${SR_TRAIN_FLAGS_v1_2XC}
# Number of worker processes handed to torchrun via --nproc_per_node.
NUM_GPUS=1

# Prepare the log directory: drop the previous "runs" subdirectory, make sure
# the directory exists, and keep a copy of this script next to the outputs.
# ${logdir:?} aborts the script when logdir is unset or empty, so the recursive
# delete can never resolve to "/runs".
rm -rf -- "${logdir:?logdir must be set before preparing the log directory}"/runs
mkdir -p -- "$logdir"/
cp -- "$0" "$logdir"/

# Environment exported to the launched processes.
export OMP_NUM_THREADS=12
export NCCL_ASYNC_ERROR_HANDLING=1
export OPENCV_IO_ENABLE_OPENEXR=1
export NCCL_IB_GID_INDEX=3 # https://github.com/huggingface/accelerate/issues/314#issuecomment-1821973930

# Only the uncommented CUDA_VISIBLE_DEVICES line takes effect; the commented
# lines are alternative device selections left by the author.
# export CUDA_VISIBLE_DEVICES=0,1,2
# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5
# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5
# export CUDA_VISIBLE_DEVICES=7
# export CUDA_VISIBLE_DEVICES=3,7
# export CUDA_VISIBLE_DEVICES=3,4,5
# export CUDA_VISIBLE_DEVICES=0,1,2,3
export CUDA_VISIBLE_DEVICES=0
# export CUDA_VISIBLE_DEVICES=4,5,6
# export CUDA_VISIBLE_DEVICES=6,7
# export CUDA_VISIBLE_DEVICES=7
# Launch the sampling script through torchrun on a single node with NUM_GPUS
# worker processes.
# The *_FLAGS variables hold whitespace-separated argument strings, so they are
# expanded unquoted on purpose: the shell must split them into separate words.
# Scalar values are quoted; "$prompt" in particular contains spaces.
# --iterations and --save_interval also appear inside TRAIN_FLAGS, and --logdir
# inside SR_TRAIN_FLAGS.
# NOTE(review): presumably the later occurrence wins in the Python argument
# parser — confirm.
# The last line keeps its trailing backslash exactly as in the original, so any
# arguments that follow this chunk still belong to the same command.
# shellcheck disable=SC2086
torchrun --nproc_per_node="$NUM_GPUS" \
  --nnodes 1 \
  --rdzv-endpoint=localhost:24369 \
  scripts/vit_triplane_diffusion_sample_objaverse.py \
  --num_workers "${num_workers}" \
  --eval_data_dir "$eval_data_dir" \
  --depth_lambda 0 \
  ${TRAIN_FLAGS} \
  ${SR_TRAIN_FLAGS} \
  ${DATASET_FLAGS} \
  ${DIFFUSION_FLAGS} \
  ${DDPM_MODEL_FLAGS} \
  ${DDIM_FLAGS} \
  --overfitting False \
  --load_pretrain_encoder False \
  --iterations 5000001 \
  --save_interval 10000 \
  --eval_interval 5000 \
  --decomposed True \
  --logdir "$logdir" \
  --cfg objverse_tuneray_aug_resolution_64_64_auto \
  --patch_size "${patch_size}" \
  --eval_batch_size 1 \
  ${LR_FLAGS} \
  --ce_lambda "${ce_lambda}" \
  --negative_entropy_lambda "${ce_lambda}" \
  --triplane_fg_bg False \
  --grad_clip True \
  --interval 5 \
  --normalize_clip_encoding True \
  --scale_clip_encoding "${scale_clip_encoding}" \
  --objv_dataset True \
  --cfg_dropout_prob "${cfg_dropout_prob}" \
  --cond_key caption \
  --enable_mixing_normal False \
  --use_lmdb_compressed False \
  --use_lmdb False \
  --load_wds_diff True \
  --mv_input True \
  --compile False \
  --prompt "$prompt" \
  --num_samples "${num_samples}" \
  --use_wds False \