dev-hitesh-gupta committed on Sep 4

Commit

4eae4b0

•

1 Parent(s): 83739db

Upload folder using huggingface_hub

Browse files

Files changed (38) hide show

.gitattributes +1 -0
log_1725411796.txt +0 -0
log_1725411883.txt +0 -0
log_1725411949.txt +0 -0
log_1725412477.txt +0 -0
log_1725412528.txt +0 -0
log_1725412706.txt +0 -0
log_1725412729.txt +0 -0
log_1725412766.txt +0 -0
log_1725413132.txt +0 -0
log_1725413248.txt +0 -0
log_1725413416.txt +0 -0
log_1725413811.txt +0 -0
log_1725413837.txt +0 -0
log_1725414865.txt +0 -0
log_1725415318.txt +0 -0
log_1725415584.txt +0 -0
log_1725415618.txt +0 -0
log_1725415890.txt +0 -0
log_1725416792.txt +0 -0
log_1725416985.txt +0 -0
log_1725417415.txt +0 -0
log_1725417643.txt +0 -0
log_1725417865.txt +0 -0
log_1725418121.txt +0 -0
log_1725418385.txt +0 -0
log_1725418403.txt +0 -0
log_1725418471.txt +0 -0
log_1725418487.txt +0 -0
log_1725418585.txt +0 -0
log_1725418906.txt +0 -0
log_1725418928.txt +0 -0
log_1725418952.txt +0 -0
log_1725419330.txt +10 -0
profiling_outputs/iteration_12/rank0.1725411827523394495.pt.trace.json.gz +3 -0
profiling_outputs/iteration_12/rank0.1725419372986525138.pt.trace.json.gz +3 -0
profiling_outputs/iteration_12/rank0_key_averages.txt +157 -0
profiling_outputs/iteration_12/rank0_stacks.txt +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+profiling_outputs/iteration_12/rank0_stacks.txt filter=lfs diff=lfs merge=lfs -text

log_1725411796.txt ADDED Viewed

File without changes

log_1725411883.txt ADDED Viewed

File without changes

log_1725411949.txt ADDED Viewed

File without changes

log_1725412477.txt ADDED Viewed

File without changes

log_1725412528.txt ADDED Viewed

File without changes

log_1725412706.txt ADDED Viewed

File without changes

log_1725412729.txt ADDED Viewed

File without changes

log_1725412766.txt ADDED Viewed

File without changes

log_1725413132.txt ADDED Viewed

File without changes

log_1725413248.txt ADDED Viewed

File without changes

log_1725413416.txt ADDED Viewed

File without changes

log_1725413811.txt ADDED Viewed

File without changes

log_1725413837.txt ADDED Viewed

File without changes

log_1725414865.txt ADDED Viewed

File without changes

log_1725415318.txt ADDED Viewed

File without changes

log_1725415584.txt ADDED Viewed

File without changes

log_1725415618.txt ADDED Viewed

File without changes

log_1725415890.txt ADDED Viewed

File without changes

log_1725416792.txt ADDED Viewed

File without changes

log_1725416985.txt ADDED Viewed

File without changes

log_1725417415.txt ADDED Viewed

File without changes

log_1725417643.txt ADDED Viewed

File without changes

log_1725417865.txt ADDED Viewed

File without changes

log_1725418121.txt ADDED Viewed

File without changes

log_1725418385.txt ADDED Viewed

File without changes

log_1725418403.txt ADDED Viewed

File without changes

log_1725418471.txt ADDED Viewed

File without changes

log_1725418487.txt ADDED Viewed

File without changes

log_1725418585.txt ADDED Viewed

File without changes

log_1725418906.txt ADDED Viewed

File without changes

log_1725418928.txt ADDED Viewed

File without changes

log_1725418952.txt ADDED Viewed

File without changes

log_1725419330.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+Step 1 | loss:0.8410570025444031 lr:2.9999999999999997e-06 tokens_per_second_per_gpu:704.4238888314976
+Step 2 | loss:0.8031501173973083 lr:5.999999999999999e-06 tokens_per_second_per_gpu:1024.7700646671149
+Step 3 | loss:0.8186150789260864 lr:8.999999999999999e-06 tokens_per_second_per_gpu:1043.6909191161417
+Step 4 | loss:0.8145633339881897 lr:1.1999999999999999e-05 tokens_per_second_per_gpu:1019.2992522263147
+Step 5 | loss:0.7742070555686951 lr:1.4999999999999999e-05 tokens_per_second_per_gpu:1036.7547177946178
+Step 6 | loss:0.8019543886184692 lr:1.7999999999999997e-05 tokens_per_second_per_gpu:1038.9270556360439
+Step 7 | loss:0.8277010321617126 lr:2.1e-05 tokens_per_second_per_gpu:1036.3345335641322
+Step 8 | loss:0.7890640497207642 lr:2.3999999999999997e-05 tokens_per_second_per_gpu:1035.6141064563976
+Step 9 | loss:0.803084671497345 lr:2.6999999999999996e-05 tokens_per_second_per_gpu:1029.8257838448487
+Step 10 | loss:0.8352741003036499 lr:2.9999999999999997e-05 tokens_per_second_per_gpu:1037.948880337632

profiling_outputs/iteration_12/rank0.1725411827523394495.pt.trace.json.gz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b39ce2e6a8c41b095ccf0ba616e02546cf107212af1f2db0317745f542b4af47
+size 10297860

profiling_outputs/iteration_12/rank0.1725419372986525138.pt.trace.json.gz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dba010bb1a8b9c4dd17fdf7acd23de237845bf0098d51e3a53636571688deff8
+size 10243085

profiling_outputs/iteration_12/rank0_key_averages.txt ADDED Viewed

	@@ -0,0 +1,157 @@

+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  --------------------------------------------------------------------------------  ---------------------------------------------------------------------------  ------------
+                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls                                                                      Input Shapes  Source Location                                                               Total FLOPs
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  --------------------------------------------------------------------------------  ---------------------------------------------------------------------------  ------------
+                                          ProfilerStep*         0.00%       0.000us         0.00%       0.000us       0.000us     806.035ms        22.19%     806.035ms     403.018ms           0 b           0 b           0 b           0 b             2                                                                                []  --                                                                           <built-in method to of Tensor object at 0x7f79fa3fbf10>
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(622): train
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(643): recipe_main
+                                                                                                                                                                                                                                                                                                                                                                                                                            torchtune/config/_parse.py(50): wrapper
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(648): <module>
+ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_3...         0.00%       0.000us         0.00%       0.000us       0.000us     212.867ms         5.86%     212.867ms     485.997us           0 b           0 b           0 b           0 b           438                                                                                []  --                                                                           ...in method run_backward of torch._C._EngineBase object at 0x7f7ab8266140>
+                                                                                                                                                                                                                                                                                                                                                                                                                            torch/autograd/graph.py(763): _engine_run_backward
+                                                                                                                                                                                                                                                                                                                                                                                                                            torch/_tensor.py(465): backward
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(622): train
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(643): recipe_main
+void cutlass::Kernel<cutlass_80_tensorop_bf16_s16816...         0.00%       0.000us         0.00%       0.000us       0.000us     190.122ms         5.23%     190.122ms       1.980ms           0 b           0 b           0 b           0 b            96                                                                                []  --                                                                           ...in method run_backward of torch._C._EngineBase object at 0x7f7ab8266140>
+                                                                                                                                                                                                                                                                                                                                                                                                                            torch/autograd/graph.py(763): _engine_run_backward
+                                                                                                                                                                                                                                                                                                                                                                                                                            torch/_tensor.py(465): backward
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(622): train
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(643): recipe_main
+ampere_bf16_s16816gemm_bf16_256x128_ldg8_f2f_stages_...         0.00%       0.000us         0.00%       0.000us       0.000us     100.057ms         2.75%     100.057ms       2.943ms           0 b           0 b           0 b           0 b            34                                                                                []  --                                                                           ...in method run_backward of torch._C._EngineBase object at 0x7f7ab8266140>
+                                                                                                                                                                                                                                                                                                                                                                                                                            torch/autograd/graph.py(763): _engine_run_backward
+                                                                                                                                                                                                                                                                                                                                                                                                                            torch/_tensor.py(465): backward
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(622): train
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(643): recipe_main
+void at::native::(anonymous namespace)::cunn_Spatial...         0.00%       0.000us         0.00%       0.000us       0.000us      70.645ms         1.94%      70.645ms      35.322ms           0 b           0 b           0 b           0 b             2                                                                                []  --                                                                           <built-in function cross_entropy_loss>
+                                                                                                                                                                                                                                                                                                                                                                                                                            torch/nn/functional.py(3014): cross_entropy
+                                                                                                                                                                                                                                                                                                                                                                                                                            torch/nn/modules/loss.py(1187): forward
+                                                                                                                                                                                                                                                                                                                                                                                                                            torch/nn/modules/module.py(1555): _call_impl
+                                                                                                                                                                                                                                                                                                                                                                                                                            nn.Module: CrossEntropyLoss_0
+ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_3...         0.00%       0.000us         0.00%       0.000us       0.000us      66.469ms         1.83%      66.469ms       2.770ms           0 b           0 b           0 b           0 b            24                                                                                []  --                                                                           <built-in method to of Tensor object at 0x7f79fa3fbf10>
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(622): train
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(643): recipe_main
+                                                                                                                                                                                                                                                                                                                                                                                                                            torchtune/config/_parse.py(50): wrapper
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(648): <module>
+void cutlass::Kernel<cutlass_80_tensorop_bf16_s16816...         0.00%       0.000us         0.00%       0.000us       0.000us      64.458ms         1.77%      64.458ms       5.372ms           0 b           0 b           0 b           0 b            12                                                                                []  --                                                                           <built-in method to of Tensor object at 0x7f79fa3fbf10>
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(622): train
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(643): recipe_main
+                                                                                                                                                                                                                                                                                                                                                                                                                            torchtune/config/_parse.py(50): wrapper
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(648): <module>
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      63.230ms         1.74%      63.230ms     124.960us           0 b           0 b           0 b           0 b           506                                                                                []  --                                                                           ...in method run_backward of torch._C._EngineBase object at 0x7f7ab8266140>
+                                                                                                                                                                                                                                                                                                                                                                                                                            torch/autograd/graph.py(763): _engine_run_backward
+                                                                                                                                                                                                                                                                                                                                                                                                                            torch/_tensor.py(465): backward
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(622): train
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(643): recipe_main
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      59.520ms         1.64%      59.520ms     117.166us           0 b           0 b           0 b           0 b           508                                                                                []  --                                                                           ...in method run_backward of torch._C._EngineBase object at 0x7f7ab8266140>
+                                                                                                                                                                                                                                                                                                                                                                                                                            torch/autograd/graph.py(763): _engine_run_backward
+                                                                                                                                                                                                                                                                                                                                                                                                                            torch/_tensor.py(465): backward
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(622): train
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(643): recipe_main
+ampere_bf16_s16816gemm_bf16_256x128_ldg8_f2f_stages_...         0.00%       0.000us         0.00%       0.000us       0.000us      52.740ms         1.45%      52.740ms       2.198ms           0 b           0 b           0 b           0 b            24                                                                                []  --                                                                           <built-in method to of Tensor object at 0x7f79fa3fbf10>
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(622): train
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(643): recipe_main
+                                                                                                                                                                                                                                                                                                                                                                                                                            torchtune/config/_parse.py(50): wrapper
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(648): <module>
+ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_3...         0.00%       0.000us         0.00%       0.000us       0.000us      47.180ms         1.30%      47.180ms       1.966ms           0 b           0 b           0 b           0 b            24                                                                                []  --                                                                           <built-in method to of Tensor object at 0x7f79fa3fbf10>
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(622): train
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(643): recipe_main
+                                                                                                                                                                                                                                                                                                                                                                                                                            torchtune/config/_parse.py(50): wrapper
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(648): <module>
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      41.515ms         1.14%      41.515ms      65.688us           0 b           0 b           0 b           0 b           632                                                                                []  --                                                                           ...in method run_backward of torch._C._EngineBase object at 0x7f7ab8266140>
+                                                                                                                                                                                                                                                                                                                                                                                                                            torch/autograd/graph.py(763): _engine_run_backward
+                                                                                                                                                                                                                                                                                                                                                                                                                            torch/_tensor.py(465): backward
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(622): train
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(643): recipe_main
+                                     aten::_log_softmax         0.00%      23.940us         0.00%     144.092us     144.092us      35.624ms         0.98%      37.220ms      37.220ms           0 b           0 b     222.77 Mb           0 b             1                                                         [[2, 32016, 912], [], []]  --                                                                           <built-in function cross_entropy_loss>
+                                                                                                                                                                                                                                                                                                                                                                                                                            torch/nn/functional.py(3014): cross_entropy
+                                                                                                                                                                                                                                                                                                                                                                                                                            torch/nn/modules/loss.py(1187): forward
+                                                                                                                                                                                                                                                                                                                                                                                                                            torch/nn/modules/module.py(1555): _call_impl
+                                                                                                                                                                                                                                                                                                                                                                                                                            nn.Module: CrossEntropyLoss_0
+                                             aten::add_         0.03%       1.329ms         2.03%      90.196ms     477.229us      35.471ms         0.98%      35.471ms     187.675us           0 b           0 b           0 b           0 b           189                                    [[2, 913, 32, 64, 2], [2, 913, 32, 64, 2], []]  --                                                                           ...in method run_backward of torch._C._EngineBase object at 0x7f7ab8266140>
+                                                                                                                                                                                                                                                                                                                                                                                                                            torch/autograd/graph.py(763): _engine_run_backward
+                                                                                                                                                                                                                                                                                                                                                                                                                            torch/_tensor.py(465): backward
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(622): train
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(643): recipe_main
+                                     aten::_log_softmax         0.00%      30.790us         0.00%      70.611us      70.611us      35.020ms         0.96%      35.411ms      35.411ms           0 b           0 b      71.32 Mb           0 b             1                                                         [[2, 32016, 292], [], []]  --                                                                           <built-in function cross_entropy_loss>
+                                                                                                                                                                                                                                                                                                                                                                                                                            torch/nn/functional.py(3014): cross_entropy
+                                                                                                                                                                                                                                                                                                                                                                                                                            torch/nn/modules/loss.py(1187): forward
+                                                                                                                                                                                                                                                                                                                                                                                                                            torch/nn/modules/module.py(1555): _call_impl
+                                                                                                                                                                                                                                                                                                                                                                                                                            nn.Module: CrossEntropyLoss_0
+void at::native::(anonymous namespace)::cunn_Spatial...         0.00%       0.000us         0.00%       0.000us       0.000us      32.035ms         0.88%      32.035ms      16.018ms           0 b           0 b           0 b           0 b             2                                                                                []  --                                                                           ...in method run_backward of torch._C._EngineBase object at 0x7f7ab8266140>
+                                                                                                                                                                                                                                                                                                                                                                                                                            torch/autograd/graph.py(763): _engine_run_backward
+                                                                                                                                                                                                                                                                                                                                                                                                                            torch/_tensor.py(465): backward
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(622): train
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(643): recipe_main
+void cutlass::Kernel<cutlass_80_tensorop_bf16_s16816...         0.00%       0.000us         0.00%       0.000us       0.000us      31.991ms         0.88%      31.991ms       5.332ms           0 b           0 b           0 b           0 b             6                                                                                []  --                                                                           <built-in method to of Tensor object at 0x7f79fa3fbf10>
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(622): train
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(643): recipe_main
+                                                                                                                                                                                                                                                                                                                                                                                                                            torchtune/config/_parse.py(50): wrapper
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(648): <module>
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      31.808ms         0.88%      31.808ms      82.833us           0 b           0 b           0 b           0 b           384                                                                                []  --                                                                           ...in method run_backward of torch._C._EngineBase object at 0x7f7ab8266140>
+                                                                                                                                                                                                                                                                                                                                                                                                                            torch/autograd/graph.py(763): _engine_run_backward
+                                                                                                                                                                                                                                                                                                                                                                                                                            torch/_tensor.py(465): backward
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(622): train
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(643): recipe_main
+                                             aten::add_         0.04%       1.868ms         2.02%      89.872ms     358.056us      29.127ms         0.80%      29.127ms     116.044us           0 b           0 b           0 b           0 b           251                                              [[2, 913, 4096], [2, 913, 4096], []]  --                                                                           ...in method run_backward of torch._C._EngineBase object at 0x7f7ab8266140>
+                                                                                                                                                                                                                                                                                                                                                                                                                            torch/autograd/graph.py(763): _engine_run_backward
+                                                                                                                                                                                                                                                                                                                                                                                                                            torch/_tensor.py(465): backward
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(622): train
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(643): recipe_main
+void pytorch_flash::flash_bwd_dq_dk_dv_loop_seqk_par...         0.00%       0.000us         0.00%       0.000us       0.000us      27.914ms         0.77%      27.914ms     436.163us           0 b           0 b           0 b           0 b            64                                                                                []  --                                                                           ...in method run_backward of torch._C._EngineBase object at 0x7f7ab8266140>
+                                                                                                                                                                                                                                                                                                                                                                                                                            torch/autograd/graph.py(763): _engine_run_backward
+                                                                                                                                                                                                                                                                                                                                                                                                                            torch/_tensor.py(465): backward
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(622): train
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(643): recipe_main
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      27.004ms         0.74%      27.004ms     355.322us           0 b           0 b           0 b           0 b            76                                                                                []  --                                                                           <built-in method to of Tensor object at 0x7f79fa3fbf10>
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(622): train
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(643): recipe_main
+                                                                                                                                                                                                                                                                                                                                                                                                                            torchtune/config/_parse.py(50): wrapper
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(648): <module>
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      26.862ms         0.74%      26.862ms      38.706us           0 b           0 b           0 b           0 b           694                                                                                []  --                                                                           ...in method run_backward of torch._C._EngineBase object at 0x7f7ab8266140>
+                                                                                                                                                                                                                                                                                                                                                                                                                            torch/autograd/graph.py(763): _engine_run_backward
+                                                                                                                                                                                                                                                                                                                                                                                                                            torch/_tensor.py(465): backward
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(622): train
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(643): recipe_main
+void at::native::elementwise_kernel<128, 2, at::nati...         0.00%       0.000us         0.00%       0.000us       0.000us      25.891ms         0.71%      25.891ms     208.798us           0 b           0 b           0 b           0 b           124                                                                                []  --                                                                           <built-in method to of Tensor object at 0x7f79fa3fbf10>
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(622): train
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(643): recipe_main
+                                                                                                                                                                                                                                                                                                                                                                                                                            torchtune/config/_parse.py(50): wrapper
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(648): <module>
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      25.065ms         0.69%      25.065ms     334.207us           0 b           0 b           0 b           0 b            75                                                                                []  --                                                                           <built-in method to of Tensor object at 0x7f79fa3fbf10>
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(622): train
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(643): recipe_main
+                                                                                                                                                                                                                                                                                                                                                                                                                            torchtune/config/_parse.py(50): wrapper
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(648): <module>
+void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us      22.140ms         0.61%      22.140ms      43.412us           0 b           0 b           0 b           0 b           510                                                                                []  --                                                                           ...in method run_backward of torch._C._EngineBase object at 0x7f7ab8266140>
+                                                                                                                                                                                                                                                                                                                                                                                                                            torch/autograd/graph.py(763): _engine_run_backward
+                                                                                                                                                                                                                                                                                                                                                                                                                            torch/_tensor.py(465): backward
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(622): train
+                                                                                                                                                                                                                                                                                                                                                                                                                            recipes/lora_finetune_single_device.py(643): recipe_main
+-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  --------------------------------------------------------------------------------  ---------------------------------------------------------------------------  ------------
+Self CPU time total: 4.453s
+Self CUDA time total: 3.632s

profiling_outputs/iteration_12/rank0_stacks.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b4489b7f30fc000a3a5f9a1dcfe8879c00b547f631645074dbaf194fb5f18913
+size 28216209