dev-hitesh-gupta commited on
Commit
4eae4b0
1 Parent(s): 83739db

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ profiling_outputs/iteration_12/rank0_stacks.txt filter=lfs diff=lfs merge=lfs -text
log_1725411796.txt ADDED
File without changes
log_1725411883.txt ADDED
File without changes
log_1725411949.txt ADDED
File without changes
log_1725412477.txt ADDED
File without changes
log_1725412528.txt ADDED
File without changes
log_1725412706.txt ADDED
File without changes
log_1725412729.txt ADDED
File without changes
log_1725412766.txt ADDED
File without changes
log_1725413132.txt ADDED
File without changes
log_1725413248.txt ADDED
File without changes
log_1725413416.txt ADDED
File without changes
log_1725413811.txt ADDED
File without changes
log_1725413837.txt ADDED
File without changes
log_1725414865.txt ADDED
File without changes
log_1725415318.txt ADDED
File without changes
log_1725415584.txt ADDED
File without changes
log_1725415618.txt ADDED
File without changes
log_1725415890.txt ADDED
File without changes
log_1725416792.txt ADDED
File without changes
log_1725416985.txt ADDED
File without changes
log_1725417415.txt ADDED
File without changes
log_1725417643.txt ADDED
File without changes
log_1725417865.txt ADDED
File without changes
log_1725418121.txt ADDED
File without changes
log_1725418385.txt ADDED
File without changes
log_1725418403.txt ADDED
File without changes
log_1725418471.txt ADDED
File without changes
log_1725418487.txt ADDED
File without changes
log_1725418585.txt ADDED
File without changes
log_1725418906.txt ADDED
File without changes
log_1725418928.txt ADDED
File without changes
log_1725418952.txt ADDED
File without changes
log_1725419330.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ Step 1 | loss:0.8410570025444031 lr:2.9999999999999997e-06 tokens_per_second_per_gpu:704.4238888314976
2
+ Step 2 | loss:0.8031501173973083 lr:5.999999999999999e-06 tokens_per_second_per_gpu:1024.7700646671149
3
+ Step 3 | loss:0.8186150789260864 lr:8.999999999999999e-06 tokens_per_second_per_gpu:1043.6909191161417
4
+ Step 4 | loss:0.8145633339881897 lr:1.1999999999999999e-05 tokens_per_second_per_gpu:1019.2992522263147
5
+ Step 5 | loss:0.7742070555686951 lr:1.4999999999999999e-05 tokens_per_second_per_gpu:1036.7547177946178
6
+ Step 6 | loss:0.8019543886184692 lr:1.7999999999999997e-05 tokens_per_second_per_gpu:1038.9270556360439
7
+ Step 7 | loss:0.8277010321617126 lr:2.1e-05 tokens_per_second_per_gpu:1036.3345335641322
8
+ Step 8 | loss:0.7890640497207642 lr:2.3999999999999997e-05 tokens_per_second_per_gpu:1035.6141064563976
9
+ Step 9 | loss:0.803084671497345 lr:2.6999999999999996e-05 tokens_per_second_per_gpu:1029.8257838448487
10
+ Step 10 | loss:0.8352741003036499 lr:2.9999999999999997e-05 tokens_per_second_per_gpu:1037.948880337632
profiling_outputs/iteration_12/rank0.1725411827523394495.pt.trace.json.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b39ce2e6a8c41b095ccf0ba616e02546cf107212af1f2db0317745f542b4af47
3
+ size 10297860
profiling_outputs/iteration_12/rank0.1725419372986525138.pt.trace.json.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dba010bb1a8b9c4dd17fdf7acd23de237845bf0098d51e3a53636571688deff8
3
+ size 10243085
profiling_outputs/iteration_12/rank0_key_averages.txt ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -------------------------------------------------------------------------------- --------------------------------------------------------------------------- ------------
2
+ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg CPU Mem Self CPU Mem CUDA Mem Self CUDA Mem # of Calls Input Shapes Source Location Total FLOPs
3
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -------------------------------------------------------------------------------- --------------------------------------------------------------------------- ------------
4
+ ProfilerStep* 0.00% 0.000us 0.00% 0.000us 0.000us 806.035ms 22.19% 806.035ms 403.018ms 0 b 0 b 0 b 0 b 2 [] -- <built-in method to of Tensor object at 0x7f79fa3fbf10>
5
+ recipes/lora_finetune_single_device.py(622): train
6
+ recipes/lora_finetune_single_device.py(643): recipe_main
7
+ torchtune/config/_parse.py(50): wrapper
8
+ recipes/lora_finetune_single_device.py(648): <module>
9
+
10
+ ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_3... 0.00% 0.000us 0.00% 0.000us 0.000us 212.867ms 5.86% 212.867ms 485.997us 0 b 0 b 0 b 0 b 438 [] -- ...in method run_backward of torch._C._EngineBase object at 0x7f7ab8266140>
11
+ torch/autograd/graph.py(763): _engine_run_backward
12
+ torch/_tensor.py(465): backward
13
+ recipes/lora_finetune_single_device.py(622): train
14
+ recipes/lora_finetune_single_device.py(643): recipe_main
15
+
16
+ void cutlass::Kernel<cutlass_80_tensorop_bf16_s16816... 0.00% 0.000us 0.00% 0.000us 0.000us 190.122ms 5.23% 190.122ms 1.980ms 0 b 0 b 0 b 0 b 96 [] -- ...in method run_backward of torch._C._EngineBase object at 0x7f7ab8266140>
17
+ torch/autograd/graph.py(763): _engine_run_backward
18
+ torch/_tensor.py(465): backward
19
+ recipes/lora_finetune_single_device.py(622): train
20
+ recipes/lora_finetune_single_device.py(643): recipe_main
21
+
22
+ ampere_bf16_s16816gemm_bf16_256x128_ldg8_f2f_stages_... 0.00% 0.000us 0.00% 0.000us 0.000us 100.057ms 2.75% 100.057ms 2.943ms 0 b 0 b 0 b 0 b 34 [] -- ...in method run_backward of torch._C._EngineBase object at 0x7f7ab8266140>
23
+ torch/autograd/graph.py(763): _engine_run_backward
24
+ torch/_tensor.py(465): backward
25
+ recipes/lora_finetune_single_device.py(622): train
26
+ recipes/lora_finetune_single_device.py(643): recipe_main
27
+
28
+ void at::native::(anonymous namespace)::cunn_Spatial... 0.00% 0.000us 0.00% 0.000us 0.000us 70.645ms 1.94% 70.645ms 35.322ms 0 b 0 b 0 b 0 b 2 [] -- <built-in function cross_entropy_loss>
29
+ torch/nn/functional.py(3014): cross_entropy
30
+ torch/nn/modules/loss.py(1187): forward
31
+ torch/nn/modules/module.py(1555): _call_impl
32
+ nn.Module: CrossEntropyLoss_0
33
+
34
+ ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_3... 0.00% 0.000us 0.00% 0.000us 0.000us 66.469ms 1.83% 66.469ms 2.770ms 0 b 0 b 0 b 0 b 24 [] -- <built-in method to of Tensor object at 0x7f79fa3fbf10>
35
+ recipes/lora_finetune_single_device.py(622): train
36
+ recipes/lora_finetune_single_device.py(643): recipe_main
37
+ torchtune/config/_parse.py(50): wrapper
38
+ recipes/lora_finetune_single_device.py(648): <module>
39
+
40
+ void cutlass::Kernel<cutlass_80_tensorop_bf16_s16816... 0.00% 0.000us 0.00% 0.000us 0.000us 64.458ms 1.77% 64.458ms 5.372ms 0 b 0 b 0 b 0 b 12 [] -- <built-in method to of Tensor object at 0x7f79fa3fbf10>
41
+ recipes/lora_finetune_single_device.py(622): train
42
+ recipes/lora_finetune_single_device.py(643): recipe_main
43
+ torchtune/config/_parse.py(50): wrapper
44
+ recipes/lora_finetune_single_device.py(648): <module>
45
+
46
+ void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 63.230ms 1.74% 63.230ms 124.960us 0 b 0 b 0 b 0 b 506 [] -- ...in method run_backward of torch._C._EngineBase object at 0x7f7ab8266140>
47
+ torch/autograd/graph.py(763): _engine_run_backward
48
+ torch/_tensor.py(465): backward
49
+ recipes/lora_finetune_single_device.py(622): train
50
+ recipes/lora_finetune_single_device.py(643): recipe_main
51
+
52
+ void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 59.520ms 1.64% 59.520ms 117.166us 0 b 0 b 0 b 0 b 508 [] -- ...in method run_backward of torch._C._EngineBase object at 0x7f7ab8266140>
53
+ torch/autograd/graph.py(763): _engine_run_backward
54
+ torch/_tensor.py(465): backward
55
+ recipes/lora_finetune_single_device.py(622): train
56
+ recipes/lora_finetune_single_device.py(643): recipe_main
57
+
58
+ ampere_bf16_s16816gemm_bf16_256x128_ldg8_f2f_stages_... 0.00% 0.000us 0.00% 0.000us 0.000us 52.740ms 1.45% 52.740ms 2.198ms 0 b 0 b 0 b 0 b 24 [] -- <built-in method to of Tensor object at 0x7f79fa3fbf10>
59
+ recipes/lora_finetune_single_device.py(622): train
60
+ recipes/lora_finetune_single_device.py(643): recipe_main
61
+ torchtune/config/_parse.py(50): wrapper
62
+ recipes/lora_finetune_single_device.py(648): <module>
63
+
64
+ ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_3... 0.00% 0.000us 0.00% 0.000us 0.000us 47.180ms 1.30% 47.180ms 1.966ms 0 b 0 b 0 b 0 b 24 [] -- <built-in method to of Tensor object at 0x7f79fa3fbf10>
65
+ recipes/lora_finetune_single_device.py(622): train
66
+ recipes/lora_finetune_single_device.py(643): recipe_main
67
+ torchtune/config/_parse.py(50): wrapper
68
+ recipes/lora_finetune_single_device.py(648): <module>
69
+
70
+ void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 41.515ms 1.14% 41.515ms 65.688us 0 b 0 b 0 b 0 b 632 [] -- ...in method run_backward of torch._C._EngineBase object at 0x7f7ab8266140>
71
+ torch/autograd/graph.py(763): _engine_run_backward
72
+ torch/_tensor.py(465): backward
73
+ recipes/lora_finetune_single_device.py(622): train
74
+ recipes/lora_finetune_single_device.py(643): recipe_main
75
+
76
+ aten::_log_softmax 0.00% 23.940us 0.00% 144.092us 144.092us 35.624ms 0.98% 37.220ms 37.220ms 0 b 0 b 222.77 Mb 0 b 1 [[2, 32016, 912], [], []] -- <built-in function cross_entropy_loss>
77
+ torch/nn/functional.py(3014): cross_entropy
78
+ torch/nn/modules/loss.py(1187): forward
79
+ torch/nn/modules/module.py(1555): _call_impl
80
+ nn.Module: CrossEntropyLoss_0
81
+
82
+ aten::add_ 0.03% 1.329ms 2.03% 90.196ms 477.229us 35.471ms 0.98% 35.471ms 187.675us 0 b 0 b 0 b 0 b 189 [[2, 913, 32, 64, 2], [2, 913, 32, 64, 2], []] -- ...in method run_backward of torch._C._EngineBase object at 0x7f7ab8266140>
83
+ torch/autograd/graph.py(763): _engine_run_backward
84
+ torch/_tensor.py(465): backward
85
+ recipes/lora_finetune_single_device.py(622): train
86
+ recipes/lora_finetune_single_device.py(643): recipe_main
87
+
88
+ aten::_log_softmax 0.00% 30.790us 0.00% 70.611us 70.611us 35.020ms 0.96% 35.411ms 35.411ms 0 b 0 b 71.32 Mb 0 b 1 [[2, 32016, 292], [], []] -- <built-in function cross_entropy_loss>
89
+ torch/nn/functional.py(3014): cross_entropy
90
+ torch/nn/modules/loss.py(1187): forward
91
+ torch/nn/modules/module.py(1555): _call_impl
92
+ nn.Module: CrossEntropyLoss_0
93
+
94
+ void at::native::(anonymous namespace)::cunn_Spatial... 0.00% 0.000us 0.00% 0.000us 0.000us 32.035ms 0.88% 32.035ms 16.018ms 0 b 0 b 0 b 0 b 2 [] -- ...in method run_backward of torch._C._EngineBase object at 0x7f7ab8266140>
95
+ torch/autograd/graph.py(763): _engine_run_backward
96
+ torch/_tensor.py(465): backward
97
+ recipes/lora_finetune_single_device.py(622): train
98
+ recipes/lora_finetune_single_device.py(643): recipe_main
99
+
100
+ void cutlass::Kernel<cutlass_80_tensorop_bf16_s16816... 0.00% 0.000us 0.00% 0.000us 0.000us 31.991ms 0.88% 31.991ms 5.332ms 0 b 0 b 0 b 0 b 6 [] -- <built-in method to of Tensor object at 0x7f79fa3fbf10>
101
+ recipes/lora_finetune_single_device.py(622): train
102
+ recipes/lora_finetune_single_device.py(643): recipe_main
103
+ torchtune/config/_parse.py(50): wrapper
104
+ recipes/lora_finetune_single_device.py(648): <module>
105
+
106
+ void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 31.808ms 0.88% 31.808ms 82.833us 0 b 0 b 0 b 0 b 384 [] -- ...in method run_backward of torch._C._EngineBase object at 0x7f7ab8266140>
107
+ torch/autograd/graph.py(763): _engine_run_backward
108
+ torch/_tensor.py(465): backward
109
+ recipes/lora_finetune_single_device.py(622): train
110
+ recipes/lora_finetune_single_device.py(643): recipe_main
111
+
112
+ aten::add_ 0.04% 1.868ms 2.02% 89.872ms 358.056us 29.127ms 0.80% 29.127ms 116.044us 0 b 0 b 0 b 0 b 251 [[2, 913, 4096], [2, 913, 4096], []] -- ...in method run_backward of torch._C._EngineBase object at 0x7f7ab8266140>
113
+ torch/autograd/graph.py(763): _engine_run_backward
114
+ torch/_tensor.py(465): backward
115
+ recipes/lora_finetune_single_device.py(622): train
116
+ recipes/lora_finetune_single_device.py(643): recipe_main
117
+
118
+ void pytorch_flash::flash_bwd_dq_dk_dv_loop_seqk_par... 0.00% 0.000us 0.00% 0.000us 0.000us 27.914ms 0.77% 27.914ms 436.163us 0 b 0 b 0 b 0 b 64 [] -- ...in method run_backward of torch._C._EngineBase object at 0x7f7ab8266140>
119
+ torch/autograd/graph.py(763): _engine_run_backward
120
+ torch/_tensor.py(465): backward
121
+ recipes/lora_finetune_single_device.py(622): train
122
+ recipes/lora_finetune_single_device.py(643): recipe_main
123
+
124
+ void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 27.004ms 0.74% 27.004ms 355.322us 0 b 0 b 0 b 0 b 76 [] -- <built-in method to of Tensor object at 0x7f79fa3fbf10>
125
+ recipes/lora_finetune_single_device.py(622): train
126
+ recipes/lora_finetune_single_device.py(643): recipe_main
127
+ torchtune/config/_parse.py(50): wrapper
128
+ recipes/lora_finetune_single_device.py(648): <module>
129
+
130
+ void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 26.862ms 0.74% 26.862ms 38.706us 0 b 0 b 0 b 0 b 694 [] -- ...in method run_backward of torch._C._EngineBase object at 0x7f7ab8266140>
131
+ torch/autograd/graph.py(763): _engine_run_backward
132
+ torch/_tensor.py(465): backward
133
+ recipes/lora_finetune_single_device.py(622): train
134
+ recipes/lora_finetune_single_device.py(643): recipe_main
135
+
136
+ void at::native::elementwise_kernel<128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 25.891ms 0.71% 25.891ms 208.798us 0 b 0 b 0 b 0 b 124 [] -- <built-in method to of Tensor object at 0x7f79fa3fbf10>
137
+ recipes/lora_finetune_single_device.py(622): train
138
+ recipes/lora_finetune_single_device.py(643): recipe_main
139
+ torchtune/config/_parse.py(50): wrapper
140
+ recipes/lora_finetune_single_device.py(648): <module>
141
+
142
+ void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 25.065ms 0.69% 25.065ms 334.207us 0 b 0 b 0 b 0 b 75 [] -- <built-in method to of Tensor object at 0x7f79fa3fbf10>
143
+ recipes/lora_finetune_single_device.py(622): train
144
+ recipes/lora_finetune_single_device.py(643): recipe_main
145
+ torchtune/config/_parse.py(50): wrapper
146
+ recipes/lora_finetune_single_device.py(648): <module>
147
+
148
+ void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 22.140ms 0.61% 22.140ms 43.412us 0 b 0 b 0 b 0 b 510 [] -- ...in method run_backward of torch._C._EngineBase object at 0x7f7ab8266140>
149
+ torch/autograd/graph.py(763): _engine_run_backward
150
+ torch/_tensor.py(465): backward
151
+ recipes/lora_finetune_single_device.py(622): train
152
+ recipes/lora_finetune_single_device.py(643): recipe_main
153
+
154
+ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ -------------------------------------------------------------------------------- --------------------------------------------------------------------------- ------------
155
+ Self CPU time total: 4.453s
156
+ Self CUDA time total: 3.632s
157
+
profiling_outputs/iteration_12/rank0_stacks.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b4489b7f30fc000a3a5f9a1dcfe8879c00b547f631645074dbaf194fb5f18913
3
+ size 28216209