Commit 18ddd47 by derek-thomas (HF staff)
Parent(s): e16200c

Update app.py

Files changed (1):
  app.py +258 -256
app.py CHANGED
@@ -111,19 +111,19 @@ def calc_flops(vocab_size, hidden_size, sequence_length, num_layers, kv_size_rat
 
 
 # ---- Gradio Interface ---- #
-with gr.Blocks() as demo:
-
-    with gr.Tabs():
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    with gr.Accordion("Credits and General Idea", open=False):
         gr.Markdown("""
         This app is a re-creation of [this calculator](https://github.com/EleutherAI/cookbook/tree/main/calc) from EleutherAI.
 
         Before training or inference even begins, common practical questions about potential models must be answered such as:
-
         1. How many parameters are we targeting? How should those parameters be allocated within the model?
         1. How many FLOPs does the model from step 1 take to train on t tokens? How about inference?
         1. How much memory does the model from step 1 take to train/infer on d devices? What memory-saving strategies (e.g. parallelism, quantization, etc) are necessary to fit the model on device memory?
         """)
-        with gr.TabItem("Memory Calculation"):
+    with gr.Tab("Memory Calculation"):
+        #with gr.TabItem("Memory Calculation"):
+        with gr.Accordion("About Memory Calculation", open=False):
             gr.Markdown("""
             ## Memory Calculation
 
@@ -131,293 +131,295 @@ with gr.Blocks() as demo:
             Take this estimation with a grain of salt, because every implementation is different and these calculations were written to match the GPT-NeoX library as close as possible.
             Even for other training and inference libraries, however, we expect our script to give approximate memory estimations within acceptable error.
             (Please see [LLM finetuning memory requirements](https://blog.scottlogic.com/2023/11/24/llm-mem.html) for a treatment of how specific memory costs may vary framework-to-framework). Other good resources that we consulted are the [ZeRO Paper](https://arxiv.org/abs/1910.02054) and [Reducing Activation Recomputation in Large Transformer Models](https://arxiv.org/pdf/2205.05198.pdf).
-
-            ## To Use
-            Fill in the required details below and click 'Calculate Memory' to get a result.
-            """)
-            with gr.Row():
-                with gr.Column("Generatable"):
-                    with gr.Group():
-                        hf_model_name_or_path = gr.Textbox(
-                            label="HuggingFace Model Name or Path",
-                            info="Name of the HuggingFace Hub repository or the local file path for it"
-                        )
-                        sequence_length = gr.Number(
-                            label="Sequence Length",
-                            value=2048,
-                            info="Sequence length used for training"
-                        )
-                        vocab_size = gr.Number(
-                            label="Vocab Size",
-                            value=51200,
-                            info="How many tokens are in the embedding layer"
-                        )
-                        hidden_size = gr.Number(
-                            label="Hidden Size",
-                            value=6144,
-                            info="Dimension of the model's hidden size"
-                        )
-                        num_attention_heads = gr.Number(
-                            label="Number of Attention Heads",
-                            value=64,
-                            info="Number of attention heads used in the model"
-                        )
-                        num_layers = gr.Number(
-                            label="Number of Layers",
-                            value=44,
-                            info="Number of transformer layers used in the model"
-                        )
-                with gr.Column("User Defined"):
-                    num_gpus = gr.Number(
-                        label="Number of GPUs",
-                        value=1,
-                        info="Number of GPUs used for training"
-                    )
-                    tensor_parallel_size = gr.Number(
-                        label="Tensor Parallel Size",
-                        value=1,
-                        info="Tensor parallel degree (1 if not used)"
-                    )
-                    pipeline_parallel_size = gr.Number(
-                        label="Pipeline Parallel Size",
-                        value=1,
-                        info="Pipeline parallel degree (1 if not used)"
-                    )
-                    batch_size_per_gpu = gr.Number(
-                        label="Batch Size per GPU",
-                        value=8,
-                        info="Batch size per GPU"
-                    )
-                    ffn_expansion_factor = gr.Number(
-                        label="FFN Expansion Factor",
-                        value=4,
-                        info="How much the MLP hidden size expands"
-                    )
-                    is_mixed_precision = gr.Checkbox(
-                        label="Mixed Precision",
-                        value=True,
-                        info="Whether mixed precision is enabled"
-                    )
-                    misc_mem_gib = gr.Number(
-                        label="Miscellaneous Memory Overhead (GiB)",
-                        value=5,
-                        info="Miscellaneous memory overhead per GPU by DL frameworks, communication libraries, etc."
-                    )
-
-            calc_memory_button = gr.Button("Calculate Memory")
-            memory_result = gr.Textbox(label="Memory Calculation Result", interactive=False)
-            calc_memory_button.click(
-                calc_mem,
-                inputs=[
-                    hf_model_name_or_path, num_gpus, tensor_parallel_size, pipeline_parallel_size, batch_size_per_gpu, sequence_length, vocab_size, hidden_size, num_attention_heads, num_layers, ffn_expansion_factor, is_mixed_precision, misc_mem_gib
-                ],
-                outputs=memory_result
-            )
-
-            hf_model_name_or_path.change(
-                fn=update_from_hf_model,
-                inputs=[hf_model_name_or_path],
-                outputs=[num_layers, hidden_size, num_attention_heads, vocab_size, sequence_length, memory_result]
-            )
-
-        # Parameter Calculation Tab
-        with gr.TabItem("Parameter Calculation"):
-            gr.Markdown("""
-            ## Parameter Calculation
-
-            Parameter Calculation calculates the number of parameters present in a given model based on its hyperparams.
-            Such calculations are important to determine memory overheads, FLOPs, or to determine the size of an unknown transformer model.
-            We also found the following resources helpful:
-            [How does GPT-3 spend its 175B parameters?](https://www.lesswrong.com/posts/3duR8CrvcHywrnhLo/how-does-gpt-3-spend-its-175b-parameters)
-            and [LLM Parameter Counting](https://kipp.ly/transformer-param-count/).
-
-            ## How To Use
-            Simply input the model details, such as the hidden size, number of layers, and attention heads, and press 'Calculate Parameters' to get a result.
-
-            """)
-            with gr.Row():
-                with gr.Column("Generatable"):
-                    with gr.Group():
-                        hf_model_name_or_path = gr.Textbox(
-                            label="HuggingFace Model Name or Path",
-                            info="Name of the HuggingFace Hub repository or the local file path for it"
-                        )
-                        vocab_size = gr.Number(
-                            label="Vocab Size",
-                            value=51200,
-                            info="How many tokens are in the embedding layer"
-                        )
-                        hidden_size = gr.Number(
-                            label="Hidden Size",
-                            value=6144,
-                            info="Dimension of the model's hidden size"
-                        )
-                        sequence_length = gr.Number(
-                            label="Sequence Length",
-                            value=2048,
-                            info="Sequence length used for training"
-                        )
-                        num_layers = gr.Number(
-                            label="Number of Layers",
-                            value=44,
-                            info="Number of transformer layers used in the model"
-                        )
-                with gr.Column("User Defined"):
-                    tied_embeddings = gr.Checkbox(
-                        label="Tied Embeddings",
-                        value=False,
-                        info="Whether embeddings are tied (shared between input and output)"
-                    )
-                    ffn_expansion_factor = gr.Number(
-                        label="FFN Expansion Factor",
-                        value=4,
-                        info="How much the MLP hidden size expands"
-                    )
-                    num_mlp_linears = gr.Number(
-                        label="Number of Linear Layers per MLP Block",
-                        value=2,
-                        info="How many linear layers per MLP block"
-                    )
-                    kv_size_ratio = gr.Number(
-                        label="KV Size Ratio",
-                        value=1.0,
-                        info="Ratio of total query heads to key/value heads. 1.0 for MHA, 1/num_attention_heads for MQA"
-                    )
-
-                    with gr.Accordion("MoE Parameters", open=False):
-                        moe = gr.Checkbox(
-                            label="MoE",
-                            value=False,
-                            info="Whether the model is MoE"
-                        )
-                        num_experts = gr.Number(
-                            label="Number of Experts",
-                            value=8,
-                            info="Number of experts for MoE"
-                        )
-                        expert_interval = gr.Number(
-                            label="Expert Interval",
-                            value=1,
-                            info="Expert interval for MoE"
-                        )
-                        topk = gr.Number(
-                            label="Top k Routing",
-                            value=1,
-                            info="Top k routing for MoE"
-                        )
-
-            calc_param_button = gr.Button("Calculate Parameters")
-            param_result = gr.Textbox(label="Parameter Calculation Result", interactive=False)
-            calc_param_button.click(calc_params,
-                                    inputs=[vocab_size, tied_embeddings, hidden_size, sequence_length, num_layers, moe, num_experts, expert_interval, topk, ffn_expansion_factor, num_mlp_linears, kv_size_ratio],
-                                    outputs=param_result)
-
-            hf_model_name_or_path.change(fn=update_from_hf_model,
-                                         inputs=[hf_model_name_or_path],
-                                         outputs=[num_layers, hidden_size, num_attention_heads, vocab_size, sequence_length])
-
-        # New FLOP Calculation Tab
-        with gr.TabItem("FLOP Calculation"):
-            gr.Markdown("""
-            ## FLOP Calculation
-
-            FLOP Calculation calculates the number of theoretical FLOPs required to train a model on t tokens.
-            See [Transformers Math 101](https://blog.eleuther.ai/transformer-math/) for more details on how FLOPs are calculated.
-            Other good resources that we consulted are the [Chinchilla Paper](https://arxiv.org/abs/2203.15556) and
-            [Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM](https://people.eecs.berkeley.edu/~matei/papers/2021/sc_megatron_lm.pdf).
-            """)
-            with gr.Row():
-                with gr.Column("Generatable"):
-                    with gr.Group():
-                        hf_model_name_or_path = gr.Textbox(
-                            label="HuggingFace Model Name or Path",
-                            info="Name of the HuggingFace Hub repository or the local file path for it"
-                        )
-                        vocab_size = gr.Number(
-                            label="Vocab Size",
-                            value=51200,
-                            info="How many tokens are in the embedding layer"
-                        )
-                        hidden_size = gr.Number(
-                            label="Hidden Size",
-                            value=6144,
-                            info="Dimension of the model's hidden size"
-                        )
-                        sequence_length = gr.Number(
-                            label="Sequence Length",
-                            value=2048,
-                            info="Sequence length used for training"
-                        )
-                        num_layers = gr.Number(
-                            label="Number of Layers",
-                            value=44,
-                            info="Number of transformer layers used in the model"
-                        )
-                with gr.Column("Generatable"):
-                    kv_size_ratio = gr.Number(
-                        label="KV Size Ratio",
-                        value=1.0,
-                        info="Ratio of kv heads to query heads used in model. 1.0 for MHA"
-                    )
-                    ffn_expansion_factor = gr.Number(
-                        label="FFN Expansion Factor",
-                        value=4,
-                        info="How much the MLP hidden size expands"
-                    )
-                    batch_size = gr.Number(
-                        label="Batch Size",
-                        value=1,
-                        info="Global batch size in units of samples"
-                    )
-                    tokens = gr.Number(
-                        label="Number of GigaTokens",
-                        value=300,
-                        info="Total number of GigaTokens for training"
-                    )
-                    checkpoint_activations = gr.Checkbox(
-                        label="Checkpoint Activations",
-                        value=True,
-                        info="Whether Megatron-style activation checkpointing is being used"
-                    )
-                    infer = gr.Checkbox(
-                        label="Inference-Only",
-                        value=False,
-                        info="Whether the model is being used for inference-only"
-                    )
-
-                    # MoE parameters hidden in accordion
-                    with gr.Accordion("Mixture of Experts (MoE)", open=False):
-                        moe = gr.Checkbox(
-                            label="Mixture of Experts (MoE)",
-                            value=False,
-                            info="Whether the model uses Mixture of Experts"
-                        )
-                        num_experts = gr.Number(
-                            label="Number of Experts",
-                            value=128,
-                            info="Number of experts for Mixture of Experts (MoE)"
-                        )
-                        expert_interval = gr.Number(
-                            label="Expert Interval",
-                            value=2,
-                            info="Expert interval for Mixture of Experts (MoE)"
-                        )
-                        topk = gr.Number(
-                            label="Top K Routing for MoE",
-                            value=1,
-                            info="Top k routing for Mixture of Experts (MoE)"
-                        )
-
-            calc_flops_button = gr.Button("Calculate FLOPs")
-            flops_result = gr.JSON(label="FLOP Calculation Result")
-            calc_flops_button.click(
-                calc_flops,
-                inputs=[vocab_size, hidden_size, sequence_length, num_layers, kv_size_ratio, topk, moe, num_experts, expert_interval, batch_size, tokens, checkpoint_activations, ffn_expansion_factor, infer],
-                outputs=flops_result
-            )
-
-            hf_model_name_or_path.change(fn=update_from_hf_model,
-                                         inputs=[hf_model_name_or_path],
-                                         outputs=[num_layers, hidden_size, vocab_size, sequence_length])
-
+            """)
+        with gr.Accordion("How to use it?", open=False):
+            gr.Markdown("""
+            ## To Use
+            Fill in the required details below and click 'Calculate Memory' to get a result.
+            """)
+        with gr.Row():
+            with gr.Column("Generatable"):
+                gr.Markdown("## Generatable")
+                with gr.Group():
+                    hf_model_name_or_path = gr.Textbox(
+                        label="HuggingFace Model Name or Path",
+                        info="Name of the HuggingFace Hub repository or the local file path for it"
+                    )
+                    sequence_length = gr.Number(
+                        label="Sequence Length",
+                        value=2048,
+                        info="Sequence length used for training"
+                    )
+                    vocab_size = gr.Number(
+                        label="Vocab Size",
+                        value=51200,
+                        info="How many tokens are in the embedding layer"
+                    )
+                    hidden_size = gr.Number(
+                        label="Hidden Size",
+                        value=6144,
+                        info="Dimension of the model's hidden size"
+                    )
+                    num_attention_heads = gr.Number(
+                        label="Number of Attention Heads",
+                        value=64,
+                        info="Number of attention heads used in the model"
+                    )
+                    num_layers = gr.Number(
+                        label="Number of Layers",
+                        value=44,
+                        info="Number of transformer layers used in the model"
+                    )
+            with gr.Column("User Defined"):
+                gr.Markdown("## User Defined")
+                num_gpus = gr.Number(
+                    label="Number of GPUs",
+                    value=1,
+                    info="Number of GPUs used for training"
+                )
+                tensor_parallel_size = gr.Number(
+                    label="Tensor Parallel Size",
+                    value=1,
+                    info="Tensor parallel degree (1 if not used)"
+                )
+                pipeline_parallel_size = gr.Number(
+                    label="Pipeline Parallel Size",
+                    value=1,
+                    info="Pipeline parallel degree (1 if not used)"
+                )
+                batch_size_per_gpu = gr.Number(
+                    label="Batch Size per GPU",
+                    value=8,
+                    info="Batch size per GPU"
+                )
+                ffn_expansion_factor = gr.Number(
+                    label="FFN Expansion Factor",
+                    value=4,
+                    info="How much the MLP hidden size expands"
+                )
+                is_mixed_precision = gr.Checkbox(
+                    label="Mixed Precision",
+                    value=True,
+                    info="Whether mixed precision is enabled"
+                )
+                misc_mem_gib = gr.Number(
+                    label="Miscellaneous Memory Overhead (GiB)",
+                    value=5,
+                    info="Miscellaneous memory overhead per GPU by DL frameworks, communication libraries, etc."
+                )
+
+        calc_memory_button = gr.Button("Calculate Memory")
+        memory_result = gr.Textbox(label="Memory Calculation Result", interactive=False)
+        calc_memory_button.click(
+            calc_mem,
+            inputs=[
+                hf_model_name_or_path, num_gpus, tensor_parallel_size, pipeline_parallel_size, batch_size_per_gpu, sequence_length, vocab_size, hidden_size, num_attention_heads, num_layers, ffn_expansion_factor, is_mixed_precision, misc_mem_gib
+            ],
+            outputs=memory_result
+        )
+
+        hf_model_name_or_path.change(
+            fn=update_from_hf_model,
+            inputs=[hf_model_name_or_path],
+            outputs=[num_layers, hidden_size, num_attention_heads, vocab_size, sequence_length, memory_result]
+        )
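
The memory number this tab reports comes from calc_mem, which is defined earlier in app.py and untouched by this commit. As a rough sanity check on its output, the mixed-precision Adam bookkeeping from the ZeRO paper linked in the tab text can be sketched in a few lines. This is a hypothetical simplification, not the app's actual implementation: fp16 weights and gradients at 2 bytes/param each, 12 bytes/param of optimizer state (fp32 master weights plus two Adam moments), activations ignored.

# Hypothetical sketch (not app.py's calc_mem): per-GPU training memory for
# mixed-precision Adam, following the ZeRO paper's accounting.
def rough_training_mem_gib(num_params, num_gpus=1, misc_gib=5):
    bytes_per_param = 2 + 2 + 12  # fp16 weights + fp16 grads + fp32 master weights and Adam moments
    model_gib = num_params * bytes_per_param / 2**30
    return model_gib / num_gpus + misc_gib  # naive even split across GPUs; activations not counted

# With the tab's default hyperparameters (a ~20B-parameter model) on 8 GPUs:
# rough_training_mem_gib(20e9, num_gpus=8) ≈ 42 GiB per GPU.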
+
+    # Parameter Calculation Tab
+    with gr.TabItem("Parameter Calculation"):
+        gr.Markdown("""
+        ## Parameter Calculation
+
+        Parameter Calculation calculates the number of parameters present in a given model based on its hyperparams.
+        Such calculations are important to determine memory overheads, FLOPs, or to determine the size of an unknown transformer model.
+        We also found the following resources helpful:
+        [How does GPT-3 spend its 175B parameters?](https://www.lesswrong.com/posts/3duR8CrvcHywrnhLo/how-does-gpt-3-spend-its-175b-parameters)
+        and [LLM Parameter Counting](https://kipp.ly/transformer-param-count/).
+        ## How To Use
+        Simply input the model details, such as the hidden size, number of layers, and attention heads, and press 'Calculate Parameters' to get a result.
+        """)
+        with gr.Row():
+            with gr.Column("Generatable"):
+                with gr.Group():
+                    hf_model_name_or_path = gr.Textbox(
+                        label="HuggingFace Model Name or Path",
+                        info="Name of the HuggingFace Hub repository or the local file path for it"
+                    )
+                    vocab_size = gr.Number(
+                        label="Vocab Size",
+                        value=51200,
+                        info="How many tokens are in the embedding layer"
+                    )
+                    hidden_size = gr.Number(
+                        label="Hidden Size",
+                        value=6144,
+                        info="Dimension of the model's hidden size"
+                    )
+                    sequence_length = gr.Number(
+                        label="Sequence Length",
+                        value=2048,
+                        info="Sequence length used for training"
+                    )
+                    num_layers = gr.Number(
+                        label="Number of Layers",
+                        value=44,
+                        info="Number of transformer layers used in the model"
+                    )
+            with gr.Column("User Defined"):
+                tied_embeddings = gr.Checkbox(
+                    label="Tied Embeddings",
+                    value=False,
+                    info="Whether embeddings are tied (shared between input and output)"
+                )
+                ffn_expansion_factor = gr.Number(
+                    label="FFN Expansion Factor",
+                    value=4,
+                    info="How much the MLP hidden size expands"
+                )
+                num_mlp_linears = gr.Number(
+                    label="Number of Linear Layers per MLP Block",
+                    value=2,
+                    info="How many linear layers per MLP block"
+                )
+                kv_size_ratio = gr.Number(
+                    label="KV Size Ratio",
+                    value=1.0,
+                    info="Ratio of total query heads to key/value heads. 1.0 for MHA, 1/num_attention_heads for MQA"
+                )
+
+                with gr.Accordion("MoE Parameters", open=False):
+                    moe = gr.Checkbox(
+                        label="MoE",
+                        value=False,
+                        info="Whether the model is MoE"
+                    )
+                    num_experts = gr.Number(
+                        label="Number of Experts",
+                        value=8,
+                        info="Number of experts for MoE"
+                    )
+                    expert_interval = gr.Number(
+                        label="Expert Interval",
+                        value=1,
+                        info="Expert interval for MoE"
+                    )
+                    topk = gr.Number(
+                        label="Top k Routing",
+                        value=1,
+                        info="Top k routing for MoE"
+                    )
+
+        calc_param_button = gr.Button("Calculate Parameters")
+        param_result = gr.Textbox(label="Parameter Calculation Result", interactive=False)
+        calc_param_button.click(calc_params,
+                                inputs=[vocab_size, tied_embeddings, hidden_size, sequence_length, num_layers, moe, num_experts, expert_interval, topk, ffn_expansion_factor, num_mlp_linears, kv_size_ratio],
+                                outputs=param_result)
+
+        hf_model_name_or_path.change(fn=update_from_hf_model,
+                                     inputs=[hf_model_name_or_path],
+                                     outputs=[num_layers, hidden_size, num_attention_heads, vocab_size, sequence_length])
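
For intuition about what calc_params does with these inputs: for a dense transformer, the count from the resources linked in this tab (e.g. kipp.ly's parameter-counting post) reduces to roughly P ≈ V·h + s·h + L·(4h² + f·n·h²) for multi-head attention with FFN expansion f and n MLP linears. A hypothetical sketch of that formula, not the app's calc_params, assuming learned absolute position embeddings and ignoring biases and layernorms:

# Hypothetical sketch (not app.py's calc_params): dense-transformer parameter
# count; biases and layernorms (a sub-percent correction) are ignored.
def rough_param_count(vocab_size, hidden_size, num_layers, sequence_length,
                      ffn_expansion_factor=4, num_mlp_linears=2,
                      kv_size_ratio=1.0, tied_embeddings=True):
    embeddings = (1 if tied_embeddings else 2) * vocab_size * hidden_size
    positions = sequence_length * hidden_size             # learned absolute positions assumed
    attention = (2 + 2 * kv_size_ratio) * hidden_size**2  # Q and output full size; K, V scaled by the ratio
    mlp = num_mlp_linears * ffn_expansion_factor * hidden_size**2
    return embeddings + positions + num_layers * (attention + mlp)

# The tab's defaults (V=51200, h=6144, L=44, s=2048) give ≈ 2.0e10, i.e. GPT-NeoX-20B scale.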
 
+    # New FLOP Calculation Tab
+    with gr.TabItem("FLOP Calculation"):
+        gr.Markdown("""
+        ## FLOP Calculation
+
+        FLOP Calculation calculates the number of theoretical FLOPs required to train a model on t tokens.
+        See [Transformers Math 101](https://blog.eleuther.ai/transformer-math/) for more details on how FLOPs are calculated.
+        Other good resources that we consulted are the [Chinchilla Paper](https://arxiv.org/abs/2203.15556) and
+        [Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM](https://people.eecs.berkeley.edu/~matei/papers/2021/sc_megatron_lm.pdf).
+        """)
+        with gr.Row():
+            with gr.Column("Generatable"):
+                with gr.Group():
+                    hf_model_name_or_path = gr.Textbox(
+                        label="HuggingFace Model Name or Path",
+                        info="Name of the HuggingFace Hub repository or the local file path for it"
+                    )
+                    vocab_size = gr.Number(
+                        label="Vocab Size",
+                        value=51200,
+                        info="How many tokens are in the embedding layer"
+                    )
+                    hidden_size = gr.Number(
+                        label="Hidden Size",
+                        value=6144,
+                        info="Dimension of the model's hidden size"
+                    )
+                    sequence_length = gr.Number(
+                        label="Sequence Length",
+                        value=2048,
+                        info="Sequence length used for training"
+                    )
+                    num_layers = gr.Number(
+                        label="Number of Layers",
+                        value=44,
+                        info="Number of transformer layers used in the model"
+                    )
+            with gr.Column("Generatable"):
+                kv_size_ratio = gr.Number(
+                    label="KV Size Ratio",
+                    value=1.0,
+                    info="Ratio of kv heads to query heads used in model. 1.0 for MHA"
+                )
+                ffn_expansion_factor = gr.Number(
+                    label="FFN Expansion Factor",
+                    value=4,
+                    info="How much the MLP hidden size expands"
+                )
+                batch_size = gr.Number(
+                    label="Batch Size",
+                    value=1,
+                    info="Global batch size in units of samples"
+                )
+                tokens = gr.Number(
+                    label="Number of GigaTokens",
+                    value=300,
+                    info="Total number of GigaTokens for training"
+                )
+                checkpoint_activations = gr.Checkbox(
+                    label="Checkpoint Activations",
+                    value=True,
+                    info="Whether Megatron-style activation checkpointing is being used"
+                )
+                infer = gr.Checkbox(
+                    label="Inference-Only",
+                    value=False,
+                    info="Whether the model is being used for inference-only"
+                )
+
+                # MoE parameters hidden in accordion
+                with gr.Accordion("Mixture of Experts (MoE)", open=False):
+                    moe = gr.Checkbox(
+                        label="Mixture of Experts (MoE)",
+                        value=False,
+                        info="Whether the model uses Mixture of Experts"
+                    )
+                    num_experts = gr.Number(
+                        label="Number of Experts",
+                        value=128,
+                        info="Number of experts for Mixture of Experts (MoE)"
+                    )
+                    expert_interval = gr.Number(
+                        label="Expert Interval",
+                        value=2,
+                        info="Expert interval for Mixture of Experts (MoE)"
+                    )
+                    topk = gr.Number(
+                        label="Top K Routing for MoE",
+                        value=1,
+                        info="Top k routing for Mixture of Experts (MoE)"
+                    )
+
+        calc_flops_button = gr.Button("Calculate FLOPs")
+        flops_result = gr.JSON(label="FLOP Calculation Result")
+        calc_flops_button.click(
+            calc_flops,
+            inputs=[vocab_size, hidden_size, sequence_length, num_layers, kv_size_ratio, topk, moe, num_experts, expert_interval, batch_size, tokens, checkpoint_activations, ffn_expansion_factor, infer],
+            outputs=flops_result
+        )
+
+        hf_model_name_or_path.change(fn=update_from_hf_model,
+                                     inputs=[hf_model_name_or_path],
+                                     outputs=[num_layers, hidden_size, vocab_size, sequence_length])
 
  demo.launch()
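
The rule of thumb behind the FLOP tab, per the Transformers Math 101 post it cites, is C ≈ 6·P·D FLOPs for training (8·P·D with Megatron-style activation checkpointing, since recomputation adds roughly one extra forward pass) and 2·P·D for inference, where P is the parameter count and D the number of tokens. A minimal sketch of that rule, not the calc_flops wired up above:

# Hypothetical sketch (not app.py's calc_flops): the 6PD / 8PD / 2PD rule of thumb
# from Transformers Math 101 (https://blog.eleuther.ai/transformer-math/).
def rough_flops(num_params, num_tokens, checkpoint_activations=True, infer=False):
    if infer:
        return 2 * num_params * num_tokens       # forward pass only
    factor = 8 if checkpoint_activations else 6  # recomputation costs about one extra forward
    return factor * num_params * num_tokens

# e.g. a 20B-parameter model on 300 GigaTokens with checkpointing:
# rough_flops(20e9, 300e9) = 4.8e22 FLOPs.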