trojblue committed
Commit 106188f
1 parent: ce70ea9

Upload nd_param_calculator_latest.ipynb

Files changed (1)
  1. nd_param_calculator_latest.ipynb +776 -0
nd_param_calculator_latest.ipynb ADDED
@@ -0,0 +1,776 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "82d89348-fb09-462d-b78f-f1dab3447c3d",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# !pip install -U unibox omegaconf -q\n",
+ "import copy\n",
+ "import unibox\n",
+ "import subprocess\n",
+ "import re\n",
+ "import os\n",
+ "import math\n",
+ "from omegaconf import OmegaConf\n",
+ "\n",
+ "# default values for different optimizers\n",
+ "optimizer_dict = {\n",
+ "    \"prodigy\": {\n",
+ "        \"name\": \"prodigyopt.Prodigy\",\n",
+ "        \"params\": {\n",
+ "            \"lr\": 1,\n",
+ "            \"d_coef\": 2,\n",
+ "            \"d0\": 1e-6,\n",
+ "            \"safeguard_warmup\": True,\n",
+ "            \"use_bias_correction\": True,\n",
+ "            \"weight_decay\": 1e-2,\n",
+ "            \"eps\": 1e-8,\n",
+ "        }\n",
+ "    },\n",
+ "    \"adamw\": {\n",
+ "        \"name\": \"torch.optim.AdamW\",\n",
+ "        \"params\": {\n",
+ "            \"lr\": 3e-5,\n",
+ "            \"weight_decay\": 1e-2,\n",
+ "        },\n",
+ "    },\n",
+ "}\n",
+ "\n",
+ "# default scheduler dict\n",
+ "default_scheduler_dict = {\n",
+ "    \"scheduler\": {\n",
+ "        \"name\": \"transformers.get_cosine_schedule_with_warmup\",\n",
+ "        \"params\": {\n",
+ "            \"num_warmup_steps\": 0,\n",
+ "            \"num_training_steps\": 1000,\n",
+ "            \"last_epoch\": -1,\n",
+ "        }\n",
+ "    }\n",
+ "}\n",
+ "\n",
+ "# assuming training on 1024x1024 resolution\n",
+ "default_batch_size_dict = {\n",
+ "    \"prodigy\": {\n",
+ "        80: 8,  # For 80 GB VRAM, batch size is 8\n",
+ "        20: 1,\n",
+ "    },\n",
+ "    \"adamw\": {\n",
+ "        80: 24,\n",
+ "    },\n",
+ "    \"lion\": {\n",
+ "        78: 48,\n",
+ "    },\n",
+ "}\n",
+ "\n",
+ "\n",
+ "def get_vram_in_gb():\n",
+ "    \"\"\"Returns the total GPU memory in GB.\"\"\"\n",
+ "    try:\n",
+ "        # Running the command 'nvidia-smi' and capturing its output\n",
+ "        output = subprocess.check_output(['nvidia-smi'], text=True)\n",
+ "\n",
+ "        # Regular expression to find the memory part\n",
+ "        mem_regex = re.compile(r'\\|\\s+\\d+MiB / (\\d+)MiB\\s+\\|')\n",
+ "        match = mem_regex.search(output)\n",
+ "        if match:\n",
+ "            total_memory_mib = int(match.group(1))\n",
+ "            # Converting MiB to GiB (1 GiB = 1024 MiB) and rounding to 2 decimal places\n",
+ "            total_memory_gb = round(total_memory_mib / 1024, 2)\n",
+ "            return total_memory_gb\n",
+ "        else:\n",
+ "            raise ValueError(\"Could not parse total memory from nvidia-smi output.\")\n",
+ "    except Exception as e:\n",
+ "        # raise instead of returning an error string, so callers never see a non-numeric value\n",
+ "        raise RuntimeError(f\"Could not read VRAM via nvidia-smi: {e}\") from e\n",
+ "\n",
+ "\n",
+ "def get_batch_size(optimizer: str, vram: int) -> int:\n",
+ "    # allocate batch size based on vram, assuming training on 1024x1024 resolution\n",
+ "    _bs_dict = default_batch_size_dict\n",
+ "\n",
+ "    if optimizer in _bs_dict:\n",
+ "        # Find the closest lower VRAM value that we have a batch size for\n",
+ "        closest_vram = max(vram_key for vram_key in _bs_dict[optimizer] if vram_key <= vram)\n",
+ "        return _bs_dict[optimizer][closest_vram]\n",
+ "    else:\n",
+ "        raise ValueError(f\"Optimizer '{optimizer}' not supported.\")\n",
+ "\n",
+ "\n",
+ "def get_train_image_count(dataset_dir: str) -> int:\n",
+ "    # count images under the given directory (was a bug: used the global DATASET_DIR)\n",
+ "    files = unibox.traverses(dataset_dir, include_extensions=unibox.constants.IMG_FILES)\n",
+ "    return len(files)\n",
+ "\n",
+ "\n",
+ "def get_scheduler_dict(it_per_epoch: int, epoch_per_cycle: int, warmup_epochs: float):\n",
+ "    _warmup_step_count = int(it_per_epoch * warmup_epochs)\n",
+ "    print(f\"_warmup_step_count: {_warmup_step_count}\")\n",
+ "\n",
+ "    _cycle_step_count = it_per_epoch * epoch_per_cycle\n",
+ "    print(f\"_cycle_step_count: {_cycle_step_count}\")\n",
+ "\n",
+ "    # deepcopy: a shallow .copy() would mutate the shared default dict's nested params\n",
+ "    scheduler_dict = copy.deepcopy(default_scheduler_dict)\n",
+ "    scheduler_dict[\"scheduler\"][\"params\"][\"num_training_steps\"] = _cycle_step_count\n",
+ "    scheduler_dict[\"scheduler\"][\"params\"][\"num_warmup_steps\"] = _warmup_step_count\n",
+ "    return scheduler_dict\n",
+ "\n",
+ "\n",
+ "def evaluate_template_dict(template_dict):\n",
+ "    # generate a filled dictionary from a template\n",
+ "    new_dict = {}\n",
+ "    for key, value in template_dict.items():\n",
+ "        if isinstance(value, dict):\n",
+ "            new_dict[key] = evaluate_template_dict(value)\n",
+ "        elif callable(value):\n",
+ "            new_dict[key] = value()\n",
+ "        else:\n",
+ "            new_dict[key] = value\n",
+ "    return new_dict\n",
+ "\n",
+ "\n",
+ "def write_config_to_yaml(config_dict, yaml_path):\n",
+ "    yaml_config = OmegaConf.to_yaml(config_dict)\n",
+ "\n",
+ "    # Splitting the YAML string into lines\n",
+ "    lines = yaml_config.split('\\n')\n",
+ "\n",
+ "    # Iterating through the lines and adding an empty line before each major section\n",
+ "    formatted_lines = []\n",
+ "    for line in lines:\n",
+ "        if line.startswith(' ') or line == '':\n",
+ "            # It's a subline or already an empty line, just add it\n",
+ "            formatted_lines.append(line)\n",
+ "        else:\n",
+ "            # It's a new major section, add an empty line before it (if it's not the first line)\n",
+ "            if formatted_lines:\n",
+ "                formatted_lines.append('')\n",
+ "            formatted_lines.append(line)\n",
+ "\n",
+ "    # Joining the lines back into a single string\n",
+ "    formatted_yaml_config = '\\n'.join(formatted_lines)\n",
+ "\n",
+ "    # Write the formatted YAML string to a file\n",
+ "    with open(yaml_path, 'w') as file:\n",
+ "        file.write(formatted_yaml_config)\n",
+ "\n",
+ "    print()\n",
+ "    print(f\"Configuration saved to [{yaml_path}]\")\n",
+ "\n",
+ "\n",
+ "def get_optimizer_dict(optimizer: str):\n",
+ "    return_dict = {\n",
+ "        \"optimizer\": optimizer_dict[optimizer],\n",
+ "    }\n",
+ "\n",
+ "    return return_dict"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "12a8d495-565e-455b-a8ee-fcaf09b199a5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "DEFAULT_CONFIG = \"https://huggingface.co/kiriyamaX/nd-configs/resolve/main/nd_config_template_sdxl_80g.yaml\"\n",
+ "\n",
+ "# ============= CONFIGS =============\n",
+ "\n",
+ "# IMPORTANT\n",
+ "CONFIG_VERSION = 1\n",
+ "RUN_NAME = \"qft_twitter_aes_167k-of-798k\"\n",
+ "DATASET_DIR = \"../datasets/twitter-aes_trained-best-167k-of-798k\"\n",
+ "# MODEL_PATH = \"../models/playground-v2-1024px-aesthetic.safetensors\"\n",
+ "MODEL_PATH = \"../models/fd5me9.ckpt\"\n",
+ "\n",
+ "# ===================================\n",
+ "\n",
+ "# hyperparams\n",
+ "OFFSET_NOISE_VAL = 0.12\n",
+ "UCG = 0.1\n",
+ "\n",
+ "# optimizer\n",
+ "TRAIN_OPTIMIZER = \"adamw\"\n",
+ "WARMUP_EPOCHS = 0.3\n",
+ "EPOCH_PER_CYCLE = 10\n",
+ "\n",
+ "# saving\n",
+ "SAVE_INTERVAL_EPOCH = 1\n",
+ "SAVE_INTERVAL_STEPS = -1\n",
+ "# ==================================="
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "c4089d9b",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2023-12-18 07:51:41,207 [INFO] UniLogger: UniLoader.loads: .yaml LOADED from \"/tmp/tmptm3kzw5a.yaml\" in 0.04s\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "sys_vram: 80 GB \n",
+ "train_batch_size: 24 \n",
+ "train_image_count: 166110 \n",
+ "_it_per_epoch: 6921\n",
+ "_warmup_step_count: 2076\n",
+ "_cycle_step_count: 69210\n",
+ "\n",
+ "Configuration saved to [./config_nd_qft_twitter_aes_167k-of-798k_v1.yaml]\n"
+ ]
+ }
+ ],
+ "source": [
+ "regulars_dict_template = {\n",
+ "    \"trainer\": {\n",
+ "        \"model_path\": lambda: MODEL_PATH,\n",
+ "        \"checkpoint_dir\": lambda: CHECKPOINT_DIR,\n",
+ "        \"offset_noise\": True,\n",
+ "        \"offset_noise_val\": lambda: OFFSET_NOISE_VAL,\n",
+ "        \"checkpoint_steps\": lambda: SAVE_INTERVAL_STEPS,\n",
+ "        \"checkpoint_freq\": lambda: SAVE_INTERVAL_EPOCH,\n",
+ "    },\n",
+ "    \"dataset\": {\n",
+ "        \"ucg\": lambda: UCG,\n",
+ "        \"img_path\": lambda: [DATASET_DIR],\n",
+ "    },\n",
+ "    \"sampling\": {\n",
+ "        \"every_n_steps\": lambda: SAVE_INTERVAL_STEPS,\n",
+ "        \"every_n_epochs\": lambda: SAVE_INTERVAL_EPOCH,\n",
+ "    },\n",
+ "}\n",
+ "\n",
+ "def get_regulars_dict():\n",
+ "    return evaluate_template_dict(regulars_dict_template)\n",
+ "\n",
+ "\n",
+ "CHECKPOINT_DIR = f\"checkpoint_{RUN_NAME}_v{CONFIG_VERSION}\"\n",
+ "\n",
+ "# sys_vram = get_vram_in_gb()\n",
+ "sys_vram = 80\n",
+ "train_batch_size = get_batch_size(TRAIN_OPTIMIZER, sys_vram)\n",
+ "train_image_count = get_train_image_count(DATASET_DIR)\n",
+ "config = unibox.loads(DEFAULT_CONFIG)\n",
+ "\n",
+ "if not config:\n",
+ "    raise FileNotFoundError(f\"Could not load config template from {DEFAULT_CONFIG}\")\n",
+ "\n",
+ "_it_per_epoch = math.floor(train_image_count / train_batch_size)\n",
+ "print(f\"sys_vram: {sys_vram} GB \\ntrain_batch_size: {train_batch_size} \\ntrain_image_count: {train_image_count} \\n_it_per_epoch: {_it_per_epoch}\")\n",
+ "\n",
+ "config = OmegaConf.merge(config, get_optimizer_dict(TRAIN_OPTIMIZER))\n",
+ "config = OmegaConf.merge(config, get_scheduler_dict(_it_per_epoch, EPOCH_PER_CYCLE, WARMUP_EPOCHS))\n",
+ "config = OmegaConf.merge(config, get_regulars_dict())\n",
+ "\n",
+ "\n",
+ "YAML_FOLDER = \"./\"\n",
+ "YAML_NAME = f\"config_nd_{RUN_NAME}_v{CONFIG_VERSION}.yaml\"\n",
+ "_yaml_path = os.path.join(YAML_FOLDER, YAML_NAME)\n",
+ "write_config_to_yaml(config, _yaml_path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "fe376cc7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# !pip install -U unibox omegaconf -q\n",
+ "import copy\n",
+ "import unibox\n",
+ "import subprocess\n",
+ "import re\n",
+ "import os\n",
+ "import math\n",
+ "from omegaconf import OmegaConf\n",
+ "\n",
+ "# default values for different optimizers\n",
+ "optimizer_dict = {\n",
+ "    \"prodigy\": {\n",
+ "        \"name\": \"prodigyopt.Prodigy\",\n",
+ "        \"params\": {\n",
+ "            \"lr\": 1,\n",
+ "            \"d_coef\": 2,\n",
+ "            \"d0\": 1e-6,\n",
+ "            \"safeguard_warmup\": True,\n",
+ "            \"use_bias_correction\": True,\n",
+ "            \"weight_decay\": 1e-2,\n",
+ "            \"eps\": 1e-8,\n",
+ "        }\n",
+ "    },\n",
+ "    \"adamw\": {\n",
+ "        \"name\": \"torch.optim.AdamW\",\n",
+ "        \"params\": {\n",
+ "            \"lr\": 3e-5,\n",
+ "            \"weight_decay\": 1e-2,\n",
+ "        },\n",
+ "    },\n",
+ "}\n",
+ "\n",
+ "# default scheduler dict\n",
+ "default_scheduler_dict = {\n",
+ "    \"scheduler\": {\n",
+ "        \"name\": \"transformers.get_cosine_schedule_with_warmup\",\n",
+ "        \"params\": {\n",
+ "            \"num_warmup_steps\": 0,\n",
+ "            \"num_training_steps\": 1000,\n",
+ "            \"last_epoch\": -1,\n",
+ "        }\n",
+ "    }\n",
+ "}\n",
+ "\n",
+ "# assuming training on 1024x1024 resolution\n",
+ "default_batch_size_dict = {\n",
+ "    \"prodigy\": {\n",
+ "        80: 8,  # For 80 GB VRAM, batch size is 8\n",
+ "        20: 1,\n",
+ "    },\n",
+ "    \"adamw\": {\n",
+ "        80: 24,\n",
+ "    },\n",
+ "    \"lion\": {\n",
+ "        78: 48,\n",
+ "    },\n",
+ "}\n",
+ "\n",
+ "\n",
+ "def get_vram_in_gb():\n",
+ "    \"\"\"Returns the total GPU memory in GB.\"\"\"\n",
+ "    try:\n",
+ "        # Running the command 'nvidia-smi' and capturing its output\n",
+ "        output = subprocess.check_output(['nvidia-smi'], text=True)\n",
+ "\n",
+ "        # Regular expression to find the memory part\n",
+ "        mem_regex = re.compile(r'\\|\\s+\\d+MiB / (\\d+)MiB\\s+\\|')\n",
+ "        match = mem_regex.search(output)\n",
+ "        if match:\n",
+ "            total_memory_mib = int(match.group(1))\n",
+ "            # Converting MiB to GiB (1 GiB = 1024 MiB) and rounding to 2 decimal places\n",
+ "            total_memory_gb = round(total_memory_mib / 1024, 2)\n",
+ "            return total_memory_gb\n",
+ "        else:\n",
+ "            raise ValueError(\"Could not parse total memory from nvidia-smi output.\")\n",
+ "    except Exception as e:\n",
+ "        # raise instead of returning an error string, so callers never see a non-numeric value\n",
+ "        raise RuntimeError(f\"Could not read VRAM via nvidia-smi: {e}\") from e\n",
+ "\n",
+ "\n",
+ "def get_batch_size(optimizer: str, vram: int) -> int:\n",
+ "    # allocate batch size based on vram, assuming training on 1024x1024 resolution\n",
+ "    _bs_dict = default_batch_size_dict\n",
+ "\n",
+ "    if optimizer in _bs_dict:\n",
+ "        # Find the closest lower VRAM value that we have a batch size for\n",
+ "        closest_vram = max(vram_key for vram_key in _bs_dict[optimizer] if vram_key <= vram)\n",
+ "        return _bs_dict[optimizer][closest_vram]\n",
+ "    else:\n",
+ "        raise ValueError(f\"Optimizer '{optimizer}' not supported.\")\n",
+ "\n",
+ "\n",
+ "def get_train_image_count(dataset_dir: str) -> int:\n",
+ "    # count images under the given directory (was a bug: used the global DATASET_DIR)\n",
+ "    files = unibox.traverses(dataset_dir, include_extensions=unibox.constants.IMG_FILES)\n",
+ "    return len(files)\n",
+ "\n",
+ "\n",
+ "def get_scheduler_dict(it_per_epoch: int, epoch_per_cycle: int, warmup_epochs: float):\n",
+ "    _warmup_step_count = int(it_per_epoch * warmup_epochs)\n",
+ "    print(f\"_warmup_step_count: {_warmup_step_count}\")\n",
+ "\n",
+ "    _cycle_step_count = it_per_epoch * epoch_per_cycle\n",
+ "    print(f\"_cycle_step_count: {_cycle_step_count}\")\n",
+ "\n",
+ "    # deepcopy: a shallow .copy() would mutate the shared default dict's nested params\n",
+ "    scheduler_dict = copy.deepcopy(default_scheduler_dict)\n",
+ "    scheduler_dict[\"scheduler\"][\"params\"][\"num_training_steps\"] = _cycle_step_count\n",
+ "    scheduler_dict[\"scheduler\"][\"params\"][\"num_warmup_steps\"] = _warmup_step_count\n",
+ "    return scheduler_dict\n",
+ "\n",
+ "\n",
+ "def evaluate_template_dict(template_dict):\n",
+ "    # generate a filled dictionary from a template\n",
+ "    new_dict = {}\n",
+ "    for key, value in template_dict.items():\n",
+ "        if isinstance(value, dict):\n",
+ "            new_dict[key] = evaluate_template_dict(value)\n",
+ "        elif callable(value):\n",
+ "            new_dict[key] = value()\n",
+ "        else:\n",
+ "            new_dict[key] = value\n",
+ "    return new_dict\n",
+ "\n",
+ "\n",
+ "def write_config_to_yaml(config_dict, yaml_path):\n",
+ "    yaml_config = OmegaConf.to_yaml(config_dict)\n",
+ "\n",
+ "    # Splitting the YAML string into lines\n",
+ "    lines = yaml_config.split('\\n')\n",
+ "\n",
+ "    # Iterating through the lines and adding an empty line before each major section\n",
+ "    formatted_lines = []\n",
+ "    for line in lines:\n",
+ "        if line.startswith(' ') or line == '':\n",
+ "            # It's a subline or already an empty line, just add it\n",
+ "            formatted_lines.append(line)\n",
+ "        else:\n",
+ "            # It's a new major section, add an empty line before it (if it's not the first line)\n",
+ "            if formatted_lines:\n",
+ "                formatted_lines.append('')\n",
+ "            formatted_lines.append(line)\n",
+ "\n",
+ "    # Joining the lines back into a single string\n",
+ "    formatted_yaml_config = '\\n'.join(formatted_lines)\n",
+ "\n",
+ "    # Write the formatted YAML string to a file\n",
+ "    with open(yaml_path, 'w') as file:\n",
+ "        file.write(formatted_yaml_config)\n",
+ "\n",
+ "    print()\n",
+ "    print(f\"Configuration saved to [{yaml_path}]\")\n",
+ "\n",
+ "\n",
+ "def get_optimizer_dict(optimizer: str):\n",
+ "    return_dict = {\n",
+ "        \"optimizer\": optimizer_dict[optimizer],\n",
+ "    }\n",
+ "\n",
+ "    return return_dict"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "33db0266",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "DEFAULT_CONFIG = \"https://huggingface.co/kiriyamaX/nd-configs/resolve/main/nd_config_template_sdxl_80g.yaml\"\n",
+ "\n",
+ "# ============= CONFIGS =============\n",
+ "\n",
+ "# IMPORTANT\n",
+ "CONFIG_VERSION = 1\n",
+ "RUN_NAME = \"qft_twitter_aes_trained-best-26k-of-798k\"\n",
+ "DATASET_DIR = \"../datasets/twitter-aes_trained-best-26k-of-798k\"\n",
+ "# MODEL_PATH = \"../models/playground-v2-1024px-aesthetic.safetensors\"\n",
+ "MODEL_PATH = \"../models/fd5me9.ckpt\"\n",
+ "\n",
+ "# ===================================\n",
+ "\n",
+ "# hyperparams\n",
+ "OFFSET_NOISE_VAL = 0.1\n",
+ "UCG = 0.1\n",
+ "\n",
+ "# optimizer\n",
+ "TRAIN_OPTIMIZER = \"adamw\"\n",
+ "WARMUP_EPOCHS = 0.3\n",
+ "EPOCH_PER_CYCLE = 10\n",
+ "\n",
+ "# saving\n",
+ "SAVE_INTERVAL_EPOCH = 1\n",
+ "SAVE_INTERVAL_STEPS = -1\n",
+ "# ==================================="
+ ]
+ },
+ },
488
+ {
489
+ "cell_type": "code",
490
+ "execution_count": 22,
491
+ "id": "68e09efd",
492
+ "metadata": {},
493
+ "outputs": [
494
+ {
495
+ "name": "stderr",
496
+ "output_type": "stream",
497
+ "text": [
498
+ "2023-12-16 15:43:45,081 [INFO] UniLogger: UniLoader.loads: .yaml LOADED from \"/tmp/tmpsszr87yd.yaml\" in 0.04s\n"
499
+ ]
500
+ },
501
+ {
502
+ "name": "stdout",
503
+ "output_type": "stream",
504
+ "text": [
505
+ "sys_vram: 80 GB \n",
506
+ "train_batch_size: 24 \n",
507
+ "train_image_count: 26655 \n",
508
+ "_it_per_epoch: 1110\n",
509
+ "_warmup_step_count: 333\n",
510
+ "_cycle_step_count: 11100\n",
511
+ "\n",
512
+ "Configuration saved to [./config_nd_qft_twitter_aes_trained-best-26k-of-798k_v1.yaml]\n"
513
+ ]
514
+ }
515
+ ],
516
+ "source": [
517
+ "regulars_dict_template = {\n",
518
+ " \"trainer\": {\n",
519
+ " \"model_path\": lambda: MODEL_PATH,\n",
520
+ " \"checkpoint_dir\": lambda: CHECKPOINT_DIR,\n",
521
+ " \"offset_noise\": True,\n",
522
+ " \"offset_noise_val\": lambda: OFFSET_NOISE_VAL,\n",
523
+ " \"checkpoint_steps\": lambda: SAVE_INTERVAL_STEPS,\n",
524
+ " \"checkpoint_freq\": lambda: SAVE_INTERVAL_EPOCH,\n",
525
+ " },\n",
526
+ " \"dataset\": {\n",
527
+ " \"ucg\": lambda: UCG,\n",
528
+ " \"img_path\": lambda: [DATASET_DIR],\n",
529
+ " },\n",
530
+ " \"sampling\": {\n",
531
+ " \"every_n_steps\": lambda: SAVE_INTERVAL_STEPS,\n",
532
+ " \"every_n_epochs\": lambda: SAVE_INTERVAL_EPOCH,\n",
533
+ " },\n",
534
+ "}\n",
535
+ "\n",
536
+ "def get_regulars_dict():\n",
537
+ " return evaluate_template_dict(regulars_dict_template)\n",
538
+ "\n",
539
+ "\n",
540
+ "CHECKPOINT_DIR = f\"checkpoint_{RUN_NAME}_v{CONFIG_VERSION}\"\n",
541
+ "\n",
542
+ "# sys_vram = get_vram_in_gb()\n",
543
+ "sys_vram = 80\n",
544
+ "train_batch_size = get_batch_size(TRAIN_OPTIMIZER, sys_vram)\n",
545
+ "train_image_count = get_train_image_count(DATASET_DIR)\n",
546
+ "config = unibox.loads(DEFAULT_CONFIG)\n",
547
+ "\n",
548
+ "if not config:\n",
549
+ " raise FileNotFoundError\n",
550
+ "\n",
551
+ "_it_per_epoch = math.floor(train_image_count / train_batch_size)\n",
552
+ "print(f\"sys_vram: {sys_vram} GB \\ntrain_batch_size: {train_batch_size} \\ntrain_image_count: {train_image_count} \\n_it_per_epoch: {_it_per_epoch}\")\n",
553
+ "\n",
554
+ "config = OmegaConf.merge(config, get_optimizer_dict(TRAIN_OPTIMIZER))\n",
555
+ "config = OmegaConf.merge(config, get_scheduler_dict(_it_per_epoch, EPOCH_PER_CYCLE, WARMUP_EPOCHS))\n",
556
+ "config = OmegaConf.merge(config, get_regulars_dict())\n",
557
+ "\n",
558
+ "\n",
559
+ "YAML_FOLDER = \"./\"\n",
560
+ "YAML_NAME = f\"config_nd_{RUN_NAME}_v{CONFIG_VERSION}.yaml\"\n",
561
+ "_yaml_path = os.path.join(YAML_FOLDER, YAML_NAME)\n",
562
+ "write_config_to_yaml(config, _yaml_path)"
563
+ ]
564
+ },
565
+ {
+ "cell_type": "markdown",
+ "id": "70859336-6ae3-4b55-a88d-3bc21a0e6a09",
+ "metadata": {},
+ "source": [
+ "## docker transformer engine"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "569387df-9e56-44f9-8001-1bd2d61ea8b5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# https://github.com/NVIDIA/TransformerEngine?tab=readme-ov-file#installation\n",
+ "# shell-escaped so the cell is valid Python; for a truly interactive session, run this from a terminal\n",
+ "!docker run --gpus all -it -v /home/ubuntu/datasets:/datasets -v /home/ubuntu/models:/models -v /home/ubuntu/ndtr:/ndtr --rm nvcr.io/nvidia/pytorch:23.10-py3"
+ ]
+ },
+ },
584
+ {
585
+ "cell_type": "code",
586
+ "execution_count": null,
587
+ "id": "9e4a5dad-0c9d-402b-9cf7-fa3fd08e4f37",
588
+ "metadata": {},
589
+ "outputs": [],
590
+ "source": [
591
+ "git config --global --add safe.directory /ndtr\n",
592
+ "wandb login 0025f0bc67dba1846edaf9c2425b288b23ae0f99"
593
+ ]
594
+ },
595
+ {
+ "cell_type": "markdown",
+ "id": "11eae193-980f-449b-ac8f-4976ca235da4",
+ "metadata": {},
+ "source": [
+ "## create txt if not exist"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "0d8cf284-cb15-4218-b68e-b99e72ef53cf",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
+ "\u001b[0m"
+ ]
+ }
+ ],
+ "source": [
+ "!pip install -q unibox"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "701cae45-da02-4ea7-81f3-9ee1c2f14d47",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " \r"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "{'metadata': {'len': 40022, 'item_type': 'str'},\n",
+ " 'preview': ['1604906847521017857_3.jpg',\n",
+ " '703970524313956352_1.jpg',\n",
+ " '1631451367620370434_1.jpg']}"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import os  # used by the txt-creation cell below\n",
+ "import unibox as ub\n",
+ "from tqdm.auto import tqdm\n",
+ "\n",
+ "# /home/ubuntu/datasets/twitter-aes_trained-best-167k-of-798k\"\n",
+ "TARGET_DIR = \"/notebooks/datasets/twitter-aes_trained-best-167k-of-798k\"\n",
+ "\n",
+ "# read\n",
+ "files_in_dir = ub.traverses(TARGET_DIR, relative_unix=True,\n",
+ "                            include_extensions=ub.constants.IMG_FILES)\n",
+ "ub.peeks(files_in_dir)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "79746897-03cc-4aa2-ae74-ac62ea00e389",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "7a3dfcb29c6640b3a7638fecb9b2a1e7",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ " 0%| | 0/40022 [00:00<?, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "ename": "KeyboardInterrupt",
+ "evalue": "",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn[11], line 12\u001b[0m\n\u001b[1;32m 10\u001b[0m os\u001b[38;5;241m.\u001b[39mmakedirs(full_subdir_path, exist_ok\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 11\u001b[0m txt_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(txt_root_dir, txt_file)\n\u001b[0;32m---> 12\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mtxt_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mw\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[1;32m 13\u001b[0m f\u001b[38;5;241m.\u001b[39mwrite(placeholder_txt_content)\n\u001b[1;32m 15\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFiles and directories created successfully.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
+ "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py:310\u001b[0m, in \u001b[0;36m_modified_open\u001b[0;34m(file, *args, **kwargs)\u001b[0m\n\u001b[1;32m 303\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m {\u001b[38;5;241m0\u001b[39m, \u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m2\u001b[39m}:\n\u001b[1;32m 304\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 305\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIPython won\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt let you open fd=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfile\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m by default \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 306\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mas it is likely to crash IPython. If you know what you are doing, \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 307\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124myou can use builtins\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m open.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 308\u001b[0m )\n\u001b[0;32m--> 310\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mio_open\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+ "File \u001b[0;32m/usr/lib/python3.10/codecs.py:186\u001b[0m, in \u001b[0;36mIncrementalEncoder.__init__\u001b[0;34m(self, errors)\u001b[0m\n\u001b[1;32m 180\u001b[0m \u001b[38;5;28;01mclass\u001b[39;00m \u001b[38;5;21;01mIncrementalEncoder\u001b[39;00m(\u001b[38;5;28mobject\u001b[39m):\n\u001b[1;32m 181\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 182\u001b[0m \u001b[38;5;124;03m An IncrementalEncoder encodes an input in multiple steps. The input can\u001b[39;00m\n\u001b[1;32m 183\u001b[0m \u001b[38;5;124;03m be passed piece by piece to the encode() method. The IncrementalEncoder\u001b[39;00m\n\u001b[1;32m 184\u001b[0m \u001b[38;5;124;03m remembers the state of the encoding process between calls to encode().\u001b[39;00m\n\u001b[1;32m 185\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 186\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, errors\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mstrict\u001b[39m\u001b[38;5;124m'\u001b[39m):\n\u001b[1;32m 187\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 188\u001b[0m \u001b[38;5;124;03m Creates an IncrementalEncoder instance.\u001b[39;00m\n\u001b[1;32m 189\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 192\u001b[0m \u001b[38;5;124;03m for a list of possible values.\u001b[39;00m\n\u001b[1;32m 193\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m 194\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39merrors \u001b[38;5;241m=\u001b[39m errors\n",
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
+ ]
+ }
+ ],
+ "source": [
+ "# create\n",
+ "txt_root_dir = TARGET_DIR\n",
+ "placeholder_txt_content = \"\"\n",
+ "\n",
+ "txt_files_todo = [os.path.splitext(file)[0] + '.txt' for file in files_in_dir]\n",
+ "os.makedirs(txt_root_dir, exist_ok=True)\n",
+ "for txt_file in tqdm(txt_files_todo):\n",
+ "    subdir = os.path.dirname(txt_file)\n",
+ "    full_subdir_path = os.path.join(txt_root_dir, subdir)\n",
+ "    os.makedirs(full_subdir_path, exist_ok=True)\n",
+ "    txt_path = os.path.join(txt_root_dir, txt_file)\n",
+ "    with open(txt_path, 'w') as f:\n",
+ "        f.write(placeholder_txt_content)\n",
+ "\n",
+ "print(\"Files and directories created successfully.\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "38e5d854-e66b-4644-8119-02051789bcde",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " \r"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "{'metadata': {'len': 40022, 'item_type': 'str'},\n",
+ " 'preview': ['1615643911099138048_1.txt',\n",
+ " '1587049940366204928_1.txt',\n",
+ " '1416561591043166211_2.txt']}"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# verify (use the ub alias imported above; bare `unibox` is not imported in this session)\n",
+ "files_in_dir = ub.traverses(TARGET_DIR, relative_unix=True, include_extensions=[\".txt\"])\n",
+ "ub.peeks(files_in_dir)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
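
Sanity check on the step counts printed by the run cells: they follow directly from the image count, batch size, and epoch settings. A minimal sketch of the same arithmetic, using only values taken from the 167k run's cell outputs above:

import math

train_image_count = 166110  # images found in DATASET_DIR
train_batch_size = 24       # adamw batch size at 80 GB VRAM
warmup_epochs = 0.3
epoch_per_cycle = 10

it_per_epoch = math.floor(train_image_count / train_batch_size)
warmup_steps = int(it_per_epoch * warmup_epochs)
cycle_steps = it_per_epoch * epoch_per_cycle

print(it_per_epoch, warmup_steps, cycle_steps)
# 6921 2076 69210 -- matches _it_per_epoch, _warmup_step_count and _cycle_step_count above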
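
The written YAML can also be loaded back with OmegaConf as a round-trip check. A sketch, assuming the config file produced by the 167k run sits in the working directory (the key paths follow from the merge calls in the notebook):

from omegaconf import OmegaConf

cfg = OmegaConf.load("config_nd_qft_twitter_aes_167k-of-798k_v1.yaml")
assert cfg.optimizer.name == "torch.optim.AdamW"
assert cfg.scheduler.params.num_warmup_steps == 2076
assert cfg.scheduler.params.num_training_steps == 69210
print(OmegaConf.to_yaml(cfg.trainer))  # model_path, checkpoint_dir, offset_noise, ...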