trojblue committed
Commit 106188f
1 parent: ce70ea9

Upload nd_param_calculator_latest.ipynb

Files changed (1)
  1. nd_param_calculator_latest.ipynb +776 -0
nd_param_calculator_latest.ipynb ADDED
@@ -0,0 +1,776 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "82d89348-fb09-462d-b78f-f1dab3447c3d",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# !pip install -U unibox omegaconf -q\n",
+ "import copy\n",
+ "import unibox\n",
+ "import subprocess\n",
+ "import re\n",
+ "import os\n",
+ "import math\n",
+ "from omegaconf import OmegaConf\n",
+ "\n",
+ "# default values for different optimizers\n",
+ "optimizer_dict = {\n",
+ "    \"prodigy\": {\n",
+ "        \"name\": \"prodigyopt.Prodigy\",\n",
+ "        \"params\": {\n",
+ "            \"lr\": 1,\n",
+ "            \"d_coef\": 2,\n",
+ "            \"d0\": 1e-6,\n",
+ "            \"safeguard_warmup\": True,\n",
+ "            \"use_bias_correction\": True,\n",
+ "            \"weight_decay\": 1e-2,\n",
+ "            \"eps\": 1e-8,\n",
+ "        }\n",
+ "    },\n",
+ "    \"adamw\": {\n",
+ "        \"name\": \"torch.optim.AdamW\",\n",
+ "        \"params\": {\n",
+ "            \"lr\": 3e-5,\n",
+ "            \"weight_decay\": 1e-2,\n",
+ "        },\n",
+ "    },\n",
+ "}\n",
+ "\n",
+ "# default scheduler dict\n",
+ "default_scheduler_dict = {\n",
+ "    \"scheduler\": {\n",
+ "        \"name\": \"transformers.get_cosine_schedule_with_warmup\",\n",
+ "        \"params\": {\n",
+ "            \"num_warmup_steps\": 0,\n",
+ "            \"num_training_steps\": 1000,\n",
+ "            \"last_epoch\": -1,\n",
+ "        }\n",
+ "    }\n",
+ "}\n",
+ "\n",
+ "# assuming training on 1024x1024 resolution\n",
+ "default_batch_size_dict = {\n",
+ "    \"prodigy\": {\n",
+ "        80: 8,  # For 80 GB VRAM, batch size is 8\n",
+ "        20: 1,\n",
+ "    },\n",
+ "    \"adamw\": {\n",
+ "        80: 24,\n",
+ "    },\n",
+ "    \"lion\": {\n",
+ "        78: 48,\n",
+ "    },\n",
+ "}\n",
+ "\n",
+ "\n",
+ "def get_vram_in_gb():\n",
+ "    \"\"\"Returns the total GPU memory in GB.\"\"\"\n",
+ "    try:\n",
+ "        # Running the command 'nvidia-smi' and capturing its output\n",
+ "        output = subprocess.check_output(['nvidia-smi'], text=True)\n",
+ "\n",
+ "        # Regular expression to find the memory part\n",
+ "        mem_regex = re.compile(r'\\|\\s+\\d+MiB / (\\d+)MiB\\s+\\|')\n",
+ "        match = mem_regex.search(output)\n",
+ "        if match:\n",
+ "            total_memory_mib = int(match.group(1))\n",
+ "            # Converting MiB to GiB (1 GiB = 1024 MiB) and rounding to 2 decimal places\n",
+ "            total_memory_gb = round(total_memory_mib / 1024, 2)\n",
+ "            return total_memory_gb\n",
+ "        else:\n",
+ "            raise ValueError(\"Could not parse total memory from nvidia-smi output.\")\n",
+ "    except Exception as e:\n",
+ "        # raise instead of returning an error string, so callers never see a non-numeric value\n",
+ "        raise RuntimeError(f\"Could not read VRAM via nvidia-smi: {e}\") from e\n",
+ "\n",
+ "\n",
+ "def get_batch_size(optimizer: str, vram: int) -> int:\n",
+ "    # allocate batch size based on vram, assuming training on 1024x1024 resolution\n",
+ "    _bs_dict = default_batch_size_dict\n",
+ "\n",
+ "    if optimizer in _bs_dict:\n",
+ "        # Find the closest lower VRAM value that we have a batch size for\n",
+ "        closest_vram = max(vram_key for vram_key in _bs_dict[optimizer] if vram_key <= vram)\n",
+ "        return _bs_dict[optimizer][closest_vram]\n",
+ "    else:\n",
+ "        raise ValueError(f\"Optimizer '{optimizer}' not supported.\")\n",
+ "\n",
+ "\n",
+ "def get_train_image_count(dataset_dir: str) -> int:\n",
+ "    # count images under the given directory (was a bug: used the global DATASET_DIR)\n",
+ "    files = unibox.traverses(dataset_dir, include_extensions=unibox.constants.IMG_FILES)\n",
+ "    return len(files)\n",
+ "\n",
+ "\n",
+ "def get_scheduler_dict(it_per_epoch: int, epoch_per_cycle: int, warmup_epochs: float):\n",
+ "    _warmup_step_count = int(it_per_epoch * warmup_epochs)\n",
+ "    print(f\"_warmup_step_count: {_warmup_step_count}\")\n",
+ "\n",
+ "    _cycle_step_count = it_per_epoch * epoch_per_cycle\n",
+ "    print(f\"_cycle_step_count: {_cycle_step_count}\")\n",
+ "\n",
+ "    # deepcopy: a shallow .copy() would mutate the shared default dict's nested params\n",
+ "    scheduler_dict = copy.deepcopy(default_scheduler_dict)\n",
+ "    scheduler_dict[\"scheduler\"][\"params\"][\"num_training_steps\"] = _cycle_step_count\n",
+ "    scheduler_dict[\"scheduler\"][\"params\"][\"num_warmup_steps\"] = _warmup_step_count\n",
+ "    return scheduler_dict\n",
+ "\n",
+ "\n",
+ "def evaluate_template_dict(template_dict):\n",
+ "    # generate a filled dictionary from a template\n",
+ "    new_dict = {}\n",
+ "    for key, value in template_dict.items():\n",
+ "        if isinstance(value, dict):\n",
+ "            new_dict[key] = evaluate_template_dict(value)\n",
+ "        elif callable(value):\n",
+ "            new_dict[key] = value()\n",
+ "        else:\n",
+ "            new_dict[key] = value\n",
+ "    return new_dict\n",
+ "\n",
+ "\n",
+ "def write_config_to_yaml(config_dict, yaml_path):\n",
+ "    yaml_config = OmegaConf.to_yaml(config_dict)\n",
+ "\n",
+ "    # Splitting the YAML string into lines\n",
+ "    lines = yaml_config.split('\\n')\n",
+ "\n",
+ "    # Iterating through the lines and adding an empty line before each major section\n",
+ "    formatted_lines = []\n",
+ "    for line in lines:\n",
+ "        if line.startswith(' ') or line == '':\n",
+ "            # It's a subline or already an empty line, just add it\n",
+ "            formatted_lines.append(line)\n",
+ "        else:\n",
+ "            # It's a new major section, add an empty line before it (if it's not the first line)\n",
+ "            if formatted_lines:\n",
+ "                formatted_lines.append('')\n",
+ "            formatted_lines.append(line)\n",
+ "\n",
+ "    # Joining the lines back into a single string\n",
+ "    formatted_yaml_config = '\\n'.join(formatted_lines)\n",
+ "\n",
+ "    # Write the formatted YAML string to a file\n",
+ "    with open(yaml_path, 'w') as file:\n",
+ "        file.write(formatted_yaml_config)\n",
+ "\n",
+ "    print()\n",
+ "    print(f\"Configuration saved to [{yaml_path}]\")\n",
+ "\n",
+ "\n",
+ "def get_optimizer_dict(optimizer: str):\n",
+ "    return_dict = {\n",
+ "        \"optimizer\": optimizer_dict[optimizer],\n",
+ "    }\n",
+ "\n",
+ "    return return_dict"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "12a8d495-565e-455b-a8ee-fcaf09b199a5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "DEFAULT_CONFIG = \"https://huggingface.co/kiriyamaX/nd-configs/resolve/main/nd_config_template_sdxl_80g.yaml\"\n",
+ "\n",
+ "# ============= CONFIGS =============\n",
+ "\n",
+ "# IMPORTANT\n",
+ "CONFIG_VERSION = 1\n",
+ "RUN_NAME = \"qft_twitter_aes_167k-of-798k\"\n",
+ "DATASET_DIR = \"../datasets/twitter-aes_trained-best-167k-of-798k\"\n",
+ "# MODEL_PATH = \"../models/playground-v2-1024px-aesthetic.safetensors\"\n",
+ "MODEL_PATH = \"../models/fd5me9.ckpt\"\n",
+ "\n",
+ "# ===================================\n",
+ "\n",
+ "# hyperparams\n",
+ "OFFSET_NOISE_VAL = 0.12\n",
+ "UCG = 0.1\n",
+ "\n",
+ "# optimizer\n",
+ "TRAIN_OPTIMIZER = \"adamw\"\n",
+ "WARMUP_EPOCHS = 0.3\n",
+ "EPOCH_PER_CYCLE = 10\n",
+ "\n",
+ "# saving\n",
+ "SAVE_INTERVAL_EPOCH = 1\n",
+ "SAVE_INTERVAL_STEPS = -1\n",
+ "# ==================================="
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "c4089d9b",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2023-12-18 07:51:41,207 [INFO] UniLogger: UniLoader.loads: .yaml LOADED from \"/tmp/tmptm3kzw5a.yaml\" in 0.04s\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "sys_vram: 80 GB \n",
+ "train_batch_size: 24 \n",
+ "train_image_count: 166110 \n",
+ "_it_per_epoch: 6921\n",
+ "_warmup_step_count: 2076\n",
+ "_cycle_step_count: 69210\n",
+ "\n",
+ "Configuration saved to [./config_nd_qft_twitter_aes_167k-of-798k_v1.yaml]\n"
+ ]
+ }
+ ],
+ "source": [
+ "regulars_dict_template = {\n",
+ "    \"trainer\": {\n",
+ "        \"model_path\": lambda: MODEL_PATH,\n",
+ "        \"checkpoint_dir\": lambda: CHECKPOINT_DIR,\n",
+ "        \"offset_noise\": True,\n",
+ "        \"offset_noise_val\": lambda: OFFSET_NOISE_VAL,\n",
+ "        \"checkpoint_steps\": lambda: SAVE_INTERVAL_STEPS,\n",
+ "        \"checkpoint_freq\": lambda: SAVE_INTERVAL_EPOCH,\n",
+ "    },\n",
+ "    \"dataset\": {\n",
+ "        \"ucg\": lambda: UCG,\n",
+ "        \"img_path\": lambda: [DATASET_DIR],\n",
+ "    },\n",
+ "    \"sampling\": {\n",
+ "        \"every_n_steps\": lambda: SAVE_INTERVAL_STEPS,\n",
+ "        \"every_n_epochs\": lambda: SAVE_INTERVAL_EPOCH,\n",
+ "    },\n",
+ "}\n",
+ "\n",
+ "def get_regulars_dict():\n",
+ "    return evaluate_template_dict(regulars_dict_template)\n",
+ "\n",
+ "\n",
+ "CHECKPOINT_DIR = f\"checkpoint_{RUN_NAME}_v{CONFIG_VERSION}\"\n",
+ "\n",
+ "# sys_vram = get_vram_in_gb()\n",
+ "sys_vram = 80\n",
+ "train_batch_size = get_batch_size(TRAIN_OPTIMIZER, sys_vram)\n",
+ "train_image_count = get_train_image_count(DATASET_DIR)\n",
+ "config = unibox.loads(DEFAULT_CONFIG)\n",
+ "\n",
+ "if not config:\n",
+ "    raise FileNotFoundError(f\"Could not load config template from {DEFAULT_CONFIG}\")\n",
+ "\n",
+ "_it_per_epoch = math.floor(train_image_count / train_batch_size)\n",
+ "print(f\"sys_vram: {sys_vram} GB \\ntrain_batch_size: {train_batch_size} \\ntrain_image_count: {train_image_count} \\n_it_per_epoch: {_it_per_epoch}\")\n",
+ "\n",
+ "config = OmegaConf.merge(config, get_optimizer_dict(TRAIN_OPTIMIZER))\n",
+ "config = OmegaConf.merge(config, get_scheduler_dict(_it_per_epoch, EPOCH_PER_CYCLE, WARMUP_EPOCHS))\n",
+ "config = OmegaConf.merge(config, get_regulars_dict())\n",
+ "\n",
+ "\n",
+ "YAML_FOLDER = \"./\"\n",
+ "YAML_NAME = f\"config_nd_{RUN_NAME}_v{CONFIG_VERSION}.yaml\"\n",
+ "_yaml_path = os.path.join(YAML_FOLDER, YAML_NAME)\n",
+ "write_config_to_yaml(config, _yaml_path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "fe376cc7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# !pip install -U unibox omegaconf -q\n",
+ "import copy\n",
+ "import unibox\n",
+ "import subprocess\n",
+ "import re\n",
+ "import os\n",
+ "import math\n",
+ "from omegaconf import OmegaConf\n",
+ "\n",
+ "# default values for different optimizers\n",
+ "optimizer_dict = {\n",
+ "    \"prodigy\": {\n",
+ "        \"name\": \"prodigyopt.Prodigy\",\n",
+ "        \"params\": {\n",
+ "            \"lr\": 1,\n",
+ "            \"d_coef\": 2,\n",
+ "            \"d0\": 1e-6,\n",
+ "            \"safeguard_warmup\": True,\n",
+ "            \"use_bias_correction\": True,\n",
+ "            \"weight_decay\": 1e-2,\n",
+ "            \"eps\": 1e-8,\n",
+ "        }\n",
+ "    },\n",
+ "    \"adamw\": {\n",
+ "        \"name\": \"torch.optim.AdamW\",\n",
+ "        \"params\": {\n",
+ "            \"lr\": 3e-5,\n",
+ "            \"weight_decay\": 1e-2,\n",
+ "        },\n",
+ "    },\n",
+ "}\n",
+ "\n",
+ "# default scheduler dict\n",
+ "default_scheduler_dict = {\n",
+ "    \"scheduler\": {\n",
+ "        \"name\": \"transformers.get_cosine_schedule_with_warmup\",\n",
+ "        \"params\": {\n",
+ "            \"num_warmup_steps\": 0,\n",
+ "            \"num_training_steps\": 1000,\n",
+ "            \"last_epoch\": -1,\n",
+ "        }\n",
+ "    }\n",
+ "}\n",
+ "\n",
+ "# assuming training on 1024x1024 resolution\n",
+ "default_batch_size_dict = {\n",
+ "    \"prodigy\": {\n",
+ "        80: 8,  # For 80 GB VRAM, batch size is 8\n",
+ "        20: 1,\n",
+ "    },\n",
+ "    \"adamw\": {\n",
+ "        80: 24,\n",
+ "    },\n",
+ "    \"lion\": {\n",
+ "        78: 48,\n",
+ "    },\n",
+ "}\n",
+ "\n",
+ "\n",
+ "def get_vram_in_gb():\n",
+ "    \"\"\"Returns the total GPU memory in GB.\"\"\"\n",
+ "    try:\n",
+ "        # Running the command 'nvidia-smi' and capturing its output\n",
+ "        output = subprocess.check_output(['nvidia-smi'], text=True)\n",
+ "\n",
+ "        # Regular expression to find the memory part\n",
+ "        mem_regex = re.compile(r'\\|\\s+\\d+MiB / (\\d+)MiB\\s+\\|')\n",
+ "        match = mem_regex.search(output)\n",
+ "        if match:\n",
+ "            total_memory_mib = int(match.group(1))\n",
+ "            # Converting MiB to GiB (1 GiB = 1024 MiB) and rounding to 2 decimal places\n",
+ "            total_memory_gb = round(total_memory_mib / 1024, 2)\n",
+ "            return total_memory_gb\n",
+ "        else:\n",
+ "            raise ValueError(\"Could not parse total memory from nvidia-smi output.\")\n",
+ "    except Exception as e:\n",
+ "        # raise instead of returning an error string, so callers never see a non-numeric value\n",
+ "        raise RuntimeError(f\"Could not read VRAM via nvidia-smi: {e}\") from e\n",
+ "\n",
+ "\n",
+ "def get_batch_size(optimizer: str, vram: int) -> int:\n",
+ "    # allocate batch size based on vram, assuming training on 1024x1024 resolution\n",
+ "    _bs_dict = default_batch_size_dict\n",
+ "\n",
+ "    if optimizer in _bs_dict:\n",
+ "        # Find the closest lower VRAM value that we have a batch size for\n",
+ "        closest_vram = max(vram_key for vram_key in _bs_dict[optimizer] if vram_key <= vram)\n",
+ "        return _bs_dict[optimizer][closest_vram]\n",
+ "    else:\n",
+ "        raise ValueError(f\"Optimizer '{optimizer}' not supported.\")\n",
+ "\n",
+ "\n",
+ "def get_train_image_count(dataset_dir: str) -> int:\n",
+ "    # count images under the given directory (was a bug: used the global DATASET_DIR)\n",
+ "    files = unibox.traverses(dataset_dir, include_extensions=unibox.constants.IMG_FILES)\n",
+ "    return len(files)\n",
+ "\n",
+ "\n",
+ "def get_scheduler_dict(it_per_epoch: int, epoch_per_cycle: int, warmup_epochs: float):\n",
+ "    _warmup_step_count = int(it_per_epoch * warmup_epochs)\n",
+ "    print(f\"_warmup_step_count: {_warmup_step_count}\")\n",
+ "\n",
+ "    _cycle_step_count = it_per_epoch * epoch_per_cycle\n",
+ "    print(f\"_cycle_step_count: {_cycle_step_count}\")\n",
+ "\n",
+ "    # deepcopy: a shallow .copy() would mutate the shared default dict's nested params\n",
+ "    scheduler_dict = copy.deepcopy(default_scheduler_dict)\n",
+ "    scheduler_dict[\"scheduler\"][\"params\"][\"num_training_steps\"] = _cycle_step_count\n",
+ "    scheduler_dict[\"scheduler\"][\"params\"][\"num_warmup_steps\"] = _warmup_step_count\n",
+ "    return scheduler_dict\n",
+ "\n",
+ "\n",
+ "def evaluate_template_dict(template_dict):\n",
+ "    # generate a filled dictionary from a template\n",
+ "    new_dict = {}\n",
+ "    for key, value in template_dict.items():\n",
+ "        if isinstance(value, dict):\n",
+ "            new_dict[key] = evaluate_template_dict(value)\n",
+ "        elif callable(value):\n",
+ "            new_dict[key] = value()\n",
+ "        else:\n",
+ "            new_dict[key] = value\n",
+ "    return new_dict\n",
+ "\n",
+ "\n",
+ "def write_config_to_yaml(config_dict, yaml_path):\n",
+ "    yaml_config = OmegaConf.to_yaml(config_dict)\n",
+ "\n",
+ "    # Splitting the YAML string into lines\n",
+ "    lines = yaml_config.split('\\n')\n",
+ "\n",
+ "    # Iterating through the lines and adding an empty line before each major section\n",
+ "    formatted_lines = []\n",
+ "    for line in lines:\n",
+ "        if line.startswith(' ') or line == '':\n",
+ "            # It's a subline or already an empty line, just add it\n",
+ "            formatted_lines.append(line)\n",
+ "        else:\n",
+ "            # It's a new major section, add an empty line before it (if it's not the first line)\n",
+ "            if formatted_lines:\n",
+ "                formatted_lines.append('')\n",
+ "            formatted_lines.append(line)\n",
+ "\n",
+ "    # Joining the lines back into a single string\n",
+ "    formatted_yaml_config = '\\n'.join(formatted_lines)\n",
+ "\n",
+ "    # Write the formatted YAML string to a file\n",
+ "    with open(yaml_path, 'w') as file:\n",
+ "        file.write(formatted_yaml_config)\n",
+ "\n",
+ "    print()\n",
+ "    print(f\"Configuration saved to [{yaml_path}]\")\n",
+ "\n",
+ "\n",
+ "def get_optimizer_dict(optimizer: str):\n",
+ "    return_dict = {\n",
+ "        \"optimizer\": optimizer_dict[optimizer],\n",
+ "    }\n",
+ "\n",
+ "    return return_dict"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "33db0266",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "DEFAULT_CONFIG = \"https://huggingface.co/kiriyamaX/nd-configs/resolve/main/nd_config_template_sdxl_80g.yaml\"\n",
+ "\n",
+ "# ============= CONFIGS =============\n",
+ "\n",
+ "# IMPORTANT\n",
+ "CONFIG_VERSION = 1\n",
+ "RUN_NAME = \"qft_twitter_aes_trained-best-26k-of-798k\"\n",
+ "DATASET_DIR = \"../datasets/twitter-aes_trained-best-26k-of-798k\"\n",
+ "# MODEL_PATH = \"../models/playground-v2-1024px-aesthetic.safetensors\"\n",
+ "MODEL_PATH = \"../models/fd5me9.ckpt\"\n",
+ "\n",
+ "# ===================================\n",
+ "\n",
+ "# hyperparams\n",
+ "OFFSET_NOISE_VAL = 0.1\n",
+ "UCG = 0.1\n",
+ "\n",
+ "# optimizer\n",
+ "TRAIN_OPTIMIZER = \"adamw\"\n",
+ "WARMUP_EPOCHS = 0.3\n",
+ "EPOCH_PER_CYCLE = 10\n",
+ "\n",
+ "# saving\n",
+ "SAVE_INTERVAL_EPOCH = 1\n",
+ "SAVE_INTERVAL_STEPS = -1\n",
+ "# ==================================="
+ ]
+ },
+ },
488
+ {
489
+ "cell_type": "code",
490
+ "execution_count": 22,
491
+ "id": "68e09efd",
492
+ "metadata": {},
493
+ "outputs": [
494
+ {
495
+ "name": "stderr",
496
+ "output_type": "stream",
497
+ "text": [
498
+ "2023-12-16 15:43:45,081 [INFO] UniLogger: UniLoader.loads: .yaml LOADED from \"/tmp/tmpsszr87yd.yaml\" in 0.04s\n"
499
+ ]
500
+ },
501
+ {
502
+ "name": "stdout",
503
+ "output_type": "stream",
504
+ "text": [
505
+ "sys_vram: 80 GB \n",
506
+ "train_batch_size: 24 \n",
507
+ "train_image_count: 26655 \n",
508
+ "_it_per_epoch: 1110\n",
509
+ "_warmup_step_count: 333\n",
510
+ "_cycle_step_count: 11100\n",
511
+ "\n",
512
+ "Configuration saved to [./config_nd_qft_twitter_aes_trained-best-26k-of-798k_v1.yaml]\n"
513
+ ]
514
+ }
515
+ ],
516
+ "source": [
517
+ "regulars_dict_template = {\n",
518
+ " \"trainer\": {\n",
519
+ " \"model_path\": lambda: MODEL_PATH,\n",
520
+ " \"checkpoint_dir\": lambda: CHECKPOINT_DIR,\n",
521
+ " \"offset_noise\": True,\n",
522
+ " \"offset_noise_val\": lambda: OFFSET_NOISE_VAL,\n",
523
+ " \"checkpoint_steps\": lambda: SAVE_INTERVAL_STEPS,\n",
524
+ " \"checkpoint_freq\": lambda: SAVE_INTERVAL_EPOCH,\n",
525
+ " },\n",
526
+ " \"dataset\": {\n",
527
+ " \"ucg\": lambda: UCG,\n",
528
+ " \"img_path\": lambda: [DATASET_DIR],\n",
529
+ " },\n",
530
+ " \"sampling\": {\n",
531
+ " \"every_n_steps\": lambda: SAVE_INTERVAL_STEPS,\n",
532
+ " \"every_n_epochs\": lambda: SAVE_INTERVAL_EPOCH,\n",
533
+ " },\n",
534
+ "}\n",
535
+ "\n",
536
+ "def get_regulars_dict():\n",
537
+ " return evaluate_template_dict(regulars_dict_template)\n",
538
+ "\n",
539
+ "\n",
540
+ "CHECKPOINT_DIR = f\"checkpoint_{RUN_NAME}_v{CONFIG_VERSION}\"\n",
541
+ "\n",
542
+ "# sys_vram = get_vram_in_gb()\n",
543
+ "sys_vram = 80\n",
544
+ "train_batch_size = get_batch_size(TRAIN_OPTIMIZER, sys_vram)\n",
545
+ "train_image_count = get_train_image_count(DATASET_DIR)\n",
546
+ "config = unibox.loads(DEFAULT_CONFIG)\n",
547
+ "\n",
548
+ "if not config:\n",
549
+ " raise FileNotFoundError\n",
550
+ "\n",
551
+ "_it_per_epoch = math.floor(train_image_count / train_batch_size)\n",
552
+ "print(f\"sys_vram: {sys_vram} GB \\ntrain_batch_size: {train_batch_size} \\ntrain_image_count: {train_image_count} \\n_it_per_epoch: {_it_per_epoch}\")\n",
553
+ "\n",
554
+ "config = OmegaConf.merge(config, get_optimizer_dict(TRAIN_OPTIMIZER))\n",
555
+ "config = OmegaConf.merge(config, get_scheduler_dict(_it_per_epoch, EPOCH_PER_CYCLE, WARMUP_EPOCHS))\n",
556
+ "config = OmegaConf.merge(config, get_regulars_dict())\n",
557
+ "\n",
558
+ "\n",
559
+ "YAML_FOLDER = \"./\"\n",
560
+ "YAML_NAME = f\"config_nd_{RUN_NAME}_v{CONFIG_VERSION}.yaml\"\n",
561
+ "_yaml_path = os.path.join(YAML_FOLDER, YAML_NAME)\n",
562
+ "write_config_to_yaml(config, _yaml_path)"
563
+ ]
564
+ },
565
+ {
+ "cell_type": "markdown",
+ "id": "70859336-6ae3-4b55-a88d-3bc21a0e6a09",
+ "metadata": {},
+ "source": [
+ "## docker transformer engine"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "569387df-9e56-44f9-8001-1bd2d61ea8b5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# https://github.com/NVIDIA/TransformerEngine?tab=readme-ov-file#installation\n",
+ "# shell-escaped so the cell is valid Python; for a truly interactive session, run this from a terminal\n",
+ "!docker run --gpus all -it -v /home/ubuntu/datasets:/datasets -v /home/ubuntu/models:/models -v /home/ubuntu/ndtr:/ndtr --rm nvcr.io/nvidia/pytorch:23.10-py3"
+ ]
+ },
+ },
584
+ {
585
+ "cell_type": "code",
586
+ "execution_count": null,
587
+ "id": "9e4a5dad-0c9d-402b-9cf7-fa3fd08e4f37",
588
+ "metadata": {},
589
+ "outputs": [],
590
+ "source": [
591
+ "git config --global --add safe.directory /ndtr\n",
592
+ "wandb login 0025f0bc67dba1846edaf9c2425b288b23ae0f99"
593
+ ]
594
+ },
595
+ {
+ "cell_type": "markdown",
+ "id": "11eae193-980f-449b-ac8f-4976ca235da4",
+ "metadata": {},
+ "source": [
+ "## create txt if not exist"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "0d8cf284-cb15-4218-b68e-b99e72ef53cf",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
+ "\u001b[0m"
+ ]
+ }
+ ],
+ "source": [
+ "!pip install -q unibox"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "701cae45-da02-4ea7-81f3-9ee1c2f14d47",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " \r"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "{'metadata': {'len': 40022, 'item_type': 'str'},\n",
+ " 'preview': ['1604906847521017857_3.jpg',\n",
+ " '703970524313956352_1.jpg',\n",
+ " '1631451367620370434_1.jpg']}"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import os  # used by the txt-creation cell below\n",
+ "import unibox as ub\n",
+ "from tqdm.auto import tqdm\n",
+ "\n",
+ "# /home/ubuntu/datasets/twitter-aes_trained-best-167k-of-798k\"\n",
+ "TARGET_DIR = \"/notebooks/datasets/twitter-aes_trained-best-167k-of-798k\"\n",
+ "\n",
+ "# read\n",
+ "files_in_dir = ub.traverses(TARGET_DIR, relative_unix=True,\n",
+ "                            include_extensions=ub.constants.IMG_FILES)\n",
+ "ub.peeks(files_in_dir)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "79746897-03cc-4aa2-ae74-ac62ea00e389",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "7a3dfcb29c6640b3a7638fecb9b2a1e7",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ " 0%| | 0/40022 [00:00<?, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "ename": "KeyboardInterrupt",
+ "evalue": "",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn[11], line 12\u001b[0m\n\u001b[1;32m 10\u001b[0m os\u001b[38;5;241m.\u001b[39mmakedirs(full_subdir_path, exist_ok\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 11\u001b[0m txt_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(txt_root_dir, txt_file)\n\u001b[0;32m---> 12\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mtxt_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mw\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[1;32m 13\u001b[0m f\u001b[38;5;241m.\u001b[39mwrite(placeholder_txt_content)\n\u001b[1;32m 15\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFiles and directories created successfully.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
+ "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py:310\u001b[0m, in \u001b[0;36m_modified_open\u001b[0;34m(file, *args, **kwargs)\u001b[0m\n\u001b[1;32m 303\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m {\u001b[38;5;241m0\u001b[39m, \u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m2\u001b[39m}:\n\u001b[1;32m 304\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 305\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIPython won\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt let you open fd=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfile\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m by default \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 306\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mas it is likely to crash IPython. If you know what you are doing, \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 307\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124myou can use builtins\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m open.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 308\u001b[0m )\n\u001b[0;32m--> 310\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mio_open\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+ "File \u001b[0;32m/usr/lib/python3.10/codecs.py:186\u001b[0m, in \u001b[0;36mIncrementalEncoder.__init__\u001b[0;34m(self, errors)\u001b[0m\n\u001b[1;32m 180\u001b[0m \u001b[38;5;28;01mclass\u001b[39;00m \u001b[38;5;21;01mIncrementalEncoder\u001b[39;00m(\u001b[38;5;28mobject\u001b[39m):\n\u001b[1;32m 181\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 182\u001b[0m \u001b[38;5;124;03m An IncrementalEncoder encodes an input in multiple steps. The input can\u001b[39;00m\n\u001b[1;32m 183\u001b[0m \u001b[38;5;124;03m be passed piece by piece to the encode() method. The IncrementalEncoder\u001b[39;00m\n\u001b[1;32m 184\u001b[0m \u001b[38;5;124;03m remembers the state of the encoding process between calls to encode().\u001b[39;00m\n\u001b[1;32m 185\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 186\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, errors\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mstrict\u001b[39m\u001b[38;5;124m'\u001b[39m):\n\u001b[1;32m 187\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 188\u001b[0m \u001b[38;5;124;03m Creates an IncrementalEncoder instance.\u001b[39;00m\n\u001b[1;32m 189\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 192\u001b[0m \u001b[38;5;124;03m for a list of possible values.\u001b[39;00m\n\u001b[1;32m 193\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m 194\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39merrors \u001b[38;5;241m=\u001b[39m errors\n",
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
+ ]
+ }
+ ],
+ "source": [
+ "# create\n",
+ "txt_root_dir = TARGET_DIR\n",
+ "placeholder_txt_content = \"\"\n",
+ "\n",
+ "txt_files_todo = [os.path.splitext(file)[0] + '.txt' for file in files_in_dir]\n",
+ "os.makedirs(txt_root_dir, exist_ok=True)\n",
+ "for txt_file in tqdm(txt_files_todo):\n",
+ "    subdir = os.path.dirname(txt_file)\n",
+ "    full_subdir_path = os.path.join(txt_root_dir, subdir)\n",
+ "    os.makedirs(full_subdir_path, exist_ok=True)\n",
+ "    txt_path = os.path.join(txt_root_dir, txt_file)\n",
+ "    with open(txt_path, 'w') as f:\n",
+ "        f.write(placeholder_txt_content)\n",
+ "\n",
+ "print(\"Files and directories created successfully.\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "38e5d854-e66b-4644-8119-02051789bcde",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " \r"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "{'metadata': {'len': 40022, 'item_type': 'str'},\n",
+ " 'preview': ['1615643911099138048_1.txt',\n",
+ " '1587049940366204928_1.txt',\n",
+ " '1416561591043166211_2.txt']}"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# verify (use the ub alias imported above; bare `unibox` is not imported in this session)\n",
+ "files_in_dir = ub.traverses(TARGET_DIR, relative_unix=True, include_extensions=[\".txt\"])\n",
+ "ub.peeks(files_in_dir)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }
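
Sanity check on the step counts printed by the run cells: they follow directly from the image count, batch size, and epoch settings. A minimal sketch of the same arithmetic, using only values taken from the 167k run's cell outputs above:

import math

train_image_count = 166110  # images found in DATASET_DIR
train_batch_size = 24       # adamw batch size at 80 GB VRAM
warmup_epochs = 0.3
epoch_per_cycle = 10

it_per_epoch = math.floor(train_image_count / train_batch_size)
warmup_steps = int(it_per_epoch * warmup_epochs)
cycle_steps = it_per_epoch * epoch_per_cycle

print(it_per_epoch, warmup_steps, cycle_steps)
# 6921 2076 69210 -- matches _it_per_epoch, _warmup_step_count and _cycle_step_count above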
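
The written YAML can also be loaded back with OmegaConf as a round-trip check. A sketch, assuming the config file produced by the 167k run sits in the working directory (the key paths follow from the merge calls in the notebook):

from omegaconf import OmegaConf

cfg = OmegaConf.load("config_nd_qft_twitter_aes_167k-of-798k_v1.yaml")
assert cfg.optimizer.name == "torch.optim.AdamW"
assert cfg.scheduler.params.num_warmup_steps == 2076
assert cfg.scheduler.params.num_training_steps == 69210
print(OmegaConf.to_yaml(cfg.trainer))  # model_path, checkpoint_dir, offset_noise, ...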