In [7]:
# !pip install -U unibox omegaconf -q
import unibox
import subprocess
import re
import os
import math
from omegaconf import OmegaConf

# Per-optimizer defaults: dotted name of the optimizer plus the keyword
# arguments stored under "params".
optimizer_dict = {
    "prodigy": {
        "name": "prodigyopt.Prodigy",
        "params": {
            "lr": 1,
            "d_coef": 2,
            "d0": 1e-6,
            "safeguard_warmup": True,
            "use_bias_correction": True,
            "weight_decay": 1e-2,
            "eps": 1e-8,
        },
    },
    "adamw": {
        "name": "torch.optim.AdamW",
        "params": {
            "lr": 3e-5,
            "weight_decay": 1e-2,
        },
    },
}

# Scheduler defaults; get_scheduler_dict() fills in the two step counts.
default_scheduler_dict = {
    "scheduler": {
        "name": "transformers.get_cosine_schedule_with_warmup",
        "params": {
            "num_warmup_steps": 0,
            "num_training_steps": 1000,
            "last_epoch": -1,
        },
    },
}

# Batch size per optimizer, keyed by VRAM tier in GB
# (assumes training at 1024x1024 resolution).
default_batch_size_dict = {
    "prodigy": {
        80: 8,  # 80 GB VRAM -> batch size 8
        20: 1,
    },
    "adamw": {
        80: 24,
    },
    "lion": {
        78: 48,
    },
}


def get_vram_in_gb():
    """Return the total GPU memory reported by ``nvidia-smi``, in GiB.

    Runs ``nvidia-smi`` and reads the first ``<used>MiB / <total>MiB`` cell
    found in its table output.

    Returns:
        float: total memory in GiB, rounded to 2 decimal places.

    Raises:
        RuntimeError: if ``nvidia-smi`` cannot be executed or no memory cell
            can be parsed from its output. An error is raised rather than
            returning a message string, because a string return value would be
            mistaken for a VRAM figure by code that compares it to numbers.
    """
    try:
        # Running the command 'nvidia-smi' and capturing its output
        output = subprocess.check_output(['nvidia-smi'], text=True)
    except (OSError, subprocess.CalledProcessError) as e:
        raise RuntimeError(f"An error occurred: {e}") from e

    # Whitespace around the slash is optional so that padded table layouts
    # (e.g. "12MiB /  24576MiB") are still recognised.
    match = re.search(r'\|\s*\d+MiB\s*/\s*(\d+)MiB\s*\|', output)
    if match is None:
        raise RuntimeError("Could not parse total memory from nvidia-smi output.")

    # Converting MiB to GiB (1 GiB = 1024 MiB) and rounding to 2 decimal places
    total_memory_mib = int(match.group(1))
    return round(total_memory_mib / 1024, 2)


def get_batch_size(optimizer: str, vram: int) -> int:
    """Pick the batch size for ``optimizer`` given the available VRAM in GB.

    Looks up ``default_batch_size_dict[optimizer]`` (assumes training at
    1024x1024 resolution) and uses the largest configured VRAM tier that does
    not exceed ``vram``.

    Args:
        optimizer: key of ``default_batch_size_dict``.
        vram: available GPU memory in GB.

    Returns:
        The batch size configured for the selected VRAM tier.

    Raises:
        ValueError: if the optimizer has no entry, or if ``vram`` is smaller
            than every configured tier for that optimizer.
    """
    if optimizer not in default_batch_size_dict:
        raise ValueError(f"Optimizer '{optimizer}' not supported.")

    sizes_by_vram = default_batch_size_dict[optimizer]

    # Find the closest lower VRAM value that we have a batch size for
    eligible_tiers = [vram_key for vram_key in sizes_by_vram if vram_key <= vram]
    if not eligible_tiers:
        # Without this guard max() fails with an unhelpful
        # "max() arg is an empty sequence".
        raise ValueError(
            f"No batch size configured for optimizer '{optimizer}' with {vram} GB VRAM "
            f"(configured tiers: {sorted(sizes_by_vram)})."
        )
    return sizes_by_vram[max(eligible_tiers)]


def get_train_image_count(dataset_dir:str) -> int:
    """Count the image files found under ``dataset_dir``.

    Args:
        dataset_dir: directory handed to ``unibox.traverses``.

    Returns:
        Number of entries returned by ``unibox.traverses`` when restricted to
        the extensions in ``unibox.constants.IMG_FILES``.
    """
    # Use the argument, not the notebook-level DATASET_DIR, so the function
    # counts whichever directory the caller asks about.
    files = unibox.traverses(dataset_dir, include_extensions = unibox.constants.IMG_FILES)
    return len(files)


def get_scheduler_dict(it_per_epoch:int, epoch_per_cycle:int, warmup_epochs:float):
    """Build the scheduler config section for the given epoch geometry.

    Args:
        it_per_epoch: optimizer iterations in one epoch.
        epoch_per_cycle: number of epochs in one scheduler cycle.
        warmup_epochs: warmup length in epochs (may be fractional).

    Returns:
        A new dict shaped like ``default_scheduler_dict`` with
        ``num_training_steps`` and ``num_warmup_steps`` filled in. The
        module-level defaults are left untouched.
    """
    _warmup_step_count = int(it_per_epoch * warmup_epochs)
    print(f"_warmup_step_count: {_warmup_step_count}")

    _cycle_step_count = it_per_epoch * epoch_per_cycle
    print(f"_cycle_step_count: {_cycle_step_count}")

    # dict.copy() is shallow: writing into the nested "params" of a shallow copy
    # would silently rewrite default_scheduler_dict itself. Rebuild both nested
    # levels so the defaults stay pristine between calls.
    base = default_scheduler_dict["scheduler"]
    params = dict(base["params"])
    params["num_training_steps"] = _cycle_step_count
    params["num_warmup_steps"] = _warmup_step_count
    return {"scheduler": {**base, "params": params}}


def evaluate_template_dict(template_dict):
    """Return a filled-in copy of ``template_dict``.

    Nested dicts are processed recursively, callables are replaced by the
    value they return when called with no arguments, and every other value is
    carried over unchanged. Key order is preserved.
    """
    def _resolve(entry):
        if isinstance(entry, dict):
            return evaluate_template_dict(entry)
        return entry() if callable(entry) else entry

    return {name: _resolve(entry) for name, entry in template_dict.items()}


def write_config_to_yaml(config_dict, yaml_path):
    """Serialise ``config_dict`` with ``OmegaConf.to_yaml`` and write it to ``yaml_path``.

    A blank line is inserted in front of every line that starts at column 0
    (a new top-level section), except when it is the very first line. Indented
    lines and already-empty lines are written as they are. A confirmation
    message is printed after the file has been written.
    """
    rendered = OmegaConf.to_yaml(config_dict)

    spaced = []
    for row in rendered.split('\n'):
        starts_section = row != '' and not row.startswith(' ')
        # Separate top-level sections visually, but never lead with a blank line.
        if starts_section and spaced:
            spaced.append('')
        spaced.append(row)

    with open(yaml_path, 'w') as handle:
        handle.write('\n'.join(spaced))

    print()
    print(f"Configuration saved to [{yaml_path}]")


def get_optimizer_dict(optimizer:str):
    """Wrap the ``optimizer_dict`` entry for ``optimizer`` under an ``"optimizer"`` key.

    The returned value references the entry in ``optimizer_dict`` directly (no copy).
    An unknown name surfaces as the ``KeyError`` of the dictionary lookup.
    """
    return {"optimizer": optimizer_dict[optimizer]}

In [14]:
# Location of the base YAML template; the generated sections are merged onto it.
DEFAULT_CONFIG = "https://huggingface.co/kiriyamaX/nd-configs/resolve/main/nd_config_template_sdxl_80g.yaml"

# ============= CONFIGS =============

# IMPORTANT
# CONFIG_VERSION and RUN_NAME are embedded in the checkpoint directory name and
# in the file name of the generated YAML.
CONFIG_VERSION = 1
RUN_NAME = "qft_twitter_aes_167k-of-798k"
# Directory that is scanned for training images and written to dataset.img_path.
DATASET_DIR = "../datasets/twitter-aes_trained-best-167k-of-798k"
# MODEL_PATH = "../models/playground-v2-1024px-aesthetic.safetensors"
MODEL_PATH = "../models/fd5me9.ckpt" 

# ===================================

# hyperparams
OFFSET_NOISE_VAL = 0.12  # written to trainer.offset_noise_val
UCG = 0.1  # written to dataset.ucg

# optimizer
TRAIN_OPTIMIZER = "adamw"  # key into optimizer_dict and default_batch_size_dict
WARMUP_EPOCHS = 0.3  # warmup length in epochs (fractional values allowed)
EPOCH_PER_CYCLE = 10  # epochs per scheduler cycle; sets num_training_steps

# saving
# Both intervals are written to the trainer (checkpoint_*) and sampling (every_n_*) sections.
SAVE_INTERVAL_EPOCH = 1
SAVE_INTERVAL_STEPS = -1  # NOTE(review): presumably -1 disables step-based saving - confirm with the trainer
# ===================================

In [15]:
# Template for the run-specific config sections. Values wrapped in a lambda are
# looked up only when evaluate_template_dict() calls them, so the template can
# name globals (e.g. CHECKPOINT_DIR) that are assigned after this statement and
# always reflects the settings current at evaluation time.
regulars_dict_template = {
    "trainer": {
        "model_path": lambda: MODEL_PATH,
        "checkpoint_dir": lambda: CHECKPOINT_DIR,
        "offset_noise": True,
        "offset_noise_val": lambda: OFFSET_NOISE_VAL,
        "checkpoint_steps": lambda: SAVE_INTERVAL_STEPS,
        "checkpoint_freq": lambda: SAVE_INTERVAL_EPOCH,
    },
    "dataset": {
        "ucg": lambda: UCG,
        # single-element list holding the dataset directory
        "img_path": lambda: [DATASET_DIR],
    },
    "sampling": {
        # sampling uses the same intervals as checkpoint saving
        "every_n_steps": lambda: SAVE_INTERVAL_STEPS,
        "every_n_epochs": lambda: SAVE_INTERVAL_EPOCH,
    },
}

def get_regulars_dict():
    """Resolve ``regulars_dict_template`` into a plain dict using the current settings."""
    resolved = evaluate_template_dict(regulars_dict_template)
    return resolved


# Generate the training config for this run: derive batch size and step counts,
# layer the generated sections over the downloaded template, and save as YAML.

# Read lazily by the "checkpoint_dir" lambda in regulars_dict_template.
CHECKPOINT_DIR = f"checkpoint_{RUN_NAME}_v{CONFIG_VERSION}"

# VRAM is pinned to 80 GB here instead of being detected from the machine.
# sys_vram = get_vram_in_gb()
sys_vram = 80
train_batch_size = get_batch_size(TRAIN_OPTIMIZER, sys_vram)
train_image_count = get_train_image_count(DATASET_DIR)
config = unibox.loads(DEFAULT_CONFIG)

# A falsy result from unibox.loads is treated as "template could not be loaded".
if not config:
    raise FileNotFoundError

# floor: a trailing partial batch is not counted as an iteration.
_it_per_epoch = math.floor(train_image_count / train_batch_size)
print(f"sys_vram: {sys_vram} GB \ntrain_batch_size: {train_batch_size} \ntrain_image_count: {train_image_count} \n_it_per_epoch: {_it_per_epoch}")

# Merge the optimizer, scheduler and run-specific sections into the template, in that order.
config = OmegaConf.merge(config, get_optimizer_dict(TRAIN_OPTIMIZER))
config = OmegaConf.merge(config, get_scheduler_dict(_it_per_epoch, EPOCH_PER_CYCLE, WARMUP_EPOCHS))
config = OmegaConf.merge(config, get_regulars_dict())


# Output file name carries the run name and config version.
YAML_FOLDER = "./"
YAML_NAME = f"config_nd_{RUN_NAME}_v{CONFIG_VERSION}.yaml"
_yaml_path = os.path.join(YAML_FOLDER, YAML_NAME)
write_config_to_yaml(config, _yaml_path)

2023-12-18 07:51:41,207 [INFO] UniLogger: UniLoader.loads: .yaml LOADED from "/tmp/tmptm3kzw5a.yaml" in 0.04s


sys_vram: 80 GB 
train_batch_size: 24 
train_image_count: 166110 
_it_per_epoch: 6921
_warmup_step_count: 2076
_cycle_step_count: 69210

Configuration saved to [./config_nd_qft_twitter_aes_167k-of-798k_v1.yaml]


In [18]:
# !pip install -U unibox omegaconf -q
import unibox
import subprocess
import re
import os
import math
from omegaconf import OmegaConf

# Per-optimizer defaults: dotted name of the optimizer plus the keyword
# arguments stored under "params".
optimizer_dict = {
    "prodigy": {
        "name": "prodigyopt.Prodigy",
        "params": {
            "lr": 1,
            "d_coef": 2,
            "d0": 1e-6,
            "safeguard_warmup": True,
            "use_bias_correction": True,
            "weight_decay": 1e-2,
            "eps": 1e-8,
        },
    },
    "adamw": {
        "name": "torch.optim.AdamW",
        "params": {
            "lr": 3e-5,
            "weight_decay": 1e-2,
        },
    },
}

# Scheduler defaults; get_scheduler_dict() fills in the two step counts.
default_scheduler_dict = {
    "scheduler": {
        "name": "transformers.get_cosine_schedule_with_warmup",
        "params": {
            "num_warmup_steps": 0,
            "num_training_steps": 1000,
            "last_epoch": -1,
        },
    },
}

# Batch size per optimizer, keyed by VRAM tier in GB
# (assumes training at 1024x1024 resolution).
default_batch_size_dict = {
    "prodigy": {
        80: 8,  # 80 GB VRAM -> batch size 8
        20: 1,
    },
    "adamw": {
        80: 24,
    },
    "lion": {
        78: 48,
    },
}


def get_vram_in_gb():
    """Return the total GPU memory reported by ``nvidia-smi``, in GiB.

    Runs ``nvidia-smi`` and reads the first ``<used>MiB / <total>MiB`` cell
    found in its table output.

    Returns:
        float: total memory in GiB, rounded to 2 decimal places.

    Raises:
        RuntimeError: if ``nvidia-smi`` cannot be executed or no memory cell
            can be parsed from its output. An error is raised rather than
            returning a message string, because a string return value would be
            mistaken for a VRAM figure by code that compares it to numbers.
    """
    try:
        # Running the command 'nvidia-smi' and capturing its output
        output = subprocess.check_output(['nvidia-smi'], text=True)
    except (OSError, subprocess.CalledProcessError) as e:
        raise RuntimeError(f"An error occurred: {e}") from e

    # Whitespace around the slash is optional so that padded table layouts
    # (e.g. "12MiB /  24576MiB") are still recognised.
    match = re.search(r'\|\s*\d+MiB\s*/\s*(\d+)MiB\s*\|', output)
    if match is None:
        raise RuntimeError("Could not parse total memory from nvidia-smi output.")

    # Converting MiB to GiB (1 GiB = 1024 MiB) and rounding to 2 decimal places
    total_memory_mib = int(match.group(1))
    return round(total_memory_mib / 1024, 2)


def get_batch_size(optimizer: str, vram: int) -> int:
    """Pick the batch size for ``optimizer`` given the available VRAM in GB.

    Looks up ``default_batch_size_dict[optimizer]`` (assumes training at
    1024x1024 resolution) and uses the largest configured VRAM tier that does
    not exceed ``vram``.

    Args:
        optimizer: key of ``default_batch_size_dict``.
        vram: available GPU memory in GB.

    Returns:
        The batch size configured for the selected VRAM tier.

    Raises:
        ValueError: if the optimizer has no entry, or if ``vram`` is smaller
            than every configured tier for that optimizer.
    """
    if optimizer not in default_batch_size_dict:
        raise ValueError(f"Optimizer '{optimizer}' not supported.")

    sizes_by_vram = default_batch_size_dict[optimizer]

    # Find the closest lower VRAM value that we have a batch size for
    eligible_tiers = [vram_key for vram_key in sizes_by_vram if vram_key <= vram]
    if not eligible_tiers:
        # Without this guard max() fails with an unhelpful
        # "max() arg is an empty sequence".
        raise ValueError(
            f"No batch size configured for optimizer '{optimizer}' with {vram} GB VRAM "
            f"(configured tiers: {sorted(sizes_by_vram)})."
        )
    return sizes_by_vram[max(eligible_tiers)]


def get_train_image_count(dataset_dir:str) -> int:
    """Count the image files found under ``dataset_dir``.

    Args:
        dataset_dir: directory handed to ``unibox.traverses``.

    Returns:
        Number of entries returned by ``unibox.traverses`` when restricted to
        the extensions in ``unibox.constants.IMG_FILES``.
    """
    # Use the argument, not the notebook-level DATASET_DIR, so the function
    # counts whichever directory the caller asks about.
    files = unibox.traverses(dataset_dir, include_extensions = unibox.constants.IMG_FILES)
    return len(files)


def get_scheduler_dict(it_per_epoch:int, epoch_per_cycle:int, warmup_epochs:float):
    """Build the scheduler config section for the given epoch geometry.

    Args:
        it_per_epoch: optimizer iterations in one epoch.
        epoch_per_cycle: number of epochs in one scheduler cycle.
        warmup_epochs: warmup length in epochs (may be fractional).

    Returns:
        A new dict shaped like ``default_scheduler_dict`` with
        ``num_training_steps`` and ``num_warmup_steps`` filled in. The
        module-level defaults are left untouched.
    """
    _warmup_step_count = int(it_per_epoch * warmup_epochs)
    print(f"_warmup_step_count: {_warmup_step_count}")

    _cycle_step_count = it_per_epoch * epoch_per_cycle
    print(f"_cycle_step_count: {_cycle_step_count}")

    # dict.copy() is shallow: writing into the nested "params" of a shallow copy
    # would silently rewrite default_scheduler_dict itself. Rebuild both nested
    # levels so the defaults stay pristine between calls.
    base = default_scheduler_dict["scheduler"]
    params = dict(base["params"])
    params["num_training_steps"] = _cycle_step_count
    params["num_warmup_steps"] = _warmup_step_count
    return {"scheduler": {**base, "params": params}}


def evaluate_template_dict(template_dict):
    """Return a filled-in copy of ``template_dict``.

    Nested dicts are processed recursively, callables are replaced by the
    value they return when called with no arguments, and every other value is
    carried over unchanged. Key order is preserved.
    """
    def _resolve(entry):
        if isinstance(entry, dict):
            return evaluate_template_dict(entry)
        return entry() if callable(entry) else entry

    return {name: _resolve(entry) for name, entry in template_dict.items()}


def write_config_to_yaml(config_dict, yaml_path):
    """Serialise ``config_dict`` with ``OmegaConf.to_yaml`` and write it to ``yaml_path``.

    A blank line is inserted in front of every line that starts at column 0
    (a new top-level section), except when it is the very first line. Indented
    lines and already-empty lines are written as they are. A confirmation
    message is printed after the file has been written.
    """
    rendered = OmegaConf.to_yaml(config_dict)

    spaced = []
    for row in rendered.split('\n'):
        starts_section = row != '' and not row.startswith(' ')
        # Separate top-level sections visually, but never lead with a blank line.
        if starts_section and spaced:
            spaced.append('')
        spaced.append(row)

    with open(yaml_path, 'w') as handle:
        handle.write('\n'.join(spaced))

    print()
    print(f"Configuration saved to [{yaml_path}]")


def get_optimizer_dict(optimizer:str):
    """Wrap the ``optimizer_dict`` entry for ``optimizer`` under an ``"optimizer"`` key.

    The returned value references the entry in ``optimizer_dict`` directly (no copy).
    An unknown name surfaces as the ``KeyError`` of the dictionary lookup.
    """
    return {"optimizer": optimizer_dict[optimizer]}

In [21]:
# Location of the base YAML template; the generated sections are merged onto it.
DEFAULT_CONFIG = "https://huggingface.co/kiriyamaX/nd-configs/resolve/main/nd_config_template_sdxl_80g.yaml"

# ============= CONFIGS =============

# IMPORTANT
# CONFIG_VERSION and RUN_NAME are embedded in the checkpoint directory name and
# in the file name of the generated YAML.
CONFIG_VERSION = 1
RUN_NAME = "qft_twitter_aes_trained-best-26k-of-798k"
# Directory that is scanned for training images and written to dataset.img_path.
DATASET_DIR = "../datasets/twitter-aes_trained-best-26k-of-798k"
# MODEL_PATH = "../models/playground-v2-1024px-aesthetic.safetensors"
MODEL_PATH = "../models/fd5me9.ckpt" 

# ===================================

# hyperparams
OFFSET_NOISE_VAL = 0.1  # written to trainer.offset_noise_val
UCG = 0.1  # written to dataset.ucg

# optimizer
TRAIN_OPTIMIZER = "adamw"  # key into optimizer_dict and default_batch_size_dict
WARMUP_EPOCHS = 0.3  # warmup length in epochs (fractional values allowed)
EPOCH_PER_CYCLE = 10  # epochs per scheduler cycle; sets num_training_steps

# saving
# Both intervals are written to the trainer (checkpoint_*) and sampling (every_n_*) sections.
SAVE_INTERVAL_EPOCH = 1
SAVE_INTERVAL_STEPS = -1  # NOTE(review): presumably -1 disables step-based saving - confirm with the trainer
# ===================================

In [22]:
# Template for the run-specific config sections. Values wrapped in a lambda are
# looked up only when evaluate_template_dict() calls them, so the template can
# name globals (e.g. CHECKPOINT_DIR) that are assigned after this statement and
# always reflects the settings current at evaluation time.
regulars_dict_template = {
    "trainer": {
        "model_path": lambda: MODEL_PATH,
        "checkpoint_dir": lambda: CHECKPOINT_DIR,
        "offset_noise": True,
        "offset_noise_val": lambda: OFFSET_NOISE_VAL,
        "checkpoint_steps": lambda: SAVE_INTERVAL_STEPS,
        "checkpoint_freq": lambda: SAVE_INTERVAL_EPOCH,
    },
    "dataset": {
        "ucg": lambda: UCG,
        # single-element list holding the dataset directory
        "img_path": lambda: [DATASET_DIR],
    },
    "sampling": {
        # sampling uses the same intervals as checkpoint saving
        "every_n_steps": lambda: SAVE_INTERVAL_STEPS,
        "every_n_epochs": lambda: SAVE_INTERVAL_EPOCH,
    },
}

def get_regulars_dict():
    """Resolve ``regulars_dict_template`` into a plain dict using the current settings."""
    resolved = evaluate_template_dict(regulars_dict_template)
    return resolved


# Generate the training config for this run: derive batch size and step counts,
# layer the generated sections over the downloaded template, and save as YAML.

# Read lazily by the "checkpoint_dir" lambda in regulars_dict_template.
CHECKPOINT_DIR = f"checkpoint_{RUN_NAME}_v{CONFIG_VERSION}"

# VRAM is pinned to 80 GB here instead of being detected from the machine.
# sys_vram = get_vram_in_gb()
sys_vram = 80
train_batch_size = get_batch_size(TRAIN_OPTIMIZER, sys_vram)
train_image_count = get_train_image_count(DATASET_DIR)
config = unibox.loads(DEFAULT_CONFIG)

# A falsy result from unibox.loads is treated as "template could not be loaded".
if not config:
    raise FileNotFoundError

# floor: a trailing partial batch is not counted as an iteration.
_it_per_epoch = math.floor(train_image_count / train_batch_size)
print(f"sys_vram: {sys_vram} GB \ntrain_batch_size: {train_batch_size} \ntrain_image_count: {train_image_count} \n_it_per_epoch: {_it_per_epoch}")

# Merge the optimizer, scheduler and run-specific sections into the template, in that order.
config = OmegaConf.merge(config, get_optimizer_dict(TRAIN_OPTIMIZER))
config = OmegaConf.merge(config, get_scheduler_dict(_it_per_epoch, EPOCH_PER_CYCLE, WARMUP_EPOCHS))
config = OmegaConf.merge(config, get_regulars_dict())


# Output file name carries the run name and config version.
YAML_FOLDER = "./"
YAML_NAME = f"config_nd_{RUN_NAME}_v{CONFIG_VERSION}.yaml"
_yaml_path = os.path.join(YAML_FOLDER, YAML_NAME)
write_config_to_yaml(config, _yaml_path)

2023-12-16 15:43:45,081 [INFO] UniLogger: UniLoader.loads: .yaml LOADED from "/tmp/tmpsszr87yd.yaml" in 0.04s


sys_vram: 80 GB 
train_batch_size: 24 
train_image_count: 26655 
_it_per_epoch: 1110
_warmup_step_count: 333
_cycle_step_count: 11100

Configuration saved to [./config_nd_qft_twitter_aes_trained-best-26k-of-798k_v1.yaml]


## Docker: Transformer Engine container

In [None]:
# https://github.com/NVIDIA/TransformerEngine?tab=readme-ov-file#installation
# Start an interactive (-it) NGC PyTorch 23.10 container with all GPUs exposed (--gpus all).
# The host datasets, models and ndtr directories are bind-mounted at /datasets, /models and /ndtr;
# --rm removes the container when it exits.
docker run --gpus all -it -v /home/ubuntu/datasets:/datasets -v /home/ubuntu/models:/models -v /home/ubuntu/ndtr:/ndtr --rm nvcr.io/nvidia/pytorch:23.10-py3

In [None]:
# Mark the bind-mounted /ndtr repository as safe so git will operate on it.
git config --global --add safe.directory /ndtr
# Never paste the API key into the notebook: take it from the environment and
# fail with a clear message when it is not set.
# SECURITY: the key that was previously written here in plain text must be
# treated as leaked and rotated in the W&B account settings.
wandb login "${WANDB_API_KEY:?set WANDB_API_KEY before running}"

## Create .txt files if they do not exist

In [2]:
!pip install -q unibox

[0m

In [10]:
import unibox as ub
from tqdm.auto import tqdm
# Alternative location of the same dataset on another machine:
# /home/ubuntu/datasets/twitter-aes_trained-best-167k-of-798k"
TARGET_DIR = "/notebooks/datasets/twitter-aes_trained-best-167k-of-798k"

# read
# List the image files under TARGET_DIR; relative_unix=True is requested so the
# entries can later be joined back onto a root directory.
# NOTE(review): assumes the returned paths are relative to TARGET_DIR - confirm against unibox.
files_in_dir = ub.traverses(TARGET_DIR, relative_unix=True, 
                               include_extensions=ub.constants.IMG_FILES)
# Show the length and a short preview of the listing as the cell output.
ub.peeks(files_in_dir)

                                                     

{'metadata': {'len': 40022, 'item_type': 'str'},
 'preview': ['1604906847521017857_3.jpg',
  '703970524313956352_1.jpg',
  '1631451367620370434_1.jpg']}

In [11]:
# create
# Write an empty placeholder .txt next to every image that does not have one yet.
# Existing .txt files are skipped: opening them with 'w' would truncate them and
# wipe whatever they already contain.
txt_root_dir = TARGET_DIR
placeholder_txt_content = ""

# One .txt path per image: same relative path, extension swapped.
txt_files_todo = [os.path.splitext(file)[0] + '.txt' for file in files_in_dir]
os.makedirs(txt_root_dir, exist_ok=True)
for txt_file in tqdm(txt_files_todo):
    txt_path = os.path.join(txt_root_dir, txt_file)
    if os.path.exists(txt_path):
        continue  # "if not exist": never overwrite a file that is already there
    subdir = os.path.dirname(txt_file)
    full_subdir_path = os.path.join(txt_root_dir, subdir)
    os.makedirs(full_subdir_path, exist_ok=True)
    with open(txt_path, 'w') as f:
        f.write(placeholder_txt_content)

print("Files and directories created successfully.")

  0%|          | 0/40022 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [12]:
# verify
# List the .txt files now present under TARGET_DIR. The `ub` alias is used for
# both calls because it is the name this section's import cell brings into
# scope; the bare `unibox` name only exists if an earlier section was run.
files_in_dir = ub.traverses(TARGET_DIR, relative_unix=True, include_extensions=[".txt"])
ub.peeks(files_in_dir)

                                                     

{'metadata': {'len': 40022, 'item_type': 'str'},
 'preview': ['1615643911099138048_1.txt',
  '1587049940366204928_1.txt',
  '1416561591043166211_2.txt']}