{ "cells": [ { "cell_type": "code", "execution_count": 7, "id": "82d89348-fb09-462d-b78f-f1dab3447c3d", "metadata": { "tags": [] }, "outputs": [], "source": [ "# !pip install -U unibox omegaconf -q\n", "import unibox\n", "import subprocess\n", "import re\n", "import os\n", "import math\n", "from omegaconf import OmegaConf\n", "\n", "# default values for different optimizers\n", "optimizer_dict = {\n", " \"prodigy\": {\n", " \"name\": \"prodigyopt.Prodigy\",\n", " \"params\": {\n", " \"lr\": 1,\n", " \"d_coef\": 2,\n", " \"d0\": 1e-6,\n", " \"safeguard_warmup\": True,\n", " \"use_bias_correction\": True,\n", " \"weight_decay\": 1e-2,\n", " \"eps\": 1e-8,\n", " } \n", " },\n", "\"adamw\":{\n", " \"name\": \"torch.optim.AdamW\",\n", " \"params\":{\n", " \"lr\": 3e-5,\n", " \"weight_decay\": 1e-2,\n", " },\n", " \n", "}\n", "}\n", "\n", "# default scheduler dict\n", "default_scheduler_dict = {\n", " \"scheduler\":{\n", " \"name\": \"transformers.get_cosine_schedule_with_warmup\",\n", " \"params\": {\n", " \"num_warmup_steps\": 0,\n", " \"num_training_steps\": 1000,\n", " \"last_epoch\": -1,\n", " }\n", " }\n", "}\n", "\n", "# assuming training on 1024x1024 resolution\n", "default_batch_size_dict = {\n", " \"prodigy\": {\n", " 80: 8, # For 80 GB VRAM, batch size is 8\n", " 20: 1, \n", " },\n", " \"adamw\": {\n", " 80: 24\n", " },\n", " \"lion\": {\n", " 78: 48\n", " },\n", "}\n", "\n", "\n", "def get_vram_in_gb():\n", " \"\"\" Returns the total GPU memory in GB. \"\"\"\n", " try:\n", " # Running the command 'nvidia-smi' and capturing its output\n", " output = subprocess.check_output(['nvidia-smi'], text=True)\n", "\n", " # Regular expression to find the memory part\n", " mem_regex = re.compile(r'\\|\\s+\\d+MiB / (\\d+)MiB\\s+\\|')\n", " match = mem_regex.search(output)\n", " if match:\n", " total_memory_mib = int(match.group(1))\n", " # Converting MiB to GiB (1 GiB = 1024 MiB) and rounding to 2 decimal places\n", " total_memory_gb = round(total_memory_mib / 1024, 2)\n", " return total_memory_gb\n", " else:\n", " raise ValueError(\"Could not parse total memory from nvidia-smi output.\")\n", " except Exception as e:\n", " return f\"An error occurred: {e}\"\n", "\n", "\n", "def get_batch_size(optimizer: str, vram: int) -> int:\n", " # allocate batch size based on vram, assuming training on 1024x1024 resolution\n", " _bs_dict = default_batch_size_dict\n", " \n", " if optimizer in _bs_dict:\n", " # Find the closest lower VRAM value that we have a batch size for\n", " closest_vram = max(vram_key for vram_key in _bs_dict[optimizer] if vram_key <= vram)\n", " return _bs_dict[optimizer][closest_vram]\n", " else:\n", " raise ValueError(f\"Optimizer '{optimizer}' not supported.\")\n", "\n", "\n", "def get_train_image_count(dataset_dir:str) -> int:\n", " files = unibox.traverses(DATASET_DIR, include_extensions = unibox.constants.IMG_FILES)\n", " return len(files)\n", "\n", "\n", "def get_scheduler_dict(it_per_epoch:int, epoch_per_cycle:int, warmup_epochs:float):\n", "\n", " _warmup_step_count = int(it_per_epoch * warmup_epochs)\n", " print(f\"_warmup_step_count: {_warmup_step_count}\")\n", "\n", " _cycle_step_count = it_per_epoch * epoch_per_cycle\n", " print(f\"_cycle_step_count: {_cycle_step_count}\")\n", "\n", " scheduler_dict = default_scheduler_dict.copy()\n", " scheduler_dict[\"scheduler\"][\"params\"][\"num_training_steps\"] = _cycle_step_count\n", " scheduler_dict[\"scheduler\"][\"params\"][\"num_warmup_steps\"] = _warmup_step_count\n", " return scheduler_dict\n", "\n", "\n", "def evaluate_template_dict(template_dict):\n", " # generate a filled dictionary from a template\n", " new_dict = {}\n", " for key, value in template_dict.items():\n", " if isinstance(value, dict):\n", " new_dict[key] = evaluate_template_dict(value)\n", " elif callable(value):\n", " new_dict[key] = value()\n", " else:\n", " new_dict[key] = value\n", " return new_dict\n", "\n", "\n", "def write_config_to_yaml(config_dict, yaml_path):\n", " yaml_config = OmegaConf.to_yaml(config_dict)\n", "\n", " # Splitting the YAML string into lines\n", " lines = yaml_config.split('\\n')\n", "\n", " # Iterating through the lines and adding an empty line before each major section\n", " formatted_lines = []\n", " for line in lines:\n", " if line.startswith(' ') or line == '':\n", " # It's a subline or already an empty line, just add it\n", " formatted_lines.append(line)\n", " else:\n", " # It's a new major section, add an empty line before it (if it's not the first line)\n", " if formatted_lines:\n", " formatted_lines.append('')\n", " formatted_lines.append(line)\n", "\n", " # Joining the lines back into a single string\n", " formatted_yaml_config = '\\n'.join(formatted_lines)\n", "\n", " # Write the formatted YAML string to a file\n", " with open(yaml_path, 'w') as file:\n", " file.write(formatted_yaml_config)\n", "\n", " print()\n", " print(f\"Configuration saved to [{yaml_path}]\")\n", "\n", "\n", "def get_optimizer_dict(optimizer:str):\n", "\n", " return_dict = {\n", " \"optimizer\": optimizer_dict[optimizer],\n", " }\n", "\n", " return return_dict" ] }, { "cell_type": "code", "execution_count": 14, "id": "12a8d495-565e-455b-a8ee-fcaf09b199a5", "metadata": {}, "outputs": [], "source": [ "DEFAULT_CONFIG = \"https://huggingface.co/kiriyamaX/nd-configs/resolve/main/nd_config_template_sdxl_80g.yaml\"\n", "\n", "# ============= CONFIGS =============\n", "\n", "# IMPORTANT\n", "CONFIG_VERSION = 1\n", "RUN_NAME = \"qft_twitter_aes_167k-of-798k\"\n", "DATASET_DIR = \"../datasets/twitter-aes_trained-best-167k-of-798k\"\n", "# MODEL_PATH = \"../models/playground-v2-1024px-aesthetic.safetensors\"\n", "MODEL_PATH = \"../models/fd5me9.ckpt\" \n", "\n", "# ===================================\n", "\n", "# hyperparams\n", "OFFSET_NOISE_VAL = 0.12\n", "UCG = 0.1\n", "\n", "# optimizer\n", "TRAIN_OPTIMIZER = \"adamw\"\n", "WARMUP_EPOCHS = 0.3\n", "EPOCH_PER_CYCLE = 10\n", "\n", "# saving\n", "SAVE_INTERVAL_EPOCH = 1\n", "SAVE_INTERVAL_STEPS = -1\n", "# ===================================" ] }, { "cell_type": "code", "execution_count": 15, "id": "c4089d9b", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2023-12-18 07:51:41,207 [INFO] UniLogger: UniLoader.loads: .yaml LOADED from \"/tmp/tmptm3kzw5a.yaml\" in 0.04s\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "sys_vram: 80 GB \n", "train_batch_size: 24 \n", "train_image_count: 166110 \n", "_it_per_epoch: 6921\n", "_warmup_step_count: 2076\n", "_cycle_step_count: 69210\n", "\n", "Configuration saved to [./config_nd_qft_twitter_aes_167k-of-798k_v1.yaml]\n" ] } ], "source": [ "regulars_dict_template = {\n", " \"trainer\": {\n", " \"model_path\": lambda: MODEL_PATH,\n", " \"checkpoint_dir\": lambda: CHECKPOINT_DIR,\n", " \"offset_noise\": True,\n", " \"offset_noise_val\": lambda: OFFSET_NOISE_VAL,\n", " \"checkpoint_steps\": lambda: SAVE_INTERVAL_STEPS,\n", " \"checkpoint_freq\": lambda: SAVE_INTERVAL_EPOCH,\n", " },\n", " \"dataset\": {\n", " \"ucg\": lambda: UCG,\n", " \"img_path\": lambda: [DATASET_DIR],\n", " },\n", " \"sampling\": {\n", " \"every_n_steps\": lambda: SAVE_INTERVAL_STEPS,\n", " \"every_n_epochs\": lambda: SAVE_INTERVAL_EPOCH,\n", " },\n", "}\n", "\n", "def get_regulars_dict():\n", " return evaluate_template_dict(regulars_dict_template)\n", "\n", "\n", "CHECKPOINT_DIR = f\"checkpoint_{RUN_NAME}_v{CONFIG_VERSION}\"\n", "\n", "# sys_vram = get_vram_in_gb()\n", "sys_vram = 80\n", "train_batch_size = get_batch_size(TRAIN_OPTIMIZER, sys_vram)\n", "train_image_count = get_train_image_count(DATASET_DIR)\n", "config = unibox.loads(DEFAULT_CONFIG)\n", "\n", "if not config:\n", " raise FileNotFoundError\n", "\n", "_it_per_epoch = math.floor(train_image_count / train_batch_size)\n", "print(f\"sys_vram: {sys_vram} GB \\ntrain_batch_size: {train_batch_size} \\ntrain_image_count: {train_image_count} \\n_it_per_epoch: {_it_per_epoch}\")\n", "\n", "config = OmegaConf.merge(config, get_optimizer_dict(TRAIN_OPTIMIZER))\n", "config = OmegaConf.merge(config, get_scheduler_dict(_it_per_epoch, EPOCH_PER_CYCLE, WARMUP_EPOCHS))\n", "config = OmegaConf.merge(config, get_regulars_dict())\n", "\n", "\n", "YAML_FOLDER = \"./\"\n", "YAML_NAME = f\"config_nd_{RUN_NAME}_v{CONFIG_VERSION}.yaml\"\n", "_yaml_path = os.path.join(YAML_FOLDER, YAML_NAME)\n", "write_config_to_yaml(config, _yaml_path)" ] }, { "cell_type": "code", "execution_count": 18, "id": "fe376cc7", "metadata": {}, "outputs": [], "source": [ "# !pip install -U unibox omegaconf -q\n", "import unibox\n", "import subprocess\n", "import re\n", "import os\n", "import math\n", "from omegaconf import OmegaConf\n", "\n", "# default values for different optimizers\n", "optimizer_dict = {\n", " \"prodigy\": {\n", " \"name\": \"prodigyopt.Prodigy\",\n", " \"params\": {\n", " \"lr\": 1,\n", " \"d_coef\": 2,\n", " \"d0\": 1e-6,\n", " \"safeguard_warmup\": True,\n", " \"use_bias_correction\": True,\n", " \"weight_decay\": 1e-2,\n", " \"eps\": 1e-8,\n", " } \n", " },\n", "\"adamw\":{\n", " \"name\": \"torch.optim.AdamW\",\n", " \"params\":{\n", " \"lr\": 3e-5,\n", " \"weight_decay\": 1e-2,\n", " },\n", " \n", "}\n", "}\n", "\n", "# default scheduler dict\n", "default_scheduler_dict = {\n", " \"scheduler\":{\n", " \"name\": \"transformers.get_cosine_schedule_with_warmup\",\n", " \"params\": {\n", " \"num_warmup_steps\": 0,\n", " \"num_training_steps\": 1000,\n", " \"last_epoch\": -1,\n", " }\n", " }\n", "}\n", "\n", "# assuming training on 1024x1024 resolution\n", "default_batch_size_dict = {\n", " \"prodigy\": {\n", " 80: 8, # For 80 GB VRAM, batch size is 8\n", " 20: 1, \n", " },\n", " \"adamw\": {\n", " 80: 24,\n", " },\n", " \"lion\": {\n", " 78: 48\n", " },\n", "}\n", "\n", "\n", "def get_vram_in_gb():\n", " \"\"\" Returns the total GPU memory in GB. \"\"\"\n", " try:\n", " # Running the command 'nvidia-smi' and capturing its output\n", " output = subprocess.check_output(['nvidia-smi'], text=True)\n", "\n", " # Regular expression to find the memory part\n", " mem_regex = re.compile(r'\\|\\s+\\d+MiB / (\\d+)MiB\\s+\\|')\n", " match = mem_regex.search(output)\n", " if match:\n", " total_memory_mib = int(match.group(1))\n", " # Converting MiB to GiB (1 GiB = 1024 MiB) and rounding to 2 decimal places\n", " total_memory_gb = round(total_memory_mib / 1024, 2)\n", " return total_memory_gb\n", " else:\n", " raise ValueError(\"Could not parse total memory from nvidia-smi output.\")\n", " except Exception as e:\n", " return f\"An error occurred: {e}\"\n", "\n", "\n", "def get_batch_size(optimizer: str, vram: int) -> int:\n", " # allocate batch size based on vram, assuming training on 1024x1024 resolution\n", " _bs_dict = default_batch_size_dict\n", " \n", " if optimizer in _bs_dict:\n", " # Find the closest lower VRAM value that we have a batch size for\n", " closest_vram = max(vram_key for vram_key in _bs_dict[optimizer] if vram_key <= vram)\n", " return _bs_dict[optimizer][closest_vram]\n", " else:\n", " raise ValueError(f\"Optimizer '{optimizer}' not supported.\")\n", "\n", "\n", "def get_train_image_count(dataset_dir:str) -> int:\n", " files = unibox.traverses(DATASET_DIR, include_extensions = unibox.constants.IMG_FILES)\n", " return len(files)\n", "\n", "\n", "def get_scheduler_dict(it_per_epoch:int, epoch_per_cycle:int, warmup_epochs:float):\n", "\n", " _warmup_step_count = int(it_per_epoch * warmup_epochs)\n", " print(f\"_warmup_step_count: {_warmup_step_count}\")\n", "\n", " _cycle_step_count = it_per_epoch * epoch_per_cycle\n", " print(f\"_cycle_step_count: {_cycle_step_count}\")\n", "\n", " scheduler_dict = default_scheduler_dict.copy()\n", " scheduler_dict[\"scheduler\"][\"params\"][\"num_training_steps\"] = _cycle_step_count\n", " scheduler_dict[\"scheduler\"][\"params\"][\"num_warmup_steps\"] = _warmup_step_count\n", " return scheduler_dict\n", "\n", "\n", "def evaluate_template_dict(template_dict):\n", " # generate a filled dictionary from a template\n", " new_dict = {}\n", " for key, value in template_dict.items():\n", " if isinstance(value, dict):\n", " new_dict[key] = evaluate_template_dict(value)\n", " elif callable(value):\n", " new_dict[key] = value()\n", " else:\n", " new_dict[key] = value\n", " return new_dict\n", "\n", "\n", "def write_config_to_yaml(config_dict, yaml_path):\n", " yaml_config = OmegaConf.to_yaml(config_dict)\n", "\n", " # Splitting the YAML string into lines\n", " lines = yaml_config.split('\\n')\n", "\n", " # Iterating through the lines and adding an empty line before each major section\n", " formatted_lines = []\n", " for line in lines:\n", " if line.startswith(' ') or line == '':\n", " # It's a subline or already an empty line, just add it\n", " formatted_lines.append(line)\n", " else:\n", " # It's a new major section, add an empty line before it (if it's not the first line)\n", " if formatted_lines:\n", " formatted_lines.append('')\n", " formatted_lines.append(line)\n", "\n", " # Joining the lines back into a single string\n", " formatted_yaml_config = '\\n'.join(formatted_lines)\n", "\n", " # Write the formatted YAML string to a file\n", " with open(yaml_path, 'w') as file:\n", " file.write(formatted_yaml_config)\n", "\n", " print()\n", " print(f\"Configuration saved to [{yaml_path}]\")\n", "\n", "\n", "def get_optimizer_dict(optimizer:str):\n", "\n", " return_dict = {\n", " \"optimizer\": optimizer_dict[optimizer],\n", " }\n", "\n", " return return_dict" ] }, { "cell_type": "code", "execution_count": 21, "id": "33db0266", "metadata": {}, "outputs": [], "source": [ "DEFAULT_CONFIG = \"https://huggingface.co/kiriyamaX/nd-configs/resolve/main/nd_config_template_sdxl_80g.yaml\"\n", "\n", "# ============= CONFIGS =============\n", "\n", "# IMPORTANT\n", "CONFIG_VERSION = 1\n", "RUN_NAME = \"qft_twitter_aes_trained-best-26k-of-798k\"\n", "DATASET_DIR = \"../datasets/twitter-aes_trained-best-26k-of-798k\"\n", "# MODEL_PATH = \"../models/playground-v2-1024px-aesthetic.safetensors\"\n", "MODEL_PATH = \"../models/fd5me9.ckpt\" \n", "\n", "# ===================================\n", "\n", "# hyperparams\n", "OFFSET_NOISE_VAL = 0.1\n", "UCG = 0.1\n", "\n", "# optimizer\n", "TRAIN_OPTIMIZER = \"adamw\"\n", "WARMUP_EPOCHS = 0.3\n", "EPOCH_PER_CYCLE = 10\n", "\n", "# saving\n", "SAVE_INTERVAL_EPOCH = 1\n", "SAVE_INTERVAL_STEPS = -1\n", "# ===================================" ] }, { "cell_type": "code", "execution_count": 22, "id": "68e09efd", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2023-12-16 15:43:45,081 [INFO] UniLogger: UniLoader.loads: .yaml LOADED from \"/tmp/tmpsszr87yd.yaml\" in 0.04s\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "sys_vram: 80 GB \n", "train_batch_size: 24 \n", "train_image_count: 26655 \n", "_it_per_epoch: 1110\n", "_warmup_step_count: 333\n", "_cycle_step_count: 11100\n", "\n", "Configuration saved to [./config_nd_qft_twitter_aes_trained-best-26k-of-798k_v1.yaml]\n" ] } ], "source": [ "regulars_dict_template = {\n", " \"trainer\": {\n", " \"model_path\": lambda: MODEL_PATH,\n", " \"checkpoint_dir\": lambda: CHECKPOINT_DIR,\n", " \"offset_noise\": True,\n", " \"offset_noise_val\": lambda: OFFSET_NOISE_VAL,\n", " \"checkpoint_steps\": lambda: SAVE_INTERVAL_STEPS,\n", " \"checkpoint_freq\": lambda: SAVE_INTERVAL_EPOCH,\n", " },\n", " \"dataset\": {\n", " \"ucg\": lambda: UCG,\n", " \"img_path\": lambda: [DATASET_DIR],\n", " },\n", " \"sampling\": {\n", " \"every_n_steps\": lambda: SAVE_INTERVAL_STEPS,\n", " \"every_n_epochs\": lambda: SAVE_INTERVAL_EPOCH,\n", " },\n", "}\n", "\n", "def get_regulars_dict():\n", " return evaluate_template_dict(regulars_dict_template)\n", "\n", "\n", "CHECKPOINT_DIR = f\"checkpoint_{RUN_NAME}_v{CONFIG_VERSION}\"\n", "\n", "# sys_vram = get_vram_in_gb()\n", "sys_vram = 80\n", "train_batch_size = get_batch_size(TRAIN_OPTIMIZER, sys_vram)\n", "train_image_count = get_train_image_count(DATASET_DIR)\n", "config = unibox.loads(DEFAULT_CONFIG)\n", "\n", "if not config:\n", " raise FileNotFoundError\n", "\n", "_it_per_epoch = math.floor(train_image_count / train_batch_size)\n", "print(f\"sys_vram: {sys_vram} GB \\ntrain_batch_size: {train_batch_size} \\ntrain_image_count: {train_image_count} \\n_it_per_epoch: {_it_per_epoch}\")\n", "\n", "config = OmegaConf.merge(config, get_optimizer_dict(TRAIN_OPTIMIZER))\n", "config = OmegaConf.merge(config, get_scheduler_dict(_it_per_epoch, EPOCH_PER_CYCLE, WARMUP_EPOCHS))\n", "config = OmegaConf.merge(config, get_regulars_dict())\n", "\n", "\n", "YAML_FOLDER = \"./\"\n", "YAML_NAME = f\"config_nd_{RUN_NAME}_v{CONFIG_VERSION}.yaml\"\n", "_yaml_path = os.path.join(YAML_FOLDER, YAML_NAME)\n", "write_config_to_yaml(config, _yaml_path)" ] }, { "cell_type": "markdown", "id": "70859336-6ae3-4b55-a88d-3bc21a0e6a09", "metadata": {}, "source": [ "## docker transformer engine" ] }, { "cell_type": "code", "execution_count": null, "id": "569387df-9e56-44f9-8001-1bd2d61ea8b5", "metadata": {}, "outputs": [], "source": [ "# https://github.com/NVIDIA/TransformerEngine?tab=readme-ov-file#installation\n", "docker run --gpus all -it -v /home/ubuntu/datasets:/datasets -v /home/ubuntu/models:/models -v /home/ubuntu/ndtr:/ndtr --rm nvcr.io/nvidia/pytorch:23.10-py3" ] }, { "cell_type": "code", "execution_count": null, "id": "9e4a5dad-0c9d-402b-9cf7-fa3fd08e4f37", "metadata": {}, "outputs": [], "source": [ "git config --global --add safe.directory /ndtr\n", "wandb login 0025f0bc67dba1846edaf9c2425b288b23ae0f99" ] }, { "cell_type": "markdown", "id": "11eae193-980f-449b-ac8f-4976ca235da4", "metadata": {}, "source": [ "## create txt if not exist" ] }, { "cell_type": "code", "execution_count": 2, "id": "0d8cf284-cb15-4218-b68e-b99e72ef53cf", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", "\u001b[0m" ] } ], "source": [ "!pip install -q unibox" ] }, { "cell_type": "code", "execution_count": 10, "id": "701cae45-da02-4ea7-81f3-9ee1c2f14d47", "metadata": { "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ " \r" ] }, { "data": { "text/plain": [ "{'metadata': {'len': 40022, 'item_type': 'str'},\n", " 'preview': ['1604906847521017857_3.jpg',\n", " '703970524313956352_1.jpg',\n", " '1631451367620370434_1.jpg']}" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import unibox as ub\n", "from tqdm.auto import tqdm\n", "# /home/ubuntu/datasets/twitter-aes_trained-best-167k-of-798k\"\n", "TARGET_DIR = \"/notebooks/datasets/twitter-aes_trained-best-167k-of-798k\"\n", "\n", "# read\n", "files_in_dir = ub.traverses(TARGET_DIR, relative_unix=True, \n", " include_extensions=ub.constants.IMG_FILES)\n", "ub.peeks(files_in_dir)" ] }, { "cell_type": "code", "execution_count": 11, "id": "79746897-03cc-4aa2-ae74-ac62ea00e389", "metadata": { "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "7a3dfcb29c6640b3a7638fecb9b2a1e7", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/40022 [00:00 12\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mtxt_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mw\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[1;32m 13\u001b[0m f\u001b[38;5;241m.\u001b[39mwrite(placeholder_txt_content)\n\u001b[1;32m 15\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mFiles and directories created successfully.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py:310\u001b[0m, in \u001b[0;36m_modified_open\u001b[0;34m(file, *args, **kwargs)\u001b[0m\n\u001b[1;32m 303\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m {\u001b[38;5;241m0\u001b[39m, \u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m2\u001b[39m}:\n\u001b[1;32m 304\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 305\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIPython won\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt let you open fd=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfile\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m by default \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 306\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mas it is likely to crash IPython. If you know what you are doing, \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 307\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124myou can use builtins\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m open.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 308\u001b[0m )\n\u001b[0;32m--> 310\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mio_open\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m/usr/lib/python3.10/codecs.py:186\u001b[0m, in \u001b[0;36mIncrementalEncoder.__init__\u001b[0;34m(self, errors)\u001b[0m\n\u001b[1;32m 180\u001b[0m \u001b[38;5;28;01mclass\u001b[39;00m \u001b[38;5;21;01mIncrementalEncoder\u001b[39;00m(\u001b[38;5;28mobject\u001b[39m):\n\u001b[1;32m 181\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 182\u001b[0m \u001b[38;5;124;03m An IncrementalEncoder encodes an input in multiple steps. The input can\u001b[39;00m\n\u001b[1;32m 183\u001b[0m \u001b[38;5;124;03m be passed piece by piece to the encode() method. The IncrementalEncoder\u001b[39;00m\n\u001b[1;32m 184\u001b[0m \u001b[38;5;124;03m remembers the state of the encoding process between calls to encode().\u001b[39;00m\n\u001b[1;32m 185\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 186\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, errors\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mstrict\u001b[39m\u001b[38;5;124m'\u001b[39m):\n\u001b[1;32m 187\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 188\u001b[0m \u001b[38;5;124;03m Creates an IncrementalEncoder instance.\u001b[39;00m\n\u001b[1;32m 189\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 192\u001b[0m \u001b[38;5;124;03m for a list of possible values.\u001b[39;00m\n\u001b[1;32m 193\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m 194\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39merrors \u001b[38;5;241m=\u001b[39m errors\n", "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ], "source": [ "# create\n", "txt_root_dir = TARGET_DIR\n", "placeholder_txt_content = \"\"\n", "\n", "txt_files_todo = [os.path.splitext(file)[0] + '.txt' for file in files_in_dir]\n", "os.makedirs(txt_root_dir, exist_ok=True)\n", "for txt_file in tqdm(txt_files_todo):\n", " subdir = os.path.dirname(txt_file)\n", " full_subdir_path = os.path.join(txt_root_dir, subdir)\n", " os.makedirs(full_subdir_path, exist_ok=True)\n", " txt_path = os.path.join(txt_root_dir, txt_file)\n", " with open(txt_path, 'w') as f:\n", " f.write(placeholder_txt_content)\n", "\n", "print(\"Files and directories created successfully.\")" ] }, { "cell_type": "code", "execution_count": 12, "id": "38e5d854-e66b-4644-8119-02051789bcde", "metadata": { "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ " \r" ] }, { "data": { "text/plain": [ "{'metadata': {'len': 40022, 'item_type': 'str'},\n", " 'preview': ['1615643911099138048_1.txt',\n", " '1587049940366204928_1.txt',\n", " '1416561591043166211_2.txt']}" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# verify\n", "files_in_dir = unibox.traverses(TARGET_DIR, relative_unix=True, include_extensions=[\".txt\"])\n", "ub.peeks(files_in_dir)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 5 }