Spaces: Runtime error
Commit 4d20c2f: init
ShoufaChen committed
Parent(s): init
Browse files
- .gitattributes +35 -0
- .gitignore +34 -0
- README.md +13 -0
- app.py +160 -0
- imagenet_en_cn.py +1002 -0
- models/generate.py +176 -0
- models/gpt.py +465 -0
- requirements.txt +1 -0
- tokenizer_image/discriminator.py +255 -0
- tokenizer_image/discriminator_patchgan.py +152 -0
- tokenizer_image/discriminator_stylegan.py +101 -0
- tokenizer_image/lpips.py +164 -0
- tokenizer_image/reconstruction_vq_ddp.py +197 -0
- tokenizer_image/vq_demo.py +84 -0
- tokenizer_image/vq_loss.py +168 -0
- tokenizer_image/vq_model.py +424 -0
- tokenizer_image/vq_train.py +316 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,34 @@
+# Python
+__pycache__
+*.pyc
+*.egg-info
+dist
+.ipynb_checkpoints
+*.ipynb
+
+# Log
+*.log
+*.log.*
+*.json
+*.jsonl
+
+# Data
+datasets
+*.zip
+*.png
+*.jpg
+*.jpeg
+
+# Model
+checkpoints
+ckpts*
+*.ckpt
+*.pth
+*.pt
+pretrained_models
+
+# Other
+.DS_Store
+wandb
+output
+results
README.md
ADDED
@@ -0,0 +1,13 @@
+---
+title: LlamaGen
+emoji: 🏆
+colorFrom: indigo
+colorTo: pink
+sdk: gradio
+sdk_version: 4.36.0
+app_file: app.py
+pinned: false
+license: mit
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,160 @@
+from PIL import Image
+import gradio as gr
+from imagenet_en_cn import IMAGENET_1K_CLASSES
+from huggingface_hub import hf_hub_download
+import torch
+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.allow_tf32 = True
+torch.set_float32_matmul_precision('high')
+setattr(torch.nn.Linear, 'reset_parameters', lambda self: None)
+setattr(torch.nn.LayerNorm, 'reset_parameters', lambda self: None)
+
+import time
+import argparse
+from tokenizer_image.vq_model import VQ_models
+from models.gpt import GPT_models
+from models.generate import generate
+
+device = "cuda"
+
+model2ckpt = {
+    "GPT-XL": ("vq_ds16_c2i.pt", "c2i_XL_384.pt", 384),
+    "GPT-B": ("vq_ds16_c2i.pt", "c2i_B_256.pt", 256),
+}
+
+def load_model(args):
+    ckpt_folder = "./"
+    vq_ckpt, gpt_ckpt, image_size = model2ckpt[args.gpt_model]
+    hf_hub_download(repo_id="FoundationVision/LlamaGen", filename=vq_ckpt, local_dir=ckpt_folder)
+    hf_hub_download(repo_id="FoundationVision/LlamaGen", filename=gpt_ckpt, local_dir=ckpt_folder)
+    # create and load model
+    vq_model = VQ_models[args.vq_model](
+        codebook_size=args.codebook_size,
+        codebook_embed_dim=args.codebook_embed_dim)
+    vq_model.to(device)
+    vq_model.eval()
+    checkpoint = torch.load(f"{ckpt_folder}{vq_ckpt}", map_location="cpu")
+    vq_model.load_state_dict(checkpoint["model"])
+    del checkpoint
+    print("image tokenizer is loaded")
+
+    # create and load gpt model
+    precision = {'none': torch.float32, 'bf16': torch.bfloat16, 'fp16': torch.float16}[args.precision]
+    latent_size = image_size // args.downsample_size
+    gpt_model = GPT_models[args.gpt_model](
+        vocab_size=args.codebook_size,
+        block_size=latent_size ** 2,
+        num_classes=args.num_classes,
+        cls_token_num=args.cls_token_num,
+        model_type=args.gpt_type,
+    ).to(device=device, dtype=precision)
+
+    checkpoint = torch.load(f"{ckpt_folder}{gpt_ckpt}", map_location="cpu")
+    if args.from_fsdp:  # fsdp
+        model_weight = checkpoint
+    elif "model" in checkpoint:  # ddp
+        model_weight = checkpoint["model"]
+    elif "module" in checkpoint:  # deepspeed
+        model_weight = checkpoint["module"]
+    elif "state_dict" in checkpoint:
+        model_weight = checkpoint["state_dict"]
+    else:
+        raise Exception("please check model weight")
+    # if 'freqs_cis' in model_weight:
+    #     model_weight.pop('freqs_cis')
+    gpt_model.load_state_dict(model_weight, strict=False)
+    gpt_model.eval()
+    del checkpoint
+    print("gpt model is loaded")
+
+    if args.compile:
+        print("compiling the model...")
+        gpt_model = torch.compile(
+            gpt_model,
+            mode="reduce-overhead",
+            fullgraph=True
+        )  # requires PyTorch 2.0 (optional)
+    else:
+        print("no need to compile model in demo")
+
+    return vq_model, gpt_model, image_size
+
+
+def infer(cfg_scale, top_k, top_p, temperature, class_label, seed):
+    n = 4
+    latent_size = image_size // args.downsample_size
+    # Labels to condition the model with (feel free to change):
+    class_labels = [class_label for _ in range(n)]
+    c_indices = torch.tensor(class_labels, device=device)
+    qzshape = [len(class_labels), args.codebook_embed_dim, latent_size, latent_size]
+
+    t1 = time.time()
+    torch.manual_seed(seed)
+    index_sample = generate(
+        gpt_model, c_indices, latent_size ** 2,
+        cfg_scale=cfg_scale, cfg_interval=args.cfg_interval,
+        temperature=temperature, top_k=top_k,
+        top_p=top_p, sample_logits=True,
+    )
+    sampling_time = time.time() - t1
+    print(f"gpt sampling takes about {sampling_time:.2f} seconds.")
+
+    t2 = time.time()
+    samples = vq_model.decode_code(index_sample, qzshape)  # output value is between [-1, 1]
+    decoder_time = time.time() - t2
+    print(f"decoder takes about {decoder_time:.2f} seconds.")
+    # Convert to PIL.Image format:
+    samples = samples.mul(127.5).add_(128.0).clamp_(0, 255).permute(0, 2, 3, 1).to("cpu", torch.uint8).numpy()
+    samples = [Image.fromarray(sample) for sample in samples]
+    return samples
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--gpt-model", type=str, choices=list(GPT_models.keys()), default="GPT-XL")
+parser.add_argument("--gpt-type", type=str, choices=['c2i', 't2i'], default="c2i", help="class-conditional or text-conditional")
+parser.add_argument("--from-fsdp", action='store_true')
+parser.add_argument("--cls-token-num", type=int, default=1, help="max token number of condition input")
+parser.add_argument("--precision", type=str, default='bf16', choices=["none", "fp16", "bf16"])
+parser.add_argument("--compile", action='store_true', default=False)
+parser.add_argument("--vq-model", type=str, choices=list(VQ_models.keys()), default="VQ-16")
+parser.add_argument("--codebook-size", type=int, default=16384, help="codebook size for vector quantization")
+parser.add_argument("--codebook-embed-dim", type=int, default=8, help="codebook dimension for vector quantization")
+parser.add_argument("--downsample-size", type=int, choices=[8, 16], default=16)
+parser.add_argument("--num-classes", type=int, default=1000)
+parser.add_argument("--cfg-scale", type=float, default=4.0)
+parser.add_argument("--cfg-interval", type=float, default=-1)
+parser.add_argument("--seed", type=int, default=0)
+parser.add_argument("--top-k", type=int, default=2000, help="top-k value to sample with")
+parser.add_argument("--temperature", type=float, default=1.0, help="temperature value to sample with")
+parser.add_argument("--top-p", type=float, default=1.0, help="top-p value to sample with")
+args = parser.parse_args()
+
+vq_model, gpt_model, image_size = load_model(args)
+
+with gr.Blocks() as demo:
+    gr.Markdown("<h1 style='text-align: center'>Autoregressive Model Beats Diffusion: Llama for Scalable Image Generation</h1>")
+
+    with gr.Tabs():
+        with gr.TabItem('Generate'):
+            with gr.Row():
+                with gr.Column():
+                    # with gr.Row():
+                    #     image_size = gr.Radio(choices=[384], value=384, label='Peize Model Resolution')
+                    with gr.Row():
+                        i1k_class = gr.Dropdown(
+                            list(IMAGENET_1K_CLASSES.values()),
+                            value='Eskimo dog, husky [爱斯基摩犬,哈士奇]',
+                            type="index", label='ImageNet-1K Class'
+                        )
+                    cfg_scale = gr.Slider(minimum=1, maximum=25, step=0.1, value=4.0, label='Classifier-free Guidance Scale')
+                    top_k = gr.Slider(minimum=1, maximum=16384, step=1, value=4000, label='Top-K')
+                    top_p = gr.Slider(minimum=0., maximum=1.0, step=0.1, value=1.0, label="Top-P")
+                    temperature = gr.Slider(minimum=0., maximum=1.0, step=0.1, value=1.0, label='Temperature')
+                    seed = gr.Slider(minimum=0, maximum=1000, step=1, value=42, label='Seed')
+                    # seed = gr.Number(value=0, label='Seed')
+                    button = gr.Button("Generate", variant="primary")
+                with gr.Column():
+                    output = gr.Gallery(label='Generated Images', height=700)
+            button.click(infer, inputs=[cfg_scale, top_k, top_p, temperature, i1k_class, seed], outputs=[output])
+demo.queue()
+demo.launch(debug=True)
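
For orientation, the default configuration above (the GPT-XL entry of model2ckpt at 384px, --downsample-size 16, --codebook-embed-dim 8, four images per request) fixes the token counts and latent shapes used in load_model and infer. The following is a worked sketch of that arithmetic, not part of the commit:

    # Shapes implied by app.py's defaults (GPT-XL entry of model2ckpt).
    image_size = 384          # model2ckpt["GPT-XL"][2]
    downsample_size = 16      # --downsample-size default
    codebook_embed_dim = 8    # --codebook-embed-dim default
    n = 4                     # images generated per click in infer()

    latent_size = image_size // downsample_size    # 24
    block_size = latent_size ** 2                  # 576 tokens sampled per image
    qzshape = [n, codebook_embed_dim, latent_size, latent_size]  # [4, 8, 24, 24]
    print(latent_size, block_size, qzshape)
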
imagenet_en_cn.py
ADDED
@@ -0,0 +1,1002 @@
+IMAGENET_1K_CLASSES = {
+    0: 'tench, Tinca tinca [丁鲷]',
+    1: 'goldfish, Carassius auratus [金鱼]',
+    2: 'great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias [大白鲨]',
+    3: 'tiger shark, Galeocerdo cuvieri [虎鲨]',
+    4: 'hammerhead, hammerhead shark [锤头鲨]',
+    5: 'electric ray, crampfish, numbfish, torpedo [电鳐]',
+    6: 'stingray [黄貂鱼]',
+    7: 'cock [公鸡]',
+    8: 'hen [母鸡]',
+    9: 'ostrich, Struthio camelus [鸵鸟]',
+    10: 'brambling, Fringilla montifringilla [燕雀]',
+    11: 'goldfinch, Carduelis carduelis [金翅雀]',
+    12: 'house finch, linnet, Carpodacus mexicanus [家朱雀]',
+    13: 'junco, snowbird [灯芯草雀]',
+    14: 'indigo bunting, indigo finch, indigo bird, Passerina cyanea [靛蓝雀,靛蓝鸟]',
+    15: 'robin, American robin, Turdus migratorius [蓝鹀]',
+    16: 'bulbul [夜莺]',
+    17: 'jay [松鸦]',
+    18: 'magpie [喜鹊]',
+    19: 'chickadee [山雀]',
+    20: 'water ouzel, dipper [河鸟]',
+    21: 'kite [鸢(猛禽)]',
+    22: 'bald eagle, American eagle, Haliaeetus leucocephalus [秃头鹰]',
+    23: 'vulture [秃鹫]',
+    24: 'great grey owl, great gray owl, Strix nebulosa [大灰猫头鹰]',
+    25: 'European fire salamander, Salamandra salamandra [欧洲火蝾螈]',
+    26: 'common newt, Triturus vulgaris [普通蝾螈]',
+    27: 'eft [水蜥]',
+    28: 'spotted salamander, Ambystoma maculatum [斑点蝾螈]',
+    29: 'axolotl, mud puppy, Ambystoma mexicanum [蝾螈,泥狗]',
+    30: 'bullfrog, Rana catesbeiana [牛蛙]',
+    31: 'tree frog, tree-frog [树蛙]',
+    32: 'tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui [尾蛙,铃蟾蜍,肋蟾蜍,尾蟾蜍]',
+    33: 'loggerhead, loggerhead turtle, Caretta caretta [红海龟]',
+    34: 'leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea [皮革龟]',
+    35: 'mud turtle [泥龟]',
+    36: 'terrapin [淡水龟]',
+    37: 'box turtle, box tortoise [箱龟]',
+    38: 'banded gecko [带状壁虎]',
+    39: 'common iguana, iguana, Iguana iguana [普通鬣蜥]',
+    40: 'American chameleon, anole, Anolis carolinensis [美国变色龙]',
+    41: 'whiptail, whiptail lizard [鞭尾蜥蜴]',
+    42: 'agama [飞龙科蜥蜴]',
+    43: 'frilled lizard, Chlamydosaurus kingi [褶边蜥蜴]',
+    44: 'alligator lizard [鳄鱼蜥蜴]',
+    45: 'Gila monster, Heloderma suspectum [毒蜥]',
+    46: 'green lizard, Lacerta viridis [绿蜥蜴]',
+    47: 'African chameleon, Chamaeleo chamaeleon [非洲变色龙]',
+    48: 'Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis [科莫多蜥蜴]',
+    49: 'African crocodile, Nile crocodile, Crocodylus niloticus [非洲鳄,尼罗河鳄鱼]',
+    50: 'American alligator, Alligator mississipiensis [美国鳄鱼,鳄鱼]',
+    51: 'triceratops [三角龙]',
+    52: 'thunder snake, worm snake, Carphophis amoenus [雷蛇,蠕虫蛇]',
+    53: 'ringneck snake, ring-necked snake, ring snake [环蛇,环颈蛇]',
+    54: 'hognose snake, puff adder, sand viper [希腊蛇]',
+    55: 'green snake, grass snake [绿蛇,草蛇]',
+    56: 'king snake, kingsnake [国王蛇]',
+    57: 'garter snake, grass snake [袜带蛇,草蛇]',
+    58: 'water snake [水蛇]',
+    59: 'vine snake [藤蛇]',
+    60: 'night snake, Hypsiglena torquata [夜蛇]',
+    61: 'boa constrictor, Constrictor constrictor [大蟒蛇]',
+    62: 'rock python, rock snake, Python sebae [岩石蟒蛇,岩蛇,蟒蛇]',
+    63: 'Indian cobra, Naja naja [印度眼镜蛇]',
+    64: 'green mamba [绿曼巴]',
+    65: 'sea snake [海蛇]',
+    66: 'horned viper, cerastes, sand viper, horned asp, Cerastes cornutus [角腹蛇]',
+    67: 'diamondback, diamondback rattlesnake, Crotalus adamanteus [菱纹响尾蛇]',
+    68: 'sidewinder, horned rattlesnake, Crotalus cerastes [角响尾蛇]',
+    69: 'trilobite [三叶虫]',
+    70: 'harvestman, daddy longlegs, Phalangium opilio [盲蜘蛛]',
+    71: 'scorpion [蝎子]',
+    72: 'black and gold garden spider, Argiope aurantia [黑金花园蜘蛛]',
+    73: 'barn spider, Araneus cavaticus [谷仓蜘蛛]',
+    74: 'garden spider, Aranea diademata [花园蜘蛛]',
+    75: 'black widow, Latrodectus mactans [黑寡妇蜘蛛]',
+    76: 'tarantula [狼蛛]',
+    77: 'wolf spider, hunting spider [狼蜘蛛,狩猎蜘蛛]',
+    78: 'tick [壁虱]',
+    79: 'centipede [蜈蚣]',
+    80: 'black grouse [黑松鸡]',
+    81: 'ptarmigan [松鸡,雷鸟]',
+    82: 'ruffed grouse, partridge, Bonasa umbellus [披肩鸡,披肩榛鸡]',
+    83: 'prairie chicken, prairie grouse, prairie fowl [草原鸡,草原松鸡]',
+    84: 'peacock [孔雀]',
+    85: 'quail [鹌鹑]',
+    86: 'partridge [鹧鸪]',
+    87: 'African grey, African gray, Psittacus erithacus [非洲灰鹦鹉]',
+    88: 'macaw [金刚鹦鹉]',
+    89: 'sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita [硫冠鹦鹉]',
+    90: 'lorikeet [短尾鹦鹉]',
+    91: 'coucal [褐翅鸦鹃]',
+    92: 'bee eater [蜜蜂]',
+    93: 'hornbill [犀鸟]',
+    94: 'hummingbird [蜂鸟]',
+    95: 'jacamar [鹟䴕]',
+    96: 'toucan [犀鸟]',
+    97: 'drake [野鸭]',
+    98: 'red-breasted merganser, Mergus serrator [红胸秋沙鸭]',
+    99: 'goose [鹅]',
+    100: 'black swan, Cygnus atratus [黑天鹅]',
+    101: 'tusker [大象]',
+    102: 'echidna, spiny anteater, anteater [针鼹鼠]',
+    103: 'platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus [鸭嘴兽]',
+    104: 'wallaby, brush kangaroo [沙袋鼠]',
+    105: 'koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus [考拉,考拉熊]',
+    106: 'wombat [袋熊]',
+    107: 'jellyfish [水母]',
+    108: 'sea anemone, anemone [海葵]',
+    109: 'brain coral [脑珊瑚]',
+    110: 'flatworm, platyhelminth [扁形虫扁虫]',
+    111: 'nematode, nematode worm, roundworm [线虫,蛔虫]',
+    112: 'conch [海螺]',
+    113: 'snail [蜗牛]',
+    114: 'slug [鼻涕虫]',
+    115: 'sea slug, nudibranch [海参]',
+    116: 'chiton, coat-of-mail shell, sea cradle, polyplacophore [石鳖]',
+    117: 'chambered nautilus, pearly nautilus, nautilus [鹦鹉螺]',
+    118: 'Dungeness crab, Cancer magister [珍宝蟹]',
+    119: 'rock crab, Cancer irroratus [石蟹]',
+    120: 'fiddler crab [招潮蟹]',
+    121: 'king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica [帝王蟹,阿拉斯加蟹,阿拉斯加帝王蟹]',
+    122: 'American lobster, Northern lobster, Maine lobster, Homarus americanus [美国龙虾,缅因州龙虾]',
+    123: 'spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish [大螯虾]',
+    124: 'crayfish, crawfish, crawdad, crawdaddy [小龙虾]',
+    125: 'hermit crab [寄居蟹]',
+    126: 'isopod [等足目动物(明虾和螃蟹近亲)]',
+    127: 'white stork, Ciconia ciconia [白鹳]',
+    128: 'black stork, Ciconia nigra [黑鹳]',
+    129: 'spoonbill [鹭]',
+    130: 'flamingo [火烈鸟]',
+    131: 'little blue heron, Egretta caerulea [小蓝鹭]',
+    132: 'American egret, great white heron, Egretta albus [美国鹭,大白鹭]',
+    133: 'bittern [麻鸦]',
+    134: 'crane [鹤]',
+    135: 'limpkin, Aramus pictus [秧鹤]',
+    136: 'European gallinule, Porphyrio porphyrio [欧洲水鸡,紫水鸡]',
+    137: 'American coot, marsh hen, mud hen, water hen, Fulica americana [沼泽泥母鸡,水母鸡]',
+    138: 'bustard [鸨]',
+    139: 'ruddy turnstone, Arenaria interpres [红翻石鹬]',
+    140: 'red-backed sandpiper, dunlin, Erolia alpina [红背鹬,黑腹滨鹬]',
+    141: 'redshank, Tringa totanus [红脚鹬]',
+    142: 'dowitcher [半蹼鹬]',
+    143: 'oystercatcher, oyster catcher [蛎鹬]',
+    144: 'pelican [鹈鹕]',
+    145: 'king penguin, Aptenodytes patagonica [国王企鹅]',
+    146: 'albatross, mollymawk [信天翁,大海鸟]',
+    147: 'grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus [灰鲸]',
+    148: 'killer whale, killer, orca, grampus, sea wolf, Orcinus orca [杀人鲸,逆戟鲸,虎鲸]',
+    149: 'dugong, Dugong dugon [海牛]',
+    150: 'sea lion [海狮]',
+    151: 'Chihuahua [奇瓦瓦]',
+    152: 'Japanese spaniel [日本猎犬]',
+    153: 'Maltese dog, Maltese terrier, Maltese [马尔济斯犬]',
+    154: 'Pekinese, Pekingese, Peke [狮子狗]',
+    155: 'Shih-Tzu [西施犬]',
+    156: 'Blenheim spaniel [布莱尼姆猎犬]',
+    157: 'papillon [巴比狗]',
+    158: 'toy terrier [玩具犬]',
+    159: 'Rhodesian ridgeback [罗得西亚长背猎狗]',
+    160: 'Afghan hound, Afghan [阿富汗猎犬]',
+    161: 'basset, basset hound [猎犬]',
+    162: 'beagle [比格犬,猎兔犬]',
+    163: 'bloodhound, sleuthhound [侦探犬]',
+    164: 'bluetick [蓝色快狗]',
+    165: 'black-and-tan coonhound [黑褐猎浣熊犬]',
+    166: 'Walker hound, Walker foxhound [沃克猎犬]',
+    167: 'English foxhound [英国猎狐犬]',
+    168: 'redbone [美洲赤狗]',
+    169: 'borzoi, Russian wolfhound [俄罗斯猎狼犬]',
+    170: 'Irish wolfhound [爱尔兰猎狼犬]',
+    171: 'Italian greyhound [意大利灰狗]',
+    172: 'whippet [惠比特犬]',
+    173: 'Ibizan hound, Ibizan Podenco [依比沙猎犬]',
+    174: 'Norwegian elkhound, elkhound [挪威猎犬]',
+    175: 'otterhound, otter hound [奥达猎犬,水獭猎犬]',
+    176: 'Saluki, gazelle hound [沙克犬,瞪羚猎犬]',
+    177: 'Scottish deerhound, deerhound [苏格兰猎鹿犬,猎鹿犬]',
+    178: 'Weimaraner [威玛猎犬]',
+    179: 'Staffordshire bullterrier, Staffordshire bull terrier [斯塔福德郡牛头梗,斯塔福德郡斗牛梗]',
+    180: 'American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier [美国斯塔福德郡梗,美国比特斗牛梗,斗牛梗]',
+    181: 'Bedlington terrier [贝德灵顿梗]',
+    182: 'Border terrier [边境梗]',
+    183: 'Kerry blue terrier [凯丽蓝梗]',
+    184: 'Irish terrier [爱尔兰梗]',
+    185: 'Norfolk terrier [诺福克梗]',
+    186: 'Norwich terrier [诺维奇梗]',
+    187: 'Yorkshire terrier [约克郡梗]',
+    188: 'wire-haired fox terrier [刚毛猎狐梗]',
+    189: 'Lakeland terrier [莱克兰梗]',
+    190: 'Sealyham terrier, Sealyham [锡利哈姆梗]',
+    191: 'Airedale, Airedale terrier [艾尔谷犬]',
+    192: 'cairn, cairn terrier [凯恩梗]',
+    193: 'Australian terrier [澳大利亚梗]',
+    194: 'Dandie Dinmont, Dandie Dinmont terrier [丹迪丁蒙梗]',
+    195: 'Boston bull, Boston terrier [波士顿梗]',
+    196: 'miniature schnauzer [迷你雪纳瑞犬]',
+    197: 'giant schnauzer [巨型雪纳瑞犬]',
+    198: 'standard schnauzer [标准雪纳瑞犬]',
+    199: 'Scotch terrier, Scottish terrier, Scottie [苏格兰梗]',
+    200: 'Tibetan terrier, chrysanthemum dog [西藏梗,菊花狗]',
+    201: 'silky terrier, Sydney silky [丝毛梗]',
+    202: 'soft-coated wheaten terrier [软毛麦色梗]',
+    203: 'West Highland white terrier [西高地白梗]',
+    204: 'Lhasa, Lhasa apso [拉萨阿普索犬]',
+    205: 'flat-coated retriever [平毛寻回犬]',
+    206: 'curly-coated retriever [卷毛寻回犬]',
+    207: 'golden retriever [金毛猎犬]',
+    208: 'Labrador retriever [拉布拉多猎犬]',
+    209: 'Chesapeake Bay retriever [乞沙比克猎犬]',
+    210: 'German short-haired pointer [德国短毛猎犬]',
+    211: 'vizsla, Hungarian pointer [维兹拉犬]',
+    212: 'English setter [英国谍犬]',
+    213: 'Irish setter, red setter [爱尔兰雪达犬,红色猎犬]',
+    214: 'Gordon setter [戈登雪达犬]',
+    215: 'Brittany spaniel [布列塔尼犬猎犬]',
+    216: 'clumber, clumber spaniel [黄毛,黄毛猎犬]',
+    217: 'English springer, English springer spaniel [英国史宾格犬]',
+    218: 'Welsh springer spaniel [威尔士史宾格犬]',
+    219: 'cocker spaniel, English cocker spaniel, cocker [可卡犬,英国可卡犬]',
+    220: 'Sussex spaniel [萨塞克斯猎犬]',
+    221: 'Irish water spaniel [爱尔兰水猎犬]',
+    222: 'kuvasz [哥威斯犬]',
+    223: 'schipperke [舒柏奇犬]',
+    224: 'groenendael [比利时牧羊犬]',
+    225: 'malinois [马里努阿犬]',
+    226: 'briard [伯瑞犬]',
+    227: 'kelpie [凯尔皮犬]',
+    228: 'komondor [匈牙利牧羊犬]',
+    229: 'Old English sheepdog, bobtail [老英国牧羊犬]',
+    230: 'Shetland sheepdog, Shetland sheep dog, Shetland [喜乐蒂牧羊犬]',
+    231: 'collie [牧羊犬]',
+    232: 'Border collie [边境牧羊犬]',
+    233: 'Bouvier des Flandres, Bouviers des Flandres [法兰德斯牧牛狗]',
+    234: 'Rottweiler [罗特韦尔犬]',
+    235: 'German shepherd, German shepherd dog, German police dog, alsatian [德国牧羊犬,德国警犬,阿尔萨斯]',
+    236: 'Doberman, Doberman pinscher [多伯曼犬,杜宾犬]',
+    237: 'miniature pinscher [迷你杜宾犬]',
+    238: 'Greater Swiss Mountain dog [大瑞士山地犬]',
+    239: 'Bernese mountain dog [伯恩山犬]',
+    240: 'Appenzeller [Appenzeller狗]',
+    241: 'EntleBucher [EntleBucher狗]',
+    242: 'boxer [拳师狗]',
+    243: 'bull mastiff [斗牛獒]',
+    244: 'Tibetan mastiff [藏獒]',
+    245: 'French bulldog [法国斗牛犬]',
+    246: 'Great Dane [大丹犬]',
+    247: 'Saint Bernard, St Bernard [圣伯纳德狗]',
+    248: 'Eskimo dog, husky [爱斯基摩犬,哈士奇]',
+    249: 'malamute, malemute, Alaskan malamute [雪橇犬,阿拉斯加爱斯基摩狗]',
+    250: 'Siberian husky [哈士奇]',
+    251: 'dalmatian, coach dog, carriage dog [达尔马提亚,教练车狗]',
+    252: 'affenpinscher, monkey pinscher, monkey dog [狮毛狗]',
+    253: 'basenji [巴辛吉狗]',
+    254: 'pug, pug-dog [哈巴狗,狮子狗]',
+    255: 'Leonberg [莱昂贝格狗]',
+    256: 'Newfoundland, Newfoundland dog [纽芬兰岛狗]',
+    257: 'Great Pyrenees [大白熊犬]',
+    258: 'Samoyed, Samoyede [萨摩耶犬]',
+    259: 'Pomeranian [博美犬]',
+    260: 'chow, chow chow [松狮,松狮]',
+    261: 'keeshond [荷兰卷尾狮毛狗]',
+    262: 'Brabancon griffon [布鲁塞尔格林芬犬]',
+    263: 'Pembroke, Pembroke Welsh corgi [彭布洛克威尔士科基犬]',
+    264: 'Cardigan, Cardigan Welsh corgi [威尔士柯基犬]',
+    265: 'toy poodle [玩具贵宾犬]',
+    266: 'miniature poodle [迷你贵宾犬]',
+    267: 'standard poodle [标准贵宾犬]',
+    268: 'Mexican hairless [墨西哥无毛犬]',
+    269: 'timber wolf, grey wolf, gray wolf, Canis lupus [灰狼]',
+    270: 'white wolf, Arctic wolf, Canis lupus tundrarum [白狼,北极狼]',
+    271: 'red wolf, maned wolf, Canis rufus, Canis niger [红太狼,鬃狼,犬犬鲁弗斯]',
+    272: 'coyote, prairie wolf, brush wolf, Canis latrans [狼,草原狼,刷狼,郊狼]',
+    273: 'dingo, warrigal, warragal, Canis dingo [澳洲野狗,澳大利亚野犬]',
+    274: 'dhole, Cuon alpinus [豺]',
+    275: 'African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus [非洲猎犬,土狼犬]',
+    276: 'hyena, hyaena [鬣狗]',
+    277: 'red fox, Vulpes vulpes [红狐狸]',
+    278: 'kit fox, Vulpes macrotis [沙狐]',
+    279: 'Arctic fox, white fox, Alopex lagopus [北极狐狸,白狐狸]',
+    280: 'grey fox, gray fox, Urocyon cinereoargenteus [灰狐狸]',
+    281: 'tabby, tabby cat [虎斑猫]',
+    282: 'tiger cat [山猫,虎猫]',
+    283: 'Persian cat [波斯猫]',
+    284: 'Siamese cat, Siamese [暹罗暹罗猫,]',
+    285: 'Egyptian cat [埃及猫]',
+    286: 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor [美洲狮,美洲豹]',
+    287: 'lynx, catamount [猞猁,山猫]',
+    288: 'leopard, Panthera pardus [豹子]',
+    289: 'snow leopard, ounce, Panthera uncia [雪豹]',
+    290: 'jaguar, panther, Panthera onca, Felis onca [美洲虎]',
+    291: 'lion, king of beasts, Panthera leo [狮子]',
+    292: 'tiger, Panthera tigris [老虎]',
+    293: 'cheetah, chetah, Acinonyx jubatus [猎豹]',
+    294: 'brown bear, bruin, Ursus arctos [棕熊]',
+    295: 'American black bear, black bear, Ursus americanus, Euarctos americanus [美洲黑熊]',
+    296: 'ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus [冰熊,北极熊]',
+    297: 'sloth bear, Melursus ursinus, Ursus ursinus [懒熊]',
+    298: 'mongoose [猫鼬]',
+    299: 'meerkat, mierkat [猫鼬,海猫]',
+    300: 'tiger beetle [虎甲虫]',
+    301: 'ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle [瓢虫]',
+    302: 'ground beetle, carabid beetle [土鳖虫]',
+    303: 'long-horned beetle, longicorn, longicorn beetle [天牛]',
+    304: 'leaf beetle, chrysomelid [龟甲虫]',
+    305: 'dung beetle [粪甲虫]',
+    306: 'rhinoceros beetle [犀牛甲虫]',
+    307: 'weevil [象甲]',
+    308: 'fly [苍蝇]',
+    309: 'bee [蜜蜂]',
+    310: 'ant, emmet, pismire [蚂蚁]',
+    311: 'grasshopper, hopper [蚱蜢]',
+    312: 'cricket [蟋蟀]',
+    313: 'walking stick, walkingstick, stick insect [竹节虫]',
+    314: 'cockroach, roach [蟑螂]',
+    315: 'mantis, mantid [螳螂]',
+    316: 'cicada, cicala [蝉]',
+    317: 'leafhopper [叶蝉]',
+    318: 'lacewing, lacewing fly [草蜻蛉]',
+    319: 'dragonfly, darning needle, devils darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk [蜻蜓]',
+    320: 'damselfly [豆娘,蜻蛉]',
+    321: 'admiral [优红蛱蝶]',
+    322: 'ringlet, ringlet butterfly [小环蝴蝶]',
+    323: 'monarch, monarch butterfly, milkweed butterfly, Danaus plexippus [君主蝴蝶,大斑蝶]',
+    324: 'cabbage butterfly [菜粉蝶]',
+    325: 'sulphur butterfly, sulfur butterfly [白蝴蝶]',
+    326: 'lycaenid, lycaenid butterfly [灰蝶]',
+    327: 'starfish, sea star [海星]',
+    328: 'sea urchin [海胆]',
+    329: 'sea cucumber, holothurian [海参,海黄瓜]',
+    330: 'wood rabbit, cottontail, cottontail rabbit [野兔]',
+    331: 'hare [兔]',
+    332: 'Angora, Angora rabbit [安哥拉兔]',
+    333: 'hamster [仓鼠]',
+    334: 'porcupine, hedgehog [刺猬,豪猪,]',
+    335: 'fox squirrel, eastern fox squirrel, Sciurus niger [黑松鼠]',
+    336: 'marmot [土拨鼠]',
+    337: 'beaver [海狸]',
+    338: 'guinea pig, Cavia cobaya [豚鼠,豚鼠]',
+    339: 'sorrel [栗色马]',
+    340: 'zebra [斑马]',
+    341: 'hog, pig, grunter, squealer, Sus scrofa [猪]',
+    342: 'wild boar, boar, Sus scrofa [野猪]',
+    343: 'warthog [疣猪]',
+    344: 'hippopotamus, hippo, river horse, Hippopotamus amphibius [河马]',
+    345: 'ox [牛]',
+    346: 'water buffalo, water ox, Asiatic buffalo, Bubalus bubalis [水牛,亚洲水牛]',
+    347: 'bison [野牛]',
+    348: 'ram, tup [公羊]',
+    349: 'bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis [大角羊,洛矶山大角羊]',
+    350: 'ibex, Capra ibex [山羊]',
+    351: 'hartebeest [狷羚]',
+    352: 'impala, Aepyceros melampus [黑斑羚]',
+    353: 'gazelle [瞪羚]',
+    354: 'Arabian camel, dromedary, Camelus dromedarius [阿拉伯单峰骆驼,骆驼]',
+    355: 'llama [骆驼]',
+    356: 'weasel [黄鼠狼]',
+    357: 'mink [水貂]',
+    358: 'polecat, fitch, foulmart, foumart, Mustela putorius [臭猫]',
+    359: 'black-footed ferret, ferret, Mustela nigripes [黑足鼬]',
+    360: 'otter [水獭]',
+    361: 'skunk, polecat, wood pussy [臭鼬,木猫]',
+    362: 'badger [獾]',
+    363: 'armadillo [犰狳]',
+    364: 'three-toed sloth, ai, Bradypus tridactylus [树懒]',
+    365: 'orangutan, orang, orangutang, Pongo pygmaeus [猩猩,婆罗洲猩猩]',
+    366: 'gorilla, Gorilla gorilla [大猩猩]',
+    367: 'chimpanzee, chimp, Pan troglodytes [黑猩猩]',
+    368: 'gibbon, Hylobates lar [长臂猿]',
+    369: 'siamang, Hylobates syndactylus, Symphalangus syndactylus [合趾猿长臂猿,合趾猿]',
+    370: 'guenon, guenon monkey [长尾猴]',
+    371: 'patas, hussar monkey, Erythrocebus patas [赤猴]',
+    372: 'baboon [狒狒]',
+    373: 'macaque [恒河猴,猕猴]',
+    374: 'langur [白头叶猴]',
+    375: 'colobus, colobus monkey [疣猴]',
+    376: 'proboscis monkey, Nasalis larvatus [长鼻猴]',
+    377: 'marmoset [狨(美洲产小型长尾猴)]',
+    378: 'capuchin, ringtail, Cebus capucinus [卷尾猴]',
+    379: 'howler monkey, howler [吼猴]',
+    380: 'titi, titi monkey [伶猴]',
+    381: 'spider monkey, Ateles geoffroyi [蜘蛛猴]',
+    382: 'squirrel monkey, Saimiri sciureus [松鼠猴]',
+    383: 'Madagascar cat, ring-tailed lemur, Lemur catta [马达加斯加环尾狐猴,鼠狐猴]',
+    384: 'indri, indris, Indri indri, Indri brevicaudatus [大狐猴,马达加斯加大狐猴]',
+    385: 'Indian elephant, Elephas maximus [印度大象,亚洲象]',
+    386: 'African elephant, Loxodonta africana [非洲象,非洲象]',
+    387: 'lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens [小熊猫]',
+    388: 'giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca [大熊猫]',
+    389: 'barracouta, snoek [杖鱼]',
+    390: 'eel [鳗鱼]',
+    391: 'coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch [银鲑,银鲑鱼]',
+    392: 'rock beauty, Holocanthus tricolor [三色刺蝶鱼]',
+    393: 'anemone fish [海葵鱼]',
+    394: 'sturgeon [鲟鱼]',
+    395: 'gar, garfish, garpike, billfish, Lepisosteus osseus [雀鳝]',
+    396: 'lionfish [狮子鱼]',
+    397: 'puffer, pufferfish, blowfish, globefish [河豚]',
+    398: 'abacus [算盘]',
+    399: 'abaya [长袍]',
+    400: 'academic gown, academic robe, judge robe [学位袍]',
+    401: 'accordion, piano accordion, squeeze box [手风琴]',
+    402: 'acoustic guitar [原声吉他]',
+    403: 'aircraft carrier, carrier, flattop, attack aircraft carrier [航空母舰]',
+    404: 'airliner [客机]',
+    405: 'airship, dirigible [飞艇]',
+    406: 'altar [祭坛]',
+    407: 'ambulance [救护车]',
+    408: 'amphibian, amphibious vehicle [水陆两用车]',
+    409: 'analog clock [模拟时钟]',
+    410: 'apiary, bee house [蜂房]',
+    411: 'apron [围裙]',
+    412: 'ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin [垃圾桶]',
+    413: 'assault rifle, assault gun [攻击步枪,枪]',
+    414: 'backpack, back pack, knapsack, packsack, rucksack, haversack [背包]',
+    415: 'bakery, bakeshop, bakehouse [面包店,面包铺,]',
+    416: 'balance beam, beam [平衡木]',
+    417: 'balloon [热气球]',
+    418: 'ballpoint, ballpoint pen, ballpen, Biro [圆珠笔]',
+    419: 'Band Aid [创可贴]',
+    420: 'banjo [班卓琴]',
+    421: 'bannister, banister, balustrade, balusters, handrail [栏杆,楼梯扶手]',
+    422: 'barbell [杠铃]',
+    423: 'barber chair [理发师的椅子]',
+    424: 'barbershop [理发店]',
+    425: 'barn [牲口棚]',
+    426: 'barometer [晴雨表]',
+    427: 'barrel, cask [圆筒]',
+    428: 'barrow, garden cart, lawn cart, wheelbarrow [园地小车,手推车]',
+    429: 'baseball [棒球]',
+    430: 'basketball [篮球]',
+    431: 'bassinet [婴儿床]',
+    432: 'bassoon [巴松管,低音管]',
+    433: 'bathing cap, swimming cap [游泳帽]',
+    434: 'bath towel [沐浴毛巾]',
+    435: 'bathtub, bathing tub, bath, tub [浴缸,澡盆]',
+    436: 'beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon [沙滩车,旅行车]',
+    437: 'beacon, lighthouse, beacon light, pharos [灯塔]',
+    438: 'beaker [高脚杯]',
+    439: 'bearskin, busby, shako [熊皮高帽]',
+    440: 'beer bottle [啤酒瓶]',
+    441: 'beer glass [啤酒杯]',
+    442: 'bell cote, bell cot [钟塔]',
+    443: 'bib [(小儿用的)围嘴]',
+    444: 'bicycle-built-for-two, tandem bicycle, tandem [串联自行车,]',
+    445: 'bikini, two-piece [比基尼]',
+    446: 'binder, ring-binder [装订册]',
+    447: 'binoculars, field glasses, opera glasses [双筒望远镜]',
+    448: 'birdhouse [鸟舍]',
+    449: 'boathouse [船库]',
+    450: 'bobsled, bobsleigh, bob [雪橇]',
+    451: 'bolo tie, bolo, bola tie, bola [饰扣式领带]',
+    452: 'bonnet, poke bonnet [阔边女帽]',
+    453: 'bookcase [书橱]',
+    454: 'bookshop, bookstore, bookstall [书店,书摊]',
+    455: 'bottlecap [瓶盖]',
+    456: 'bow [弓箭]',
+    457: 'bow tie, bow-tie, bowtie [蝴蝶结领结]',
+    458: 'brass, memorial tablet, plaque [铜制牌位]',
+    459: 'brassiere, bra, bandeau [奶罩]',
+    460: 'breakwater, groin, groyne, mole, bulwark, seawall, jetty [防波堤,海堤]',
+    461: 'breastplate, aegis, egis [铠甲]',
+    462: 'broom [扫帚]',
+    463: 'bucket, pail [桶]',
+    464: 'buckle [扣环]',
+    465: 'bulletproof vest [防弹背心]',
+    466: 'bullet train, bullet [动车,子弹头列车]',
+    467: 'butcher shop, meat market [肉铺,肉菜市场]',
+    468: 'cab, hack, taxi, taxicab [出租车]',
+    469: 'caldron, cauldron [大锅]',
+    470: 'candle, taper, wax light [蜡烛]',
+    471: 'cannon [大炮]',
+    472: 'canoe [独木舟]',
+    473: 'can opener, tin opener [开瓶器,开罐器]',
+    474: 'cardigan [开衫]',
+    475: 'car mirror [车镜]',
+    476: 'carousel, carrousel, merry-go-round, roundabout, whirligig [旋转木马]',
+    477: 'carpenters kit, tool kit [木匠的工具包,工具包]',
+    478: 'carton [纸箱]',
+    479: 'car wheel [车轮]',
+    480: 'cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM [取款机,自动取款机]',
+    481: 'cassette [盒式录音带]',
+    482: 'cassette player [卡带播放器]',
+    483: 'castle [城堡]',
+    484: 'catamaran [双体船]',
+    485: 'CD player [CD播放器]',
+    486: 'cello, violoncello [大提琴]',
+    487: 'cellular telephone, cellular phone, cellphone, cell, mobile phone [移动电话,手机]',
+    488: 'chain [铁链]',
+    489: 'chainlink fence [围栏]',
+    490: 'chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour [链甲]',
+    491: 'chain saw, chainsaw [电锯,油锯]',
+    492: 'chest [箱子]',
+    493: 'chiffonier, commode [衣柜,洗脸台]',
+    494: 'chime, bell, gong [编钟,钟,锣]',
+    495: 'china cabinet, china closet [中国橱柜]',
+    496: 'Christmas stocking [圣诞袜]',
+    497: 'church, church building [教堂,教堂建筑]',
+    498: 'cinema, movie theater, movie theatre, movie house, picture palace [电影院,剧场]',
+    499: 'cleaver, meat cleaver, chopper [切肉刀,菜刀]',
+    500: 'cliff dwelling [悬崖屋]',
+    501: 'cloak [斗篷]',
+    502: 'clog, geta, patten, sabot [木屐,木鞋]',
+    503: 'cocktail shaker [鸡尾酒调酒器]',
+    504: 'coffee mug [咖啡杯]',
+    505: 'coffeepot [咖啡壶]',
+    506: 'coil, spiral, volute, whorl, helix [螺旋结构(楼梯)]',
+    507: 'combination lock [组合锁]',
+    508: 'computer keyboard, keypad [电脑键盘,键盘]',
+    509: 'confectionery, confectionary, candy store [糖果,糖果店]',
+    510: 'container ship, containership, container vessel [集装箱船]',
+    511: 'convertible [敞篷车]',
+    512: 'corkscrew, bottle screw [开瓶器,瓶螺杆]',
+    513: 'cornet, horn, trumpet, trump [短号,喇叭]',
+    514: 'cowboy boot [牛仔靴]',
+    515: 'cowboy hat, ten-gallon hat [牛仔帽]',
+    516: 'cradle [摇篮]',
+    517: 'crane [起重机]',
+    518: 'crash helmet [头盔]',
+    519: 'crate [板条箱]',
+    520: 'crib, cot [小儿床]',
+    521: 'Crock Pot [砂锅]',
+    522: 'croquet ball [槌球]',
+    523: 'crutch [拐杖]',
+    524: 'cuirass [胸甲]',
+    525: 'dam, dike, dyke [大坝,堤防]',
+    526: 'desk [书桌]',
+    527: 'desktop computer [台式电脑]',
+    528: 'dial telephone, dial phone [有线电话]',
+    529: 'diaper, nappy, napkin [尿布湿]',
+    530: 'digital clock [数字时钟]',
+    531: 'digital watch [数字手表]',
+    532: 'dining table, board [餐桌板]',
+    533: 'dishrag, dishcloth [抹布]',
+    534: 'dishwasher, dish washer, dishwashing machine [洗碗机,洗碟机]',
+    535: 'disk brake, disc brake [盘式制动器]',
+    536: 'dock, dockage, docking facility [码头,船坞,码头设施]',
+    537: 'dogsled, dog sled, dog sleigh [狗拉雪橇]',
+    538: 'dome [圆顶]',
+    539: 'doormat, welcome mat [门垫,垫子]',
+    540: 'drilling platform, offshore rig [钻井平台,海上钻井]',
+    541: 'drum, membranophone, tympan [鼓,乐器,鼓膜]',
+    542: 'drumstick [鼓槌]',
+    543: 'dumbbell [哑铃]',
+    544: 'Dutch oven [荷兰烤箱]',
+    545: 'electric fan, blower [电风扇,鼓风机]',
+    546: 'electric guitar [电吉他]',
+    547: 'electric locomotive [电力机车]',
+    548: 'entertainment center [电视,电视柜]',
+    549: 'envelope [信封]',
+    550: 'espresso maker [浓缩咖啡机]',
+    551: 'face powder [扑面粉]',
+    552: 'feather boa, boa [女用长围巾]',
+    553: 'file, file cabinet, filing cabinet [文件,文件柜,档案柜]',
+    554: 'fireboat [消防船]',
+    555: 'fire engine, fire truck [消防车]',
+    556: 'fire screen, fireguard [火炉栏]',
+    557: 'flagpole, flagstaff [旗杆]',
+    558: 'flute, transverse flute [长笛]',
+    559: 'folding chair [折叠椅]',
+    560: 'football helmet [橄榄球头盔]',
+    561: 'forklift [叉车]',
+    562: 'fountain [喷泉]',
+    563: 'fountain pen [钢笔]',
+    564: 'four-poster [有四根帷柱的床]',
+    565: 'freight car [运货车厢]',
+    566: 'French horn, horn [圆号,喇叭]',
+    567: 'frying pan, frypan, skillet [煎锅]',
+    568: 'fur coat [裘皮大衣]',
+    569: 'garbage truck, dustcart [垃圾车]',
+    570: 'gasmask, respirator, gas helmet [防毒面具,呼吸器]',
+    571: 'gas pump, gasoline pump, petrol pump, island dispenser [汽油泵]',
+    572: 'goblet [高脚杯]',
+    573: 'go-kart [卡丁车]',
+    574: 'golf ball [高尔夫球]',
+    575: 'golfcart, golf cart [高尔夫球车]',
+    576: 'gondola [狭长小船]',
+    577: 'gong, tam-tam [锣]',
+    578: 'gown [礼服]',
+    579: 'grand piano, grand [钢琴]',
+    580: 'greenhouse, nursery, glasshouse [温室,苗圃]',
+    581: 'grille, radiator grille [散热器格栅]',
+    582: 'grocery store, grocery, food market, market [杂货店,食品市场]',
+    583: 'guillotine [断头台]',
+    584: 'hair slide [小发夹]',
+    585: 'hair spray [头发喷雾]',
+    586: 'half track [半履带装甲车]',
+    587: 'hammer [锤子]',
+    588: 'hamper [大篮子]',
+    589: 'hand blower, blow dryer, blow drier, hair dryer, hair drier [手摇鼓风机,吹风机]',
+    590: 'hand-held computer, hand-held microcomputer [手提电脑]',
+    591: 'handkerchief, hankie, hanky, hankey [手帕]',
+    592: 'hard disc, hard disk, fixed disk [硬盘]',
+    593: 'harmonica, mouth organ, harp, mouth harp [口琴,口风琴]',
+    594: 'harp [竖琴]',
+    595: 'harvester, reaper [收割机]',
+    596: 'hatchet [斧头]',
+    597: 'holster [手枪皮套]',
+    598: 'home theater, home theatre [家庭影院]',
+    599: 'honeycomb [蜂窝]',
+    600: 'hook, claw [钩爪]',
+    601: 'hoopskirt, crinoline [衬裙]',
+    602: 'horizontal bar, high bar [单杠]',
+    603: 'horse cart, horse-cart [马车]',
+    604: 'hourglass [沙漏]',
+    605: 'iPod [手机,iPad]',
+    606: 'iron, smoothing iron [熨斗]',
+    607: 'jack-o-lantern [南瓜灯笼]',
+    608: 'jean, blue jean, denim [牛仔裤,蓝色牛仔裤]',
+    609: 'jeep, landrover [吉普车]',
+    610: 'jersey, T-shirt, tee shirt [运动衫,T恤]',
+    611: 'jigsaw puzzle [拼图]',
+    612: 'jinrikisha, ricksha, rickshaw [人力车]',
+    613: 'joystick [操纵杆]',
+    614: 'kimono [和服]',
+    615: 'knee pad [护膝]',
+    616: 'knot [蝴蝶结]',
+    617: 'lab coat, laboratory coat [大褂,实验室外套]',
+    618: 'ladle [长柄勺]',
+    619: 'lampshade, lamp shade [灯罩]',
+    620: 'laptop, laptop computer [笔记本电脑]',
+    621: 'lawn mower, mower [割草机]',
+    622: 'lens cap, lens cover [镜头盖]',
+    623: 'letter opener, paper knife, paperknife [开信刀,裁纸刀]',
+    624: 'library [图书馆]',
+    625: 'lifeboat [救生艇]',
+    626: 'lighter, light, igniter, ignitor [点火器,打火机]',
+    627: 'limousine, limo [豪华轿车]',
+    628: 'liner, ocean liner [远洋班轮]',
+    629: 'lipstick, lip rouge [唇膏,口红]',
+    630: 'Loafer [平底便鞋]',
+    631: 'lotion [洗剂]',
+    632: 'loudspeaker, speaker, speaker unit, loudspeaker system, speaker system [扬声器]',
+    633: 'loupe, jewelers loupe [放大镜]',
+    634: 'lumbermill, sawmill [锯木厂]',
+    635: 'magnetic compass [磁罗盘]',
+    636: 'mailbag, postbag [邮袋]',
+    637: 'mailbox, letter box [信箱]',
+    638: 'maillot [女游泳衣]',
+    639: 'maillot, tank suit [有肩带浴衣]',
+    640: 'manhole cover [窨井盖]',
+    641: 'maraca [沙球(一种打击乐器)]',
+    642: 'marimba, xylophone [马林巴木琴]',
+    643: 'mask [面膜]',
+    644: 'matchstick [火柴]',
+    645: 'maypole [花柱]',
+    646: 'maze, labyrinth [迷宫]',
+    647: 'measuring cup [量杯]',
+    648: 'medicine chest, medicine cabinet [药箱]',
+    649: 'megalith, megalithic structure [巨石,巨石结构]',
+    650: 'microphone, mike [麦克风]',
+    651: 'microwave, microwave oven [微波炉]',
+    652: 'military uniform [军装]',
+    653: 'milk can [奶桶]',
+    654: 'minibus [迷你巴士]',
+    655: 'miniskirt, mini [迷你裙]',
+    656: 'minivan [面包车]',
+    657: 'missile [导弹]',
+    658: 'mitten [连指手套]',
+    659: 'mixing bowl [搅拌钵]',
+    660: 'mobile home, manufactured home [活动房屋(由汽车拖拉的)]',
+    661: 'Model T [T型发动机小汽车]',
+    662: 'modem [调制解调器]',
+    663: 'monastery [修道院]',
+    664: 'monitor [显示器]',
+    665: 'moped [电瓶车]',
+    666: 'mortar [砂浆]',
+    667: 'mortarboard [学士]',
+    668: 'mosque [清真寺]',
+    669: 'mosquito net [蚊帐]',
+    670: 'motor scooter, scooter [摩托车]',
+    671: 'mountain bike, all-terrain bike, off-roader [山地自行车]',
+    672: 'mountain tent [登山帐]',
+    673: 'mouse, computer mouse [鼠标,电脑鼠标]',
+    674: 'mousetrap [捕鼠器]',
+    675: 'moving van [搬家车]',
+    676: 'muzzle [口套]',
+    677: 'nail [钉子]',
+    678: 'neck brace [颈托]',
+    679: 'necklace [项链]',
+    680: 'nipple [乳头(瓶)]',
+    681: 'notebook, notebook computer [笔记本,笔记本电脑]',
+    682: 'obelisk [方尖碑]',
+    683: 'oboe, hautboy, hautbois [双簧管]',
+    684: 'ocarina, sweet potato [陶笛,卵形笛]',
+    685: 'odometer, hodometer, mileometer, milometer [里程表]',
+    686: 'oil filter [滤油器]',
+    687: 'organ, pipe organ [风琴,管风琴]',
+    688: 'oscilloscope, scope, cathode-ray oscilloscope, CRO [示波器]',
+    689: 'overskirt [罩裙]',
+    690: 'oxcart [牛车]',
+    691: 'oxygen mask [氧气面罩]',
+    692: 'packet [包装]',
+    693: 'paddle, boat paddle [船桨]',
+    694: 'paddlewheel, paddle wheel [明轮,桨轮]',
+    695: 'padlock [挂锁,扣锁]',
+    696: 'paintbrush [画笔]',
+    697: 'pajama, pyjama, pjs, jammies [睡衣]',
+    698: 'palace [宫殿]',
+    699: 'panpipe, pandean pipe, syrinx [排箫,鸣管]',
+    700: 'paper towel [纸巾]',
+    701: 'parachute, chute [降落伞]',
+    702: 'parallel bars, bars [双杠]',
+    703: 'park bench [公园长椅]',
+    704: 'parking meter [停车收费表,停车计时器]',
+    705: 'passenger car, coach, carriage [客车,教练车]',
+    706: 'patio, terrace [露台,阳台]',
+    707: 'pay-phone, pay-station [付费电话]',
+    708: 'pedestal, plinth, footstall [基座,基脚]',
+    709: 'pencil box, pencil case [铅笔盒]',
+    710: 'pencil sharpener [卷笔刀]',
+    711: 'perfume, essence [香水(瓶)]',
+    712: 'Petri dish [培养皿]',
+    713: 'photocopier [复印机]',
+    714: 'pick, plectrum, plectron [拨弦片,拨子]',
+    715: 'pickelhaube [尖顶头盔]',
+    716: 'picket fence, paling [栅栏,栅栏]',
+    717: 'pickup, pickup truck [皮卡,皮卡车]',
+    718: 'pier [桥墩]',
+    719: 'piggy bank, penny bank [存钱罐]',
+    720: 'pill bottle [药瓶]',
+    721: 'pillow [枕头]',
+    722: 'ping-pong ball [乒乓球]',
+    723: 'pinwheel [风车]',
+    724: 'pirate, pirate ship [海盗船]',
+    725: 'pitcher, ewer [水罐]',
+    726: 'plane, carpenters plane, woodworking plane [木工刨]',
+    727: 'planetarium [天文馆]',
+    728: 'plastic bag [塑料袋]',
+    729: 'plate rack [板架]',
+    730: 'plow, plough [犁型铲雪机]',
+    731: 'plunger, plumbers helper [手压皮碗泵]',
+    732: 'Polaroid camera, Polaroid Land camera [宝丽来相机]',
+    733: 'pole [电线杆]',
+    734: 'police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria [警车,巡逻车]',
+    735: 'poncho [雨披]',
+    736: 'pool table, billiard table, snooker table [台球桌]',
+    737: 'pop bottle, soda bottle [充气饮料瓶]',
+    738: 'pot, flowerpot [花盆]',
+    739: 'potters wheel [陶工旋盘]',
+    740: 'power drill [电钻]',
+    741: 'prayer rug, prayer mat [祈祷垫,地毯]',
+    742: 'printer [打印机]',
+    743: 'prison, prison house [监狱]',
+    744: 'projectile, missile [炮弹,导弹]',
+    745: 'projector [投影仪]',
+    746: 'puck, hockey puck [冰球]',
+    747: 'punching bag, punch bag, punching ball, punchball [沙包,吊球]',
+    748: 'purse [钱包]',
+    749: 'quill, quill pen [羽管笔]',
+    750: 'quilt, comforter, comfort, puff [被子]',
+    751: 'racer, race car, racing car [赛车]',
+    752: 'racket, racquet [球拍]',
+    753: 'radiator [散热器]',
+    754: 'radio, wireless [收音机]',
+    755: 'radio telescope, radio reflector [射电望远镜,无线电反射器]',
+    756: 'rain barrel [雨桶]',
+    757: 'recreational vehicle, RV, R.V. [休闲车,房车]',
+    758: 'reel [卷轴,卷筒]',
+    759: 'reflex camera [反射式照相机]',
+    760: 'refrigerator, icebox [冰箱,冰柜]',
+    761: 'remote control, remote [遥控器]',
+    762: 'restaurant, eating house, eating place, eatery [餐厅,饮食店,食堂]',
+    763: 'revolver, six-gun, six-shooter [左轮手枪]',
+    764: 'rifle [步枪]',
+    765: 'rocking chair, rocker [摇椅]',
+    766: 'rotisserie [电转烤肉架]',
+    767: 'rubber eraser, rubber, pencil eraser [橡皮]',
+    768: 'rugby ball [橄榄球]',
+    769: 'rule, ruler [直尺]',
+    770: 'running shoe [跑步鞋]',
+    771: 'safe [保险柜]',
+    772: 'safety pin [安全别针]',
+    773: 'saltshaker, salt shaker [盐瓶(调味用)]',
+    774: 'sandal [凉鞋]',
+    775: 'sarong [纱笼,围裙]',
+    776: 'sax, saxophone [萨克斯管]',
+    777: 'scabbard [剑鞘]',
+    778: 'scale, weighing machine [秤,称重机]',
+    779: 'school bus [校车]',
+    780: 'schooner [帆船]',
+    781: 'scoreboard [记分牌]',
+    782: 'screen, CRT screen [屏幕]',
+    783: 'screw [螺丝]',
+    784: 'screwdriver [螺丝刀]',
+    785: 'seat belt, seatbelt [安全带]',
+    786: 'sewing machine [缝纫机]',
+    787: 'shield, buckler [盾牌,盾牌]',
+    788: 'shoe shop, shoe-shop, shoe store [皮鞋店,鞋店]',
+    789: 'shoji [障子]',
+    790: 'shopping basket [购物篮]',
+    791: 'shopping cart [购物车]',
+    792: 'shovel [铁锹]',
+    793: 'shower cap [浴帽]',
+    794: 'shower curtain [浴帘]',
+    795: 'ski [滑雪板]',
+    796: 'ski mask [滑雪面罩]',
+    797: 'sleeping bag [睡袋]',
+    798: 'slide rule, slipstick [滑尺]',
+    799: 'sliding door [滑动门]',
+    800: 'slot, one-armed bandit [角子老虎机]',
+    801: 'snorkel [潜水通气管]',
+    802: 'snowmobile [雪橇]',
+    803: 'snowplow, snowplough [扫雪机,扫雪机]',
+    804: 'soap dispenser [皂液器]',
+    805: 'soccer ball [足球]',
+    806: 'sock [袜子]',
+    807: 'solar dish, solar collector, solar furnace [碟式太阳能,太阳能集热器,太阳能炉]',
+    808: 'sombrero [宽边帽]',
+    809: 'soup bowl [汤碗]',
+    810: 'space bar [空格键]',
+    811: 'space heater [空间加热器]',
+    812: 'space shuttle [航天飞机]',
+    813: 'spatula [铲(搅拌或涂敷用的)]',
+    814: 'speedboat [快艇]',
+    815: 'spider web, spiders web [蜘蛛网]',
+    816: 'spindle [纺锤,纱锭]',
+    817: 'sports car, sport car [跑车]',
+    818: 'spotlight, spot [聚光灯]',
+    819: 'stage [舞台]',
+    820: 'steam locomotive [蒸汽机车]',
+    821: 'steel arch bridge [钢拱桥]',
+    822: 'steel drum [钢滚筒]',
+    823: 'stethoscope [听诊器]',
+    824: 'stole [女用披肩]',
+    825: 'stone wall [石头墙]',
+    826: 'stopwatch, stop watch [秒表]',
+    827: 'stove [火炉]',
+    828: 'strainer [过滤器]',
+    829: 'streetcar, tram, tramcar, trolley, trolley car [有轨电车,电车]',
+    830: 'stretcher [担架]',
+    831: 'studio couch, day bed [沙发床]',
+    832: 'stupa, tope [佛塔]',
+    833: 'submarine, pigboat, sub, U-boat [潜艇,潜水艇]',
+    834: 'suit, suit of clothes [套装,衣服]',
+    835: 'sundial [日晷]',
+    836: 'sunglass [太阳镜]',
+    837: 'sunglasses, dark glasses, shades [太阳镜,墨镜]',
+    838: 'sunscreen, sunblock, sun blocker [防晒霜,防晒剂]',
+    839: 'suspension bridge [悬索桥]',
+    840: 'swab, swob, mop [拖把]',
+    841: 'sweatshirt [运动衫]',
+    842: 'swimming trunks, bathing trunks [游泳裤]',
+    843: 'swing [秋千]',
+    844: 'switch, electric switch, electrical switch [开关,电器开关]',
+    845: 'syringe [注射器]',
+    846: 'table lamp [台灯]',
+    847: 'tank, army tank, armored combat vehicle, armoured combat vehicle [坦克,装甲战车,装甲战斗车辆]',
+    848: 'tape player [磁带播放器]',
+    849: 'teapot [茶壶]',
+    850: 'teddy, teddy bear [泰迪,泰迪熊]',
+    851: 'television, television system [电视]',
+    852: 'tennis ball [网球]',
+    853: 'thatch, thatched roof [茅草,茅草屋顶]',
+    854: 'theater curtain, theatre curtain [幕布,剧院的帷幕]',
+    855: 'thimble [顶针]',
+    856: 'thresher, thrasher, threshing machine [脱粒机]',
+    857: 'throne [宝座]',
+    858: 'tile roof [瓦屋顶]',
+    859: 'toaster [烤面包机]',
+    860: 'tobacco shop, tobacconist shop, tobacconist [烟草店,烟草]',
+    861: 'toilet seat [马桶]',
+    862: 'torch [火炬]',
+    863: 'totem pole [图腾柱]',
+    864: 'tow truck, tow car, wrecker [拖车,牵引车,清障车]',
+    865: 'toyshop [玩具店]',
+    866: 'tractor [拖拉机]',
+    867: 'trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi [拖车,铰接式卡车]',
+    868: 'tray [托盘]',
+    869: 'trench coat [风衣]',
+    870: 'tricycle, trike, velocipede [三轮车]',
+    871: 'trimaran [三体船]',
+    872: 'tripod [三脚架]',
+    873: 'triumphal arch [凯旋门]',
+    874: 'trolleybus, trolley coach, trackless trolley [无轨电车]',
+    875: 'trombone [长号]',
+    876: 'tub, vat [浴盆,浴缸]',
+    877: 'turnstile [旋转式栅门]',
+    878: 'typewriter keyboard [打字机键盘]',
+    879: 'umbrella [伞]',
+    880: 'unicycle, monocycle [独轮车]',
+    881: 'upright, upright piano [直立式钢琴]',
+    882: 'vacuum, vacuum cleaner [真空吸尘器]',
+    883: 'vase [花瓶]',
+    884: 'vault [拱顶]',
+    885: 'velvet [天鹅绒]',
+    886: 'vending machine [自动售货机]',
+    887: 'vestment [祭服]',
+    888: 'viaduct [高架桥]',
+    889: 'violin, fiddle [小提琴,小提琴]',
+    890: 'volleyball [排球]',
+    891: 'waffle iron [松饼机]',
+    892: 'wall clock [挂钟]',
+    893: 'wallet, billfold, notecase, pocketbook [钱包,皮夹]',
+    894: 'wardrobe, closet, press [衣柜,壁橱]',
+    895: 'warplane, military plane [军用飞机]',
+    896: 'washbasin, handbasin, washbowl, lavabo, wash-hand basin [洗脸盆,洗手盆]',
+    897: 'washer, automatic washer, washing machine [洗衣机,自动洗衣机]',
+    898: 'water bottle [水瓶]',
+    899: 'water jug [水壶]',
+    900: 'water tower [水塔]',
+    901: 'whiskey jug [威士忌壶]',
+    902: 'whistle [哨子]',
+    903: 'wig [假发]',
+    904: 'window screen [纱窗]',
+    905: 'window shade [百叶窗]',
+    906: 'Windsor tie [温莎领带]',
+    907: 'wine bottle [葡萄酒瓶]',
+    908: 'wing [飞机翅膀,飞机]',
+    909: 'wok [炒菜锅]',
+    910: 'wooden spoon [木制的勺子]',
+    911: 'wool, woolen, woollen [毛织品,羊绒]',
+    912: 'worm fence, snake fence, snake-rail fence, Virginia fence [栅栏,围栏]',
+    913: 'wreck [沉船]',
+    914: 'yawl [双桅船]',
+    915: 'yurt [蒙古包]',
+    916: 'web site, website, internet site, site [网站,互联网网站]',
+    917: 'comic book [漫画]',
+    918: 'crossword puzzle, crossword [纵横字谜]',
+    919: 'street sign [路标]',
+    920: 'traffic light, traffic signal, stoplight [交通信号灯]',
+    921: 'book jacket, dust cover, dust jacket, dust wrapper [防尘罩,书皮]',
+    922: 'menu [菜单]',
+    923: 'plate [盘子]',
+    924: 'guacamole [鳄梨酱]',
+    925: 'consomme [清汤]',
+    926: 'hot pot, hotpot [罐焖土豆烧肉]',
+    927: 'trifle [蛋糕]',
+    928: 'ice cream, icecream [冰淇淋]',
+    929: 'ice lolly, lolly, lollipop, popsicle [雪糕,冰棍,冰棒]',
+    930: 'French loaf [法式面包]',
+    931: 'bagel, beigel [百吉饼]',
+    932: 'pretzel [椒盐脆饼]',
+    933: 'cheeseburger [芝士汉堡]',
+    934: 'hotdog, hot dog, red hot [热狗]',
+    935: 'mashed potato [土豆泥]',
+    936: 'head cabbage [结球甘蓝]',
+    937: 'broccoli [西兰花]',
+    938: 'cauliflower [菜花]',
+    939: 'zucchini, courgette [绿皮密生西葫芦]',
942 |
+
940: 'spaghetti squash [西葫芦]',
|
943 |
+
941: 'acorn squash [小青南瓜]',
|
944 |
+
942: 'butternut squash [南瓜]',
|
945 |
+
943: 'cucumber, cuke [黄瓜]',
|
946 |
+
944: 'artichoke, globe artichoke [朝鲜蓟]',
|
947 |
+
945: 'bell pepper [甜椒]',
|
948 |
+
946: 'cardoon [刺棘蓟]',
|
949 |
+
947: 'mushroom [蘑菇]',
|
950 |
+
948: 'Granny Smith [绿苹果]',
|
951 |
+
949: 'strawberry [草莓]',
|
952 |
+
950: 'orange [橘子]',
|
953 |
+
951: 'lemon [柠檬]',
|
954 |
+
952: 'fig [无花果]',
|
955 |
+
953: 'pineapple, ananas [菠萝]',
|
956 |
+
954: 'banana [香蕉]',
|
957 |
+
955: 'jackfruit, jak, jack [菠萝蜜]',
|
958 |
+
956: 'custard apple [蛋奶冻苹果]',
|
959 |
+
957: 'pomegranate [石榴]',
|
960 |
+
958: 'hay [干草]',
|
961 |
+
959: 'carbonara [烤面条加干酪沙司]',
|
962 |
+
960: 'chocolate sauce, chocolate syrup [巧克力酱,巧克力糖浆]',
|
963 |
+
961: 'dough [面团]',
|
964 |
+
962: 'meat loaf, meatloaf [瑞士肉包,肉饼]',
|
965 |
+
963: 'pizza, pizza pie [披萨,披萨饼]',
|
966 |
+
964: 'potpie [馅饼]',
|
967 |
+
965: 'burrito [卷饼]',
|
968 |
+
966: 'red wine [红葡萄酒]',
|
969 |
+
967: 'espresso [意大利浓咖啡]',
|
970 |
+
968: 'cup [杯子]',
|
971 |
+
969: 'eggnog [蛋酒]',
|
972 |
+
970: 'alp [高山]',
|
973 |
+
971: 'bubble [泡泡]',
|
974 |
+
972: 'cliff, drop, drop-off [悬崖]',
|
975 |
+
973: 'coral reef [珊瑚礁]',
|
976 |
+
974: 'geyser [间歇泉]',
|
977 |
+
975: 'lakeside, lakeshore [湖边,湖岸]',
|
978 |
+
976: 'promontory, headland, head, foreland [海角]',
|
979 |
+
977: 'sandbar, sand bar [沙洲,沙坝]',
|
980 |
+
978: 'seashore, coast, seacoast, sea-coast [海滨,海岸]',
|
981 |
+
979: 'valley, vale [峡谷]',
|
982 |
+
980: 'volcano [火山]',
|
983 |
+
981: 'ballplayer, baseball player [棒球,棒球运动员]',
|
984 |
+
982: 'groom, bridegroom [新郎]',
|
985 |
+
983: 'scuba diver [潜水员]',
|
986 |
+
984: 'rapeseed [油菜]',
|
987 |
+
985: 'daisy [雏菊]',
|
988 |
+
986: 'yellow ladys slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum [杓兰]',
|
989 |
+
987: 'corn [玉米]',
|
990 |
+
988: 'acorn [橡子]',
|
991 |
+
989: 'hip, rose hip, rosehip [玫瑰果]',
|
992 |
+
990: 'buckeye, horse chestnut, conker [七叶树果实]',
|
993 |
+
991: 'coral fungus [珊瑚菌]',
|
994 |
+
992: 'agaric [木耳]',
|
995 |
+
993: 'gyromitra [鹿花菌]',
|
996 |
+
994: 'stinkhorn, carrion fungus [鬼笔菌]',
|
997 |
+
995: 'earthstar [地星(菌类)]',
|
998 |
+
996: 'hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa [多叶奇果菌]',
|
999 |
+
997: 'bolete [牛肝菌]',
|
1000 |
+
998: 'ear, spike, capitulum [玉米穗]',
|
1001 |
+
999: 'toilet tissue, toilet paper, bathroom tissue [卫生纸]',
|
1002 |
+
}
|
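Each value packs the English synset names and a Chinese gloss into one string. A minimal sketch of splitting them apart; the dictionary name imagenet_idx2classname is an assumption, since the actual assignment statement sits earlier in the file:

# Minimal sketch; `imagenet_idx2classname` is an assumed name for the dict above.
entry = imagenet_idx2classname[999]        # "toilet tissue, toilet paper, bathroom tissue [卫生纸]"
english, chinese = entry.rsplit(' [', 1)
chinese = chinese.rstrip(']')
print(english)  # 'toilet tissue, toilet paper, bathroom tissue'
print(chinese)  # '卫生纸'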
models/generate.py
ADDED
@@ -0,0 +1,176 @@
# Modified from:
# gpt-fast: https://github.com/pytorch-labs/gpt-fast/blob/main/generate.py
# DiT: https://github.com/facebookresearch/DiT/blob/main/models.py
import torch
import torch.nn as nn
from torch.nn import functional as F
import torch._dynamo.config
import torch._inductor.config
import copy
# torch._inductor.config.coordinate_descent_tuning = True
# torch._inductor.config.triton.unique_kernel_names = True
# torch._inductor.config.fx_graph_cache = True # Experimental feature to reduce compilation times, will be on by default in future


### from https://huggingface.co/transformers/v3.2.0/_modules/transformers/generation_utils.html
def top_k_top_p_filtering(
    logits,
    top_k: int = 0,
    top_p: float = 1.0,
    filter_value: float = -float("Inf"),
    min_tokens_to_keep: int = 1,
):
    """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
    Args:
        logits: logits distribution shape (batch size, vocabulary size)
        if top_k > 0: keep only top k tokens with highest probability (top-k filtering).
        if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
            Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
        Make sure we keep at least min_tokens_to_keep per batch example in the output
    From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
    """
    if top_k > 0:
        top_k = min(max(top_k, min_tokens_to_keep), logits.size(-1))  # Safety check
        # Remove all tokens with a probability less than the last token of the top-k
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value

    if top_p < 1.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

        # Remove tokens with cumulative probability above the threshold (token with 0 are kept)
        sorted_indices_to_remove = cumulative_probs > top_p
        if min_tokens_to_keep > 1:
            # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below)
            sorted_indices_to_remove[..., :min_tokens_to_keep] = 0
        # Shift the indices to the right to keep also the first token above the threshold
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0

        # scatter sorted tensors to original indexing
        indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
        logits[indices_to_remove] = filter_value
    return logits


def sample(logits, temperature: float = 1.0, top_k: int = 0, top_p: float = 1.0, sample_logits=True):
    logits = logits[:, -1, :] / max(temperature, 1e-5)
    if top_k > 0 or top_p < 1.0:
        logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)
    probs = F.softmax(logits, dim=-1)
    if sample_logits:
        idx = torch.multinomial(probs, num_samples=1)
    else:
        _, idx = torch.topk(probs, k=1, dim=-1)
    return idx, probs


def logits_to_probs(logits, temperature: float = 1.0, top_p: float = 1.0, top_k: int = 0, **kwargs):
    # top_k previously defaulted to None, which would crash the `top_k > 0` comparison below; 0 disables top-k
    logits = logits / max(temperature, 1e-5)
    if top_k > 0 or top_p < 1.0:
        logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)
    probs = torch.nn.functional.softmax(logits, dim=-1)
    return probs


def prefill(model, cond_idx: torch.Tensor, input_pos: torch.Tensor, cfg_scale: float, **sampling_kwargs):
    if cfg_scale > 1.0:
        logits, _ = model(None, cond_idx, input_pos)
        logits_combined = logits
        cond_logits, uncond_logits = torch.split(logits_combined, len(logits_combined) // 2, dim=0)
        logits = uncond_logits + (cond_logits - uncond_logits) * cfg_scale
    else:
        logits, _ = model(None, cond_idx, input_pos)

    return sample(logits, **sampling_kwargs)[0]


def decode_one_token(model, x: torch.Tensor, input_pos: torch.Tensor, cfg_scale: float, cfg_flag: bool, **sampling_kwargs):
    assert input_pos.shape[-1] == 1
    if cfg_scale > 1.0:
        x_combined = torch.cat([x, x])
        logits, _ = model(x_combined, cond_idx=None, input_pos=input_pos)
        logits_combined = logits
        cond_logits, uncond_logits = torch.split(logits_combined, len(logits_combined) // 2, dim=0)
        if cfg_flag:
            logits = uncond_logits + (cond_logits - uncond_logits) * cfg_scale
        else:
            logits = cond_logits
    else:
        logits, _ = model(x, cond_idx=None, input_pos=input_pos)
    return sample(logits, **sampling_kwargs)


def decode_n_tokens(
    model, cur_token: torch.Tensor, input_pos: torch.Tensor, num_new_tokens: int,
    cfg_scale: float, cfg_interval: int,
    **sampling_kwargs):
    new_tokens, new_probs = [], []
    cfg_flag = True
    for i in range(num_new_tokens):
        with torch.backends.cuda.sdp_kernel(enable_flash=False, enable_mem_efficient=False, enable_math=True): # Actually better for Inductor to codegen attention here
            if cfg_interval > -1 and i > cfg_interval:
                cfg_flag = False
            next_token, next_prob = decode_one_token(
                model, cur_token, input_pos, cfg_scale, cfg_flag, **sampling_kwargs
            )
            input_pos += 1
            new_tokens.append(next_token.clone())
            new_probs.append(next_prob.clone())
            cur_token = next_token.view(-1, 1)

    return new_tokens, new_probs


@torch.no_grad()
def generate(model, cond, max_new_tokens, emb_masks=None, cfg_scale=1.0, cfg_interval=-1, **sampling_kwargs):
    if model.model_type == 'c2i':
        if cfg_scale > 1.0:
            cond_null = torch.ones_like(cond) * model.num_classes
            cond_combined = torch.cat([cond, cond_null])
        else:
            cond_combined = cond
        T = 1
    elif model.model_type == 't2i':
        if cfg_scale > 1.0:
            cond_null = torch.zeros_like(cond) + model.cls_embedding.uncond_embedding
            cond_combined = torch.cat([cond, cond_null])
        else:
            cond_combined = cond
        T = cond.shape[1]
    else:
        raise Exception("please check model type")

    T_new = T + max_new_tokens
    max_seq_length = T_new
    max_batch_size = cond.shape[0]

    device = cond.device
    with torch.device(device):
        max_batch_size_cfg = max_batch_size * 2 if cfg_scale > 1.0 else max_batch_size
        model.setup_caches(max_batch_size=max_batch_size_cfg, max_seq_length=max_seq_length, dtype=model.tok_embeddings.weight.dtype)

    if emb_masks is not None:
        assert emb_masks.shape[0] == max_batch_size
        assert emb_masks.shape[-1] == T
        if cfg_scale > 1.0:
            model.causal_mask[:, :, :T] = model.causal_mask[:, :, :T] * torch.cat([emb_masks, emb_masks]).unsqueeze(1)
        else:
            model.causal_mask[:, :, :T] = model.causal_mask[:, :, :T] * emb_masks.unsqueeze(1)

        eye_matrix = torch.eye(model.causal_mask.size(1), model.causal_mask.size(2), device=device)
        model.causal_mask[:] = model.causal_mask * (1 - eye_matrix) + eye_matrix

    # create an empty tensor of the expected final shape and fill in the current tokens
    seq = torch.empty((max_batch_size, T_new), dtype=torch.int, device=device)

    input_pos = torch.arange(0, T, device=device)
    next_token = prefill(model, cond_combined, input_pos, cfg_scale, **sampling_kwargs)
    seq[:, T:T+1] = next_token

    input_pos = torch.tensor([T], device=device, dtype=torch.int)
    generated_tokens, _ = decode_n_tokens(model, next_token, input_pos, max_new_tokens-1, cfg_scale, cfg_interval, **sampling_kwargs)
    seq[:, T+1:] = torch.cat(generated_tokens, dim=1)

    return seq[:, T:]
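For orientation, a minimal smoke-test sketch of this decoding pipeline with the class-conditional model from models/gpt.py below; with untrained weights the sampled ids are meaningless, so this only exercises prefill, the KV cache, and the CFG batching:

import torch
from models.gpt import GPT_models
from models.generate import generate

model = GPT_models['GPT-B'](block_size=256).eval()  # class-conditional, 16x16 token grid
class_labels = torch.tensor([207, 360])             # two ImageNet class ids
index_sample = generate(
    model, class_labels, max_new_tokens=256,
    cfg_scale=2.0, temperature=1.0, top_k=2000, top_p=1.0, sample_logits=True,
)
print(index_sample.shape)  # torch.Size([2, 256]): codebook indices for the VQ decoder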
models/gpt.py
ADDED
@@ -0,0 +1,465 @@
# Modified from:
# VQGAN: https://github.com/CompVis/taming-transformers/blob/master/taming/modules/transformer/mingpt.py
# DiT: https://github.com/facebookresearch/DiT/blob/main/models.py
# nanoGPT: https://github.com/karpathy/nanoGPT/blob/master/model.py
# llama: https://github.com/facebookresearch/llama/blob/main/llama/model.py
# gpt-fast: https://github.com/pytorch-labs/gpt-fast/blob/main/model.py
# PixArt: https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/nets/PixArt_blocks.py
from dataclasses import dataclass
from typing import Optional, List

import torch
import torch.nn as nn
from torch.nn import functional as F


def find_multiple(n: int, k: int):
    if n % k == 0:
        return n
    return n + k - (n % k)


@dataclass
class ModelArgs:
    dim: int = 4096
    n_layer: int = 32
    n_head: int = 32
    n_kv_head: Optional[int] = None
    multiple_of: int = 256  # make SwiGLU hidden layer size multiple of large power of 2
    ffn_dim_multiplier: Optional[float] = None
    rope_base: float = 10000
    norm_eps: float = 1e-5
    initializer_range: float = 0.02

    token_dropout_p: float = 0.1
    attn_dropout_p: float = 0.0
    resid_dropout_p: float = 0.1
    ffn_dropout_p: float = 0.1
    drop_path_rate: float = 0.0

    num_classes: int = 1000
    caption_dim: int = 2048
    class_dropout_prob: float = 0.1
    model_type: str = 'c2i'

    vocab_size: int = 16384
    cls_token_num: int = 1
    block_size: int = 256
    max_batch_size: int = 32
    max_seq_len: int = 2048


#################################################################################
#                      Embedding Layers for Class Labels                       #
#################################################################################
class LabelEmbedder(nn.Module):
    """
    Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance.
    """
    def __init__(self, num_classes, hidden_size, dropout_prob):
        super().__init__()
        use_cfg_embedding = dropout_prob > 0
        self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size)
        self.num_classes = num_classes
        self.dropout_prob = dropout_prob

    def token_drop(self, labels, force_drop_ids=None):
        """
        Drops labels to enable classifier-free guidance.
        """
        if force_drop_ids is None:
            drop_ids = torch.rand(labels.shape[0], device=labels.device) < self.dropout_prob
        else:
            drop_ids = force_drop_ids == 1
        labels = torch.where(drop_ids, self.num_classes, labels)
        return labels

    def forward(self, labels, train, force_drop_ids=None):
        use_dropout = self.dropout_prob > 0
        if (train and use_dropout) or (force_drop_ids is not None):
            labels = self.token_drop(labels, force_drop_ids)
        embeddings = self.embedding_table(labels).unsqueeze(1)
        return embeddings


#################################################################################
#                      Embedding Layers for Text Feature                        #
#################################################################################
class CaptionEmbedder(nn.Module):
    """
    Embeds text caption into vector representations. Also handles label dropout for classifier-free guidance.
    """
    def __init__(self, in_channels, hidden_size, uncond_prob, token_num=120):
        super().__init__()
        self.cap_proj = MLP(in_features=in_channels, hidden_features=hidden_size, out_features=hidden_size)
        self.register_buffer("uncond_embedding", nn.Parameter(torch.randn(token_num, in_channels) / in_channels ** 0.5))
        self.uncond_prob = uncond_prob

    def token_drop(self, caption, force_drop_ids=None):
        """
        Drops labels to enable classifier-free guidance.
        """
        if force_drop_ids is None:
            drop_ids = torch.rand(caption.shape[0], device=caption.device) < self.uncond_prob
        else:
            drop_ids = force_drop_ids == 1
        caption = torch.where(drop_ids[:, None, None], self.uncond_embedding, caption)
        return caption

    def forward(self, caption, train, force_drop_ids=None):
        use_dropout = self.uncond_prob > 0
        if (train and use_dropout) or (force_drop_ids is not None):
            caption = self.token_drop(caption, force_drop_ids)
        embeddings = self.cap_proj(caption)
        return embeddings


class MLP(nn.Module):
    def __init__(self, in_features, hidden_features, out_features):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features, bias=False)
        self.act = nn.GELU(approximate='tanh')
        self.fc2 = nn.Linear(hidden_features, out_features, bias=False)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.fc2(x)
        return x


#################################################################################
#                                  GPT Model                                    #
#################################################################################
class RMSNorm(torch.nn.Module):
    def __init__(self, dim: int, eps: float = 1e-5):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        return x * torch.rsqrt(torch.mean(x * x, dim=-1, keepdim=True) + self.eps)

    def forward(self, x):
        output = self._norm(x.float()).type_as(x)
        return output * self.weight


class FeedForward(nn.Module):
    def __init__(self, config: ModelArgs):
        super().__init__()
        hidden_dim = 4 * config.dim
        hidden_dim = int(2 * hidden_dim / 3)
        # custom dim factor multiplier
        if config.ffn_dim_multiplier is not None:
            hidden_dim = int(config.ffn_dim_multiplier * hidden_dim)
        hidden_dim = find_multiple(hidden_dim, config.multiple_of)

        self.w1 = nn.Linear(config.dim, hidden_dim, bias=False)
        self.w3 = nn.Linear(config.dim, hidden_dim, bias=False)
        self.w2 = nn.Linear(hidden_dim, config.dim, bias=False)
        self.ffn_dropout = nn.Dropout(config.ffn_dropout_p)

    def forward(self, x):
        return self.ffn_dropout(self.w2(F.silu(self.w1(x)) * self.w3(x)))


class KVCache(nn.Module):
    def __init__(self, max_batch_size, max_seq_length, n_head, head_dim, dtype):
        super().__init__()
        cache_shape = (max_batch_size, n_head, max_seq_length, head_dim)
        self.register_buffer('k_cache', torch.zeros(cache_shape, dtype=dtype))
        self.register_buffer('v_cache', torch.zeros(cache_shape, dtype=dtype))

    def update(self, input_pos, k_val, v_val):
        # input_pos: [S], k_val: [B, H, S, D]
        assert input_pos.shape[0] == k_val.shape[2]
        k_out = self.k_cache
        v_out = self.v_cache
        k_out[:, :, input_pos] = k_val
        v_out[:, :, input_pos] = v_val

        return k_out, v_out


class Attention(nn.Module):
    def __init__(self, config: ModelArgs):
        super().__init__()
        assert config.dim % config.n_head == 0
        self.dim = config.dim
        self.head_dim = config.dim // config.n_head
        self.n_head = config.n_head
        self.n_kv_head = config.n_kv_head if config.n_kv_head is not None else config.n_head
        total_kv_dim = (self.n_head + 2 * self.n_kv_head) * self.head_dim

        # key, query, value projections for all heads, but in a batch
        self.wqkv = nn.Linear(config.dim, total_kv_dim, bias=False)
        self.wo = nn.Linear(config.dim, config.dim, bias=False)
        self.kv_cache = None

        # regularization
        self.attn_dropout_p = config.attn_dropout_p
        self.resid_dropout = nn.Dropout(config.resid_dropout_p)

    def forward(
        self, x: torch.Tensor, freqs_cis: torch.Tensor = None,
        input_pos: Optional[torch.Tensor] = None,
        mask: Optional[torch.Tensor] = None
    ):
        bsz, seqlen, _ = x.shape
        kv_size = self.n_kv_head * self.head_dim
        xq, xk, xv = self.wqkv(x).split([self.dim, kv_size, kv_size], dim=-1)

        xq = xq.view(bsz, seqlen, self.n_head, self.head_dim)
        xk = xk.view(bsz, seqlen, self.n_kv_head, self.head_dim)
        xv = xv.view(bsz, seqlen, self.n_kv_head, self.head_dim)

        xq = apply_rotary_emb(xq, freqs_cis)
        xk = apply_rotary_emb(xk, freqs_cis)

        xq, xk, xv = map(lambda x: x.transpose(1, 2), (xq, xk, xv))

        if self.kv_cache is not None:
            keys, values = self.kv_cache.update(input_pos, xk, xv)
        else:
            keys, values = xk, xv
        keys = keys.repeat_interleave(self.n_head // self.n_kv_head, dim=1)
        values = values.repeat_interleave(self.n_head // self.n_kv_head, dim=1)

        output = F.scaled_dot_product_attention(
            xq, keys, values,
            attn_mask=mask,
            is_causal=True if mask is None else False, # is_causal=False is for KV cache
            dropout_p=self.attn_dropout_p if self.training else 0)

        output = output.transpose(1, 2).contiguous().view(bsz, seqlen, self.dim)

        output = self.resid_dropout(self.wo(output))
        return output


class TransformerBlock(nn.Module):
    def __init__(self, config: ModelArgs, drop_path: float):
        super().__init__()
        self.attention = Attention(config)
        self.feed_forward = FeedForward(config)
        self.attention_norm = RMSNorm(config.dim, eps=config.norm_eps)
        self.ffn_norm = RMSNorm(config.dim, eps=config.norm_eps)

    def forward(
        self, x: torch.Tensor, freqs_cis: torch.Tensor, start_pos: int, mask: Optional[torch.Tensor] = None):
        h = x + self.attention(self.attention_norm(x), freqs_cis, start_pos, mask)
        out = h + self.feed_forward(self.ffn_norm(h))
        return out


class Transformer(nn.Module):
    def __init__(self, config: ModelArgs):
        super().__init__()
        self.config = config
        self.vocab_size = config.vocab_size
        self.n_layer = config.n_layer
        self.block_size = config.block_size
        self.num_classes = config.num_classes
        self.model_type = config.model_type
        self.cls_token_num = config.cls_token_num
        if self.model_type == 'c2i':
            self.cls_embedding = LabelEmbedder(config.num_classes, config.dim, config.class_dropout_prob)
        elif self.model_type == 't2i':
            self.cls_embedding = CaptionEmbedder(config.caption_dim, config.dim, config.class_dropout_prob)
        else:
            raise Exception("please check model type")
        self.tok_embeddings = nn.Embedding(config.vocab_size, config.dim)
        self.tok_dropout = nn.Dropout(config.token_dropout_p)

        # transformer blocks
        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.n_layer)]
        self.layers = torch.nn.ModuleList()
        for layer_id in range(config.n_layer):
            self.layers.append(TransformerBlock(config, dpr[layer_id]))

        # output layer
        self.norm = RMSNorm(config.dim, eps=config.norm_eps)
        self.output = nn.Linear(config.dim, config.vocab_size, bias=False)

        # 2d rotary pos embedding
        grid_size = int(self.block_size ** 0.5)
        assert grid_size * grid_size == self.block_size
        self.freqs_cis = precompute_freqs_cis_2d(grid_size, self.config.dim // self.config.n_head, self.config.rope_base, self.cls_token_num)

        # KVCache
        self.max_batch_size = -1
        self.max_seq_length = -1

        self.initialize_weights()

    def initialize_weights(self):
        # Initialize nn.Linear and nn.Embedding
        self.apply(self._init_weights)

        # Zero-out output layers:
        nn.init.constant_(self.output.weight, 0)

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)

    def setup_caches(self, max_batch_size, max_seq_length, dtype):
        # if self.max_seq_length >= max_seq_length and self.max_batch_size >= max_batch_size:
        #     return
        head_dim = self.config.dim // self.config.n_head
        max_seq_length = find_multiple(max_seq_length, 8)
        self.max_seq_length = max_seq_length
        self.max_batch_size = max_batch_size
        for b in self.layers:
            b.attention.kv_cache = KVCache(max_batch_size, max_seq_length, self.config.n_head, head_dim, dtype)

        causal_mask = torch.tril(torch.ones(self.max_seq_length, self.max_seq_length, dtype=torch.bool))
        self.causal_mask = causal_mask.unsqueeze(0).repeat(self.max_batch_size, 1, 1)
        grid_size = int(self.config.block_size ** 0.5)
        assert grid_size * grid_size == self.block_size
        self.freqs_cis = precompute_freqs_cis_2d(grid_size, self.config.dim // self.config.n_head, self.config.rope_base, self.cls_token_num)

    def forward(
        self,
        idx: torch.Tensor,
        cond_idx: torch.Tensor,  # cond_idx_or_embed
        input_pos: Optional[torch.Tensor] = None,
        targets: Optional[torch.Tensor] = None,
        mask: Optional[torch.Tensor] = None,
        valid: Optional[torch.Tensor] = None,
    ):
        if idx is not None and cond_idx is not None: # training or naive inference
            cond_embeddings = self.cls_embedding(cond_idx, train=self.training)[:,:self.cls_token_num]
            token_embeddings = self.tok_embeddings(idx)
            token_embeddings = torch.cat((cond_embeddings, token_embeddings), dim=1)
            h = self.tok_dropout(token_embeddings)
            self.freqs_cis = self.freqs_cis.to(h.device)
        else:
            if cond_idx is not None: # prefill in inference
                token_embeddings = self.cls_embedding(cond_idx, train=self.training)[:,:self.cls_token_num]
            else: # decode_n_tokens(kv cache) in inference
                token_embeddings = self.tok_embeddings(idx)

            bs = token_embeddings.shape[0]
            mask = self.causal_mask[:bs, None, input_pos]
            h = self.tok_dropout(token_embeddings)
            self.freqs_cis = self.freqs_cis

        if self.training:
            freqs_cis = self.freqs_cis[:token_embeddings.shape[1]]
        else:
            freqs_cis = self.freqs_cis[input_pos]
        # transformer blocks
        for layer in self.layers:
            h = layer(h, freqs_cis, input_pos, mask)

        # output layers
        h = self.norm(h)
        logits = self.output(h).float()

        if self.training:
            logits = logits[:, self.cls_token_num - 1:].contiguous()

        # if we are given some desired targets also calculate the loss
        loss = None
        if valid is not None:
            loss_all = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), reduction='none')
            valid_all = valid[:,None].repeat(1, targets.shape[1]).view(-1)
            loss = (loss_all * valid_all).sum() / max(valid_all.sum(), 1)
        elif targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))

        return logits, loss

    def get_fsdp_wrap_module_list(self) -> List[nn.Module]:
        return list(self.layers)


#################################################################################
#                      Rotary Positional Embedding Functions                   #
#################################################################################
# https://github.com/pytorch-labs/gpt-fast/blob/main/model.py
def precompute_freqs_cis(seq_len: int, n_elem: int, base: int = 10000, cls_token_num=120):
    freqs = 1.0 / (base ** (torch.arange(0, n_elem, 2)[: (n_elem // 2)].float() / n_elem))
    t = torch.arange(seq_len, device=freqs.device)
    freqs = torch.outer(t, freqs) # (seq_len, head_dim // 2)
    freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
    cache = torch.stack([freqs_cis.real, freqs_cis.imag], dim=-1) # (cls_token_num+seq_len, head_dim // 2, 2)
    cond_cache = torch.cat([torch.zeros(cls_token_num, n_elem // 2, 2), cache]) # (cls_token_num+seq_len, head_dim // 2, 2)
    return cond_cache


def precompute_freqs_cis_2d(grid_size: int, n_elem: int, base: int = 10000, cls_token_num=120):
    # split the dimension into half, one for x and one for y
    half_dim = n_elem // 2
    freqs = 1.0 / (base ** (torch.arange(0, half_dim, 2)[: (half_dim // 2)].float() / half_dim))
    t = torch.arange(grid_size, device=freqs.device)
    freqs = torch.outer(t, freqs) # (grid_size, head_dim // 2)
    freqs_grid = torch.concat([
        freqs[:, None, :].expand(-1, grid_size, -1),
        freqs[None, :, :].expand(grid_size, -1, -1),
    ], dim=-1) # (grid_size, grid_size, head_dim // 2)
    cache_grid = torch.stack([torch.cos(freqs_grid), torch.sin(freqs_grid)], dim=-1) # (grid_size, grid_size, head_dim // 2, 2)
    cache = cache_grid.flatten(0, 1)
    cond_cache = torch.cat([torch.zeros(cls_token_num, n_elem // 2, 2), cache]) # (cls_token_num+grid_size**2, head_dim // 2, 2)
    return cond_cache


def apply_rotary_emb(x: torch.Tensor, freqs_cis: torch.Tensor):
    # x: (bs, seq_len, n_head, head_dim)
    # freqs_cis (seq_len, head_dim // 2, 2)
    xshaped = x.float().reshape(*x.shape[:-1], -1, 2) # (bs, seq_len, n_head, head_dim//2, 2)
    freqs_cis = freqs_cis.view(1, xshaped.size(1), 1, xshaped.size(3), 2) # (1, seq_len, 1, head_dim//2, 2)
    x_out2 = torch.stack([
        xshaped[..., 0] * freqs_cis[..., 0] - xshaped[..., 1] * freqs_cis[..., 1],
        xshaped[..., 1] * freqs_cis[..., 0] + xshaped[..., 0] * freqs_cis[..., 1],
    ], dim=-1)
    x_out2 = x_out2.flatten(3)
    return x_out2.type_as(x)


#################################################################################
#                                 GPT Configs                                   #
#################################################################################
### text-conditional
def GPT_7B(**kwargs):
    return Transformer(ModelArgs(n_layer=32, n_head=32, dim=4096, **kwargs)) # 6.6B

def GPT_3B(**kwargs):
    return Transformer(ModelArgs(n_layer=24, n_head=32, dim=3200, **kwargs)) # 3.1B

def GPT_1B(**kwargs):
    return Transformer(ModelArgs(n_layer=22, n_head=32, dim=2048, **kwargs)) # 1.2B

### class-conditional
def GPT_XXXL(**kwargs):
    return Transformer(ModelArgs(n_layer=48, n_head=40, dim=2560, **kwargs)) # 3.9B

def GPT_XXL(**kwargs):
    return Transformer(ModelArgs(n_layer=48, n_head=24, dim=1536, **kwargs)) # 1.4B

def GPT_XL(**kwargs):
    return Transformer(ModelArgs(n_layer=36, n_head=20, dim=1280, **kwargs)) # 775M

def GPT_L(**kwargs):
    return Transformer(ModelArgs(n_layer=24, n_head=16, dim=1024, **kwargs)) # 343M

def GPT_B(**kwargs):
    return Transformer(ModelArgs(n_layer=12, n_head=12, dim=768, **kwargs)) # 111M


GPT_models = {
    'GPT-B': GPT_B, 'GPT-L': GPT_L, 'GPT-XL': GPT_XL, 'GPT-XXL': GPT_XXL, 'GPT-XXXL': GPT_XXXL,
    'GPT-1B': GPT_1B, 'GPT-3B': GPT_3B, 'GPT-7B': GPT_7B,
}
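The 2D rotary cache shape and the parameter counts noted in the config comments follow directly from the definitions above; a minimal sketch to check both, assuming the file is importable as models.gpt:

import torch
from models.gpt import GPT_models, precompute_freqs_cis_2d

model = GPT_models['GPT-B']()
print(sum(p.numel() for p in model.parameters()) / 1e6)  # ~111 (M parameters, matching the GPT-B comment)

# GPT-B: dim=768, n_head=12 -> head_dim=64; block_size=256 -> 16x16 grid
freqs = precompute_freqs_cis_2d(grid_size=16, n_elem=64, cls_token_num=1)
print(freqs.shape)  # torch.Size([257, 32, 2]): 1 zeroed cls-token row + 256 grid positions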
requirements.txt
ADDED
@@ -0,0 +1 @@
torch
tokenizer_image/discriminator.py
ADDED
@@ -0,0 +1,255 @@
# Modified from:
# taming-transformers: https://github.com/CompVis/taming-transformers
# stylegan2-pytorch: https://github.com/rosinality/stylegan2-pytorch/blob/master/model.py
# maskgit: https://github.com/google-research/maskgit/blob/main/maskgit/nets/discriminator.py
import functools
import math
import torch
import torch.nn as nn
try:
    from kornia.filters import filter2d
except ImportError:  # kornia is optional; only the StyleGAN Blur layer below needs it
    pass

#################################################################################
#                                  PatchGAN                                     #
#################################################################################
class PatchGANDiscriminator(nn.Module):
    """Defines a PatchGAN discriminator as in Pix2Pix
        --> see https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/blob/master/models/networks.py
    """
    def __init__(self, input_nc=3, ndf=64, n_layers=3, use_actnorm=False):
        """Construct a PatchGAN discriminator
        Parameters:
            input_nc (int)  -- the number of channels in input images
            ndf (int)       -- the number of filters in the last conv layer
            n_layers (int)  -- the number of conv layers in the discriminator
            norm_layer      -- normalization layer
        """
        super(PatchGANDiscriminator, self).__init__()
        if not use_actnorm:
            norm_layer = nn.BatchNorm2d
        else:
            norm_layer = ActNorm
        if type(norm_layer) == functools.partial: # no need to use bias as BatchNorm2d has affine parameters
            use_bias = norm_layer.func != nn.BatchNorm2d
        else:
            use_bias = norm_layer != nn.BatchNorm2d

        kw = 4
        padw = 1
        sequence = [nn.Conv2d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw), nn.LeakyReLU(0.2, True)]
        nf_mult = 1
        nf_mult_prev = 1
        for n in range(1, n_layers): # gradually increase the number of filters
            nf_mult_prev = nf_mult
            nf_mult = min(2 ** n, 8)
            sequence += [
                nn.Conv2d(ndf * nf_mult_prev, ndf * nf_mult, kernel_size=kw, stride=2, padding=padw, bias=use_bias),
                norm_layer(ndf * nf_mult),
                nn.LeakyReLU(0.2, True)
            ]

        nf_mult_prev = nf_mult
        nf_mult = min(2 ** n_layers, 8)
        sequence += [
            nn.Conv2d(ndf * nf_mult_prev, ndf * nf_mult, kernel_size=kw, stride=1, padding=padw, bias=use_bias),
            norm_layer(ndf * nf_mult),
            nn.LeakyReLU(0.2, True)
        ]

        sequence += [
            nn.Conv2d(ndf * nf_mult, 1, kernel_size=kw, stride=1, padding=padw)] # output 1 channel prediction map
        self.main = nn.Sequential(*sequence)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Conv2d):
            nn.init.normal_(module.weight.data, 0.0, 0.02)
        elif isinstance(module, nn.BatchNorm2d):
            nn.init.normal_(module.weight.data, 1.0, 0.02)
            nn.init.constant_(module.bias.data, 0)

    def forward(self, input):
        """Standard forward."""
        return self.main(input)


class ActNorm(nn.Module):
    def __init__(self, num_features, logdet=False, affine=True,
                 allow_reverse_init=False):
        assert affine
        super().__init__()
        self.logdet = logdet
        self.loc = nn.Parameter(torch.zeros(1, num_features, 1, 1))
        self.scale = nn.Parameter(torch.ones(1, num_features, 1, 1))
        self.allow_reverse_init = allow_reverse_init

        self.register_buffer('initialized', torch.tensor(0, dtype=torch.uint8))

    def initialize(self, input):
        with torch.no_grad():
            flatten = input.permute(1, 0, 2, 3).contiguous().view(input.shape[1], -1)
            mean = (
                flatten.mean(1)
                .unsqueeze(1)
                .unsqueeze(2)
                .unsqueeze(3)
                .permute(1, 0, 2, 3)
            )
            std = (
                flatten.std(1)
                .unsqueeze(1)
                .unsqueeze(2)
                .unsqueeze(3)
                .permute(1, 0, 2, 3)
            )

            self.loc.data.copy_(-mean)
            self.scale.data.copy_(1 / (std + 1e-6))

    def forward(self, input, reverse=False):
        if reverse:
            return self.reverse(input)
        if len(input.shape) == 2:
            input = input[:,:,None,None]
            squeeze = True
        else:
            squeeze = False

        _, _, height, width = input.shape

        if self.training and self.initialized.item() == 0:
            self.initialize(input)
            self.initialized.fill_(1)

        h = self.scale * (input + self.loc)

        if squeeze:
            h = h.squeeze(-1).squeeze(-1)

        if self.logdet:
            log_abs = torch.log(torch.abs(self.scale))
            logdet = height*width*torch.sum(log_abs)
            logdet = logdet * torch.ones(input.shape[0]).to(input)
            return h, logdet

        return h

    def reverse(self, output):
        if self.training and self.initialized.item() == 0:
            if not self.allow_reverse_init:
                raise RuntimeError(
                    "Initializing ActNorm in reverse direction is "
                    "disabled by default. Use allow_reverse_init=True to enable."
                )
            else:
                self.initialize(output)
                self.initialized.fill_(1)

        if len(output.shape) == 2:
            output = output[:,:,None,None]
            squeeze = True
        else:
            squeeze = False

        h = output / self.scale - self.loc

        if squeeze:
            h = h.squeeze(-1).squeeze(-1)
        return h


#################################################################################
#                                  StyleGAN                                     #
#################################################################################
class StyleGANDiscriminator(nn.Module):
    def __init__(self, input_nc=3, ndf=64, n_layers=3, channel_multiplier=1, image_size=256):
        super().__init__()
        channels = {
            4: 512,
            8: 512,
            16: 512,
            32: 512,
            64: 256 * channel_multiplier,
            128: 128 * channel_multiplier,
            256: 64 * channel_multiplier,
            512: 32 * channel_multiplier,
            1024: 16 * channel_multiplier,
        }

        log_size = int(math.log(image_size, 2))
        in_channel = channels[image_size]

        blocks = [nn.Conv2d(input_nc, in_channel, 3, padding=1), leaky_relu()]
        for i in range(log_size, 2, -1):
            out_channel = channels[2 ** (i - 1)]
            blocks.append(DiscriminatorBlock(in_channel, out_channel))
            in_channel = out_channel
        self.blocks = nn.ModuleList(blocks)

        self.final_conv = nn.Sequential(
            nn.Conv2d(in_channel, channels[4], 3, padding=1),
            leaky_relu(),
        )
        self.final_linear = nn.Sequential(
            nn.Linear(channels[4] * 4 * 4, channels[4]),
            leaky_relu(),
            nn.Linear(channels[4], 1)
        )

    def forward(self, x):
        for block in self.blocks:
            x = block(x)
        x = self.final_conv(x)
        x = x.view(x.shape[0], -1)
        x = self.final_linear(x)
        return x


class DiscriminatorBlock(nn.Module):
    def __init__(self, input_channels, filters, downsample=True):
        super().__init__()
        self.conv_res = nn.Conv2d(input_channels, filters, 1, stride = (2 if downsample else 1))

        self.net = nn.Sequential(
            nn.Conv2d(input_channels, filters, 3, padding=1),
            leaky_relu(),
            nn.Conv2d(filters, filters, 3, padding=1),
            leaky_relu()
        )

        self.downsample = nn.Sequential(
            Blur(),
            nn.Conv2d(filters, filters, 3, padding = 1, stride = 2)
        ) if downsample else None

    def forward(self, x):
        res = self.conv_res(x)
        x = self.net(x)
        if exists(self.downsample):
            x = self.downsample(x)
        x = (x + res) * (1 / math.sqrt(2))
        return x


class Blur(nn.Module):
    def __init__(self):
        super().__init__()
        f = torch.Tensor([1, 2, 1])
        self.register_buffer('f', f)

    def forward(self, x):
        f = self.f
        f = f[None, None, :] * f[None, :, None]
        return filter2d(x, f, normalized=True)


def leaky_relu(p=0.2):
    return nn.LeakyReLU(p, inplace=True)


def exists(val):
    return val is not None
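As a quick sanity check on the two discriminator heads above, a minimal sketch; the output shapes follow from the stride-2 stacks, and the StyleGAN path needs kornia installed for Blur:

import torch
from tokenizer_image.discriminator import PatchGANDiscriminator, StyleGANDiscriminator

x = torch.randn(2, 3, 256, 256)
patch_d = PatchGANDiscriminator(input_nc=3, ndf=64, n_layers=3)
print(patch_d(x).shape)   # torch.Size([2, 1, 30, 30]): per-patch real/fake logits

style_d = StyleGANDiscriminator(image_size=256)  # requires kornia for the Blur filter
print(style_d(x).shape)   # torch.Size([2, 1]): one logit per image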
tokenizer_image/discriminator_patchgan.py
ADDED
@@ -0,0 +1,152 @@
# Modified from:
# taming-transformers: https://github.com/CompVis/taming-transformers
import functools
import torch
import torch.nn as nn


class NLayerDiscriminator(nn.Module):
    """Defines a PatchGAN discriminator as in Pix2Pix
        --> see https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/blob/master/models/networks.py
    """
    def __init__(self, input_nc=3, ndf=64, n_layers=3, use_actnorm=False):
        """Construct a PatchGAN discriminator
        Parameters:
            input_nc (int)  -- the number of channels in input images
            ndf (int)       -- the number of filters in the last conv layer
            n_layers (int)  -- the number of conv layers in the discriminator
            norm_layer      -- normalization layer
        """
        super(NLayerDiscriminator, self).__init__()
        if not use_actnorm:
            norm_layer = nn.BatchNorm2d
        else:
            norm_layer = ActNorm
        if type(norm_layer) == functools.partial: # no need to use bias as BatchNorm2d has affine parameters
            use_bias = norm_layer.func != nn.BatchNorm2d
        else:
            use_bias = norm_layer != nn.BatchNorm2d

        kw = 4
        padw = 1
        sequence = [nn.Conv2d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw), nn.LeakyReLU(0.2, True)]
        nf_mult = 1
        nf_mult_prev = 1
        for n in range(1, n_layers): # gradually increase the number of filters
            nf_mult_prev = nf_mult
            nf_mult = min(2 ** n, 8)
            sequence += [
                nn.Conv2d(ndf * nf_mult_prev, ndf * nf_mult, kernel_size=kw, stride=2, padding=padw, bias=use_bias),
                norm_layer(ndf * nf_mult),
                nn.LeakyReLU(0.2, True)
            ]

        nf_mult_prev = nf_mult
        nf_mult = min(2 ** n_layers, 8)
        sequence += [
            nn.Conv2d(ndf * nf_mult_prev, ndf * nf_mult, kernel_size=kw, stride=1, padding=padw, bias=use_bias),
            norm_layer(ndf * nf_mult),
            nn.LeakyReLU(0.2, True)
        ]

        sequence += [
            nn.Conv2d(ndf * nf_mult, 1, kernel_size=kw, stride=1, padding=padw)] # output 1 channel prediction map
        self.main = nn.Sequential(*sequence)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Conv2d):
            nn.init.normal_(module.weight.data, 0.0, 0.02)
        elif isinstance(module, nn.BatchNorm2d):
            nn.init.normal_(module.weight.data, 1.0, 0.02)
            nn.init.constant_(module.bias.data, 0)

    def forward(self, input):
        """Standard forward."""
        return self.main(input)


class ActNorm(nn.Module):
    def __init__(self, num_features, logdet=False, affine=True,
                 allow_reverse_init=False):
        assert affine
        super().__init__()
        self.logdet = logdet
        self.loc = nn.Parameter(torch.zeros(1, num_features, 1, 1))
        self.scale = nn.Parameter(torch.ones(1, num_features, 1, 1))
        self.allow_reverse_init = allow_reverse_init

        self.register_buffer('initialized', torch.tensor(0, dtype=torch.uint8))

    def initialize(self, input):
        with torch.no_grad():
            flatten = input.permute(1, 0, 2, 3).contiguous().view(input.shape[1], -1)
            mean = (
                flatten.mean(1)
                .unsqueeze(1)
                .unsqueeze(2)
                .unsqueeze(3)
                .permute(1, 0, 2, 3)
            )
            std = (
                flatten.std(1)
                .unsqueeze(1)
                .unsqueeze(2)
                .unsqueeze(3)
                .permute(1, 0, 2, 3)
            )

            self.loc.data.copy_(-mean)
            self.scale.data.copy_(1 / (std + 1e-6))

    def forward(self, input, reverse=False):
        if reverse:
            return self.reverse(input)
        if len(input.shape) == 2:
            input = input[:,:,None,None]
            squeeze = True
        else:
            squeeze = False

        _, _, height, width = input.shape

        if self.training and self.initialized.item() == 0:
            self.initialize(input)
            self.initialized.fill_(1)

        h = self.scale * (input + self.loc)

        if squeeze:
            h = h.squeeze(-1).squeeze(-1)

        if self.logdet:
            log_abs = torch.log(torch.abs(self.scale))
            logdet = height*width*torch.sum(log_abs)
            logdet = logdet * torch.ones(input.shape[0]).to(input)
            return h, logdet

        return h

    def reverse(self, output):
        if self.training and self.initialized.item() == 0:
            if not self.allow_reverse_init:
                raise RuntimeError(
                    "Initializing ActNorm in reverse direction is "
                    "disabled by default. Use allow_reverse_init=True to enable."
                )
            else:
                self.initialize(output)
                self.initialized.fill_(1)

        if len(output.shape) == 2:
            output = output[:,:,None,None]
            squeeze = True
        else:
            squeeze = False

        h = output / self.scale - self.loc

        if squeeze:
            h = h.squeeze(-1).squeeze(-1)
        return h
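ActNorm initializes its offset and scale from the first training batch so activations start per-channel normalized, and reverse() inverts the affine map exactly; a minimal sketch to check both properties in isolation:

import torch
from tokenizer_image.discriminator_patchgan import ActNorm

norm = ActNorm(num_features=8).train()
x = torch.randn(4, 8, 16, 16) * 3.0 + 1.0
y = norm(x)  # first call sets loc/scale from the batch statistics
print(round(y.mean().item(), 3), round(y.std().item(), 3))   # ~0.0, ~1.0
print(torch.allclose(norm(y, reverse=True), x, atol=1e-4))   # True: reverse undoes forward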
tokenizer_image/discriminator_stylegan.py
ADDED
@@ -0,0 +1,101 @@
# Modified from:
# stylegan2-pytorch: https://github.com/lucidrains/stylegan2-pytorch/blob/master/stylegan2_pytorch/stylegan2_pytorch.py
# stylegan2-pytorch: https://github.com/rosinality/stylegan2-pytorch/blob/master/model.py
# maskgit: https://github.com/google-research/maskgit/blob/main/maskgit/nets/discriminator.py
import math
import torch
import torch.nn as nn
try:
    from kornia.filters import filter2d
except ImportError:  # kornia is only needed for the Blur block
    pass

class Discriminator(nn.Module):
    def __init__(self, input_nc=3, ndf=64, n_layers=3, channel_multiplier=1, image_size=256):
        super().__init__()
        channels = {
            4: 512,
            8: 512,
            16: 512,
            32: 512,
            64: 256 * channel_multiplier,
            128: 128 * channel_multiplier,
            256: 64 * channel_multiplier,
            512: 32 * channel_multiplier,
            1024: 16 * channel_multiplier,
        }

        log_size = int(math.log(image_size, 2))
        in_channel = channels[image_size]

        blocks = [nn.Conv2d(input_nc, in_channel, 3, padding=1), leaky_relu()]
        for i in range(log_size, 2, -1):
            out_channel = channels[2 ** (i - 1)]
            blocks.append(DiscriminatorBlock(in_channel, out_channel))
            in_channel = out_channel
        self.blocks = nn.ModuleList(blocks)

        self.final_conv = nn.Sequential(
            nn.Conv2d(in_channel, channels[4], 3, padding=1),
            leaky_relu(),
        )
        self.final_linear = nn.Sequential(
            nn.Linear(channels[4] * 4 * 4, channels[4]),
            leaky_relu(),
            nn.Linear(channels[4], 1)
        )

    def forward(self, x):
        for block in self.blocks:
            x = block(x)
        x = self.final_conv(x)
        x = x.view(x.shape[0], -1)
        x = self.final_linear(x)
        return x


class DiscriminatorBlock(nn.Module):
    def __init__(self, input_channels, filters, downsample=True):
        super().__init__()
        self.conv_res = nn.Conv2d(input_channels, filters, 1, stride=(2 if downsample else 1))

        self.net = nn.Sequential(
            nn.Conv2d(input_channels, filters, 3, padding=1),
            leaky_relu(),
            nn.Conv2d(filters, filters, 3, padding=1),
            leaky_relu()
        )

        self.downsample = nn.Sequential(
            Blur(),
            nn.Conv2d(filters, filters, 3, padding=1, stride=2)
        ) if downsample else None

    def forward(self, x):
        res = self.conv_res(x)
        x = self.net(x)
        if exists(self.downsample):
            x = self.downsample(x)
        x = (x + res) * (1 / math.sqrt(2))
        return x


class Blur(nn.Module):
    def __init__(self):
        super().__init__()
        f = torch.Tensor([1, 2, 1])
        self.register_buffer('f', f)

    def forward(self, x):
        f = self.f
        f = f[None, None, :] * f[None, :, None]
        return filter2d(x, f, normalized=True)


def leaky_relu(p=0.2):
    return nn.LeakyReLU(p, inplace=True)


def exists(val):
    return val is not None
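A quick sanity check of the architecture above: each DiscriminatorBlock halves the spatial resolution, so a 256-pixel input passes through six blocks down to the 4x4 head and yields one realness logit per image. A sketch (kornia must be installed for Blur's filter2d; the import path follows the repo's own convention and is an assumption):

import torch
from tokenizer.tokenizer_image.discriminator_stylegan import Discriminator  # path assumed

disc = Discriminator(input_nc=3, image_size=256)
x = torch.randn(2, 3, 256, 256)   # a batch of two random RGB "images"
logits = disc(x)
print(logits.shape)               # torch.Size([2, 1]): one logit per image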
tokenizer_image/lpips.py
ADDED
@@ -0,0 +1,164 @@
"""Stripped version of https://github.com/richzhang/PerceptualSimilarity/tree/master/models"""

import os, hashlib
import requests
from tqdm import tqdm

import torch
import torch.nn as nn
from torchvision import models
from collections import namedtuple

URL_MAP = {
    "vgg_lpips": "https://heibox.uni-heidelberg.de/f/607503859c864bc1b30b/?dl=1"
}

CKPT_MAP = {
    "vgg_lpips": "vgg.pth"
}

MD5_MAP = {
    "vgg_lpips": "d507d7349b931f0638a25a48a722f98a"
}

def download(url, local_path, chunk_size=1024):
    os.makedirs(os.path.split(local_path)[0], exist_ok=True)
    with requests.get(url, stream=True) as r:
        total_size = int(r.headers.get("content-length", 0))
        with tqdm(total=total_size, unit="B", unit_scale=True) as pbar:
            with open(local_path, "wb") as f:
                for data in r.iter_content(chunk_size=chunk_size):
                    if data:
                        f.write(data)
                        pbar.update(chunk_size)


def md5_hash(path):
    with open(path, "rb") as f:
        content = f.read()
    return hashlib.md5(content).hexdigest()


def get_ckpt_path(name, root, check=False):
    assert name in URL_MAP
    path = os.path.join(root, CKPT_MAP[name])
    if not os.path.exists(path) or (check and not md5_hash(path) == MD5_MAP[name]):
        print("Downloading {} model from {} to {}".format(name, URL_MAP[name], path))
        download(URL_MAP[name], path)
        md5 = md5_hash(path)
        assert md5 == MD5_MAP[name], md5
    return path


class LPIPS(nn.Module):
    # Learned perceptual metric
    def __init__(self, use_dropout=True):
        super().__init__()
        self.scaling_layer = ScalingLayer()
        self.chns = [64, 128, 256, 512, 512]  # vgg16 features
        self.net = vgg16(pretrained=True, requires_grad=False)
        self.lin0 = NetLinLayer(self.chns[0], use_dropout=use_dropout)
        self.lin1 = NetLinLayer(self.chns[1], use_dropout=use_dropout)
        self.lin2 = NetLinLayer(self.chns[2], use_dropout=use_dropout)
        self.lin3 = NetLinLayer(self.chns[3], use_dropout=use_dropout)
        self.lin4 = NetLinLayer(self.chns[4], use_dropout=use_dropout)
        self.load_from_pretrained()
        for param in self.parameters():
            param.requires_grad = False

    def load_from_pretrained(self, name="vgg_lpips"):
        ckpt = get_ckpt_path(name, os.path.join(os.path.dirname(os.path.abspath(__file__)), "cache"))
        self.load_state_dict(torch.load(ckpt, map_location=torch.device("cpu")), strict=False)
        print("loaded pretrained LPIPS loss from {}".format(ckpt))

    @classmethod
    def from_pretrained(cls, name="vgg_lpips"):
        if name != "vgg_lpips":
            raise NotImplementedError
        model = cls()
        ckpt = get_ckpt_path(name, os.path.join(os.path.dirname(os.path.abspath(__file__)), "cache"))
        model.load_state_dict(torch.load(ckpt, map_location=torch.device("cpu")), strict=False)
        return model

    def forward(self, input, target):
        in0_input, in1_input = (self.scaling_layer(input), self.scaling_layer(target))
        outs0, outs1 = self.net(in0_input), self.net(in1_input)
        feats0, feats1, diffs = {}, {}, {}
        lins = [self.lin0, self.lin1, self.lin2, self.lin3, self.lin4]
        for kk in range(len(self.chns)):
            feats0[kk], feats1[kk] = normalize_tensor(outs0[kk]), normalize_tensor(outs1[kk])
            diffs[kk] = (feats0[kk] - feats1[kk]) ** 2

        res = [spatial_average(lins[kk].model(diffs[kk]), keepdim=True) for kk in range(len(self.chns))]
        val = res[0]
        for l in range(1, len(self.chns)):
            val += res[l]
        return val


class ScalingLayer(nn.Module):
    def __init__(self):
        super(ScalingLayer, self).__init__()
        self.register_buffer('shift', torch.Tensor([-.030, -.088, -.188])[None, :, None, None])
        self.register_buffer('scale', torch.Tensor([.458, .448, .450])[None, :, None, None])

    def forward(self, inp):
        return (inp - self.shift) / self.scale


class NetLinLayer(nn.Module):
    """ A single linear layer which does a 1x1 conv """
    def __init__(self, chn_in, chn_out=1, use_dropout=False):
        super(NetLinLayer, self).__init__()
        layers = [nn.Dropout(), ] if (use_dropout) else []
        layers += [nn.Conv2d(chn_in, chn_out, 1, stride=1, padding=0, bias=False), ]
        self.model = nn.Sequential(*layers)


class vgg16(torch.nn.Module):
    def __init__(self, requires_grad=False, pretrained=True):
        super(vgg16, self).__init__()
        vgg_pretrained_features = models.vgg16(pretrained=pretrained).features
        self.slice1 = torch.nn.Sequential()
        self.slice2 = torch.nn.Sequential()
        self.slice3 = torch.nn.Sequential()
        self.slice4 = torch.nn.Sequential()
        self.slice5 = torch.nn.Sequential()
        self.N_slices = 5
        for x in range(4):
            self.slice1.add_module(str(x), vgg_pretrained_features[x])
        for x in range(4, 9):
            self.slice2.add_module(str(x), vgg_pretrained_features[x])
        for x in range(9, 16):
            self.slice3.add_module(str(x), vgg_pretrained_features[x])
        for x in range(16, 23):
            self.slice4.add_module(str(x), vgg_pretrained_features[x])
        for x in range(23, 30):
            self.slice5.add_module(str(x), vgg_pretrained_features[x])
        if not requires_grad:
            for param in self.parameters():
                param.requires_grad = False

    def forward(self, X):
        h = self.slice1(X)
        h_relu1_2 = h
        h = self.slice2(h)
        h_relu2_2 = h
        h = self.slice3(h)
        h_relu3_3 = h
        h = self.slice4(h)
        h_relu4_3 = h
        h = self.slice5(h)
        h_relu5_3 = h
        vgg_outputs = namedtuple("VggOutputs", ['relu1_2', 'relu2_2', 'relu3_3', 'relu4_3', 'relu5_3'])
        out = vgg_outputs(h_relu1_2, h_relu2_2, h_relu3_3, h_relu4_3, h_relu5_3)
        return out


def normalize_tensor(x, eps=1e-10):
    norm_factor = torch.sqrt(torch.sum(x**2, dim=1, keepdim=True))
    return x / (norm_factor + eps)


def spatial_average(x, keepdim=True):
    return x.mean([2, 3], keepdim=keepdim)
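Usage note for lpips.py: inputs are expected in [-1, 1] (the ScalingLayer remaps them to the VGG statistics) and the metric returns one distance per pair, shaped (N, 1, 1, 1). A minimal sketch; the first construction downloads both the torchvision VGG-16 weights and the LPIPS linear heads (vgg.pth) into the module's cache folder, and the import path is assumed from the repo's own convention:

import torch
from tokenizer.tokenizer_image.lpips import LPIPS  # path assumed

lpips = LPIPS().eval()                    # downloads pretrained weights on first use
x = torch.rand(4, 3, 256, 256) * 2 - 1    # two batches of images in [-1, 1]
y = torch.rand(4, 3, 256, 256) * 2 - 1
with torch.no_grad():
    dist = lpips(x, y)                    # shape (4, 1, 1, 1); larger = more different
print(dist.flatten())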
tokenizer_image/reconstruction_vq_ddp.py
ADDED
@@ -0,0 +1,197 @@
import torch
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
import torch.nn.functional as F
import torch.distributed as dist
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from torchvision import transforms
from tqdm import tqdm
import os
from PIL import Image
import numpy as np
import argparse
import itertools

from skimage.metrics import peak_signal_noise_ratio as psnr_loss
from skimage.metrics import structural_similarity as ssim_loss

from dataset.augmentation import center_crop_arr
from dataset.build import build_dataset
from tokenizer.tokenizer_image.vq_model import VQ_models


def create_npz_from_sample_folder(sample_dir, num=50000):
    """
    Builds a single .npz file from a folder of .png samples.
    """
    samples = []
    for i in tqdm(range(num), desc="Building .npz file from samples"):
        sample_pil = Image.open(f"{sample_dir}/{i:06d}.png")
        sample_np = np.asarray(sample_pil).astype(np.uint8)
        samples.append(sample_np)
    samples = np.stack(samples)
    assert samples.shape == (num, samples.shape[1], samples.shape[2], 3)
    npz_path = f"{sample_dir}.npz"
    np.savez(npz_path, arr_0=samples)
    print(f"Saved .npz file to {npz_path} [shape={samples.shape}].")
    return npz_path


def main(args):
    # Setup PyTorch:
    assert torch.cuda.is_available(), "Reconstruction with DDP requires at least one GPU."
    torch.set_grad_enabled(False)

    # Setup DDP:
    dist.init_process_group("nccl")
    rank = dist.get_rank()
    device = rank % torch.cuda.device_count()
    seed = args.global_seed * dist.get_world_size() + rank
    torch.manual_seed(seed)
    torch.cuda.set_device(device)
    print(f"Starting rank={rank}, seed={seed}, world_size={dist.get_world_size()}.")

    # create and load model
    vq_model = VQ_models[args.vq_model](
        codebook_size=args.codebook_size,
        codebook_embed_dim=args.codebook_embed_dim)
    vq_model.to(device)
    vq_model.eval()
    checkpoint = torch.load(args.vq_ckpt, map_location="cpu")
    if "ema" in checkpoint:  # ema
        model_weight = checkpoint["ema"]
    elif "model" in checkpoint:  # ddp
        model_weight = checkpoint["model"]
    elif "state_dict" in checkpoint:
        model_weight = checkpoint["state_dict"]
    else:
        raise Exception("please check model weight")
    vq_model.load_state_dict(model_weight)
    del checkpoint

    # Create folder to save samples:
    folder_name = (f"{args.vq_model}-{args.dataset}-size-{args.image_size}-size-{args.image_size_eval}"
                   f"-codebook-size-{args.codebook_size}-dim-{args.codebook_embed_dim}-seed-{args.global_seed}")
    sample_folder_dir = f"{args.sample_dir}/{folder_name}"
    if rank == 0:
        os.makedirs(sample_folder_dir, exist_ok=True)
        print(f"Saving .png samples at {sample_folder_dir}")
    dist.barrier()

    # Setup data:
    transform = transforms.Compose([
        transforms.Lambda(lambda pil_image: center_crop_arr(pil_image, args.image_size)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)
    ])

    if args.dataset == 'imagenet':
        dataset = build_dataset(args, transform=transform)
        num_fid_samples = 50000
    elif args.dataset == 'coco':
        dataset = build_dataset(args, transform=transform)
        num_fid_samples = 5000
    else:
        raise Exception("please check dataset")

    sampler = DistributedSampler(
        dataset,
        num_replicas=dist.get_world_size(),
        rank=rank,
        shuffle=False,
        seed=args.global_seed
    )
    loader = DataLoader(
        dataset,
        batch_size=args.per_proc_batch_size,
        shuffle=False,
        sampler=sampler,
        num_workers=args.num_workers,
        pin_memory=True,
        drop_last=False
    )

    # Figure out how many samples we need to generate on each GPU and how many iterations we need to run:
    n = args.per_proc_batch_size
    global_batch_size = n * dist.get_world_size()

    psnr_val_rgb = []
    ssim_val_rgb = []
    loader = tqdm(loader) if rank == 0 else loader
    total = 0
    for x, _ in loader:
        if args.image_size_eval != args.image_size:
            rgb_gts = F.interpolate(x, size=(args.image_size_eval, args.image_size_eval), mode='bicubic')
        else:
            rgb_gts = x
        rgb_gts = (rgb_gts.permute(0, 2, 3, 1).to("cpu").numpy() + 1.0) / 2.0  # rgb_gt value is between [0, 1]
        x = x.to(device, non_blocking=True)
        with torch.no_grad():
            latent, _, [_, _, indices] = vq_model.encode(x)
            samples = vq_model.decode_code(indices, latent.shape)  # output value is between [-1, 1]
        if args.image_size_eval != args.image_size:
            samples = F.interpolate(samples, size=(args.image_size_eval, args.image_size_eval), mode='bicubic')
        samples = torch.clamp(127.5 * samples + 128.0, 0, 255).permute(0, 2, 3, 1).to("cpu", dtype=torch.uint8).numpy()

        # Save samples to disk as individual .png files
        for i, (sample, rgb_gt) in enumerate(zip(samples, rgb_gts)):
            index = i * dist.get_world_size() + rank + total
            Image.fromarray(sample).save(f"{sample_folder_dir}/{index:06d}.png")
            # metric
            rgb_restored = sample.astype(np.float32) / 255.  # rgb_restored value is between [0, 1]
            psnr = psnr_loss(rgb_restored, rgb_gt)
            ssim = ssim_loss(rgb_restored, rgb_gt, multichannel=True, data_range=2.0, channel_axis=-1)
            psnr_val_rgb.append(psnr)
            ssim_val_rgb.append(ssim)

        total += global_batch_size

    # ------------------------------------
    #           Summary
    # ------------------------------------
    # Make sure all processes have finished saving their samples
    dist.barrier()
    world_size = dist.get_world_size()
    gather_psnr_val = [None for _ in range(world_size)]
    gather_ssim_val = [None for _ in range(world_size)]
    dist.all_gather_object(gather_psnr_val, psnr_val_rgb)
    dist.all_gather_object(gather_ssim_val, ssim_val_rgb)

    if rank == 0:
        gather_psnr_val = list(itertools.chain(*gather_psnr_val))
        gather_ssim_val = list(itertools.chain(*gather_ssim_val))
        psnr_val_rgb = sum(gather_psnr_val) / len(gather_psnr_val)
        ssim_val_rgb = sum(gather_ssim_val) / len(gather_ssim_val)
        print("PSNR: %f, SSIM: %f " % (psnr_val_rgb, ssim_val_rgb))

        result_file = f"{sample_folder_dir}_results.txt"
        print("writing results to {}".format(result_file))
        with open(result_file, 'w') as f:
            print("PSNR: %f, SSIM: %f " % (psnr_val_rgb, ssim_val_rgb), file=f)

        create_npz_from_sample_folder(sample_folder_dir, num_fid_samples)
        print("Done.")

    dist.barrier()
    dist.destroy_process_group()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--data-path", type=str, required=True)
    parser.add_argument("--dataset", type=str, choices=['imagenet', 'coco'], default='imagenet')
    parser.add_argument("--vq-model", type=str, choices=list(VQ_models.keys()), default="VQ-16")
    parser.add_argument("--vq-ckpt", type=str, default=None, help="ckpt path for vq model")
    parser.add_argument("--codebook-size", type=int, default=16384, help="codebook size for vector quantization")
    parser.add_argument("--codebook-embed-dim", type=int, default=8, help="codebook dimension for vector quantization")
    parser.add_argument("--image-size", type=int, choices=[256, 384, 512], default=256)
    parser.add_argument("--image-size-eval", type=int, choices=[256, 384, 512], default=256)
    parser.add_argument("--sample-dir", type=str, default="reconstructions")
    parser.add_argument("--per-proc-batch-size", type=int, default=32)
    parser.add_argument("--global-seed", type=int, default=0)
    parser.add_argument("--num-workers", type=int, default=4)
    args = parser.parse_args()
    main(args)
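The per-rank bookkeeping above writes sample i of each batch to global index i * world_size + rank + total, so ranks interleave within a batch and total advances by the global batch size. A plain-Python sketch confirming that this layout covers a contiguous index range without collisions:

# Pure bookkeeping check, no torch: 4 ranks, per-process batch of 3, 2 iterations.
world_size, batch, iters = 4, 3, 2
indices = set()
for rank in range(world_size):
    total = 0
    for _ in range(iters):
        for i in range(batch):
            indices.add(i * world_size + rank + total)
        total += batch * world_size  # "total += global_batch_size" in the script
print(sorted(indices) == list(range(batch * world_size * iters)))  # True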
tokenizer_image/vq_demo.py
ADDED
@@ -0,0 +1,84 @@
import torch
import torch.nn.functional as F

import os
import argparse
import numpy as np
from PIL import Image

from tokenizer.tokenizer_image.vq_model import VQ_models
from dataset.augmentation import center_crop_arr


def main(args):
    # Setup PyTorch:
    torch.manual_seed(args.seed)
    torch.set_grad_enabled(False)
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # create and load model
    model = VQ_models[args.vq_model](
        codebook_size=args.codebook_size,
        codebook_embed_dim=args.codebook_embed_dim)
    model.to(device)
    model.eval()
    checkpoint = torch.load(args.vq_ckpt, map_location="cpu")
    if "ema" in checkpoint:  # ema
        model_weight = checkpoint["ema"]
    elif "model" in checkpoint:  # ddp
        model_weight = checkpoint["model"]
    elif "state_dict" in checkpoint:
        model_weight = checkpoint["state_dict"]
    else:
        raise Exception("please check model weight")
    model.load_state_dict(model_weight)
    del checkpoint

    # output dir
    os.makedirs(args.output_dir, exist_ok=True)
    out_path = args.image_path.replace('.jpg', '_{}.jpg'.format(args.suffix))
    out_path = out_path.replace('.jpeg', '_{}.jpeg'.format(args.suffix))
    out_path = out_path.replace('.png', '_{}.png'.format(args.suffix))
    out_filename = out_path.split('/')[-1]
    out_path = os.path.join(args.output_dir, out_filename)

    # load image
    pil_image = Image.open(args.image_path).convert("RGB")
    img = center_crop_arr(pil_image, args.image_size)
    # # preprocess
    # size_org = img.size
    # img = img.resize((input_size, input_size))
    img = np.array(img) / 255.
    x = 2.0 * img - 1.0  # x value is between [-1, 1]
    x = torch.tensor(x)
    x = x.unsqueeze(dim=0)
    x = torch.einsum('nhwc->nchw', x)
    x_input = x.float().to(device)  # use the device selected above rather than hard-coding "cuda"

    # inference
    with torch.no_grad():
        latent, _, [_, _, indices] = model.encode(x_input)
        output = model.decode_code(indices, latent.shape)  # output value is between [-1, 1]

    # postprocess
    output = F.interpolate(output, size=[args.image_size, args.image_size], mode='bicubic').permute(0, 2, 3, 1)[0]
    sample = torch.clamp(127.5 * output + 128.0, 0, 255).to("cpu", dtype=torch.uint8).numpy()

    # save
    Image.fromarray(sample).save(out_path)
    print("Reconstructed image is saved to {}".format(out_path))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--image-path", type=str, default="assets/example.jpg")
    parser.add_argument("--output-dir", type=str, default="output_vq_demo")
    parser.add_argument("--suffix", type=str, default="tokenizer_image")
    parser.add_argument("--vq-model", type=str, choices=list(VQ_models.keys()), default="VQ-16")
    parser.add_argument("--vq-ckpt", type=str, default=None, help="ckpt path for vq model")
    parser.add_argument("--codebook-size", type=int, default=16384, help="codebook size for vector quantization")
    parser.add_argument("--codebook-embed-dim", type=int, default=8, help="codebook dimension for vector quantization")
    parser.add_argument("--image-size", type=int, choices=[256, 384, 448, 512, 1024], default=512)
    parser.add_argument("--seed", type=int, default=0)
    args = parser.parse_args()
    main(args)
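The demo's pixel scaling is an affine round trip: [0, 255] maps to [-1, 1] via 2*(p/255) - 1, and decoding inverts it with clamp(127.5*x + 128, 0, 255). A tiny standalone check of that mapping:

import numpy as np
import torch

pixels = np.array([0, 64, 128, 255], dtype=np.float32)
x = 2.0 * (pixels / 255.0) - 1.0                                   # preprocess: [0,255] -> [-1,1]
restored = torch.clamp(127.5 * torch.from_numpy(x) + 128.0, 0, 255).to(torch.uint8)
print(restored.tolist())                                            # [0, 64, 128, 255]: recovered up to rounding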
tokenizer_image/vq_loss.py
ADDED
@@ -0,0 +1,168 @@
# Modified from:
# taming-transformers: https://github.com/CompVis/taming-transformers
# muse-maskgit-pytorch: https://github.com/lucidrains/muse-maskgit-pytorch/blob/main/muse_maskgit_pytorch/vqgan_vae.py
import torch
import torch.nn as nn
import torch.nn.functional as F

from tokenizer.tokenizer_image.lpips import LPIPS
from tokenizer.tokenizer_image.discriminator_patchgan import NLayerDiscriminator as PatchGANDiscriminator
from tokenizer.tokenizer_image.discriminator_stylegan import Discriminator as StyleGANDiscriminator


def hinge_d_loss(logits_real, logits_fake):
    loss_real = torch.mean(F.relu(1. - logits_real))
    loss_fake = torch.mean(F.relu(1. + logits_fake))
    d_loss = 0.5 * (loss_real + loss_fake)
    return d_loss


def vanilla_d_loss(logits_real, logits_fake):
    loss_real = torch.mean(F.softplus(-logits_real))
    loss_fake = torch.mean(F.softplus(logits_fake))
    d_loss = 0.5 * (loss_real + loss_fake)
    return d_loss


def non_saturating_d_loss(logits_real, logits_fake):
    # F.binary_cross_entropy_with_logits expects (input, target); targets are
    # ones for real logits and zeros for fake logits (the original had the
    # arguments swapped).
    loss_real = torch.mean(F.binary_cross_entropy_with_logits(logits_real, torch.ones_like(logits_real)))
    loss_fake = torch.mean(F.binary_cross_entropy_with_logits(logits_fake, torch.zeros_like(logits_fake)))
    d_loss = 0.5 * (loss_real + loss_fake)
    return d_loss


def hinge_gen_loss(logit_fake):
    return -torch.mean(logit_fake)


def non_saturating_gen_loss(logit_fake):
    # the generator wants fake logits classified as real (target = ones)
    return torch.mean(F.binary_cross_entropy_with_logits(logit_fake, torch.ones_like(logit_fake)))


def adopt_weight(weight, global_step, threshold=0, value=0.):
    if global_step < threshold:
        weight = value
    return weight


class VQLoss(nn.Module):
    def __init__(self, disc_start, disc_loss="hinge", disc_dim=64, disc_type='patchgan', image_size=256,
                 disc_num_layers=3, disc_in_channels=3, disc_weight=1.0, disc_adaptive_weight=False,
                 gen_adv_loss='hinge', reconstruction_loss='l2', reconstruction_weight=1.0,
                 codebook_weight=1.0, perceptual_weight=1.0,
                 ):
        super().__init__()
        # discriminator loss
        assert disc_type in ["patchgan", "stylegan"]
        assert disc_loss in ["hinge", "vanilla", "non-saturating"]
        if disc_type == "patchgan":
            self.discriminator = PatchGANDiscriminator(
                input_nc=disc_in_channels,
                n_layers=disc_num_layers,
                ndf=disc_dim,
            )
        elif disc_type == "stylegan":
            self.discriminator = StyleGANDiscriminator(
                input_nc=disc_in_channels,
                image_size=image_size,
            )
        else:
            raise ValueError(f"Unknown GAN discriminator type '{disc_type}'.")
        if disc_loss == "hinge":
            self.disc_loss = hinge_d_loss
        elif disc_loss == "vanilla":
            self.disc_loss = vanilla_d_loss
        elif disc_loss == "non-saturating":
            self.disc_loss = non_saturating_d_loss
        else:
            raise ValueError(f"Unknown GAN discriminator loss '{disc_loss}'.")
        self.discriminator_iter_start = disc_start
        self.disc_weight = disc_weight
        self.disc_adaptive_weight = disc_adaptive_weight

        assert gen_adv_loss in ["hinge", "non-saturating"]
        # gen_adv_loss
        if gen_adv_loss == "hinge":
            self.gen_adv_loss = hinge_gen_loss
        elif gen_adv_loss == "non-saturating":
            self.gen_adv_loss = non_saturating_gen_loss
        else:
            raise ValueError(f"Unknown GAN generator loss '{gen_adv_loss}'.")

        # perceptual loss
        self.perceptual_loss = LPIPS().eval()
        self.perceptual_weight = perceptual_weight

        # reconstruction loss
        if reconstruction_loss == "l1":
            self.rec_loss = F.l1_loss
        elif reconstruction_loss == "l2":
            self.rec_loss = F.mse_loss
        else:
            raise ValueError(f"Unknown rec loss '{reconstruction_loss}'.")
        self.rec_weight = reconstruction_weight

        # codebook loss
        self.codebook_weight = codebook_weight

    def calculate_adaptive_weight(self, nll_loss, g_loss, last_layer):
        nll_grads = torch.autograd.grad(nll_loss, last_layer, retain_graph=True)[0]
        g_grads = torch.autograd.grad(g_loss, last_layer, retain_graph=True)[0]

        d_weight = torch.norm(nll_grads) / (torch.norm(g_grads) + 1e-4)
        d_weight = torch.clamp(d_weight, 0.0, 1e4).detach()
        return d_weight

    def forward(self, codebook_loss, inputs, reconstructions, optimizer_idx, global_step, last_layer=None,
                logger=None, log_every=100):
        # generator update
        if optimizer_idx == 0:
            # reconstruction loss
            rec_loss = self.rec_loss(inputs.contiguous(), reconstructions.contiguous())

            # perceptual loss
            p_loss = self.perceptual_loss(inputs.contiguous(), reconstructions.contiguous())
            p_loss = torch.mean(p_loss)

            # discriminator loss
            logits_fake = self.discriminator(reconstructions.contiguous())
            generator_adv_loss = self.gen_adv_loss(logits_fake)

            if self.disc_adaptive_weight:
                nll_loss = self.rec_weight * rec_loss + self.perceptual_weight * p_loss
                disc_adaptive_weight = self.calculate_adaptive_weight(nll_loss, generator_adv_loss, last_layer=last_layer)
            else:
                disc_adaptive_weight = 1
            disc_weight = adopt_weight(self.disc_weight, global_step, threshold=self.discriminator_iter_start)

            loss = self.rec_weight * rec_loss + \
                self.perceptual_weight * p_loss + \
                disc_adaptive_weight * disc_weight * generator_adv_loss + \
                codebook_loss[0] + codebook_loss[1] + codebook_loss[2]

            if global_step % log_every == 0:
                rec_loss = self.rec_weight * rec_loss
                p_loss = self.perceptual_weight * p_loss
                generator_adv_loss = disc_adaptive_weight * disc_weight * generator_adv_loss
                logger.info(f"(Generator) rec_loss: {rec_loss:.4f}, perceptual_loss: {p_loss:.4f}, "
                            f"vq_loss: {codebook_loss[0]:.4f}, commit_loss: {codebook_loss[1]:.4f}, entropy_loss: {codebook_loss[2]:.4f}, "
                            f"codebook_usage: {codebook_loss[3]:.4f}, generator_adv_loss: {generator_adv_loss:.4f}, "
                            f"disc_adaptive_weight: {disc_adaptive_weight:.4f}, disc_weight: {disc_weight:.4f}")
            return loss

        # discriminator update
        if optimizer_idx == 1:
            logits_real = self.discriminator(inputs.contiguous().detach())
            logits_fake = self.discriminator(reconstructions.contiguous().detach())

            disc_weight = adopt_weight(self.disc_weight, global_step, threshold=self.discriminator_iter_start)
            d_adversarial_loss = disc_weight * self.disc_loss(logits_real, logits_fake)

            if global_step % log_every == 0:
                logits_real = logits_real.detach().mean()
                logits_fake = logits_fake.detach().mean()
                logger.info(f"(Discriminator) "
                            f"discriminator_adv_loss: {d_adversarial_loss:.4f}, disc_weight: {disc_weight:.4f}, "
                            f"logits_real: {logits_real:.4f}, logits_fake: {logits_fake:.4f}")
            return d_adversarial_loss
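VQLoss is designed to be called twice per training step: optimizer_idx=0 returns the generator objective (reconstruction + perceptual + adversarial + the three codebook terms) and optimizer_idx=1 returns the discriminator loss on detached inputs. A minimal single-step sketch with a small VQ-8 model and a random batch; the configuration values are illustrative, the first run downloads the LPIPS weights, and the real loop (with AMP, DDP and EMA) lives in vq_train.py:

import torch
from tokenizer.tokenizer_image.vq_model import VQ_models
from tokenizer.tokenizer_image.vq_loss import VQLoss

vq_model = VQ_models['VQ-8'](codebook_size=1024, codebook_embed_dim=8).train()
vq_loss = VQLoss(disc_start=0, disc_type='patchgan', gen_adv_loss='hinge').train()
opt_g = torch.optim.Adam(vq_model.parameters(), lr=1e-4)
opt_d = torch.optim.Adam(vq_loss.discriminator.parameters(), lr=1e-4)

imgs = torch.randn(2, 3, 64, 64)  # stand-in batch, values roughly in [-1, 1]

# generator pass: reconstruct, then backprop the combined objective
recons, codebook_loss = vq_model(imgs)
loss_gen = vq_loss(codebook_loss, imgs, recons, optimizer_idx=0, global_step=1)
opt_g.zero_grad(); loss_gen.backward(); opt_g.step()

# discriminator pass: VQLoss detaches both inputs internally
loss_disc = vq_loss(codebook_loss, imgs, recons.detach(), optimizer_idx=1, global_step=1)
opt_d.zero_grad(); loss_disc.backward(); opt_d.step()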
tokenizer_image/vq_model.py
ADDED
@@ -0,0 +1,424 @@
# Modified from:
# taming-transformers: https://github.com/CompVis/taming-transformers
# maskgit: https://github.com/google-research/maskgit
from dataclasses import dataclass, field
from typing import List

import torch
import torch.nn as nn
import torch.nn.functional as F


@dataclass
class ModelArgs:
    codebook_size: int = 16384
    codebook_embed_dim: int = 8
    codebook_l2_norm: bool = True
    codebook_show_usage: bool = True
    commit_loss_beta: float = 0.25
    entropy_loss_ratio: float = 0.0

    encoder_ch_mult: List[int] = field(default_factory=lambda: [1, 1, 2, 2, 4])
    decoder_ch_mult: List[int] = field(default_factory=lambda: [1, 1, 2, 2, 4])
    z_channels: int = 256
    dropout_p: float = 0.0


class VQModel(nn.Module):
    def __init__(self, config: ModelArgs):
        super().__init__()
        self.config = config
        self.encoder = Encoder(ch_mult=config.encoder_ch_mult, z_channels=config.z_channels, dropout=config.dropout_p)
        self.decoder = Decoder(ch_mult=config.decoder_ch_mult, z_channels=config.z_channels, dropout=config.dropout_p)

        self.quantize = VectorQuantizer(config.codebook_size, config.codebook_embed_dim,
                                        config.commit_loss_beta, config.entropy_loss_ratio,
                                        config.codebook_l2_norm, config.codebook_show_usage)
        self.quant_conv = nn.Conv2d(config.z_channels, config.codebook_embed_dim, 1)
        self.post_quant_conv = nn.Conv2d(config.codebook_embed_dim, config.z_channels, 1)

    def encode(self, x):
        h = self.encoder(x)
        h = self.quant_conv(h)
        quant, emb_loss, info = self.quantize(h)
        return quant, emb_loss, info

    def decode(self, quant):
        quant = self.post_quant_conv(quant)
        dec = self.decoder(quant)
        return dec

    def decode_code(self, code_b, shape=None, channel_first=True):
        quant_b = self.quantize.get_codebook_entry(code_b, shape, channel_first)
        dec = self.decode(quant_b)
        return dec

    def forward(self, input):
        quant, diff, _ = self.encode(input)
        dec = self.decode(quant)
        return dec, diff


class Encoder(nn.Module):
    def __init__(self, in_channels=3, ch=128, ch_mult=(1, 1, 2, 2, 4), num_res_blocks=2,
                 norm_type='group', dropout=0.0, resamp_with_conv=True, z_channels=256):
        super().__init__()
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.conv_in = nn.Conv2d(in_channels, ch, kernel_size=3, stride=1, padding=1)

        # downsampling
        in_ch_mult = (1,) + tuple(ch_mult)
        self.conv_blocks = nn.ModuleList()
        for i_level in range(self.num_resolutions):
            conv_block = nn.Module()
            # res & attn
            res_block = nn.ModuleList()
            attn_block = nn.ModuleList()
            block_in = ch * in_ch_mult[i_level]
            block_out = ch * ch_mult[i_level]
            for _ in range(self.num_res_blocks):
                res_block.append(ResnetBlock(block_in, block_out, dropout=dropout, norm_type=norm_type))
                block_in = block_out
                if i_level == self.num_resolutions - 1:
                    attn_block.append(AttnBlock(block_in, norm_type))
            conv_block.res = res_block
            conv_block.attn = attn_block
            # downsample
            if i_level != self.num_resolutions - 1:
                conv_block.downsample = Downsample(block_in, resamp_with_conv)
            self.conv_blocks.append(conv_block)

        # middle
        self.mid = nn.ModuleList()
        self.mid.append(ResnetBlock(block_in, block_in, dropout=dropout, norm_type=norm_type))
        self.mid.append(AttnBlock(block_in, norm_type=norm_type))
        self.mid.append(ResnetBlock(block_in, block_in, dropout=dropout, norm_type=norm_type))

        # end
        self.norm_out = Normalize(block_in, norm_type)
        self.conv_out = nn.Conv2d(block_in, z_channels, kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        h = self.conv_in(x)
        # downsampling
        for i_level, block in enumerate(self.conv_blocks):
            for i_block in range(self.num_res_blocks):
                h = block.res[i_block](h)
                if len(block.attn) > 0:
                    h = block.attn[i_block](h)
            if i_level != self.num_resolutions - 1:
                h = block.downsample(h)

        # middle
        for mid_block in self.mid:
            h = mid_block(h)

        # end
        h = self.norm_out(h)
        h = nonlinearity(h)
        h = self.conv_out(h)
        return h


class Decoder(nn.Module):
    def __init__(self, z_channels=256, ch=128, ch_mult=(1, 1, 2, 2, 4), num_res_blocks=2, norm_type="group",
                 dropout=0.0, resamp_with_conv=True, out_channels=3):
        super().__init__()
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks

        block_in = ch * ch_mult[self.num_resolutions - 1]
        # z to block_in
        self.conv_in = nn.Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1)

        # middle
        self.mid = nn.ModuleList()
        self.mid.append(ResnetBlock(block_in, block_in, dropout=dropout, norm_type=norm_type))
        self.mid.append(AttnBlock(block_in, norm_type=norm_type))
        self.mid.append(ResnetBlock(block_in, block_in, dropout=dropout, norm_type=norm_type))

        # upsampling
        self.conv_blocks = nn.ModuleList()
        for i_level in reversed(range(self.num_resolutions)):
            conv_block = nn.Module()
            # res & attn
            res_block = nn.ModuleList()
            attn_block = nn.ModuleList()
            block_out = ch * ch_mult[i_level]
            for _ in range(self.num_res_blocks + 1):
                res_block.append(ResnetBlock(block_in, block_out, dropout=dropout, norm_type=norm_type))
                block_in = block_out
                if i_level == self.num_resolutions - 1:
                    attn_block.append(AttnBlock(block_in, norm_type))
            conv_block.res = res_block
            conv_block.attn = attn_block
            # upsample
            if i_level != 0:
                conv_block.upsample = Upsample(block_in, resamp_with_conv)
            self.conv_blocks.append(conv_block)

        # end
        self.norm_out = Normalize(block_in, norm_type)
        self.conv_out = nn.Conv2d(block_in, out_channels, kernel_size=3, stride=1, padding=1)

    @property
    def last_layer(self):
        return self.conv_out.weight

    def forward(self, z):
        # z to block_in
        h = self.conv_in(z)

        # middle
        for mid_block in self.mid:
            h = mid_block(h)

        # upsampling
        for i_level, block in enumerate(self.conv_blocks):
            for i_block in range(self.num_res_blocks + 1):
                h = block.res[i_block](h)
                if len(block.attn) > 0:
                    h = block.attn[i_block](h)
            if i_level != self.num_resolutions - 1:
                h = block.upsample(h)

        # end
        h = self.norm_out(h)
        h = nonlinearity(h)
        h = self.conv_out(h)
        return h


class VectorQuantizer(nn.Module):
    def __init__(self, n_e, e_dim, beta, entropy_loss_ratio, l2_norm, show_usage):
        super().__init__()
        self.n_e = n_e
        self.e_dim = e_dim
        self.beta = beta
        self.entropy_loss_ratio = entropy_loss_ratio
        self.l2_norm = l2_norm
        self.show_usage = show_usage

        self.embedding = nn.Embedding(self.n_e, self.e_dim)
        self.embedding.weight.data.uniform_(-1.0 / self.n_e, 1.0 / self.n_e)
        if self.l2_norm:
            self.embedding.weight.data = F.normalize(self.embedding.weight.data, p=2, dim=-1)
        if self.show_usage:
            # rolling buffer of the most recently used code indices (buffers
            # should be plain tensors, not Parameters)
            self.register_buffer("codebook_used", torch.zeros(65536))

    def forward(self, z):
        # reshape z -> (batch, height, width, channel) and flatten
        z = torch.einsum('b c h w -> b h w c', z).contiguous()
        z_flattened = z.view(-1, self.e_dim)
        # distances from z to embeddings e_j: (z - e)^2 = z^2 + e^2 - 2 e * z

        if self.l2_norm:
            z = F.normalize(z, p=2, dim=-1)
            z_flattened = F.normalize(z_flattened, p=2, dim=-1)
            embedding = F.normalize(self.embedding.weight, p=2, dim=-1)
        else:
            embedding = self.embedding.weight

        d = torch.sum(z_flattened ** 2, dim=1, keepdim=True) + \
            torch.sum(embedding**2, dim=1) - 2 * \
            torch.einsum('bd,dn->bn', z_flattened, torch.einsum('n d -> d n', embedding))

        min_encoding_indices = torch.argmin(d, dim=1)
        z_q = embedding[min_encoding_indices].view(z.shape)
        perplexity = None
        min_encodings = None
        vq_loss = None
        commit_loss = None
        entropy_loss = None
        codebook_usage = 0

        if self.show_usage and self.training:
            cur_len = min_encoding_indices.shape[0]
            self.codebook_used[:-cur_len] = self.codebook_used[cur_len:].clone()
            self.codebook_used[-cur_len:] = min_encoding_indices
            codebook_usage = len(torch.unique(self.codebook_used)) / self.n_e

        # compute loss for embedding
        if self.training:
            vq_loss = torch.mean((z_q - z.detach()) ** 2)
            commit_loss = self.beta * torch.mean((z_q.detach() - z) ** 2)
            entropy_loss = self.entropy_loss_ratio * compute_entropy_loss(-d)

        # preserve gradients
        z_q = z + (z_q - z).detach()

        # reshape back to match original input shape
        z_q = torch.einsum('b h w c -> b c h w', z_q)

        return z_q, (vq_loss, commit_loss, entropy_loss, codebook_usage), (perplexity, min_encodings, min_encoding_indices)

    def get_codebook_entry(self, indices, shape=None, channel_first=True):
        # shape = (batch, channel, height, width) if channel_first else (batch, height, width, channel)
        if self.l2_norm:
            embedding = F.normalize(self.embedding.weight, p=2, dim=-1)
        else:
            embedding = self.embedding.weight
        z_q = embedding[indices]  # (b*h*w, c)

        if shape is not None:
            if channel_first:
                z_q = z_q.reshape(shape[0], shape[2], shape[3], shape[1])
                # reshape back to match original input shape
                z_q = z_q.permute(0, 3, 1, 2).contiguous()
            else:
                z_q = z_q.view(shape)
        return z_q


class ResnetBlock(nn.Module):
    def __init__(self, in_channels, out_channels=None, conv_shortcut=False, dropout=0.0, norm_type='group'):
        super().__init__()
        self.in_channels = in_channels
        out_channels = in_channels if out_channels is None else out_channels
        self.out_channels = out_channels
        self.use_conv_shortcut = conv_shortcut

        self.norm1 = Normalize(in_channels, norm_type)
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.norm2 = Normalize(out_channels, norm_type)
        self.dropout = nn.Dropout(dropout)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)

        if self.in_channels != self.out_channels:
            if self.use_conv_shortcut:
                self.conv_shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
            else:
                self.nin_shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)

    def forward(self, x):
        h = x
        h = self.norm1(h)
        h = nonlinearity(h)
        h = self.conv1(h)
        h = self.norm2(h)
        h = nonlinearity(h)
        h = self.dropout(h)
        h = self.conv2(h)

        if self.in_channels != self.out_channels:
            if self.use_conv_shortcut:
                x = self.conv_shortcut(x)
            else:
                x = self.nin_shortcut(x)
        return x + h


class AttnBlock(nn.Module):
    def __init__(self, in_channels, norm_type='group'):
        super().__init__()
        self.norm = Normalize(in_channels, norm_type)
        self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
        self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
        self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
        self.proj_out = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)

    def forward(self, x):
        h_ = x
        h_ = self.norm(h_)
        q = self.q(h_)
        k = self.k(h_)
        v = self.v(h_)

        # compute attention
        b, c, h, w = q.shape
        q = q.reshape(b, c, h*w)
        q = q.permute(0, 2, 1)   # b,hw,c
        k = k.reshape(b, c, h*w) # b,c,hw
        w_ = torch.bmm(q, k)     # b,hw,hw    w[b,i,j] = sum_c q[b,i,c] k[b,c,j]
        w_ = w_ * (int(c)**(-0.5))
        w_ = F.softmax(w_, dim=2)

        # attend to values
        v = v.reshape(b, c, h*w)
        w_ = w_.permute(0, 2, 1) # b,hw,hw (first hw of k, second of q)
        h_ = torch.bmm(v, w_)    # b,c,hw (hw of q)   h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
        h_ = h_.reshape(b, c, h, w)

        h_ = self.proj_out(h_)

        return x + h_


def nonlinearity(x):
    # swish
    return x * torch.sigmoid(x)


def Normalize(in_channels, norm_type='group'):
    assert norm_type in ['group', 'batch']
    if norm_type == 'group':
        return nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
    elif norm_type == 'batch':
        return nn.SyncBatchNorm(in_channels)


class Upsample(nn.Module):
    def __init__(self, in_channels, with_conv):
        super().__init__()
        self.with_conv = with_conv
        if self.with_conv:
            self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        x = F.interpolate(x, scale_factor=2.0, mode="nearest")
        if self.with_conv:
            x = self.conv(x)
        return x


class Downsample(nn.Module):
    def __init__(self, in_channels, with_conv):
        super().__init__()
        self.with_conv = with_conv
        if self.with_conv:
            # no asymmetric padding in torch conv, must do it ourselves
            self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)

    def forward(self, x):
        if self.with_conv:
            pad = (0, 1, 0, 1)
            x = F.pad(x, pad, mode="constant", value=0)
            x = self.conv(x)
        else:
            x = F.avg_pool2d(x, kernel_size=2, stride=2)
        return x


def compute_entropy_loss(affinity, loss_type="softmax", temperature=0.01):
    flat_affinity = affinity.reshape(-1, affinity.shape[-1])
    flat_affinity /= temperature
    probs = F.softmax(flat_affinity, dim=-1)
    log_probs = F.log_softmax(flat_affinity + 1e-5, dim=-1)
    if loss_type == "softmax":
        target_probs = probs
    else:
        raise ValueError("Entropy loss {} not supported".format(loss_type))
    avg_probs = torch.mean(target_probs, dim=0)
    avg_entropy = - torch.sum(avg_probs * torch.log(avg_probs + 1e-5))
    sample_entropy = - torch.mean(torch.sum(target_probs * log_probs, dim=-1))
    loss = sample_entropy - avg_entropy
    return loss


#################################################################################
#                              VQ Model Configs                                 #
#################################################################################
def VQ_8(**kwargs):
    return VQModel(ModelArgs(encoder_ch_mult=[1, 2, 2, 4], decoder_ch_mult=[1, 2, 2, 4], **kwargs))

def VQ_16(**kwargs):
    return VQModel(ModelArgs(encoder_ch_mult=[1, 1, 2, 2, 4], decoder_ch_mult=[1, 1, 2, 2, 4], **kwargs))

VQ_models = {'VQ-16': VQ_16, 'VQ-8': VQ_8}
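Shape summary for the configs above: VQ-16 has five resolution levels (four downsamples), so a 256x256 image becomes a 16x16 grid of indices into the 16384-entry codebook, while VQ-8 downsamples by 8 and yields 32x32. A round-trip sketch on a randomly initialized model (import path follows the repo's own convention and is an assumption):

import torch
from tokenizer.tokenizer_image.vq_model import VQ_models

vq = VQ_models['VQ-16'](codebook_size=16384, codebook_embed_dim=8).eval()
x = torch.randn(1, 3, 256, 256)
with torch.no_grad():
    quant, _, (_, _, indices) = vq.encode(x)
    recon = vq.decode_code(indices, quant.shape)
print(quant.shape)    # torch.Size([1, 8, 16, 16]): 256/16 tokens per side, dim-8 codes
print(indices.shape)  # torch.Size([256]): 16*16 flattened codebook indices
print(recon.shape)    # torch.Size([1, 3, 256, 256])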
tokenizer_image/vq_train.py
ADDED
@@ -0,0 +1,316 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Modified from:
|
2 |
+
# fast-DiT: https://github.com/chuanyangjin/fast-DiT/blob/main/train.py
|
3 |
+
# nanoGPT: https://github.com/karpathy/nanoGPT/blob/master/model.py
|
4 |
+
import torch
|
5 |
+
# the first flag below was False when we tested this script but True makes A100 training a lot faster:
|
6 |
+
torch.backends.cuda.matmul.allow_tf32 = True
|
7 |
+
torch.backends.cudnn.allow_tf32 = True
|
8 |
+
import torch.distributed as dist
|
9 |
+
from torch.nn.parallel import DistributedDataParallel as DDP
|
10 |
+
from torch.utils.data import Dataset, DataLoader
|
11 |
+
from torch.utils.data.distributed import DistributedSampler
|
12 |
+
from torchvision.datasets import ImageFolder
|
13 |
+
from torchvision import transforms
|
14 |
+
|
15 |
+
import os
|
16 |
+
import time
|
17 |
+
import argparse
|
18 |
+
from glob import glob
|
19 |
+
from copy import deepcopy
|
20 |
+
|
21 |
+
from utils.logger import create_logger
|
22 |
+
from utils.distributed import init_distributed_mode
|
23 |
+
from utils.ema import update_ema, requires_grad
|
24 |
+
from dataset.augmentation import random_crop_arr
|
25 |
+
from dataset.build import build_dataset
|
26 |
+
from tokenizer.tokenizer_image.vq_model import VQ_models
|
27 |
+
from tokenizer.tokenizer_image.vq_loss import VQLoss
|
28 |
+
|
29 |
+
import warnings
|
30 |
+
warnings.filterwarnings('ignore')
|
31 |
+
|
32 |
+
#################################################################################
|
33 |
+
# Training Loop #
|
34 |
+
#################################################################################
|
35 |
+
|
36 |
+
def main(args):
|
37 |
+
"""
|
38 |
+
Trains a new model.
|
39 |
+
"""
|
40 |
+
assert torch.cuda.is_available(), "Training currently requires at least one GPU."
|
41 |
+
|
42 |
+
# Setup DDP:
|
43 |
+
init_distributed_mode(args)
|
44 |
+
assert args.global_batch_size % dist.get_world_size() == 0, f"Batch size must be divisible by world size."
|
45 |
+
rank = dist.get_rank()
|
46 |
+
device = rank % torch.cuda.device_count()
|
47 |
+
seed = args.global_seed * dist.get_world_size() + rank
|
48 |
+
torch.manual_seed(seed)
|
49 |
+
torch.cuda.set_device(device)
|
50 |
+
|
51 |
+
# Setup an experiment folder:
|
52 |
+
if rank == 0:
|
53 |
+
os.makedirs(args.results_dir, exist_ok=True) # Make results folder (holds all experiment subfolders)
|
54 |
+
experiment_index = len(glob(f"{args.results_dir}/*"))
|
55 |
+
model_string_name = args.vq_model.replace("/", "-")
|
56 |
+
experiment_dir = f"{args.results_dir}/{experiment_index:03d}-{model_string_name}" # Create an experiment folder
|
57 |
+
checkpoint_dir = f"{experiment_dir}/checkpoints" # Stores saved model checkpoints
|
58 |
+
os.makedirs(checkpoint_dir, exist_ok=True)
|
59 |
+
logger = create_logger(experiment_dir)
|
60 |
+
logger.info(f"Experiment directory created at {experiment_dir}")
|
61 |
+
|
62 |
+
time_record = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
|
63 |
+
cloud_results_dir = f"{args.cloud_save_path}/{time_record}"
|
64 |
+
cloud_checkpoint_dir = f"{cloud_results_dir}/{experiment_index:03d}-{model_string_name}/checkpoints"
|
65 |
+
os.makedirs(cloud_checkpoint_dir, exist_ok=True)
|
66 |
+
logger.info(f"Experiment directory created in cloud at {cloud_checkpoint_dir}")
|
67 |
+
|
68 |
+
else:
|
69 |
+
logger = create_logger(None)
|
70 |
+
|
71 |
+
# training args
|
72 |
+
logger.info(f"{args}")
|
73 |
+
|
74 |
+
# training env
|
75 |
+
logger.info(f"Starting rank={rank}, seed={seed}, world_size={dist.get_world_size()}.")
|
76 |
+
|
77 |
+
# create and load model
|
78 |
+
vq_model = VQ_models[args.vq_model](
|
79 |
+
codebook_size=args.codebook_size,
|
80 |
+
codebook_embed_dim=args.codebook_embed_dim,
|
81 |
+
commit_loss_beta=args.commit_loss_beta,
|
82 |
+
entropy_loss_ratio=args.entropy_loss_ratio,
|
83 |
+
dropout_p=args.dropout_p,
|
84 |
+
)
|
85 |
+
logger.info(f"VQ Model Parameters: {sum(p.numel() for p in vq_model.parameters()):,}")
|
86 |
+
if args.ema:
|
87 |
+
ema = deepcopy(vq_model).to(device) # Create an EMA of the model for use after training
|
88 |
+
requires_grad(ema, False)
|
89 |
+
logger.info(f"VQ Model EMA Parameters: {sum(p.numel() for p in ema.parameters()):,}")
|
90 |
+
vq_model = vq_model.to(device)
|
91 |
+
|
92 |
+
    vq_loss = VQLoss(
        disc_start=args.disc_start,
        disc_weight=args.disc_weight,
        disc_type=args.disc_type,
        disc_loss=args.disc_loss,
        gen_adv_loss=args.gen_loss,
        image_size=args.image_size,
        perceptual_weight=args.perceptual_weight,
        reconstruction_weight=args.reconstruction_weight,
        reconstruction_loss=args.reconstruction_loss,
        codebook_weight=args.codebook_weight,
    ).to(device)
    logger.info(f"Discriminator Parameters: {sum(p.numel() for p in vq_loss.discriminator.parameters()):,}")

    # initialize a GradScaler. If enabled=False scaler is a no-op
    scaler = torch.cuda.amp.GradScaler(enabled=(args.mixed_precision == 'fp16'))
    scaler_disc = torch.cuda.amp.GradScaler(enabled=(args.mixed_precision == 'fp16'))
    # Setup optimizer
    optimizer = torch.optim.Adam(vq_model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2))
    optimizer_disc = torch.optim.Adam(vq_loss.discriminator.parameters(), lr=args.lr, betas=(args.beta1, args.beta2))

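    # NOTE: the generator (vq_model) and the discriminator (owned by vq_loss) get
    # independent optimizer/GradScaler pairs, so the two adversarial updates in
    # each step never share optimizer state or loss-scaling statistics.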
    # Setup data:
    transform = transforms.Compose([
        transforms.Lambda(lambda pil_image: random_crop_arr(pil_image, args.image_size)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)
    ])
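    # The Normalize above maps pixels from [0, 1] to [-1, 1], the value range the
    # reconstruction loss is computed in.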
    dataset = build_dataset(args, transform=transform)
    sampler = DistributedSampler(
        dataset,
        num_replicas=dist.get_world_size(),
        rank=rank,
        shuffle=True,
        seed=args.global_seed
    )
    loader = DataLoader(
        dataset,
        batch_size=int(args.global_batch_size // dist.get_world_size()),
        shuffle=False,
        sampler=sampler,
        num_workers=args.num_workers,
        pin_memory=True,
        drop_last=True
    )
    logger.info(f"Dataset contains {len(dataset):,} images ({args.data_path})")

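    # NOTE: each rank draws global_batch_size // world_size samples per step;
    # DistributedSampler shards the dataset so ranks see disjoint indices.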
    # Prepare models for training:
    if args.vq_ckpt:
        checkpoint = torch.load(args.vq_ckpt, map_location="cpu")
        vq_model.load_state_dict(checkpoint["model"])
        if args.ema:
            ema.load_state_dict(checkpoint["ema"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        vq_loss.discriminator.load_state_dict(checkpoint["discriminator"])
        optimizer_disc.load_state_dict(checkpoint["optimizer_disc"])
        if not args.finetune:
            train_steps = checkpoint["steps"] if "steps" in checkpoint else int(args.vq_ckpt.split('/')[-1].split('.')[0])
            start_epoch = int(train_steps / int(len(dataset) / args.global_batch_size))
            train_steps = int(start_epoch * int(len(dataset) / args.global_batch_size))
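            # (train_steps is rounded down to the last full-epoch boundary, so a
            # partially completed epoch is replayed from its start on resume)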
        else:
            train_steps = 0
            start_epoch = 0
        del checkpoint
        logger.info(f"Resume training from checkpoint: {args.vq_ckpt}")
        logger.info(f"Initial state: steps={train_steps}, epochs={start_epoch}")
    else:
        train_steps = 0
        start_epoch = 0
        if args.ema:
            update_ema(ema, vq_model, decay=0)  # Ensure EMA is initialized with synced weights

    if args.compile:
        logger.info("compiling the model... (may take several minutes)")
        vq_model = torch.compile(vq_model)  # requires PyTorch 2.0

    vq_model = DDP(vq_model.to(device), device_ids=[args.gpu])
    vq_model.train()
    if args.ema:
        ema.eval()  # EMA model should always be in eval mode
    vq_loss = DDP(vq_loss.to(device), device_ids=[args.gpu])
    vq_loss.train()
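    # NOTE: vq_loss is DDP-wrapped as well because it owns the trainable
    # discriminator, whose gradients must be all-reduced like the generator's.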

    ptdtype = {'none': torch.float32, 'bf16': torch.bfloat16, 'fp16': torch.float16}[args.mixed_precision]
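    # ptdtype is the autocast compute dtype used in the loop below; the GradScalers
    # above are only enabled for fp16, since bf16 needs no loss scaling.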

    # Variables for monitoring/logging purposes:
    log_steps = 0
    running_loss = 0
    start_time = time.time()

    logger.info(f"Training for {args.epochs} epochs...")
    for epoch in range(start_epoch, args.epochs):
        sampler.set_epoch(epoch)
        logger.info(f"Beginning epoch {epoch}...")
        for x, y in loader:
            imgs = x.to(device, non_blocking=True)

            # generator training
            optimizer.zero_grad()
            with torch.cuda.amp.autocast(dtype=ptdtype):
                recons_imgs, codebook_loss = vq_model(imgs)
                loss_gen = vq_loss(codebook_loss, imgs, recons_imgs, optimizer_idx=0, global_step=train_steps+1,
                                   last_layer=vq_model.module.decoder.last_layer,
                                   logger=logger, log_every=args.log_every)
            scaler.scale(loss_gen).backward()
            if args.max_grad_norm != 0.0:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(vq_model.parameters(), args.max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            if args.ema:
                update_ema(ema, vq_model.module._orig_mod if args.compile else vq_model.module)
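            # NOTE: torch.compile wraps the module, so the raw weights live at
            # vq_model.module._orig_mod; the EMA copy must track those weights.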

            # discriminator training
            optimizer_disc.zero_grad()
            with torch.cuda.amp.autocast(dtype=ptdtype):
                loss_disc = vq_loss(codebook_loss, imgs, recons_imgs, optimizer_idx=1, global_step=train_steps+1,
                                    logger=logger, log_every=args.log_every)
            scaler_disc.scale(loss_disc).backward()
            if args.max_grad_norm != 0.0:
                scaler_disc.unscale_(optimizer_disc)
                torch.nn.utils.clip_grad_norm_(vq_loss.module.discriminator.parameters(), args.max_grad_norm)
            scaler_disc.step(optimizer_disc)
            scaler_disc.update()
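            # (optimizer_idx selects the branch inside VQLoss: 0 returns the
            # generator-side loss, 1 the discriminator loss on the same batch)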

            # Log loss values:
            running_loss += loss_gen.item() + loss_disc.item()

            log_steps += 1
            train_steps += 1
            if train_steps % args.log_every == 0:
                # Measure training speed:
                torch.cuda.synchronize()
                end_time = time.time()
                steps_per_sec = log_steps / (end_time - start_time)
                # Reduce loss history over all processes:
                avg_loss = torch.tensor(running_loss / log_steps, device=device)
                dist.all_reduce(avg_loss, op=dist.ReduceOp.SUM)
                avg_loss = avg_loss.item() / dist.get_world_size()
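                # summing across ranks and dividing by world_size gives the mean loss over all GPUs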
logger.info(f"(step={train_steps:07d}) Train Loss: {avg_loss:.4f}, Train Steps/Sec: {steps_per_sec:.2f}")
|
233 |
+
# Reset monitoring variables:
|
234 |
+
running_loss = 0
|
235 |
+
log_steps = 0
|
236 |
+
start_time = time.time()
|
237 |
+
|
238 |
+
            # Save checkpoint:
            if train_steps % args.ckpt_every == 0 and train_steps > 0:
                if rank == 0:
                    if args.compile:
                        model_weight = vq_model.module._orig_mod.state_dict()
                    else:
                        model_weight = vq_model.module.state_dict()
                    checkpoint = {
                        "model": model_weight,
                        "optimizer": optimizer.state_dict(),
                        "discriminator": vq_loss.module.discriminator.state_dict(),
                        "optimizer_disc": optimizer_disc.state_dict(),
                        "steps": train_steps,
                        "args": args
                    }
                    if args.ema:
                        checkpoint["ema"] = ema.state_dict()
                    if not args.no_local_save:
                        checkpoint_path = f"{checkpoint_dir}/{train_steps:07d}.pt"
                        torch.save(checkpoint, checkpoint_path)
                        logger.info(f"Saved checkpoint to {checkpoint_path}")

                    cloud_checkpoint_path = f"{cloud_checkpoint_dir}/{train_steps:07d}.pt"
                    torch.save(checkpoint, cloud_checkpoint_path)
                    logger.info(f"Saved checkpoint in cloud to {cloud_checkpoint_path}")
                dist.barrier()
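                # (the barrier keeps non-zero ranks from racing ahead while rank 0 writes)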

    vq_model.eval()  # important! This disables randomized embedding dropout
    # do any sampling/FID calculation/etc. with ema (or model) in eval mode ...

    logger.info("Done!")
    dist.destroy_process_group()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--data-path", type=str, required=True)
    parser.add_argument("--data-face-path", type=str, default=None, help="face datasets to improve vq model")
    parser.add_argument("--cloud-save-path", type=str, required=True, help="cloud disk path for saving checkpoints; a local path also works if no cloud disk is mounted")
    parser.add_argument("--no-local-save", action='store_true', help="skip saving checkpoints locally (useful when local disk space is limited)")
    parser.add_argument("--vq-model", type=str, choices=list(VQ_models.keys()), default="VQ-16")
    parser.add_argument("--vq-ckpt", type=str, default=None, help="ckpt path for resume training")
    parser.add_argument("--finetune", action='store_true', help="finetune a pre-trained vq model")
    parser.add_argument("--ema", action='store_true', help="whether to use ema training")
    parser.add_argument("--codebook-size", type=int, default=16384, help="codebook size for vector quantization")
    parser.add_argument("--codebook-embed-dim", type=int, default=8, help="codebook dimension for vector quantization")
    parser.add_argument("--codebook-l2-norm", action='store_true', default=True, help="l2 norm codebook")
    parser.add_argument("--codebook-weight", type=float, default=1.0, help="codebook loss weight for vector quantization")
    parser.add_argument("--entropy-loss-ratio", type=float, default=0.0, help="entropy loss ratio in codebook loss")
    parser.add_argument("--commit-loss-beta", type=float, default=0.25, help="commit loss beta in codebook loss")
    parser.add_argument("--reconstruction-weight", type=float, default=1.0, help="reconstruction loss weight of image pixel")
    parser.add_argument("--reconstruction-loss", type=str, default='l2', help="reconstruction loss type of image pixel")
    parser.add_argument("--perceptual-weight", type=float, default=1.0, help="perceptual loss weight of LPIPS")
    parser.add_argument("--disc-weight", type=float, default=0.5, help="discriminator loss weight for gan training")
    parser.add_argument("--disc-start", type=int, default=20000, help="iteration to start discriminator training and loss")
    parser.add_argument("--disc-type", type=str, choices=['patchgan', 'stylegan'], default='patchgan', help="discriminator type")
    parser.add_argument("--disc-loss", type=str, choices=['hinge', 'vanilla', 'non-saturating'], default='hinge', help="discriminator loss")
    parser.add_argument("--gen-loss", type=str, choices=['hinge', 'non-saturating'], default='hinge', help="generator loss for gan training")
    parser.add_argument("--compile", action='store_true', default=False)
    parser.add_argument("--dropout-p", type=float, default=0.0, help="dropout probability")
    parser.add_argument("--results-dir", type=str, default="results_tokenizer_image")
    parser.add_argument("--dataset", type=str, default='imagenet')
    parser.add_argument("--image-size", type=int, choices=[256, 512], default=256)
    parser.add_argument("--epochs", type=int, default=50)
    parser.add_argument("--lr", type=float, default=1e-4)
    parser.add_argument("--weight-decay", type=float, default=5e-2, help="Weight decay to use.")
    parser.add_argument("--beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
    parser.add_argument("--beta2", type=float, default=0.95, help="The beta2 parameter for the Adam optimizer.")
    parser.add_argument("--max-grad-norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument("--global-batch-size", type=int, default=128)
    parser.add_argument("--global-seed", type=int, default=0)
    parser.add_argument("--num-workers", type=int, default=16)
    parser.add_argument("--log-every", type=int, default=100)
    parser.add_argument("--ckpt-every", type=int, default=5000)
    parser.add_argument("--gradient-accumulation-steps", type=int, default=1)
    parser.add_argument("--mixed-precision", type=str, default='bf16', choices=["none", "fp16", "bf16"])
    args = parser.parse_args()
    main(args)
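# A typical multi-GPU launch (illustrative only; the paths and flag values below
# are assumptions, not part of this commit):
#   torchrun --nproc_per_node=8 tokenizer_image/vq_train.py \
#       --data-path /path/to/imagenet/train --cloud-save-path /path/to/cloud_disk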