{ "cells": [ { "cell_type": "markdown", "id": "d0b72877", "metadata": {}, "source": [ "# vqgan-jax-encoding-yfcc100m" ] }, { "cell_type": "markdown", "id": "ba7b31e6", "metadata": {}, "source": [ "Same as `vqgan-jax-encoding-with-captions`, but for YFCC100M.\n", "\n", "This dataset was prepared by @borisdayma in JSON Lines format." ] }, { "cell_type": "code", "execution_count": 92, "id": "3b59489e", "metadata": {}, "outputs": [], "source": [ "import io\n", "\n", "import requests\n", "from PIL import Image\n", "import numpy as np\n", "from tqdm import tqdm\n", "\n", "import torch\n", "import torchvision.transforms as T\n", "import torchvision.transforms.functional as TF\n", "from torchvision.transforms import InterpolationMode\n", "from torch.utils.data import Dataset, DataLoader\n", "from torchvision.datasets.folder import default_loader\n", "import os\n", "\n", "import jax\n", "from jax import pmap" ] }, { "cell_type": "markdown", "id": "511c3b9e", "metadata": {}, "source": [ "## VQGAN-JAX model" ] }, { "cell_type": "markdown", "id": "bb408f6c", "metadata": {}, "source": [ "`dalle_mini` is a local package that contains the VQGAN-JAX model and other utilities." ] }, { "cell_type": "code", "execution_count": 93, "id": "2ca50dc7", "metadata": {}, "outputs": [], "source": [ "from dalle_mini.vqgan_jax.modeling_flax_vqgan import VQModel" ] }, { "cell_type": "markdown", "id": "7b60da9a", "metadata": {}, "source": [ "We'll use a VQGAN trained with Taming Transformers and then converted to a JAX model." 
] }, { "cell_type": "code", "execution_count": 167, "id": "29ce8b15", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Working with z of shape (1, 256, 16, 16) = 65536 dimensions.\n" ] } ], "source": [ "model = VQModel.from_pretrained(\"flax-community/vqgan_f16_16384\")" ] }, { "cell_type": "markdown", "id": "c7c4c1e6", "metadata": {}, "source": [ "## Dataset" ] }, { "cell_type": "code", "execution_count": 94, "id": "33861477", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from pathlib import Path" ] }, { "cell_type": "code", "execution_count": 134, "id": "81b19eca", "metadata": {}, "outputs": [], "source": [ "# Dataset root: set the YFCC100M_DIR environment variable to point at another copy;\n", "# the default below is the location this notebook was originally run against.\n", "yfcc100m = Path(os.environ.get('YFCC100M_DIR', '/home/khali/TPU-Test/YFCC100M_OpenAI_subset'))\n", "# Images are 'sharded' from the following directory\n", "yfcc100m_images = yfcc100m/'data'/'data'/'images'\n", "# Metadata input file and TSV output path, both located under the dataset root\n", "yfcc100m_metadata = yfcc100m/'metadata_YFCC100M.jsonl'\n", "yfcc100m_output = yfcc100m/'metadata_encoded.tsv'" ] }, { "cell_type": "markdown", "id": "1c58bb4a", "metadata": {}, "source": [ "### Cleanup" ] }, { "cell_type": "markdown", "id": "1a14ae3d", "metadata": {}, "source": [ "We need to select entries with images that exist. Otherwise we can't build batches because `DataLoader` does not support `None` in batches. We use Hugging Face Datasets: I understand it supports threaded reading of JSON Lines files, and I was running out of memory when using pandas." 
] }, { "cell_type": "code", "execution_count": 96, "id": "7811648c", "metadata": {}, "outputs": [], "source": [ "import datasets\n", "from datasets import Dataset, load_dataset" ] }, { "cell_type": "code", "execution_count": 10, "id": "4811a230", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "tcmalloc: large alloc 1254047744 bytes == 0xb2b08000 @ 0x7f9e78632680 0x7f9e78653824 0x585b92 0x504d56 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x5a8cb3 0x56ae94 0x568d9a 0x68cdc7 0x5ff5d4 0x5c3cb0 0x56aadf 0x501148 0x56c422 0x501148 0x56c422 0x501148 0x504d56 0x56acb6 0x5f5956 0x56aadf 0x5f5956 0x56acb6 0x568d9a 0x5f5b33 0x50b7f8 0x5f2702 0x56c332\n", "tcmalloc: large alloc 1254047744 bytes == 0xfd74e000 @ 0x7f9e78632680 0x7f9e78653824 0x590214 0x586f90 0x56e1f3 0x5f5956 0x56acb6 0x5f5956 0x5a8cb3 0x56ae94 0x568d9a 0x68cdc7 0x5ff5d4 0x5c3cb0 0x56aadf 0x501148 0x56c422 0x501148 0x56c422 0x501148 0x504d56 0x56acb6 0x5f5956 0x56aadf 0x5f5956 0x56acb6 0x568d9a 0x5f5b33 0x50b7f8 0x5f2702 0x56c332\n", "tcmalloc: large alloc 5016190976 bytes == 0x148b42000 @ 0x7f9e78632680 0x7f9e78653824 0x5b9144 0x7f9b2929127e 0x7f9b29291a19 0x7f9b29291886 0x7f9b29291cef 0x7f9b2928f204 0x5f2cc9 0x5f30ff 0x5705f6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x5a8cb3 0x56ae94 0x568d9a 0x68cdc7 0x5ff5d4 0x5c3cb0 0x56aadf 0x501148 0x56c422 0x501148 0x56c422 0x501148 0x504d56\n", "tcmalloc: large alloc 5019099136 bytes == 0x273f12000 @ 0x7f9e78632680 0x7f9e78653824 0x5b9144 0x7f9b2929127e 0x7f9b29291a19 0x7f9b29291886 0x7f9b29291cef 0x7f9b2928f204 0x5f2cc9 0x5f30ff 0x5705f6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x5a8cb3 0x56ae94 0x568d9a 0x68cdc7 0x5ff5d4 0x5c3cb0 0x56aadf 0x501148 0x56c422 0x501148 0x56c422 0x501148 0x504d56\n", "tcmalloc: large alloc 5019811840 bytes == 0x39f9a8000 @ 0x7f9e78632680 0x7f9e78653824 0x5b9144 0x7f9b2929127e 0x7f9b29291a19 0x7f9b29291886 0x7f9b29291cef 0x7f9b2928f204 0x5f2cc9 0x5f30ff 0x5705f6 
0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x5a8cb3 0x56ae94 0x568d9a 0x68cdc7 0x5ff5d4 0x5c3cb0 0x56aadf 0x501148 0x56c422 0x501148 0x56c422 0x501148 0x504d56\n", "tcmalloc: large alloc 5024571392 bytes == 0x4cb4ec000 @ 0x7f9e78632680 0x7f9e78653824 0x5b9144 0x7f9b2929127e 0x7f9b29291a19 0x7f9b29291886 0x7f9b29291cef 0x7f9b2928f204 0x5f2cc9 0x5f30ff 0x5705f6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x5a8cb3 0x56ae94 0x568d9a 0x68cdc7 0x5ff5d4 0x5c3cb0 0x56aadf 0x501148 0x56c422 0x501148 0x56c422 0x501148 0x504d56\n", "tcmalloc: large alloc 5021097984 bytes == 0x4cb4ec000 @ 0x7f9e78632680 0x7f9e78653824 0x5b9144 0x7f9b2929127e 0x7f9b29291a19 0x7f9b29291886 0x7f9b29291cef 0x7f9b2928f204 0x5f2cc9 0x5f30ff 0x5705f6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x5a8cb3 0x56ae94 0x568d9a 0x68cdc7 0x5ff5d4 0x5c3cb0 0x56aadf 0x501148 0x56c422 0x501148 0x56c422 0x501148 0x504d56\n", "tcmalloc: large alloc 5022818304 bytes == 0x4cb4ec000 @ 0x7f9e78632680 0x7f9e78653824 0x5b9144 0x7f9b2929127e 0x7f9b29291a19 0x7f9b29291886 0x7f9b29291cef 0x7f9b2928f204 0x5f2cc9 0x5f30ff 0x5705f6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x5a8cb3 0x56ae94 0x568d9a 0x68cdc7 0x5ff5d4 0x5c3cb0 0x56aadf 0x501148 0x56c422 0x501148 0x56c422 0x501148 0x504d56\n", "tcmalloc: large alloc 5020794880 bytes == 0x4cb4ec000 @ 0x7f9e78632680 0x7f9e78653824 0x5b9144 0x7f9b2929127e 0x7f9b29291a19 0x7f9b29291886 0x7f9b29291cef 0x7f9b2928f204 0x5f2cc9 0x5f30ff 0x5705f6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x5a8cb3 0x56ae94 0x568d9a 0x68cdc7 0x5ff5d4 0x5c3cb0 0x56aadf 0x501148 0x56c422 0x501148 0x56c422 0x501148 0x504d56\n", "tcmalloc: large alloc 5019451392 bytes == 0x39f9a8000 @ 0x7f9e78632680 0x7f9e78653824 0x5b9144 0x7f9b2929127e 0x7f9b29291a19 0x7f9b29291886 0x7f9b29291cef 0x7f9b2928f204 0x5f2cc9 0x5f30ff 0x5705f6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x5a8cb3 0x56ae94 0x568d9a 
0x68cdc7 0x5ff5d4 0x5c3cb0 0x56aadf 0x501148 0x56c422 0x501148 0x56c422 0x501148 0x504d56\n", "tcmalloc: large alloc 5020565504 bytes == 0x4cb4ec000 @ 0x7f9e78632680 0x7f9e78653824 0x5b9144 0x7f9b2929127e 0x7f9b29291a19 0x7f9b29291886 0x7f9b29291cef 0x7f9b2928f204 0x5f2cc9 0x5f30ff 0x5705f6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x5a8cb3 0x56ae94 0x568d9a 0x68cdc7 0x5ff5d4 0x5c3cb0 0x56aadf 0x501148 0x56c422 0x501148 0x56c422 0x501148 0x504d56\n", "tcmalloc: large alloc 5012561920 bytes == 0x273f12000 @ 0x7f9e78632680 0x7f9e78653824 0x5b9144 0x7f9b2929127e 0x7f9b29291a19 0x7f9b29291886 0x7f9b29291cef 0x7f9b2928f204 0x5f2cc9 0x5f30ff 0x5705f6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x5a8cb3 0x56ae94 0x568d9a 0x68cdc7 0x5ff5d4 0x5c3cb0 0x56aadf 0x501148 0x56c422 0x501148 0x56c422 0x501148 0x504d56\n", "tcmalloc: large alloc 5021835264 bytes == 0x5f6cba000 @ 0x7f9e78632680 0x7f9e78653824 0x5b9144 0x7f9b2929127e 0x7f9b29291a19 0x7f9b29291886 0x7f9b29291cef 0x7f9b2928f204 0x5f2cc9 0x5f30ff 0x5705f6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x5a8cb3 0x56ae94 0x568d9a 0x68cdc7 0x5ff5d4 0x5c3cb0 0x56aadf 0x501148 0x56c422 0x501148 0x56c422 0x501148 0x504d56\n", "tcmalloc: large alloc 5017436160 bytes == 0x273f12000 @ 0x7f9e78632680 0x7f9e78653824 0x5b9144 0x7f9b2929127e 0x7f9b29291a19 0x7f9b29291886 0x7f9b29291cef 0x7f9b2928f204 0x5f2cc9 0x5f30ff 0x5705f6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x5a8cb3 0x56ae94 0x568d9a 0x68cdc7 0x5ff5d4 0x5c3cb0 0x56aadf 0x501148 0x56c422 0x501148 0x56c422 0x501148 0x504d56\n" ] } ], "source": [ "# The metadata is too big to load into memory at once, so we split it into chunks\n", "# of `chunk_size` records and write each chunk to its own TSV file.\n", "chunk_size=1000000\n", "# `to_csv` does not create missing directories, so make sure the target exists first\n", "chunks_dir = Path('./chunks')\n", "chunks_dir.mkdir(parents=True, exist_ok=True)\n", "# Chunk files are numbered from 1: chunk1.tsv, chunk2.tsv, ...\n", "batch_no=1\n", "for chunk in pd.read_json(yfcc100m_metadata, orient=\"records\", lines=True,chunksize=chunk_size):\n", "    chunk.to_csv(chunks_dir/('chunk'+str(batch_no)+'.tsv'), sep=\"\\t\", index=False)\n", "    batch_no+=1" ] }, { "cell_type": "code", 
"execution_count": 25, "id": "46b2f083", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | photoid | \n", "uid | \n", "unickname | \n", "datetaken | \n", "dateuploaded | \n", "capturedevice | \n", "title | \n", "description | \n", "usertags | \n", "machinetags | \n", "... | \n", "licenseurl | \n", "serverid | \n", "farmid | \n", "secret | \n", "secretoriginal | \n", "ext | \n", "marker | \n", "key | \n", "title_clean | \n", "description_clean | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "137943 | \n", "48600072071@N01 | \n", "doctor+paradox | \n", "2004-08-01 18:13:06.0 | \n", "1091409186 | \n", "NaN | \n", "A+Picture+Share%21 | \n", "Antenna | \n", "cameraphone,cayugaheights,green,hydrant,ithaca... | \n", "NaN | \n", "... | \n", "http://creativecommons.org/licenses/by-nc-sa/2.0/ | \n", "1 | \n", "1 | \n", "1650c7cdc6 | \n", "1650c7cdc6 | \n", "jpg | \n", "0 | \n", "d29e7c6a3028418c64eb15e3cf577c2 | \n", "A Picture Share! | \n", "Antenna | \n", "
1 | \n", "1246361 | \n", "44124324682@N01 | \n", "mharrsch | \n", "2004-11-03 23:04:02.0 | \n", "1099523042 | \n", "NaN | \n", "An+ornate+Roman+urn | \n", "Photographed+at+the+%3Ca+href%3D%22http%3A%2F%... | \n", "ancient,baltimore,burial,death,empire,funeral,... | \n", "NaN | \n", "... | \n", "http://creativecommons.org/licenses/by-nc-sa/2.0/ | \n", "1 | \n", "1 | \n", "cf37054610 | \n", "cf37054610 | \n", "jpg | \n", "0 | \n", "d29f01b149167d683f9ddde464bb3db | \n", "An ornate Roman urn | \n", "Photographed at the Walters Art Museum, Baltim... | \n", "
2 | \n", "1251599 | \n", "51035803024@N01 | \n", "bmitd67 | \n", "2004-10-30 17:09:32.0 | \n", "1099538888 | \n", "Canon+PowerShot+S30 | \n", "Jai+%26+Tara+on+the+Cumberland | \n", "Another+trip+for+the+happy+couple. | \n", "blue+heron,cumberland+river,jai,tara,tennessee | \n", "NaN | \n", "... | \n", "http://creativecommons.org/licenses/by-nc-sa/2.0/ | \n", "1 | \n", "1 | \n", "4a4234e32c | \n", "4a4234e32c | \n", "jpg | \n", "0 | \n", "d296e9e34bdae41edb6c679ff824ab2a | \n", "Jai & Tara on the Cumberland | \n", "Another trip for the happy couple. | \n", "
3 | \n", "2348587 | \n", "73621375@N00 | \n", "Thom+Watson | \n", "2004-12-18 21:08:09.0 | \n", "1103497228 | \n", "SONY+DSC-W1 | \n", "Castle+gate+-+%22lite-brited%22 | \n", "Taken+at+the+Miracle+of+Lights+display+in+Cent... | \n", "bullrunpark,castle,centreville,christmas,decor... | \n", "NaN | \n", "... | \n", "http://creativecommons.org/licenses/by-nc-sa/2.0/ | \n", "2 | \n", "1 | \n", "7162c974c3 | \n", "7162c974c3 | \n", "jpg | \n", "0 | \n", "d29ce96395848478b1e8396e44899 | \n", "Castle gate - \"lite-brited\" | \n", "Taken at the Miracle of Lights display in Cent... | \n", "
4 | \n", "3516047 | \n", "48600072071@N01 | \n", "doctor+paradox | \n", "2005-01-18 16:44:18.0 | \n", "1106084658 | \n", "NaN | \n", "A+Picture+Share%21 | \n", "Tabular | \n", "cameraphone,moblog,unfound | \n", "NaN | \n", "... | \n", "http://creativecommons.org/licenses/by-nc-sa/2.0/ | \n", "3 | \n", "1 | \n", "663e0d8b3d | \n", "663e0d8b3d | \n", "jpg | \n", "0 | \n", "d29abf32c4e12ff881f975b70e0cec0 | \n", "A Picture Share! | \n", "Tabular | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
999995 | \n", "4648651054 | \n", "24511045@N04 | \n", "mtfrazier | \n", "2010-05-02 15:47:45.0 | \n", "1275083371 | \n", "Canon+EOS+50D | \n", "U.S.+Navy+Blue+Angels%3A+2010 | \n", "2+May+2010%0ASunday%0ASt.+Joseph%2C+Missouri | \n", "NaN | \n", "NaN | \n", "... | \n", "http://creativecommons.org/licenses/by-nc-nd/2.0/ | \n", "4072 | \n", "5 | \n", "2d12d73fb0 | \n", "dd5856ea42 | \n", "jpg | \n", "0 | \n", "60fa2911cb81eb25b356e9fee978aef | \n", "U.S. Navy Blue Angels: 2010 | \n", "2 May 2010 Sunday St. Joseph, Missouri | \n", "
999996 | \n", "4652130996 | \n", "21963865@N04 | \n", "GRAB1.0 | \n", "2010-05-29 19:23:10.0 | \n", "1275200833 | \n", "SONY+DSLR-A230 | \n", "Attempts+on+Her+Life | \n", "BAPA+1+production+of+Martin+Crimp%27s+Attempts... | \n", "NaN | \n", "NaN | \n", "... | \n", "http://creativecommons.org/licenses/by-nc-nd/2.0/ | \n", "4003 | \n", "5 | \n", "8889121579 | \n", "2f46599456 | \n", "jpg | \n", "0 | \n", "60f5ef5ce4c2d24566226abebd67d4 | \n", "Attempts on Her Life | \n", "BAPA 1 production of Martin Crimp's Attempts o... | \n", "
999997 | \n", "4652568339 | \n", "64025277@N00 | \n", "1Sock | \n", "2010-05-13 15:38:37.0 | \n", "1275234267 | \n", "Canon+EOS+DIGITAL+REBEL+XT | \n", "Carlsbad+Caverns+3 | \n", "%E2%99%A5%E2%99%A5%E2%99%A5%E2%99%A5%E2%99%A5%... | \n", "carlsbad,carlsbad+caverns,cave,faa,new+mexico,... | \n", "NaN | \n", "... | \n", "http://creativecommons.org/licenses/by-nc-nd/2.0/ | \n", "4010 | \n", "5 | \n", "0a1808a69e | \n", "cf6d348e3d | \n", "jpg | \n", "0 | \n", "60f029482d1d1028fda5281daf498f | \n", "Carlsbad Caverns 3 | \n", "♥♥♥♥♥♥♥ Interested in purchasing this photogra... | \n", "
999998 | \n", "4653110895 | \n", "20483509@N00 | \n", "subberculture | \n", "2010-05-30 15:37:05.0 | \n", "1275245596 | \n", "Canon+DIGITAL+IXUS+40 | \n", "Want | \n", "Isn%27t+that+gorgeous%3F | \n", "2010,edinburgh+museum,may,phonebox,wood | \n", "NaN | \n", "... | \n", "http://creativecommons.org/licenses/by-sa/2.0/ | \n", "4066 | \n", "5 | \n", "77c3b3a254 | \n", "c4697e1511 | \n", "jpg | \n", "0 | \n", "60f72775f433cf8de3efaeb431866153 | \n", "Want | \n", "Isn't that gorgeous? | \n", "
999999 | \n", "4655503987 | \n", "8457193@N07 | \n", "zackojones | \n", "2010-05-30 15:34:58.0 | \n", "1275310230 | \n", "Canon+EOS+7D | \n", "Summertime | \n", "You+gotta+love+it%21 | \n", "georgia,savannah,united+states,us | \n", "NaN | \n", "... | \n", "http://creativecommons.org/licenses/by-nc-sa/2.0/ | \n", "4043 | \n", "5 | \n", "caff543bfe | \n", "f60952ac4d | \n", "jpg | \n", "0 | \n", "60f687e11b913bce461e9525d8047e0 | \n", "Summertime | \n", "You gotta love it! | \n", "
1000000 rows × 26 columns
\n", "\n", " | key | \n", "title_clean | \n", "description_clean | \n", "ext | \n", "
---|---|---|---|---|
0 | \n", "d29e7c6a3028418c64eb15e3cf577c2 | \n", "A Picture Share! | \n", "Antenna | \n", "jpg | \n", "
1 | \n", "d29f01b149167d683f9ddde464bb3db | \n", "An ornate Roman urn | \n", "Photographed at the Walters Art Museum, Baltim... | \n", "jpg | \n", "
2 | \n", "d296e9e34bdae41edb6c679ff824ab2a | \n", "Jai & Tara on the Cumberland | \n", "Another trip for the happy couple. | \n", "jpg | \n", "
3 | \n", "d29ce96395848478b1e8396e44899 | \n", "Castle gate - \"lite-brited\" | \n", "Taken at the Miracle of Lights display in Cent... | \n", "jpg | \n", "
4 | \n", "d29abf32c4e12ff881f975b70e0cec0 | \n", "A Picture Share! | \n", "Tabular | \n", "jpg | \n", "