diff --git "a/encoding/vqgan-jax-encoding-yfcc100m.ipynb" "b/encoding/vqgan-jax-encoding-yfcc100m.ipynb" --- "a/encoding/vqgan-jax-encoding-yfcc100m.ipynb" +++ "b/encoding/vqgan-jax-encoding-yfcc100m.ipynb" @@ -20,7 +20,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 92, "id": "3b59489e", "metadata": {}, "outputs": [], @@ -38,6 +38,7 @@ "from torchvision.transforms import InterpolationMode\n", "from torch.utils.data import Dataset, DataLoader\n", "from torchvision.datasets.folder import default_loader\n", + "import os\n", "\n", "import jax\n", "from jax import pmap" @@ -61,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 93, "id": "2ca50dc7", "metadata": {}, "outputs": [], @@ -77,22 +78,22 @@ "We'll use a VQGAN trained by using Taming Transformers and converted to a JAX model." ] }, - { - "cell_type": "markdown", - "id": "ad05a1bd", - "metadata": {}, - "source": [ - "**Disabling** Does not work in my local system right now." - ] - }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 167, "id": "29ce8b15", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Working with z of shape (1, 256, 16, 16) = 65536 dimensions.\n" + ] + } + ], "source": [ - "#model = VQModel.from_pretrained(\"flax-community/vqgan_f16_16384\")" + "model = VQModel.from_pretrained(\"flax-community/vqgan_f16_16384\")" ] }, { @@ -105,7 +106,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 94, "id": "33861477", "metadata": {}, "outputs": [], @@ -116,16 +117,16 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 134, "id": "81b19eca", "metadata": {}, "outputs": [], "source": [ - "yfcc100m = Path('/sddata/dalle-mini/YFCC100M_OpenAI_subset')\n", + "yfcc100m = Path('/home/khali/TPU-Test/YFCC100M_OpenAI_subset')\n", "# Images are 'sharded' from the following directory\n", - "yfcc100m_images = yfcc100m/'data'/'images'\n", + "yfcc100m_images = yfcc100m/'data'/'data'/'images'\n", "yfcc100m_metadata = yfcc100m/'metadata_YFCC100M.jsonl'\n", - "yfcc100m_output = yfcc100m/'metadata_encoded.jsonl'" + "yfcc100m_output = yfcc100m/'metadata_encoded.tsv'" ] }, { @@ -146,7 +147,7 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 96, "id": "7811648c", "metadata": {}, "outputs": [], @@ -157,1255 +158,803 @@ }, { "cell_type": "code", - "execution_count": 82, - "id": "753659fe", + "execution_count": 10, + "id": "4811a230", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Using custom data configuration default-57592e8ed16d752b\n", - "Reusing dataset json (/home/pedro/.cache/huggingface/datasets/json/default-57592e8ed16d752b/0.0.0/793d004298099bd3c4e61eb7878475bcf1dc212bf2e34437d85126758720d7f9)\n" + "tcmalloc: large alloc 1254047744 bytes == 0xb2b08000 @ 0x7f9e78632680 0x7f9e78653824 0x585b92 0x504d56 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x5a8cb3 0x56ae94 0x568d9a 0x68cdc7 0x5ff5d4 0x5c3cb0 0x56aadf 0x501148 0x56c422 0x501148 0x56c422 0x501148 0x504d56 0x56acb6 0x5f5956 0x56aadf 0x5f5956 0x56acb6 0x568d9a 0x5f5b33 0x50b7f8 0x5f2702 0x56c332\n", + "tcmalloc: large alloc 1254047744 bytes == 0xfd74e000 @ 0x7f9e78632680 0x7f9e78653824 0x590214 0x586f90 0x56e1f3 0x5f5956 0x56acb6 0x5f5956 0x5a8cb3 0x56ae94 0x568d9a 0x68cdc7 0x5ff5d4 0x5c3cb0 0x56aadf 0x501148 0x56c422 0x501148 0x56c422 0x501148 0x504d56 0x56acb6 0x5f5956 0x56aadf 0x5f5956 0x56acb6 0x568d9a 0x5f5b33 0x50b7f8 0x5f2702 0x56c332\n", + 
"tcmalloc: large alloc 5016190976 bytes == 0x148b42000 @ 0x7f9e78632680 0x7f9e78653824 0x5b9144 0x7f9b2929127e 0x7f9b29291a19 0x7f9b29291886 0x7f9b29291cef 0x7f9b2928f204 0x5f2cc9 0x5f30ff 0x5705f6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x5a8cb3 0x56ae94 0x568d9a 0x68cdc7 0x5ff5d4 0x5c3cb0 0x56aadf 0x501148 0x56c422 0x501148 0x56c422 0x501148 0x504d56\n", + "tcmalloc: large alloc 5019099136 bytes == 0x273f12000 @ 0x7f9e78632680 0x7f9e78653824 0x5b9144 0x7f9b2929127e 0x7f9b29291a19 0x7f9b29291886 0x7f9b29291cef 0x7f9b2928f204 0x5f2cc9 0x5f30ff 0x5705f6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x5a8cb3 0x56ae94 0x568d9a 0x68cdc7 0x5ff5d4 0x5c3cb0 0x56aadf 0x501148 0x56c422 0x501148 0x56c422 0x501148 0x504d56\n", + "tcmalloc: large alloc 5019811840 bytes == 0x39f9a8000 @ 0x7f9e78632680 0x7f9e78653824 0x5b9144 0x7f9b2929127e 0x7f9b29291a19 0x7f9b29291886 0x7f9b29291cef 0x7f9b2928f204 0x5f2cc9 0x5f30ff 0x5705f6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x5a8cb3 0x56ae94 0x568d9a 0x68cdc7 0x5ff5d4 0x5c3cb0 0x56aadf 0x501148 0x56c422 0x501148 0x56c422 0x501148 0x504d56\n", + "tcmalloc: large alloc 5024571392 bytes == 0x4cb4ec000 @ 0x7f9e78632680 0x7f9e78653824 0x5b9144 0x7f9b2929127e 0x7f9b29291a19 0x7f9b29291886 0x7f9b29291cef 0x7f9b2928f204 0x5f2cc9 0x5f30ff 0x5705f6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x5a8cb3 0x56ae94 0x568d9a 0x68cdc7 0x5ff5d4 0x5c3cb0 0x56aadf 0x501148 0x56c422 0x501148 0x56c422 0x501148 0x504d56\n", + "tcmalloc: large alloc 5021097984 bytes == 0x4cb4ec000 @ 0x7f9e78632680 0x7f9e78653824 0x5b9144 0x7f9b2929127e 0x7f9b29291a19 0x7f9b29291886 0x7f9b29291cef 0x7f9b2928f204 0x5f2cc9 0x5f30ff 0x5705f6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x5a8cb3 0x56ae94 0x568d9a 0x68cdc7 0x5ff5d4 0x5c3cb0 0x56aadf 0x501148 0x56c422 0x501148 0x56c422 0x501148 0x504d56\n", + "tcmalloc: large alloc 5022818304 bytes == 0x4cb4ec000 @ 0x7f9e78632680 0x7f9e78653824 0x5b9144 0x7f9b2929127e 0x7f9b29291a19 0x7f9b29291886 0x7f9b29291cef 0x7f9b2928f204 0x5f2cc9 0x5f30ff 0x5705f6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x5a8cb3 0x56ae94 0x568d9a 0x68cdc7 0x5ff5d4 0x5c3cb0 0x56aadf 0x501148 0x56c422 0x501148 0x56c422 0x501148 0x504d56\n", + "tcmalloc: large alloc 5020794880 bytes == 0x4cb4ec000 @ 0x7f9e78632680 0x7f9e78653824 0x5b9144 0x7f9b2929127e 0x7f9b29291a19 0x7f9b29291886 0x7f9b29291cef 0x7f9b2928f204 0x5f2cc9 0x5f30ff 0x5705f6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x5a8cb3 0x56ae94 0x568d9a 0x68cdc7 0x5ff5d4 0x5c3cb0 0x56aadf 0x501148 0x56c422 0x501148 0x56c422 0x501148 0x504d56\n", + "tcmalloc: large alloc 5019451392 bytes == 0x39f9a8000 @ 0x7f9e78632680 0x7f9e78653824 0x5b9144 0x7f9b2929127e 0x7f9b29291a19 0x7f9b29291886 0x7f9b29291cef 0x7f9b2928f204 0x5f2cc9 0x5f30ff 0x5705f6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x5a8cb3 0x56ae94 0x568d9a 0x68cdc7 0x5ff5d4 0x5c3cb0 0x56aadf 0x501148 0x56c422 0x501148 0x56c422 0x501148 0x504d56\n", + "tcmalloc: large alloc 5020565504 bytes == 0x4cb4ec000 @ 0x7f9e78632680 0x7f9e78653824 0x5b9144 0x7f9b2929127e 0x7f9b29291a19 0x7f9b29291886 0x7f9b29291cef 0x7f9b2928f204 0x5f2cc9 0x5f30ff 0x5705f6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x5a8cb3 0x56ae94 0x568d9a 0x68cdc7 0x5ff5d4 0x5c3cb0 0x56aadf 0x501148 0x56c422 0x501148 0x56c422 0x501148 0x504d56\n", + "tcmalloc: large alloc 5012561920 bytes == 0x273f12000 @ 0x7f9e78632680 0x7f9e78653824 0x5b9144 
0x7f9b2929127e 0x7f9b29291a19 0x7f9b29291886 0x7f9b29291cef 0x7f9b2928f204 0x5f2cc9 0x5f30ff 0x5705f6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x5a8cb3 0x56ae94 0x568d9a 0x68cdc7 0x5ff5d4 0x5c3cb0 0x56aadf 0x501148 0x56c422 0x501148 0x56c422 0x501148 0x504d56\n", + "tcmalloc: large alloc 5021835264 bytes == 0x5f6cba000 @ 0x7f9e78632680 0x7f9e78653824 0x5b9144 0x7f9b2929127e 0x7f9b29291a19 0x7f9b29291886 0x7f9b29291cef 0x7f9b2928f204 0x5f2cc9 0x5f30ff 0x5705f6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x5a8cb3 0x56ae94 0x568d9a 0x68cdc7 0x5ff5d4 0x5c3cb0 0x56aadf 0x501148 0x56c422 0x501148 0x56c422 0x501148 0x504d56\n", + "tcmalloc: large alloc 5017436160 bytes == 0x273f12000 @ 0x7f9e78632680 0x7f9e78653824 0x5b9144 0x7f9b2929127e 0x7f9b29291a19 0x7f9b29291886 0x7f9b29291cef 0x7f9b2928f204 0x5f2cc9 0x5f30ff 0x5705f6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x56acb6 0x5f5956 0x5a8cb3 0x56ae94 0x568d9a 0x68cdc7 0x5ff5d4 0x5c3cb0 0x56aadf 0x501148 0x56c422 0x501148 0x56c422 0x501148 0x504d56\n" ] } ], "source": [ - "dataset = load_dataset(\"json\", data_files=[str(yfcc100m_metadata)])" + "# The metadata is too bog to load into memory at once, so chopping it into chunks\n", + "chunk_size=1000000\n", + "batch_no=1\n", + "for chunk in pd.read_json(yfcc100m_metadata, orient=\"records\", lines=True,chunksize=chunk_size):\n", + " chunk.to_csv('./chunks/chunk'+str(batch_no)+'.tsv', sep=\"\\t\", index=False)\n", + " batch_no+=1" ] }, { "cell_type": "code", - "execution_count": 83, - "id": "9343df1b", + "execution_count": 25, + "id": "46b2f083", "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
photoiduidunicknamedatetakendateuploadedcapturedevicetitledescriptionusertagsmachinetags...licenseurlserveridfarmidsecretsecretoriginalextmarkerkeytitle_cleandescription_clean
013794348600072071@N01doctor+paradox2004-08-01 18:13:06.01091409186NaNA+Picture+Share%21Antennacameraphone,cayugaheights,green,hydrant,ithaca...NaN...http://creativecommons.org/licenses/by-nc-sa/2.0/111650c7cdc61650c7cdc6jpg0d29e7c6a3028418c64eb15e3cf577c2A Picture Share!Antenna
1124636144124324682@N01mharrsch2004-11-03 23:04:02.01099523042NaNAn+ornate+Roman+urnPhotographed+at+the+%3Ca+href%3D%22http%3A%2F%...ancient,baltimore,burial,death,empire,funeral,...NaN...http://creativecommons.org/licenses/by-nc-sa/2.0/11cf37054610cf37054610jpg0d29f01b149167d683f9ddde464bb3dbAn ornate Roman urnPhotographed at the Walters Art Museum, Baltim...
2125159951035803024@N01bmitd672004-10-30 17:09:32.01099538888Canon+PowerShot+S30Jai+%26+Tara+on+the+CumberlandAnother+trip+for+the+happy+couple.blue+heron,cumberland+river,jai,tara,tennesseeNaN...http://creativecommons.org/licenses/by-nc-sa/2.0/114a4234e32c4a4234e32cjpg0d296e9e34bdae41edb6c679ff824ab2aJai & Tara on the CumberlandAnother trip for the happy couple.
3234858773621375@N00Thom+Watson2004-12-18 21:08:09.01103497228SONY+DSC-W1Castle+gate+-+%22lite-brited%22Taken+at+the+Miracle+of+Lights+display+in+Cent...bullrunpark,castle,centreville,christmas,decor...NaN...http://creativecommons.org/licenses/by-nc-sa/2.0/217162c974c37162c974c3jpg0d29ce96395848478b1e8396e44899Castle gate - \"lite-brited\"Taken at the Miracle of Lights display in Cent...
4351604748600072071@N01doctor+paradox2005-01-18 16:44:18.01106084658NaNA+Picture+Share%21Tabularcameraphone,moblog,unfoundNaN...http://creativecommons.org/licenses/by-nc-sa/2.0/31663e0d8b3d663e0d8b3djpg0d29abf32c4e12ff881f975b70e0cec0A Picture Share!Tabular
..................................................................
999995464865105424511045@N04mtfrazier2010-05-02 15:47:45.01275083371Canon+EOS+50DU.S.+Navy+Blue+Angels%3A+20102+May+2010%0ASunday%0ASt.+Joseph%2C+MissouriNaNNaN...http://creativecommons.org/licenses/by-nc-nd/2.0/407252d12d73fb0dd5856ea42jpg060fa2911cb81eb25b356e9fee978aefU.S. Navy Blue Angels: 20102 May 2010 Sunday St. Joseph, Missouri
999996465213099621963865@N04GRAB1.02010-05-29 19:23:10.01275200833SONY+DSLR-A230Attempts+on+Her+LifeBAPA+1+production+of+Martin+Crimp%27s+Attempts...NaNNaN...http://creativecommons.org/licenses/by-nc-nd/2.0/4003588891215792f46599456jpg060f5ef5ce4c2d24566226abebd67d4Attempts on Her LifeBAPA 1 production of Martin Crimp's Attempts o...
999997465256833964025277@N001Sock2010-05-13 15:38:37.01275234267Canon+EOS+DIGITAL+REBEL+XTCarlsbad+Caverns+3%E2%99%A5%E2%99%A5%E2%99%A5%E2%99%A5%E2%99%A5%...carlsbad,carlsbad+caverns,cave,faa,new+mexico,...NaN...http://creativecommons.org/licenses/by-nc-nd/2.0/401050a1808a69ecf6d348e3djpg060f029482d1d1028fda5281daf498fCarlsbad Caverns 3♥♥♥♥♥♥♥ Interested in purchasing this photogra...
999998465311089520483509@N00subberculture2010-05-30 15:37:05.01275245596Canon+DIGITAL+IXUS+40WantIsn%27t+that+gorgeous%3F2010,edinburgh+museum,may,phonebox,woodNaN...http://creativecommons.org/licenses/by-sa/2.0/4066577c3b3a254c4697e1511jpg060f72775f433cf8de3efaeb431866153WantIsn't that gorgeous?
99999946555039878457193@N07zackojones2010-05-30 15:34:58.01275310230Canon+EOS+7DSummertimeYou+gotta+love+it%21georgia,savannah,united+states,usNaN...http://creativecommons.org/licenses/by-nc-sa/2.0/40435caff543bfef60952ac4djpg060f687e11b913bce461e9525d8047e0SummertimeYou gotta love it!
\n", + "

1000000 rows × 26 columns

\n", + "
" + ], "text/plain": [ - "Dataset({\n", - " features: ['photoid', 'uid', 'unickname', 'datetaken', 'dateuploaded', 'capturedevice', 'title', 'description', 'usertags', 'machinetags', 'longitude', 'latitude', 'accuracy', 'pageurl', 'downloadurl', 'licensename', 'licenseurl', 'serverid', 'farmid', 'secret', 'secretoriginal', 'ext', 'marker', 'key', 'title_clean', 'description_clean'],\n", - " num_rows: 14825233\n", - "})" + " photoid uid unickname datetaken \\\n", + "0 137943 48600072071@N01 doctor+paradox 2004-08-01 18:13:06.0 \n", + "1 1246361 44124324682@N01 mharrsch 2004-11-03 23:04:02.0 \n", + "2 1251599 51035803024@N01 bmitd67 2004-10-30 17:09:32.0 \n", + "3 2348587 73621375@N00 Thom+Watson 2004-12-18 21:08:09.0 \n", + "4 3516047 48600072071@N01 doctor+paradox 2005-01-18 16:44:18.0 \n", + "... ... ... ... ... \n", + "999995 4648651054 24511045@N04 mtfrazier 2010-05-02 15:47:45.0 \n", + "999996 4652130996 21963865@N04 GRAB1.0 2010-05-29 19:23:10.0 \n", + "999997 4652568339 64025277@N00 1Sock 2010-05-13 15:38:37.0 \n", + "999998 4653110895 20483509@N00 subberculture 2010-05-30 15:37:05.0 \n", + "999999 4655503987 8457193@N07 zackojones 2010-05-30 15:34:58.0 \n", + "\n", + " dateuploaded capturedevice \\\n", + "0 1091409186 NaN \n", + "1 1099523042 NaN \n", + "2 1099538888 Canon+PowerShot+S30 \n", + "3 1103497228 SONY+DSC-W1 \n", + "4 1106084658 NaN \n", + "... ... ... \n", + "999995 1275083371 Canon+EOS+50D \n", + "999996 1275200833 SONY+DSLR-A230 \n", + "999997 1275234267 Canon+EOS+DIGITAL+REBEL+XT \n", + "999998 1275245596 Canon+DIGITAL+IXUS+40 \n", + "999999 1275310230 Canon+EOS+7D \n", + "\n", + " title \\\n", + "0 A+Picture+Share%21 \n", + "1 An+ornate+Roman+urn \n", + "2 Jai+%26+Tara+on+the+Cumberland \n", + "3 Castle+gate+-+%22lite-brited%22 \n", + "4 A+Picture+Share%21 \n", + "... ... \n", + "999995 U.S.+Navy+Blue+Angels%3A+2010 \n", + "999996 Attempts+on+Her+Life \n", + "999997 Carlsbad+Caverns+3 \n", + "999998 Want \n", + "999999 Summertime \n", + "\n", + " description \\\n", + "0 Antenna \n", + "1 Photographed+at+the+%3Ca+href%3D%22http%3A%2F%... \n", + "2 Another+trip+for+the+happy+couple. \n", + "3 Taken+at+the+Miracle+of+Lights+display+in+Cent... \n", + "4 Tabular \n", + "... ... \n", + "999995 2+May+2010%0ASunday%0ASt.+Joseph%2C+Missouri \n", + "999996 BAPA+1+production+of+Martin+Crimp%27s+Attempts... \n", + "999997 %E2%99%A5%E2%99%A5%E2%99%A5%E2%99%A5%E2%99%A5%... \n", + "999998 Isn%27t+that+gorgeous%3F \n", + "999999 You+gotta+love+it%21 \n", + "\n", + " usertags machinetags ... \\\n", + "0 cameraphone,cayugaheights,green,hydrant,ithaca... NaN ... \n", + "1 ancient,baltimore,burial,death,empire,funeral,... NaN ... \n", + "2 blue+heron,cumberland+river,jai,tara,tennessee NaN ... \n", + "3 bullrunpark,castle,centreville,christmas,decor... NaN ... \n", + "4 cameraphone,moblog,unfound NaN ... \n", + "... ... ... ... \n", + "999995 NaN NaN ... \n", + "999996 NaN NaN ... \n", + "999997 carlsbad,carlsbad+caverns,cave,faa,new+mexico,... NaN ... \n", + "999998 2010,edinburgh+museum,may,phonebox,wood NaN ... \n", + "999999 georgia,savannah,united+states,us NaN ... \n", + "\n", + " licenseurl serverid farmid \\\n", + "0 http://creativecommons.org/licenses/by-nc-sa/2.0/ 1 1 \n", + "1 http://creativecommons.org/licenses/by-nc-sa/2.0/ 1 1 \n", + "2 http://creativecommons.org/licenses/by-nc-sa/2.0/ 1 1 \n", + "3 http://creativecommons.org/licenses/by-nc-sa/2.0/ 2 1 \n", + "4 http://creativecommons.org/licenses/by-nc-sa/2.0/ 3 1 \n", + "... ... ... ... 
\n", + "999995 http://creativecommons.org/licenses/by-nc-nd/2.0/ 4072 5 \n", + "999996 http://creativecommons.org/licenses/by-nc-nd/2.0/ 4003 5 \n", + "999997 http://creativecommons.org/licenses/by-nc-nd/2.0/ 4010 5 \n", + "999998 http://creativecommons.org/licenses/by-sa/2.0/ 4066 5 \n", + "999999 http://creativecommons.org/licenses/by-nc-sa/2.0/ 4043 5 \n", + "\n", + " secret secretoriginal ext marker \\\n", + "0 1650c7cdc6 1650c7cdc6 jpg 0 \n", + "1 cf37054610 cf37054610 jpg 0 \n", + "2 4a4234e32c 4a4234e32c jpg 0 \n", + "3 7162c974c3 7162c974c3 jpg 0 \n", + "4 663e0d8b3d 663e0d8b3d jpg 0 \n", + "... ... ... ... ... \n", + "999995 2d12d73fb0 dd5856ea42 jpg 0 \n", + "999996 8889121579 2f46599456 jpg 0 \n", + "999997 0a1808a69e cf6d348e3d jpg 0 \n", + "999998 77c3b3a254 c4697e1511 jpg 0 \n", + "999999 caff543bfe f60952ac4d jpg 0 \n", + "\n", + " key title_clean \\\n", + "0 d29e7c6a3028418c64eb15e3cf577c2 A Picture Share! \n", + "1 d29f01b149167d683f9ddde464bb3db An ornate Roman urn \n", + "2 d296e9e34bdae41edb6c679ff824ab2a Jai & Tara on the Cumberland \n", + "3 d29ce96395848478b1e8396e44899 Castle gate - \"lite-brited\" \n", + "4 d29abf32c4e12ff881f975b70e0cec0 A Picture Share! \n", + "... ... ... \n", + "999995 60fa2911cb81eb25b356e9fee978aef U.S. Navy Blue Angels: 2010 \n", + "999996 60f5ef5ce4c2d24566226abebd67d4 Attempts on Her Life \n", + "999997 60f029482d1d1028fda5281daf498f Carlsbad Caverns 3 \n", + "999998 60f72775f433cf8de3efaeb431866153 Want \n", + "999999 60f687e11b913bce461e9525d8047e0 Summertime \n", + "\n", + " description_clean \n", + "0 Antenna \n", + "1 Photographed at the Walters Art Museum, Baltim... \n", + "2 Another trip for the happy couple. \n", + "3 Taken at the Miracle of Lights display in Cent... \n", + "4 Tabular \n", + "... ... \n", + "999995 2 May 2010 Sunday St. Joseph, Missouri \n", + "999996 BAPA 1 production of Martin Crimp's Attempts o... \n", + "999997 ♥♥♥♥♥♥♥ Interested in purchasing this photogra... \n", + "999998 Isn't that gorgeous? \n", + "999999 You gotta love it! \n", + "\n", + "[1000000 rows x 26 columns]" ] }, - "execution_count": 83, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "dataset = dataset['train']\n", - "dataset" + "# looking up at a chunk\n", + "pd.read_csv(\"./chunks/chunk1.tsv\", sep=\"\\t\")" ] }, { "cell_type": "code", - "execution_count": 84, - "id": "c4794c29", + "execution_count": 98, + "id": "c51c5597", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
keytitle_cleandescription_cleanext
0d29e7c6a3028418c64eb15e3cf577c2A Picture Share!Antennajpg
1d29f01b149167d683f9ddde464bb3dbAn ornate Roman urnPhotographed at the Walters Art Museum, Baltim...jpg
2d296e9e34bdae41edb6c679ff824ab2aJai & Tara on the CumberlandAnother trip for the happy couple.jpg
3d29ce96395848478b1e8396e44899Castle gate - \"lite-brited\"Taken at the Miracle of Lights display in Cent...jpg
4d29abf32c4e12ff881f975b70e0cec0A Picture Share!Tabularjpg
\n", + "
" + ], + "text/plain": [ + " key title_clean \\\n", + "0 d29e7c6a3028418c64eb15e3cf577c2 A Picture Share! \n", + "1 d29f01b149167d683f9ddde464bb3db An ornate Roman urn \n", + "2 d296e9e34bdae41edb6c679ff824ab2a Jai & Tara on the Cumberland \n", + "3 d29ce96395848478b1e8396e44899 Castle gate - \"lite-brited\" \n", + "4 d29abf32c4e12ff881f975b70e0cec0 A Picture Share! \n", + "\n", + " description_clean ext \n", + "0 Antenna jpg \n", + "1 Photographed at the Walters Art Museum, Baltim... jpg \n", + "2 Another trip for the happy couple. jpg \n", + "3 Taken at the Miracle of Lights display in Cent... jpg \n", + "4 Tabular jpg " + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Looking at a chunk with only the relevant columns that we need\n", + "df = pd.read_csv(\"./chunks/chunk1.tsv\", sep=\"\\t\")[[\"key\", \"title_clean\", \"description_clean\", \"ext\"]]\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "cc1668f8", "metadata": {}, - "outputs": [], "source": [ - "def image_exists(root: str, name: str, ext: str):\n", - " image_path = (Path(root)/name[0:3]/name[3:6]/name).with_suffix(ext)\n", - " return image_path.exists()" + "### Grabbing each chunks from the folder, cleaning it up, only taking the entries which image exist and appending it to the global df" ] }, { "cell_type": "code", - "execution_count": 90, - "id": "1b500078", + "execution_count": null, + "id": "abbcccf3", "metadata": {}, "outputs": [], "source": [ - "def select_existing_rows(examples):\n", - " # Select lists we want to keep\n", - " keys = examples['key']\n", - " titles_clean = examples['title_clean']\n", - " descriptions_clean = examples.get('description_clean', '')\n", - " exts = examples['ext']\n", - " \n", - " result = {'key': [], 'title_clean': [], 'description_clean': [], 'ext': []}\n", - " for i, image_name in enumerate(keys):\n", - " print(i, image_name)\n", - " if image_exists(root=str(yfcc100m_images), name=image_name, ext='.' 
+ exts[i]):\n", - " result[\"key\"].append(image_name)\n", - " result[\"title_clean\"].append(titles_clean[i])\n", - " result[\"description_clean\"].append(descriptions_clean[i])\n", - " result[\"ext\"].append(exts[i])\n", - " print(f'returning {len(result[\"key\"])}')\n", - " return result" + "# the function that helps us to decide whether an image with certain id exists in storage, we only take the ones that we have the images for\n", + "def image_exists(item):\n", + " name, _, _, ext, _ = item\n", + " root=str(yfcc100m_images)\n", + " image_path = (Path(root)/name[0:3]/name[3:6]/name).with_suffix(\".\"+ext)\n", + " if image_path.exists():\n", + " return True\n", + " else:\n", + " return None" ] }, { "cell_type": "code", - "execution_count": 91, - "id": "467378c1", + "execution_count": 86, + "id": "44fa86ab", "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "b72e866c3f174e9e9aa2430e204f2baf", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Selecting rows with images that exist: 0%| | 0/14826 [00:00\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m filtered_dataset = dataset.map(\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mselect_existing_rows\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mremove_columns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdataset\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumn_names\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mbatched\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mnum_proc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/code/hf_jax/datasets/src/datasets/arrow_dataset.py\u001b[0m in \u001b[0;36mmap\u001b[0;34m(self, function, with_indices, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint, desc)\u001b[0m\n\u001b[1;32m 1655\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1656\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mnum_proc\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mnum_proc\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1657\u001b[0;31m return self._map_single(\n\u001b[0m\u001b[1;32m 1658\u001b[0m \u001b[0mfunction\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfunction\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1659\u001b[0m \u001b[0mwith_indices\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mwith_indices\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/code/hf_jax/datasets/src/datasets/arrow_dataset.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 183\u001b[0m }\n\u001b[1;32m 184\u001b[0m \u001b[0;31m# apply actual function\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 185\u001b[0;31m \u001b[0mout\u001b[0m\u001b[0;34m:\u001b[0m 
- " ... [intermediate ANSI-colored traceback frames through datasets/arrow_dataset.py, fingerprint.py and arrow_writer.py elided] ...\n",
\u001b[0mout\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mas_py\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 101\u001b[0m raise TypeError(\n\u001b[1;32m 102\u001b[0m \u001b[0;34m\"Specified try_type alters data. Please check that the type/feature that you provided match the type/features of the data.\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/miniconda3/envs/hf_jax/lib/python3.8/site-packages/pyarrow/array.pxi\u001b[0m in \u001b[0;36mpyarrow.lib.Array.__getitem__\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m~/miniconda3/envs/hf_jax/lib/python3.8/site-packages/pyarrow/array.pxi\u001b[0m in \u001b[0;36mpyarrow.lib._normalize_index\u001b[0;34m()\u001b[0m\n", - "\u001b[0;31mIndexError\u001b[0m: index out of bounds" - ] - } - ], + "outputs": [], "source": [ - "filtered_dataset = dataset.map(\n", - " select_existing_rows,\n", - " remove_columns = dataset.column_names,\n", - " batched = True,\n", - " num_proc = 1,\n", - " desc = \"Selecting rows with images that exist\"\n", - ")" + "# This cell does it all, grabs each chunk, cleans it up based on image existing condition, etc.\n", + "global_df = pd.DataFrame()\n", + "chunks_dir = \"./chunks\"\n", + "for filename in os.listdir(chunks_dir):\n", + " df = pd.read_csv(f\"./chunks/{str(filename)}\", sep=\"\\t\")[[\"key\", \"title_clean\", \"description_clean\", \"ext\"]]\n", + " df['caption'] = df[\"title_clean\"]+\". \"+df['description_clean']\n", + " df['is_exist'] = df.apply(image_exists, axis=1)\n", + " df = df.dropna()[[\"key\", \"caption\"]]\n", + " df.columns = ['image_file', 'caption']\n", + " global_df = global_df.append(df, ignore_index=True)" ] }, { "cell_type": "code", - "execution_count": 109, - "id": "7060ff8f", + "execution_count": 89, + "id": "45024fdc", "metadata": {}, "outputs": [], "source": [ - "# df['image_exists'] = df.apply(lambda row: image_exists(row['key']), axis=1)" + "# saving the tsv to disk\n", + "global_df.to_csv('./chunks/YFCC_subset_clean.tsv', sep=\"\\t\", index=False)" ] }, { "cell_type": "code", - "execution_count": 113, - "id": "fecc9a00", + "execution_count": 101, + "id": "dca4eb73", "metadata": {}, "outputs": [], "source": [ - "image_size = 256\n", - "def image_transform(image):\n", - " s = min(image.size)\n", - " r = image_size / s\n", - " s = (round(r * image.size[1]), round(r * image.size[0]))\n", - " image = TF.resize(image, s, interpolation=InterpolationMode.LANCZOS)\n", - " image = TF.center_crop(image, output_size = 2 * [image_size])\n", - " image = torch.unsqueeze(T.ToTensor()(image), 0)\n", - " image = image.permute(0, 2, 3, 1).numpy()\n", - " return image" + "# loading the tsv from disk (for explicitness, also my electricity was gone, glad it happened after I saved to the disk :( )\n", + "\n", + "dataset = pd.read_csv(f\"./chunks/YFCC_subset_clean.tsv\", sep=\"\\t\")" ] }, { "cell_type": "code", - "execution_count": 98, - "id": "1a065700", + "execution_count": 153, + "id": "a511264a", "metadata": {}, "outputs": [], "source": [ - "class YFC100Dataset(Dataset):\n", - " def __init__(self, image_list_path: str, images_root: str, image_size: int, max_items=None):\n", + "\"\"\"\n", + "Luke Melas-Kyriazi's dataset.py's modified version for YFCC\n", + 
"\"\"\"\n", + "import warnings\n", + "from typing import Optional, Callable\n", + "from pathlib import Path\n", + "import numpy as np\n", + "import torch\n", + "import pandas as pd\n", + "from torch.utils.data import Dataset\n", + "from torchvision.datasets.folder import default_loader\n", + "from PIL import ImageFile\n", + "from PIL.Image import DecompressionBombWarning\n", + "ImageFile.LOAD_TRUNCATED_IMAGES = True\n", + "warnings.filterwarnings(\"ignore\", category=UserWarning)\n", + "warnings.filterwarnings(\"ignore\", category=DecompressionBombWarning)\n", + "\n", + "\n", + "class CaptionDataset(Dataset):\n", + " \"\"\"\n", + " A PyTorch Dataset class for (image, texts) tasks. Note that this dataset \n", + " returns the raw text rather than tokens. This is done on purpose, because\n", + " it's easy to tokenize a batch of text after loading it from this dataset.\n", + " \"\"\"\n", + "\n", + " def __init__(self, *, images_root: str, captions_path: str, text_transform: Optional[Callable] = None, \n", + " image_transform: Optional[Callable] = None, image_transform_type: str = 'torchvision',\n", + " include_captions: bool = True):\n", " \"\"\"\n", - " :param image_list_path: Path to a file containing a list of all images, in jsonl format.\n", - " :param images_root: Root directory containing the images\n", - " :param image_size: Image size. Source images will be resized and center-cropped.\n", - " :max_items: Limit dataset size for debugging\n", + " :param images_root: folder where images are stored\n", + " :param captions_path: path to csv that maps image filenames to captions\n", + " :param image_transform: image transform pipeline\n", + " :param text_transform: image transform pipeline\n", + " :param image_transform_type: image transform type, either `torchvision` or `albumentations`\n", + " :param include_captions: Returns a dictionary with `image`, `text` if `true`; otherwise returns just the images.\n", " \"\"\"\n", - " self.image_list = pd.read_json(image_list_path, orient=\"records\", lines=True)\n", + "\n", + " # Base path for images\n", " self.images_root = Path(images_root)\n", - " if max_items is not None: self.image_list = self.image_list[:max_items]\n", - " self.image_size = image_size\n", - " \n", - " def __len__(self):\n", - " return len(self.image_list)\n", + "\n", + " # Load captions as DataFrame\n", + " self.captions = pd.read_csv(f\"./chunks/YFCC_subset_clean.tsv\", sep=\"\\t\")\n", + " self.captions['image_file'] = self.captions['image_file'].astype(str)\n", + "\n", + " # PyTorch transformation pipeline for the image (normalizing, etc.)\n", + " self.text_transform = text_transform\n", + " self.image_transform = image_transform\n", + " self.image_transform_type = image_transform_type.lower()\n", + " assert self.image_transform_type in ['torchvision', 'albumentations']\n", + "\n", + " # Total number of datapoints\n", + " self.size = len(self.captions)\n", + "\n", + " # Return image+captions or just images\n", + " self.include_captions = include_captions\n", " \n", + " def image_exists(item):\n", + " name, caption = item\n", + " root=str(self.images_root)\n", + " image_path = (Path(root)/name[0:3]/name[3:6]/name).with_suffix(\".jpg\")\n", + "\n", + " return image_path.exists()\n", + "\n", + " def verify_that_all_images_exist(self):\n", + " for image_file in self.captions['image_file']:\n", + " if not image_exists:\n", + " print(f'file does not exist: {p}')\n", + "\n", " def _get_raw_image(self, i):\n", - " image_name = self.image_list.iloc[0].key\n", - " image_path = 
(self.images_root/image_name[0:3]/image_name[3:6]/image_name).with_suffix('.jpg')\n", - " return default_loader(image_path) if image_path.exists() else None\n", + " name = self.captions.iloc[i]['image_file']\n", + " image_path = (Path(self.images_root)/name[0:3]/name[3:6]/name).with_suffix(\".jpg\")\n", + " image = default_loader(image_path)\n", + " return image\n", + "\n", + " def _get_raw_text(self, i):\n", + " return self.captions.iloc[i]['caption']\n", + "\n", + " def __getitem__(self, i):\n", + " image = self._get_raw_image(i)\n", + " caption = self._get_raw_text(i)\n", + " if self.image_transform is not None:\n", + " if self.image_transform_type == 'torchvision':\n", + " image = self.image_transform(image)\n", + " elif self.image_transform_type == 'albumentations':\n", + " image = self.image_transform(image=np.array(image))['image']\n", + " else:\n", + " raise NotImplementedError(f\"{self.image_transform_type=}\")\n", + " return {'image': image, 'text': caption} if self.include_captions else image\n", + "\n", + " def __len__(self):\n", + " return self.size\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + " import albumentations as A\n", + " from albumentations.pytorch import ToTensorV2\n", + " from transformers import AutoTokenizer\n", + " \n", + "\n", + " images_root = \"/home/khali/TPU-Test/YFCC100M_OpenAI_subset/data/data/images\"\n", + " captions_path = './YFCC_subset_clean.tsv'\n", + " image_size = 256\n", " \n", - " # TODO: we could maybe use jax resizing / scaling functions\n", - " def resize_image(self, image):\n", + " # Create transforms\n", + " def image_transform(image):\n", " s = min(image.size)\n", - " r = self.image_size / s\n", + " r = image_size / s\n", " s = (round(r * image.size[1]), round(r * image.size[0]))\n", " image = TF.resize(image, s, interpolation=InterpolationMode.LANCZOS)\n", - " image = TF.center_crop(image, output_size = 2 * [self.image_size])\n", + " image = TF.center_crop(image, output_size = 2 * [image_size])\n", " image = torch.unsqueeze(T.ToTensor()(image), 0)\n", " image = image.permute(0, 2, 3, 1).numpy()\n", " return image\n", " \n", - " def _get_caption(self, i):\n", - " # We are currently appending title and caption. 
Should we use another separator?\n", - " row = self.image_list.iloc[i]\n", - " return ' '.join(row.title_clean, row.description_clean)\n", - " \n", - " def __getitem__(self, i):\n", - " image = self._get_raw_image(i)\n", - " if image is None: return None\n", - " image = self.resize_image(image)\n", - " caption = self._get_caption(i)\n", - " return {'image': image, 'text': caption}" + " # Create dataset\n", + " dataset = CaptionDataset(\n", + " images_root=images_root,\n", + " captions_path=captions_path,\n", + " image_transform=image_transform,\n", + " image_transform_type='torchvision',\n", + " include_captions=False\n", + " )" ] }, { "cell_type": "code", - "execution_count": 99, - "id": "4ce2211f", - "metadata": {}, - "outputs": [], - "source": [ - "dataset = YFC100Dataset(\n", - " image_list_path = yfc100m_metadata,\n", - " images_root = yfc100m_images,\n", - " image_size = 256,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 100, + "execution_count": 155, "id": "cc922704", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "5000" + "2483316" ] }, - "execution_count": 100, + "execution_count": 155, "metadata": {}, "output_type": "execute_result" } @@ -1416,7 +965,7 @@ }, { "cell_type": "code", - "execution_count": 102, + "execution_count": 156, "id": "6e47ba46", "metadata": {}, "outputs": [], @@ -1426,30 +975,29 @@ }, { "cell_type": "code", - "execution_count": 103, + "execution_count": 1, "id": "c8a130eb", "metadata": {}, - "outputs": [ - { - "ename": "TypeError", - "evalue": "Caught TypeError in DataLoader worker process 0.\nOriginal Traceback (most recent call last):\n File \"/home/pedro/miniconda3/envs/hf_jax/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py\", line 287, in _worker_loop\n data = fetcher.fetch(index)\n File \"/home/pedro/miniconda3/envs/hf_jax/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py\", line 47, in fetch\n return self.collate_fn(data)\n File \"/home/pedro/miniconda3/envs/hf_jax/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py\", line 86, in default_collate\n raise TypeError(default_collate_err_msg_format.format(elem_type))\nTypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found \n", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m/tmp/ipykernel_320049/1409168804.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mnext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0miter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataloader\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m~/miniconda3/envs/hf_jax/lib/python3.8/site-packages/torch/utils/data/dataloader.py\u001b[0m in \u001b[0;36m__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 519\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sampler_iter\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 520\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_reset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 521\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m 
- " ... [intermediate ANSI-colored DataLoader traceback frames elided] ...\n",
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexc_type\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 426\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 427\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mTypeError\u001b[0m: Caught TypeError in DataLoader worker process 0.\nOriginal Traceback (most recent call last):\n File \"/home/pedro/miniconda3/envs/hf_jax/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py\", line 287, in _worker_loop\n data = fetcher.fetch(index)\n File \"/home/pedro/miniconda3/envs/hf_jax/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py\", line 47, in fetch\n return self.collate_fn(data)\n File \"/home/pedro/miniconda3/envs/hf_jax/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py\", line 86, in default_collate\n raise TypeError(default_collate_err_msg_format.format(elem_type))\nTypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found \n" - ] - } - ], + "outputs": [], "source": [ + "# looking at a batch\n", "next(iter(dataloader))" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "c192fd44", + "metadata": {}, + "outputs": [], + "source": [ + "# import matplotlib.pyplot as plt\n", + "# for tensor_image, _ in dataloader:\n", + "# print(tensor_image)\n", + "# plt.imshow(tensor_image.permute(1, 2, 0))\n", + "# break" + ] + }, { "cell_type": "markdown", "id": "62ad01c3", @@ -1460,23 +1008,20 @@ }, { "cell_type": "code", - "execution_count": 89, + "execution_count": 158, "id": "88f36d0b", "metadata": {}, "outputs": [], "source": [ "def encode(model, batch):\n", - " print(\"jitting encode function\")\n", - "# _, indices = model.encode(batch)\n", - "\n", - " # The model does not run in my computer (no cudNN currently installed) - faking it\n", - " indices = [random.randint(0, 16384) for _ in range(256)]\n", + "# print(\"jitting encode function\")\n", + " _, indices = model.encode(batch)\n", " return indices" ] }, { "cell_type": "code", - "execution_count": 90, + "execution_count": 160, "id": "1f35f0cb", "metadata": {}, "outputs": [], @@ -1501,20 +1046,19 @@ }, { "cell_type": "code", - "execution_count": 93, + "execution_count": 170, "id": "2210705b", "metadata": {}, "outputs": [], "source": [ "import os\n", - "import jax\n", "\n", - "def encode_captioned_dataset(dataset, output_jsonl, batch_size=32, num_workers=16):\n", - " if os.path.isfile(output_jsonl):\n", - " print(f\"Destination file {output_jsonl} already exists, please move away.\")\n", + "def encode_captioned_dataset(dataset, output_tsv, batch_size=32, num_workers=16):\n", + " if os.path.isfile(output_tsv):\n", + " print(f\"Destination file {output_tsv} already exists, please move away.\")\n", " return\n", " \n", - " num_tpus = jax.device_count()\n", + " num_tpus = 8 \n", " dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers)\n", " superbatches = superbatch_generator(dataloader, num_tpus=num_tpus)\n", " \n", @@ -1522,7 +1066,7 @@ "\n", " # We save each superbatch to avoid reallocation of buffers as we process them.\n", " # We keep the file open to prevent excessive file seeks.\n", - " with open(output_jsonl, \"w\") as file:\n", + " with open(output_tsv, \"w\") as file:\n", " iterations = len(dataset) // (batch_size * num_tpus)\n", " for n in tqdm(range(iterations)):\n", " superbatch = next(superbatches)\n", @@ -1536,14 +1080,12 @@ " captions = 
dataset.captions[\"caption\"][start_index:end_index].values\n", " encoded_as_string = list(map(lambda item: np.array2string(item, separator=',', max_line_width=50000, formatter={'int':lambda x: str(x)}), encoded))\n", " batch_df = pd.DataFrame.from_dict({\"image_file\": paths, \"caption\": captions, \"encoding\": encoded_as_string})\n", - " batch_df = batch_df.dropna()\n", - " batch_df.to_json(file, orient='records', lines=True, index=None)\n", - " " + " batch_df.to_csv(file, sep='\\t', header=(n==0), index=None)" ] }, { "cell_type": "code", - "execution_count": 94, + "execution_count": 171, "id": "7704863d", "metadata": {}, "outputs": [ @@ -1551,29 +1093,12 @@ "name": "stderr", "output_type": "stream", "text": [ - " 0%| | 0/78 [00:00\n", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m/tmp/ipykernel_320049/140243368.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mencode_captioned_dataset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0myfc100m_output\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbatch_size\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m64\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnum_workers\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m16\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m/tmp/ipykernel_320049/2954345319.py\u001b[0m in \u001b[0;36mencode_captioned_dataset\u001b[0;34m(dataset, output_jsonl, batch_size, num_workers)\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0miterations\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m//\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mbatch_size\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0mnum_tpus\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mn\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0miterations\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 20\u001b[0;31m \u001b[0msuperbatch\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msuperbatches\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 21\u001b[0m \u001b[0mencoded\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mp_encoder\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msuperbatch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnumpy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[0mencoded\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mencoded\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreshape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencoded\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/tmp/ipykernel_320049/4148450576.py\u001b[0m in \u001b[0;36msuperbatch_generator\u001b[0;34m(dataloader, 
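One detail worth noting in the writer above: `header=(n==0)` emits column names only for the first superbatch, so the repeated `to_csv` calls on the same open handle append plain rows and the result stays a single well-formed TSV. A toy illustration of the pattern (hypothetical file path):

```python
import pandas as pd

chunks = [pd.DataFrame({'a': [1, 2]}), pd.DataFrame({'a': [3, 4]})]
with open('/tmp/example.tsv', 'w') as f:
    for n, chunk in enumerate(chunks):
        # Only the first chunk writes the header line.
        chunk.to_csv(f, sep='\t', header=(n == 0), index=False)
```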
   {
@@ -1587,9 +1112,8 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
+   "name": "python3",
+   "display_name": "Python 3.9.0 64-bit ('Python39')"
   },
"language_info": { "codemirror_mode": { @@ -1601,9 +1125,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.9.0" + }, + "interpreter": { + "hash": "db471c52d602b4f5f40ecaf278e88ccfef85c29d0a1a07185b0d51fc7acf4e26" } }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file