codeShare committed on
Commit
16ea5d1
1 Parent(s): c012553

Upload indexed_text_encoding_converter.ipynb

Browse files
Files changed (1) hide show
  1. indexed_text_encoding_converter.ipynb +209 -0
indexed_text_encoding_converter.ipynb ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": []
7
+ },
8
+ "kernelspec": {
9
+ "name": "python3",
10
+ "display_name": "Python 3"
11
+ },
12
+ "language_info": {
13
+ "name": "python"
14
+ }
15
+ },
16
+ "cells": [
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": null,
20
+ "metadata": {
21
+ "id": "cskYkw0zXHEm"
22
+ },
23
+ "outputs": [],
24
+ "source": [
25
+ "# @title Make your own text_encodings .safetensors file for later use (using a GPU is recommended to speed things up)\n",
26
+ "\n",
27
+ "import json\n",
28
+ "import pandas as pd\n",
29
+ "import os\n",
30
+ "import shelve\n",
31
+ "import torch\n",
32
+ "from safetensors.torch import save_file\n",
33
+ "import json\n",
34
+ "\n",
35
+ "# Determine if this notebook is running on Colab or Kaggle\n",
36
+ "# Use https://www.kaggle.com/ if the Google Colab GPU is busy\n",
37
+ "home_directory = '/content/'\n",
38
+ "using_Kaggle = os.environ.get('KAGGLE_URL_BASE','')\n",
39
+ "if using_Kaggle : home_directory = '/kaggle/working/'\n",
40
+ "%cd {home_directory}\n",
41
+ "#-------#\n",
42
+ "\n",
43
+ "# User input\n",
44
+ "target = home_directory + 'text-to-image-prompts/names/celebs/mixed/'\n",
45
+ "output_folder = home_directory + 'output/celebs/mixed/'\n",
46
+ "root_filename = '🆔👨 fusion-t2i-v2-celeb'\n",
47
+ "NUM_FILES = 1\n",
48
+ "#--------#\n",
49
+ "\n",
50
+ "# Setup environment\n",
51
+ "def my_mkdirs(folder):\n",
52
+ " if os.path.exists(folder)==False:\n",
53
+ " os.makedirs(folder)\n",
54
+ "#--------#\n",
55
+ "output_folder_text = output_folder + 'text/'\n",
56
+ "output_folder_text = output_folder + 'text/'\n",
57
+ "output_folder_text_encodings = output_folder + 'text_encodings/'\n",
58
+ "target_raw = target + 'raw/'\n",
59
+ "%cd {home_directory}\n",
60
+ "my_mkdirs(output_folder)\n",
61
+ "my_mkdirs(output_folder_text)\n",
62
+ "my_mkdirs(output_folder_text_encodings)\n",
63
+ "#-------#\n",
64
+ "\n",
65
+ "# Load the data if not already loaded\n",
66
+ "try:\n",
67
+ " loaded\n",
68
+ "except:\n",
69
+ " %cd {home_directory}\n",
70
+ " !git clone https://huggingface.co/datasets/codeShare/text-to-image-prompts\n",
71
+ " loaded = True\n",
72
+ "#--------#\n",
73
+ "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
74
+ "from transformers import AutoTokenizer\n",
75
+ "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
76
+ "from transformers import CLIPProcessor, CLIPModel\n",
77
+ "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-large-patch14\" , clean_up_tokenization_spaces = True)\n",
78
+ "model = CLIPModel.from_pretrained(\"openai/clip-vit-large-patch14\").to(device)\n",
79
+ "#---------#\n",
80
+ "for file_index in range(NUM_FILES + 1):\n",
81
+ " if (file_index < 1): continue\n",
82
+ " filename = f'{root_filename}-{file_index}'\n",
83
+ "\n",
84
+ " # Read {filename}.json\n",
85
+ " %cd {target_raw}\n",
86
+ " with open(filename + '.json', 'r') as f:\n",
87
+ " data = json.load(f)\n",
88
+ " _df = pd.DataFrame({'count': data})['count']\n",
89
+ " prompts = {\n",
90
+ " key : value.replace(\"</w>\",\" \") for key, value in _df.items()\n",
91
+ " }\n",
92
+ " index = 0\n",
93
+ " for key in prompts:\n",
94
+ " index = index + 1\n",
95
+ " #----------#\n",
96
+ " NUM_ITEMS = index\n",
97
+ " #------#\n",
98
+ "\n",
99
+ " # Calculate text_encoding for the .json file contents and save the results as .json and .safetensors files\n",
100
+ " names_dict = {}\n",
101
+ " text_encoding_dict = {}\n",
102
+ " segments = {}\n",
103
+ " index = 0;\n",
104
+ " subby = 1;\n",
105
+ " NUM_HEADERS = 2\n",
106
+ " CHUNKS_SIZE = 1000\n",
107
+ " _filename = ''\n",
108
+ " for _index in range(NUM_ITEMS):\n",
109
+ " if (index % 100 == 0) : print(index)\n",
110
+ " if (index == 0 and _index>0) : index = index + 2 #make space for headers\n",
111
+ " if (_index % (CHUNKS_SIZE-NUM_HEADERS) == 0 and _index > 0) :\n",
112
+ "\n",
113
+ " # Write headers in the .json\n",
114
+ " names_dict[f'{0}'] = f'{_index}'\n",
115
+ " names_dict[f'{1}'] = f'{filename}-{subby}'\n",
116
+ "\n",
117
+ " # Encode the headers into text_encoding\n",
118
+ " inputs = tokenizer(text = '' + names_dict[f'{0}'], padding=True, return_tensors=\"pt\").to(device)\n",
119
+ " text_features = model.get_text_features(**inputs).to(device)\n",
120
+ " text_features = text_features/text_features.norm(p=2, dim=-1, keepdim=True)\n",
121
+ " text_encoding_dict[f'{0}'] = text_features.to(torch.device('cpu'))\n",
122
+ " inputs = tokenizer(text = '' + names_dict[f'{1}'], padding=True, return_tensors=\"pt\").to(device)\n",
123
+ " text_features = model.get_text_features(**inputs).to(device)\n",
124
+ " text_features = text_features/text_features.norm(p=2, dim=-1, keepdim=True)\n",
125
+ " text_encoding_dict[f'{1}'] = text_features.to(torch.device('cpu'))\n",
126
+ " #-------#\n",
127
+ "\n",
128
+ " # Write .json\n",
129
+ " _filename = f'{filename}-{subby}.json'\n",
130
+ " %cd {output_folder_text}\n",
131
+ " print(f'Saving segment {_filename} to {output_folder_text}...')\n",
132
+ " with open(_filename, 'w') as f:\n",
133
+ " json.dump(names_dict, f)\n",
134
+ " #-------#\n",
135
+ "\n",
136
+ " # Write .safetensors\n",
137
+ " _filename = f'{filename}-{subby}.safetensors'\n",
138
+ " %cd {output_folder_text_encodings}\n",
139
+ " print(f'Saving segment {_filename} to {output_folder_text_encodings}...')\n",
140
+ " save_file(text_encoding_dict, _filename)\n",
141
+ " #--------#\n",
142
+ "\n",
143
+ " #Iterate\n",
144
+ " subby = subby + 1\n",
145
+ " segments[f'{subby}'] = _filename\n",
146
+ " text_encoding_dict = {}\n",
147
+ " names_dict = {}\n",
148
+ " index = 0\n",
149
+ " #------#\n",
150
+ " #------#\n",
151
+ " else: index = index + 1\n",
152
+ " #--------#\n",
153
+ " inputs = tokenizer(text = '' + prompts[f'{_index}'], padding=True, return_tensors=\"pt\").to(device)\n",
154
+ " text_features = model.get_text_features(**inputs).to(device)\n",
155
+ " text_features = text_features/text_features.norm(p=2, dim=-1, keepdim=True)\n",
156
+ " text_encoding_dict[f'{index}'] = text_features.to(torch.device('cpu'))\n",
157
+ " names_dict[f'{index}'] = prompts[f'{_index}']\n",
158
+ " continue\n",
159
+ " #-----#\n",
160
+ " #-----#\n",
161
+ " # Write headers in the .json\n",
162
+ " names_dict[f'{0}'] = f'{_index}'\n",
163
+ " names_dict[f'{1}'] = f'{filename}-{subby}'\n",
164
+ "\n",
165
+ " # Encode the headers into text_encoding\n",
166
+ " inputs = tokenizer(text = '' + names_dict[f'{0}'], padding=True, return_tensors=\"pt\").to(device)\n",
167
+ " text_features = model.get_text_features(**inputs).to(device)\n",
168
+ " text_features = text_features/text_features.norm(p=2, dim=-1, keepdim=True)\n",
169
+ " text_encoding_dict[f'{0}'] = text_features.to(torch.device('cpu'))\n",
170
+ " inputs = tokenizer(text = '' + names_dict[f'{1}'], padding=True, return_tensors=\"pt\").to(device)\n",
171
+ " text_features = model.get_text_features(**inputs).to(device)\n",
172
+ " text_features = text_features/text_features.norm(p=2, dim=-1, keepdim=True)\n",
173
+ " text_encoding_dict[f'{1}'] = text_features.to(torch.device('cpu'))\n",
174
+ " #-------#\n",
175
+ "\n",
176
+ " # Write .json\n",
177
+ " _filename = f'{filename}-{subby}.json'\n",
178
+ " %cd {output_folder_text}\n",
179
+ " print(f'Saving segment {_filename} to {output_folder_text}...')\n",
180
+ " with open(_filename, 'w') as f:\n",
181
+ " json.dump(names_dict, f)\n",
182
+ " #-------#\n",
183
+ "\n",
184
+ " # Write .safetensors\n",
185
+ " _filename = f'{filename}-{subby}.safetensors'\n",
186
+ " %cd {output_folder_text_encodings}\n",
187
+ " print(f'Saving segment {_filename} to {output_folder_text_encodings}...')\n",
188
+ " save_file(text_encoding_dict, _filename)\n",
189
+ " #--------#\n",
190
+ "\n",
191
+ " #Iterate\n",
192
+ " subby = subby + 1\n",
193
+ " segments[f'{subby}'] = _filename\n",
194
+ " text_encoding_dict = {}\n",
195
+ " names_dict = {}\n",
196
+ " index = 0\n",
197
+ " #------#\n",
198
+ " #----#\n",
199
+ "\n",
200
+ "# @title Download the text_encodings as .zip\n",
201
+ "import os\n",
202
+ "%cd {home_directory}\n",
203
+ "os.remove(f'{home_directory}results.zip')\n",
204
+ "zip_dest = f'{home_directory}results.zip'\n",
205
+ "!zip -r {zip_dest} {output_folder}"
206
+ ]
207
+ }
208
+ ]
209
+ }