codeShare commited on
Commit
551a5a7
1 Parent(s): 33824cf

Upload sd_token_similarity_calculator.ipynb

Browse files
Files changed (1) hide show
  1. sd_token_similarity_calculator.ipynb +49 -82
sd_token_similarity_calculator.ipynb CHANGED
@@ -17,7 +17,7 @@
17
  {
18
  "cell_type": "markdown",
19
  "source": [
20
- "This Notebook is a Stable-diffusion tool which allows you to find similiar tokens from the SD 1.5 vocab.json that you can use for text-to-image generation."
21
  ],
22
  "metadata": {
23
  "id": "L7JTcbOdBPfh"
@@ -26,6 +26,7 @@
26
  {
27
  "cell_type": "code",
28
  "source": [
 
29
  "# Load the tokens into the colab\n",
30
  "!git clone https://huggingface.co/datasets/codeShare/sd_tokens\n",
31
  "import torch\n",
@@ -70,6 +71,9 @@
70
  " return result\n",
71
  "#----#\n",
72
  "\n",
 
 
 
73
  "mix_with = \"\"\n",
74
  "mix_method = \"None\""
75
  ],
@@ -82,29 +86,7 @@
82
  {
83
  "cell_type": "code",
84
  "source": [
85
- "#print(vocab[8922]) #the vocab item for ID 8922\n",
86
- "#print(token[8922].shape) #dimension of the token"
87
- ],
88
- "metadata": {
89
- "id": "S_Yh9gH_OUA1"
90
- },
91
- "execution_count": null,
92
- "outputs": []
93
- },
94
- {
95
- "cell_type": "markdown",
96
- "source": [
97
- "Get the IDs from a prompt text.\n",
98
- "\n",
99
- "The prompt will be enclosed with the <|start-of-text|> and <|end-of-text|> tokens. Leave the field empty to get a random value tensor"
100
- ],
101
- "metadata": {
102
- "id": "f1-jS7YJApiO"
103
- }
104
- },
105
- {
106
- "cell_type": "code",
107
- "source": [
108
  "from transformers import AutoTokenizer\n",
109
  "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
110
  "\n",
@@ -128,34 +110,46 @@
128
  "#Save a copy of the tensor A\n",
129
  "id_P = input_ids[1]\n",
130
  "P = token[id_A]\n",
131
- "_P = LA.vector_norm(A, ord=2)"
 
 
 
 
132
  ],
133
  "metadata": {
134
- "id": "RPdkYzT2_X85"
 
 
 
 
135
  },
136
- "execution_count": null,
137
- "outputs": []
138
- },
139
- {
140
- "cell_type": "markdown",
141
- "source": [
142
- "OPTIONAL : Add/subtract + normalize above result with another token. Leave field empty to get a random value tensor"
143
- ],
144
- "metadata": {
145
- "id": "JKnz0aLFVGXc"
146
- }
147
  },
148
  {
149
  "cell_type": "code",
150
  "source": [
 
151
  "mix_with = \"\" # @param {type:'string'}\n",
152
  "mix_method = \"None\" # @param [\"None\" , \"Average\", \"Subtract\"] {allow-input: true}\n",
153
  "w = 0.5 # @param {type:\"slider\", min:0, max:1, step:0.01}\n",
154
  "\n",
155
- "#prevent re-iterating A by reading from stored copy\n",
156
- "id_A = id_P\n",
157
- "A = P\n",
158
- "_A = _P\n",
 
 
 
 
159
  "#----#\n",
160
  "\n",
161
  "tokenizer_output = tokenizer(text = mix_with)\n",
@@ -187,7 +181,7 @@
187
  " _A = LA.vector_norm(A, ord=2)\n",
188
  " print(\"Tokenized prompt tensor A has been recalculated as A = (w*_A + (1-w)*_C) * norm(w*A - (1-w)*C) , where C is the tokenized prompt 'mix_with' tensor C\")\n",
189
  "\n",
190
- "\n"
191
  ],
192
  "metadata": {
193
  "id": "oXbNSRSKPgRr"
@@ -195,19 +189,11 @@
195
  "execution_count": null,
196
  "outputs": []
197
  },
198
- {
199
- "cell_type": "markdown",
200
- "source": [
201
- "Produce a list id IDs that are most similiar to the prompt ID at positiion 1 based on above result"
202
- ],
203
- "metadata": {
204
- "id": "3uBSZ1vWVCew"
205
- }
206
- },
207
  {
208
  "cell_type": "code",
209
  "source": [
210
  "\n",
 
211
  "dots = torch.zeros(NUM_TOKENS)\n",
212
  "for index in range(NUM_TOKENS):\n",
213
  " id_B = index\n",
@@ -234,7 +220,9 @@
234
  "if (mix_method == \"Subtract\"):\n",
235
  " print(f'Calculated all cosine-similarities between the subtract of token {name_A} and {name_C} with Id_A = {id_A} and mixed Id_C = {id_C} as a 1x{sorted.shape[0]} tensor')\n",
236
  "if (mix_method == \"None\"):\n",
237
- " print(f'Calculated all cosine-similarities between the token {name_A} with Id_A = {id_A} with the the rest of the {NUM_TOKENS} tokens as a 1x{sorted.shape[0]} tensor')"
 
 
238
  ],
239
  "metadata": {
240
  "id": "juxsvco9B0iV"
@@ -242,20 +230,11 @@
242
  "execution_count": null,
243
  "outputs": []
244
  },
245
- {
246
- "cell_type": "markdown",
247
- "source": [
248
- "Print the sorted list from above result"
249
- ],
250
- "metadata": {
251
- "id": "y-Ig3glrVQC3"
252
- }
253
- },
254
  {
255
  "cell_type": "code",
256
  "source": [
 
257
  "list_size = 100 # @param {type:'number'}\n",
258
- "\n",
259
  "print_ID = False # @param {type:\"boolean\"}\n",
260
  "print_Similarity = True # @param {type:\"boolean\"}\n",
261
  "print_Name = True # @param {type:\"boolean\"}\n",
@@ -270,7 +249,9 @@
270
  " if (print_Similarity):\n",
271
  " print(f'similiarity = {round(sorted[index].item()*100,2)} %') # % value\n",
272
  " if (print_Divider):\n",
273
- " print('--------')"
 
 
274
  ],
275
  "metadata": {
276
  "id": "YIEmLAzbHeuo",
@@ -279,33 +260,19 @@
279
  "execution_count": null,
280
  "outputs": []
281
  },
282
- {
283
- "cell_type": "markdown",
284
- "source": [
285
- "Find the most similiar Tokens for given input"
286
- ],
287
- "metadata": {
288
- "id": "qqZ5DvfLBJnw"
289
- }
290
- },
291
- {
292
- "cell_type": "markdown",
293
- "source": [
294
- "Valid ID ranges for id_for_token_A / id_for_token_B are between 0 and 49407"
295
- ],
296
- "metadata": {
297
- "id": "kX72bAuhOtlT"
298
- }
299
- },
300
  {
301
  "cell_type": "code",
302
  "source": [
 
 
303
  "id_for_token_A = 4567 # @param {type:'number'}\n",
304
  "id_for_token_B = 4343 # @param {type:'number'}\n",
305
  "\n",
306
  "similarity_str = 'The similarity between tokens A and B is ' + similarity(id_for_token_A , id_for_token_B)\n",
307
  "\n",
308
- "print(similarity_str)"
 
 
309
  ],
310
  "metadata": {
311
  "id": "MwmOdC9cNZty"
 
17
  {
18
  "cell_type": "markdown",
19
  "source": [
20
+ "This Notebook is a Stable-diffusion tool which allows you to find similar tokens from the SD 1.5 vocab.json that you can use for text-to-image generation. Try this Free online SD 1.5 generator with the results: https://perchance.org/fusion-ai-image-generator"
21
  ],
22
  "metadata": {
23
  "id": "L7JTcbOdBPfh"
 
26
  {
27
  "cell_type": "code",
28
  "source": [
29
+ "# @title Load/initialize values\n",
30
  "# Load the tokens into the colab\n",
31
  "!git clone https://huggingface.co/datasets/codeShare/sd_tokens\n",
32
  "import torch\n",
 
71
  " return result\n",
72
  "#----#\n",
73
  "\n",
74
+ "#print(vocab[8922]) #the vocab item for ID 8922\n",
75
+ "#print(token[8922].shape) #dimension of the token\n",
76
+ "\n",
77
  "mix_with = \"\"\n",
78
  "mix_method = \"None\""
79
  ],
 
86
  {
87
  "cell_type": "code",
88
  "source": [
89
+ "# @title Tokenize prompt into IDs\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  "from transformers import AutoTokenizer\n",
91
  "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
92
  "\n",
 
110
  "#Save a copy of the tensor A\n",
111
  "id_P = input_ids[1]\n",
112
  "P = token[id_A]\n",
113
+ "_P = LA.vector_norm(A, ord=2)\n",
114
+ "\n",
115
+ "#The prompt will be enclosed with the <|start-of-text|> and <|end-of-text|> tokens, which is why output will be [49406, ... , 49407].\n",
116
+ "\n",
117
+ "#You can leave the 'prompt' field empty to get a random value tensor. Since the tensor has random values, it will not correspond to any tensor in the vocab.json list, and thus it will have no ID."
118
  ],
119
  "metadata": {
120
+ "id": "RPdkYzT2_X85",
121
+ "colab": {
122
+ "base_uri": "https://localhost:8080/"
123
+ },
124
+ "outputId": "e335f5da-b26d-4eea-f854-fd646444ea14"
125
  },
126
+ "execution_count": 15,
127
+ "outputs": [
128
+ {
129
+ "output_type": "stream",
130
+ "name": "stdout",
131
+ "text": [
132
+ "[49406, 8922, 49407]\n"
133
+ ]
134
+ }
135
+ ]
 
136
  },
137
  {
138
  "cell_type": "code",
139
  "source": [
140
+ "# @title Take the ID at index 1 from above result and modify it (optional)\n",
141
  "mix_with = \"\" # @param {type:'string'}\n",
142
  "mix_method = \"None\" # @param [\"None\" , \"Average\", \"Subtract\"] {allow-input: true}\n",
143
  "w = 0.5 # @param {type:\"slider\", min:0, max:1, step:0.01}\n",
144
  "\n",
145
+ "#------#\n",
146
+ "#If set to TRUE , this will use the output of this cell , tensor A, as the input of this cell the 2nd time we run it. Use this feature to mix many tokens into A\n",
147
+ "re_iterate_tensor_A = True # @param {\"type\":\"boolean\"}\n",
148
+ "if (re_iterate_tensor_A == False) :\n",
149
+ " #prevent re-iterating A by reading from stored copy\n",
150
+ " id_A = id_P\n",
151
+ " A = P\n",
152
+ " _A = _P\n",
153
  "#----#\n",
154
  "\n",
155
  "tokenizer_output = tokenizer(text = mix_with)\n",
 
181
  " _A = LA.vector_norm(A, ord=2)\n",
182
  " print(\"Tokenized prompt tensor A has been recalculated as A = (w*_A + (1-w)*_C) * norm(w*A - (1-w)*C) , where C is the tokenized prompt 'mix_with' tensor C\")\n",
183
  "\n",
184
+ "#OPTIONAL : Add/subtract + normalize above result with another token. Leave field empty to get a random value tensor"
185
  ],
186
  "metadata": {
187
  "id": "oXbNSRSKPgRr"
 
189
  "execution_count": null,
190
  "outputs": []
191
  },
 
 
 
 
 
 
 
 
 
192
  {
193
  "cell_type": "code",
194
  "source": [
195
  "\n",
196
+ "# @title Find Similar Tokens to ID at index 1 from above result\n",
197
  "dots = torch.zeros(NUM_TOKENS)\n",
198
  "for index in range(NUM_TOKENS):\n",
199
  " id_B = index\n",
 
220
  "if (mix_method == \"Subtract\"):\n",
221
  " print(f'Calculated all cosine-similarities between the subtract of token {name_A} and {name_C} with Id_A = {id_A} and mixed Id_C = {id_C} as a 1x{sorted.shape[0]} tensor')\n",
222
  "if (mix_method == \"None\"):\n",
223
+ " print(f'Calculated all cosine-similarities between the token {name_A} with Id_A = {id_A} with the the rest of the {NUM_TOKENS} tokens as a 1x{sorted.shape[0]} tensor')\n",
224
+ "\n",
225
+ "#Produce a list of IDs that are most similar to the prompt ID at position 1 based on above result"
226
  ],
227
  "metadata": {
228
  "id": "juxsvco9B0iV"
 
230
  "execution_count": null,
231
  "outputs": []
232
  },
 
 
 
 
 
 
 
 
 
233
  {
234
  "cell_type": "code",
235
  "source": [
236
+ "# @title Print Result from the 'Similar Tokens' list from above result\n",
237
  "list_size = 100 # @param {type:'number'}\n",
 
238
  "print_ID = False # @param {type:\"boolean\"}\n",
239
  "print_Similarity = True # @param {type:\"boolean\"}\n",
240
  "print_Name = True # @param {type:\"boolean\"}\n",
 
249
  " if (print_Similarity):\n",
250
  " print(f'similiarity = {round(sorted[index].item()*100,2)} %') # % value\n",
251
  " if (print_Divider):\n",
252
+ " print('--------')\n",
253
+ "\n",
254
+ "#Print the sorted list from above result"
255
  ],
256
  "metadata": {
257
  "id": "YIEmLAzbHeuo",
 
260
  "execution_count": null,
261
  "outputs": []
262
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263
  {
264
  "cell_type": "code",
265
  "source": [
266
+ "\n",
267
+ "# @title Get similarity % of two token IDs\n",
268
  "id_for_token_A = 4567 # @param {type:'number'}\n",
269
  "id_for_token_B = 4343 # @param {type:'number'}\n",
270
  "\n",
271
  "similarity_str = 'The similarity between tokens A and B is ' + similarity(id_for_token_A , id_for_token_B)\n",
272
  "\n",
273
+ "print(similarity_str)\n",
274
+ "\n",
275
+ "#Valid ID ranges for id_for_token_A / id_for_token_B are between 0 and 49407"
276
  ],
277
  "metadata": {
278
  "id": "MwmOdC9cNZty"