codeShare committed
Commit 233ede2
1 Parent(s): a48155a

Upload sd_token_similarity_calculator.ipynb

Files changed (1):
  sd_token_similarity_calculator.ipynb (+63 −303)
sd_token_similarity_calculator.ipynb CHANGED
@@ -125,56 +125,53 @@
  "cell_type": "code",
  "source": [
  "# @title ⚡ Get similar tokens\n",
+ "import torch\n",
  "from transformers import AutoTokenizer\n",
  "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
  "\n",
  "# @markdown Write name of token to match against\n",
  "prompt= \"banana\" # @param {type:'string',\"placeholder\":\"leave empty for random value token\"}\n",
- "\n",
- "tokenizer_output = tokenizer(text = prompt)\n",
- "input_ids = tokenizer_output['input_ids']\n",
- "print(input_ids)\n",
- "\n",
- "\n",
- "#The prompt will be enclosed with the <|start-of-text|> and <|end-of-text|> tokens, which is why output will be [49406, ... , 49407].\n",
- "\n",
- "#You can leave the 'prompt' field empty to get a random value tensor. Since the tensor is random value, it will not correspond to any tensor in the vocab.json list , and this it will have no ID.\n",
- "\n",
- "id_A = input_ids[1]\n",
- "A = token[id_A]\n",
- "_A = LA.vector_norm(A, ord=2)\n",
- "\n",
- "#if no imput exists we just randomize the entire thing\n",
- "if (prompt == \"\"):\n",
- " id_A = -1\n",
- " print(\"Tokenized prompt tensor A is a random valued tensor with no ID\")\n",
- " R = torch.rand(768)\n",
- " _R = LA.vector_norm(R, ord=2)\n",
- " A = R*(_A/_R)\n",
- " name_A = 'random_A'\n",
- "\n",
  "# @markdown (optional) Mix the token with something else\n",
  "mix_with = \"\" # @param {\"type\":\"string\",\"placeholder\":\"leave empty for random value token\"}\n",
  "mix_method = \"None\" # @param [\"None\" , \"Average\", \"Subtract\"] {allow-input: true}\n",
  "w = 0.5 # @param {type:\"slider\", min:0, max:1, step:0.01}\n",
- "\n",
  "# @markdown Limit char size of included token\n",
  "min_char_size = 3 # @param {type:\"slider\", min:0, max: 50, step:1}\n",
  "char_range = 5 # @param {type:\"slider\", min:0, max: 50, step:1}\n",
  "\n",
+ "tokenizer_output = tokenizer(text = prompt)\n",
+ "input_ids = tokenizer_output['input_ids']\n",
+ "id_A = input_ids[1]\n",
+ "A = torch.tensor(token[id_A])\n",
+ "A = A/A.norm(p=2, dim=-1, keepdim=True)\n",
+ "#-----#\n",
  "tokenizer_output = tokenizer(text = mix_with)\n",
  "input_ids = tokenizer_output['input_ids']\n",
  "id_C = input_ids[1]\n",
- "C = token[id_C]\n",
- "_C = LA.vector_norm(C, ord=2)\n",
+ "C = torch.tensor(token[id_C])\n",
+ "C = C/C.norm(p=2, dim=-1, keepdim=True)\n",
+ "#-----#\n",
+ "sim_AC = torch.dot(A,C)\n",
+ "#-----#\n",
+ "print(input_ids)\n",
+ "#-----#\n",
+ "\n",
+ "#if no input exists we just randomize the entire thing\n",
+ "if (prompt == \"\"):\n",
+ " id_A = -1\n",
+ " print(\"Tokenized prompt tensor A is a random valued tensor with no ID\")\n",
+ " R = torch.rand(A.shape)\n",
+ " R = R/R.norm(p=2, dim=-1, keepdim=True)\n",
+ " A = R\n",
+ " name_A = 'random_A'\n",
  "\n",
  "#if no input exists we just randomize the entire thing\n",
  "if (mix_with == \"\"):\n",
  " id_C = -1\n",
  " print(\"Tokenized prompt 'mix_with' tensor C is a random valued tensor with no ID\")\n",
- " R = torch.rand(768)\n",
- " _R = LA.vector_norm(R, ord=2)\n",
- " C = R*(_C/_R)\n",
+ " R = torch.rand(A.shape)\n",
+ " R = R/R.norm(p=2, dim=-1, keepdim=True)\n",
+ " C = R\n",
  " name_C = 'random_C'\n",
  "\n",
  "name_A = \"A of random type\"\n",
@@ -185,16 +182,7 @@
  "if (id_C>-1):\n",
  " name_C = vocab[id_C]\n",
  "\n",
- "# Peaks feature\n",
- "#peaks_A = get_valleys(A)\n",
- "#peaks_C = get_valleys(C)\n",
- "#print(f\"The elementwise top 10 highest values for A is at indices {peaks_A}\")\n",
- "#print(\"-------\")\n",
- "#print(f\"The elementwise top 10 highest values for C is at indices {peaks_C}\")\n",
- "#print(\"-------\")\n",
- "#//------//\n",
- "\n",
- "print(f\"The similarity between A '{name_A}' and C '{name_C}' is {token_similarity(A, C)}\")\n",
+ "print(f\"The similarity between A '{name_A}' and C '{name_C}' is {round(sim_AC.item()*100,2)} %\")\n",
  "\n",
  "if (mix_method == \"None\"):\n",
  " print(\"No operation\")\n",
@@ -206,10 +194,9 @@
  "\n",
  "if (mix_method == \"Subtract\"):\n",
  " tmp = w*A - (1-w)*C\n",
- " _tmp = LA.vector_norm(tmp, ord=2)\n",
- " A = (_A/_tmp)*tmp\n",
+ " tmp = tmp/tmp.norm(p=2, dim=-1, keepdim=True)\n",
+ " A = tmp\n",
  " #//---//\n",
- " _A = LA.vector_norm(A, ord=2)\n",
  " print(f\"Tokenized prompt tensor A '{name_A}' token has been recalculated as A = _A*norm(w*A - (1-w)*C) , where C is '{name_C}' token , for w = {w} \")\n",
  "\n",
  "#OPTIONAL : Add/subtract + normalize above result with another token. Leave field empty to get a random value tensor\n",
@@ -217,12 +204,10 @@
  "dots = torch.zeros(NUM_TOKENS)\n",
  "for index in range(NUM_TOKENS):\n",
  " id_B = index\n",
- " B = token[id_B]\n",
- " _B = LA.vector_norm(B, ord=2)\n",
- " result = torch.dot(A,B)/(_A*_B)\n",
- " #result = absolute_value(result.item())\n",
- " result = result.item()\n",
- " dots[index] = result\n",
+ " B = torch.tensor(token[id_B])\n",
+ " B = B/B.norm(p=2, dim=-1, keepdim=True)\n",
+ " sim_AB = torch.dot(A,B)\n",
+ " dots[index] = sim_AB\n",
  "\n",
  "\n",
  "sorted, indices = torch.sort(dots,dim=0 , descending=True)\n",
@@ -262,11 +247,14 @@
  " if (print_Divider):\n",
  " print('--------')\n",
  "\n",
- "#Print the sorted list from above result"
+ "#Print the sorted list from above result\n",
+ "\n",
+ "#The prompt will be enclosed with the <|start-of-text|> and <|end-of-text|> tokens, which is why output will be [49406, ... , 49407].\n",
+ "\n",
+ "#You can leave the 'prompt' field empty to get a random value tensor. Since the tensor is random-valued, it will not correspond to any tensor in the vocab.json list , and thus it will have no ID."
  ],
  "metadata": {
- "id": "iWeFnT1gAx6A",
- "cellView": "form"
+ "id": "iWeFnT1gAx6A"
  },
  "execution_count": null,
  "outputs": []
@@ -395,8 +383,6 @@
  "\n",
  "for index in range(RANGE):\n",
  " id_C = START + index\n",
- " C = token[id_C]\n",
- " _C = LA.vector_norm(C, ord=2)\n",
  " name_C = vocab[id_C]\n",
  " is_Prefix = 0\n",
  "\n",
@@ -591,10 +577,7 @@
  "for index in range(NUM_PERMUTATIONS):\n",
  " print(names[indices[index].item()])\n",
  " print(f'similarity = {round(sorted[index].item(),2)} %')\n",
- " print('------')\n",
- "\n",
- "\n",
- "\n"
+ " print('------')"
  ],
  "metadata": {
  "collapsed": true,
@@ -607,36 +590,36 @@
  "cell_type": "code",
  "source": [
  "# @title 💫 Compare Text encodings\n",
- "\n",
  "prompt_A = \"banana\" # @param {\"type\":\"string\",\"placeholder\":\"Write a prompt\"}\n",
- "prompt_B = \"\" # @param {\"type\":\"string\",\"placeholder\":\"Write a prompt\"}\n",
- "use_token_padding = True # @param {type:\"boolean\"}\n",
- "\n",
+ "prompt_B = \"bike \" # @param {\"type\":\"string\",\"placeholder\":\"Write a prompt\"}\n",
+ "use_token_padding = True # param {type:\"boolean\"} <----- Enabled by default\n",
+ "#-----#\n",
+ "from transformers import AutoTokenizer\n",
+ "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\",\n",
+ "clean_up_tokenization_spaces = False)\n",
+ "#-----#\n",
  "from transformers import CLIPProcessor, CLIPModel\n",
- "\n",
  "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-large-patch14\" , clean_up_tokenization_spaces = True)\n",
- "\n",
  "model = CLIPModel.from_pretrained(\"openai/clip-vit-large-patch14\")\n",
- "\n",
- "ids_A = processor.tokenizer(text=prompt_A, padding=use_token_padding, return_tensors=\"pt\")\n",
- "text_encoding_A = model.get_text_features(**ids_A)\n",
- "\n",
- "\n",
- "ids_B = processor.tokenizer(text=prompt_B, padding=use_token_padding, return_tensors=\"pt\")\n",
- "text_encoding_B = model.get_text_features(**ids_B)\n",
- "\n",
- "similarity_str = 'The similarity between the text_encoding for A:\"' + prompt_A + '\" and B: \"' + prompt_B +'\" is ' + token_similarity(text_encoding_A[0] , text_encoding_B[0])\n",
- "\n",
- "\n",
- "print(similarity_str)\n",
- "#outputs = model(**inputs)\n",
- "#logits_per_image = outputs.logits_per_image # this is the image-text similarity score\n",
- "#probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities"
+ "#----#\n",
+ "inputs = tokenizer(text = prompt_A, padding=True, return_tensors=\"pt\")\n",
+ "text_features_A = model.get_text_features(**inputs)\n",
+ "text_features_A = text_features_A / text_features_A.norm(p=2, dim=-1, keepdim=True)\n",
+ "name_A = prompt_A\n",
+ "#----#\n",
+ "inputs = tokenizer(text = prompt_B, padding=True, return_tensors=\"pt\")\n",
+ "text_features_B = model.get_text_features(**inputs)\n",
+ "text_features_B = text_features_B / text_features_B.norm(p=2, dim=-1, keepdim=True)\n",
+ "name_B = prompt_B\n",
+ "#----#\n",
+ "import torch\n",
+ "sim_AB = torch.nn.functional.cosine_similarity(text_features_A, text_features_B)\n",
+ "#----#\n",
+ "print(f'The similarity between the text_encoding for A:\"{prompt_A}\" and B: \"{prompt_B}\" is {round(sim_AB.item()*100,2)} %')"
  ],
  "metadata": {
  "id": "QQOjh5BvnG8M",
- "collapsed": true,
- "cellView": "form"
+ "collapsed": true
  },
  "execution_count": null,
  "outputs": []
@@ -650,229 +633,6 @@
  "id": "hyK423TQCRup"
  }
  },
- {
- "cell_type": "markdown",
- "source": [
- "# ↓ Sub modules (use these to build your own projects) ↓"
- ],
- "metadata": {
- "id": "_d8WtPgtAymM"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "# @title 📝 -> 🆔 Tokenize prompt into IDs\n",
- "from transformers import AutoTokenizer\n",
- "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
- "\n",
- "prompt= \"banana\" # @param {type:'string'}\n",
- "\n",
- "tokenizer_output = tokenizer(text = prompt)\n",
- "input_ids = tokenizer_output['input_ids']\n",
- "print(input_ids)\n",
- "\n",
- "\n",
- "#The prompt will be enclosed with the <|start-of-text|> and <|end-of-text|> tokens, which is why output will be [49406, ... , 49407].\n",
- "\n",
- "#You can leave the 'prompt' field empty to get a random value tensor. Since the tensor is random value, it will not correspond to any tensor in the vocab.json list , and this it will have no ID."
- ],
- "metadata": {
- "id": "RPdkYzT2_X85",
- "cellView": "form"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# @title 🆔->🥢 Take the ID at index 1 from above result and get its corresponding tensor value\n",
- "\n",
- "id_A = input_ids[1]\n",
- "A = token[id_A]\n",
- "_A = LA.vector_norm(A, ord=2)\n",
- "\n",
- "#if no imput exists we just randomize the entire thing\n",
- "if (prompt == \"\"):\n",
- " id_A = -1\n",
- " print(\"Tokenized prompt tensor A is a random valued tensor with no ID\")\n",
- " R = torch.rand(768)\n",
- " _R = LA.vector_norm(R, ord=2)\n",
- " A = R*(_A/_R)\n",
- "\n",
- "#Save a copy of the tensor A\n",
- "id_P = id_A\n",
- "P = A\n",
- "_P = LA.vector_norm(A, ord=2)\n"
- ],
- "metadata": {
- "id": "YqdiF8DIz9Wu",
- "cellView": "form"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# @title 🥢 -> 🥢🔀 Take the ID at index 1 from above result and modify it (optional)\n",
- "mix_with = \"\" # @param {type:'string'}\n",
- "mix_method = \"None\" # @param [\"None\" , \"Average\", \"Subtract\"] {allow-input: true}\n",
- "w = 0.5 # @param {type:\"slider\", min:0, max:1, step:0.01}\n",
- "\n",
- "#------#\n",
- "#If set to TRUE , this will use the output of this cell , tensor A, as the input of this cell the 2nd time we run it. Use this feature to mix many tokens into A\n",
- "re_iterate_tensor_A = True # @param {\"type\":\"boolean\"}\n",
- "if (re_iterate_tensor_A == False) :\n",
- " #prevent re-iterating A by reading from stored copy\n",
- " id_A = id_P\n",
- " A = P\n",
- " _A = _P\n",
- "#----#\n",
- "\n",
- "tokenizer_output = tokenizer(text = mix_with)\n",
- "input_ids = tokenizer_output['input_ids']\n",
- "id_C = input_ids[1]\n",
- "C = token[id_C]\n",
- "_C = LA.vector_norm(C, ord=2)\n",
- "\n",
- "#if no imput exists we just randomize the entire thing\n",
- "if (mix_with == \"\"):\n",
- " id_C = -1\n",
- " print(\"Tokenized prompt 'mix_with' tensor C is a random valued tensor with no ID\")\n",
- " R = torch.rand(768)\n",
- " _R = LA.vector_norm(R, ord=2)\n",
- " C = R*(_C/_R)\n",
- "\n",
- "if (mix_method == \"None\"):\n",
- " print(\"No operation\")\n",
- "\n",
- "if (mix_method == \"Average\"):\n",
- " A = w*A + (1-w)*C\n",
- " _A = LA.vector_norm(A, ord=2)\n",
- " print(\"Tokenized prompt tensor A has been recalculated as A = w*A + (1-w)*C , where C is the tokenized prompt 'mix_with' tensor C\")\n",
- "\n",
- "if (mix_method == \"Subtract\"):\n",
- " tmp = (A/_A) - (C/_C)\n",
- " _tmp = LA.vector_norm(tmp, ord=2)\n",
- " A = tmp*((w*_A + (1-w)*_C)/_tmp)\n",
- " _A = LA.vector_norm(A, ord=2)\n",
- " print(\"Tokenized prompt tensor A has been recalculated as A = (w*_A + (1-w)*_C) * norm(w*A - (1-w)*C) , where C is the tokenized prompt 'mix_with' tensor C\")\n",
- "\n",
- "#OPTIONAL : Add/subtract + normalize above result with another token. Leave field empty to get a random value tensor"
- ],
- "metadata": {
- "id": "oXbNSRSKPgRr",
- "collapsed": true,
- "cellView": "form"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "\n",
- "# @title 🥢->🧾🥢 Find Similiar Tokens to ID at index 1 from above result\n",
- "dots = torch.zeros(NUM_TOKENS)\n",
- "for index in range(NUM_TOKENS):\n",
- " id_B = index\n",
- " B = token[id_B]\n",
- " _B = LA.vector_norm(B, ord=2)\n",
- " result = torch.dot(A,B)/(_A*_B)\n",
- " #result = absolute_value(result.item())\n",
- " result = result.item()\n",
- " dots[index] = result\n",
- "\n",
- "name_A = \"A of random type\"\n",
- "if (id_A>-1):\n",
- " name_A = vocab[id_A]\n",
- "\n",
- "name_C = \"token C of random type\"\n",
- "if (id_C>-1):\n",
- " name_C = vocab[id_C]\n",
- "\n",
- "\n",
- "sorted, indices = torch.sort(dots,dim=0 , descending=True)\n",
- "#----#\n",
- "if (mix_method == \"Average\"):\n",
- " print(f'Calculated all cosine-similarities between the average of token {name_A} and {name_C} with Id_A = {id_A} and mixed Id_C = {id_C} as a 1x{sorted.shape[0]} tensor')\n",
- "if (mix_method == \"Subtract\"):\n",
- " print(f'Calculated all cosine-similarities between the subtract of token {name_A} and {name_C} with Id_A = {id_A} and mixed Id_C = {id_C} as a 1x{sorted.shape[0]} tensor')\n",
- "if (mix_method == \"None\"):\n",
- " print(f'Calculated all cosine-similarities between the token {name_A} with Id_A = {id_A} with the the rest of the {NUM_TOKENS} tokens as a 1x{sorted.shape[0]} tensor')\n",
- "\n",
- "#Produce a list id IDs that are most similiar to the prompt ID at positiion 1 based on above result"
- ],
- "metadata": {
- "id": "juxsvco9B0iV",
- "collapsed": true,
- "cellView": "form"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "source": [],
- "metadata": {
- "id": "cYYu5C5C6MHH"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "# @title 🥢🧾 -> 🖨️ Print Result from the 'Similiar Tokens' list from above result\n",
- "list_size = 100 # @param {type:'number'}\n",
- "print_ID = False # @param {type:\"boolean\"}\n",
- "print_Similarity = True # @param {type:\"boolean\"}\n",
- "print_Name = True # @param {type:\"boolean\"}\n",
- "print_Divider = True # @param {type:\"boolean\"}\n",
- "\n",
- "for index in range(list_size):\n",
- " id = indices[index].item()\n",
- " if (print_Name):\n",
- " print(f'{vocab[id]}') # vocab item\n",
- " if (print_ID):\n",
- " print(f'ID = {id}') # IDs\n",
- " if (print_Similarity):\n",
- " print(f'similiarity = {round(sorted[index].item()*100,2)} %') # % value\n",
- " if (print_Divider):\n",
- " print('--------')\n",
- "\n",
- "#Print the sorted list from above result"
- ],
- "metadata": {
- "id": "YIEmLAzbHeuo",
- "collapsed": true,
- "cellView": "form"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "\n",
- "# @title 🆔 Get similarity % of two token IDs\n",
- "id_for_token_A = 4567 # @param {type:'number'}\n",
- "id_for_token_B = 4343 # @param {type:'number'}\n",
- "\n",
- "similarity_str = 'The similarity between tokens A and B is ' + similarity(id_for_token_A , id_for_token_B)\n",
- "\n",
- "print(similarity_str)\n",
- "\n",
- "#Valid ID ranges for id_for_token_A / id_for_token_B are between 0 and 49407"
- ],
- "metadata": {
- "id": "MwmOdC9cNZty",
- "collapsed": true,
- "cellView": "form"
- },
- "execution_count": null,
- "outputs": []
- },
  {
  "cell_type": "markdown",
  "source": [
 