This notebook is a Stable Diffusion tool for finding tokens in the SD 1.5 vocab.json that are similar to a given token, so that you can use them in text-to-image prompts.

In [1]:
# Load the tokens into the colab
# The cloned dataset repo provides both files read below:
# 'sd15_tensors.pt' (this cell) and 'vocab.json' (next section).
!git clone https://huggingface.co/datasets/codeShare/sd_tokens
import torch
from torch import linalg as LA
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Change into the clone so the relative file names below resolve.
# NOTE(review): the absolute /content path presumably only exists on Colab — confirm before running elsewhere.
%cd /content/sd_tokens
# `token` is indexed by token ID throughout the notebook (token[id] -> one embedding vector).
token = torch.load('sd15_tensors.pt', map_location=device, weights_only=True)
#-----#

#Import the vocab.json
import json
import pandas as pd
# vocab.json maps token text -> integer ID. UTF-8 is requested explicitly
# because the token strings include non-ASCII characters.
with open('vocab.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Inverted lookup (ID -> token text) with plain Python int keys, built
# directly from the dict instead of going through a pandas round-trip.
vocab = {
    token_id: token_text for token_text, token_id in data.items()
}
#-----#

# Define functions/constants
# Number of token IDs scanned by the similarity search: range(NUM_TOKENS)
# visits IDs 0 .. NUM_TOKENS-1.
# NOTE(review): the tokenizer cell's recorded output ends with ID 49407, which
# that range excludes — confirm this is intended.
NUM_TOKENS = 49407

def absolute_value(x):
    """Return the magnitude of ``x``.

    Delegates to the builtin ``abs`` so that negative zero maps to ``0.0``
    and any object implementing ``__abs__`` (including tensors with more
    than one element) is accepted.

    Args:
        x: A number or any object supporting ``abs()``.

    Returns:
        The absolute value of ``x``, of the type ``abs(x)`` produces.
    """
    return abs(x)

def similarity(id_A , id_B):
  """Return the absolute cosine similarity of two token embeddings as text.

  Args:
    id_A: Index into the global `token` container for the first embedding.
    id_B: Index into the global `token` container for the second embedding.

  Returns:
    A string such as '38.93 %': the cosine similarity scaled to percent,
    with its sign dropped, rounded to 3 decimals and followed by ' %'.
  """
  vec_a, vec_b = token[id_A], token[id_B]
  # Product of the Euclidean (L2) lengths of the two vectors.
  norm_product = LA.vector_norm(vec_a, ord=2)*LA.vector_norm(vec_b, ord=2)
  cosine = torch.dot(vec_a,vec_b)/norm_product
  percent = absolute_value(cosine.item()*100)
  return f'{round(percent, 3)} %'
#----#

# Defaults for the optional mixing step, so the `mix_method` checks in the
# similarity-search cell still work when the mixing cell is skipped.
mix_with = ""
mix_method = "None"

Cloning into 'sd_tokens'...
remote: Enumerating objects: 7, done.[K
remote: Counting objects: 100% (4/4), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 7 (delta 0), reused 0 (delta 0), pack-reused 3 (from 1)[K
Unpacking objects: 100% (7/7), 305.93 KiB | 5.67 MiB/s, done.
/content/sd_tokens


In [None]:
# Sanity check: the vocab item for ID 8922, then the dimension of its token
# embedding, one per line.
print(vocab[8922], token[8922].shape, sep='\n')

Get the IDs from a prompt text.

The prompt will be enclosed by the <|startoftext|> and <|endoftext|> tokens (IDs 49406 and 49407 in the output below).

In [2]:
from transformers import AutoTokenizer
# Tokenizer loaded from the checkpoint named below; it turns the prompt text into token IDs.
tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-large-patch14", clean_up_tokenization_spaces = False)
prompt= "banana" # @param {type:'string'}
tokenizer_output = tokenizer(text = prompt)
input_ids = tokenizer_output['input_ids']
print(input_ids)
# Index 0 is the start-of-text marker (49406 in the recorded output), so
# index 1 is the first token of the prompt. Only that single token is used;
# any further prompt tokens are ignored.
id_A = input_ids[1]
# Embedding of that token and its L2 norm; both are reused by later cells.
A = token[id_A]
_A = LA.vector_norm(A, ord=2)

tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

[49406, 8922, 49407]


OPTIONAL: Mix the above result with another token, using either a weighted average or a weighted subtraction followed by rescaling.

In [5]:
mix_with = "" # @param {type:'string'}
mix_method = "None" # @param ["None" , "Average", "Subtract"] {allow-input: true}
w = 0.5 # @param {type:"slider", min:0, max:1, step:0.01}

# Always restart from the unmixed prompt embedding, so that re-running this
# cell (e.g. after moving the slider) never blends an already-blended vector.
A = token[id_A]
_A = LA.vector_norm(A, ord=2)

# Index 0 of the tokenizer output is the start-of-text marker, so index 1 is
# the first token of `mix_with` (the end-of-text marker when it is empty).
tokenizer_output = tokenizer(text = mix_with)
input_ids = tokenizer_output['input_ids']
id_C = input_ids[1]
C = token[id_C]
_C = LA.vector_norm(C, ord=2)

if (mix_method ==  "Average"):
  # Weighted average of the two embeddings; w is the weight of the prompt token.
  A = w*A + (1-w)*C
  _A = LA.vector_norm(A, ord=2)

if (mix_method ==  "Subtract"):
  tmp = w*A - (1-w)*C
  _tmp = LA.vector_norm(tmp, ord=2)
  # A zero-length difference cannot be rescaled; dividing by it would turn
  # every later similarity into NaN, so fail loudly instead.
  if _tmp.item() == 0:
    raise ValueError('Subtract produced a zero vector; choose a different token or weight.')
  # Rescale the difference to the weighted mean of the two original lengths.
  A = tmp*((w*_A + (1-w)*_C)/_tmp)
  _A = LA.vector_norm(A, ord=2)




Produce a list of IDs that are most similar to the prompt ID at position 1, based on the above result.

In [6]:

# Cosine similarity between the (optionally mixed) prompt vector A and each of
# the first NUM_TOKENS embeddings, computed as one batched matrix-vector
# product instead of one Python-level dot product per token.
# torch.stack over per-ID lookups relies only on the `token[id]` access
# pattern already used elsewhere in this notebook.
candidates = torch.stack([token[index] for index in range(NUM_TOKENS)])
candidate_norms = LA.vector_norm(candidates, ord=2, dim=1)
# The sign is dropped, so strongly opposed tokens rank as high as aligned ones.
# The result is moved to a float32 CPU tensor, matching the previous `dots`.
dots = torch.abs(torch.mv(candidates, A)/(_A*candidate_norms)).float().cpu()

# NOTE: `sorted` shadows the Python builtin, but the printing cell reads
# `sorted` and `indices`, so both names are kept.
sorted, indices = torch.sort(dots,dim=0 , descending=True)
#----#
if (mix_method ==  "Average"):
  print(f'Calculated all cosine-similarities between the average of token {vocab[id_A]} and {vocab[id_C]} with ID = {id_A} and mixed ID = {id_C} as a 1x{sorted.shape[0]} tensor')
if (mix_method ==  "Subtract"):
  print(f'Calculated all cosine-similarities between the subtract of token {vocab[id_A]} and {vocab[id_C]} with ID = {id_A} and mixed ID = {id_C} as a 1x{sorted.shape[0]} tensor')
if (mix_method ==  "None"):
  print(f'Calculated all cosine-similarities between the token {vocab[id_A]} with ID = {id_A} the rest of the {NUM_TOKENS} tokens as a 1x{sorted.shape[0]} tensor')

Calculated all cosine-similarities between the token banana</w> with ID = 8922 the rest of the 49407 tokens as a 1x49407 tensor


Print the sorted list from above result

In [7]:
list_size = 100 # @param {type:'number'}

print_ID = False # @param {type:"boolean"}
print_Similarity = True # @param {type:"boolean"}
print_Name = True # @param {type:"boolean"}
print_Divider = True # @param {type:"boolean"}

# Never read past the end of the ranking, even when list_size is set larger
# than the number of scored tokens; int() also accepts a value such as 100.0.
rows_to_print = min(int(list_size), indices.shape[0])

for index in range(rows_to_print):
  # Named token_id rather than `id` so the builtin id() is not shadowed.
  token_id = indices[index].item()
  if (print_Name):
    print(f'{vocab[token_id]}') # vocab item
  if (print_ID):
    print(f'ID = {token_id}') # IDs
  if (print_Similarity):
    print(f'similiarity = {round(sorted[index].item()*100,2)} %') # % value
  if (print_Divider):
    print('--------')

banana</w>
similiarity = 100.0 %
--------
bananas</w>
similiarity = 38.93 %
--------
banan
similiarity = 30.8 %
--------
ðŁįĮ</w>
similiarity = 27.12 %
--------
pineapple</w>
similiarity = 19.7 %
--------
chicken</w>
similiarity = 19.24 %
--------
potassium</w>
similiarity = 19.21 %
--------
sausage</w>
similiarity = 19.07 %
--------
lemon</w>
similiarity = 18.82 %
--------
orange</w>
similiarity = 18.42 %
--------
peanut</w>
similiarity = 17.84 %
--------
parachute</w>
similiarity = 17.19 %
--------
duck
similiarity = 16.8 %
--------
yellow</w>
similiarity = 16.21 %
--------
grape</w>
similiarity = 16.19 %
--------
kangaroo</w>
similiarity = 16.13 %
--------
apple</w>
similiarity = 16.13 %
--------
tangerine</w>
similiarity = 16.08 %
--------
giraffe</w>
similiarity = 16.04 %
--------
mango</w>
similiarity = 16.03 %
--------
rubber</w>
similiarity = 15.95 %
--------
bamboo</w>
similiarity = 15.88 %
--------
umbrella</w>
similiarity = 15.82 %
--------
nutella</w>
similiarity = 15.69 %


Find the similarity between two tokens, given their IDs

Valid IDs for id_for_token_A / id_for_token_B range from 0 to 49407.

In [None]:
id_for_token_A = 4567 # @param {type:'number'}
id_for_token_B = 4343 # @param {type:'number'}

# similarity() already returns a formatted percentage string, so it is
# interpolated straight into the message.
similarity_str = f'The similarity between tokens A and B is {similarity(id_for_token_A , id_for_token_B)}'

print(similarity_str)