This notebook is a Stable Diffusion tool that lets you find similar tokens in the SD 1.5 vocab.json, which you can then use for text-to-image generation.

In [None]:
# Load the tokens into the colab
!git clone https://huggingface.co/datasets/codeShare/sd_tokens
import torch
from torch import linalg as LA
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
%cd /content/sd_tokens
token = torch.load('sd15_tensors.pt', map_location=device, weights_only=True)
#-----#

#Import the vocab.json
import json
import pandas as pd
with open('vocab.json', 'r') as f:
 data = json.load(f)

_df = pd.DataFrame({'count': data})['count']

vocab = {
 value: key for key, value in _df.items()
}
#-----#

# Define functions/constants
# Number of token IDs scanned by the similarity search: the search loop runs
# over range(NUM_TOKENS), i.e. IDs 0 .. NUM_TOKENS-1.
# NOTE(review): the notebook text says valid IDs go up to 49407, which this
# range excludes -- confirm whether skipping the last ID is intended.
NUM_TOKENS = 49407

def absolute_value(x):
    """Return the larger of ``x`` and ``-x``."""
    negated = -x
    # Mirrors max(x, -x): the first argument wins unless the second is strictly larger.
    if negated > x:
        return negated
    return x

def similarity(id_A , id_B):
    """Return the cosine similarity of two token tensors as a percentage string.

    Args:
        id_A: Index into the module-level ``token`` container for the first tensor.
        id_B: Index into ``token`` for the second tensor.

    Returns:
        A string such as ``'12.345 %'``: the cosine of the angle between the two
        tensors, multiplied by 100 and rounded to three decimals.
    """
    vec_a, vec_b = token[id_A], token[id_B]
    # Product of the Euclidean (order-2) lengths of both vectors.
    length_product = LA.vector_norm(vec_a, ord=2) * LA.vector_norm(vec_b, ord=2)
    cosine = torch.dot(vec_a, vec_b) / length_product
    percent = round(cosine.item() * 100, 3)
    return f'{percent} %'
#----#

# Defaults for the optional mixing step. The mixing cell reassigns both from
# its form fields; the similarity-ranking cell reads mix_method to choose which
# summary message to print.
mix_with = ""
mix_method = "None"

In [None]:
#print(vocab[8922]) #the vocab item for ID 8922
#print(token[8922].shape) #dimension of the token

Get the IDs from a prompt text.

The prompt will be enclosed by the `<|startoftext|>` and `<|endoftext|>` tokens, and only the token at position 1 (the first token of the prompt) is used. Leave the field empty to get a random-valued tensor.

In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-large-patch14", clean_up_tokenization_spaces = False)

prompt= "banana" # @param {type:'string'}

tokenizer_output = tokenizer(text = prompt)
input_ids = tokenizer_output['input_ids']
print(input_ids)
# Position 0 is the start marker, so position 1 is the first token after it
# (for an empty prompt that is whatever the tokenizer emits next -- presumably
# the end-of-text marker).
id_A = input_ids[1]
A = token[id_A]
_A = LA.vector_norm(A, ord=2)

# If no input exists, replace A with a random tensor rescaled to the length of
# the tensor looked up above.
if (prompt == ""):
    id_A = -1  # -1 marks "random tensor, no vocab ID" for the later cells
    print("Tokenized prompt tensor A is a random valued tensor with no ID")
    # Create the random tensor with A's shape, dtype and device; a CPU-only
    # torch.rand(768) cannot be combined with CUDA tensors further down.
    R = torch.rand(A.shape, dtype=A.dtype, device=A.device)
    _R = LA.vector_norm(R, ord=2)
    A = R*(_A/_R)

# Save a copy of the tensor A (the mixing cell restores from it on every run).
# Copy the *current* A and id_A: re-reading token[id_A] here would turn the
# random case (id_A == -1) into token[-1] and silently drop the random tensor.
id_P = id_A
P = A
_P = LA.vector_norm(A, ord=2)

OPTIONAL: Average the above result with another token, or subtract that token from it, and renormalize. Leave the field empty to mix with a random-valued tensor.

In [None]:
mix_with = "" # @param {type:'string'}
mix_method = "None" # @param ["None" , "Average", "Subtract"] {allow-input: true}
w = 0.5 # @param {type:"slider", min:0, max:1, step:0.01}

# Restore A from the stored copy so that re-running this cell never mixes an
# already-mixed tensor a second time.
id_A = id_P
A = P
_A = _P
#----#

tokenizer_output = tokenizer(text = mix_with)
input_ids = tokenizer_output['input_ids']
# As for the prompt, only the token at position 1 is used.
id_C = input_ids[1]
C = token[id_C]
_C = LA.vector_norm(C, ord=2)

# If no input exists, replace C with a random tensor rescaled to the length of
# the tensor looked up above.
if (mix_with == ""):
    id_C = -1  # -1 marks "random tensor, no vocab ID"
    print("Tokenized prompt 'mix_with' tensor C is a random valued tensor with no ID")
    # Match C's shape, dtype and device; a CPU-only torch.rand(768) cannot be
    # combined with CUDA tensors.
    R = torch.rand(C.shape, dtype=C.dtype, device=C.device)
    _R = LA.vector_norm(R, ord=2)
    C = R*(_C/_R)

if (mix_method == "None"):
    print("No operation")
elif (mix_method == "Average"):
    # Weighted average of the two tensors; w is the weight of A.
    A = w*A + (1-w)*C
    _A = LA.vector_norm(A, ord=2)
    print("Tokenized prompt tensor A has been recalculated as A = w*A + (1-w)*C , where C is the tokenized prompt 'mix_with' tensor C")
elif (mix_method == "Subtract"):
    # Difference of the two unit directions, rescaled to a weighted average of
    # the original lengths.
    tmp = (A/_A) - (C/_C)
    _tmp = LA.vector_norm(tmp, ord=2)
    if (_tmp.item() == 0):
        # A and C point in exactly the same direction (e.g. the same token):
        # the difference has no direction and dividing by its length gives NaNs.
        raise ValueError("Cannot subtract: tensors A and C have the same direction, so the difference is a zero vector")
    A = tmp*((w*_A + (1-w)*_C)/_tmp)
    _A = LA.vector_norm(A, ord=2)
    # The message states the formula that is actually computed above.
    print("Tokenized prompt tensor A has been recalculated as A = (w*_A + (1-w)*_C) * norm(A/_A - C/_C) , where C is the tokenized prompt 'mix_with' tensor C")
else:
    # The form accepts free text, so report values that match no operation.
    print(f"Unknown mix_method '{mix_method}': no operation")




Produce a list of IDs that are most similar to the prompt ID at position 1, based on the above result.

In [None]:

# Cosine similarity between A and every token ID in range(NUM_TOKENS), computed
# with one matrix-vector product instead of a Python loop over all IDs.
# Rows are gathered through the same token[index] lookup used elsewhere in the
# notebook (this makes a one-off stacked copy).
candidates = torch.stack([token[index] for index in range(NUM_TOKENS)])
candidate_norms = LA.vector_norm(candidates, ord=2, dim=1)
# Keep the result as a float32 CPU tensor, as the print cell reads it item by item.
dots = ((candidates @ A) / (candidate_norms * _A)).float().cpu()

name_A = "A of random type"
if (id_A>-1):
    name_A = vocab[id_A]

# The mixing cell is optional; when it was skipped there is no tensor C and
# id_C has never been assigned, so fall back to the "no ID" marker.
if ('id_C' not in globals()):
    id_C = -1

name_C = "token C of random type"
if (id_C>-1):
    name_C = vocab[id_C]


# `sorted` holds the similarities in descending order, `indices` the matching
# token IDs; both names are read by the print cell.
sorted, indices = torch.sort(dots,dim=0 , descending=True)
#----#
if (mix_method == "Average"):
    print(f'Calculated all cosine-similarities between the average of token {name_A} and {name_C} with Id_A = {id_A} and mixed Id_C = {id_C} as a 1x{sorted.shape[0]} tensor')
if (mix_method == "Subtract"):
    print(f'Calculated all cosine-similarities between the subtract of token {name_A} and {name_C} with Id_A = {id_A} and mixed Id_C = {id_C} as a 1x{sorted.shape[0]} tensor')
if (mix_method == "None"):
    print(f'Calculated all cosine-similarities between the token {name_A} with Id_A = {id_A} with the the rest of the {NUM_TOKENS} tokens as a 1x{sorted.shape[0]} tensor')

Print the sorted list from the above result.

In [None]:
list_size = 100 # @param {type:'number'}

print_ID = False # @param {type:"boolean"}
print_Similarity = True # @param {type:"boolean"}
print_Name = True # @param {type:"boolean"}
print_Divider = True # @param {type:"boolean"}

# Never read past the end of the ranking, and accept a non-integer form value.
num_rows = min(int(list_size), indices.shape[0])

for index in range(num_rows):
    # Named token_id rather than id, so the builtin id() is not shadowed.
    token_id = indices[index].item()
    if (print_Name):
        print(f'{vocab[token_id]}') # vocab item
    if (print_ID):
        print(f'ID = {token_id}') # IDs
    if (print_Similarity):
        print(f'similiarity = {round(sorted[index].item()*100,2)} %') # % value
    if (print_Divider):
        print('--------')

Find the similarity between two tokens, given their IDs

Valid ID ranges for id_for_token_A / id_for_token_B are between 0 and 49407

In [None]:
id_for_token_A = 4567 # @param {type:'number'}
id_for_token_B = 4343 # @param {type:'number'}

# similarity() already returns a formatted percentage string, so it can be
# interpolated straight into the sentence.
similarity_str = f'The similarity between tokens A and B is {similarity(id_for_token_A , id_for_token_B)}'

print(similarity_str)