{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "This notebook is a Stable Diffusion tool that lets you find similar tokens in the SD 1.5 vocab.json, which you can use for text-to-image generation."
      ],
      "metadata": {
        "id": "L7JTcbOdBPfh"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Imports for this cell, grouped up front\n",
        "import json\n",
        "\n",
        "import pandas as pd\n",
        "import torch\n",
        "from torch import linalg as LA\n",
        "\n",
        "# Load the tokens into the colab and work from inside the cloned folder\n",
        "!git clone https://huggingface.co/datasets/codeShare/sd_tokens\n",
        "%cd /content/sd_tokens\n",
        "\n",
        "# Put the tensors on the GPU when one is available; weights_only=True restricts what torch.load will unpickle\n",
        "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
        "token = torch.load('sd15_tensors.pt', map_location=device, weights_only=True)\n",
        "#-----#\n",
        "\n",
        "# Import the vocab.json; read it as UTF-8 explicitly instead of relying on the platform default encoding\n",
        "with open('vocab.json', 'r', encoding='utf-8') as f:\n",
        "    data = json.load(f)\n",
        "\n",
        "_df = pd.DataFrame({'count': data})['count']\n",
        "\n",
        "# Invert the mapping so that a value from vocab.json looks up its key (later cells index vocab by token ID)\n",
        "vocab = {\n",
        "    value: key for key, value in _df.items()\n",
        "}\n",
        "#-----#\n",
        "\n",
        "# Define functions/constants\n",
        "# Number of token IDs the similarity scan loops over\n",
        "NUM_TOKENS = 49407\n",
        "\n",
        "def absolute_value(x):\n",
        "    return max(x, -x)\n",
        "\n",
        "def similarity(id_A , id_B):\n",
        "  \"\"\"Return the cosine similarity of two token tensors as a percentage string.\n",
        "\n",
        "  Args:\n",
        "    id_A: index into ``token`` for the first tensor.\n",
        "    id_B: index into ``token`` for the second tensor.\n",
        "\n",
        "  Returns:\n",
        "    A string of the form ``'<value> %'``: the cosine similarity times 100,\n",
        "    rounded to 3 decimals. The sign is kept (no absolute value is taken).\n",
        "  \"\"\"\n",
        "  vec_a, vec_b = token[id_A], token[id_B]\n",
        "  # Cosine similarity = dot product divided by the product of the two L2 norms\n",
        "  norm_product = LA.vector_norm(vec_a, ord=2) * LA.vector_norm(vec_b, ord=2)\n",
        "  cosine = torch.dot(vec_a, vec_b) / norm_product\n",
        "  percent = round(cosine.item() * 100, 3)\n",
        "  return f'{percent} %'\n",
        "#----#\n",
        "\n",
        "mix_with = \"\"\n",
        "mix_method = \"None\""
      ],
      "metadata": {
        "id": "Ch9puvwKH1s3"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "#print(vocab[8922]) #the vocab item for ID 8922\n",
        "#print(token[8922].shape)  #dimension of the token"
      ],
      "metadata": {
        "id": "S_Yh9gH_OUA1"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Get the IDs from a prompt text.\n",
        "\n",
        "The prompt will be enclosed by the `<|startoftext|>` and `<|endoftext|>` tokens, and only the first token of the prompt (position 1) is used. Leave the field empty to get a random-valued tensor."
      ],
      "metadata": {
        "id": "f1-jS7YJApiO"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from transformers import AutoTokenizer\n",
        "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
        "\n",
        "prompt= \"banana\" # @param {type:'string'}\n",
        "\n",
        "# The prompt is enclosed by start/end tokens, so position 1 holds the first token of the prompt\n",
        "tokenizer_output = tokenizer(text = prompt)\n",
        "input_ids = tokenizer_output['input_ids']\n",
        "print(input_ids)\n",
        "id_A = input_ids[1]\n",
        "A = token[id_A]\n",
        "_A = LA.vector_norm(A, ord=2)\n",
        "\n",
        "# If no input exists, replace A with a random tensor scaled to the length of the tensor looked up above\n",
        "if (prompt == \"\"):\n",
        "  id_A = -1  # sentinel: this tensor has no vocab ID\n",
        "  print(\"Tokenized prompt tensor A is a random valued tensor with no ID\")\n",
        "  # rand_like keeps the shape, device and dtype of the token tensor, so the later torch.dot\n",
        "  # against tensors from `token` does not mix devices or dtypes\n",
        "  R = torch.rand_like(A)\n",
        "  _R =  LA.vector_norm(R, ord=2)\n",
        "  A = R*(_A/_R)\n",
        "\n",
        "# Save a copy of tensor A exactly as computed above (random case included), so the\n",
        "# mixing cell can be re-run from the same starting point\n",
        "id_P = id_A\n",
        "P = A\n",
        "_P = LA.vector_norm(A, ord=2)"
      ],
      "metadata": {
        "id": "RPdkYzT2_X85"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "OPTIONAL: Mix the above result with another token (weighted average, or normalized subtraction). Leave the field empty to mix with a random-valued tensor."
      ],
      "metadata": {
        "id": "JKnz0aLFVGXc"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "mix_with = \"\" # @param {type:'string'}\n",
        "mix_method = \"None\" # @param [\"None\" , \"Average\", \"Subtract\"] {allow-input: true}\n",
        "w = 0.5 # @param {type:\"slider\", min:0, max:1, step:0.01}\n",
        "\n",
        "# Start from the stored copy so re-running this cell does not mix an already-mixed A again\n",
        "id_A = id_P\n",
        "A = P\n",
        "_A = _P\n",
        "#----#\n",
        "\n",
        "# Position 1 of the tokenized text is used as tensor C\n",
        "tokenizer_output = tokenizer(text = mix_with)\n",
        "input_ids = tokenizer_output['input_ids']\n",
        "id_C = input_ids[1]\n",
        "C = token[id_C]\n",
        "_C = LA.vector_norm(C, ord=2)\n",
        "\n",
        "# If no input exists, replace C with a random tensor scaled to the length of the tensor looked up above\n",
        "if (mix_with == \"\"):\n",
        "  id_C = -1  # sentinel: this tensor has no vocab ID\n",
        "  print(\"Tokenized prompt  'mix_with' tensor C is a random valued tensor with no ID\")\n",
        "  # rand_like keeps the shape, device and dtype of the token tensor, so mixing with A\n",
        "  # and the later torch.dot calls do not mix devices or dtypes\n",
        "  R = torch.rand_like(C)\n",
        "  _R =  LA.vector_norm(R, ord=2)\n",
        "  C = R*(_C/_R)\n",
        "\n",
        "# The three methods are mutually exclusive, so dispatch with a single if/elif chain\n",
        "if (mix_method ==  \"None\"):\n",
        "  print(\"No operation\")\n",
        "elif (mix_method ==  \"Average\"):\n",
        "  # Weighted average of the two tensors; w is the weight of A\n",
        "  A = w*A + (1-w)*C\n",
        "  _A = LA.vector_norm(A, ord=2)\n",
        "  print(\"Tokenized prompt tensor A has been recalculated as A = w*A + (1-w)*C , where C is the tokenized prompt  'mix_with' tensor C\")\n",
        "elif (mix_method ==  \"Subtract\"):\n",
        "  # Difference of the two unit-length directions, rescaled to the weighted average of both lengths.\n",
        "  # NOTE: w only affects the resulting length here, not the direction; the message below states this formula.\n",
        "  tmp = (A/_A) - (C/_C)\n",
        "  _tmp = LA.vector_norm(tmp, ord=2)\n",
        "  A = tmp*((w*_A + (1-w)*_C)/_tmp)\n",
        "  _A = LA.vector_norm(A, ord=2)\n",
        "  print(\"Tokenized prompt tensor A has been recalculated as A = (w*_A + (1-w)*_C) * norm(A/_A - C/_C) , where C is the tokenized prompt 'mix_with' tensor C\")\n",
        "\n",
        "\n"
      ],
      "metadata": {
        "id": "oXbNSRSKPgRr"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Produce a list of IDs that are most similar to the prompt ID at position 1, based on the above result."
      ],
      "metadata": {
        "id": "3uBSZ1vWVCew"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "\n",
        "# Cosine similarity between A and each of the NUM_TOKENS token tensors\n",
        "dots = torch.zeros(NUM_TOKENS)\n",
        "for index in range(NUM_TOKENS):\n",
        "  B = token[index]\n",
        "  _B = LA.vector_norm(B, ord=2)\n",
        "  # The sign is kept on purpose (no absolute value), so opposite directions sort last\n",
        "  dots[index] = (torch.dot(A,B)/(_A*_B)).item()\n",
        "\n",
        "# An ID of -1 marks a random tensor that has no vocab entry\n",
        "name_A = vocab[id_A] if (id_A>-1) else \"A of random type\"\n",
        "\n",
        "# NOTE: `sorted` shadows the builtin; the name is kept because the printing cell reads it\n",
        "sorted, indices = torch.sort(dots,dim=0 , descending=True)\n",
        "#----#\n",
        "if (mix_method ==  \"None\"):\n",
        "  print(f'Calculated all cosine-similarities between the token {name_A} with Id_A = {id_A} with the the rest of the {NUM_TOKENS} tokens as a 1x{sorted.shape[0]} tensor')\n",
        "elif (mix_method ==  \"Average\" or mix_method ==  \"Subtract\"):\n",
        "  # id_C only exists when the optional mixing cell has been run, so read it only in the\n",
        "  # branches that report a mix; this keeps the cell working when that cell is skipped\n",
        "  name_C = vocab[id_C] if (id_C>-1) else \"token C of random type\"\n",
        "  operation = \"average\" if (mix_method ==  \"Average\") else \"subtract\"\n",
        "  print(f'Calculated all cosine-similarities between the {operation} of token {name_A} and {name_C} with Id_A = {id_A} and mixed Id_C = {id_C} as a 1x{sorted.shape[0]} tensor')"
      ],
      "metadata": {
        "id": "juxsvco9B0iV"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Print the sorted list from the above result."
      ],
      "metadata": {
        "id": "y-Ig3glrVQC3"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "list_size = 100 # @param {type:'number'}\n",
        "\n",
        "print_ID = False # @param {type:\"boolean\"}\n",
        "print_Similarity = True # @param {type:\"boolean\"}\n",
        "print_Name = True # @param {type:\"boolean\"}\n",
        "print_Divider = True # @param {type:\"boolean\"}\n",
        "\n",
        "# Never request more rows than were computed; int() also accepts a float typed into the form field\n",
        "row_count = min(int(list_size), len(indices))\n",
        "\n",
        "for index in range(row_count):\n",
        "  # Named token_id rather than `id` so the builtin id() is not shadowed for the rest of the session\n",
        "  token_id = indices[index].item()\n",
        "  if (print_Name):\n",
        "    print(f'{vocab[token_id]}') # vocab item\n",
        "  if (print_ID):\n",
        "    print(f'ID = {token_id}') # IDs\n",
        "  if (print_Similarity):\n",
        "    print(f'similiarity = {round(sorted[index].item()*100,2)} %') # % value\n",
        "  if (print_Divider):\n",
        "    print('--------')"
      ],
      "metadata": {
        "id": "YIEmLAzbHeuo",
        "collapsed": true
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Find the most similar tokens for a given input."
      ],
      "metadata": {
        "id": "qqZ5DvfLBJnw"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "Valid ID ranges for id_for_token_A / id_for_token_B are between 0 and 49407"
      ],
      "metadata": {
        "id": "kX72bAuhOtlT"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "id_for_token_A = 4567 # @param {type:'number'}\n",
        "id_for_token_B = 4343 # @param {type:'number'}\n",
        "\n",
        "# similarity() already returns a formatted percentage string, so it can be embedded directly\n",
        "similarity_str = f'The similarity between tokens A and B is {similarity(id_for_token_A, id_for_token_B)}'\n",
        "print(similarity_str)"
      ],
      "metadata": {
        "id": "MwmOdC9cNZty"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}