codeShare committed on
Commit
44eca84
1 Parent(s): e5232dc

Upload sd_token_similarity_calculator.ipynb

Browse files
Files changed (1) hide show
  1. sd_token_similarity_calculator.ipynb +119 -0
sd_token_similarity_calculator.ipynb ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": []
7
+ },
8
+ "kernelspec": {
9
+ "name": "python3",
10
+ "display_name": "Python 3"
11
+ },
12
+ "language_info": {
13
+ "name": "python"
14
+ }
15
+ },
16
+ "cells": [
17
+ {
18
+ "cell_type": "code",
19
+ "source": [
20
+ "# Clone the token dataset; git places it in ./sd_tokens, so the tensor file is loaded from that folder\n",
21
+ "!git clone https://huggingface.co/datasets/codeShare/sd_tokens\n",
22
+ "import torch\n",
23
+ "from torch import linalg as LA\n",
24
+ "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
25
+ "token = torch.load('sd_tokens/sd15_tensors.pt', map_location=device, weights_only=True)  # assumes the .pt file is at the repo root - TODO confirm"
26
+ ],
27
+ "metadata": {
28
+ "id": "Ch9puvwKH1s3"
29
+ },
30
+ "execution_count": null,
31
+ "outputs": []
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "source": [
36
+ "print(token[100].size())  # size of a single token embedding"
37
+ ],
38
+ "metadata": {
39
+ "id": "S_Yh9gH_OUA1"
40
+ },
41
+ "execution_count": null,
42
+ "outputs": []
43
+ },
44
+ {
45
+ "cell_type": "code",
46
+ "source": [
47
+ "def absolute_value(x):\n",
48
+ "  \"\"\"Return the magnitude of x; thin wrapper around the built-in abs, which also maps -0.0 to 0.0.\"\"\"\n",
49
+ "  return abs(x)\n",
50
+ "\n",
51
+ "def similarity(id_A , id_B):\n",
52
+ "  \"\"\"Return the absolute cosine similarity of global `token` entries id_A and id_B as a string such as '4.001 %'.\"\"\"\n",
53
+ "  vec_a = token[id_A]\n",
54
+ "  vec_b = token[id_B]\n",
55
+ "\n",
56
+ "  # Euclidean (L2) length of each vector, i.e. (a^2 + b^2 + ...)^(1/2)\n",
57
+ "  norm_a = LA.vector_norm(vec_a, ord=2)\n",
58
+ "  norm_b = LA.vector_norm(vec_b, ord=2)\n",
59
+ "\n",
60
+ "  # Cosine of the angle between the vectors; the sign is dropped, so opposite directions also score high\n",
61
+ "  cosine = torch.dot(vec_a, vec_b) / (norm_a * norm_b)\n",
62
+ "  similarity_pcnt = absolute_value(cosine.item() * 100)\n",
63
+ "\n",
64
+ "  return f'{round(similarity_pcnt, 3)} %'"
65
+ ],
66
+ "metadata": {
67
+ "id": "fxquCxFaUxAZ"
68
+ },
69
+ "execution_count": 35,
70
+ "outputs": []
71
+ },
72
+ {
73
+ "cell_type": "markdown",
74
+ "source": [
75
+ "Valid values for id_for_token_A and id_for_token_B are integer token IDs in the range 0 to 49407."
76
+ ],
77
+ "metadata": {
78
+ "id": "kX72bAuhOtlT"
79
+ }
80
+ },
81
+ {
82
+ "cell_type": "code",
83
+ "source": [
84
+ "id_for_token_A = 500 # @param {type:'number'}\n",
85
+ "id_for_token_B = 4343 # @param {type:'number'}\n",
86
+ "\n",
87
+ "# Keep the result under its own name: rebinding `similarity` would replace the function and break a re-run of this cell\n",
88
+ "similarity_text = similarity(id_for_token_A , id_for_token_B)\n",
89
+ "print(f'The similarity between tokens A and B is {similarity_text}')"
90
+ ],
91
+ "metadata": {
92
+ "colab": {
93
+ "base_uri": "https://localhost:8080/"
94
+ },
95
+ "id": "MwmOdC9cNZty",
96
+ "outputId": "e75c4987-9d13-4ec7-ca36-775b8dbac707"
97
+ },
98
+ "execution_count": 36,
99
+ "outputs": [
100
+ {
101
+ "output_type": "stream",
102
+ "name": "stdout",
103
+ "text": [
104
+ "The similarity between tokens A and B is 4.001 %\n"
105
+ ]
106
+ }
107
+ ]
108
+ },
109
+ {
110
+ "cell_type": "code",
111
+ "source": [],
112
+ "metadata": {
113
+ "id": "oJC12JgJUPrB"
114
+ },
115
+ "execution_count": null,
116
+ "outputs": []
117
+ }
118
+ ]
119
+ }