codeShare
/

JupyterNotebooks

Safetensors

Model card Files Files and versions Community

codeShare committed on Sep 17

Commit

16ea5d1

•

1 Parent(s): c012553

Upload indexed_text_encoding_converter.ipynb

Browse files

Files changed (1) hide show

indexed_text_encoding_converter.ipynb +209 -0

indexed_text_encoding_converter.ipynb ADDED Viewed

	@@ -0,0 +1,209 @@

+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "cskYkw0zXHEm"
+      },
+      "outputs": [],
+      "source": [
+        "# @title Make your own text_encodings .safetensors file for later use (using GPU is recommended to speed things up)\n",
+        "\n",
+        "import json\n",
+        "import pandas as pd\n",
+        "import os\n",
+        "import shelve\n",
+        "import torch\n",
+        "from safetensors.torch import save_file\n",
+        "import json\n",
+        "\n",
+        "# Determine if this notebook is running on Colab or Kaggle\n",
+        "# Use https://www.kaggle.com/ if the Google Colab GPU is busy\n",
+        "home_directory = '/content/'\n",
+        "using_Kaggle = os.environ.get('KAGGLE_URL_BASE','')\n",
+        "if using_Kaggle : home_directory = '/kaggle/working/'\n",
+        "%cd {home_directory}\n",
+        "#-------#\n",
+        "\n",
+        "# User input\n",
+        "target = home_directory + 'text-to-image-prompts/names/celebs/mixed/'\n",
+        "output_folder = home_directory + 'output/celebs/mixed/'\n",
+        "root_filename = '🆔👨 fusion-t2i-v2-celeb'\n",
+        "NUM_FILES = 1\n",
+        "#--------#\n",
+        "\n",
+        "# Setup environment\n",
+        "def my_mkdirs(folder):\n",
+        "    if os.path.exists(folder)==False:\n",
+        "        os.makedirs(folder)\n",
+        "#--------#\n",
+        "output_folder_text = output_folder + 'text/'\n",
+        "output_folder_text = output_folder + 'text/'\n",
+        "output_folder_text_encodings = output_folder + 'text_encodings/'\n",
+        "target_raw = target + 'raw/'\n",
+        "%cd {home_directory}\n",
+        "my_mkdirs(output_folder)\n",
+        "my_mkdirs(output_folder_text)\n",
+        "my_mkdirs(output_folder_text_encodings)\n",
+        "#-------#\n",
+        "\n",
+        "# Load the data if not already loaded\n",
+        "try:\n",
+        "    loaded\n",
+        "except:\n",
+        "    %cd {home_directory}\n",
+        "    !git clone https://huggingface.co/datasets/codeShare/text-to-image-prompts\n",
+        "    loaded = True\n",
+        "#--------#\n",
+        "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
+        "from transformers import AutoTokenizer\n",
+        "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
+        "from transformers import  CLIPProcessor, CLIPModel\n",
+        "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-large-patch14\" , clean_up_tokenization_spaces = True)\n",
+        "model = CLIPModel.from_pretrained(\"openai/clip-vit-large-patch14\").to(device)\n",
+        "#---------#\n",
+        "for  file_index in range(NUM_FILES + 1):\n",
+        "    if (file_index < 1): continue\n",
+        "    filename = f'{root_filename}-{file_index}'\n",
+        "\n",
+        "    # Read {filename}.json\n",
+        "    %cd {target_raw}\n",
+        "    with open(filename + '.json', 'r') as f:\n",
+        "        data = json.load(f)\n",
+        "    _df = pd.DataFrame({'count': data})['count']\n",
+        "    prompts = {\n",
+        "        key : value.replace(\"</w>\",\" \") for key, value in _df.items()\n",
+        "    }\n",
+        "    index = 0\n",
+        "    for key in prompts:\n",
+        "        index = index + 1\n",
+        "    #----------#\n",
+        "    NUM_ITEMS = index\n",
+        "    #------#\n",
+        "\n",
+        "  # Calculate text_encodings for the .json file contents and save the results as .json and .safetensors segment files\n",
+        "    names_dict = {}\n",
+        "    text_encoding_dict = {}\n",
+        "    segments = {}\n",
+        "    index = 0;\n",
+        "    subby = 1;\n",
+        "    NUM_HEADERS = 2\n",
+        "    CHUNKS_SIZE = 1000\n",
+        "    _filename = ''\n",
+        "    for _index in range(NUM_ITEMS):\n",
+        "        if (index % 100 == 0) : print(index)\n",
+        "        if (index == 0 and _index>0) : index = index + 2 #make space for headers\n",
+        "        if (_index % (CHUNKS_SIZE-NUM_HEADERS) == 0 and _index > 0) :\n",
+        "\n",
+        "            # Write headers in the .json\n",
+        "            names_dict[f'{0}'] = f'{_index}'\n",
+        "            names_dict[f'{1}'] = f'{filename}-{subby}'\n",
+        "\n",
+        "            # Encode the headers into text_encoding\n",
+        "            inputs = tokenizer(text = '' + names_dict[f'{0}'], padding=True, return_tensors=\"pt\").to(device)\n",
+        "            text_features = model.get_text_features(**inputs).to(device)\n",
+        "            text_features =  text_features/text_features.norm(p=2, dim=-1, keepdim=True)\n",
+        "            text_encoding_dict[f'{0}'] = text_features.to(torch.device('cpu'))\n",
+        "            inputs = tokenizer(text = '' + names_dict[f'{1}'], padding=True, return_tensors=\"pt\").to(device)\n",
+        "            text_features = model.get_text_features(**inputs).to(device)\n",
+        "            text_features =  text_features/text_features.norm(p=2, dim=-1, keepdim=True)\n",
+        "            text_encoding_dict[f'{1}'] = text_features.to(torch.device('cpu'))\n",
+        "            #-------#\n",
+        "\n",
+        "            # Write .json\n",
+        "            _filename = f'{filename}-{subby}.json'\n",
+        "            %cd {output_folder_text}\n",
+        "            print(f'Saving segment {_filename} to {output_folder_text}...')\n",
+        "            with open(_filename, 'w') as f:\n",
+        "                json.dump(names_dict, f)\n",
+        "            #-------#\n",
+        "\n",
+        "            # Write .safetensors\n",
+        "            _filename = f'{filename}-{subby}.safetensors'\n",
+        "            %cd {output_folder_text_encodings}\n",
+        "            print(f'Saving segment {_filename} to {output_folder_text_encodings}...')\n",
+        "            save_file(text_encoding_dict, _filename)\n",
+        "            #--------#\n",
+        "\n",
+        "            #Iterate\n",
+        "            subby = subby + 1\n",
+        "            segments[f'{subby}'] = _filename\n",
+        "            text_encoding_dict = {}\n",
+        "            names_dict = {}\n",
+        "            index = 0\n",
+        "            #------#\n",
+        "            #------#\n",
+        "        else: index = index + 1\n",
+        "        #--------#\n",
+        "        inputs = tokenizer(text = '' + prompts[f'{_index}'], padding=True, return_tensors=\"pt\").to(device)\n",
+        "        text_features = model.get_text_features(**inputs).to(device)\n",
+        "        text_features =  text_features/text_features.norm(p=2, dim=-1, keepdim=True)\n",
+        "        text_encoding_dict[f'{index}'] = text_features.to(torch.device('cpu'))\n",
+        "        names_dict[f'{index}'] = prompts[f'{_index}']\n",
+        "        continue\n",
+        "    #-----#\n",
+        "    #-----#\n",
+        "    # Write headers in the .json\n",
+        "    names_dict[f'{0}'] = f'{_index}'\n",
+        "    names_dict[f'{1}'] = f'{filename}-{subby}'\n",
+        "\n",
+        "    # Encode the headers into text_encoding\n",
+        "    inputs = tokenizer(text = '' + names_dict[f'{0}'], padding=True, return_tensors=\"pt\").to(device)\n",
+        "    text_features = model.get_text_features(**inputs).to(device)\n",
+        "    text_features =  text_features/text_features.norm(p=2, dim=-1, keepdim=True)\n",
+        "    text_encoding_dict[f'{0}'] = text_features.to(torch.device('cpu'))\n",
+        "    inputs = tokenizer(text = '' + names_dict[f'{1}'], padding=True, return_tensors=\"pt\").to(device)\n",
+        "    text_features = model.get_text_features(**inputs).to(device)\n",
+        "    text_features =  text_features/text_features.norm(p=2, dim=-1, keepdim=True)\n",
+        "    text_encoding_dict[f'{1}'] = text_features.to(torch.device('cpu'))\n",
+        "    #-------#\n",
+        "\n",
+        "    # Write .json\n",
+        "    _filename = f'{filename}-{subby}.json'\n",
+        "    %cd {output_folder_text}\n",
+        "    print(f'Saving segment {_filename} to {output_folder_text}...')\n",
+        "    with open(_filename, 'w') as f:\n",
+        "        json.dump(names_dict, f)\n",
+        "    #-------#\n",
+        "\n",
+        "    # Write .safetensors\n",
+        "    _filename = f'{filename}-{subby}.safetensors'\n",
+        "    %cd {output_folder_text_encodings}\n",
+        "    print(f'Saving segment {_filename} to {output_folder_text_encodings}...')\n",
+        "    save_file(text_encoding_dict, _filename)\n",
+        "    #--------#\n",
+        "\n",
+        "    #Iterate\n",
+        "    subby = subby + 1\n",
+        "    segments[f'{subby}'] = _filename\n",
+        "    text_encoding_dict = {}\n",
+        "    names_dict = {}\n",
+        "    index = 0\n",
+        "    #------#\n",
+        "  #----#\n",
+        "\n",
+        "# @title Download the text_encodings as .zip\n",
+        "import os\n",
+        "%cd {home_directory}\n",
+        "os.remove(f'{home_directory}results.zip')\n",
+        "zip_dest = f'{home_directory}results.zip'\n",
+        "!zip -r {zip_dest} {output_folder}"
+      ]
+    }
+  ]
+}