{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "a8ac0f91", "metadata": {}, "outputs": [], "source": [ "!pip install seaborn #why didn't mamba or pip install work with this? \n" ] }, { "cell_type": "code", "execution_count": 1, "id": "39ea4bed", "metadata": {}, "outputs": [], "source": [ "import os\n", "import time\n", "#os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n", "\n", "import seaborn as sns\n", "from IPython.display import Audio\n", "import nltk # we'll use this to split into sentences\n", "import numpy as np\n", "\n", "from bark_infinity.generation import (\n", " generate_text_semantic,\n", " preload_models,\n", " COARSE_RATE_HZ,\n", " SEMANTIC_RATE_HZ\n", ")\n", "from bark_infinity.api import semantic_to_waveform, set_seed\n", "from bark_infinity import generate_audio, SAMPLE_RATE, save_as_prompt\n", "from bark_infinity.generation import generate_coarse, generate_fine, generate_text_semantic, codec_decode\n", "\n", "import numpy as np\n", "from rich import pretty\n", "from rich import inspect\n", "import copy\n", "\n", "from contextlib import contextmanager\n", "\n", "def load_npz(filename):\n", " npz_data = np.load(filename)\n", "\n", " data_dict = {\n", " \"semantic_prompt\": npz_data[\"semantic_prompt\"],\n", " \"coarse_prompt\": npz_data[\"coarse_prompt\"],\n", " \"fine_prompt\": npz_data[\"fine_prompt\"],\n", " }\n", "\n", " npz_data.close() \n", "\n", " return data_dict\n", "\n", "\n", "def resize_history_prompt(history_prompt, tokens=128, from_front=False):\n", " #semantic_to_coarse_ratio = 75 / 49.9\n", " semantic_to_coarse_ratio = COARSE_RATE_HZ / SEMANTIC_RATE_HZ\n", "\n", " semantic_prompt = history_prompt[\"semantic_prompt\"]\n", " coarse_prompt = history_prompt[\"coarse_prompt\"]\n", " fine_prompt = history_prompt[\"fine_prompt\"]\n", "\n", " new_semantic_len = min(tokens, len(semantic_prompt))\n", " new_coarse_len = min(int(new_semantic_len * semantic_to_coarse_ratio), coarse_prompt.shape[1])\n", " \n", " new_fine_len = new_coarse_len\n", "\n", " if from_front:\n", " new_semantic_prompt = semantic_prompt[:new_semantic_len]\n", " new_coarse_prompt = coarse_prompt[:, :new_coarse_len]\n", " new_fine_prompt = fine_prompt[:, :new_fine_len]\n", " else:\n", " new_semantic_prompt = semantic_prompt[-new_semantic_len:]\n", " new_coarse_prompt = coarse_prompt[:, -new_coarse_len:]\n", " new_fine_prompt = fine_prompt[:, -new_fine_len:]\n", "\n", " return {\n", " \"semantic_prompt\": new_semantic_prompt,\n", " \"coarse_prompt\": new_coarse_prompt,\n", " \"fine_prompt\": new_fine_prompt,\n", " }\n", "\n", "def show_history_prompt_size(history_prompt, token_samples=3, semantic_back_n=128, text=\"history_prompt\"):\n", "\n", " semantic_prompt = history_prompt[\"semantic_prompt\"]\n", " coarse_prompt = history_prompt[\"coarse_prompt\"]\n", " fine_prompt = history_prompt[\"fine_prompt\"]\n", "\n", " # compute the ratio for coarse and fine back_n\n", " ratio = 75 / 49.9\n", " coarse_and_fine_back_n = int(semantic_back_n * ratio)\n", "\n", " def show_array_front_back(arr, n, back_n):\n", " if n > 0:\n", " front = arr[:n].tolist()\n", " back = arr[-n:].tolist()\n", "\n", " mid = []\n", " if len(arr) > back_n + token_samples:\n", " mid = arr[-back_n-token_samples:-back_n+token_samples].tolist()\n", "\n", " if mid:\n", " return f\"{front} ... <{back_n} from end> {mid} ... {back}\"\n", " else:\n", " return f\"{front} ... 
{back}\"\n", " else:\n", " return \"\"\n", "\n", " print(f\"\\n{text}\")\n", " print(f\" {text} semantic_prompt: {semantic_prompt.shape}\")\n", " print(f\" Tokens: {show_array_front_back(semantic_prompt, token_samples, semantic_back_n)}\")\n", " \n", " print(f\" {text} coarse_prompt: {coarse_prompt.shape}\")\n", " for row in coarse_prompt:\n", " print(f\" Tokens: {show_array_front_back(row, token_samples, coarse_and_fine_back_n)}\")\n", " \n", " print(f\" {text} fine_prompt: {fine_prompt.shape}\")\n", " #for row in fine_prompt:\n", " # print(f\" Tokens: {show_array_front_back(row, token_samples, coarse_and_fine_back_n)}\")\n", "\n", "\n", "def show_history_prompt_size(history_prompt, token_samples=3, semantic_back_n=256, text=\"history_prompt\"):\n", "\n", " semantic_prompt = history_prompt[\"semantic_prompt\"]\n", " coarse_prompt = history_prompt[\"coarse_prompt\"]\n", " fine_prompt = history_prompt[\"fine_prompt\"]\n", "\n", " # compute the ratio for coarse and fine back_n\n", " ratio = 75 / 49.9\n", " coarse_and_fine_back_n = int(semantic_back_n * ratio)\n", "\n", " def show_array_front_back(arr, n, back_n):\n", " if n > 0:\n", " front = arr[:n].tolist()\n", " back = arr[-n:].tolist()\n", "\n", " mid_front = []\n", " mid_back = []\n", " if len(arr) > back_n + token_samples:\n", " mid_front = arr[-back_n-token_samples:-back_n].tolist()\n", " mid_back = arr[-back_n:-back_n+token_samples].tolist()\n", "\n", " if mid_front and mid_back:\n", " return f\"{front} ... {mid_front} <{back_n} from end> {mid_back} ... {back}\"\n", " else:\n", " return f\"{front} ... {back}\"\n", " else:\n", " return \"\"\n", "\n", " print(f\"\\n{text}\")\n", " print(f\" {text} semantic_prompt: {semantic_prompt.shape}\")\n", " print(f\" Tokens: {show_array_front_back(semantic_prompt, token_samples, semantic_back_n)}\")\n", " \n", " print(f\" {text} coarse_prompt: {coarse_prompt.shape}\")\n", " for row in coarse_prompt:\n", " print(f\" Tokens: {show_array_front_back(row, token_samples, coarse_and_fine_back_n)}\")\n", " \n", " print(f\" {text} fine_prompt: {fine_prompt.shape}\")\n", " #for row in fine_prompt:\n", " # print(f\" Tokens: {show_array_front_back(row, token_samples, coarse_and_fine_back_n)}\")\n", "\n", "\n", "def split_array_equally(array, num_parts):\n", " split_indices = np.linspace(0, len(array), num_parts + 1, dtype=int)\n", " return [array[split_indices[i]: split_indices[i + 1]].astype(np.int32) for i in range(num_parts)]\n", "\n", "\n", "\n", "\n", "@contextmanager\n", "def measure_time(text=None, index=None):\n", " start_time = time.time()\n", " yield\n", " elapsed_time = time.time() - start_time\n", " if index is not None and text is not None:\n", " text = f\"{text} {index}\"\n", " elif text is None:\n", " text = \"Operation\"\n", " \n", " time_finished = f\"{text} Finished at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))}\"\n", " print(f\" -->{time_finished} in {elapsed_time} seconds\")\n", "\n", "\n", "\n", "def compare_history_prompts(hp1, hp2, text=\"history_prompt\"):\n", " print(f\"\\nComparing {text}\")\n", " for key in hp1.keys():\n", " # Compare only the parts of the arrays that have the same shape\n", " if hp1[key].shape != hp2[key].shape:\n", " print(f\" {key} arrays have different shapes: {hp1[key].shape} vs {hp2[key].shape}.\")\n", " min_size = min(hp1[key].shape[0], hp2[key].shape[0])\n", "\n", " if hp1[key].ndim == 1:\n", " hp1_part = hp1[key][-min_size:]\n", " hp2_part = hp2[key][-min_size:]\n", " else:\n", " min_size = min(hp1[key].shape[1], 
"def compare_history_prompts(hp1, hp2, text=\"history_prompt\"):\n", "    # report whether two history prompts match, comparing only the trailing\n", "    # overlap when their shapes differ\n", "    print(f\"\\nComparing {text}\")\n", "    for key in hp1.keys():\n", "        # Compare only the parts of the arrays that have the same shape\n", "        if hp1[key].shape != hp2[key].shape:\n", "            print(f\" {key} arrays have different shapes: {hp1[key].shape} vs {hp2[key].shape}.\")\n", "            min_size = min(hp1[key].shape[0], hp2[key].shape[0])\n", "\n", "            if hp1[key].ndim == 1:\n", "                hp1_part = hp1[key][-min_size:]\n", "                hp2_part = hp2[key][-min_size:]\n", "            else:\n", "                min_size = min(hp1[key].shape[1], hp2[key].shape[1])\n", "                hp1_part = hp1[key][:, -min_size:]\n", "                hp2_part = hp2[key][:, -min_size:]\n", "\n", "            print(f\" Comparing the last {min_size} elements of each.\")\n", "        else:\n", "            hp1_part = hp1[key]\n", "            hp2_part = hp2[key]\n", "\n", "        if np.array_equal(hp1_part, hp2_part):\n", "            print(f\" {key} arrays are exactly the same.\")\n", "        elif np.allclose(hp1_part, hp2_part):\n", "            diff = np.linalg.norm(hp1_part - hp2_part)\n", "            print(f\" {key} arrays are almost equal with a norm of difference: {diff}\")\n", "        else:\n", "            diff = np.linalg.norm(hp1_part - hp2_part)\n", "            print(f\" {key} arrays are not equal. Norm of difference: {diff}\")\n", "\n", "\n", "def concat_history_prompts(history_prompt1, history_prompt2):\n", "    # stack two prompts end to end along the time axis\n", "    new_semantic_prompt = np.hstack([history_prompt1[\"semantic_prompt\"], history_prompt2[\"semantic_prompt\"]]).astype(np.int32)  # not int64?\n", "    new_coarse_prompt = np.hstack([history_prompt1[\"coarse_prompt\"], history_prompt2[\"coarse_prompt\"]]).astype(np.int32)\n", "    new_fine_prompt = np.hstack([history_prompt1[\"fine_prompt\"], history_prompt2[\"fine_prompt\"]]).astype(np.int32)\n", "\n", "    concatenated_history_prompt = {\n", "        \"semantic_prompt\": new_semantic_prompt,\n", "        \"coarse_prompt\": new_coarse_prompt,\n", "        \"fine_prompt\": new_fine_prompt,\n", "    }\n", "\n", "    return concatenated_history_prompt\n", "\n", "\n", "# this should come out equal to plain concat_history_prompts, because the\n", "# row counts are always the same, I think?\n", "def align_and_concat_history_prompts(history_prompt1, history_prompt2):\n", "    # Determine the size along the time dimension for each array in the history prompts\n", "    semantic_time_size = min(history_prompt1[\"semantic_prompt\"].shape[0], history_prompt2[\"semantic_prompt\"].shape[0])\n", "    coarse_time_size = min(history_prompt1[\"coarse_prompt\"].shape[1], history_prompt2[\"coarse_prompt\"].shape[1])\n", "    fine_time_size = min(history_prompt1[\"fine_prompt\"].shape[1], history_prompt2[\"fine_prompt\"].shape[1])\n", "\n", "    # Align arrays along the time dimension\n", "    semantic_prompt1 = history_prompt1[\"semantic_prompt\"][-semantic_time_size:]\n", "    semantic_prompt2 = history_prompt2[\"semantic_prompt\"][-semantic_time_size:]\n", "    coarse_prompt1 = history_prompt1[\"coarse_prompt\"][:, -coarse_time_size:]\n", "    coarse_prompt2 = history_prompt2[\"coarse_prompt\"][:, -coarse_time_size:]\n", "    fine_prompt1 = history_prompt1[\"fine_prompt\"][:, -fine_time_size:]\n", "    fine_prompt2 = history_prompt2[\"fine_prompt\"][:, -fine_time_size:]\n", "\n", "    # Concatenate each array\n", "    new_semantic_prompt = np.hstack([semantic_prompt1, semantic_prompt2]).astype(np.int32)\n", "    new_coarse_prompt = np.hstack([coarse_prompt1, coarse_prompt2]).astype(np.int32)\n", "    new_fine_prompt = np.hstack([fine_prompt1, fine_prompt2]).astype(np.int32)\n", "\n", "    # Create a new history_prompt with concatenated arrays\n", "    concatenated_history_prompt = {\n", "        \"semantic_prompt\": new_semantic_prompt,\n", "        \"coarse_prompt\": new_coarse_prompt,\n", "        \"fine_prompt\": new_fine_prompt,\n", "    }\n", "\n", "    return concatenated_history_prompt\n", "\n", "\n", "def split_by_words(text, word_group_size):\n", "    # chunk text into groups of word_group_size words\n", "    words = text.split()\n", "    result = []\n", "    group = \"\"\n", "\n", "    for i, word in enumerate(words):\n", "        group += word + \" \"\n", "\n", "        if (i + 1) % word_group_size == 0:\n", "            result.append(group.strip())\n", "            group = \"\"\n", "\n", "    # Add the last group if it's not empty\n", "    if group.strip():\n", "        result.append(group.strip())\n", "\n", "    return result\n", "\n", "\n", 
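"# e.g. split_by_words(\"one two three four five\", 2)\n", "#   -> ['one two', 'three four', 'five']\n", "\n", "# sentence-level counterpart (a sketch, not used below; nltk.sent_tokenize\n", "# needs the 'punkt' tokenizer data, e.g. via nltk.download('punkt'))\n", "def split_by_sentences(text):\n", "    return nltk.sent_tokenize(text)\n", "\n", "\n", 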
"def merge_history_prompts(left_history_prompt, right_history_prompt, right_size=128):\n", "    # blend two speakers: append the last `right_size` tokens of the right\n", "    # prompt to the left prompt, then keep the trailing 341 semantic tokens\n", "    right_history_prompt = resize_history_prompt(right_history_prompt, tokens=right_size, from_front=False)\n", "    combined_history_prompts = concat_history_prompts(left_history_prompt, right_history_prompt)\n", "    combined_history_prompts = resize_history_prompt(combined_history_prompts, tokens=341, from_front=False)\n", "    return combined_history_prompts\n", "\n", "\n", "preload_models(text_use_small=False, coarse_use_small=False, fine_use_small=False)" ] }, { "cell_type": "code", "execution_count": 54, "id": "1d03f4d2", "metadata": {}, "outputs": [], "source": [ "# Or: the small text and coarse models with the large fine model\n", "preload_models(text_use_small=True, coarse_use_small=True, fine_use_small=False, force_reload=True)" ] }, { "cell_type": "code", "execution_count": 43, "id": "e3dfca3d", "metadata": {}, "outputs": [], "source": [ "charlie_text = \"\"\"\n", "Have I told you that story about how Charlie Parker became Charlie Parker?\n", "Parker's a young kid, pretty good on the Sax, \n", "gets up to play at a cutting session, \n", "and well, he fucks it up. \n", "\"\"\".replace(\"\\n\", \" \").strip()\n", "\n", "\n", "# pick one test sentence (the uncommented assignment is the active one)\n", "#sentence_text = \"A common mistake that people make when trying to design something completely foolproof is to underestimate the ingenuity of complete fools.\"\n", "sentence_text = \"In the beginning the Universe was created. This has made a lot of people very angry and been widely regarded as a bad move.\"\n", "\n", "\n", "# -1 picks a fresh random seed each run\n", "#testing_seed = 12345\n", "testing_seed = -1\n", "testing_SPEAKER = \"en_fiery.npz\"" ] }, { "attachments": {}, "cell_type": "markdown", "id": "74a025a4", "metadata": {}, "source": [ "# Simple Long-Form Generation\n", "We split longer text into sentences using `nltk` and generate the sentences one by one; a minimal version of that loop is sketched in the next cell." ] }, 
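{ "cell_type": "code", "execution_count": null, "id": "7f3e9c2a", "metadata": {}, "outputs": [], "source": [ "# A minimal sketch of the long-form loop described above (hypothetical\n", "# helper, not used by the cells below). Assumes nltk's 'punkt' tokenizer\n", "# data is available (nltk.download('punkt')). Each segment's full output is\n", "# fed back in as the next history_prompt so the voice stays consistent.\n", "def generate_long_form(long_text, history_prompt):\n", "    pieces = []\n", "    for sentence in nltk.sent_tokenize(long_text):\n", "        # output_full=True returns (full_generation, audio_array)\n", "        history_prompt, audio_array = generate_audio(sentence, history_prompt=history_prompt, output_full=True)\n", "        pieces.append(audio_array)\n", "    return np.concatenate(pieces), history_prompt\n", "\n", "# e.g.: audio, _ = generate_long_form(sentence_text, load_npz(testing_SPEAKER))\n", "# Audio(audio, rate=SAMPLE_RATE)" ] }, 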
{ "cell_type": "code", "execution_count": 83, "id": "eb569377", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "en_fiery.npz Original\n", " en_fiery.npz Original semantic_prompt: (682,)\n", " Tokens: [147, 6242, 302] ... [6747, 187, 891] <128 from end> [891, 891, 7100] ... [2403, 147, 2009]\n", " en_fiery.npz Original coarse_prompt: (2, 1025)\n", " Tokens: [738, 738, 1017] ... [683, 402, 162] <192 from end> [695, 501, 240] ... [717, 121, 121]\n", " Tokens: [363, 363, 646] ... [761, 53, 809] <192 from end> [831, 345, 559] ... [424, 424, 424]\n", " en_fiery.npz Original fine_prompt: (8, 1025)\n", "\n", "custom_speakers/classic_robot_tts.npz Other\n", " custom_speakers/classic_robot_tts.npz Other semantic_prompt: (457,)\n", " Tokens: [10, 10, 1184] ... [429, 41, 38] <128 from end> [3277, 3554, 7822] ... [206, 206, 186]\n", " custom_speakers/classic_robot_tts.npz Other coarse_prompt: (2, 686)\n", " Tokens: [699, 699, 753] ... [534, 186, 656] <192 from end> [451, 754, 421] ... [133, 133, 106]\n", " Tokens: [1002, 1002, 404] ... [16, 846, 890] <192 from end> [478, 345, 276] ... [913, 913, 913]\n", " custom_speakers/classic_robot_tts.npz Other fine_prompt: (8, 686)\n", "\n", "custom_speakers/classic_robot_tts.npz Other resize check\n", " custom_speakers/classic_robot_tts.npz Other resize check semantic_prompt: (64,)\n", " Tokens: [7960, 9515, 6622] ... [206, 206, 186]\n", " custom_speakers/classic_robot_tts.npz Other resize check coarse_prompt: (2, 96)\n", " Tokens: [358, 747, 734] ... [133, 133, 106]\n", " Tokens: [785, 318, 406] ... [913, 913, 913]\n", " custom_speakers/classic_robot_tts.npz Other resize check fine_prompt: (8, 96)\n", "\n", "Base en_fiery.npz with 128 tokens from custom_speakers/classic_robot_tts.npz\n", " Base en_fiery.npz with 128 tokens from custom_speakers/classic_robot_tts.npz semantic_prompt: (341,)\n", " Tokens: [43, 457, 1232] ... [2403, 147, 2009] <64 from end> [7960, 9515, 6622] ... [206, 206, 186]\n", " Base en_fiery.npz with 128 tokens from custom_speakers/classic_robot_tts.npz coarse_prompt: (2, 512)\n", " Tokens: [20, 291, 20] ... [717, 121, 121] <96 from end> [358, 747, 734] ... [133, 133, 106]\n", " Tokens: [947, 766, 128] ... [424, 424, 424] <96 from end> [785, 318, 406] ... [913, 913, 913]\n", " Base en_fiery.npz with 128 tokens from custom_speakers/classic_robot_tts.npz fine_prompt: (8, 512)\n" ] } ], "source": [ "SPEAKER = testing_SPEAKER\n", "\n", "other_SPEAKER = \"custom_speakers/classic_robot_tts.npz\"\n", "\n", "og_history_prompt = load_npz(SPEAKER)\n", "other_history_prompt = load_npz(other_SPEAKER)\n", "\n", "previous_segment_token_size = 128\n", "\n", "show_history_prompt_size(og_history_prompt, token_samples=3, text=f\"{SPEAKER} Original\", semantic_back_n=previous_segment_token_size)\n", "show_history_prompt_size(other_history_prompt, token_samples=3, text=f\"{other_SPEAKER} Other\", semantic_back_n=previous_segment_token_size)\n", "\n", "#resized_history_prompt = resize_history_prompt(og_history_prompt, tokens=341, from_front=False)\n", "#show_history_prompt_size(resized_history_prompt, text=\"Resized\")\n", "#compare_history_prompts(og_history_prompt, resized_history_prompt)\n", "\n", "#compare_history_prompts(other_history_prompt, og_history_prompt)\n", "\n", "\n", "#align_and_concat = align_and_concat_history_prompts(og_history_prompt, other_history_prompt)\n", "#concat = concat_history_prompts(og_history_prompt, other_history_prompt)\n", "\n", "#show_history_prompt_size(align_and_concat, text=\"align and concat\")\n", "#show_history_prompt_size(concat, text=\"concat\")\n", "\n", "\n", "#other_history_prompt_resize = resize_history_prompt(other_history_prompt, tokens=previous_segment_token_size, from_front=False)\n", "#og_history_prompt_trimmed = resize_history_prompt(og_history_prompt, tokens=341, from_front=False)\n", "\n", "#new_speaker_blend = align_and_concat_history_prompts(og_history_prompt_trimmed, other_history_prompt_resize)\n", "\n", "other_history_prompt = resize_history_prompt(other_history_prompt, tokens=64, from_front=False)\n", "\n", "show_history_prompt_size(other_history_prompt, token_samples=3, text=f\"{other_SPEAKER} Other resize check\", semantic_back_n=64)\n", "\n", "speaker_blend = merge_history_prompts(og_history_prompt, other_history_prompt, right_size=previous_segment_token_size)\n", "\n", "show_history_prompt_size(speaker_blend, token_samples=3, text=f\"Base {SPEAKER} with {previous_segment_token_size} tokens from {other_SPEAKER}\", semantic_back_n=64)\n", "\n", "#show_history_prompt_size(new_speaker_blend, text=\"align and concat\")\n", "#show_history_prompt_size(new_speaker_blend_2, text=\"concat\")\n", "#compare_history_prompts(new_speaker_blend, new_speaker_blend_2, text=\"compare concat methods\")\n", "# we only get about 256 semantic tokens of usable history (and proportionally\n", "# less for coarse), unless we figure out how to pack the inference space better" ] }, { "cell_type": "code", "execution_count": 76, "id": "62084f2b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "fen_fiery.npz Original\n", " fen_fiery.npz 
Original semantic_prompt: (682,)\n", " Tokens: [147, 6242, 302] ... [10, 230, 56] <256 from end> [206, 10, 206] ... [2403, 147, 2009]\n", " fen_fiery.npz Original coarse_prompt: (2, 1025)\n", " Tokens: [738, 738, 1017] ... [738, 738, 738] <384 from end> [738, 738, 738] ... [717, 121, 121]\n", " Tokens: [363, 363, 646] ... [937, 544, 937] <384 from end> [544, 544, 544] ... [424, 424, 424]\n", " fen_fiery.npz Original fine_prompt: (8, 1025)\n", "\n", "custom_speakers/classic_robot_tts.npz Other\n", " custom_speakers/classic_robot_tts.npz Other semantic_prompt: (457,)\n", " Tokens: [10, 10, 1184] ... [41, 41, 2362] <256 from end> [2362, 8414, 7892] ... [206, 206, 186]\n", " custom_speakers/classic_robot_tts.npz Other coarse_prompt: (2, 686)\n", " Tokens: [699, 699, 753] ... [118, 937, 51] <384 from end> [378, 820, 937] ... [133, 133, 106]\n", " Tokens: [1002, 1002, 404] ... [584, 406, 457] <384 from end> [850, 60, 588] ... [913, 913, 913]\n", " custom_speakers/classic_robot_tts.npz Other fine_prompt: (8, 686)\n", "\n", "Base en_fiery.npz with 128 tokens from custom_speakers/classic_robot_tts.npz\n", " Base en_fiery.npz with 128 tokens from custom_speakers/classic_robot_tts.npz semantic_prompt: (341,)\n", " Tokens: [8735, 8735, 8385] ... [6747, 187, 891] <256 from end> [891, 891, 7100] ... [206, 206, 186]\n", " Base en_fiery.npz with 128 tokens from custom_speakers/classic_robot_tts.npz coarse_prompt: (2, 512)\n", " Tokens: [62, 62, 62] ... [683, 402, 162] <384 from end> [695, 501, 240] ... [133, 133, 106]\n", " Tokens: [424, 424, 424] ... [761, 53, 809] <384 from end> [831, 345, 559] ... [913, 913, 913]\n", " Base en_fiery.npz with 128 tokens from custom_speakers/classic_robot_tts.npz fine_prompt: (8, 512)\n", "\n", "-->Generating for en_fiery.npz Original\n", "\n", "en_fiery.npz Original\n", " en_fiery.npz Original semantic_prompt: (682,)\n", " Tokens: [147, 6242, 302] ... [10, 230, 56] <256 from end> [206, 10, 206] ... [2403, 147, 2009]\n", " en_fiery.npz Original coarse_prompt: (2, 1025)\n", " Tokens: [738, 738, 1017] ... [738, 738, 738] <384 from end> [738, 738, 738] ... [717, 121, 121]\n", " Tokens: [363, 363, 646] ... [937, 544, 937] <384 from end> [544, 544, 544] ... [424, 424, 424]\n", " en_fiery.npz Original fine_prompt: (8, 1025)\n", "Disabling deterministic algorithms\n", "Set seed to 2223700255\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", " --> Regular Generation Finished at: 2023-05-10 00:01:20 in 25.751514196395874 seconds\n", "\n", " en_fiery.npz Original full_generation output\n", " en_fiery.npz Original full_generation output semantic_prompt: (540,)\n", " Tokens: [2305, 147, 3208] ... [1041, 4996, 710] <256 from end> [4533, 5231, 1887] ... [2009, 2009, 2403]\n", " en_fiery.npz Original full_generation output coarse_prompt: (2, 811)\n", " Tokens: [62, 62, 62] ... [858, 185, 613] <384 from end> [432, 835, 339] ... [62, 62, 62]\n", " Tokens: [424, 424, 424] ... [570, 570, 748] <384 from end> [741, 841, 747] ... [424, 424, 424]\n", " en_fiery.npz Original full_generation output fine_prompt: (8, 811)\n", "\n", "-->Generating for custom_speakers/classic_robot_tts.npz Other\n", "\n", "custom_speakers/classic_robot_tts.npz Other\n", " custom_speakers/classic_robot_tts.npz Other semantic_prompt: (457,)\n", " Tokens: [10, 10, 1184] ... [41, 41, 2362] <256 from end> [2362, 8414, 7892] ... [206, 206, 186]\n", " custom_speakers/classic_robot_tts.npz Other coarse_prompt: (2, 686)\n", " Tokens: [699, 699, 753] ... 
[118, 937, 51] <384 from end> [378, 820, 937] ... [133, 133, 106]\n", " Tokens: [1002, 1002, 404] ... [584, 406, 457] <384 from end> [850, 60, 588] ... [913, 913, 913]\n", " custom_speakers/classic_robot_tts.npz Other fine_prompt: (8, 686)\n", "Disabling deterministic algorithms\n", "Set seed to 3208560701\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", " --> Regular Generation Finished at: 2023-05-10 00:01:51 in 30.917768955230713 seconds\n", "\n", " custom_speakers/classic_robot_tts.npz Other full_generation output\n", " custom_speakers/classic_robot_tts.npz Other full_generation output semantic_prompt: (669,)\n", " Tokens: [206, 206, 486] ... [2465, 783, 10] <256 from end> [10, 2000, 7306] ... [206, 6493, 486]\n", " custom_speakers/classic_robot_tts.npz Other full_generation output coarse_prompt: (2, 1005)\n", " Tokens: [133, 133, 133] ... [133, 133, 904] <384 from end> [904, 904, 904] ... [904, 904, 904]\n", " Tokens: [913, 913, 913] ... [282, 580, 277] <384 from end> [277, 277, 277] ... [961, 961, 961]\n", " custom_speakers/classic_robot_tts.npz Other full_generation output fine_prompt: (8, 1005)\n", "\n", "-->Generating for Orig en_fiery.npz + 128 tokens from custom_speakers/classic_robot_tts.npz\n", "\n", "Orig en_fiery.npz + 128 tokens from custom_speakers/classic_robot_tts.npz\n", " Orig en_fiery.npz + 128 tokens from custom_speakers/classic_robot_tts.npz semantic_prompt: (341,)\n", " Tokens: [8735, 8735, 8385] ... [6747, 187, 891] <256 from end> [891, 891, 7100] ... [206, 206, 186]\n", " Orig en_fiery.npz + 128 tokens from custom_speakers/classic_robot_tts.npz coarse_prompt: (2, 512)\n", " Tokens: [62, 62, 62] ... [683, 402, 162] <384 from end> [695, 501, 240] ... [133, 133, 106]\n", " Tokens: [424, 424, 424] ... [761, 53, 809] <384 from end> [831, 345, 559] ... [913, 913, 913]\n", " Orig en_fiery.npz + 128 tokens from custom_speakers/classic_robot_tts.npz fine_prompt: (8, 512)\n", "Disabling deterministic algorithms\n", "Set seed to 132780836\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", " --> Regular Generation Finished at: 2023-05-10 00:02:26 in 34.253556966781616 seconds\n", "\n", " Orig en_fiery.npz + 128 tokens from custom_speakers/classic_robot_tts.npz full_generation output\n", " Orig en_fiery.npz + 128 tokens from custom_speakers/classic_robot_tts.npz full_generation output semantic_prompt: (697,)\n", " Tokens: [56, 3252, 206] ... [7608, 2033, 178] <256 from end> [27, 12, 27] ... [8924, 5934, 206]\n", " Orig en_fiery.npz + 128 tokens from custom_speakers/classic_robot_tts.npz full_generation output coarse_prompt: (2, 1047)\n", " Tokens: [133, 133, 133] ... [489, 699, 834] <384 from end> [133, 133, 472] ... [133, 133, 62]\n", " Tokens: [913, 913, 913] ... [702, 43, 277] <384 from end> [894, 516, 277] ... 
[913, 913, 424]\n", " Orig en_fiery.npz + 128 tokens from custom_speakers/classic_robot_tts.npz full_generation output fine_prompt: (8, 1047)\n" ] }, { "data": { "text/plain": [ "'\nshow_history_prompt_size(speaker_blend, token_samples=3, text=f\"{other_SPEAKER}\")\n\npieces = []\n\nshow_history_prompt_size(og_history_prompt, token_samples=3, text=\"Other history_prompt file\")\n\nwith measure_time(text=\"Regular Other\"):\n\n set_seed(testing_seed)\n og_full_generation, audio_array = generate_audio(cell_text_prompt, history_prompt=og_history_prompt, output_full=True)\n pieces += [audio_array]\n \nshow_history_prompt_size(og_full_generation, text=\"Regular Other\")\n\nfinal_audio_clips.append(pieces)\n\n\n\nshow_history_prompt_size(speaker_blend, token_samples=3, text=f\"speaker_blend\")\n\npieces = []\n\nwith measure_time(text=f\"new_speaker_blend\"):\n set_seed(testing_seed)\n new_speaker_blend_output, audio_array = generate_audio(cell_text_prompt, history_prompt=speaker_blend, output_full=True)\n pieces += [audio_array]\n \nshow_history_prompt_size(new_speaker_blend_output, text=f\"speaker_blend Output\")\n\nfinal_audio_clips.append(pieces)\n\n\n#compare_history_prompts(og_full_generation, new_speaker_blend_2)\n'" ] }, "execution_count": 76, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Speaker Mixing\n", "\n", "SPEAKER = testing_SPEAKER\n", "\n", "other_SPEAKER = \"custom_speakers/classic_robot_tts.npz\"\n", "\n", "og_history_prompt = load_npz(SPEAKER)\n", "other_history_prompt = load_npz(other_SPEAKER)\n", "\n", "\n", "show_history_prompt_size(og_history_prompt, text=f\"{SPEAKER} Original\")\n", "show_history_prompt_size(other_history_prompt, text=f\"{other_SPEAKER} Other\")\n", "\n", "cell_text_prompt = charlie_text\n", "\n", "previous_segment_token_size = 128\n", "\n", "\n", "speaker_blend = merge_history_prompts(og_history_prompt, other_history_prompt, right_size=previous_segment_token_size)\n", "\n", "show_history_prompt_size(speaker_blend, text=f\"Base {SPEAKER} with {previous_segment_token_size} tokens from {other_SPEAKER}\")\n", "\n", "\"\"\"\n", "other_history_prompt_resize = resize_history_prompt(other_history_prompt, tokens=previous_segment_token_size, from_front=False)\n", "og_history_prompt_trimmed = resize_history_prompt(og_history_prompt, tokens=341, from_front=False)\n", "\n", "\n", "speaker_blend = concat_history_prompts(og_history_prompt_trimmed, other_history_prompt_resize)\n", "show_history_prompt_size(speaker_blend, text=f\"Base {SPEAKER} with {previous_segment_token_size} tokens from {other_SPEAKER}\")\n", "\"\"\"\n", "\n", "all_history_prompts = []\n", "\n", "all_history_prompts.append([og_history_prompt, f\"{SPEAKER} Original\"])\n", "all_history_prompts.append([other_history_prompt, f\"{other_SPEAKER} Other\"])\n", "all_history_prompts.append([speaker_blend, f\"Orig {SPEAKER} + {previous_segment_token_size} tokens from {other_SPEAKER}\"])\n", "\n", "\n", "final_audio_clips = []\n", "\n", "\n", "for history_prompt, text in all_history_prompts:\n", "    print(f\"\\n-->Generating for {text}\")\n", "\n", "    pieces = []\n", "\n", "    show_history_prompt_size(history_prompt, token_samples=3, text=text)\n", "\n", "    with measure_time(text=\" Regular Generation\"):\n", "\n", "        set_seed(testing_seed)\n", "        full_generation, audio_array = generate_audio(cell_text_prompt, history_prompt=history_prompt, output_full=True, silent=True)\n", "        pieces += [audio_array]\n", "\n", "    
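\n", "    # note: the returned full_generation is itself a complete history prompt\n", "    # (semantic/coarse/fine), so it can seed the next segment of a longer chain\n", "    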
show_history_prompt_size(full_generation, text=f\" {text} full_generation output\")\n", "\n", "    final_audio_clips.append(pieces)\n", "\n", "\n", "\"\"\"\n", "show_history_prompt_size(speaker_blend, token_samples=3, text=f\"{other_SPEAKER}\")\n", "\n", "pieces = []\n", "\n", "show_history_prompt_size(og_history_prompt, token_samples=3, text=\"Other history_prompt file\")\n", "\n", "with measure_time(text=\"Regular Other\"):\n", "\n", "    set_seed(testing_seed)\n", "    og_full_generation, audio_array = generate_audio(cell_text_prompt, history_prompt=og_history_prompt, output_full=True)\n", "    pieces += [audio_array]\n", "\n", "show_history_prompt_size(og_full_generation, text=\"Regular Other\")\n", "\n", "final_audio_clips.append(pieces)\n", "\n", "\n", "\n", "show_history_prompt_size(speaker_blend, token_samples=3, text=f\"speaker_blend\")\n", "\n", "pieces = []\n", "\n", "with measure_time(text=f\"new_speaker_blend\"):\n", "    set_seed(testing_seed)\n", "    new_speaker_blend_output, audio_array = generate_audio(cell_text_prompt, history_prompt=speaker_blend, output_full=True)\n", "    pieces += [audio_array]\n", "\n", "show_history_prompt_size(new_speaker_blend_output, text=f\"speaker_blend Output\")\n", "\n", "final_audio_clips.append(pieces)\n", "\n", "\n", "#compare_history_prompts(og_full_generation, new_speaker_blend_2)\n", "\"\"\"\n" ] }, { "cell_type": "code", "execution_count": 9, "id": "126ed5bc", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Rendering samples for speakers in: needs_fixing/\n", "Loading needs_fixing/Hey_have_you_he-23-0512-0130-34-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_initial_prompt.npz\n", "semantic_prompt_max: 256\n", "\n", "Hey_have_you_he-23-0512-0130-34-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_initial_prompt.npz original\n", " Hey_have_you_he-23-0512-0130-34-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_initial_prompt.npz original semantic_prompt: (256,)\n", " Tokens: [1866, 1424, 1424] ... [648, 198, 41]\n", " Hey_have_you_he-23-0512-0130-34-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_initial_prompt.npz original coarse_prompt: (2, 384)\n", " Tokens: [679, 747, 11] ... [347, 976, 865]\n", " Tokens: [712, 317, 368] ... [839, 812, 544]\n", " Hey_have_you_he-23-0512-0130-34-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_initial_prompt.npz original fine_prompt: (8, 384)\n", "\n", "Hey_have_you_he-23-0512-0130-34-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_initial_prompt.npz resized from front\n", " Hey_have_you_he-23-0512-0130-34-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_initial_prompt.npz resized from front semantic_prompt: (128,)\n", " Tokens: [1866, 1424, 1424] ... [6664, 748, 8522]\n", " Hey_have_you_he-23-0512-0130-34-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_initial_prompt.npz resized from front coarse_prompt: (2, 192)\n", " Tokens: [679, 747, 11] ... [23, 23, 23]\n", " Tokens: [712, 317, 368] ... 
[777, 828, 885]\n", " Hey_have_you_he-23-0512-0130-34-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_initial_prompt.npz resized from front fine_prompt: (8, 192)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0130-34-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_initial_prompt_front_128.npz\n", "\n", "Hey_have_you_he-23-0512-0130-34-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_initial_prompt.npz resized from back\n", " Hey_have_you_he-23-0512-0130-34-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_initial_prompt.npz resized from back semantic_prompt: (128,)\n", " Tokens: [3767, 3767, 4775] ... [648, 198, 41]\n", " Hey_have_you_he-23-0512-0130-34-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_initial_prompt.npz resized from back coarse_prompt: (2, 192)\n", " Tokens: [23, 23, 23] ... [347, 976, 865]\n", " Tokens: [171, 295, 839] ... [839, 812, 544]\n", " Hey_have_you_he-23-0512-0130-34-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_initial_prompt.npz resized from back fine_prompt: (8, 192)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0130-34-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_initial_prompt_back_128.npz\n", "\n", "Hey_have_you_he-23-0512-0130-34-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_initial_prompt.npz resized from front\n", " Hey_have_you_he-23-0512-0130-34-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_initial_prompt.npz resized from front semantic_prompt: (192,)\n", " Tokens: [1866, 1424, 1424] ... [2889, 2613, 4723]\n", " Hey_have_you_he-23-0512-0130-34-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_initial_prompt.npz resized from front coarse_prompt: (2, 288)\n", " Tokens: [679, 747, 11] ... [747, 747, 925]\n", " Tokens: [712, 317, 368] ... [668, 501, 598]\n", " Hey_have_you_he-23-0512-0130-34-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_initial_prompt.npz resized from front fine_prompt: (8, 288)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0130-34-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_initial_prompt_front_192.npz\n", "\n", "Hey_have_you_he-23-0512-0130-34-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_initial_prompt.npz resized from back\n", " Hey_have_you_he-23-0512-0130-34-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_initial_prompt.npz resized from back semantic_prompt: (192,)\n", " Tokens: [2640, 2640, 2640] ... [648, 198, 41]\n", " Hey_have_you_he-23-0512-0130-34-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_initial_prompt.npz resized from back coarse_prompt: (2, 288)\n", " Tokens: [408, 408, 408] ... [347, 976, 865]\n", " Tokens: [518, 518, 518] ... [839, 812, 544]\n", " Hey_have_you_he-23-0512-0130-34-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_initial_prompt.npz resized from back fine_prompt: (8, 288)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0130-34-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_initial_prompt_back_192.npz\n", "Loading needs_fixing/Hey_have_you_he-23-0512-0129-04-SPK-obama_perfect_front_256_trimmed.mp4_initial_prompt.npz\n", "semantic_prompt_max: 256\n", "\n", "Hey_have_you_he-23-0512-0129-04-SPK-obama_perfect_front_256_trimmed.mp4_initial_prompt.npz original\n", " Hey_have_you_he-23-0512-0129-04-SPK-obama_perfect_front_256_trimmed.mp4_initial_prompt.npz original semantic_prompt: (256,)\n", " Tokens: [7059, 9406, 245] ... [558, 4048, 298]\n", " Hey_have_you_he-23-0512-0129-04-SPK-obama_perfect_front_256_trimmed.mp4_initial_prompt.npz original coarse_prompt: (2, 384)\n", " Tokens: [976, 228, 131] ... 
[808, 690, 604]\n", " Tokens: [364, 669, 114] ... [37, 37, 953]\n", " Hey_have_you_he-23-0512-0129-04-SPK-obama_perfect_front_256_trimmed.mp4_initial_prompt.npz original fine_prompt: (8, 384)\n", "\n", "Hey_have_you_he-23-0512-0129-04-SPK-obama_perfect_front_256_trimmed.mp4_initial_prompt.npz resized from front\n", " Hey_have_you_he-23-0512-0129-04-SPK-obama_perfect_front_256_trimmed.mp4_initial_prompt.npz resized from front semantic_prompt: (128,)\n", " Tokens: [7059, 9406, 245] ... [134, 134, 7141]\n", " Hey_have_you_he-23-0512-0129-04-SPK-obama_perfect_front_256_trimmed.mp4_initial_prompt.npz resized from front coarse_prompt: (2, 192)\n", " Tokens: [976, 228, 131] ... [835, 835, 835]\n", " Tokens: [364, 669, 114] ... [913, 518, 913]\n", " Hey_have_you_he-23-0512-0129-04-SPK-obama_perfect_front_256_trimmed.mp4_initial_prompt.npz resized from front fine_prompt: (8, 192)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0129-04-SPK-obama_perfect_front_256_trimmed.mp4_initial_prompt_front_128.npz\n", "\n", "Hey_have_you_he-23-0512-0129-04-SPK-obama_perfect_front_256_trimmed.mp4_initial_prompt.npz resized from back\n", " Hey_have_you_he-23-0512-0129-04-SPK-obama_perfect_front_256_trimmed.mp4_initial_prompt.npz resized from back semantic_prompt: (128,)\n", " Tokens: [134, 463, 134] ... [558, 4048, 298]\n", " Hey_have_you_he-23-0512-0129-04-SPK-obama_perfect_front_256_trimmed.mp4_initial_prompt.npz resized from back coarse_prompt: (2, 192)\n", " Tokens: [738, 738, 835] ... [808, 690, 604]\n", " Tokens: [544, 544, 518] ... [37, 37, 953]\n", " Hey_have_you_he-23-0512-0129-04-SPK-obama_perfect_front_256_trimmed.mp4_initial_prompt.npz resized from back fine_prompt: (8, 192)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0129-04-SPK-obama_perfect_front_256_trimmed.mp4_initial_prompt_back_128.npz\n", "\n", "Hey_have_you_he-23-0512-0129-04-SPK-obama_perfect_front_256_trimmed.mp4_initial_prompt.npz resized from front\n", " Hey_have_you_he-23-0512-0129-04-SPK-obama_perfect_front_256_trimmed.mp4_initial_prompt.npz resized from front semantic_prompt: (192,)\n", " Tokens: [7059, 9406, 245] ... [41, 401, 3573]\n", " Hey_have_you_he-23-0512-0129-04-SPK-obama_perfect_front_256_trimmed.mp4_initial_prompt.npz resized from front coarse_prompt: (2, 288)\n", " Tokens: [976, 228, 131] ... [38, 872, 726]\n", " Tokens: [364, 669, 114] ... [226, 564, 320]\n", " Hey_have_you_he-23-0512-0129-04-SPK-obama_perfect_front_256_trimmed.mp4_initial_prompt.npz resized from front fine_prompt: (8, 288)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0129-04-SPK-obama_perfect_front_256_trimmed.mp4_initial_prompt_front_192.npz\n", "\n", "Hey_have_you_he-23-0512-0129-04-SPK-obama_perfect_front_256_trimmed.mp4_initial_prompt.npz resized from back\n", " Hey_have_you_he-23-0512-0129-04-SPK-obama_perfect_front_256_trimmed.mp4_initial_prompt.npz resized from back semantic_prompt: (192,)\n", " Tokens: [446, 7197, 134] ... [558, 4048, 298]\n", " Hey_have_you_he-23-0512-0129-04-SPK-obama_perfect_front_256_trimmed.mp4_initial_prompt.npz resized from back coarse_prompt: (2, 288)\n", " Tokens: [738, 835, 835] ... [808, 690, 604]\n", " Tokens: [544, 424, 363] ... 
[37, 37, 953]\n", " Hey_have_you_he-23-0512-0129-04-SPK-obama_perfect_front_256_trimmed.mp4_initial_prompt.npz resized from back fine_prompt: (8, 288)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0129-04-SPK-obama_perfect_front_256_trimmed.mp4_initial_prompt_back_192.npz\n", "Loading needs_fixing/Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz\n", "semantic_prompt_max: 670\n", "\n", "Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz original\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz original semantic_prompt: (670,)\n", " Tokens: [298, 415, 656] ... [2035, 4228, 1732] <256 from end> [41, 3279, 2065] ... [567, 3249, 1011]\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz original coarse_prompt: (2, 1007)\n", " Tokens: [604, 834, 740] ... [834, 875, 131] <384 from end> [344, 1010, 837] ... [764, 534, 982]\n", " Tokens: [953, 571, 195] ... [516, 777, 980] <384 from end> [646, 841, 763] ... [334, 864, 995]\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz original fine_prompt: (8, 1007)\n", "\n", "Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from front\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from front semantic_prompt: (128,)\n", " Tokens: [298, 415, 656] ... [5127, 7134, 3934]\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from front coarse_prompt: (2, 192)\n", " Tokens: [604, 834, 740] ... [651, 561, 942]\n", " Tokens: [953, 571, 195] ... [73, 767, 114]\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from front fine_prompt: (8, 192)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4_front_128.npz\n", "\n", "Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from back\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from back semantic_prompt: (128,)\n", " Tokens: [9198, 2270, 557] ... [567, 3249, 1011]\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from back coarse_prompt: (2, 192)\n", " Tokens: [983, 658, 950] ... [764, 534, 982]\n", " Tokens: [166, 166, 166] ... [334, 864, 995]\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from back fine_prompt: (8, 192)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4_back_128.npz\n", "\n", "Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from front\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from front semantic_prompt: (192,)\n", " Tokens: [298, 415, 656] ... [1400, 441, 100]\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from front coarse_prompt: (2, 288)\n", " Tokens: [604, 834, 740] ... [1017, 835, 855]\n", " Tokens: [953, 571, 195] ... 
[765, 424, 544]\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from front fine_prompt: (8, 288)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4_front_192.npz\n", "\n", "Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from back\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from back semantic_prompt: (192,)\n", " Tokens: [5764, 5764, 7111] ... [567, 3249, 1011]\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from back coarse_prompt: (2, 288)\n", " Tokens: [259, 112, 112] ... [764, 534, 982]\n", " Tokens: [685, 647, 6] ... [334, 864, 995]\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from back fine_prompt: (8, 288)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4_back_192.npz\n", "\n", "Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from front\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from front semantic_prompt: (256,)\n", " Tokens: [298, 415, 656] ... [1620, 41, 41]\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from front coarse_prompt: (2, 384)\n", " Tokens: [604, 834, 740] ... [491, 491, 976]\n", " Tokens: [953, 571, 195] ... [859, 937, 877]\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from front fine_prompt: (8, 384)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4_front_256.npz\n", "\n", "Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from back\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from back semantic_prompt: (256,)\n", " Tokens: [41, 3279, 2065] ... [567, 3249, 1011]\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from back coarse_prompt: (2, 384)\n", " Tokens: [344, 1010, 837] ... [764, 534, 982]\n", " Tokens: [646, 841, 763] ... [334, 864, 995]\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from back fine_prompt: (8, 384)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4_back_256.npz\n", "\n", "Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from front\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from front semantic_prompt: (320,)\n", " Tokens: [298, 415, 656] ... [3053, 206, 206] <256 from end> [9320, 206, 2381] ... [134, 463, 134]\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from front coarse_prompt: (2, 480)\n", " Tokens: [604, 834, 740] ... [835, 835, 835] <384 from end> [835, 835, 835] ... [835, 835, 835]\n", " Tokens: [953, 571, 195] ... [424, 424, 424] <384 from end> [424, 424, 424] ... 
[913, 913, 913]\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from front fine_prompt: (8, 480)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4_front_320.npz\n", "\n", "Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from back\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from back semantic_prompt: (320,)\n", " Tokens: [2926, 1919, 5413] ... [2035, 4228, 1732] <256 from end> [41, 3279, 2065] ... [567, 3249, 1011]\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from back coarse_prompt: (2, 480)\n", " Tokens: [112, 650, 185] ... [834, 875, 131] <384 from end> [344, 1010, 837] ... [764, 534, 982]\n", " Tokens: [511, 546, 79] ... [516, 777, 980] <384 from end> [646, 841, 763] ... [334, 864, 995]\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from back fine_prompt: (8, 480)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4_back_320.npz\n", "\n", "Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from front\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from front semantic_prompt: (384,)\n", " Tokens: [298, 415, 656] ... [5127, 7134, 3934] <256 from end> [4222, 2211, 58] ... [56, 7968, 206]\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from front coarse_prompt: (2, 577)\n", " Tokens: [604, 834, 740] ... [561, 942, 942] <384 from end> [942, 467, 967] ... [835, 835, 835]\n", " Tokens: [953, 571, 195] ... [767, 114, 118] <384 from end> [342, 359, 743] ... [424, 518, 518]\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from front fine_prompt: (8, 577)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4_front_384.npz\n", "\n", "Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from back\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from back semantic_prompt: (384,)\n", " Tokens: [206, 206, 2381] ... [2035, 4228, 1732] <256 from end> [41, 3279, 2065] ... [567, 3249, 1011]\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from back coarse_prompt: (2, 577)\n", " Tokens: [835, 835, 835] ... [834, 875, 131] <384 from end> [344, 1010, 837] ... [764, 534, 982]\n", " Tokens: [913, 424, 518] ... [516, 777, 980] <384 from end> [646, 841, 763] ... [334, 864, 995]\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from back fine_prompt: (8, 577)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4_back_384.npz\n", "\n", "Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from front\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from front semantic_prompt: (448,)\n", " Tokens: [298, 415, 656] ... [1400, 441, 100] <256 from end> [196, 282, 672] ... [5006, 8398, 27]\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from front coarse_prompt: (2, 673)\n", " Tokens: [604, 834, 740] ... [835, 855, 106] <384 from end> [855, 496, 463] ... [796, 131, 1019]\n", " Tokens: [953, 571, 195] ... 
[424, 544, 424] <384 from end> [544, 1023, 841] ... [482, 685, 839]\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from front fine_prompt: (8, 673)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4_front_448.npz\n", "\n", "Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from back\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from back semantic_prompt: (448,)\n", " Tokens: [3715, 3085, 41] ... [2035, 4228, 1732] <256 from end> [41, 3279, 2065] ... [567, 3249, 1011]\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from back coarse_prompt: (2, 673)\n", " Tokens: [983, 523, 976] ... [834, 875, 131] <384 from end> [344, 1010, 837] ... [764, 534, 982]\n", " Tokens: [199, 147, 371] ... [516, 777, 980] <384 from end> [646, 841, 763] ... [334, 864, 995]\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from back fine_prompt: (8, 673)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4_back_448.npz\n", "\n", "Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from front\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from front semantic_prompt: (512,)\n", " Tokens: [298, 415, 656] ... [1620, 41, 41] <256 from end> [105, 5737, 3354] ... [6202, 6202, 6051]\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from front coarse_prompt: (2, 769)\n", " Tokens: [604, 834, 740] ... [491, 976, 976] <384 from end> [344, 451, 683] ... [983, 30, 30]\n", " Tokens: [953, 571, 195] ... [937, 877, 49] <384 from end> [975, 537, 547] ... [843, 856, 796]\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from front fine_prompt: (8, 769)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4_front_512.npz\n", "\n", "Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from back\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from back semantic_prompt: (512,)\n", " Tokens: [5407, 8175, 4172] ... [2035, 4228, 1732] <256 from end> [41, 3279, 2065] ... [567, 3249, 1011]\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from back coarse_prompt: (2, 769)\n", " Tokens: [1021, 838, 327] ... [834, 875, 131] <384 from end> [344, 1010, 837] ... [764, 534, 982]\n", " Tokens: [541, 955, 195] ... [516, 777, 980] <384 from end> [646, 841, 763] ... [334, 864, 995]\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from back fine_prompt: (8, 769)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4_back_512.npz\n", "\n", "Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from front\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from front semantic_prompt: (576,)\n", " Tokens: [298, 415, 656] ... [134, 463, 134] <256 from end> [463, 147, 3302] ... [9673, 1566, 3648]\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from front coarse_prompt: (2, 865)\n", " Tokens: [604, 834, 740] ... 
[835, 835, 835] <384 from end> [835, 835, 835] ... [1011, 402, 393]\n", " Tokens: [953, 571, 195] ... [913, 913, 913] <384 from end> [913, 913, 913] ... [534, 546, 880]\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from front fine_prompt: (8, 865)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4_front_576.npz\n", "\n", "Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from back\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from back semantic_prompt: (576,)\n", " Tokens: [134, 2403, 2403] ... [2035, 4228, 1732] <256 from end> [41, 3279, 2065] ... [567, 3249, 1011]\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from back coarse_prompt: (2, 865)\n", " Tokens: [835, 738, 835] ... [834, 875, 131] <384 from end> [344, 1010, 837] ... [764, 534, 982]\n", " Tokens: [913, 544, 518] ... [516, 777, 980] <384 from end> [646, 841, 763] ... [334, 864, 995]\n", " Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4.npz resized from back fine_prompt: (8, 865)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0129-36-SPK-obama_perfect_front_256_trimmed.mp4_back_576.npz\n", "Loading needs_fixing/Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz\n", "semantic_prompt_max: 703\n", "\n", "Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz original\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz original semantic_prompt: (703,)\n", " Tokens: [38, 38, 7769] ... [41, 41, 3135] <256 from end> [2065, 2065, 4579] ... [3252, 3174, 91]\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz original coarse_prompt: (2, 1056)\n", " Tokens: [347, 208, 583] ... [976, 976, 1001] <384 from end> [428, 683, 428] ... [408, 408, 408]\n", " Tokens: [560, 964, 924] ... [729, 729, 266] <384 from end> [212, 95, 996] ... [518, 518, 518]\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz original fine_prompt: (8, 1056)\n", "\n", "Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from front\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from front semantic_prompt: (128,)\n", " Tokens: [38, 38, 7769] ... [266, 10, 266]\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from front coarse_prompt: (2, 192)\n", " Tokens: [347, 208, 583] ... [738, 408, 738]\n", " Tokens: [560, 964, 924] ... [913, 424, 544]\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from front fine_prompt: (8, 192)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_front_128.npz\n", "\n", "Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from back\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from back semantic_prompt: (128,)\n", " Tokens: [31, 67, 196] ... [3252, 3174, 91]\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from back coarse_prompt: (2, 192)\n", " Tokens: [339, 121, 408] ... 
[408, 408, 408]\n", " Tokens: [974, 913, 424] ... [518, 518, 518]\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from back fine_prompt: (8, 192)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_back_128.npz\n", "\n", "Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from front\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from front semantic_prompt: (192,)\n", " Tokens: [38, 38, 7769] ... [147, 8610, 8610]\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from front coarse_prompt: (2, 288)\n", " Tokens: [347, 208, 583] ... [408, 408, 408]\n", " Tokens: [560, 964, 924] ... [518, 518, 518]\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from front fine_prompt: (8, 288)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_front_192.npz\n", "\n", "Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from back\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from back semantic_prompt: (192,)\n", " Tokens: [12, 99, 401] ... [3252, 3174, 91]\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from back coarse_prompt: (2, 288)\n", " Tokens: [475, 475, 25] ... [408, 408, 408]\n", " Tokens: [913, 519, 519] ... [518, 518, 518]\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from back fine_prompt: (8, 288)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_back_192.npz\n", "\n", "Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from front\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from front semantic_prompt: (256,)\n", " Tokens: [38, 38, 7769] ... [41, 3588, 3691]\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from front coarse_prompt: (2, 384)\n", " Tokens: [347, 208, 583] ... [753, 753, 690]\n", " Tokens: [560, 964, 924] ... [896, 896, 836]\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from front fine_prompt: (8, 384)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_front_256.npz\n", "\n", "Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from back\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from back semantic_prompt: (256,)\n", " Tokens: [2065, 2065, 4579] ... [3252, 3174, 91]\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from back coarse_prompt: (2, 384)\n", " Tokens: [428, 683, 428] ... [408, 408, 408]\n", " Tokens: [212, 95, 996] ... 
[518, 518, 518]\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from back fine_prompt: (8, 384)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_back_256.npz\n", "\n", "Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from front\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from front semantic_prompt: (320,)\n", " Tokens: [38, 38, 7769] ... [4388, 198, 198] <256 from end> [198, 352, 1658] ... [27, 27, 3971]\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from front coarse_prompt: (2, 480)\n", " Tokens: [347, 208, 583] ... [151, 976, 276] <384 from end> [276, 208, 208] ... [738, 779, 537]\n", " Tokens: [560, 964, 924] ... [617, 772, 617] <384 from end> [667, 772, 772] ... [544, 200, 43]\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from front fine_prompt: (8, 480)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_front_320.npz\n", "\n", "Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from back\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from back semantic_prompt: (320,)\n", " Tokens: [576, 7934, 7934] ... [41, 41, 3135] <256 from end> [2065, 2065, 4579] ... [3252, 3174, 91]\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from back coarse_prompt: (2, 480)\n", " Tokens: [1022, 679, 216] ... [976, 976, 1001] <384 from end> [428, 683, 428] ... [408, 408, 408]\n", " Tokens: [266, 199, 486] ... [729, 729, 266] <384 from end> [212, 95, 996] ... [518, 518, 518]\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from back fine_prompt: (8, 480)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_back_320.npz\n", "\n", "Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from front\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from front semantic_prompt: (384,)\n", " Tokens: [38, 38, 7769] ... [266, 10, 266] <256 from end> [266, 6032, 206] ... [10, 41, 576]\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from front coarse_prompt: (2, 577)\n", " Tokens: [347, 208, 583] ... [408, 738, 738] <384 from end> [738, 408, 408] ... [408, 676, 1022]\n", " Tokens: [560, 964, 924] ... [424, 544, 518] <384 from end> [544, 424, 424] ... [424, 648, 266]\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from front fine_prompt: (8, 577)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_front_384.npz\n", "\n", "Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from back\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from back semantic_prompt: (384,)\n", " Tokens: [3971, 3971, 3971] ... [41, 41, 3135] <256 from end> [2065, 2065, 4579] ... 
[3252, 3174, 91]\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from back coarse_prompt: (2, 577)\n", " Tokens: [537, 339, 537] ... [976, 976, 1001] <384 from end> [428, 683, 428] ... [408, 408, 408]\n", " Tokens: [43, 241, 43] ... [729, 729, 266] <384 from end> [212, 95, 996] ... [518, 518, 518]\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from back fine_prompt: (8, 577)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_back_384.npz\n", "\n", "Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from front\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from front semantic_prompt: (448,)\n", " Tokens: [38, 38, 7769] ... [147, 8610, 8610] <256 from end> [8610, 147, 1532] ... [41, 3135, 2065]\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from front coarse_prompt: (2, 673)\n", " Tokens: [347, 208, 583] ... [408, 408, 408] <384 from end> [408, 408, 408] ... [976, 1001, 428]\n", " Tokens: [560, 964, 924] ... [518, 518, 518] <384 from end> [518, 518, 518] ... [729, 266, 212]\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from front fine_prompt: (8, 673)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_front_448.npz\n", "\n", "Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from back\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from back semantic_prompt: (448,)\n", " Tokens: [3691, 6621, 6621] ... [41, 41, 3135] <256 from end> [2065, 2065, 4579] ... [3252, 3174, 91]\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from back coarse_prompt: (2, 673)\n", " Tokens: [690, 983, 428] ... [976, 976, 1001] <384 from end> [428, 683, 428] ... [408, 408, 408]\n", " Tokens: [836, 851, 511] ... [729, 729, 266] <384 from end> [212, 95, 996] ... [518, 518, 518]\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from back fine_prompt: (8, 673)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_back_448.npz\n", "\n", "Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from front\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from front semantic_prompt: (512,)\n", " Tokens: [38, 38, 7769] ... [41, 3588, 3691] <256 from end> [6621, 6621, 4071] ... [118, 12, 12]\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from front coarse_prompt: (2, 769)\n", " Tokens: [347, 208, 583] ... [753, 690, 983] <384 from end> [428, 428, 428] ... [738, 835, 475]\n", " Tokens: [560, 964, 924] ... [896, 836, 851] <384 from end> [511, 1011, 113] ... 
[765, 518, 913]\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from front fine_prompt: (8, 769)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_front_512.npz\n", "\n", "Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from back\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from back semantic_prompt: (512,)\n", " Tokens: [8610, 8610, 147] ... [41, 41, 3135] <256 from end> [2065, 2065, 4579] ... [3252, 3174, 91]\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from back coarse_prompt: (2, 769)\n", " Tokens: [408, 408, 408] ... [976, 976, 1001] <384 from end> [428, 683, 428] ... [408, 408, 408]\n", " Tokens: [518, 518, 518] ... [729, 729, 266] <384 from end> [212, 95, 996] ... [518, 518, 518]\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from back fine_prompt: (8, 769)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_back_512.npz\n", "\n", "Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from front\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from front semantic_prompt: (576,)\n", " Tokens: [38, 38, 7769] ... [27, 27, 3971] <256 from end> [3971, 3971, 1863] ... [28, 31, 31]\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from front coarse_prompt: (2, 865)\n", " Tokens: [347, 208, 583] ... [779, 537, 339] <384 from end> [537, 395, 887] ... [463, 463, 339]\n", " Tokens: [560, 964, 924] ... [200, 43, 241] <384 from end> [43, 544, 580] ... [974, 646, 974]\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from front fine_prompt: (8, 865)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_front_576.npz\n", "\n", "Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from back\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from back semantic_prompt: (576,)\n", " Tokens: [266, 266, 6032] ... [41, 41, 3135] <256 from end> [2065, 2065, 4579] ... [3252, 3174, 91]\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from back coarse_prompt: (2, 865)\n", " Tokens: [738, 738, 738] ... [976, 976, 1001] <384 from end> [428, 683, 428] ... [408, 408, 408]\n", " Tokens: [544, 518, 544] ... [729, 729, 266] <384 from end> [212, 95, 996] ... 
[518, 518, 518]\n", " Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4.npz resized from back fine_prompt: (8, 865)\n", "Saving needs_fixing/vars/Hey_have_you_he-23-0512-0131-07-SPK-trump_mp4_front_384.mp4_back_256_1_trimmed.mp4_back_576.npz\n" ] } ], "source": [ "# Speaker Segmenting\n", "\n", "npz_directory = \"atten/\"\n", "\n", "print(f\"Rendering samples for speakers in: {npz_directory}\")\n", "npz_files = [f for f in os.listdir(npz_directory) if f.endswith(\".npz\")]\n", "\n", "\n", "\n", "new_directory = os.path.join(npz_directory, \"vars\")\n", "# make new directory\n", "if not os.path.exists(new_directory):\n", " os.makedirs(new_directory)\n", " \n", "for i, npz_file in enumerate(npz_files):\n", "\n", " npz_filepath = os.path.join(npz_directory, npz_file)\n", "\n", "\n", " \n", " start_size = 128\n", " increment_size = 64\n", "\n", " print(f\"Loading {npz_filepath}\")\n", " history_prompt = load_npz(npz_filepath)\n", "\n", " semantic_prompt_max = history_prompt[\"semantic_prompt\"].shape[0]\n", " print(f\"semantic_prompt_max: {semantic_prompt_max}\")\n", " \n", " show_history_prompt_size(history_prompt, token_samples=3, text=f\"{npz_file} original\")\n", " # Increase start_size by increment_size until we reach semantic_prompt_max, save each file\n", " while start_size <= semantic_prompt_max - increment_size:\n", "\n", "\n", "\n", "\n", " # from the front\n", " new_history_prompt = resize_history_prompt(history_prompt, tokens=start_size, from_front=True)\n", " show_history_prompt_size(new_history_prompt, token_samples=3, text=f\"{npz_file} resized from front\")\n", "\n", " \n", " new_filename = f\"{npz_file[:-4]}_front_{start_size}.npz\"\n", " new_filepath = os.path.join(new_directory, new_filename)\n", " \n", "\n", " print(f\"Saving {new_filepath}\")\n", " \n", " new_history_prompt = resize_history_prompt(new_history_prompt, tokens=341, from_front=False)\n", " save_as_prompt(new_filepath,new_history_prompt)\n", "\n", " # from the back\n", " new_history_prompt = resize_history_prompt(history_prompt, tokens=start_size, from_front=False)\n", " show_history_prompt_size(new_history_prompt, token_samples=3, text=f\"{npz_file} resized from back\")\n", " new_filename = f\"{npz_file[:-4]}_back_{start_size}.npz\"\n", " \n", " \n", " new_filepath = os.path.join(new_directory, new_filename)\n", " print(f\"Saving {new_filepath}\")\n", " new_history_prompt = resize_history_prompt(new_history_prompt, tokens=341, from_front=False)\n", " save_as_prompt(new_filepath,new_history_prompt)\n", "\n", " start_size += increment_size\n", "\n", "\n", "\n", " \n", "\n" ] }, { "cell_type": "code", "execution_count": 2, "id": "ac64a2c3", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Rendering samples for speakers in: sendit/\n", "Loading sendit/female_reader_neutral_2.npz\n", "Saving sendit/trimmed/french_female_0_trimmed.npz\n", "Loading sendit/french_female_4b.npz\n", "Saving sendit/trimmed/french_female_1_trimmed.npz\n", "Loading sendit/034_fr_dialog.mp4.npz\n", "Saving sendit/trimmed/french_female_2_trimmed.npz\n", "Loading sendit/081_bark_fr_woman_chanson.mp4.npz\n", "Saving sendit/trimmed/french_female_3_trimmed.npz\n", "Loading sendit/french_female_1.npz\n", "Saving sendit/trimmed/french_female_4_trimmed.npz\n", "Loading sendit/062_bark_fr_woman_chanson.mp4.npz\n", "Saving sendit/trimmed/french_female_5_trimmed.npz\n", "Loading sendit/063_bark_fr_woman_chanson.mp4.npz\n", "Saving sendit/trimmed/french_female_6_trimmed.npz\n", 
"Loading sendit/female_neutral_reader_1.npz\n", "Saving sendit/trimmed/french_female_7_trimmed.npz\n", "Loading sendit/080_bark_fr_woman_chanson.mp4.npz\n", "Saving sendit/trimmed/french_female_8_trimmed.npz\n", "Loading sendit/female_french.npz\n", "Saving sendit/trimmed/french_female_9_trimmed.npz\n", "Loading sendit/french_female_3a.npz\n", "Saving sendit/trimmed/french_female_10_trimmed.npz\n", "Loading sendit/female_french_might_not_gen.npz\n", "Saving sendit/trimmed/french_female_11_trimmed.npz\n", "Loading sendit/french_female_4a.npz\n", "Saving sendit/trimmed/french_female_12_trimmed.npz\n", "Loading sendit/033_fr_dialog.mp4.npz\n", "Saving sendit/trimmed/french_female_13_trimmed.npz\n", "Loading sendit/014_fr_dialog.mp4.npz\n", "Saving sendit/trimmed/french_female_14_trimmed.npz\n", "Loading sendit/013_fr_dialog.mp4.npz\n", "Saving sendit/trimmed/french_female_15_trimmed.npz\n", "Loading sendit/french_female.npz\n", "Saving sendit/trimmed/french_female_16_trimmed.npz\n", "Loading sendit/061_bark_fr_woman_chanson.mp4.npz\n", "Saving sendit/trimmed/french_female_17_trimmed.npz\n", "Loading sendit/female_1.npz\n", "Saving sendit/trimmed/french_female_18_trimmed.npz\n", "Loading sendit/french_female_3b.npz\n", "Saving sendit/trimmed/french_female_19_trimmed.npz\n" ] } ], "source": [ "# Trim files\n", "\n", "npz_directory = \"sendit/\"\n", "\n", "print(f\"Rendering samples for speakers in: {npz_directory}\")\n", "npz_files = [f for f in os.listdir(npz_directory) if f.endswith(\".npz\")]\n", "\n", "\n", "\n", "new_directory = os.path.join(npz_directory, \"trimmed\")\n", "# make new directory\n", "if not os.path.exists(new_directory):\n", " os.makedirs(new_directory)\n", " \n", "for i, npz_file in enumerate(npz_files):\n", " \n", " npz_filepath = os.path.join(npz_directory, npz_file)\n", "\n", "\n", "\n", "\n", " print(f\"Loading {npz_filepath}\")\n", " history_prompt = load_npz(npz_filepath)\n", "\n", " new_filename = f\"french_female_{i}_trimmed.npz\"\n", " new_filepath = os.path.join(new_directory, new_filename)\n", " print(f\"Saving {new_filepath}\")\n", " history_prompt = resize_history_prompt(history_prompt, tokens=341, from_front=False)\n", " save_as_prompt(new_filepath,history_prompt)\n", "\n", "\n", "\n", "\n", " \n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "3fb20e9f", "metadata": {}, "outputs": [], "source": [ "# Speaker Segmenting\n", "\n", "npz_directory = \"Trump/\"\n", "\n", "print(f\"Rendering samples for speakers in: {npz_directory}\")\n", "npz_files = [f for f in os.listdir(npz_directory) if f.endswith(\".npz\")]\n", "\n", "\n", "\n", "\n", "SPEAKER = testing_SPEAKER\n", "\n", "other_SPEAKER = \"custom_speakers/classic_robot_tts.npz\"\n", "\n", "og_history_prompt = load_npz(SPEAKER)\n", "other_history_prompt = load_npz(other_SPEAKER)\n", "\n", "\n", "\n", "\n", "show_history_prompt_size(og_history_prompt, text=f\"f{SPEAKER} Original\")\n", "show_history_prompt_size(other_history_prompt, text=f\"{other_SPEAKER} Other\")\n", "\n", "cell_text_prompt = charlie_text\n", "\n", "previous_segment_token_size = 128\n", "\n", "\n", "\n", "\n", "\n", "speaker_blend = merge_history_prompts(og_history_prompt, other_history_prompt, right_size=previous_segment_token_size)\n", "\n", "show_history_prompt_size(speaker_blend, text=f\"Base {SPEAKER} with {previous_segment_token_size} tokens from {other_SPEAKER}\")\n", "\n", "\"\"\"\n", "other_history_prompt_resize = resize_history_prompt(other_history_prompt, tokens=previous_segment_token_size, from_front=False)\n", 
"og_history_prompt_trimmed = resize_history_prompt(og_history_prompt, tokens=341, from_front=False)\n", "\n", "\n", "speaker_blend = concat_history_prompts(og_history_prompt_trimmed, other_history_prompt_resize)\n", "show_history_prompt_size(speaker_blend, text=f\"Base {SPEAKER} with {previous_segment_token_size} tokens from {other_SPEAKER}\")\n", "\"\"\"\n", "\n", "all_history_prompts = []\n", "\n", "all_history_prompts.append([og_history_prompt, f\"{SPEAKER} Original\"])\n", "all_history_prompts.append([other_history_prompt, f\"{other_SPEAKER } Other\"])\n", "all_history_prompts.append([speaker_blend, f\"Orig {SPEAKER} + {previous_segment_token_size} tokens from {other_SPEAKER}\"])\n", "\n", "\n", "final_audio_clips = []\n", "\n", "\n", "for history_prompt, text in all_history_prompts:\n", " print(f\"\\n-->Generating for {text}\")\n", "\n", " pieces = []\n", "\n", " show_history_prompt_size(history_prompt, token_samples=3, text=text)\n", "\n", " with measure_time(text=\" Regular Generation\"):\n", "\n", " set_seed(testing_seed)\n", " full_generation, audio_array = generate_audio(cell_text_prompt, history_prompt=history_prompt, output_full=True, silent=True)\n", " pieces += [audio_array]\n", " \n", " show_history_prompt_size(full_generation, text=f\" {text} full_generation output\")\n", "\n", " final_audio_clips.append(pieces)\n", "\n", "\n", "\n", "\"\"\"\n", "show_history_prompt_size(speaker_blend, token_samples=3, text=f\"{other_SPEAKER}\")\n", "\n", "pieces = []\n", "\n", "show_history_prompt_size(og_history_prompt, token_samples=3, text=\"Other history_prompt file\")\n", "\n", "\n", "\n", "\n", "sure_time(text=\"Regular Other\"):\n", "\n", " set_seed(testing_seed)\n", " og_full_generation, audio_array = generate_audio(cell_text_prompt, history_prompt=og_history_prompt, output_full=True)\n", " pieces += [audio_array]\n", " \n", "show_history_prompt_size(og_full_generation, text=\"Regular Other\")\n", "\n", "final_audio_clips.append(pieces)\n", "\n", "\n", "\n", "show_history_prompt_size(speaker_blend, token_samples=3, text=f\"speaker_blend\")\n", "\n", "pieces = []\n", "\n", "with measure_time(text=f\"new_speaker_blend\"):\n", " set_seed(testing_seed)\n", " new_speaker_blend_output, audio_array = generate_audio(cell_text_prompt, history_prompt=speaker_blend, output_full=True)\n", " pieces += [audio_array]\n", " \n", "show_history_prompt_size(new_speaker_blend_output, text=f\"speaker_blend Output\")\n", "\n", "final_audio_clips.append(pieces)\n", "\n", "\n", "#compare_history_prompts(og_full_generation, new_speaker_blend_2)\n", "\"\"\"\n", " \n", "\n", "\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "17400a9b", "metadata": { "scrolled": true }, "outputs": [], "source": [ "# Let's check 256 semantic tokens to exact match the original history prompt, and 341 to match the fine prompt too.\n", "# And the coarse uses only like 209 of the semantic. So low! 
I hope we can pack the inference token space with more history!\n", "\n", "SPEAKER = testing_SPEAKER\n", "cell_text_prompt = charlie_text\n", "\n", "\n", "\n", "\n", "og_history_prompt = load_npz(SPEAKER)\n", "\n", "show_history_prompt_size(og_history_prompt, token_samples=3, text=\"Original history_prompt file\")\n", "\n", "\n", "final_audio_clips = []\n", "\n", "\n", "og_full_generation = None\n", "pieces = []\n", "\n", "\n", "with measure_time(text=\"Regular Generation\"):\n", "\n", " set_seed(testing_seed)\n", " og_full_generation, audio_array = generate_audio(cell_text_prompt, history_prompt=og_history_prompt, output_full=True)\n", " pieces += [audio_array]\n", " \n", "show_history_prompt_size(og_full_generation, text=\"Regular Output\")\n", "\n", "final_audio_clips.append(pieces)\n", "\n", "\n", "\n", "resized_to = 256 # this will give identical sem and coarse. \n", "resized_to = 341 # this will give identical fine too\n", "\n", "resized_history_prompt = resize_history_prompt(og_history_prompt, tokens=resized_to, from_front=False)\n", "show_history_prompt_size(resized_history_prompt, token_samples=3, text=f\"Resized to {resized_to}\")\n", "\n", "pieces = []\n", "\n", "with measure_time(text=f\"Resized to {resized_to}\"):\n", " set_seed(testing_seed)\n", " resized_256_full_generation, audio_array = generate_audio(cell_text_prompt, history_prompt=resized_history_prompt, output_full=True)\n", " pieces += [audio_array]\n", " \n", "show_history_prompt_size(resized_256_full_generation, text=f\"Output after resize: {resized_to}\")\n", "\n", "final_audio_clips.append(pieces)\n", "\n", "\n", "compare_history_prompts(og_full_generation, resized_256_full_generation)\n", "\n", " \n", "\n", "resized_to = 255\n", "resized_to = 340 \n", "resized_history_prompt_too_small = resize_history_prompt(og_history_prompt, tokens=resized_to, from_front=False)\n", "show_history_prompt_size(resized_history_prompt_too_small, token_samples=3, text=f\"Resized to {resized_to}\")\n", "\n", "pieces = []\n", "with measure_time(text=f\"Resized to {resized_to}\"):\n", " set_seed(testing_seed)\n", " resized_too_small_full_generation, audio_array = generate_audio(cell_text_prompt, history_prompt=resized_history_prompt_too_small, output_full=True)\n", " pieces += [audio_array]\n", " \n", "show_history_prompt_size(resized_too_small_full_generation, text=f\"Output after {resized_to}\")\n", "\n", "compare_history_prompts(og_full_generation, resized_too_small_full_generation)\n", "\n", "\n", "final_audio_clips.append(pieces)\n", "\n", "\n", "\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 77, "id": "f5095773", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Audio(np.concatenate(final_audio_clips[0]), rate=SAMPLE_RATE)\n", "# original" ] }, { "cell_type": "code", "execution_count": 78, "id": "b6210dd5", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 78, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Audio(np.concatenate(final_audio_clips[1]), rate=SAMPLE_RATE)\n", "# 256" ] }, { "cell_type": "code", "execution_count": 79, "id": "77b92fbc", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Audio(np.concatenate(final_audio_clips[2]), 
rate=SAMPLE_RATE)\n", "# 255" ] }, { "attachments": {}, "cell_type": "markdown", "id": "6d13249b", "metadata": {}, "source": [ "# $ \\\\ $" ] }, { "attachments": {}, "cell_type": "markdown", "id": "cdfc8bf5", "metadata": {}, "source": [ "# Advanced Long-Form Generation\n", "Somtimes Bark will hallucinate a little extra audio at the end of the prompt.\n", "We can solve this issue by lowering the threshold for bark to stop generating text. \n", "We use the `min_eos_p` kwarg in `generate_text_semantic`" ] }, { "cell_type": "code", "execution_count": 60, "id": "62807fd0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Enabling deterministic algorithms\n", "Set seed to 1234\n", "['Have I told', 'you that story', 'about how Charlie', 'Parker became Charlie', \"Parker? Parker's a\", 'young kid, pretty', 'good on the', 'Sax, gets up', 'to play at', 'a cutting session,', 'and well, he', 'fucks it up.']\n", "Have I told\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 100/100 [00:00<00:00, 193.68it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 2/2 [00:01<00:00, 1.74it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " -->Piece 0 Finished at: 2023-05-09 18:04:15 in 3.2909646034240723 seconds\n", "you that story\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 100/100 [00:00<00:00, 187.81it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 3/3 [00:01<00:00, 1.95it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " -->Piece 1 Finished at: 2023-05-09 18:04:19 in 3.8052828311920166 seconds\n", "about how Charlie\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 100/100 [00:00<00:00, 197.72it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 2/2 [00:01<00:00, 1.43it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " -->Piece 2 Finished at: 2023-05-09 18:04:22 in 3.587956666946411 seconds\n", "Parker became Charlie\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 100/100 [00:00<00:00, 106.82it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 5/5 [00:02<00:00, 1.87it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " -->Piece 3 Finished at: 2023-05-09 18:04:28 in 5.316859245300293 seconds\n", "Parker? 
Parker's a\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 100/100 [00:00<00:00, 101.79it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 5/5 [00:02<00:00, 1.74it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " -->Piece 4 Finished at: 2023-05-09 18:04:33 in 5.499145269393921 seconds\n", "young kid, pretty\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 100/100 [00:01<00:00, 97.89it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 5/5 [00:03<00:00, 1.66it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " -->Piece 5 Finished at: 2023-05-09 18:04:39 in 5.751179456710815 seconds\n", "good on the\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 100/100 [00:01<00:00, 70.66it/s] \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 7/7 [00:04<00:00, 1.69it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " -->Piece 6 Finished at: 2023-05-09 18:04:46 in 7.167700290679932 seconds\n", "Sax, gets up\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 100/100 [00:01<00:00, 66.56it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 8/8 [00:04<00:00, 1.78it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " -->Piece 7 Finished at: 2023-05-09 18:04:54 in 7.699352979660034 seconds\n", "to play at\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 100/100 [00:00<00:00, 185.02it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 3/3 [00:01<00:00, 1.94it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " -->Piece 8 Finished at: 2023-05-09 18:04:57 in 3.6623594760894775 seconds\n", "a cutting session,\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 100/100 [00:00<00:00, 124.79it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 4/4 [00:02<00:00, 1.69it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " -->Piece 9 Finished at: 2023-05-09 18:05:02 in 4.857580661773682 seconds\n", "and well, he\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 100/100 [00:00<00:00, 185.49it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 3/3 [00:01<00:00, 1.87it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " 
-->Piece 10 Finished at: 2023-05-09 18:05:06 in 3.7911927700042725 seconds\n", "fucks it up.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 100/100 [00:01<00:00, 68.06it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 7/7 [00:04<00:00, 1.58it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " -->Piece 11 Finished at: 2023-05-09 18:05:14 in 7.664571523666382 seconds\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Unmodified naive chunking code: just generate small audio fragments as clips, like the original version did with sentences.\n", "\n", "# Result: terrible.\n", "# If you ask generate_text_semantic to generate just 3 words, it still assumes that's a normal-sized audio clip,\n", "# so it sounds like a complete 3-word spoken utterance, not part of a sentence.\n", "\n", "# REMINDER TO TEST LATER: maybe we can preload generate_text_semantic with already-inferenced tokens, using the tokens in that space, instead of putting them in the history_prompt.\n", "# If we do this, will the words follow naturally, and will the result match an inference where we gave the whole sentence at once?\n", "\n", "GEN_TEMP = 0.6\n", "\n", "\n", "SPEAKER = testing_SPEAKER\n", "cell_text_prompt = charlie_text\n", "\n", "silence = np.zeros(int(0.25 * SAMPLE_RATE))  # quarter second of silence\n", "\n", "set_seed(testing_seed)\n", "\n", "pieces = split_by_words(cell_text_prompt, 3)\n", "print(pieces)\n", "final_pieces = []\n", "\n", "for i, piece in enumerate(pieces):\n", "    with measure_time(text=\"Piece\", index=i):\n", "        print(piece)\n", "        semantic_tokens = generate_text_semantic(\n", "            piece,\n", "            history_prompt=SPEAKER,\n", "            temp=GEN_TEMP,\n", "            min_eos_p=0.05,  # this controls how likely the generation is to end\n", "        )\n", "\n", "        audio_array = semantic_to_waveform(semantic_tokens, history_prompt=SPEAKER)\n", "        final_pieces.append(audio_array)\n", "\n", "Audio(np.concatenate(final_pieces), rate=SAMPLE_RATE)" ] }, { "cell_type": "code", "execution_count": 64, "id": "adf89062", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Enabling deterministic algorithms\n", "Set seed to 1234\n", "Have I told you that story about how Charlie Parker became Charlie Parker? 
Parker's a young kid, pretty good on the Sax, gets up to play at a cutting session, and well, he fucks it up.\n", " -->Semantic Finished at: 2023-05-09 18:13:41 in 6.372102499008179 seconds\n", " full len: 591\n", "length of coarse_semantic_tokens 1: 49\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "length of coarse_semantic_tokens 2: 49\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "length of coarse_semantic_tokens 3: 49\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "length of coarse_semantic_tokens 4: 50\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "length of coarse_semantic_tokens 5: 49\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "length of coarse_semantic_tokens 6: 49\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "length of coarse_semantic_tokens 7: 49\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "length of coarse_semantic_tokens 8: 50\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "length of coarse_semantic_tokens 9: 49\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "length of coarse_semantic_tokens 10: 49\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "length of coarse_semantic_tokens 11: 49\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "length of coarse_semantic_tokens 12: 50\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", " -->Coarse Full Finished at: 2023-05-09 18:14:20 in 38.90763521194458 seconds\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Test: First generate all semantic tokens in one go. Then iteratively chop up the semantic tokens into pieces and feed the coarse and fine models tiny chunks.\n", "\n", "# Result: kind of okay with large models.\n", "# Small models are okay too, except the small fine model, which clips like crazy.\n", "# You can still kind of feel the structure of the 3-word phrases, even though we generated the semantic tokens all at once. Though I'm not sure.\n", "# Update: nah, it's good, I'm just hearing the coarse and fine history prompt changes.\n", "# There is still some clipping. We're splitting at arbitrary indices; we could instead backtrack and erase spaces, they seem easy to recognize, or try to split on actual pauses or silence, rather than between words.\n", "\n", "# The other big flaw is we didn't bother updating the history prompt for coarse and semantic for each chunk. 
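Before trying that, here is a crude sketch of the pause-splitting idea from above (hypothetical and untested; it treats a long run of repeated semantic tokens as a rough proxy for a pause):\n",
"\n",
"# hypothetical sketch, not used below\n",
"def split_on_token_runs(tokens, min_run=6):\n",
"    # cut wherever the same semantic token repeats min_run times in a row\n",
"    chunks, start, run = [], 0, 1\n",
"    for i in range(1, len(tokens)):\n",
"        run = run + 1 if tokens[i] == tokens[i - 1] else 1\n",
"        if run == min_run:\n",
"            chunks.append(tokens[start:i + 1])\n",
"            start, run = i + 1, 1\n",
"    chunks.append(tokens[start:])\n",
"    return [c for c in chunks if len(c)]\n",
"\n",
"# 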
Let's try that next.\n", "\n", "GEN_TEMP = 0.6\n", "SPEAKER = testing_SPEAKER\n", "\n", "silence = np.zeros(int(0.25 * SAMPLE_RATE)) # quarter second of silence\n", "\n", "set_seed(testing_seed)\n", "\n", "cell_text_prompt = charlie_text\n", "\n", "number_of_semantic_pieces = 12\n", "\n", "final_pieces = [] \n", "\n", "full_text = charlie_text\n", "print(full_text)\n", "\n", "semantic_tokens = []\n", "\n", "with measure_time(text=\"Semantic\"):\n", " semantic_tokens = generate_text_semantic(\n", " full_text,\n", " history_prompt=SPEAKER,\n", " temp=GEN_TEMP,\n", " min_eos_p=0.05,\n", " silent=True)\n", "\n", "\n", "\n", "print (f\" full len: {len(semantic_tokens)}\")\n", "\n", "split_semantic_tokens = split_array_equally(semantic_tokens, number_of_semantic_pieces)\n", "\n", "with measure_time(text=\"Coarse Full\"):\n", " for i, coarse_semantic_tokens in enumerate(split_semantic_tokens):\n", " print(f\"length of coarse_semantic_tokens {i + 1}: {len(coarse_semantic_tokens)}\")\n", " audio_array = semantic_to_waveform(coarse_semantic_tokens, history_prompt=SPEAKER, silent=True)\n", " final_pieces += [audio_array]\n", "\n", "\n", "\n", "\n", "\n", "\n", "Audio(np.concatenate(final_pieces), rate=SAMPLE_RATE)" ] }, { "cell_type": "code", "execution_count": 65, "id": "69a754e8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "base semantic output semantic_prompt: (548,)\n", " Tokens: [8735, 8385, 147] ... [232, 232, 10]\n", "\n", "base semantic output coarse_prompt: (2, 823)\n", " Tokens: [62, 62, 62] ... [855, 855, 855]\n", " Tokens: [424, 424, 424] ... [928, 913, 913]\n", "\n", "base semantic output fine_prompt: (8, 823)\n", "Enabling deterministic algorithms\n", "Set seed to 1234\n", "length of coarse_semantic_tokens 1: 68\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "length of coarse_semantic_tokens 2: 69\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "length of coarse_semantic_tokens 3: 68\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "length of coarse_semantic_tokens 4: 69\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "length of coarse_semantic_tokens 5: 68\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "length of coarse_semantic_tokens 6: 69\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "length of coarse_semantic_tokens 7: 68\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "length of coarse_semantic_tokens 8: 69\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", " -->Coarse Full Finished at: 2023-05-09 18:16:39 in 31.530385971069336 seconds\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# let's double check the 3 word structure phrasing is ACTUALLY result of the coarse model, by using the raw semantic tokens that we generated in the first cell.\n", "# Just want to double check\n", "# Edit: it wasn't the coarse model. 
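Decoding the whole-text semantic tokens in chunks sounds natural here, which suggests the 3-word phrasing in the naive test came from chunking the semantic generation itself, not from the coarse/fine stages.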
\n", "\n", "\n", "show_history_prompt_size(og_full_generation, text=\"base semantic output\")\n", "\n", "semantic_tokens = og_full_generation[\"semantic_prompt\"]\n", "\n", "set_seed(testing_seed)\n", "\n", "final_pieces = []\n", "\n", "split_semantic_tokens = split_array_equally(semantic_tokens, 8)\n", "\n", "\n", "with measure_time(text=\"Coarse Full\"):\n", " for i, coarse_semantic_tokens in enumerate(split_semantic_tokens):\n", " print(f\"length of coarse_semantic_tokens {i + 1}: {len(coarse_semantic_tokens)}\")\n", " audio_array = semantic_to_waveform(coarse_semantic_tokens, history_prompt=SPEAKER, silent=True)\n", " final_pieces += [audio_array]\n", "\n", "\n", "Audio(np.concatenate(final_pieces), rate=SAMPLE_RATE)" ] }, { "cell_type": "code", "execution_count": 80, "id": "64b2c8df", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Disabling deterministic algorithms\n", "Set seed to 3004692535\n", "Have I told you that story about how Charlie Parker became Charlie Parker? Parker's a young kid, pretty good on the Sax, gets up to play at a cutting session, and well, he fucks it up.\n", "\n", "original history_prompt\n", " original history_prompt semantic_prompt: (682,)\n", " Tokens: [147, 6242, 302] ... [10, 230, 56] <256 from end> [206, 10, 206] ... [2403, 147, 2009]\n", " original history_prompt coarse_prompt: (2, 1025)\n", " Tokens: [738, 738, 1017] ... [738, 738, 738] <384 from end> [738, 738, 738] ... [717, 121, 121]\n", " Tokens: [363, 363, 646] ... [937, 544, 937] <384 from end> [544, 544, 544] ... [424, 424, 424]\n", " original history_prompt fine_prompt: (8, 1025)\n", "processing semantic_tokens chunk 1 of size: 53\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 3/3 [00:01<00:00, 1.86it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "\n", "full generation returned\n", " full generation returned semantic_prompt: (53,)\n", " Tokens: [2305, 147, 3208] ... [720, 1409, 1409]\n", " full generation returned coarse_prompt: (2, 79)\n", " Tokens: [62, 62, 62] ... [936, 958, 505]\n", " Tokens: [424, 424, 424] ... [632, 654, 140]\n", " full generation returned fine_prompt: (8, 79)\n", "\n", "next history prompt for coarse\n", " next history prompt for coarse semantic_prompt: (735,)\n", " Tokens: [147, 6242, 302] ... [6025, 6025, 6564] <256 from end> [648, 41, 6286] ... [720, 1409, 1409]\n", " next history prompt for coarse coarse_prompt: (2, 1104)\n", " Tokens: [738, 738, 1017] ... [30, 370, 860] <384 from end> [208, 495, 20] ... [936, 958, 505]\n", " Tokens: [363, 363, 646] ... [701, 279, 719] <384 from end> [416, 673, 568] ... [632, 654, 140]\n", " next history prompt for coarse fine_prompt: (8, 1104)\n", "processing semantic_tokens chunk 2 of size: 54\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 3/3 [00:01<00:00, 1.76it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "\n", "full generation returned\n", " full generation returned semantic_prompt: (54,)\n", " Tokens: [3732, 6358, 808] ... [2069, 9848, 1044]\n", " full generation returned coarse_prompt: (2, 81)\n", " Tokens: [921, 928, 264] ... 
[20, 192, 56]\n", " Tokens: [772, 1002, 496] ... [836, 633, 994]\n", " full generation returned fine_prompt: (8, 81)\n", "\n", "next history prompt for coarse\n", " next history prompt for coarse semantic_prompt: (789,)\n", " Tokens: [147, 6242, 302] ... [326, 326, 1376] <256 from end> [211, 211, 211] ... [2069, 9848, 1044]\n", " next history prompt for coarse coarse_prompt: (2, 1185)\n", " Tokens: [738, 738, 1017] ... [583, 583, 491] <384 from end> [136, 321, 136] ... [20, 192, 56]\n", " Tokens: [363, 363, 646] ... [414, 960, 674] <384 from end> [564, 693, 700] ... [836, 633, 994]\n", " next history prompt for coarse fine_prompt: (8, 1185)\n", "processing semantic_tokens chunk 3 of size: 53\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 3/3 [00:01<00:00, 1.83it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "\n", "full generation returned\n", " full generation returned semantic_prompt: (53,)\n", " Tokens: [50, 27, 27] ... [1044, 118, 27]\n", " full generation returned coarse_prompt: (2, 79)\n", " Tokens: [393, 52, 257] ... [393, 91, 738]\n", " Tokens: [404, 700, 700] ... [947, 665, 859]\n", " full generation returned fine_prompt: (8, 79)\n", "\n", "next history prompt for coarse\n", " next history prompt for coarse semantic_prompt: (842,)\n", " Tokens: [147, 6242, 302] ... [230, 206, 56] <256 from end> [193, 193, 56] ... [1044, 118, 27]\n", " next history prompt for coarse coarse_prompt: (2, 1264)\n", " Tokens: [738, 738, 1017] ... [408, 408, 408] <384 from end> [408, 408, 408] ... [393, 91, 738]\n", " Tokens: [363, 363, 646] ... [518, 518, 518] <384 from end> [518, 518, 518] ... [947, 665, 859]\n", " next history prompt for coarse fine_prompt: (8, 1264)\n", "processing semantic_tokens chunk 4 of size: 54\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 3/3 [00:01<00:00, 1.79it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "\n", "full generation returned\n", " full generation returned semantic_prompt: (54,)\n", " Tokens: [27, 27, 1232] ... [206, 2009, 206]\n", " full generation returned coarse_prompt: (2, 81)\n", " Tokens: [738, 1017, 106] ... [408, 408, 121]\n", " Tokens: [859, 928, 969] ... [518, 518, 424]\n", " full generation returned fine_prompt: (8, 81)\n", "\n", "next history prompt for coarse\n", " next history prompt for coarse semantic_prompt: (896,)\n", " Tokens: [147, 6242, 302] ... [17, 2113, 3745] <256 from end> [5218, 117, 107] ... [206, 2009, 206]\n", " next history prompt for coarse coarse_prompt: (2, 1345)\n", " Tokens: [738, 738, 1017] ... [613, 185, 291] <384 from end> [565, 879, 228] ... [408, 408, 121]\n", " Tokens: [363, 363, 646] ... [453, 198, 298] <384 from end> [809, 516, 687] ... 
[518, 518, 424]\n", " next history prompt for coarse fine_prompt: (8, 1345)\n", "processing semantic_tokens chunk 5 of size: 53\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 3/3 [00:01<00:00, 1.86it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "\n", "full generation returned\n", " full generation returned semantic_prompt: (53,)\n", " Tokens: [2009, 206, 528] ... [479, 210, 50]\n", " full generation returned coarse_prompt: (2, 79)\n", " Tokens: [408, 408, 408] ... [751, 530, 1010]\n", " Tokens: [913, 913, 518] ... [924, 924, 924]\n", " full generation returned fine_prompt: (8, 79)\n", "\n", "next history prompt for coarse\n", " next history prompt for coarse semantic_prompt: (949,)\n", " Tokens: [147, 6242, 302] ... [2497, 8029, 9663] <256 from end> [1908, 50, 5369] ... [479, 210, 50]\n", " next history prompt for coarse coarse_prompt: (2, 1424)\n", " Tokens: [738, 738, 1017] ... [983, 216, 747] <384 from end> [958, 921, 604] ... [751, 530, 1010]\n", " Tokens: [363, 363, 646] ... [229, 654, 996] <384 from end> [307, 307, 888] ... [924, 924, 924]\n", " next history prompt for coarse fine_prompt: (8, 1424)\n", "processing semantic_tokens chunk 6 of size: 54\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 3/3 [00:01<00:00, 1.79it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "\n", "full generation returned\n", " full generation returned semantic_prompt: (54,)\n", " Tokens: [10, 27, 27] ... [4040, 4667, 50]\n", " full generation returned coarse_prompt: (2, 81)\n", " Tokens: [699, 699, 430] ... [475, 738, 62]\n", " Tokens: [373, 765, 601] ... [519, 544, 913]\n", " full generation returned fine_prompt: (8, 81)\n", "\n", "next history prompt for coarse\n", " next history prompt for coarse semantic_prompt: (1003,)\n", " Tokens: [147, 6242, 302] ... [5740, 5740, 520] <256 from end> [4638, 298, 4571] ... [4040, 4667, 50]\n", " next history prompt for coarse coarse_prompt: (2, 1505)\n", " Tokens: [738, 738, 1017] ... [402, 162, 20] <384 from end> [216, 112, 683] ... [475, 738, 62]\n", " Tokens: [363, 363, 646] ... [757, 45, 668] <384 from end> [836, 872, 754] ... [519, 544, 913]\n", " next history prompt for coarse fine_prompt: (8, 1505)\n", "processing semantic_tokens chunk 7 of size: 54\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 3/3 [00:01<00:00, 1.80it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "\n", "full generation returned\n", " full generation returned semantic_prompt: (54,)\n", " Tokens: [10, 27, 9736] ... [206, 193, 193]\n", " full generation returned coarse_prompt: (2, 81)\n", " Tokens: [324, 584, 796] ... [408, 62, 62]\n", " Tokens: [14, 536, 782] ... [913, 424, 424]\n", " full generation returned fine_prompt: (8, 81)\n", "\n", "next history prompt for coarse\n", " next history prompt for coarse semantic_prompt: (1057,)\n", " Tokens: [147, 6242, 302] ... [5238, 3009, 1787] <256 from end> [50, 10, 27] ... 
[206, 193, 193]\n", " next history prompt for coarse coarse_prompt: (2, 1586)\n", " Tokens: [738, 738, 1017] ... [724, 833, 23] <384 from end> [530, 976, 724] ... [408, 62, 62]\n", " Tokens: [363, 363, 646] ... [942, 516, 42] <384 from end> [446, 570, 888] ... [913, 424, 424]\n", " next history prompt for coarse fine_prompt: (8, 1586)\n", "processing semantic_tokens chunk 8 of size: 53\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 3/3 [00:01<00:00, 1.87it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "\n", "full generation returned\n", " full generation returned semantic_prompt: (53,)\n", " Tokens: [193, 147, 193] ... [210, 50, 10]\n", " full generation returned coarse_prompt: (2, 79)\n", " Tokens: [408, 62, 62] ... [855, 855, 855]\n", " Tokens: [913, 424, 424] ... [913, 913, 913]\n", " full generation returned fine_prompt: (8, 79)\n", "\n", "next history prompt for coarse\n", " next history prompt for coarse semantic_prompt: (1110,)\n", " Tokens: [147, 6242, 302] ... [138, 131, 10] <256 from end> [230, 206, 206] ... [210, 50, 10]\n", " next history prompt for coarse coarse_prompt: (2, 1665)\n", " Tokens: [738, 738, 1017] ... [604, 408, 408] <384 from end> [106, 106, 106] ... [855, 855, 855]\n", " Tokens: [363, 363, 646] ... [928, 765, 928] <384 from end> [913, 913, 913] ... [913, 913, 913]\n", " next history prompt for coarse fine_prompt: (8, 1665)\n", "processing semantic_tokens chunk 9 of size: 54\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 3/3 [00:01<00:00, 1.82it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "\n", "full generation returned\n", " full generation returned semantic_prompt: (54,)\n", " Tokens: [10, 27, 4035] ... [41, 255, 255]\n", " full generation returned coarse_prompt: (2, 81)\n", " Tokens: [472, 472, 404] ... [604, 724, 62]\n", " Tokens: [928, 729, 729] ... [516, 114, 841]\n", " full generation returned fine_prompt: (8, 81)\n", "\n", "next history prompt for coarse\n", " next history prompt for coarse semantic_prompt: (1164,)\n", " Tokens: [147, 6242, 302] ... [2305, 147, 5008] <256 from end> [41, 399, 8831] ... [41, 255, 255]\n", " next history prompt for coarse coarse_prompt: (2, 1746)\n", " Tokens: [738, 738, 1017] ... [408, 408, 408] <384 from end> [408, 62, 408] ... [604, 724, 62]\n", " Tokens: [363, 363, 646] ... [518, 518, 518] <384 from end> [544, 424, 518] ... [516, 114, 841]\n", " next history prompt for coarse fine_prompt: (8, 1746)\n", "processing semantic_tokens chunk 10 of size: 53\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 3/3 [00:01<00:00, 1.84it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "\n", "full generation returned\n", " full generation returned semantic_prompt: (53,)\n", " Tokens: [255, 321, 41] ... [206, 206, 7567]\n", " full generation returned coarse_prompt: (2, 79)\n", " Tokens: [724, 871, 939] ... [408, 738, 62]\n", " Tokens: [687, 1007, 834] ... 
[544, 544, 424]\n", " full generation returned fine_prompt: (8, 79)\n", "\n", "next history prompt for coarse\n", " next history prompt for coarse semantic_prompt: (1217,)\n", " Tokens: [147, 6242, 302] ... [3066, 5416, 5416] <256 from end> [3995, 3995, 3995] ... [206, 206, 7567]\n", " next history prompt for coarse coarse_prompt: (2, 1825)\n", " Tokens: [738, 738, 1017] ... [890, 612, 1021] <384 from end> [645, 1021, 495] ... [408, 738, 62]\n", " Tokens: [363, 363, 646] ... [174, 458, 570] <384 from end> [446, 446, 772] ... [544, 544, 424]\n", " next history prompt for coarse fine_prompt: (8, 1825)\n", "processing semantic_tokens chunk 11 of size: 54\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 3/3 [00:01<00:00, 1.76it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "\n", "full generation returned\n", " full generation returned semantic_prompt: (54,)\n", " Tokens: [65, 206, 206] ... [64, 17, 17]\n", " full generation returned coarse_prompt: (2, 81)\n", " Tokens: [408, 408, 408] ... [74, 378, 59]\n", " Tokens: [518, 518, 544] ... [685, 685, 132]\n", " full generation returned fine_prompt: (8, 81)\n", "\n", "next history prompt for coarse\n", " next history prompt for coarse semantic_prompt: (1271,)\n", " Tokens: [147, 6242, 302] ... [684, 684, 2775] <256 from end> [171, 130, 6326] ... [64, 17, 17]\n", " next history prompt for coarse coarse_prompt: (2, 1906)\n", " Tokens: [738, 738, 1017] ... [942, 402, 428] <384 from end> [428, 402, 833] ... [74, 378, 59]\n", " Tokens: [363, 363, 646] ... [984, 884, 132] <384 from end> [800, 513, 870] ... [685, 685, 132]\n", " next history prompt for coarse fine_prompt: (8, 1906)\n", "processing semantic_tokens chunk 12 of size: 54\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 3/3 [00:01<00:00, 1.82it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "\n", "full generation returned\n", " full generation returned semantic_prompt: (54,)\n", " Tokens: [9241, 7558, 7558] ... [147, 1613, 2009]\n", " full generation returned coarse_prompt: (2, 81)\n", " Tokens: [501, 162, 501] ... [408, 408, 62]\n", " Tokens: [285, 865, 985] ... [518, 518, 424]\n", " full generation returned fine_prompt: (8, 81)\n", "\n", "next history prompt for coarse\n", " next history prompt for coarse semantic_prompt: (1325,)\n", " Tokens: [147, 6242, 302] ... [2305, 2305, 2305] <256 from end> [5008, 147, 27] ... [147, 1613, 2009]\n", " next history prompt for coarse coarse_prompt: (2, 1987)\n", " Tokens: [738, 738, 1017] ... [62, 408, 62] <384 from end> [62, 25, 465] ... [408, 408, 62]\n", " Tokens: [363, 363, 646] ... [424, 913, 424] <384 from end> [424, 646, 775] ... [518, 518, 424]\n", " next history prompt for coarse fine_prompt: (8, 1987)\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Test: Generate all semantic in one go. 
Then iteratively chop up the semantic tokens into tiny pieces and feed them to the coarse and fine models.\n", "# But this time, properly update the history prompt with the coarse and fine tokens from previous segments as well.\n", "\n", "# This is nearly perfect, even down to pretty small semantic chunks (almost as small as 50 coarse tokens), and 80+ is usually mostly seamless with a few minor artifacts.\n", "# I wonder if the time alignment hack is breaking the smaller chunks, because 1 second chunks work so well but smaller ones don't.\n", "\n", "# Update: constantly updating pushes the original speaker out of the window too fast, and the short segments drift about as much as a full 14 second segment, so it loses the speaker voice too quickly. Next cell: fix that.\n", "\n", "SPEAKER = testing_SPEAKER\n", "cell_text_prompt = charlie_text\n", "set_seed(testing_seed)\n", "\n", "\n", "GEN_TEMP = 0.6\n", "\n", "\n", "final_pieces = []\n", "\n", "full_text = cell_text_prompt\n", "\n", "print(full_text)\n", "\n", "\n", "next_history_prompt_for_coarse = load_npz(SPEAKER)\n", "\n", "show_history_prompt_size(next_history_prompt_for_coarse, text=\"original history_prompt\")\n", "\n", "semantic_tokens_to_process = generate_text_semantic(\n", "    full_text,\n", "    history_prompt=SPEAKER,\n", "    temp=GEN_TEMP,\n", "    min_eos_p=0.05,\n", "    silent=True)\n", "\n", "\n", "split_semantic_tokens = split_array_equally(semantic_tokens_to_process, 12)\n", "\n", "for i, coarse_semantic_tokens in enumerate(split_semantic_tokens):\n", "    print(f\"processing semantic_tokens chunk {i + 1} of size: {len(coarse_semantic_tokens)}\")\n", "\n", "    # the first time through, the history prompt is the same as the regular speaker file\n", "    # NOTE: this semantic_to_waveform call is redundant; full_generation and audio_array\n", "    # are overwritten by the explicit generate_coarse / generate_fine / codec_decode calls below\n", "    full_generation, audio_array = semantic_to_waveform(coarse_semantic_tokens, history_prompt=next_history_prompt_for_coarse, output_full=True)\n", "\n", "    coarse_tokens = generate_coarse(\n", "        coarse_semantic_tokens,\n", "        history_prompt=next_history_prompt_for_coarse,\n", "        temp=0.7,\n", "        silent=True,\n", "        use_kv_caching=True,\n", "        x_coarse_history_alignment_hack=-2\n", "    )\n", "    fine_tokens = generate_fine(\n", "        coarse_tokens,\n", "        history_prompt=next_history_prompt_for_coarse,\n", "        temp=0.5,\n", "    )\n", "    audio_array = codec_decode(fine_tokens)\n", "\n", "    full_generation = {\n", "        \"semantic_prompt\": coarse_semantic_tokens,\n", "        \"coarse_prompt\": coarse_tokens,\n", "        \"fine_prompt\": fine_tokens,\n", "    }\n", "\n", "    show_history_prompt_size(full_generation, text=\"full generation returned\")\n", "\n", "    # stack the history with the last generated segment\n", "    next_semantic_tokens = np.hstack([next_history_prompt_for_coarse[\"semantic_prompt\"], full_generation[\"semantic_prompt\"]]).astype(np.int32)  # should this be int64?\n", "\n", "    next_coarse_tokens = np.hstack([next_history_prompt_for_coarse[\"coarse_prompt\"], full_generation[\"coarse_prompt\"]]).astype(np.int32)\n", "\n", "    next_fine_tokens = np.hstack([next_history_prompt_for_coarse[\"fine_prompt\"], full_generation[\"fine_prompt\"]]).astype(np.int32)\n", "\n", "    next_history_prompt_for_coarse = {\n", "        \"semantic_prompt\": next_semantic_tokens,\n", "        \"coarse_prompt\": next_coarse_tokens,\n", "        \"fine_prompt\": next_fine_tokens,\n", "    }\n", "\n", "    show_history_prompt_size(next_history_prompt_for_coarse, text=\"next history prompt for coarse\")\n", "\n", "    final_pieces.append(audio_array)\n", "\n", "\n", "Audio(np.concatenate(final_pieces), rate=SAMPLE_RATE)" ] }, { "cell_type": "code", "execution_count": 90, "id": "b54d7084", "metadata": {}, "outputs": 
[ { "name": "stdout", "output_type": "stream", "text": [ "Disabling deterministic algorithms\n", "Set seed to 2973489230\n", "Have I told you that story about how Charlie Parker became Charlie Parker? Parker's a young kid, pretty good on the Sax, gets up to play at a cutting session, and well, he fucks it up.\n", "\n", "original history_prompt\n", " original history_prompt semantic_prompt: (682,)\n", " Tokens: [147, 6242, 302] ... [10, 230, 56] <256 from end> [206, 10, 206] ... [2403, 147, 2009]\n", " original history_prompt coarse_prompt: (2, 1025)\n", " Tokens: [738, 738, 1017] ... [738, 738, 738] <384 from end> [738, 738, 738] ... [717, 121, 121]\n", " Tokens: [363, 363, 646] ... [937, 544, 937] <384 from end> [544, 544, 544] ... [424, 424, 424]\n", " original history_prompt fine_prompt: (8, 1025)\n", "processing semantic_tokens chunk 1 of size: 58\n", "\n", "next history prompt for coarse\n", " next history prompt for coarse semantic_prompt: (682,)\n", " Tokens: [147, 6242, 302] ... [8851, 27, 1041] <64 from end> [59, 28, 107] ... [2403, 147, 2009]\n", " next history prompt for coarse coarse_prompt: (2, 1025)\n", " Tokens: [738, 738, 1017] ... [475, 488, 148] <96 from end> [103, 103, 148] ... [717, 121, 121]\n", " Tokens: [363, 363, 646] ... [544, 43, 364] <96 from end> [420, 420, 241] ... [424, 424, 424]\n", " next history prompt for coarse fine_prompt: (8, 1025)\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "\n", "full generation returned\n", " full generation returned semantic_prompt: (58,)\n", " Tokens: [8735, 147, 3208] ... [2657, 2013, 9145]\n", " full generation returned coarse_prompt: (2, 87)\n", " Tokens: [62, 62, 62] ... [155, 155, 370]\n", " Tokens: [424, 424, 424] ... [301, 537, 521]\n", " full generation returned fine_prompt: (8, 87)\n", "\n", "next history prompt for coarse + full generation, end of loop\n", " next history prompt for coarse + full generation, end of loop semantic_prompt: (341,)\n", " Tokens: [2893, 9141, 41] ... [6286, 5564, 6627] <256 from end> [6299, 215, 215] ... [2657, 2013, 9145]\n", " next history prompt for coarse + full generation, end of loop coarse_prompt: (2, 512)\n", " Tokens: [344, 311, 583] ... [559, 237, 237] <384 from end> [148, 237, 321] ... [155, 155, 370]\n", " Tokens: [601, 754, 833] ... [589, 121, 376] <384 from end> [266, 533, 794] ... [301, 537, 521]\n", " next history prompt for coarse + full generation, end of loop fine_prompt: (8, 512)\n", "processing semantic_tokens chunk 2 of size: 58\n", "\n", "next history prompt for coarse\n", " next history prompt for coarse semantic_prompt: (341,)\n", " Tokens: [43, 457, 1232] ... [2403, 147, 2009] <64 from end> [206, 147, 2009] ... [2657, 2013, 9145]\n", " next history prompt for coarse coarse_prompt: (2, 512)\n", " Tokens: [20, 291, 20] ... [717, 121, 121] <96 from end> [62, 62, 62] ... [155, 155, 370]\n", " Tokens: [947, 766, 128] ... [424, 424, 424] <96 from end> [424, 424, 424] ... [301, 537, 521]\n", " next history prompt for coarse fine_prompt: (8, 512)\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "\n", "full generation returned\n", " full generation returned semantic_prompt: (58,)\n", " Tokens: [5392, 7677, 7677] ... [36, 9353, 9353]\n", " full generation returned coarse_prompt: (2, 87)\n", " Tokens: [523, 523, 291] ... [793, 830, 950]\n", " Tokens: [568, 282, 228] ... 
[289, 304, 75]\n", " full generation returned fine_prompt: (8, 87)\n", "\n", "next history prompt for coarse + full generation, end of loop\n", " next history prompt for coarse + full generation, end of loop semantic_prompt: (341,)\n", " Tokens: [2403, 3302, 528] ... [44, 8891, 5230] <256 from end> [8526, 1725, 1725] ... [36, 9353, 9353]\n", " next history prompt for coarse + full generation, end of loop coarse_prompt: (2, 512)\n", " Tokens: [62, 62, 121] ... [833, 782, 264] <384 from end> [983, 99, 392] ... [793, 830, 950]\n", " Tokens: [424, 424, 424] ... [824, 767, 502] <384 from end> [996, 197, 304] ... [289, 304, 75]\n", " next history prompt for coarse + full generation, end of loop fine_prompt: (8, 512)\n", "processing semantic_tokens chunk 3 of size: 58\n", "\n", "next history prompt for coarse\n", " next history prompt for coarse semantic_prompt: (341,)\n", " Tokens: [43, 457, 1232] ... [2403, 147, 2009] <64 from end> [1933, 2657, 2657] ... [36, 9353, 9353]\n", " next history prompt for coarse coarse_prompt: (2, 512)\n", " Tokens: [20, 291, 20] ... [717, 121, 121] <96 from end> [1000, 983, 30] ... [793, 830, 950]\n", " Tokens: [947, 766, 128] ... [424, 424, 424] <96 from end> [868, 444, 568] ... [289, 304, 75]\n", " next history prompt for coarse fine_prompt: (8, 512)\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "\n", "full generation returned\n", " full generation returned semantic_prompt: (58,)\n", " Tokens: [5685, 4638, 210] ... [1787, 50, 10]\n", " full generation returned coarse_prompt: (2, 87)\n", " Tokens: [950, 683, 921] ... [604, 1019, 865]\n", " Tokens: [244, 739, 304] ... [754, 430, 200]\n", " full generation returned fine_prompt: (8, 87)\n", "\n", "next history prompt for coarse + full generation, end of loop\n", " next history prompt for coarse + full generation, end of loop semantic_prompt: (341,)\n", " Tokens: [2403, 3302, 528] ... [44, 8891, 5230] <256 from end> [8526, 1725, 1725] ... [1787, 50, 10]\n", " next history prompt for coarse + full generation, end of loop coarse_prompt: (2, 512)\n", " Tokens: [62, 62, 121] ... [833, 782, 264] <384 from end> [983, 99, 392] ... [604, 1019, 865]\n", " Tokens: [424, 424, 424] ... [824, 767, 502] <384 from end> [996, 197, 304] ... [754, 430, 200]\n", " next history prompt for coarse + full generation, end of loop fine_prompt: (8, 512)\n", "processing semantic_tokens chunk 4 of size: 58\n", "\n", "next history prompt for coarse\n", " next history prompt for coarse semantic_prompt: (341,)\n", " Tokens: [43, 457, 1232] ... [2403, 147, 2009] <64 from end> [298, 1556, 1556] ... [1787, 50, 10]\n", " next history prompt for coarse coarse_prompt: (2, 512)\n", " Tokens: [20, 291, 20] ... [717, 121, 121] <96 from end> [430, 887, 275] ... [604, 1019, 865]\n", " Tokens: [947, 766, 128] ... [424, 424, 424] <96 from end> [765, 571, 1007] ... [754, 430, 200]\n", " next history prompt for coarse fine_prompt: (8, 512)\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "\n", "full generation returned\n", " full generation returned semantic_prompt: (58,)\n", " Tokens: [5, 282, 8549] ... [230, 10, 10]\n", " full generation returned coarse_prompt: (2, 87)\n", " Tokens: [62, 25, 103] ... [738, 738, 62]\n", " Tokens: [424, 200, 648] ... 
[913, 544, 424]\n", " full generation returned fine_prompt: (8, 87)\n", "\n", "next history prompt for coarse + full generation, end of loop\n", " next history prompt for coarse + full generation, end of loop semantic_prompt: (341,)\n", " Tokens: [2403, 3302, 528] ... [44, 8891, 5230] <256 from end> [8526, 1725, 1725] ... [230, 10, 10]\n", " next history prompt for coarse + full generation, end of loop coarse_prompt: (2, 512)\n", " Tokens: [62, 62, 121] ... [833, 782, 264] <384 from end> [983, 99, 392] ... [738, 738, 62]\n", " Tokens: [424, 424, 424] ... [824, 767, 502] <384 from end> [996, 197, 304] ... [913, 544, 424]\n", " next history prompt for coarse + full generation, end of loop fine_prompt: (8, 512)\n", "processing semantic_tokens chunk 5 of size: 58\n", "\n", "next history prompt for coarse\n", " next history prompt for coarse semantic_prompt: (341,)\n", " Tokens: [43, 457, 1232] ... [2403, 147, 2009] <64 from end> [41, 5238, 1363] ... [230, 10, 10]\n", " next history prompt for coarse coarse_prompt: (2, 512)\n", " Tokens: [20, 291, 20] ... [717, 121, 121] <96 from end> [432, 604, 879] ... [738, 738, 62]\n", " Tokens: [947, 766, 128] ... [424, 424, 424] <96 from end> [601, 646, 888] ... [913, 544, 424]\n", " next history prompt for coarse fine_prompt: (8, 512)\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "\n", "full generation returned\n", " full generation returned semantic_prompt: (58,)\n", " Tokens: [1710, 1710, 10] ... [1532, 2235, 6572]\n", " full generation returned coarse_prompt: (2, 87)\n", " Tokens: [106, 106, 408] ... [855, 52, 52]\n", " Tokens: [913, 913, 424] ... [913, 928, 386]\n", " full generation returned fine_prompt: (8, 87)\n", "\n", "next history prompt for coarse + full generation, end of loop\n", " next history prompt for coarse + full generation, end of loop semantic_prompt: (341,)\n", " Tokens: [2403, 3302, 528] ... [44, 8891, 5230] <256 from end> [8526, 1725, 1725] ... [1532, 2235, 6572]\n", " next history prompt for coarse + full generation, end of loop coarse_prompt: (2, 512)\n", " Tokens: [62, 62, 121] ... [833, 782, 264] <384 from end> [983, 99, 392] ... [855, 52, 52]\n", " Tokens: [424, 424, 424] ... [824, 767, 502] <384 from end> [996, 197, 304] ... [913, 928, 386]\n", " next history prompt for coarse + full generation, end of loop fine_prompt: (8, 512)\n", "processing semantic_tokens chunk 6 of size: 58\n", "\n", "next history prompt for coarse\n", " next history prompt for coarse semantic_prompt: (341,)\n", " Tokens: [43, 457, 1232] ... [2403, 147, 2009] <64 from end> [175, 482, 10] ... [1532, 2235, 6572]\n", " next history prompt for coarse coarse_prompt: (2, 512)\n", " Tokens: [20, 291, 20] ... [717, 121, 121] <96 from end> [904, 855, 408] ... [855, 52, 52]\n", " Tokens: [947, 766, 128] ... [424, 424, 424] <96 from end> [166, 488, 928] ... [913, 928, 386]\n", " next history prompt for coarse fine_prompt: (8, 512)\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "\n", "full generation returned\n", " full generation returned semantic_prompt: (58,)\n", " Tokens: [6572, 9869, 9848] ... [206, 206, 206]\n", " full generation returned coarse_prompt: (2, 87)\n", " Tokens: [82, 870, 739] ... [408, 62, 408]\n", " Tokens: [843, 734, 734] ... 
[913, 424, 913]\n", " full generation returned fine_prompt: (8, 87)\n", "\n", "next history prompt for coarse + full generation, end of loop\n", " next history prompt for coarse + full generation, end of loop semantic_prompt: (341,)\n", " Tokens: [2403, 3302, 528] ... [44, 8891, 5230] <256 from end> [8526, 1725, 1725] ... [206, 206, 206]\n", " next history prompt for coarse + full generation, end of loop coarse_prompt: (2, 512)\n", " Tokens: [62, 62, 121] ... [833, 782, 264] <384 from end> [983, 99, 392] ... [408, 62, 408]\n", " Tokens: [424, 424, 424] ... [824, 767, 502] <384 from end> [996, 197, 304] ... [913, 424, 913]\n", " next history prompt for coarse + full generation, end of loop fine_prompt: (8, 512)\n", "processing semantic_tokens chunk 7 of size: 58\n", "\n", "next history prompt for coarse\n", " next history prompt for coarse semantic_prompt: (341,)\n", " Tokens: [43, 457, 1232] ... [2403, 147, 2009] <64 from end> [881, 147, 5008] ... [206, 206, 206]\n", " next history prompt for coarse coarse_prompt: (2, 512)\n", " Tokens: [20, 291, 20] ... [717, 121, 121] <96 from end> [408, 408, 62] ... [408, 62, 408]\n", " Tokens: [947, 766, 128] ... [424, 424, 424] <96 from end> [913, 913, 424] ... [913, 424, 913]\n", " next history prompt for coarse fine_prompt: (8, 512)\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "\n", "full generation returned\n", " full generation returned semantic_prompt: (58,)\n", " Tokens: [5092, 147, 193] ... [6326, 3948, 6699]\n", " full generation returned coarse_prompt: (2, 87)\n", " Tokens: [62, 62, 408] ... [583, 971, 922]\n", " Tokens: [424, 424, 913] ... [577, 763, 648]\n", " full generation returned fine_prompt: (8, 87)\n", "\n", "next history prompt for coarse + full generation, end of loop\n", " next history prompt for coarse + full generation, end of loop semantic_prompt: (341,)\n", " Tokens: [2403, 3302, 528] ... [44, 8891, 5230] <256 from end> [8526, 1725, 1725] ... [6326, 3948, 6699]\n", " next history prompt for coarse + full generation, end of loop coarse_prompt: (2, 512)\n", " Tokens: [62, 62, 121] ... [833, 782, 264] <384 from end> [983, 99, 392] ... [583, 971, 922]\n", " Tokens: [424, 424, 424] ... [824, 767, 502] <384 from end> [996, 197, 304] ... [577, 763, 648]\n", " next history prompt for coarse + full generation, end of loop fine_prompt: (8, 512)\n", "processing semantic_tokens chunk 8 of size: 58\n", "\n", "next history prompt for coarse\n", " next history prompt for coarse semantic_prompt: (341,)\n", " Tokens: [43, 457, 1232] ... [2403, 147, 2009] <64 from end> [230, 56, 230] ... [6326, 3948, 6699]\n", " next history prompt for coarse coarse_prompt: (2, 512)\n", " Tokens: [20, 291, 20] ... [717, 121, 121] <96 from end> [408, 738, 408] ... [583, 971, 922]\n", " Tokens: [947, 766, 128] ... [424, 424, 424] <96 from end> [424, 544, 518] ... [577, 763, 648]\n", " next history prompt for coarse fine_prompt: (8, 512)\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "\n", "full generation returned\n", " full generation returned semantic_prompt: (58,)\n", " Tokens: [59, 28, 28] ... [1380, 2009, 8385]\n", " full generation returned coarse_prompt: (2, 87)\n", " Tokens: [699, 148, 148] ... [62, 62, 62]\n", " Tokens: [923, 993, 646] ... 
[424, 424, 424]\n", " full generation returned fine_prompt: (8, 87)\n", "\n", "next history prompt for coarse + full generation, end of loop\n", " next history prompt for coarse + full generation, end of loop semantic_prompt: (341,)\n", " Tokens: [2403, 3302, 528] ... [44, 8891, 5230] <256 from end> [8526, 1725, 1725] ... [1380, 2009, 8385]\n", " next history prompt for coarse + full generation, end of loop coarse_prompt: (2, 512)\n", " Tokens: [62, 62, 121] ... [833, 782, 264] <384 from end> [983, 99, 392] ... [62, 62, 62]\n", " Tokens: [424, 424, 424] ... [824, 767, 502] <384 from end> [996, 197, 304] ... [424, 424, 424]\n", " next history prompt for coarse + full generation, end of loop fine_prompt: (8, 512)\n", "processing semantic_tokens chunk 9 of size: 58\n", "\n", "next history prompt for coarse\n", " next history prompt for coarse semantic_prompt: (341,)\n", " Tokens: [43, 457, 1232] ... [2403, 147, 2009] <64 from end> [2775, 321, 321] ... [1380, 2009, 8385]\n", " next history prompt for coarse coarse_prompt: (2, 512)\n", " Tokens: [20, 291, 20] ... [717, 121, 121] <96 from end> [20, 30, 1001] ... [62, 62, 62]\n", " Tokens: [947, 766, 128] ... [424, 424, 424] <96 from end> [996, 395, 729] ... [424, 424, 424]\n", " next history prompt for coarse fine_prompt: (8, 512)\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "\n", "full generation returned\n", " full generation returned semantic_prompt: (58,)\n", " Tokens: [8610, 8610, 5008] ... [12, 9647, 1032]\n", " full generation returned coarse_prompt: (2, 87)\n", " Tokens: [121, 62, 62] ... [395, 148, 148]\n", " Tokens: [424, 424, 424] ... [575, 282, 993]\n", " full generation returned fine_prompt: (8, 87)\n", "\n", "next history prompt for coarse + full generation, end of loop\n", " next history prompt for coarse + full generation, end of loop semantic_prompt: (341,)\n", " Tokens: [2403, 3302, 528] ... [44, 8891, 5230] <256 from end> [8526, 1725, 1725] ... [12, 9647, 1032]\n", " next history prompt for coarse + full generation, end of loop coarse_prompt: (2, 512)\n", " Tokens: [62, 62, 121] ... [833, 782, 264] <384 from end> [983, 99, 392] ... [395, 148, 148]\n", " Tokens: [424, 424, 424] ... [824, 767, 502] <384 from end> [996, 197, 304] ... [575, 282, 993]\n", " next history prompt for coarse + full generation, end of loop fine_prompt: (8, 512)\n", "processing semantic_tokens chunk 10 of size: 58\n", "\n", "next history prompt for coarse\n", " next history prompt for coarse semantic_prompt: (341,)\n", " Tokens: [43, 457, 1232] ... [2403, 147, 2009] <64 from end> [147, 2403, 147] ... [12, 9647, 1032]\n", " next history prompt for coarse coarse_prompt: (2, 512)\n", " Tokens: [20, 291, 20] ... [717, 121, 121] <96 from end> [62, 717, 62] ... [395, 148, 148]\n", " Tokens: [947, 766, 128] ... [424, 424, 424] <96 from end> [424, 424, 424] ... [575, 282, 993]\n", " next history prompt for coarse fine_prompt: (8, 512)\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "\n", "full generation returned\n", " full generation returned semantic_prompt: (58,)\n", " Tokens: [7517, 3795, 7151] ... [5309, 6733, 1537]\n", " full generation returned coarse_prompt: (2, 87)\n", " Tokens: [798, 433, 879] ... [414, 613, 56]\n", " Tokens: [652, 1023, 974] ... 
[947, 272, 646]\n", " full generation returned fine_prompt: (8, 87)\n", "\n", "next history prompt for coarse + full generation, end of loop\n", " next history prompt for coarse + full generation, end of loop semantic_prompt: (341,)\n", " Tokens: [2403, 3302, 528] ... [44, 8891, 5230] <256 from end> [8526, 1725, 1725] ... [5309, 6733, 1537]\n", " next history prompt for coarse + full generation, end of loop coarse_prompt: (2, 512)\n", " Tokens: [62, 62, 121] ... [833, 782, 264] <384 from end> [983, 99, 392] ... [414, 613, 56]\n", " Tokens: [424, 424, 424] ... [824, 767, 502] <384 from end> [996, 197, 304] ... [947, 272, 646]\n", " next history prompt for coarse + full generation, end of loop fine_prompt: (8, 512)\n", "processing semantic_tokens chunk 11 of size: 58\n", "\n", "next history prompt for coarse\n", " next history prompt for coarse semantic_prompt: (341,)\n", " Tokens: [43, 457, 1232] ... [2403, 147, 2009] <64 from end> [1599, 557, 298] ... [5309, 6733, 1537]\n", " next history prompt for coarse coarse_prompt: (2, 512)\n", " Tokens: [20, 291, 20] ... [717, 121, 121] <96 from end> [688, 162, 983] ... [414, 613, 56]\n", " Tokens: [947, 766, 128] ... [424, 424, 424] <96 from end> [84, 995, 731] ... [947, 272, 646]\n", " next history prompt for coarse fine_prompt: (8, 512)\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "\n", "full generation returned\n", " full generation returned semantic_prompt: (58,)\n", " Tokens: [1537, 175, 175] ... [41, 41, 1065]\n", " full generation returned coarse_prompt: (2, 87)\n", " Tokens: [690, 291, 293] ... [472, 887, 779]\n", " Tokens: [634, 787, 6] ... [841, 371, 928]\n", " full generation returned fine_prompt: (8, 87)\n", "\n", "next history prompt for coarse + full generation, end of loop\n", " next history prompt for coarse + full generation, end of loop semantic_prompt: (341,)\n", " Tokens: [2403, 3302, 528] ... [44, 8891, 5230] <256 from end> [8526, 1725, 1725] ... [41, 41, 1065]\n", " next history prompt for coarse + full generation, end of loop coarse_prompt: (2, 512)\n", " Tokens: [62, 62, 121] ... [833, 782, 264] <384 from end> [983, 99, 392] ... [472, 887, 779]\n", " Tokens: [424, 424, 424] ... [824, 767, 502] <384 from end> [996, 197, 304] ... [841, 371, 928]\n", " next history prompt for coarse + full generation, end of loop fine_prompt: (8, 512)\n", "processing semantic_tokens chunk 12 of size: 59\n", "\n", "next history prompt for coarse\n", " next history prompt for coarse semantic_prompt: (341,)\n", " Tokens: [43, 457, 1232] ... [2403, 147, 2009] <64 from end> [2479, 5309, 5309] ... [41, 41, 1065]\n", " next history prompt for coarse coarse_prompt: (2, 512)\n", " Tokens: [20, 291, 20] ... [717, 121, 121] <96 from end> [694, 575, 414] ... [472, 887, 779]\n", " Tokens: [947, 766, 128] ... [424, 424, 424] <96 from end> [45, 722, 505] ... [841, 371, 928]\n", " next history prompt for coarse fine_prompt: (8, 512)\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "\n", "full generation returned\n", " full generation returned semantic_prompt: (59,)\n", " Tokens: [3047, 7757, 2722] ... [5188, 298, 17]\n", " full generation returned coarse_prompt: (2, 88)\n", " Tokens: [491, 255, 651] ... [858, 834, 604]\n", " Tokens: [815, 767, 496] ... 
[722, 252, 303]\n", "  full generation returned fine_prompt: (8, 88)\n", "\n", "next history prompt for coarse + full generation, end of loop\n", "  next history prompt for coarse + full generation, end of loop semantic_prompt: (341,)\n", "    Tokens: [3302, 528, 147] ... [8891, 5230, 8526] <256 from end> [1725, 1725, 6747] ... [5188, 298, 17]\n", "  next history prompt for coarse + full generation, end of loop coarse_prompt: (2, 512)\n", "    Tokens: [62, 121, 62] ... [782, 264, 983] <384 from end> [99, 392, 683] ... [858, 834, 604]\n", "    Tokens: [424, 424, 424] ... [767, 502, 996] <384 from end> [197, 304, 197] ... [722, 252, 303]\n", "  next history prompt for coarse + full generation, end of loop fine_prompt: (8, 512)\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 90, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Test: Generate all semantic in one go. Then iteratively chop up the semantic tokens into tiny pieces and feed them to the coarse and fine models.\n",
"# But this time, also properly update the history prompt with the coarse and fine tokens from previous segments\n",
"\n",
"# And further, let's allocate a chunk of permanent base history storage\n",
"\n",
"# Results: Basically perfect; still some minor artifacts between sections, but the voice doesn't change\n",
"\n",
"# Could look into splitting either semantic or coarse on the non-speaking sections, which seem easy to recognize. \n",
"\n",
"SPEAKER = testing_SPEAKER\n",
"cell_text_prompt = charlie_text\n",
"set_seed(testing_seed)\n",
"\n",
"\n",
"GEN_TEMP = 0.6\n",
"\n",
"\n",
"final_pieces = [] \n",
"\n",
"full_text = cell_text_prompt\n",
"\n",
"print(full_text)\n",
"\n",
"\n",
"og_history_prompt = load_npz(SPEAKER)\n",
"\n",
"next_history_prompt_for_coarse = None\n",
"show_history_prompt_size(og_history_prompt,text=\"original history_prompt\")\n",
"\n",
"semantic_tokens_to_process = generate_text_semantic(\n",
"    full_text,\n",
"    history_prompt=og_history_prompt,\n",
"    temp=GEN_TEMP,\n",
"    min_eos_p=0.05,\n",
"    silent=True)\n",
"\n",
"\n",
"split_semantic_tokens = split_array_equally(semantic_tokens_to_process, 12)\n",
"\n",
"previous_segment_buffer = 64\n",
"\n",
"for i, coarse_semantic_tokens in enumerate(split_semantic_tokens):\n",
"    print(f\"processing semantic_tokens chunk {i + 1} of size: {len(coarse_semantic_tokens)}\")\n",
"\n",
"    \n",
"    if next_history_prompt_for_coarse is None:\n",
"        next_history_prompt_for_coarse = copy.deepcopy(og_history_prompt)\n",
"    else:\n",
"        next_history_prompt_for_coarse = merge_history_prompts(og_history_prompt, next_history_prompt_for_coarse, right_size=previous_segment_buffer)\n",
"\n",
"    show_history_prompt_size(next_history_prompt_for_coarse,text=\"next history prompt for coarse\", semantic_back_n=previous_segment_buffer)\n",
"\n",
"    #full_generation, audio_array = semantic_to_waveform(coarse_semantic_tokens, history_prompt=next_history_prompt_for_coarse, output_full=True)\n",
"\n",
"    coarse_tokens = generate_coarse(\n",
"        coarse_semantic_tokens,\n",
"        history_prompt=next_history_prompt_for_coarse,\n",
"        temp=0.7,\n",
"        silent=True,\n",
"        use_kv_caching=True,\n",
"        x_coarse_history_alignment_hack=-2\n",
"    )\n",
"    fine_tokens = generate_fine(\n",
"        coarse_tokens,\n",
"        history_prompt=next_history_prompt_for_coarse,\n",
"        temp=0.5,\n",
"    )\n",
"    audio_array = codec_decode(fine_tokens)\n",
"\n",
"    full_generation = {\n",
"        \"semantic_prompt\": coarse_semantic_tokens,\n",
"        \"coarse_prompt\": coarse_tokens,\n",
"        \"fine_prompt\": 
fine_tokens,\n", " }\n", "\n", "\n", "\n", " show_history_prompt_size(full_generation, text=\"full generation returned\", semantic_back_n=previous_segment_buffer) \n", "\n", "\n", "\n", " next_history_prompt_for_coarse = merge_history_prompts(next_history_prompt_for_coarse, full_generation, right_size=256)\n", "\n", " show_history_prompt_size(next_history_prompt_for_coarse, text=\"next history prompt for coarse + full generation, end of loop\")\n", "\n", " final_pieces.append(audio_array)\n", "\n", "\n", "Audio(np.concatenate(final_pieces), rate=SAMPLE_RATE)" ] }, { "cell_type": "code", "execution_count": 95, "id": "40e65987", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Disabling deterministic algorithms\n", "Set seed to 364314352\n", "Have I told you that story about how Charlie Parker became Charlie Parker? Parker's a young kid, pretty good on the Sax, gets up to play at a cutting session, and well, he fucks it up.\n", "['Have I told', 'you that story', 'about how Charlie', 'Parker became Charlie', \"Parker? Parker's a\", 'young kid, pretty', 'good on the', 'Sax, gets up', 'to play at', 'a cutting session,', 'and well, he', 'fucks it up.']\n", "\n", "original history_prompt\n", " original history_prompt semantic_prompt: (682,)\n", " Tokens: [147, 6242, 302] ... [10, 230, 56] <256 from end> [206, 10, 206] ... [2403, 147, 2009]\n", " original history_prompt coarse_prompt: (2, 1025)\n", " Tokens: [738, 738, 1017] ... [738, 738, 738] <384 from end> [738, 738, 738] ... [717, 121, 121]\n", " Tokens: [363, 363, 646] ... [937, 544, 937] <384 from end> [544, 544, 544] ... [424, 424, 424]\n", " original history_prompt fine_prompt: (8, 1025)\n", "\n", "next_segment_history_prompt\n", " next_segment_history_prompt semantic_prompt: (682,)\n", " Tokens: [147, 6242, 302] ... [8851, 27, 1041] <64 from end> [59, 28, 107] ... [2403, 147, 2009]\n", " next_segment_history_prompt coarse_prompt: (2, 1025)\n", " Tokens: [738, 738, 1017] ... [475, 488, 148] <96 from end> [103, 103, 148] ... [717, 121, 121]\n", " Tokens: [363, 363, 646] ... [544, 43, 364] <96 from end> [420, 420, 241] ... [424, 424, 424]\n", " next_segment_history_prompt fine_prompt: (8, 1025)\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "\n", "full_generation\n", " full_generation semantic_prompt: (70,)\n", " Tokens: [1380, 8735, 8385] ... [5008, 1075, 3490] <64 from end> [2152, 7956, 382] ... [2659, 147, 2009]\n", " full_generation coarse_prompt: (2, 105)\n", " Tokens: [62, 717, 62] ... [408, 887, 372] <96 from end> [372, 754, 850] ... [62, 62, 121]\n", " Tokens: [424, 424, 424] ... [913, 928, 984] <96 from end> [1002, 651, 501] ... [424, 424, 424]\n", " full_generation fine_prompt: (8, 105)\n", "\n", "next_segment_history_prompt, full generation, end of loop\n", " next_segment_history_prompt, full generation, end of loop semantic_prompt: (341,)\n", " Tokens: [6303, 6303, 1997] ... [7233, 5526, 288] <256 from end> [17, 2783, 4348] ... [2659, 147, 2009]\n", " next_segment_history_prompt, full generation, end of loop coarse_prompt: (2, 512)\n", " Tokens: [523, 523, 30] ... [185, 432, 565] <384 from end> [921, 921, 565] ... [62, 62, 121]\n", " Tokens: [113, 866, 924] ... [520, 942, 767] <384 from end> [245, 205, 763] ... 
[424, 424, 424]\n", " next_segment_history_prompt, full generation, end of loop fine_prompt: (8, 512)\n", " -->Piece 0 Finished at: 2023-05-10 01:55:18 in 4.75558876991272 seconds\n", "\n", "next_segment_history_prompt\n", " next_segment_history_prompt semantic_prompt: (341,)\n", " Tokens: [43, 457, 1232] ... [2403, 147, 2009] <64 from end> [2152, 7956, 382] ... [2659, 147, 2009]\n", " next_segment_history_prompt coarse_prompt: (2, 512)\n", " Tokens: [20, 291, 20] ... [717, 121, 121] <96 from end> [372, 754, 850] ... [62, 62, 121]\n", " Tokens: [947, 766, 128] ... [424, 424, 424] <96 from end> [1002, 651, 501] ... [424, 424, 424]\n", " next_segment_history_prompt fine_prompt: (8, 512)\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "\n", "full_generation\n", " full_generation semantic_prompt: (50,)\n", " Tokens: [8735, 147, 2305] ... [894, 3527, 56]\n", " full_generation coarse_prompt: (2, 75)\n", " Tokens: [62, 62, 121] ... [106, 106, 835]\n", " Tokens: [424, 424, 424] ... [424, 913, 913]\n", " full_generation fine_prompt: (8, 75)\n", "\n", "next_segment_history_prompt, full generation, end of loop\n", " next_segment_history_prompt, full generation, end of loop semantic_prompt: (341,)\n", " Tokens: [3174, 4488, 3174] ... [10, 41, 1221] <256 from end> [448, 2043, 3084] ... [894, 3527, 56]\n", " next_segment_history_prompt, full generation, end of loop coarse_prompt: (2, 512)\n", " Tokens: [62, 62, 62] ... [696, 560, 465] <384 from end> [502, 907, 979] ... [106, 106, 835]\n", " Tokens: [424, 424, 424] ... [723, 176, 991] <384 from end> [754, 808, 734] ... [424, 913, 913]\n", " next_segment_history_prompt, full generation, end of loop fine_prompt: (8, 512)\n", " -->Piece 1 Finished at: 2023-05-10 01:55:21 in 3.425469160079956 seconds\n", "\n", "next_segment_history_prompt\n", " next_segment_history_prompt semantic_prompt: (341,)\n", " Tokens: [43, 457, 1232] ... [2403, 147, 2009] <64 from end> [206, 206, 206] ... [894, 3527, 56]\n", " next_segment_history_prompt coarse_prompt: (2, 512)\n", " Tokens: [20, 291, 20] ... [717, 121, 121] <96 from end> [62, 408, 408] ... [106, 106, 835]\n", " Tokens: [947, 766, 128] ... [424, 424, 424] <96 from end> [424, 518, 518] ... [424, 913, 913]\n", " next_segment_history_prompt fine_prompt: (8, 512)\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "\n", "full_generation\n", " full_generation semantic_prompt: (61,)\n", " Tokens: [56, 206, 56] ... [7538, 388, 6358]\n", " full_generation coarse_prompt: (2, 91)\n", " Tokens: [738, 738, 738] ... [424, 70, 563]\n", " Tokens: [913, 518, 544] ... [570, 570, 570]\n", " full_generation fine_prompt: (8, 91)\n", "\n", "next_segment_history_prompt, full generation, end of loop\n", " next_segment_history_prompt, full generation, end of loop semantic_prompt: (341,)\n", " Tokens: [147, 7874, 2009] ... [8526, 1725, 1725] <256 from end> [6747, 187, 891] ... [7538, 388, 6358]\n", " next_segment_history_prompt, full generation, end of loop coarse_prompt: (2, 512)\n", " Tokens: [408, 62, 62] ... [99, 392, 683] <384 from end> [886, 112, 683] ... [424, 70, 563]\n", " Tokens: [913, 424, 424] ... [197, 304, 197] <384 from end> [751, 751, 761] ... [570, 570, 570]\n", " next_segment_history_prompt, full generation, end of loop fine_prompt: (8, 512)\n", " -->Piece 2 Finished at: 2023-05-10 01:55:25 in 3.9877853393554688 seconds\n", "\n", "next_segment_history_prompt\n", " next_segment_history_prompt semantic_prompt: (341,)\n", " Tokens: [43, 457, 1232] ... 
[2403, 147, 2009] <64 from end> [894, 3527, 56] ... [7538, 388, 6358]\n", " next_segment_history_prompt coarse_prompt: (2, 512)\n", " Tokens: [20, 291, 20] ... [717, 121, 121] <96 from end> [876, 106, 106] ... [424, 70, 563]\n", " Tokens: [947, 766, 128] ... [424, 424, 424] <96 from end> [841, 937, 424] ... [570, 570, 570]\n", " next_segment_history_prompt fine_prompt: (8, 512)\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "\n", "full_generation\n", " full_generation semantic_prompt: (60,)\n", " Tokens: [808, 3256, 50] ... [292, 7890, 4040]\n", " full_generation coarse_prompt: (2, 90)\n", " Tokens: [645, 291, 690] ... [208, 860, 865]\n", " Tokens: [52, 864, 745] ... [894, 71, 648]\n", " full_generation fine_prompt: (8, 90)\n", "\n", "next_segment_history_prompt, full generation, end of loop\n", " next_segment_history_prompt, full generation, end of loop semantic_prompt: (341,)\n", " Tokens: [528, 147, 7874] ... [5230, 8526, 1725] <256 from end> [1725, 6747, 187] ... [292, 7890, 4040]\n", " next_segment_history_prompt, full generation, end of loop coarse_prompt: (2, 512)\n", " Tokens: [62, 408, 62] ... [983, 99, 392] <384 from end> [683, 886, 112] ... [208, 860, 865]\n", " Tokens: [424, 913, 424] ... [996, 197, 304] <384 from end> [197, 751, 751] ... [894, 71, 648]\n", " next_segment_history_prompt, full generation, end of loop fine_prompt: (8, 512)\n", " -->Piece 3 Finished at: 2023-05-10 01:55:29 in 4.049088001251221 seconds\n", "\n", "next_segment_history_prompt\n", " next_segment_history_prompt semantic_prompt: (341,)\n", " Tokens: [43, 457, 1232] ... [2403, 147, 2009] <64 from end> [9490, 7538, 388] ... [292, 7890, 4040]\n", " next_segment_history_prompt coarse_prompt: (2, 512)\n", " Tokens: [20, 291, 20] ... [717, 121, 121] <96 from end> [612, 890, 890] ... [208, 860, 865]\n", " Tokens: [947, 766, 128] ... [424, 424, 424] <96 from end> [60, 703, 570] ... [894, 71, 648]\n", " next_segment_history_prompt fine_prompt: (8, 512)\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "\n", "full_generation\n", " full_generation semantic_prompt: (73,)\n", " Tokens: [2970, 122, 41] ... [2235, 6572, 6572] <64 from end> [9869, 7656, 7656] ... [92, 9296, 4093]\n", " full_generation coarse_prompt: (2, 109)\n", " Tokens: [1001, 860, 1017] ... [257, 370, 257] <96 from end> [20, 20, 20] ... [875, 1001, 523]\n", " Tokens: [904, 669, 896] ... [928, 928, 928] <96 from end> [765, 444, 221] ... [420, 923, 708]\n", " full_generation fine_prompt: (8, 109)\n", "\n", "next_segment_history_prompt, full generation, end of loop\n", " next_segment_history_prompt, full generation, end of loop semantic_prompt: (341,)\n", " Tokens: [6564, 648, 41] ... [7100, 167, 167] <256 from end> [4296, 5815, 657] ... [92, 9296, 4093]\n", " next_segment_history_prompt, full generation, end of loop coarse_prompt: (2, 512)\n", " Tokens: [370, 860, 208] ... [984, 428, 428] <384 from end> [402, 402, 565] ... [875, 1001, 523]\n", " Tokens: [279, 719, 416] ... [939, 582, 417] <384 from end> [444, 298, 767] ... [420, 923, 708]\n", " next_segment_history_prompt, full generation, end of loop fine_prompt: (8, 512)\n", " -->Piece 4 Finished at: 2023-05-10 01:55:34 in 4.435755729675293 seconds\n", "\n", "next_segment_history_prompt\n", " next_segment_history_prompt semantic_prompt: (341,)\n", " Tokens: [43, 457, 1232] ... [2403, 147, 2009] <64 from end> [9869, 7656, 7656] ... 
[92, 9296, 4093]\n", " next_segment_history_prompt coarse_prompt: (2, 512)\n", " Tokens: [20, 291, 20] ... [717, 121, 121] <96 from end> [20, 20, 20] ... [875, 1001, 523]\n", " Tokens: [947, 766, 128] ... [424, 424, 424] <96 from end> [765, 444, 221] ... [420, 923, 708]\n", " next_segment_history_prompt fine_prompt: (8, 512)\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "\n", "full_generation\n", " full_generation semantic_prompt: (65,)\n", " Tokens: [2184, 198, 198] ... [298, 4111, 526]\n", " full_generation coarse_prompt: (2, 97)\n", " Tokens: [530, 23, 224] ... [583, 751, 344]\n", " Tokens: [414, 811, 570] ... [404, 241, 519]\n", " full_generation fine_prompt: (8, 97)\n", "\n", "next_segment_history_prompt, full generation, end of loop\n", " next_segment_history_prompt, full generation, end of loop semantic_prompt: (341,)\n", " Tokens: [8735, 8385, 5008] ... [187, 891, 891] <256 from end> [891, 7100, 891] ... [298, 4111, 526]\n", " next_segment_history_prompt, full generation, end of loop coarse_prompt: (2, 512)\n", " Tokens: [62, 62, 717] ... [402, 162, 695] <384 from end> [501, 240, 162] ... [583, 751, 344]\n", " Tokens: [424, 424, 424] ... [53, 809, 831] <384 from end> [345, 559, 343] ... [404, 241, 519]\n", " next_segment_history_prompt, full generation, end of loop fine_prompt: (8, 512)\n", " -->Piece 5 Finished at: 2023-05-10 01:55:38 in 4.12956428527832 seconds\n", "\n", "next_segment_history_prompt\n", " next_segment_history_prompt semantic_prompt: (341,)\n", " Tokens: [43, 457, 1232] ... [2403, 147, 2009] <64 from end> [198, 198, 198] ... [298, 4111, 526]\n", " next_segment_history_prompt coarse_prompt: (2, 512)\n", " Tokens: [20, 291, 20] ... [717, 121, 121] <96 from end> [23, 224, 604] ... [583, 751, 344]\n", " Tokens: [947, 766, 128] ... [424, 424, 424] <96 from end> [811, 570, 519] ... [404, 241, 519]\n", " next_segment_history_prompt fine_prompt: (8, 512)\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "\n", "full_generation\n", " full_generation semantic_prompt: (107,)\n", " Tokens: [2970, 50, 41] ... [206, 206, 206] <64 from end> [206, 206, 206] ... [206, 206, 147]\n", " full_generation coarse_prompt: (2, 160)\n", " Tokens: [751, 699, 432] ... [62, 62, 62] <96 from end> [62, 408, 62] ... [408, 408, 121]\n", " Tokens: [831, 564, 765] ... [424, 424, 424] <96 from end> [424, 913, 424] ... [913, 913, 424]\n", " full_generation fine_prompt: (8, 160)\n", "\n", "next_segment_history_prompt, full generation, end of loop\n", " next_segment_history_prompt, full generation, end of loop semantic_prompt: (341,)\n", " Tokens: [107, 299, 196] ... [4488, 193, 206] <256 from end> [147, 6340, 1278] ... [206, 206, 147]\n", " next_segment_history_prompt, full generation, end of loop coarse_prompt: (2, 512)\n", " Tokens: [103, 537, 257] ... [62, 62, 717] <384 from end> [62, 62, 62] ... [408, 408, 121]\n", " Tokens: [71, 857, 913] ... [424, 424, 424] <384 from end> [424, 424, 424] ... [913, 913, 424]\n", " next_segment_history_prompt, full generation, end of loop fine_prompt: (8, 512)\n", " -->Piece 6 Finished at: 2023-05-10 01:55:44 in 6.043853759765625 seconds\n", "\n", "next_segment_history_prompt\n", " next_segment_history_prompt semantic_prompt: (341,)\n", " Tokens: [43, 457, 1232] ... [2403, 147, 2009] <64 from end> [206, 206, 206] ... [206, 206, 147]\n", " next_segment_history_prompt coarse_prompt: (2, 512)\n", " Tokens: [20, 291, 20] ... [717, 121, 121] <96 from end> [62, 408, 62] ... 
[408, 408, 121]\n", " Tokens: [947, 766, 128] ... [424, 424, 424] <96 from end> [424, 913, 424] ... [913, 913, 424]\n", " next_segment_history_prompt fine_prompt: (8, 512)\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "\n", "full_generation\n", " full_generation semantic_prompt: (150,)\n", " Tokens: [3174, 206, 206] ... [10, 10, 27] <64 from end> [345, 345, 4223] ... [258, 258, 258]\n", " full_generation coarse_prompt: (2, 225)\n", " Tokens: [408, 62, 717] ... [404, 257, 52] <96 from end> [887, 887, 404] ... [62, 62, 121]\n", " Tokens: [913, 424, 424] ... [928, 928, 928] <96 from end> [888, 969, 544] ... [424, 424, 518]\n", " full_generation fine_prompt: (8, 225)\n", "\n", "next_segment_history_prompt, full generation, end of loop\n", " next_segment_history_prompt, full generation, end of loop semantic_prompt: (341,)\n", " Tokens: [891, 7100, 891] ... [17, 2113, 3745] <256 from end> [5218, 117, 107] ... [258, 258, 258]\n", " next_segment_history_prompt, full generation, end of loop coarse_prompt: (2, 512)\n", " Tokens: [501, 240, 162] ... [185, 291, 565] <384 from end> [879, 228, 604] ... [62, 62, 121]\n", " Tokens: [345, 559, 343] ... [198, 298, 809] <384 from end> [516, 687, 363] ... [424, 424, 518]\n", " next_segment_history_prompt, full generation, end of loop fine_prompt: (8, 512)\n", " -->Piece 7 Finished at: 2023-05-10 01:55:52 in 8.245976686477661 seconds\n", "\n", "next_segment_history_prompt\n", " next_segment_history_prompt semantic_prompt: (341,)\n", " Tokens: [43, 457, 1232] ... [2403, 147, 2009] <64 from end> [345, 345, 4223] ... [258, 258, 258]\n", " next_segment_history_prompt coarse_prompt: (2, 512)\n", " Tokens: [20, 291, 20] ... [717, 121, 121] <96 from end> [887, 887, 404] ... [62, 62, 121]\n", " Tokens: [947, 766, 128] ... [424, 424, 424] <96 from end> [888, 969, 544] ... [424, 424, 518]\n", " next_segment_history_prompt fine_prompt: (8, 512)\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "\n", "full_generation\n", " full_generation semantic_prompt: (75,)\n", " Tokens: [258, 258, 258] ... [258, 258, 1554] <64 from end> [1554, 1554, 2198] ... [1190, 4263, 6702]\n", " full_generation coarse_prompt: (2, 112)\n", " Tokens: [62, 62, 62] ... [62, 62, 121] <96 from end> [62, 62, 62] ... [645, 385, 344]\n", " Tokens: [424, 424, 518] ... [424, 424, 424] <96 from end> [424, 424, 518] ... [489, 489, 596]\n", " full_generation fine_prompt: (8, 112)\n", "\n", "next_segment_history_prompt, full generation, end of loop\n", " next_segment_history_prompt, full generation, end of loop semantic_prompt: (341,)\n", " Tokens: [41, 6286, 5564] ... [167, 4296, 5815] <256 from end> [657, 657, 657] ... [1190, 4263, 6702]\n", " next_segment_history_prompt, full generation, end of loop coarse_prompt: (2, 512)\n", " Tokens: [495, 20, 950] ... [402, 402, 565] <384 from end> [321, 1001, 565] ... [645, 385, 344]\n", " Tokens: [673, 568, 889] ... [444, 298, 767] <384 from end> [952, 952, 767] ... [489, 489, 596]\n", " next_segment_history_prompt, full generation, end of loop fine_prompt: (8, 512)\n", " -->Piece 8 Finished at: 2023-05-10 01:55:56 in 4.434818983078003 seconds\n", "\n", "next_segment_history_prompt\n", " next_segment_history_prompt semantic_prompt: (341,)\n", " Tokens: [43, 457, 1232] ... [2403, 147, 2009] <64 from end> [1554, 1554, 2198] ... [1190, 4263, 6702]\n", " next_segment_history_prompt coarse_prompt: (2, 512)\n", " Tokens: [20, 291, 20] ... [717, 121, 121] <96 from end> [62, 62, 62] ... 
[645, 385, 344]\n", " Tokens: [947, 766, 128] ... [424, 424, 424] <96 from end> [424, 424, 518] ... [489, 489, 596]\n", " next_segment_history_prompt fine_prompt: (8, 512)\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "\n", "full_generation\n", " full_generation semantic_prompt: (109,)\n", " Tokens: [452, 9728, 4325] ... [28, 28, 28] <64 from end> [28, 6801, 9235] ... [602, 5934, 2617]\n", " full_generation coarse_prompt: (2, 163)\n", " Tokens: [208, 976, 1010] ... [148, 148, 148] <96 from end> [148, 148, 463] ... [408, 408, 408]\n", " Tokens: [889, 599, 560] ... [993, 993, 993] <96 from end> [993, 993, 747] ... [544, 544, 518]\n", " full_generation fine_prompt: (8, 163)\n", "\n", "next_segment_history_prompt, full generation, end of loop\n", " next_segment_history_prompt, full generation, end of loop semantic_prompt: (341,)\n", " Tokens: [196, 3208, 10] ... [206, 147, 6340] <256 from end> [1278, 2305, 147] ... [602, 5934, 2617]\n", " next_segment_history_prompt, full generation, end of loop coarse_prompt: (2, 512)\n", " Tokens: [408, 106, 738] ... [62, 62, 62] <384 from end> [62, 62, 62] ... [408, 408, 408]\n", " Tokens: [841, 363, 424] ... [424, 424, 424] <384 from end> [424, 424, 424] ... [544, 544, 518]\n", " next_segment_history_prompt, full generation, end of loop fine_prompt: (8, 512)\n", " -->Piece 9 Finished at: 2023-05-10 01:56:02 in 6.051714897155762 seconds\n", "\n", "next_segment_history_prompt\n", " next_segment_history_prompt semantic_prompt: (341,)\n", " Tokens: [43, 457, 1232] ... [2403, 147, 2009] <64 from end> [28, 6801, 9235] ... [602, 5934, 2617]\n", " next_segment_history_prompt coarse_prompt: (2, 512)\n", " Tokens: [20, 291, 20] ... [717, 121, 121] <96 from end> [148, 148, 463] ... [408, 408, 408]\n", " Tokens: [947, 766, 128] ... [424, 424, 424] <96 from end> [993, 993, 747] ... [544, 544, 518]\n", " next_segment_history_prompt fine_prompt: (8, 512)\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "\n", "full_generation\n", " full_generation semantic_prompt: (131,)\n", " Tokens: [2617, 2015, 662] ... [147, 206, 193] <64 from end> [206, 193, 206] ... [1613, 193, 206]\n", " full_generation coarse_prompt: (2, 196)\n", " Tokens: [408, 408, 408] ... [408, 408, 408] <96 from end> [408, 408, 408] ... [408, 408, 408]\n", " Tokens: [518, 518, 518] ... [518, 518, 518] <96 from end> [518, 518, 518] ... [518, 518, 518]\n", " full_generation fine_prompt: (8, 196)\n", "\n", "next_segment_history_prompt, full generation, end of loop\n", " next_segment_history_prompt, full generation, end of loop semantic_prompt: (341,)\n", " Tokens: [41, 10, 41] ... [59, 28, 107] <256 from end> [5558, 957, 10] ... [1613, 193, 206]\n", " next_segment_history_prompt, full generation, end of loop coarse_prompt: (2, 512)\n", " Tokens: [321, 310, 432] ... [103, 148, 25] <384 from end> [904, 106, 408] ... [408, 408, 408]\n", " Tokens: [646, 765, 571] ... [420, 241, 747] <384 from end> [964, 913, 544] ... [518, 518, 518]\n", " next_segment_history_prompt, full generation, end of loop fine_prompt: (8, 512)\n", " -->Piece 10 Finished at: 2023-05-10 01:56:09 in 6.866191625595093 seconds\n", "\n", "next_segment_history_prompt\n", " next_segment_history_prompt semantic_prompt: (341,)\n", " Tokens: [43, 457, 1232] ... [2403, 147, 2009] <64 from end> [206, 193, 206] ... [1613, 193, 206]\n", " next_segment_history_prompt coarse_prompt: (2, 512)\n", " Tokens: [20, 291, 20] ... [717, 121, 121] <96 from end> [408, 408, 408] ... 
[408, 408, 408]\n", "    Tokens: [947, 766, 128] ... [424, 424, 424] <96 from end> [518, 518, 518] ... [518, 518, 518]\n", "  next_segment_history_prompt fine_prompt: (8, 512)\n", "actual lengths we're using, x_semantic_history: 209 x_coarse_history: 626\n", "\n", "full_generation\n", "  full_generation semantic_prompt: (100,)\n", "    Tokens: [193, 528, 206] ... [28, 1133, 3190] <64 from end> [3190, 298, 17] ... [258, 258, 258]\n", "  full_generation coarse_prompt: (2, 150)\n", "    Tokens: [62, 408, 408] ... [463, 819, 182] <96 from end> [833, 890, 432] ... [408, 408, 62]\n", "    Tokens: [424, 518, 518] ... [364, 974, 920] <96 from end> [245, 228, 894] ... [518, 518, 424]\n", "  full_generation fine_prompt: (8, 150)\n", "\n", "next_segment_history_prompt, full generation, end of loop\n", "  next_segment_history_prompt, full generation, end of loop semantic_prompt: (341,)\n", "    Tokens: [9473, 9473, 2581] ... [193, 56, 193] <256 from end> [56, 4488, 193] ... [258, 258, 258]\n", "  next_segment_history_prompt, full generation, end of loop coarse_prompt: (2, 512)\n", "    Tokens: [52, 604, 3] ... [408, 408, 62] <384 from end> [408, 62, 408] ... [408, 408, 62]\n", "    Tokens: [928, 404, 619] ... [518, 518, 424] <384 from end> [518, 424, 913] ... [518, 518, 424]\n", "  next_segment_history_prompt, full generation, end of loop fine_prompt: (8, 512)\n", " -->Piece 11 Finished at: 2023-05-10 01:56:15 in 5.595268726348877 seconds\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 95, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Test: Can we also split up generate_text_semantic and get coherent and similar results?\n",
"\n",
"# Not really. It works, but it sounds weird. We're gonna have to pack the inference space.\n",
"\n",
"\n",
"SPEAKER = testing_SPEAKER\n",
"\n",
"\n",
"\n",
"cell_text_prompt = charlie_text\n",
"set_seed(testing_seed)\n",
"\n",
"\n",
"\n",
"semantic_tokens_full = []\n",
"\n",
"\n",
"full_text = cell_text_prompt\n",
"\n",
"print(full_text)\n",
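"# split_by_words is defined earlier in this notebook; as a reference, a minimal guarded\n",
"# fallback sketch (assumption: it simply groups the text n words at a time, which matches\n",
"# the pieces printed in this cell's output). The real definition, if in scope, wins:\n",
"if 'split_by_words' not in globals():\n",
"    def split_by_words(some_text, words_per_piece):\n",
"        words = some_text.split()\n",
"        return [\" \".join(words[i:i + words_per_piece]) for i in range(0, len(words), words_per_piece)]\n",
"\n",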
"pieces = split_by_words(full_text, 3)\n",
"print(pieces)\n",
"\n",
"final_pieces = []\n",
"\n",
"og_speaker_prompt = load_npz(SPEAKER) \n",
"show_history_prompt_size(og_speaker_prompt,text=\"original history_prompt\")\n",
"\n",
"\n",
"next_segment_history_prompt = None\n",
"next_semantic_for_coarse = None\n",
"\n",
"# I think we need more than 256 tokens of semantic space here; the history won't be good enough. But let's try.\n",
"previous_segment_buffer = 64\n",
"\n",
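"# merge_history_prompts is defined earlier in this notebook; as a reference, a minimal\n",
"# guarded fallback sketch of what it plausibly does, judging by how it's called here and\n",
"# by the merged sizes in the output (assumptions: trim the right prompt to right_size\n",
"# semantic tokens from the back, concatenate onto the left prompt, then trim the result\n",
"# to roughly 341 semantic tokens, i.e. about 512 coarse tokens, to fit the coarse window):\n",
"if 'merge_history_prompts' not in globals():\n",
"    def merge_history_prompts(left, right, right_size=128):\n",
"        right_trimmed = resize_history_prompt(right, tokens=right_size, from_front=False)\n",
"        merged = {\n",
"            key: np.hstack([left[key], right_trimmed[key]]).astype(np.int32)\n",
"            for key in (\"semantic_prompt\", \"coarse_prompt\", \"fine_prompt\")\n",
"        }\n",
"        return resize_history_prompt(merged, tokens=341, from_front=False)\n",
"\n",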
"for i, piece in enumerate(pieces):\n",
"    with measure_time(text=\"Piece\", index=i):\n",
"\n",
"        \n",
"        if next_segment_history_prompt is None:\n",
"            next_segment_history_prompt = copy.deepcopy(og_speaker_prompt)\n",
"        else:\n",
"            next_segment_history_prompt = merge_history_prompts(og_speaker_prompt, next_segment_history_prompt, right_size=previous_segment_buffer)\n",
"\n",
"        show_history_prompt_size(next_segment_history_prompt,text=\"next_segment_history_prompt\", semantic_back_n=previous_segment_buffer)\n",
"\n",
"        semantic_tokens_for_this_piece = generate_text_semantic(\n",
"            piece,\n",
"            history_prompt=next_segment_history_prompt,\n",
"            temp=GEN_TEMP,\n",
"            min_eos_p=0.05,\n",
"            #top_k = 50,\n",
"            #top_p = 0.90,\n",
"            silent=True,\n",
"        )\n",
"        \n",
"        semantic_tokens_full.append(semantic_tokens_for_this_piece)\n",
"\n",
"        coarse_tokens_for_this_piece = generate_coarse(\n",
"            semantic_tokens_for_this_piece,\n",
"            history_prompt=next_segment_history_prompt,\n",
"            temp=0.7,\n",
"            silent=True,\n",
"            use_kv_caching=True,\n",
"            x_coarse_history_alignment_hack=-2\n",
"        )\n",
"        fine_tokens_for_this_piece = generate_fine(\n",
"            coarse_tokens_for_this_piece,\n",
"            history_prompt=next_segment_history_prompt,\n",
"            temp=0.5,\n",
"        )\n",
"        audio_array = codec_decode(fine_tokens_for_this_piece)\n",
"\n",
"        full_generation = {\n",
"            \"semantic_prompt\": semantic_tokens_for_this_piece,\n",
"            \"coarse_prompt\": coarse_tokens_for_this_piece,\n",
"            \"fine_prompt\": fine_tokens_for_this_piece,\n",
"        }\n",
"\n",
"        show_history_prompt_size(full_generation, text=\"full_generation\", semantic_back_n=previous_segment_buffer)\n",
"\n",
"\n",
"        next_segment_history_prompt = merge_history_prompts(next_segment_history_prompt, full_generation, right_size=1024)\n",
"\n",
"        show_history_prompt_size(next_segment_history_prompt, text=\"next_segment_history_prompt, full generation, end of loop\")\n",
"\n",
"        final_pieces.append(audio_array)\n",
"\n",
"\n",
"\n",
"\"\"\"\n",
"\n",
"\n",
"print(f\" full len: {len(semantic_tokens_full)}\")\n",
"\n",
"#split_semantic_tokens = split_array_equally(semantic_tokens, 8)\n",
"\n",
"#for i, coarse_semantic_tokens in enumerate(split_semantic_tokens):\n",
"for i, piece in enumerate(pieces):\n",
"    coarse_semantic_tokens = semantic_tokens_full[i]\n",
"    print(f\"length of coarse_semantic_tokens {i + 1}: {len(coarse_semantic_tokens)}\")\n",
"    audio_array = semantic_to_waveform(coarse_semantic_tokens, history_prompt=SPEAKER)\n",
"    final_pieces += [audio_array, silence.copy()]\n",
"\"\"\"\n",
"\n",
"\n",
"\n",
"\n",
"Audio(np.concatenate(final_pieces), rate=SAMPLE_RATE)" ] }, { "cell_type": "code", "execution_count": null, "id": "d29cf0d7", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"def plot_heatmap(data, width_per_100_cols=1, title=None):\n",
"    if len(data.shape) == 1:\n",
"        data = data[np.newaxis, :]\n",
"    \n",
"    width = (data.shape[1] / 100) * width_per_100_cols\n",
"    fig, ax = plt.subplots(figsize=(width, 5))\n",
"    sns.heatmap(data, cmap='coolwarm', ax=ax)\n",
"    \n",
"    if title:\n",
"        plt.title(title)\n",
"    \n",
"    plt.show()\n",
"    \n",
"    \n",
"x_coarse_history = og_full_generation[\"coarse_prompt\"]\n",
"\n",
"print(x_coarse_history.shape)\n",
"#x_coarse_history = _flatten_codebooks(x_coarse_history) + SEMANTIC_VOCAB_SIZE\n",
"\n",
"print(full_generation[\"coarse_prompt\"].shape)" ] }, { "cell_type": "code", "execution_count": null, "id": "6708bacb", "metadata": {}, "outputs": [], "source": [ "import rich\n",
"from rich import print\n",
"from rich import pretty\n",
"from rich.pretty import pprint\n",
"from rich import inspect\n",
"\n",
"SPEAKER = \"en_fiery.npz\"\n",
"history_prompt_npz = np.load(SPEAKER)\n",
"\n",
"semantic_tokens = history_prompt_npz[\"semantic_prompt\"]\n",
"coarse_tokens = history_prompt_npz[\"coarse_prompt\"]\n",
"fine_tokens = history_prompt_npz[\"fine_prompt\"]\n",
"\n",
"print(f\"semantic_tokens shape: {semantic_tokens.shape}\")\n",
"print(f\"coarse_tokens shape: {coarse_tokens.shape}\")\n",
"print(f\"fine_tokens shape: {fine_tokens.shape}\")\n",
"\n",
"plot_heatmap(semantic_tokens)\n",
"plot_heatmap(coarse_tokens)\n",
"plot_heatmap(fine_tokens)\n",
"\n",
"# print shapes\n",
"\n",
"\n",
"\n",
"# The blue values seem to be silence. If we chunk up coarse, we should split on those.\n",
"# Is it mostly tokens 424 and 518? I could strip those, then resize semantic. That way if we chunk between words we're good.\n",
"# Or, just split on those...\n",
"inspect(semantic_tokens, title=\"semantic_tokens\")\n",
"inspect(coarse_tokens, title=\"coarse_tokens\")\n",
"inspect(fine_tokens, title=\"fine_tokens\")" ] },
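{ "cell_type": "code", "execution_count": null, "id": "e4a90c11", "metadata": {}, "outputs": [], "source": [ "# Added sketch, following the note above: treat coarse tokens 424 and 518 (the values that\n",
"# dominate the visually 'silent' blue bands in the heatmaps) as candidate silence markers,\n",
"# and find the columns where every codebook row sits on one of them. The token values are\n",
"# the guess from the previous cell, not verified; this only locates candidate chunk boundaries.\n",
"import numpy as np\n",
"\n",
"suspected_silence = np.array([424, 518])\n",
"\n",
"for row_idx, row in enumerate(coarse_tokens):\n",
"    in_set = np.isin(row, suspected_silence)\n",
"    print(f\"codebook {row_idx}: {in_set.sum()} of {len(row)} tokens in the suspected-silence set\")\n",
"\n",
"# columns where all codebook rows look silent -- plausible places to split coarse chunks\n",
"silent_cols = np.where(np.isin(coarse_tokens, suspected_silence).all(axis=0))[0]\n",
"print(f\"candidate split columns: {silent_cols}\")" ] },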
{ "cell_type": "code", "execution_count": null, "id": "aa01f71c", "metadata": {}, "outputs": [], "source": [ "sns.heatmap(next_coarse_tokens, cmap='coolwarm')\n",
"plt.title('Coarse Prompt')\n",
"plt.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "2485d1e4", "metadata": {}, "outputs": [], "source": [ "og_coarse = og_full_generation[\"coarse_prompt\"]\n",
"quick_cat_test = np.hstack([og_coarse,og_coarse]).astype(np.int32)\n",
"\n",
"width_per_100_cols = 1\n",
"\n",
"width1 = (og_coarse.shape[1] / 100) * width_per_100_cols\n",
"fig, ax = plt.subplots(figsize=(width1, 5))\n",
"sns.heatmap(og_coarse, cmap='coolwarm', ax=ax)\n",
"plt.title('Coarse Prompt orig')\n",
"plt.show()\n",
"\n",
"width2 = (quick_cat_test.shape[1] / 100) * width_per_100_cols\n",
"fig, ax = plt.subplots(figsize=(width2, 5))\n",
"\n",
"sns.heatmap(quick_cat_test, cmap='coolwarm', ax=ax)\n",
"plt.title('Coarse Prompt concat with itself')\n",
"plt.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "b131aa2f", "metadata": {}, "outputs": [], "source": [ "text = \"\"\"\n",
"Truth is, I don't think people understood what it was I was doing at Schaffer. \n",
"I wasn't there to conduct. \n",
"How many fucken morons can wave his arms and keep people in tempo? \n",
"I was there to push people beyond what's expected of them. \n",
"I believe that is an absolute necessity. \n",
"Otherwise we're depriving the world of the next Louis Armstrong, \n",
"or the next Charlie Parker. \n",
"Have I told you that story about how Charlie Parker became Charlie Parker?\n",
"Parker's a young kid, pretty good on the Sax, \n",
"gets up to play at a cutting session, \n",
"and well, he fucks it up. \n",
"And Jones nearly decapitates him for it, throws a cymbal at his head. \n",
"\"\"\"" ] }, { "cell_type": "code", "execution_count": null, "id": "8a9dcf9f", "metadata": {}, "outputs": [], "source": [ "# Test: When generate_text_semantic is given way too many tokens, and the audio is messed up, are we sure it's not the coarse function that is dying when trying to handle the big prompt?\n",
"# To figure out, let's give it some big text and split the semantic tokens ourselves\n",
"# voice speed is largely determined by how many words you ask generate_text_semantic to represent, though the history_prompt has a huge effect\n",
"# because that's how many words THAT was asked to represent\n",
"\n",
"# Result: no, it's the semantic model\n",
"\n",
"# you can see the tokens stop increasing at some point. Could be a good rule of thumb for chunk decisions\n",
"\n",
"GEN_TEMP = 0.6\n",
"SPEAKER = \"v2/en_speaker_6\"\n",
"silence = np.zeros(int(0.25 * SAMPLE_RATE)) # quarter second of silence\n",
"\n",
"# A common mistake that people make when trying to design something completely foolproof is to underestimate the ingenuity of complete fools.\n",
"import time\n",
"\n",
"import pandas\n",
"\n",
"set_seed(-1)\n",
"\n",
"this_segment_start_time = time.time()\n",
"\n",
"\n",
"print(f\"Segment Start at: {time.strftime('%Y-%m-%d %H:%M:%S')}\")\n",
"final_pieces = [] \n",
"\n",
"semantic_tokens_full = []\n",
"\n",
"\n",
"pieces = text.strip().split(\"\\n\")\n",
"\n",
"last_piece = ''\n",
"for i, piece in enumerate(pieces):\n",
"    piece = f\"{last_piece} {piece}\"\n",
"    print(f\"piece {i}: {piece}\")\n",
"\n",
"    semantic_tokens = generate_text_semantic(\n",
"        piece,\n",
"        history_prompt=SPEAKER,\n",
"        temp=GEN_TEMP,\n",
"        min_eos_p=0.05, \n",
"        silent=True,\n",
"    )\n",
"    last_piece = piece\n",
"    semantic_token_length = len(semantic_tokens)\n",
"    print(f\"length of semantic_tokens: {semantic_token_length}\")\n",
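"    semantic_tokens_full.append(semantic_tokens)  # added: keep each cumulative piece's tokens for the growth check below\n",
"\n",
"# Added sketch (not part of the original run): per the note above, the token count stops\n",
"# increasing once generate_text_semantic hits its length ceiling, so the growth between\n",
"# consecutive cumulative pieces is a cheap signal for picking chunk boundaries.\n",
"lengths = [len(s) for s in semantic_tokens_full]\n",
"deltas = [b - a for a, b in zip(lengths, lengths[1:])]\n",
"print(f\"semantic token counts per cumulative piece: {lengths}\")\n",
"print(f\"growth per added piece: {deltas}\")\n",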
, { "attachments": {}, "cell_type": "markdown", "id": "be8e125e", "metadata": {}, "source": [ "# $ \\\\ $" ] },
{ "attachments": {}, "cell_type": "markdown", "id": "03a16c1b", "metadata": {}, "source": [ "# Make a Long-Form Dialog with Bark" ] },
{ "attachments": {}, "cell_type": "markdown", "id": "06c5eff8", "metadata": {}, "source": [ "### Step 1: Format a script and speaker lookup" ] },
{ "cell_type": "code", "execution_count": null, "id": "5238b297", "metadata": {}, "outputs": [], "source": [
"speaker_lookup = {\"Samantha\": \"v2/en_speaker_9\", \"John\": \"v2/en_speaker_2\"}\n",
"\n",
"# Script generated by ChatGPT\n",
"script = \"\"\"\n",
"Samantha: Hey, have you heard about this new text-to-audio model called \"Bark\"?\n",
"\n",
"John: No, I haven't. What's so special about it?\n",
"\n",
"Samantha: Well, apparently it's the most realistic and natural-sounding text-to-audio model out there right now. People are saying it sounds just like a real person speaking.\n",
"\n",
"John: Wow, that sounds amazing. How does it work?\n",
"\n",
"Samantha: I think it uses advanced machine learning algorithms to analyze and understand the nuances of human speech, and then replicates those nuances in its own speech output.\n",
"\n",
"John: That's pretty impressive. Do you think it could be used for things like audiobooks or podcasts?\n",
"\n",
"Samantha: Definitely! In fact, I heard that some publishers are already starting to use Bark to create audiobooks. And I bet it would be great for podcasts too.\n",
"\n",
"John: I can imagine. It would be like having your own personal voiceover artist.\n",
"\n",
"Samantha: Exactly! I think Bark is going to be a game-changer in the world of text-to-audio technology.\"\"\"\n",
"script = script.strip().split(\"\\n\")\n",
"script = [s.strip() for s in script if s]\n",
"script" ] },
{ "attachments": {}, "cell_type": "markdown", "id": "ee547efd", "metadata": {}, "source": [ "### Step 2: Generate the audio for every speaker turn" ] },
{ "cell_type": "code", "execution_count": null, "id": "203e5081", "metadata": {}, "outputs": [], "source": [
"pieces = []\n",
"silence = np.zeros(int(0.5 * SAMPLE_RATE))  # half second of silence between turns\n",
"for line in script:\n",
"    speaker, text = line.split(\": \", 1)  # maxsplit=1 in case a turn contains another ': '\n",
"    audio_array = generate_audio(text, history_prompt=speaker_lookup[speaker])\n",
"    pieces += [audio_array, silence.copy()]" ] },
{ "attachments": {}, "cell_type": "markdown", "id": "7c54bada", "metadata": {}, "source": [ "### Step 3: Concatenate all of the audio and play it" ] },
{ "cell_type": "code", "execution_count": null, "id": "27a56842", "metadata": {}, "outputs": [], "source": [ "Audio(np.concatenate(pieces), rate=SAMPLE_RATE)" ] }
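, { "attachments": {}, "cell_type": "markdown", "id": "9e4d1f30", "metadata": {}, "source": [
"Optionally, write the concatenated dialog to disk as well as playing it inline. A minimal sketch, assuming scipy is installed; `dialog.wav` is an arbitrary filename." ] },
{ "cell_type": "code", "execution_count": null, "id": "9e4d1f31", "metadata": {}, "outputs": [], "source": [
"# Hedged sketch: save the concatenated dialog as a WAV file.\n",
"# Assumes `pieces` and SAMPLE_RATE from above; \"dialog.wav\" is arbitrary.\n",
"from scipy.io.wavfile import write as write_wav\n",
"\n",
"write_wav(\"dialog.wav\", SAMPLE_RATE, np.concatenate(pieces).astype(np.float32))" ] }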
, { "cell_type": "code", "execution_count": null, "id": "a1bc5877", "metadata": {}, "outputs": [], "source": [] }
], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.10" } }, "nbformat": 4, "nbformat_minor": 5 }