{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "39ea4bed", "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n", "\n", "\n", "from IPython.display import Audio\n", "import nltk # we'll use this to split into sentences\n", "import numpy as np\n", "\n", "from bark.generation import (\n", " generate_text_semantic,\n", " preload_models,\n", ")\n", "from bark.api import semantic_to_waveform\n", "from bark import generate_audio, SAMPLE_RATE" ] }, { "cell_type": "code", "execution_count": 29, "id": "776964b6", "metadata": {}, "outputs": [], "source": [ "preload_models()" ] }, { "cell_type": "code", "execution_count": null, "id": "1d03f4d2", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "74a025a4", "metadata": {}, "source": [ "# Simple Long-Form Generation\n", "We split longer text into sentences using `nltk` and generate the sentences one by one." ] }, { "cell_type": "code", "execution_count": 33, "id": "57b06e2a", "metadata": {}, "outputs": [], "source": [ "script = \"\"\"\n", "Hey, have you heard about this new text-to-audio model called \"Bark\"? \n", "Apparently, it's the most realistic and natural-sounding text-to-audio model \n", "out there right now. People are saying it sounds just like a real person speaking. \n", "I think it uses advanced machine learning algorithms to analyze and understand the \n", "nuances of human speech, and then replicates those nuances in its own speech output. \n", "It's pretty impressive, and I bet it could be used for things like audiobooks or podcasts. \n", "In fact, I heard that some publishers are already starting to use Bark to create audiobooks. \n", "It would be like having your own personal voiceover artist. 
{ "cell_type": "code", "execution_count": 33, "id": "57b06e2a", "metadata": {}, "outputs": [], "source": [ "script = \"\"\"\n", "Hey, have you heard about this new text-to-audio model called \"Bark\"? \n", "Apparently, it's the most realistic and natural-sounding text-to-audio model \n", "out there right now. People are saying it sounds just like a real person speaking. \n", "I think it uses advanced machine learning algorithms to analyze and understand the \n", "nuances of human speech, and then replicates those nuances in its own speech output. \n", "It's pretty impressive, and I bet it could be used for things like audiobooks or podcasts. \n", "In fact, I heard that some publishers are already starting to use Bark to create audiobooks. \n", "It would be like having your own personal voiceover artist. I really think Bark is going to \n", "be a game-changer in the world of text-to-audio technology.\n", "\"\"\".replace(\"\\n\", \" \").strip()" ] }, { "cell_type": "code", "execution_count": 34, "id": "f747f804", "metadata": {}, "outputs": [], "source": [ "sentences = nltk.sent_tokenize(script)" ] }, { "cell_type": "code", "execution_count": 35, "id": "17400a9b", "metadata": { "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:02<00:00, 43.03it/s]\n", "100%|████████████████████████████████████████████████████████████████████████| 17/17 [00:06<00:00, 2.45it/s]\n", "100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:04<00:00, 22.73it/s]\n", "100%|████████████████████████████████████████████████████████████████████████| 33/33 [00:13<00:00, 2.52it/s]\n", "100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 66.30it/s]\n", "100%|████████████████████████████████████████████████████████████████████████| 11/11 [00:04<00:00, 2.46it/s]\n", "100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:04<00:00, 20.99it/s]\n", "100%|████████████████████████████████████████████████████████████████████████| 35/35 [00:14<00:00, 2.46it/s]\n", "100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:03<00:00, 25.63it/s]\n", "100%|████████████████████████████████████████████████████████████████████████| 29/29 [00:11<00:00, 2.50it/s]\n", "100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:04<00:00, 23.90it/s]\n", "100%|████████████████████████████████████████████████████████████████████████| 30/30 [00:12<00:00, 2.46it/s]\n", "100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 53.24it/s]\n", "100%|████████████████████████████████████████████████████████████████████████| 14/14 [00:05<00:00, 2.51it/s]\n", "100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 50.63it/s]\n", "100%|████████████████████████████████████████████████████████████████████████| 15/15 [00:05<00:00, 2.57it/s]\n" ] } ], "source": [ "SPEAKER = \"v2/en_speaker_6\"\n", "silence = np.zeros(int(0.25 * SAMPLE_RATE)) # quarter second of silence\n", "\n", "pieces = []\n", "for sentence in sentences:\n", " audio_array = generate_audio(sentence, history_prompt=SPEAKER)\n", " pieces += [audio_array, silence.copy()] # short pause after each sentence\n" ] }, { "cell_type": "code", "execution_count": null, "id": "04cf77f9", "metadata": {}, "outputs": [], "source": [ "Audio(np.concatenate(pieces), rate=SAMPLE_RATE)" ] },
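{ "cell_type": "markdown", "id": "c9f671a4", "metadata": {}, "source": [ "Playing the result inline is handy, but for long-form audio you will often want it on disk as well. One way to do that (assuming `scipy` is installed; the filename is arbitrary) is to write the concatenated waveform out at `SAMPLE_RATE`:" ] }, { "cell_type": "code", "execution_count": null, "id": "d2a98b53", "metadata": {}, "outputs": [], "source": [ "from scipy.io.wavfile import write as write_wav\n", "\n", "# join the per-sentence pieces exactly as the Audio() call above does, then save them as a WAV file\n", "write_wav(\"bark_long_form.wav\", SAMPLE_RATE, np.concatenate(pieces))" ] },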
\n", "We use the `min_eos_p` kwarg in `generate_text_semantic`" ] }, { "cell_type": "code", "execution_count": 37, "id": "62807fd0", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:02<00:00, 38.05it/s]\n", "100%|████████████████████████████████████████████████████████████████████████| 18/18 [00:07<00:00, 2.46it/s]\n", "100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:03<00:00, 32.28it/s]\n", "100%|████████████████████████████████████████████████████████████████████████| 21/21 [00:08<00:00, 2.54it/s]\n", "100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 55.78it/s]\n", "100%|████████████████████████████████████████████████████████████████████████| 14/14 [00:05<00:00, 2.57it/s]\n", "100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:06<00:00, 14.73it/s]\n", "100%|████████████████████████████████████████████████████████████████████████| 35/35 [00:14<00:00, 2.47it/s]\n", "100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:02<00:00, 40.29it/s]\n", "100%|████████████████████████████████████████████████████████████████████████| 18/18 [00:07<00:00, 2.56it/s]\n", "100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:03<00:00, 32.92it/s]\n", "100%|████████████████████████████████████████████████████████████████████████| 20/20 [00:08<00:00, 2.47it/s]\n", "100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 68.87it/s]\n", "100%|████████████████████████████████████████████████████████████████████████| 12/12 [00:04<00:00, 2.62it/s]\n", "100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:02<00:00, 47.64it/s]\n", "100%|████████████████████████████████████████████████████████████████████████| 15/15 [00:06<00:00, 2.46it/s]\n" ] } ], "source": [ "GEN_TEMP = 0.6\n", "SPEAKER = \"v2/en_speaker_6\"\n", "silence = np.zeros(int(0.25 * SAMPLE_RATE)) # quarter second of silence\n", "\n", "pieces = []\n", "for sentence in sentences:\n", " semantic_tokens = generate_text_semantic(\n", " sentence,\n", " history_prompt=SPEAKER,\n", " temp=GEN_TEMP,\n", " min_eos_p=0.05, # this controls how likely the generation is to end\n", " )\n", "\n", " audio_array = semantic_to_waveform(semantic_tokens, history_prompt=SPEAKER,)\n", " pieces += [audio_array, silence.copy()]\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "133fec46", "metadata": {}, "outputs": [], "source": [ "Audio(np.concatenate(pieces), rate=SAMPLE_RATE)" ] }, { "cell_type": "code", "execution_count": null, "id": "6eee9f5a", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "be8e125e", "metadata": {}, "source": [ "# $ \\\\ $" ] }, { "cell_type": "markdown", "id": "03a16c1b", "metadata": {}, "source": [ "# Make a Long-Form Dialog with Bark" ] }, { "cell_type": "markdown", "id": "06c5eff8", "metadata": {}, "source": [ "### Step 1: Format a script and speaker lookup" ] }, { "cell_type": "code", "execution_count": 14, "id": "5238b297", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Samantha: Hey, have you heard about this new text-to-audio model called \"Bark\"?',\n", " \"John: No, I haven't. 
What's so special about it?\",\n", " \"Samantha: Well, apparently it's the most realistic and natural-sounding text-to-audio model out there right now. People are saying it sounds just like a real person speaking.\",\n", " 'John: Wow, that sounds amazing. How does it work?',\n", " 'Samantha: I think it uses advanced machine learning algorithms to analyze and understand the nuances of human speech, and then replicates those nuances in its own speech output.',\n", " \"John: That's pretty impressive. Do you think it could be used for things like audiobooks or podcasts?\",\n", " 'Samantha: Definitely! In fact, I heard that some publishers are already starting to use Bark to create audiobooks. And I bet it would be great for podcasts too.',\n", " 'John: I can imagine. It would be like having your own personal voiceover artist.',\n", " 'Samantha: Exactly! I think Bark is going to be a game-changer in the world of text-to-audio technology.']" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "speaker_lookup = {\"Samantha\": \"v2/en_speaker_9\", \"John\": \"v2/en_speaker_2\"}\n", "\n", "# Script generated by chat GPT\n", "script = \"\"\"\n", "Samantha: Hey, have you heard about this new text-to-audio model called \"Bark\"?\n", "\n", "John: No, I haven't. What's so special about it?\n", "\n", "Samantha: Well, apparently it's the most realistic and natural-sounding text-to-audio model out there right now. People are saying it sounds just like a real person speaking.\n", "\n", "John: Wow, that sounds amazing. How does it work?\n", "\n", "Samantha: I think it uses advanced machine learning algorithms to analyze and understand the nuances of human speech, and then replicates those nuances in its own speech output.\n", "\n", "John: That's pretty impressive. Do you think it could be used for things like audiobooks or podcasts?\n", "\n", "Samantha: Definitely! In fact, I heard that some publishers are already starting to use Bark to create audiobooks. And I bet it would be great for podcasts too.\n", "\n", "John: I can imagine. It would be like having your own personal voiceover artist.\n", "\n", "Samantha: Exactly! 
I think Bark is going to be a game-changer in the world of text-to-audio technology.\"\"\"\n", "script = script.strip().split(\"\\n\")\n", "script = [s.strip() for s in script if s]\n", "script" ] }, { "cell_type": "markdown", "id": "ee547efd", "metadata": {}, "source": [ "### Step 2: Generate the audio for every speaker turn" ] }, { "cell_type": "code", "execution_count": 15, "id": "203e5081", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:02<00:00, 34.03it/s]\n", "100%|████████████████████████████████████████████████████████████████████████| 22/22 [00:08<00:00, 2.55it/s]\n", "100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 71.58it/s]\n", "100%|████████████████████████████████████████████████████████████████████████| 11/11 [00:04<00:00, 2.65it/s]\n", "100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:04<00:00, 22.75it/s]\n", "100%|████████████████████████████████████████████████████████████████████████| 33/33 [00:13<00:00, 2.53it/s]\n", "100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 70.76it/s]\n", "100%|████████████████████████████████████████████████████████████████████████| 11/11 [00:04<00:00, 2.63it/s]\n", "100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:04<00:00, 20.46it/s]\n", "100%|████████████████████████████████████████████████████████████████████████| 36/36 [00:14<00:00, 2.47it/s]\n", "100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:04<00:00, 20.18it/s]\n", "100%|████████████████████████████████████████████████████████████████████████| 37/37 [00:14<00:00, 2.51it/s]\n", "100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:04<00:00, 23.04it/s]\n", "100%|████████████████████████████████████████████████████████████████████████| 32/32 [00:12<00:00, 2.48it/s]\n", "100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 54.64it/s]\n", "100%|████████████████████████████████████████████████████████████████████████| 14/14 [00:05<00:00, 2.58it/s]\n", "100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:03<00:00, 31.71it/s]\n", "100%|████████████████████████████████████████████████████████████████████████| 24/24 [00:09<00:00, 2.56it/s]\n" ] } ], "source": [ "pieces = []\n", "silence = np.zeros(int(0.5*SAMPLE_RATE))\n", "for line in script:\n", " speaker, text = line.split(\": \")\n", " audio_array = generate_audio(text, history_prompt=speaker_lookup[speaker], )\n", " pieces += [audio_array, silence.copy()]" ] }, { "cell_type": "markdown", "id": "7c54bada", "metadata": {}, "source": [ "### Step 3: Concatenate all of the audio and play it" ] }, { "cell_type": "code", "execution_count": null, "id": "27a56842", "metadata": {}, "outputs": [], "source": [ "Audio(np.concatenate(pieces), rate=SAMPLE_RATE)" ] }, { "cell_type": "code", "execution_count": null, "id": "a1bc5877", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", 
"version": "3.10.11" } }, "nbformat": 4, "nbformat_minor": 5 }