diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
new file mode 100644
index 0000000000000000000000000000000000000000..b7c95a39476397f9add818a392810ab40b1c97b2
--- /dev/null
+++ b/.devcontainer/devcontainer.json
@@ -0,0 +1,22 @@
+// For format details, see https://aka.ms/devcontainer.json. For config options, see the
+// README at: https://github.com/devcontainers/templates/tree/main/src/python
+{
+ "name": "Python 3",
+ // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
+ "image": "mcr.microsoft.com/devcontainers/python:1-3.10-bookworm"
+
+ // Features to add to the dev container. More info: https://containers.dev/features.
+ // "features": {},
+
+ // Use 'forwardPorts' to make a list of ports inside the container available locally.
+ // "forwardPorts": [],
+
+ // Use 'postCreateCommand' to run commands after the container is created.
+ // "postCreateCommand": "pip3 install --user -r requirements.txt",
+
+ // Configure tool-specific properties.
+ // "customizations": {},
+
+ // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root.
+ // "remoteUser": "root"
+}
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..348dcbbb885300c9671a05998c011b488055723c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,27 @@
+__pycache__/
+
+# Ignore all directories
+*/
+
+*.json
+*.mp4
+*.wav
+# But don't ignore these specific directories
+!/bark/
+!/bark_infinity/
+!/notebooks/
+!/webui/
+!/old_setup_files/
+!/bark_infinity/hubert/
+!/one-click-bark-installer/
+
+bark_samples/
+bark/assets/prompts/*.wav
+bark_infinity/assets/prompts/*.wav
+*.wav
+
+custom_speakers/
+.vscode
+bark_infinity.egg-info/
+.history/
+*.log
diff --git a/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/.ipynb_checkpoints/Untitled-checkpoint.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..363fcab7ed6e9634e198cf5555ceb88932c9a245
--- /dev/null
+++ b/.ipynb_checkpoints/Untitled-checkpoint.ipynb
@@ -0,0 +1,6 @@
+{
+ "cells": [],
+ "metadata": {},
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..609e2f201675aa127fd34b97779107c0d0305cd2
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,35 @@
+FROM nvidia/cuda:11.8.0-base-ubuntu22.04 as cuda
+
+ENV PYTHON_VERSION=3.10
+
+RUN export DEBIAN_FRONTEND=noninteractive \
+ && apt-get -qq update \
+ && apt-get -qq install --no-install-recommends \
+ libsndfile1-dev \
+ git \
+ python${PYTHON_VERSION} \
+ python${PYTHON_VERSION}-venv \
+ python3-pip \
+ && rm -rf /var/lib/apt/lists/*
+
+RUN ln -s -f /usr/bin/python${PYTHON_VERSION} /usr/bin/python3 && \
+ ln -s -f /usr/bin/python${PYTHON_VERSION} /usr/bin/python && \
+ ln -s -f /usr/bin/pip3 /usr/bin/pip
+
+RUN pip install --upgrade pip
+
+RUN pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
+
+FROM cuda as app
+# 2. Copy files
+COPY . /src
+
+WORKDIR /src
+# 3. Install dependencies
+RUN pip install -r requirements-pip.txt
+
+# 4. Install extra dependencies
+RUN pip install encodec rich-argparse
+
+EXPOSE 8082
+CMD ["python", "bark_webui.py"]
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..11aac3a9d4e5b3e8f6c8251b275895142190a40e
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) Suno, Inc
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.docker.md b/README.docker.md
new file mode 100644
index 0000000000000000000000000000000000000000..b644c60a2d5be7f16a4300697dd4ee28b6b72d01
--- /dev/null
+++ b/README.docker.md
@@ -0,0 +1,15 @@
+# Instructions for Docker
+
+Building the Docker image
+
+```bash
+docker build -t bark-infinity:latest .
+```
+
+Running the image
+
+```bash
+docker run --gpus all -p 7860:7860 -v "$(pwd)/.cache:/root/.cache" -v "$(pwd):/src" --rm -e GRADIO_SERVER_NAME=0.0.0.0 bark-infinity:latest
+```
+
+You can now use it at http://localhost:7860
diff --git a/README.md b/README.md
index df6b0291212495f97c65e45422e7cd8bdd33c749..c44f13703841b61f48d7405086181b459735a061 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,742 @@
---
-title: Bark
-emoji: ๐
-colorFrom: red
-colorTo: green
+title: bark
+app_file: bark_webui.py
sdk: gradio
sdk_version: 3.39.0
-app_file: app.py
-pinned: false
---
+# BARK INFINITY, Voices are Just Sounds.
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+#### Why this fork?
+
+I do not have a GPU, so I need Google Colab to run the Python code. Since this project provides notebooks, my focus is on them.
+
+[![Open In Colab](https://tinyurl.com/3m5bcd9h)](https://colab.research.google.com/github/steinhaug/bark-infinity/blob/main/notebooks/Bark-Infinity.ipynb)
+[![Open In Colab](https://tinyurl.com/jjbfsbk2)](https://colab.research.google.com/github/steinhaug/bark-infinity/blob/main/notebooks/Bark_Infinity_Long_Form_Audio_Colab.ipynb)
+
+
+
+
+# Bark INFINITY Automatic Windows Installer, NVIDIA (CPU update soon)
+
+
+### ⚠️ Note: make sure you fully extract the .zip file before running the .bat files. Check this image if you aren't sure: [install_help.PNG](https://raw.githubusercontent.com/JonathanFly/bark/main/one-click-bark-installer/install_help.PNG)
+
+## Install Prerequisites:
+
+1. **Just the regular Windows NVIDIA drivers**. You don't need anything else installed ahead of time. Not PyTorch. Nothing with `Cuda` in the name. Not even Python. In fact, if you installed anything on your Windows system without using a venv or conda, it may cause a problem.
+2. *(Optional But Recommended)* The Windows Terminal https://apps.microsoft.com/store/detail/windows-terminal/9N0DX20HK701 -- Bark still has a lot of text output, and it looks nicer and is easier to read in the Windows Terminal. But you can also use the regular Windows Command Prompt.
+
+## Install Steps
+
+1. Download the latest zip file from the releases page: https://github.com/JonathanFly/bark/releases
+2. Extract the zip file into a directory. Choose a place where Bark will be installed. You will unzip about six small files.
+3. Click on `INSTALL_bark_infinity_windows.bat` (you should not need to be administrator)
+4. If the install finished with no errors, close that terminal window. Close any other open command line windows as well.
+5. Click `LAUNCH_already_installed_bark_infinity_windows.bat`
+
+## Install Problems
+
+1. If you get a Windows permissions error, it seems to happen randomly; just trying again usually fixes it. You don't even need to restart from scratch, just rerun the script that threw the error.
+
+
+### Command Line:
+Click `TROUBLESHOOT_bark_setup_manually_by_entering_the_conda_environment.bat`
+```
+cd bark
+python bark_perform.py
+python bark_perform.py --help
+```
+### Troubleshooting:
+Click `TROUBLESHOOT_bark_setup_manually_by_entering_the_conda_environment.bat`
+
+```
+-----Manual Updates-----
+Type `conda update -y -n base conda` to update conda.
+Type `conda update -y --all --solver=libmamba` to update all packages.
+Type `conda clean --all` to free up disk space from unused versions.
+Type `ffdl install -U --add-path` to try to fix ffmpeg not found problems.
+Type `pip install -r requirements-extra.txt` to try to manually install pip requirements.
+
+Type `conda env update -y -f environment-cuda-installer.yml --prune --solver=libmamba` to update your env manually, if the .yml changed.
+Type `cd bark` to enter the bark directory and then `git pull` to update the repo code.
+Type `git branch` to view branches and then
+Type `git checkout <branch name>` to switch branches.
+(You can try `git checkout bark_amd_directml_test` branch if you have an AMD GPU)
+
+-----Still Not Working?-----
+Go ahead and @ me on Bark Official Discord, username "Jonathan Fly" jonathanfly.
+Don't worry about waking me up with a message, my Discord never makes audible alerts.
+
+-----How do I get out of here?-----
+Type 'conda deactivate' to exit this environment and go back to normal terminal.
+```
+
+![LAUNCH_already_installed_bark_infinity_windows.bat](https://github.com/JonathanFly/bark/assets/163408/fcd91d15-6bee-44c7-8c99-95ca48fbc1d5)
+
+
+
+# PyTorch 2.0 Bark AMD Install Test
+
+
+**DirectML works on AMD in PyTorch 2.0. Confirmed working.**
+It's not super fast but it's a lot faster than CPU.
+
+Bark AMD DirectML Instructions.
+
+What is DirectML?
+https://learn.microsoft.com/en-us/windows/ai/directml/gpu-pytorch-windows
+
+Install Miniconda. https://repo.anaconda.com/miniconda/Miniconda3-py310_23.3.1-0-Windows-x86_64.exe
+
+Then go to the Start menu and start a new "Anaconda Prompt", not the regular Windows command line.
+
+
+```
+conda update -y conda
+conda update -y -n base conda
+conda install -y -n base conda-libmamba-solver
+conda create --name pydml_torch2 -y python=3.10.6
+conda activate pydml_torch2
+```
+
+make sure you see (pydml_torch2) in the corner of your prompt.
+***(pydml_torch2) C:\Users\YourName***
+
+```
+conda install -y pip git --solver=libmamba
+conda update -y --all --solver=libmamba
+
+pip install ffmpeg_downloader
+ffdl install -U --add-path
+```
+Now quit out of the terminal and restart. We need ffmpeg in the path, which means you need to be able to type `ffmpeg -version` and have it work. If you close and restart, you should be able to do that.
+
+So close the terminal; close all Windows command lines or terminals to be sure.
+Then go back to the Start menu and start a new "Anaconda Prompt". This should be the same one you used for the install.
+
+```
+conda activate pydml_torch2
+```
+make sure you see (pydml_torch2) in the corner again. ***(pydml_torch2) C:\Users\YourName*** etc.
+
+Now try typing
+```
+ffmpeg -version
+```
+
+Do you see ffmpeg 6.0? If it doesn't work, you can keep going (you can still use .wav file outputs) and fix it later.
+
+Now the big install commands. These could take 5 to 15 minutes, and if you have a slow internet connection they could even take hours, because they download multiple gigabytes. So if it looks like it's frozen, let it run. Check your task manager and see if it's downloading.
+
+### For testing torch 2.0, just some giant pip installs:
+```
+pip install torch==2.0.0 torchvision==0.15.1 torch-directml==0.2.0.dev230426 opencv-python torchvision==0.15.1 wget torch-directml==0.2.0.dev230426 pygments numpy pandas tensorboard matplotlib tqdm pyyaml boto3 funcy torchaudio transformers pydub pathvalidate rich nltk chardet av hydra-core>=1.1 einops scipy num2words pywin32 ffmpeg ffmpeg-python sentencepiece spacy==3.5.2 librosa jsonschema pytorch_lightning==1.9.4
+
+pip install encodec flashy>=0.0.1 audiolm_pytorch==1.1.4 demucs
+
+pip install universal-startfile hydra_colorlog julius soundfile==0.12.1 gradio>=3.35.2 rich_argparse flashy>=0.0.1 ffmpeg_downloader rich_argparse devtools vector_quantize_pytorch
+
+pip install https://github.com/Sharrnah/fairseq/releases/download/v0.12.4/fairseq-0.12.4-cp310-cp310-win_amd64.whl
+```
+
+First set a SUNO_USE_DIRECTML environment variable. This tells Bark to use DirectML. If this doesn't work, you can edit `/bark_infinity/config.py` and set `SUNO_USE_DIRECTML` to `True` in the `DEFAULTS` section.
+```
+set SUNO_USE_DIRECTML=1
+```
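+
+(For reference, a minimal sketch of how such a flag is typically read on the Python side; this is an illustration, not the exact code in `bark_infinity/config.py`.)
+
+```python
+import os
+
+# Illustration only: treat the environment variable as a boolean switch.
+SUNO_USE_DIRECTML = os.environ.get("SUNO_USE_DIRECTML", "0").lower() in ("1", "true")
+```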
+
+Download Bark:
+```
+git clone https://github.com/JonathanFly/bark.git
+cd bark
+```
+Change to the AMD Test Version
+```
+git checkout bark_amd_directml_test
+```
+
+Now try running it. Bark has to download all the models the first time it runs, so it might look frozen for a while. It's another 10 gigs of files.
+```
+python bark_perform.py
+```
+When I tested this install, `bark_perform.py` seemed to freeze at downloading models without making progress. I don't know if it was a fluke, but I ran `python bark_webui.py` and it downloaded them fine.
+
+Start the Bark UI
+```
+python bark_webui.py
+```
+
+Things that don't work:
+1. Voice Cloning (might work?)
+2. Top_k and top_p
+3. Probably more things I haven't tested.
+
+### Start Bark UI Later
+1. Click Anaconda Prompt in start menu
+2. `conda activate pydml_torch2`
+3. `cd bark`
+4. `python bark_webui.py`
+
+### Make it faster? (Note for later, don't try yet)
+
+1. Install MKL exe https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl-download.html
+```
+conda install -y mkl mkl-service mkl_fft libcblas liblapacke liblapack blas-devel mkl-include mkl_random mkl-devel mkl-include libblas=*=*mkl mkl-static intel-openmp blas=*=*mkl -c intel -c conda-forge --solver=libmamba
+```
+
+# Bark AMD PyTorch 1.13.1 (slower)
+
+```
+conda update -y conda
+conda update -y -n base conda
+conda install -y -n base conda-libmamba-solver
+conda create --name pydml -y python=3.10.6
+conda activate pydml
+```
+
+make sure you see (pydml) in the corner of your prompt.
+***(pydml) C:\Users\YourName***
+
+```
+conda install -y pip git --solver=libmamba
+conda update -y --all --solver=libmamba
+
+pip install ffmpeg_downloader
+ffdl install -U --add-path
+```
+Now quit out of the terminal and restart. We need ffmpeg in the path, which means you need to be able to type `ffmpeg -version` and have it work. If you close and restart, you should be able to do that.
+
+So close the terminal; close all Windows command lines or terminals to be sure.
+Then go back to the Start menu and start a new "Anaconda Prompt". This should be the same one you used for the install.
+
+```
+conda activate pydml
+```
+make sure you see (pydml) in the corner again. ***(pydml) C:\Users\YourName*** etc.
+
+Now try typing
+```
+ffmpeg -version
+```
+
+Do you see ffmpeg 6.0? If it doesn't work, you can keep going (you can still use .wav file outputs) and fix it later.
+
+Now the big conda install command. This could take 5 to 15 minutes, and if you have a slow internet connection it could even take hours, because it downloads multiple gigabytes. So if it looks like it's frozen, let it run. Check your task manager and see if it's downloading.
+
+```
+conda install -y pytorch==1.13.1 pygments numpy pandas tensorboard matplotlib tqdm pyyaml boto3 funcy torchvision==0.14.1 torchaudio==0.13.1 cpuonly transformers pydub pathvalidate rich nltk chardet av hydra-core>=1.1 einops scipy num2words pywin32 ffmpeg ffmpeg-python sentencepiece spacy==3.5.2 librosa jsonschema -c pytorch -c conda-forge --solver=libmamba
+```
+Now that that's done, there are a few more things we need that are not in conda, so we have to use pip.
+
+This is where the install can go wrong. **We don't want anything to upgrade either torch or torchaudio to torch 2.0**, and it often happens by accident. (As far as I know, AMD DirectML on Windows only works with Torch 1.13, not 2.0. If anyone knows differently, let me know!)
+
+If you somehow end up installing torch 2.0, try `pip uninstall torch torchaudio` and then redo the big conda install command (the one with `pytorch==1.13.1` in it).
+
+```
+pip install universal-startfile hydra_colorlog julius soundfile==0.12.1 gradio>=3.35.2 rich_argparse flashy>=0.0.1 ffmpeg_downloader rich_argparse devtools
+```
+
+
+```
+pip install encodec flashy>=0.0.1 audiolm_pytorch==1.1.4 demucs --no-dependencies
+
+pip install https://github.com/Sharrnah/fairseq/releases/download/v0.12.4/fairseq-0.12.4-cp310-cp310-win_amd64.whl --no-dependencies
+```
+
+And now finally the actual `torch-directml` that has GPU support. I found that installing this last works best, but you could try doing it earlier.
+```
+pip install torch-directml==0.1.13.1.dev230413
+```
+If everything worked, you might be done.
+Now we install Bark, and then run one command line test with `bark_perform.py`.
+
+First set a SUNO_USE_DIRECTML environment variable. This tells Bark to use DirectML. If this doesn't work, you can edit `/bark_infinity/config.py` and set `SUNO_USE_DIRECTML` to `True` in the `DEFAULTS` section.
+```
+set SUNO_USE_DIRECTML=1
+```
+
+Download Bark:
+```
+git clone https://github.com/JonathanFly/bark.git
+cd bark
+```
+Change to the AMD Test Version
+```
+git checkout bark_amd_directml_test
+```
+
+Now try running it. Bark has to download all the models the first time it runs, so it might look frozen for a while. It's another 10 gigs of files.
+```
+python bark_perform.py
+```
+When I tested this install, `bark_perform.py` seemed to freeze at downloading models without making progress. I don't know if it was a fluke, but I ran `python bark_webui.py` and it downloaded them fine.
+
+Start the Bark UI
+```
+python bark_webui.py
+```
+
+Things that don't work:
+1. Voice Cloning
+2. Top_k and top_p
+3. Probably more things I haven't tested.
+
+### Start Bark UI Later
+1. Click Anaconda Prompt in start menu
+2. `conda activate pydml`
+3. `cd bark`
+4. `python bark_webui.py`
+
+### Make it faster? (Note for later, don't try yet)
+
+1. Install MKL exe https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl-download.html
+```
+conda install -y mkl mkl-service mkl_fft libcblas liblapacke liblapack blas-devel mkl-include mkl_random mkl-devel mkl-include libblas=*=*mkl mkl-static intel-openmp blas=*=*mkl -c intel -c conda-forge --solver=libmamba
+```
+
+
+Low GPU memory? No problem. CPU offloading. Somewhat easy install?
+
+# Install Bark Infinity Any OS With Mamba (or Conda)
+
+
+## Mamba Install (Still Works) (Should work...)
+
+
+
+(Mamba is a fast version of conda. They should work the same if you install either one, just change mamba to conda or vice-versa.)
+
+Pip and conda/mamba are two _different_ ways of installing Bark Infinity. If you use **Mamba**, do not install anything else first. Don't install _pytorch_, and do not install anything with 'CUDA' in the name. You don't need to look up a YouTube tutorial. Just type the commands. The only thing you need installed is the NVIDIA drivers.
+
+**Take note of which lines are for NVIDIA or CPU, or Linux or Windows.**
+
+There is one exception: on Windows, if you don't have the better Windows Terminal installed, it is a nice-to-have: https://apps.microsoft.com/store/detail/windows-terminal/9N0DX20HK701
+
+You don't have to install it, but it may display the output from the bark commands better. When you start **Anaconda Prompt (miniconda3)** you can do it from the new Windows Terminal app: clicking on the down arrow next to the plus should let you pick **Anaconda Prompt (miniconda3)**.
+
+1. Go here: [https://github.com/conda-forge/miniforge#mambaforge](https://github.com/conda-forge/miniforge#mambaforge)
+
+2. Download a **Python 3.10 Miniconda3** installer for your OS. Windows 64-bit, macOS, and Linux probably don't need a guide.
+   a. Install the **Mambaforge** for your OS, not specifically Windows. OSX for OSX, etc.
+   b. Don't install Mambaforge-pypy3. (It probably works fine, it is just not what I tested.) Install the one above that, just plain **Mambaforge**. Or you can use **Conda**; Mamba should be faster, but sometimes Conda may be more compatible.
+
+3. Install the **Python 3.10 Miniconda3** exe. Then start the **Miniforge Prompt** terminal, which is a new program it installed. You will always use this program for Bark.
+
+4. Start **Miniforge Prompt**. Be careful not to start the regular Windows command line. (Unless you installed the new Terminal and know how to switch.) It should say **"Anaconda Prompt (miniconda3)"**.
+
+You should also see a terminal that says "**(base)**".
+
+### Do not move forward until you see _(base)_.
+
+5. **Choose the place to install the Bark Infinity directory.** You can also just leave it at the default. If you make a LOT of audio, you may want to think about a place with a lot of space.
+
+When you start **"Anaconda Prompt (miniconda3)"** you will be in a directory, in Windows probably something like **"C:\Users\YourName"**. It's okay to install there. Just remember where you put it. It will be in **/bark**. (If you already had bark-infinity installed and want to update instead of reinstalling, skip to the end.)
+
+6. Type the next commands _exactly_. Hit "Y" for yes where you need to:
+
+
+
+```
+mamba update -y mamba
+mamba create --name bark-infinity python=3.10
+mamba activate bark-infinity
+
+## NVIDIA GPU ONLY
+mamba install -y -k cuda ninja git pip -c nvidia/label/cuda-11.7.0 -c nvidia
+pip install torch==2.0.1+cu117 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117
+## END NVIDIA GPU ONLY
+
+## CPU ONLY, Or MacOS
+mamba install -y -k ninja git
+pip install torch torchvision torchaudio
+## END CPU ONLY, Or MacOS
+
+
+## WINDOWS ONLY fairseq
+pip install fairseq@https://github.com/Sharrnah/fairseq/releases/download/v0.12.4/fairseq-0.12.4-cp310-cp310-win_amd64.whl
+
+## NON-WINDOWS fairseq
+mamba install fairseq
+
+pip install audiolm_pytorch==1.1.4 --no-deps
+
+git clone https://github.com/JonathanFly/bark.git
+cd bark
+
+pip install -r barki-allpip.txt --upgrade
+ffdl install -U --add-path
+```
+
+# Run Bark Infinity
+
+## Run command line version
+```
+python bark_perform.py
+```
+## Run web ui version
+```
+python bark_webui.py
+```
+
+(If you see a warning that "No GPU being used. Careful, inference might be very slow!" after `python bark_perform.py`, then something may be wrong if you have a GPU. If you *don't* see that warning, then the GPU is working.)
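+
+To double-check that PyTorch can actually see your GPU (a generic PyTorch check for the NVIDIA/CUDA install path, not something specific to Bark):
+
+```python
+import torch
+
+# Prints True plus the device name if CUDA is available to PyTorch.
+print(torch.cuda.is_available())
+if torch.cuda.is_available():
+    print(torch.cuda.get_device_name(0))
+```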
+
+# Start Bark Infinity At A Later Time
+
+To restart later, start **Miniforge Prompt** (not the regular prompt). Make sure you see (base). Then type a command to activate **bark-infinity** instead of base, like this:
+
+```
+mamba activate bark-infinity
+cd bark
+python bark_webui.py
+```
+
+# Update Bark Infinity
+
+```
+git pull
+pip install -r barki-allpip.txt --upgrade
+```
+
+I have so much good Bark I need to post at [twitter.com/jonathanfly](https://twitter.com/jonathanfly)
+
+
+# The Past:
+
+Bark Infinity started as a humble command line wrapper, a CLI. Built from simple keyword commands, it was a proof of concept, a glimmer of potential.
+
+# The Present:
+
+Bark Infinity _evolved_, expanding across dimensions. Infinite Length, Infinite Voices, and a true high point in human history: [Infinite Awkwardness](https://twitter.com/jonathanfly/status/1650001584485552130). But for some people, the time-tested command line interface was not a good fit. Many couldn't even try Bark, struggling with CUDA gods and being left with cryptic error messages and a chaotic computer. Many people felt very... UN INFINITE.
+
+# The Future:
+
+Bark Infinity was born in the command line, and Bark Infinity grew within the command line. We live in the era where old fashioned command line applications are wrapped in fancy Gradio UIs and One Click Installers. We all must adapt to a changing world, right? *Or do we?*
+
+
+
+## (OLD NOT UPDATED) README
+
+### 1. INFINITY VOICES
+Discover cool new voices and reuse them. Performers, musicians, sound effects, two party dialog scenes. Save and share them. Every audio clip saves a speaker.npz file with the voice. To reuse a voice, move the generated speaker.npz file (named the same as the .wav file) to the "prompts" directory inside "bark" where all the other .npz files are.
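+
+As a minimal sketch of reusing a saved voice from Python (assuming, as the upstream `bark` package allows, that `history_prompt` accepts a path to a saved speaker `.npz`; the file name below is just an example):
+
+```python
+from bark import SAMPLE_RATE, generate_audio, preload_models
+from scipy.io.wavfile import write as write_wav
+
+preload_models()
+
+# Point history_prompt at the speaker .npz saved alongside an earlier .wav.
+audio_array = generate_audio(
+    "Same voice, brand new words.",
+    history_prompt="bark/assets/prompts/my_saved_voice.npz",  # hypothetical file name
+)
+write_wav("reused_voice.wav", SAMPLE_RATE, audio_array)
+```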
+
+With random celebrity appearances!
+
+(I accidentally left a bunch of voices in the repo, some of them are pretty good. Use --history_prompt 'en_fiery' for the same voice as the audio sample right after this sentence.)
+
+https://user-images.githubusercontent.com/163408/233747981-173b5f03-654e-4a0e-b71b-5d220601fcc7.mp4
+
+
+### 2. INFINITY LENGTH
+Any length prompt and audio clips. Sometimes the final result is seamless, sometimes it's stable (but usually not both!).
+
+Now with Slowly Morphing Rick Rolls! Can you even spot the seams in the most earnest Rick Rolls you've ever heard in your life?
+
+https://user-images.githubusercontent.com/163408/233747400-b18411f8-afcb-437d-9288-c54cc2c95e62.mp4
+
+### Confused Travolta Mode
+Confused Travolta GIF
+![confused_travolta](https://user-images.githubusercontent.com/163408/233747428-c6bf03e2-b3ce-4ce3-a29d-836bf73a4ec2.gif)
+
+Can your text-to-speech model stammer and stall like a student answering a question about a book they didn't read? Bark can. That's the human touch. The *semantic* touch. You can almost feel the awkward silence through the screen.
+
+## But Wait, There's More: Travolta Mode Isn't Just A Joke
+
+Are you tired of telling your TTS model what to say? Why not take a break and let your TTS model do the work for you. With enough patience and Confused Travolta Mode, Bark can finish your jokes for you.
+
+https://user-images.githubusercontent.com/163408/233746957-f3bbe25f-c8f0-4570-97b1-1005e1b40cbe.mp4
+
+Truly we live in the future. It might take 50 tries to get a joke and it's probably an accident, but all 49 failures are also *very* amusing, so it's a win/win. (That's right, I set a single function flag to False in Bark and raved about the amazing new feature. Everything here is small potatoes really.)
+
+https://user-images.githubusercontent.com/163408/233746872-cac78447-8e87-49e7-b79b-28ec51264019.mp4
+
+
+
+_**BARK INFINITY** is possible because Bark is such an amazingly simple and powerful model that even I could poke around easily._
+
+_For music, I recommend using the `--split_by_lines` option and making sure you use a multiline string as input. You'll generally get better results if you manually split your text, which I neglected to provide an easy way to do because I stayed up too late listening to 100 different Bark versions of a scene from Andor and failed "Why was 6 afraid of 7" jokes._
+
+## Command Line Options (Some of these parameters are not implemented.)
+
+Type --help or use the GUI
+```
+python bark_perform.py --help
+```
+### prompt_file input text file example
+```myprompts.txt
+This is the first prompt.
+Lots of text here maybe.
+As long as you want.
+
+AAAAA
+
+This is the second prompt.
+
+AAAAA
+
+This is the third prompt.
+```
+
+```
+python bark_perform.py --prompt_file myprompts.txt --split_input_into_separate_prompts_by string --split_input_into_separate_prompts_by_value AAAAA --output_dir myprompts_samples
+```
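+
+For intuition, this is roughly the kind of split that command asks for (an illustrative sketch only, not the actual `bark_perform.py` implementation):
+
+```python
+# Illustration: split a prompt file on a separator string, as the
+# --split_input_into_separate_prompts_by options are assumed to do.
+with open("myprompts.txt", encoding="utf-8") as f:
+    prompts = [chunk.strip() for chunk in f.read().split("AAAAA") if chunk.strip()]
+
+print(len(prompts))  # 3 separate prompts for the example file above
+```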
+
+
+# Bark Original Readme
+
+[![](https://dcbadge.vercel.app/api/server/J2B2vsjKuE?style=flat&compact=True)](https://discord.gg/J2B2vsjKuE)
+[![Twitter](https://img.shields.io/twitter/url/https/twitter.com/OnusFM.svg?style=social&label=@OnusFM)](https://twitter.com/OnusFM)
+
+
+[Examples](https://suno-ai.notion.site/Bark-Examples-5edae8b02a604b54a42244ba45ebc2e2) • [Suno Studio Waitlist](https://3os84zs17th.typeform.com/suno-studio) • [Updates](#updates) • [How to Use](#usage-in-python) • [Installation](#installation) • [FAQ](#faq)
+
+
+
+
+
+
+
+Bark is a transformer-based text-to-audio model created by [Suno](https://suno.ai). Bark can generate highly realistic, multilingual speech as well as other audio - including music, background noise and simple sound effects. The model can also produce nonverbal communications like laughing, sighing and crying. To support the research community, we are providing access to pretrained model checkpoints, which are ready for inference and available for commercial use.
+
+## ⚠ Disclaimer
+Bark was developed for research purposes. It is not a conventional text-to-speech model but instead a fully generative text-to-audio model, which can deviate in unexpected ways from provided prompts. Suno does not take responsibility for any output generated. Use at your own risk, and please act responsibly.
+
+## Demos
+
+[![Open in Spaces](https://img.shields.io/badge/🤗-Open%20in%20Spaces-blue.svg)](https://huggingface.co/spaces/suno/bark)
+[![Open on Replicate](https://img.shields.io/badge/®️-Open%20on%20Replicate-blue.svg)](https://replicate.com/suno-ai/bark)
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1eJfA2XUa-mXwdMy7DoYKVYHI1iTd9Vkt?usp=sharing)
+
+## Updates
+
+**2023.05.01**
+- Bark is now licensed under the MIT License, meaning it's now available for commercial use!
+- 2x speed-up on GPU. 10x speed-up on CPU. We also added an option for a smaller version of Bark, which offers additional speed-up with the trade-off of slightly lower quality.
+- [Long-form generation](notebooks/long_form_generation.ipynb), voice consistency enhancements and other examples are now documented in a new [notebooks](./notebooks) section.
+- We created a [voice prompt library](https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c). We hope this resource helps you find useful prompts for your use cases! You can also join us on [Discord](https://discord.gg/J2B2vsjKuE), where the community actively shares useful prompts in the **#audio-prompts** channel.
+- Growing community support and access to new features here:
+
+ [![](https://dcbadge.vercel.app/api/server/J2B2vsjKuE)](https://discord.gg/J2B2vsjKuE)
+
+- You can now use Bark with GPUs that have low VRAM (<4GB).
+
+**2023.04.20**
+- Bark release!
+
+## Usage in Python
+
+
+
+### Basics
+
+```python
+from bark import SAMPLE_RATE, generate_audio, preload_models
+from scipy.io.wavfile import write as write_wav
+from IPython.display import Audio
+
+# download and load all models
+preload_models()
+
+# generate audio from text
+text_prompt = """
+    Hello, my name is Suno. And, uh — and I like pizza. [laughs]
+    But I also have other interests such as playing tic tac toe.
+"""
+audio_array = generate_audio(text_prompt)
+
+# save audio to disk
+write_wav("bark_generation.wav", SAMPLE_RATE, audio_array)
+
+# play text in notebook
+Audio(audio_array, rate=SAMPLE_RATE)
+```
+
+[pizza.webm](https://user-images.githubusercontent.com/5068315/230490503-417e688d-5115-4eee-9550-b46a2b465ee3.webm)
+
+
+
+
+
+### Foreign Language
+
+Bark supports various languages out-of-the-box and automatically determines language from input text. When prompted with code-switched text, Bark will attempt to employ the native accent for the respective languages. English quality is best for the time being, and we expect other languages to further improve with scaling.
+
+
+
+```python
+
+text_prompt = """
+    추석은 내가 가장 좋아하는 명절이다. 나는 며칠 동안 휴식을 취하고 친구 및 가족과 시간을 보낼 수 있습니다.
+"""
+audio_array = generate_audio(text_prompt)
+```
+[suno_korean.webm](https://user-images.githubusercontent.com/32879321/235313033-dc4477b9-2da0-4b94-9c8b-a8c2d8f5bb5e.webm)
+
+*Note: since Bark recognizes languages automatically from input text, it is possible to use, for example, a German history prompt with English text. This usually leads to English audio with a German accent.*
+
+
+
+
+
+### Music
+
+Bark can generate all types of audio, and, in principle, doesn't see a difference between speech and music. Sometimes Bark chooses to generate text as music, but you can help it out by adding music notes around your lyrics.
+
+
+
+```python
+text_prompt = """
+    ♪ In the jungle, the mighty jungle, the lion barks tonight ♪
+"""
+audio_array = generate_audio(text_prompt)
+```
+[lion.webm](https://user-images.githubusercontent.com/5068315/230684766-97f5ea23-ad99-473c-924b-66b6fab24289.webm)
+
+
+
+
+### Voice Presets
+
+Bark supports 100+ speaker presets across [supported languages](#supported-languages). You can browse the library of speaker presets [here](https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c), or in the [code](bark/assets/prompts). The community also often shares presets in [Discord](https://discord.gg/J2B2vsjKuE).
+
+Bark tries to match the tone, pitch, emotion and prosody of a given preset, but does not currently support custom voice cloning. The model also attempts to preserve music, ambient noise, etc.
+
+
+
+```python
+text_prompt = """
+ I have a silky smooth voice, and today I will tell you about
+ the exercise regimen of the common sloth.
+"""
+audio_array = generate_audio(text_prompt, history_prompt="v2/en_speaker_1")
+```
+
+[sloth.webm](https://user-images.githubusercontent.com/5068315/230684883-a344c619-a560-4ff5-8b99-b4463a34487b.webm)
+
+
+### Generating Longer Audio
+
+By default, `generate_audio` works well with around 13 seconds of spoken text. For an example of how to do long-form generation, see this [example notebook](notebooks/long_form_generation.ipynb).
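+
+The general idea (shown in full in the notebook) is to split long text into sentence-sized chunks, generate each chunk while carrying the voice forward, and concatenate the audio. A minimal sketch, assuming `generate_audio(..., output_full=True)` returns the full generation (which can be reused as the next `history_prompt`) along with the audio:
+
+```python
+import numpy as np
+from bark import SAMPLE_RATE, generate_audio, preload_models
+from scipy.io.wavfile import write as write_wav
+
+preload_models()
+
+sentences = [
+    "Chunk one of a much longer script.",
+    "Chunk two, hopefully in the same voice.",
+]
+
+pieces = []
+history = "v2/en_speaker_1"  # start from a preset so the voice stays stable
+for sentence in sentences:
+    # Reuse each chunk's full generation as the history for the next chunk.
+    history, audio = generate_audio(sentence, history_prompt=history, output_full=True)
+    pieces.append(audio)
+
+write_wav("long_form.wav", SAMPLE_RATE, np.concatenate(pieces))
+```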
+
+
+Example long-form generations (from the example notebook):
+
+[dialog.webm](https://user-images.githubusercontent.com/2565833/235463539-f57608da-e4cb-4062-8771-148e29512b01.webm)
+
+[longform_advanced.webm](https://user-images.githubusercontent.com/2565833/235463547-1c0d8744-269b-43fe-9630-897ea5731652.webm)
+
+[longform_basic.webm](https://user-images.githubusercontent.com/2565833/235463559-87efe9f8-a2db-4d59-b764-57db83f95270.webm)
+
+
+
+
+
+
+## Installation
+
+```
+pip install git+https://github.com/suno-ai/bark.git
+```
+
+or
+
+```
+git clone https://github.com/suno-ai/bark
+cd bark && pip install .
+```
+*Note: Do NOT use 'pip install bark'. It installs a different package, which is not managed by Suno.*
+
+
+## Hardware and Inference Speed
+
+Bark has been tested and works on both CPU and GPU (`pytorch 2.0+`, CUDA 11.7 and CUDA 12.0).
+
+On enterprise GPUs and PyTorch nightly, Bark can generate audio in roughly real-time. On older GPUs, default Colab, or CPU, inference time might be significantly slower. For older GPUs or CPU you might want to consider using smaller models. Details can be found in our tutorial sections here.
+
+The full version of Bark requires around 12GB of VRAM to hold everything on GPU at the same time.
+To use a smaller version of the models, which should fit into 8GB VRAM, set the environment flag `SUNO_USE_SMALL_MODELS=True`.
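+
+For example (a minimal sketch; the flag just needs to be set before Bark loads its models):
+
+```python
+import os
+
+# Must be set before the bark models are loaded.
+os.environ["SUNO_USE_SMALL_MODELS"] = "True"
+
+from bark import preload_models
+
+preload_models()  # downloads/loads the smaller model variants
+```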
+
+If you don't have hardware available or if you want to play with bigger versions of our models, you can also sign up for early access to our model playground [here](https://3os84zs17th.typeform.com/suno-studio).
+
+## Details
+
+Bark is a fully generative text-to-audio model developed for research and demo purposes. It follows a GPT-style architecture similar to [AudioLM](https://arxiv.org/abs/2209.03143) and [Vall-E](https://arxiv.org/abs/2301.02111) and a quantized audio representation from [EnCodec](https://github.com/facebookresearch/encodec). It is not a conventional TTS model, but instead a fully generative text-to-audio model capable of deviating in unexpected ways from any given script. Different from previous approaches, the input text prompt is converted directly to audio without the intermediate use of phonemes. It can therefore generalize to arbitrary instructions beyond speech, such as music lyrics, sound effects or other non-speech sounds.
+
+Below is a list of some known non-speech sounds, but we are finding more every day. Please let us know if you find patterns that work particularly well on [Discord](https://discord.gg/J2B2vsjKuE)!
+
+- `[laughter]`
+- `[laughs]`
+- `[sighs]`
+- `[music]`
+- `[gasps]`
+- `[clears throat]`
+- `—` or `...` for hesitations
+- `♪` for song lyrics
+- CAPITALIZATION for emphasis of a word
+- `[MAN]` and `[WOMAN]` to bias Bark toward male and female speakers, respectively
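+
+As a quick illustration, here is a prompt that sprinkles a few of these tokens in, using the same `generate_audio` API shown above:
+
+```python
+from bark import SAMPLE_RATE, generate_audio, preload_models
+
+preload_models()
+
+# Mix hesitations, non-speech tokens, and song-lyric markers into the prompt.
+text_prompt = """
+    Let me think about that... [clears throat] Actually, you know what? [laughs]
+    ♪ I will just sing my answer instead ♪
+"""
+audio_array = generate_audio(text_prompt)
+```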
+
+### Supported Languages
+
+| Language | Status |
+| --- | --- |
+| English (en) | ✅ |
+| German (de) | ✅ |
+| Spanish (es) | ✅ |
+| French (fr) | ✅ |
+| Hindi (hi) | ✅ |
+| Italian (it) | ✅ |
+| Japanese (ja) | ✅ |
+| Korean (ko) | ✅ |
+| Polish (pl) | ✅ |
+| Portuguese (pt) | ✅ |
+| Russian (ru) | ✅ |
+| Turkish (tr) | ✅ |
+| Chinese, simplified (zh) | ✅ |
+
+Requests for future language support [here](https://github.com/suno-ai/bark/discussions/111) or in the **#forums** channel on [Discord](https://discord.com/invite/J2B2vsjKuE).
+
+## Appreciation
+
+- [nanoGPT](https://github.com/karpathy/nanoGPT) for a dead-simple and blazing fast implementation of GPT-style models
+- [EnCodec](https://github.com/facebookresearch/encodec) for a state-of-the-art implementation of a fantastic audio codec
+- [AudioLM](https://github.com/lucidrains/audiolm-pytorch) for related training and inference code
+- [Vall-E](https://arxiv.org/abs/2301.02111), [AudioLM](https://arxiv.org/abs/2209.03143) and many other ground-breaking papers that enabled the development of Bark
+
+## © License
+
+Bark is licensed under the MIT License.
+
+Please contact us at `bark@suno.ai` to request access to a larger version of the model.
+
+## Community
+
+- [Twitter](https://twitter.com/OnusFM)
+- [Discord](https://discord.gg/J2B2vsjKuE)
+
+## Suno Studio (Early Access)
+
+We're developing a playground for our models, including Bark.
+
+If you are interested, you can sign up for early access [here](https://3os84zs17th.typeform.com/suno-studio).
+
+## FAQ
+
+#### How do I specify where models are downloaded and cached?
+* Bark uses Hugging Face to download and store models. You can find more info [here](https://huggingface.co/docs/huggingface_hub/package_reference/environment_variables#hfhome).
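+
+For example, to redirect the cache before Bark downloads anything (a minimal sketch using the `HF_HOME` environment variable described in those docs; the path is just a placeholder):
+
+```python
+import os
+
+# Placeholder path: point the Hugging Face cache (and thus Bark's model downloads) elsewhere.
+os.environ["HF_HOME"] = "/path/to/your/hf_cache"
+
+from bark import preload_models
+
+preload_models()
+```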
+
+
+#### Bark's generations sometimes differ from my prompts. What's happening?
+* Bark is a GPT-style model. As such, it may take some creative liberties in its generations, resulting in higher-variance model outputs than traditional text-to-speech approaches.
+
+#### What voices are supported by Bark?
+* Bark supports 100+ speaker presets across [supported languages](#supported-languages). You can browse the library of speaker presets [here](https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c). The community also shares presets in [Discord](https://discord.gg/J2B2vsjKuE). Bark also supports generating unique random voices that fit the input text. Bark does not currently support custom voice cloning.
+
+#### Why is the output limited to ~13-14 seconds?
+* Bark is a GPT-style model, and its architecture/context window is optimized to output generations with roughly this length.
+
+#### How much VRAM do I need?
+* The full version of Bark requires around 12GB of memory to hold everything on GPU at the same time. However, even smaller cards down to ~2GB work with some additional settings. Simply add the following code snippet before your generation:
+
+```python
+import os
+os.environ["SUNO_OFFLOAD_CPU"] = "True"
+os.environ["SUNO_USE_SMALL_MODELS"] = "True"
+```
+
+#### My generated audio sounds like a 1980s phone call. What's happening?
+* Bark generates audio from scratch. It is not meant to create only high-fidelity, studio-quality speech. Rather, outputs could be anything from perfect speech to multiple people arguing at a baseball game recorded with bad microphones.
diff --git a/Untitled.ipynb b/Untitled.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..0b630d951b0e11dc5727e70b6b7361c6f0fc8626
--- /dev/null
+++ b/Untitled.ipynb
@@ -0,0 +1,486 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "9c4c1f56",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Ignoring sox: markers 'platform_system == \"Darwin\"' don't match your environment\n",
+ "Ignoring soundfile: markers 'platform_system == \"Windows\"' don't match your environment\n",
+ "Ignoring fairseq: markers 'platform_system == \"Windows\"' don't match your environment\n",
+ "Ignoring fairseq: markers 'platform_system == \"Darwin\"' don't match your environment\n",
+ "Ignoring pywin32: markers 'platform_system == \"Windows\"' don't match your environment\n",
+ "Requirement already satisfied: setuptools in /usr/lib/python3/dist-packages (from -r old_setup_files/requirements-pip.txt (line 1)) (45.2.0)\n",
+ "Collecting transformers\n",
+ " Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 7.4 MB 40 kB/s eta 0:00:01\n",
+ "\u001b[?25hCollecting diffusers\n",
+ " Downloading diffusers-0.19.3-py3-none-any.whl (1.3 MB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 1.3 MB 14 kB/s eta 0:00:01\n",
+ "\u001b[?25hCollecting ffmpeg-downloader\n",
+ " Downloading ffmpeg_downloader-0.2.0-py3-none-any.whl (27 kB)\n",
+ "Collecting ffmpeg\n",
+ " Downloading ffmpeg-1.4.tar.gz (5.1 kB)\n",
+ "Collecting ffmpeg-python\n",
+ " Downloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)\n",
+ "Collecting sox\n",
+ " Downloading sox-1.4.1-py2.py3-none-any.whl (39 kB)\n",
+ "Collecting fairseq\n",
+ " Downloading fairseq-0.12.2-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (11.0 MB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 11.0 MB 1.1 kB/s eta 0:00:01\n",
+ "\u001b[?25hRequirement already satisfied: librosa in /home/jamal/.local/lib/python3.8/site-packages (from -r old_setup_files/requirements-pip.txt (line 13)) (0.8.1)\n",
+ "Collecting boto3\n",
+ " Downloading boto3-1.28.18-py3-none-any.whl (135 kB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 135 kB 3.2 MB/s eta 0:00:01\n",
+ "\u001b[?25hCollecting funcy\n",
+ " Downloading funcy-2.0-py2.py3-none-any.whl (30 kB)\n",
+ "Requirement already satisfied: numpy in /home/jamal/.local/lib/python3.8/site-packages (from -r old_setup_files/requirements-pip.txt (line 16)) (1.20.3)\n",
+ "Requirement already satisfied: scipy in /home/jamal/.local/lib/python3.8/site-packages (from -r old_setup_files/requirements-pip.txt (line 17)) (1.7.3)\n",
+ "Collecting tokenizers\n",
+ " Downloading tokenizers-0.13.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 7.8 MB 5.1 MB/s eta 0:00:01\n",
+ "\u001b[?25hRequirement already satisfied: tqdm in /home/jamal/.local/lib/python3.8/site-packages (from -r old_setup_files/requirements-pip.txt (line 19)) (4.62.3)\n",
+ "Requirement already satisfied: ipython in /home/jamal/.local/lib/python3.8/site-packages (from -r old_setup_files/requirements-pip.txt (line 20)) (8.9.0)\n",
+ "Collecting huggingface_hub>0.15\n",
+ " Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 268 kB 2.5 MB/s eta 0:00:01\n",
+ "\u001b[?25hRequirement already satisfied: rich in /home/jamal/.local/lib/python3.8/site-packages (from -r old_setup_files/requirements-pip.txt (line 22)) (13.3.5)\n",
+ "Collecting pathvalidate\n",
+ " Downloading pathvalidate-3.1.0-py3-none-any.whl (21 kB)\n",
+ "Collecting rich-argparse\n",
+ " Downloading rich_argparse-1.2.0-py3-none-any.whl (16 kB)\n",
+ "Collecting encodec\n",
+ " Downloading encodec-0.1.1.tar.gz (3.7 MB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 3.7 MB 2.8 MB/s eta 0:00:01\n",
+ "\u001b[?25hRequirement already satisfied: chardet in /usr/lib/python3/dist-packages (from -r old_setup_files/requirements-pip.txt (line 26)) (3.0.4)\n",
+ "Collecting pydub\n",
+ " Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)\n",
+ "Requirement already satisfied: requests in /home/jamal/.local/lib/python3.8/site-packages (from -r old_setup_files/requirements-pip.txt (line 28)) (2.28.2)\n",
+ "Collecting audio2numpy\n",
+ " Downloading audio2numpy-0.1.2-py3-none-any.whl (10 kB)\n",
+ "Collecting faiss-cpu\n",
+ " Downloading faiss_cpu-1.7.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 17.6 MB 69 kB/s eta 0:00:01 |โโโโโโโโโโโโ | 6.2 MB 13.9 MB/s eta 0:00:01\n",
+ "\u001b[?25hRequirement already satisfied: joblib in /home/jamal/.local/lib/python3.8/site-packages (from -r old_setup_files/requirements-pip.txt (line 31)) (1.3.1)\n",
+ "Collecting audiolm-pytorch\n",
+ " Downloading audiolm_pytorch-1.2.24-py3-none-any.whl (40 kB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 40 kB 192 kB/s eta 0:00:01\n",
+ "\u001b[?25hCollecting universal-startfile\n",
+ " Downloading universal_startfile-0.2-py3-none-any.whl (3.4 kB)\n",
+ "Collecting gradio>=3.34.0\n",
+ " Downloading gradio-3.39.0-py3-none-any.whl (19.9 MB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 19.9 MB 2.9 MB/s eta 0:00:01\n",
+ "\u001b[?25hRequirement already satisfied: pyyaml>=5.1 in /usr/lib/python3/dist-packages (from transformers->-r old_setup_files/requirements-pip.txt (line 2)) (5.3.1)\n",
+ "Requirement already satisfied: filelock in /home/jamal/.local/lib/python3.8/site-packages (from transformers->-r old_setup_files/requirements-pip.txt (line 2)) (3.12.2)\n",
+ "Collecting regex!=2019.12.17\n",
+ " Downloading regex-2023.6.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (772 kB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 772 kB 3.0 MB/s eta 0:00:01\n",
+ "\u001b[?25hCollecting safetensors>=0.3.1\n",
+ " Downloading safetensors-0.3.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 1.3 MB 3.1 MB/s eta 0:00:01\n",
+ "\u001b[?25hRequirement already satisfied: packaging>=20.0 in /home/jamal/.local/lib/python3.8/site-packages (from transformers->-r old_setup_files/requirements-pip.txt (line 2)) (23.0)\n",
+ "Requirement already satisfied: importlib-metadata in /home/jamal/.local/lib/python3.8/site-packages (from diffusers->-r old_setup_files/requirements-pip.txt (line 3)) (6.0.0)\n",
+ "Requirement already satisfied: Pillow in /home/jamal/.local/lib/python3.8/site-packages (from diffusers->-r old_setup_files/requirements-pip.txt (line 3)) (8.4.0)\n",
+ "Collecting appdirs\n",
+ " Downloading appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)\n",
+ "Requirement already satisfied: future in /usr/lib/python3/dist-packages (from ffmpeg-python->-r old_setup_files/requirements-pip.txt (line 6)) (0.18.2)\n",
+ "Collecting omegaconf<2.1\n",
+ " Downloading omegaconf-2.0.6-py3-none-any.whl (36 kB)\n",
+ "Collecting cython\n",
+ " Downloading Cython-3.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 3.6 MB 3.6 MB/s eta 0:00:01\n",
+ "\u001b[?25hRequirement already satisfied: cffi in /home/jamal/.local/lib/python3.8/site-packages (from fairseq->-r old_setup_files/requirements-pip.txt (line 12)) (1.15.1)\n",
+ "Requirement already satisfied: torch in /home/jamal/.local/lib/python3.8/site-packages (from fairseq->-r old_setup_files/requirements-pip.txt (line 12)) (2.0.1)\n",
+ "Collecting sacrebleu>=1.4.12\n",
+ " Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 118 kB 3.1 MB/s eta 0:00:01\n",
+ "\u001b[?25hCollecting torchaudio>=0.8.0\n",
+ " Downloading torchaudio-2.0.2-cp38-cp38-manylinux1_x86_64.whl (4.4 MB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 4.4 MB 572 kB/s eta 0:00:01 |โโโโโโโโโโโโโโโโโโโโโ | 2.9 MB 2.0 MB/s eta 0:00:01\n",
+ "\u001b[?25hCollecting hydra-core<1.1,>=1.0.7\n",
+ " Downloading hydra_core-1.0.7-py3-none-any.whl (123 kB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 123 kB 1.9 MB/s eta 0:00:01\n",
+ "\u001b[?25hCollecting bitarray\n",
+ " Downloading bitarray-2.8.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (283 kB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 283 kB 2.4 MB/s eta 0:00:01\n",
+ "\u001b[?25hRequirement already satisfied: audioread>=2.0.0 in /home/jamal/.local/lib/python3.8/site-packages (from librosa->-r old_setup_files/requirements-pip.txt (line 13)) (3.0.0)\n",
+ "Requirement already satisfied: soundfile>=0.10.2 in /home/jamal/.local/lib/python3.8/site-packages (from librosa->-r old_setup_files/requirements-pip.txt (line 13)) (0.10.3.post1)\n",
+ "Requirement already satisfied: pooch>=1.0 in /home/jamal/.local/lib/python3.8/site-packages (from librosa->-r old_setup_files/requirements-pip.txt (line 13)) (1.7.0)\n",
+ "Requirement already satisfied: scikit-learn!=0.19.0,>=0.14.0 in /home/jamal/.local/lib/python3.8/site-packages (from librosa->-r old_setup_files/requirements-pip.txt (line 13)) (1.0.2)\n",
+ "Requirement already satisfied: numba>=0.43.0 in /home/jamal/.local/lib/python3.8/site-packages (from librosa->-r old_setup_files/requirements-pip.txt (line 13)) (0.57.1)\n",
+ "Requirement already satisfied: decorator>=3.0.0 in /home/jamal/.local/lib/python3.8/site-packages (from librosa->-r old_setup_files/requirements-pip.txt (line 13)) (5.1.1)\n",
+ "Requirement already satisfied: resampy>=0.2.2 in /home/jamal/.local/lib/python3.8/site-packages (from librosa->-r old_setup_files/requirements-pip.txt (line 13)) (0.4.2)\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Collecting botocore<1.32.0,>=1.31.18\n",
+ " Downloading botocore-1.31.18-py3-none-any.whl (11.1 MB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 11.1 MB 1.2 kB/s eta 0:00:01\n",
+ "\u001b[?25hCollecting s3transfer<0.7.0,>=0.6.0\n",
+ " Downloading s3transfer-0.6.1-py3-none-any.whl (79 kB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 79 kB 964 kB/s eta 0:00:01\n",
+ "\u001b[?25hCollecting jmespath<2.0.0,>=0.7.1\n",
+ " Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)\n",
+ "Requirement already satisfied: backcall in /home/jamal/.local/lib/python3.8/site-packages (from ipython->-r old_setup_files/requirements-pip.txt (line 20)) (0.2.0)\n",
+ "Requirement already satisfied: pygments>=2.4.0 in /home/jamal/.local/lib/python3.8/site-packages (from ipython->-r old_setup_files/requirements-pip.txt (line 20)) (2.14.0)\n",
+ "Requirement already satisfied: jedi>=0.16 in /home/jamal/.local/lib/python3.8/site-packages (from ipython->-r old_setup_files/requirements-pip.txt (line 20)) (0.18.2)\n",
+ "Requirement already satisfied: stack-data in /home/jamal/.local/lib/python3.8/site-packages (from ipython->-r old_setup_files/requirements-pip.txt (line 20)) (0.6.2)\n",
+ "Requirement already satisfied: traitlets>=5 in /home/jamal/.local/lib/python3.8/site-packages (from ipython->-r old_setup_files/requirements-pip.txt (line 20)) (5.9.0)\n",
+ "Requirement already satisfied: pexpect>4.3; sys_platform != \"win32\" in /usr/lib/python3/dist-packages (from ipython->-r old_setup_files/requirements-pip.txt (line 20)) (4.6.0)\n",
+ "Requirement already satisfied: matplotlib-inline in /home/jamal/.local/lib/python3.8/site-packages (from ipython->-r old_setup_files/requirements-pip.txt (line 20)) (0.1.6)\n",
+ "Requirement already satisfied: pickleshare in /home/jamal/.local/lib/python3.8/site-packages (from ipython->-r old_setup_files/requirements-pip.txt (line 20)) (0.7.5)\n",
+ "Requirement already satisfied: prompt-toolkit<3.1.0,>=3.0.30 in /home/jamal/.local/lib/python3.8/site-packages (from ipython->-r old_setup_files/requirements-pip.txt (line 20)) (3.0.36)\n",
+ "Collecting fsspec\n",
+ " Downloading fsspec-2023.6.0-py3-none-any.whl (163 kB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 163 kB 3.0 MB/s eta 0:00:01\n",
+ "\u001b[?25hRequirement already satisfied: typing-extensions>=3.7.4.3 in /home/jamal/.local/lib/python3.8/site-packages (from huggingface_hub>0.15->-r old_setup_files/requirements-pip.txt (line 21)) (4.5.0)\n",
+ "Requirement already satisfied: markdown-it-py<3.0.0,>=2.2.0 in /home/jamal/.local/lib/python3.8/site-packages (from rich->-r old_setup_files/requirements-pip.txt (line 22)) (2.2.0)\n",
+ "Collecting einops\n",
+ " Downloading einops-0.6.1-py3-none-any.whl (42 kB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 42 kB 143 kB/s eta 0:00:01\n",
+ "\u001b[?25hRequirement already satisfied: charset-normalizer<4,>=2 in /home/jamal/.local/lib/python3.8/site-packages (from requests->-r old_setup_files/requirements-pip.txt (line 28)) (3.0.1)\n",
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/lib/python3/dist-packages (from requests->-r old_setup_files/requirements-pip.txt (line 28)) (2.8)\n",
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/lib/python3/dist-packages (from requests->-r old_setup_files/requirements-pip.txt (line 28)) (2019.11.28)\n",
+ "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /home/jamal/.local/lib/python3.8/site-packages (from requests->-r old_setup_files/requirements-pip.txt (line 28)) (1.26.7)\n",
+ "Collecting sentencepiece\n",
+ " Downloading sentencepiece-0.1.99-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 1.3 MB 3.1 MB/s eta 0:00:01\n",
+ "\u001b[?25hCollecting lion-pytorch\n",
+ " Downloading lion_pytorch-0.1.2-py3-none-any.whl (4.4 kB)\n",
+ "Collecting ema-pytorch>=0.2.2\n",
+ " Downloading ema_pytorch-0.2.3-py3-none-any.whl (4.4 kB)\n",
+ "Collecting vector-quantize-pytorch>=1.5.14\n",
+ " Downloading vector_quantize_pytorch-1.6.30-py3-none-any.whl (13 kB)\n",
+ "Collecting local-attention>=1.8.4\n",
+ " Downloading local_attention-1.8.6-py3-none-any.whl (8.1 kB)\n",
+ "Collecting beartype\n",
+ " Downloading beartype-0.15.0-py3-none-any.whl (777 kB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 777 kB 3.2 MB/s eta 0:00:01\n",
+ "\u001b[?25hCollecting accelerate\n",
+ " Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 244 kB 2.4 MB/s eta 0:00:01\n",
+ "\u001b[?25hCollecting aiohttp~=3.0\n",
+ " Downloading aiohttp-3.8.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 1.1 MB 2.7 MB/s eta 0:00:01\n",
+ "\u001b[?25hRequirement already satisfied: pandas<3.0,>=1.0 in /home/jamal/.local/lib/python3.8/site-packages (from gradio>=3.34.0->-r old_setup_files/requirements-pip.txt (line 34)) (1.5.3)\n",
+ "Collecting orjson~=3.0\n",
+ " Downloading orjson-3.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (138 kB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 138 kB 2.8 MB/s eta 0:00:01\n",
+ "\u001b[?25hCollecting python-multipart\n",
+ " Downloading python_multipart-0.0.6-py3-none-any.whl (45 kB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 45 kB 486 kB/s eta 0:00:01\n",
+ "\u001b[?25hCollecting fastapi\n",
+ " Downloading fastapi-0.100.1-py3-none-any.whl (65 kB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 65 kB 530 kB/s eta 0:00:01\n",
+ "\u001b[?25hCollecting uvicorn>=0.14.0\n",
+ " Downloading uvicorn-0.23.2-py3-none-any.whl (59 kB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 59 kB 1.0 MB/s eta 0:00:01\n",
+ "\u001b[?25hRequirement already satisfied: jinja2<4.0 in /home/jamal/.local/lib/python3.8/site-packages (from gradio>=3.34.0->-r old_setup_files/requirements-pip.txt (line 34)) (3.1.2)\n",
+ "Collecting aiofiles<24.0,>=22.0\n",
+ " Downloading aiofiles-23.1.0-py3-none-any.whl (14 kB)\n",
+ "Collecting altair<6.0,>=4.2.0\n",
+ " Downloading altair-5.0.1-py3-none-any.whl (471 kB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 471 kB 3.0 MB/s eta 0:00:01\n",
+ "\u001b[?25hCollecting mdit-py-plugins<=0.3.3\n",
+ " Downloading mdit_py_plugins-0.3.3-py3-none-any.whl (50 kB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 50 kB 777 kB/s eta 0:00:01\n",
+ "\u001b[?25hCollecting ffmpy\n",
+ " Downloading ffmpy-0.3.1.tar.gz (5.5 kB)\n",
+ "Collecting pydantic!=1.8,!=1.8.1,!=2.0.0,!=2.0.1,<3.0.0,>=1.7.4\n",
+ " Downloading pydantic-2.1.1-py3-none-any.whl (370 kB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 370 kB 3.2 MB/s eta 0:00:01\n",
+ "\u001b[?25hCollecting httpx\n",
+ " Downloading httpx-0.24.1-py3-none-any.whl (75 kB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 75 kB 593 kB/s eta 0:00:01\n",
+ "\u001b[?25hRequirement already satisfied: websockets<12.0,>=10.0 in /home/jamal/.local/lib/python3.8/site-packages (from gradio>=3.34.0->-r old_setup_files/requirements-pip.txt (line 34)) (11.0.3)\n",
+ "Requirement already satisfied: markupsafe~=2.0 in /home/jamal/.local/lib/python3.8/site-packages (from gradio>=3.34.0->-r old_setup_files/requirements-pip.txt (line 34)) (2.1.2)\n",
+ "Collecting gradio-client>=0.3.0\n",
+ " Downloading gradio_client-0.3.0-py3-none-any.whl (294 kB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 294 kB 2.2 MB/s eta 0:00:01\n",
+ "\u001b[?25hCollecting semantic-version~=2.0\n",
+ " Downloading semantic_version-2.10.0-py2.py3-none-any.whl (15 kB)\n",
+ "Requirement already satisfied: matplotlib~=3.0 in /home/jamal/.local/lib/python3.8/site-packages (from gradio>=3.34.0->-r old_setup_files/requirements-pip.txt (line 34)) (3.5.1)\n",
+ "Requirement already satisfied: zipp>=0.5 in /usr/lib/python3/dist-packages (from importlib-metadata->diffusers->-r old_setup_files/requirements-pip.txt (line 3)) (1.0.0)\n",
+ "Requirement already satisfied: pycparser in /home/jamal/.local/lib/python3.8/site-packages (from cffi->fairseq->-r old_setup_files/requirements-pip.txt (line 12)) (2.21)\n",
+ "Requirement already satisfied: sympy in /home/jamal/.local/lib/python3.8/site-packages (from torch->fairseq->-r old_setup_files/requirements-pip.txt (line 12)) (1.12)\n",
+ "Requirement already satisfied: nvidia-curand-cu11==10.2.10.91; platform_system == \"Linux\" and platform_machine == \"x86_64\" in /home/jamal/.local/lib/python3.8/site-packages (from torch->fairseq->-r old_setup_files/requirements-pip.txt (line 12)) (10.2.10.91)\n",
+ "Requirement already satisfied: nvidia-cusparse-cu11==11.7.4.91; platform_system == \"Linux\" and platform_machine == \"x86_64\" in /home/jamal/.local/lib/python3.8/site-packages (from torch->fairseq->-r old_setup_files/requirements-pip.txt (line 12)) (11.7.4.91)\n",
+ "Requirement already satisfied: nvidia-cuda-runtime-cu11==11.7.99; platform_system == \"Linux\" and platform_machine == \"x86_64\" in /home/jamal/.local/lib/python3.8/site-packages (from torch->fairseq->-r old_setup_files/requirements-pip.txt (line 12)) (11.7.99)\n",
+ "Requirement already satisfied: nvidia-nvtx-cu11==11.7.91; platform_system == \"Linux\" and platform_machine == \"x86_64\" in /home/jamal/.local/lib/python3.8/site-packages (from torch->fairseq->-r old_setup_files/requirements-pip.txt (line 12)) (11.7.91)\n",
+ "Requirement already satisfied: networkx in /home/jamal/.local/lib/python3.8/site-packages (from torch->fairseq->-r old_setup_files/requirements-pip.txt (line 12)) (3.1)\n",
+ "Requirement already satisfied: nvidia-cublas-cu11==11.10.3.66; platform_system == \"Linux\" and platform_machine == \"x86_64\" in /home/jamal/.local/lib/python3.8/site-packages (from torch->fairseq->-r old_setup_files/requirements-pip.txt (line 12)) (11.10.3.66)\n",
+ "Requirement already satisfied: nvidia-cusolver-cu11==11.4.0.1; platform_system == \"Linux\" and platform_machine == \"x86_64\" in /home/jamal/.local/lib/python3.8/site-packages (from torch->fairseq->-r old_setup_files/requirements-pip.txt (line 12)) (11.4.0.1)\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Requirement already satisfied: triton==2.0.0; platform_system == \"Linux\" and platform_machine == \"x86_64\" in /home/jamal/.local/lib/python3.8/site-packages (from torch->fairseq->-r old_setup_files/requirements-pip.txt (line 12)) (2.0.0)\n",
+ "Requirement already satisfied: nvidia-cufft-cu11==10.9.0.58; platform_system == \"Linux\" and platform_machine == \"x86_64\" in /home/jamal/.local/lib/python3.8/site-packages (from torch->fairseq->-r old_setup_files/requirements-pip.txt (line 12)) (10.9.0.58)\n",
+ "Requirement already satisfied: nvidia-nccl-cu11==2.14.3; platform_system == \"Linux\" and platform_machine == \"x86_64\" in /home/jamal/.local/lib/python3.8/site-packages (from torch->fairseq->-r old_setup_files/requirements-pip.txt (line 12)) (2.14.3)\n",
+ "Requirement already satisfied: nvidia-cuda-cupti-cu11==11.7.101; platform_system == \"Linux\" and platform_machine == \"x86_64\" in /home/jamal/.local/lib/python3.8/site-packages (from torch->fairseq->-r old_setup_files/requirements-pip.txt (line 12)) (11.7.101)\n",
+ "Requirement already satisfied: nvidia-cudnn-cu11==8.5.0.96; platform_system == \"Linux\" and platform_machine == \"x86_64\" in /home/jamal/.local/lib/python3.8/site-packages (from torch->fairseq->-r old_setup_files/requirements-pip.txt (line 12)) (8.5.0.96)\n",
+ "Requirement already satisfied: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == \"Linux\" and platform_machine == \"x86_64\" in /home/jamal/.local/lib/python3.8/site-packages (from torch->fairseq->-r old_setup_files/requirements-pip.txt (line 12)) (11.7.99)\n",
+ "Requirement already satisfied: colorama in /usr/lib/python3/dist-packages (from sacrebleu>=1.4.12->fairseq->-r old_setup_files/requirements-pip.txt (line 12)) (0.4.3)\n",
+ "Collecting lxml\n",
+ " Downloading lxml-4.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (7.1 MB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 7.1 MB 2.8 MB/s eta 0:00:01\n",
+ "\u001b[?25hCollecting tabulate>=0.8.9\n",
+ " Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)\n",
+ "Collecting portalocker\n",
+ " Downloading portalocker-2.7.0-py2.py3-none-any.whl (15 kB)\n",
+ "Requirement already satisfied: importlib-resources; python_version < \"3.9\" in /home/jamal/.local/lib/python3.8/site-packages (from hydra-core<1.1,>=1.0.7->fairseq->-r old_setup_files/requirements-pip.txt (line 12)) (5.10.2)\n",
+ "Collecting antlr4-python3-runtime==4.8\n",
+ " Downloading antlr4-python3-runtime-4.8.tar.gz (112 kB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 112 kB 3.2 MB/s eta 0:00:01\n",
+ "\u001b[?25hRequirement already satisfied: platformdirs>=2.5.0 in /home/jamal/.local/lib/python3.8/site-packages (from pooch>=1.0->librosa->-r old_setup_files/requirements-pip.txt (line 13)) (2.6.2)\n",
+ "Requirement already satisfied: threadpoolctl>=2.0.0 in /home/jamal/.local/lib/python3.8/site-packages (from scikit-learn!=0.19.0,>=0.14.0->librosa->-r old_setup_files/requirements-pip.txt (line 13)) (3.2.0)\n",
+ "Requirement already satisfied: llvmlite<0.41,>=0.40.0dev0 in /home/jamal/.local/lib/python3.8/site-packages (from numba>=0.43.0->librosa->-r old_setup_files/requirements-pip.txt (line 13)) (0.40.1)\n",
+ "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/jamal/.local/lib/python3.8/site-packages (from botocore<1.32.0,>=1.31.18->boto3->-r old_setup_files/requirements-pip.txt (line 14)) (2.8.2)\n",
+ "Requirement already satisfied: parso<0.9.0,>=0.8.0 in /home/jamal/.local/lib/python3.8/site-packages (from jedi>=0.16->ipython->-r old_setup_files/requirements-pip.txt (line 20)) (0.8.3)\n",
+ "Requirement already satisfied: executing>=1.2.0 in /home/jamal/.local/lib/python3.8/site-packages (from stack-data->ipython->-r old_setup_files/requirements-pip.txt (line 20)) (1.2.0)\n",
+ "Requirement already satisfied: asttokens>=2.1.0 in /home/jamal/.local/lib/python3.8/site-packages (from stack-data->ipython->-r old_setup_files/requirements-pip.txt (line 20)) (2.2.1)\n",
+ "Requirement already satisfied: pure-eval in /home/jamal/.local/lib/python3.8/site-packages (from stack-data->ipython->-r old_setup_files/requirements-pip.txt (line 20)) (0.2.2)\n",
+ "Requirement already satisfied: wcwidth in /home/jamal/.local/lib/python3.8/site-packages (from prompt-toolkit<3.1.0,>=3.0.30->ipython->-r old_setup_files/requirements-pip.txt (line 20)) (0.2.6)\n",
+ "Requirement already satisfied: mdurl~=0.1 in /home/jamal/.local/lib/python3.8/site-packages (from markdown-it-py<3.0.0,>=2.2.0->rich->-r old_setup_files/requirements-pip.txt (line 22)) (0.1.2)\n",
+ "Requirement already satisfied: psutil in /usr/lib/python3/dist-packages (from accelerate->audiolm-pytorch->-r old_setup_files/requirements-pip.txt (line 32)) (5.5.1)\n",
+ "Collecting frozenlist>=1.1.1\n",
+ " Downloading frozenlist-1.4.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (220 kB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 220 kB 3.1 MB/s eta 0:00:01\n",
+ "\u001b[?25hRequirement already satisfied: attrs>=17.3.0 in /usr/lib/python3/dist-packages (from aiohttp~=3.0->gradio>=3.34.0->-r old_setup_files/requirements-pip.txt (line 34)) (19.3.0)\n",
+ "Collecting aiosignal>=1.1.2\n",
+ " Downloading aiosignal-1.3.1-py3-none-any.whl (7.6 kB)\n",
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /home/jamal/.local/lib/python3.8/site-packages (from aiohttp~=3.0->gradio>=3.34.0->-r old_setup_files/requirements-pip.txt (line 34)) (6.0.4)\n",
+ "Collecting async-timeout<5.0,>=4.0.0a3\n",
+ " Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)\n",
+ "Collecting yarl<2.0,>=1.0\n",
+ " Downloading yarl-1.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (266 kB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 266 kB 2.7 MB/s eta 0:00:01\n",
+ "\u001b[?25hRequirement already satisfied: pytz>=2020.1 in /home/jamal/.local/lib/python3.8/site-packages (from pandas<3.0,>=1.0->gradio>=3.34.0->-r old_setup_files/requirements-pip.txt (line 34)) (2022.7.1)\n",
+ "Collecting starlette<0.28.0,>=0.27.0\n",
+ " Downloading starlette-0.27.0-py3-none-any.whl (66 kB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 66 kB 574 kB/s eta 0:00:01\n",
+ "\u001b[?25hCollecting h11>=0.8\n",
+ " Downloading h11-0.14.0-py3-none-any.whl (58 kB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 58 kB 719 kB/s eta 0:00:01\n",
+ "\u001b[?25hRequirement already satisfied: click>=7.0 in /usr/lib/python3/dist-packages (from uvicorn>=0.14.0->gradio>=3.34.0->-r old_setup_files/requirements-pip.txt (line 34)) (7.0)\n",
+ "Requirement already satisfied: jsonschema>=3.0 in /home/jamal/.local/lib/python3.8/site-packages (from altair<6.0,>=4.2.0->gradio>=3.34.0->-r old_setup_files/requirements-pip.txt (line 34)) (4.17.3)\n",
+ "Collecting toolz\n",
+ " Downloading toolz-0.12.0-py3-none-any.whl (55 kB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 55 kB 606 kB/s eta 0:00:01\n",
+ "\u001b[?25hCollecting pydantic-core==2.4.0\n",
+ " Downloading pydantic_core-2.4.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.9 MB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 1.9 MB 2.6 MB/s eta 0:00:01\n",
+ "\u001b[?25hCollecting annotated-types>=0.4.0\n",
+ " Downloading annotated_types-0.5.0-py3-none-any.whl (11 kB)\n",
+ "Requirement already satisfied: sniffio in /home/jamal/.local/lib/python3.8/site-packages (from httpx->gradio>=3.34.0->-r old_setup_files/requirements-pip.txt (line 34)) (1.3.0)\n",
+ "Collecting httpcore<0.18.0,>=0.15.0\n",
+ " Downloading httpcore-0.17.3-py3-none-any.whl (74 kB)\n",
+ "\u001b[K |โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ| 74 kB 471 kB/s eta 0:00:01\n",
+ "\u001b[?25hRequirement already satisfied: pyparsing>=2.2.1 in /usr/lib/python3/dist-packages (from matplotlib~=3.0->gradio>=3.34.0->-r old_setup_files/requirements-pip.txt (line 34)) (2.4.6)\n",
+ "Requirement already satisfied: cycler>=0.10 in /home/jamal/.local/lib/python3.8/site-packages (from matplotlib~=3.0->gradio>=3.34.0->-r old_setup_files/requirements-pip.txt (line 34)) (0.11.0)\n",
+ "Requirement already satisfied: fonttools>=4.22.0 in /home/jamal/.local/lib/python3.8/site-packages (from matplotlib~=3.0->gradio>=3.34.0->-r old_setup_files/requirements-pip.txt (line 34)) (4.38.0)\n",
+ "Requirement already satisfied: kiwisolver>=1.0.1 in /home/jamal/.local/lib/python3.8/site-packages (from matplotlib~=3.0->gradio>=3.34.0->-r old_setup_files/requirements-pip.txt (line 34)) (1.4.4)\n",
+ "Requirement already satisfied: mpmath>=0.19 in /home/jamal/.local/lib/python3.8/site-packages (from sympy->torch->fairseq->-r old_setup_files/requirements-pip.txt (line 12)) (1.3.0)\n",
+ "Requirement already satisfied: wheel in /usr/lib/python3/dist-packages (from nvidia-curand-cu11==10.2.10.91; platform_system == \"Linux\" and platform_machine == \"x86_64\"->torch->fairseq->-r old_setup_files/requirements-pip.txt (line 12)) (0.34.2)\n",
+ "Requirement already satisfied: cmake in /home/jamal/.local/lib/python3.8/site-packages (from triton==2.0.0; platform_system == \"Linux\" and platform_machine == \"x86_64\"->torch->fairseq->-r old_setup_files/requirements-pip.txt (line 12)) (3.27.0)\n",
+ "Requirement already satisfied: lit in /home/jamal/.local/lib/python3.8/site-packages (from triton==2.0.0; platform_system == \"Linux\" and platform_machine == \"x86_64\"->torch->fairseq->-r old_setup_files/requirements-pip.txt (line 12)) (16.0.6)\n",
+ "Requirement already satisfied: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.32.0,>=1.31.18->boto3->-r old_setup_files/requirements-pip.txt (line 14)) (1.14.0)\n",
+ "Requirement already satisfied: anyio<5,>=3.4.0 in /home/jamal/.local/lib/python3.8/site-packages (from starlette<0.28.0,>=0.27.0->fastapi->gradio>=3.34.0->-r old_setup_files/requirements-pip.txt (line 34)) (3.6.2)\n",
+ "Requirement already satisfied: pkgutil-resolve-name>=1.3.10; python_version < \"3.9\" in /home/jamal/.local/lib/python3.8/site-packages (from jsonschema>=3.0->altair<6.0,>=4.2.0->gradio>=3.34.0->-r old_setup_files/requirements-pip.txt (line 34)) (1.3.10)\n",
+ "Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /usr/lib/python3/dist-packages (from jsonschema>=3.0->altair<6.0,>=4.2.0->gradio>=3.34.0->-r old_setup_files/requirements-pip.txt (line 34)) (0.15.5)\n",
+ "Building wheels for collected packages: ffmpeg, encodec, ffmpy, antlr4-python3-runtime\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " Building wheel for ffmpeg (setup.py) ... \u001b[?25ldone\n",
+ "\u001b[?25h Created wheel for ffmpeg: filename=ffmpeg-1.4-py3-none-any.whl size=6083 sha256=19514e448b6bdeb0af7ec0711ab1153c6ed7a4c2c94d70be977b7c7fe20bb378\n",
+ " Stored in directory: /home/jamal/.cache/pip/wheels/30/33/46/5ab7eca55b9490dddbf3441c68a29535996270ef1ce8b9b6d7\n",
+ " Building wheel for encodec (setup.py) ... \u001b[?25ldone\n",
+ "\u001b[?25h Created wheel for encodec: filename=encodec-0.1.1-py3-none-any.whl size=45768 sha256=5be14b57922136d74f505c3f00b442410ed01e1df97724788ca60a81ae4c88c3\n",
+ " Stored in directory: /home/jamal/.cache/pip/wheels/83/ca/c5/2770ecff40c79307803c30f8d4c5dcb533722f5f7c049ee9db\n",
+ " Building wheel for ffmpy (setup.py) ... \u001b[?25ldone\n",
+ "\u001b[?25h Created wheel for ffmpy: filename=ffmpy-0.3.1-py3-none-any.whl size=5580 sha256=30a6f174f3c528a86e4132068eecaac56ddfdca93b4128ed8631c5f17707f7d7\n",
+ " Stored in directory: /home/jamal/.cache/pip/wheels/75/a3/1a/2f3f90b9a4eb0408109ae1b5bae01efbdf8ab4ef98797433e4\n",
+ " Building wheel for antlr4-python3-runtime (setup.py) ... \u001b[?25ldone\n",
+ "\u001b[?25h Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.8-py3-none-any.whl size=141230 sha256=24d2cd414c2d61b3f9bc6b30748b85ab68aa6d00c4ed7145823ba466469b0796\n",
+ " Stored in directory: /home/jamal/.cache/pip/wheels/c8/d0/ab/d43c02eaddc5b9004db86950802442ad9a26f279c619e28da0\n",
+ "Successfully built ffmpeg encodec ffmpy antlr4-python3-runtime\n",
+ "\u001b[31mERROR: pydantic-core 2.4.0 has requirement typing-extensions!=4.7.0,>=4.6.0, but you'll have typing-extensions 4.5.0 which is incompatible.\u001b[0m\n",
+ "\u001b[31mERROR: pydantic 2.1.1 has requirement typing-extensions>=4.6.1, but you'll have typing-extensions 4.5.0 which is incompatible.\u001b[0m\n",
+ "Installing collected packages: regex, fsspec, huggingface-hub, safetensors, tokenizers, transformers, diffusers, appdirs, ffmpeg-downloader, ffmpeg, ffmpeg-python, sox, omegaconf, cython, lxml, tabulate, portalocker, sacrebleu, torchaudio, antlr4-python3-runtime, hydra-core, bitarray, fairseq, jmespath, botocore, s3transfer, boto3, funcy, pathvalidate, rich-argparse, einops, encodec, pydub, audio2numpy, faiss-cpu, sentencepiece, lion-pytorch, ema-pytorch, vector-quantize-pytorch, local-attention, beartype, accelerate, audiolm-pytorch, universal-startfile, frozenlist, aiosignal, async-timeout, yarl, aiohttp, orjson, python-multipart, pydantic-core, annotated-types, pydantic, starlette, fastapi, h11, uvicorn, aiofiles, toolz, altair, mdit-py-plugins, ffmpy, httpcore, httpx, gradio-client, semantic-version, gradio\n",
+ "Successfully installed accelerate-0.21.0 aiofiles-23.1.0 aiohttp-3.8.5 aiosignal-1.3.1 altair-5.0.1 annotated-types-0.5.0 antlr4-python3-runtime-4.8 appdirs-1.4.4 async-timeout-4.0.2 audio2numpy-0.1.2 audiolm-pytorch-1.2.24 beartype-0.15.0 bitarray-2.8.0 boto3-1.28.18 botocore-1.31.18 cython-3.0.0 diffusers-0.19.3 einops-0.6.1 ema-pytorch-0.2.3 encodec-0.1.1 fairseq-0.12.2 faiss-cpu-1.7.4 fastapi-0.100.1 ffmpeg-1.4 ffmpeg-downloader-0.2.0 ffmpeg-python-0.2.0 ffmpy-0.3.1 frozenlist-1.4.0 fsspec-2023.6.0 funcy-2.0 gradio-3.39.0 gradio-client-0.3.0 h11-0.14.0 httpcore-0.17.3 httpx-0.24.1 huggingface-hub-0.16.4 hydra-core-1.0.7 jmespath-1.0.1 lion-pytorch-0.1.2 local-attention-1.8.6 lxml-4.9.3 mdit-py-plugins-0.3.3 omegaconf-2.0.6 orjson-3.9.2 pathvalidate-3.1.0 portalocker-2.7.0 pydantic-2.1.1 pydantic-core-2.4.0 pydub-0.25.1 python-multipart-0.0.6 regex-2023.6.3 rich-argparse-1.2.0 s3transfer-0.6.1 sacrebleu-2.3.1 safetensors-0.3.1 semantic-version-2.10.0 sentencepiece-0.1.99 sox-1.4.1 starlette-0.27.0 tabulate-0.9.0 tokenizers-0.13.3 toolz-0.12.0 torchaudio-2.0.2 transformers-4.31.0 universal-startfile-0.2 uvicorn-0.23.2 vector-quantize-pytorch-1.6.30 yarl-1.9.2\n",
+ "Requirement already satisfied: encodec in /home/jamal/.local/lib/python3.8/site-packages (0.1.1)\n",
+ "Requirement already satisfied: rich-argparse in /home/jamal/.local/lib/python3.8/site-packages (1.2.0)\n",
+ "Requirement already satisfied: librosa in /home/jamal/.local/lib/python3.8/site-packages (0.8.1)\n",
+ "Requirement already satisfied: pydub in /home/jamal/.local/lib/python3.8/site-packages (0.25.1)\n",
+ "Collecting devtools\n",
+ " Downloading devtools-0.11.0-py3-none-any.whl (19 kB)\n",
+ "Requirement already satisfied: einops in /home/jamal/.local/lib/python3.8/site-packages (from encodec) (0.6.1)\n",
+ "Requirement already satisfied: numpy in /home/jamal/.local/lib/python3.8/site-packages (from encodec) (1.20.3)\n",
+ "Requirement already satisfied: torch in /home/jamal/.local/lib/python3.8/site-packages (from encodec) (2.0.1)\n",
+ "Requirement already satisfied: torchaudio in /home/jamal/.local/lib/python3.8/site-packages (from encodec) (2.0.2)\n",
+ "Requirement already satisfied: rich>=11.0.0 in /home/jamal/.local/lib/python3.8/site-packages (from rich-argparse) (13.3.5)\n",
+ "Requirement already satisfied: numba>=0.43.0 in /home/jamal/.local/lib/python3.8/site-packages (from librosa) (0.57.1)\n",
+ "Requirement already satisfied: joblib>=0.14 in /home/jamal/.local/lib/python3.8/site-packages (from librosa) (1.3.1)\n",
+ "Requirement already satisfied: soundfile>=0.10.2 in /home/jamal/.local/lib/python3.8/site-packages (from librosa) (0.10.3.post1)\n",
+ "Requirement already satisfied: packaging>=20.0 in /home/jamal/.local/lib/python3.8/site-packages (from librosa) (23.0)\n",
+ "Requirement already satisfied: resampy>=0.2.2 in /home/jamal/.local/lib/python3.8/site-packages (from librosa) (0.4.2)\n",
+ "Requirement already satisfied: pooch>=1.0 in /home/jamal/.local/lib/python3.8/site-packages (from librosa) (1.7.0)\n",
+ "Requirement already satisfied: scipy>=1.0.0 in /home/jamal/.local/lib/python3.8/site-packages (from librosa) (1.7.3)\n",
+ "Requirement already satisfied: decorator>=3.0.0 in /home/jamal/.local/lib/python3.8/site-packages (from librosa) (5.1.1)\n",
+ "Requirement already satisfied: scikit-learn!=0.19.0,>=0.14.0 in /home/jamal/.local/lib/python3.8/site-packages (from librosa) (1.0.2)\n",
+ "Requirement already satisfied: audioread>=2.0.0 in /home/jamal/.local/lib/python3.8/site-packages (from librosa) (3.0.0)\n",
+ "Requirement already satisfied: executing>=1.1.1 in /home/jamal/.local/lib/python3.8/site-packages (from devtools) (1.2.0)\n",
+ "Requirement already satisfied: asttokens<3.0.0,>=2.0.0 in /home/jamal/.local/lib/python3.8/site-packages (from devtools) (2.2.1)\n",
+ "Requirement already satisfied: nvidia-cuda-nvrtc-cu11==11.7.99; platform_system == \"Linux\" and platform_machine == \"x86_64\" in /home/jamal/.local/lib/python3.8/site-packages (from torch->encodec) (11.7.99)\n",
+ "Requirement already satisfied: nvidia-nccl-cu11==2.14.3; platform_system == \"Linux\" and platform_machine == \"x86_64\" in /home/jamal/.local/lib/python3.8/site-packages (from torch->encodec) (2.14.3)\n",
+ "Requirement already satisfied: nvidia-cuda-runtime-cu11==11.7.99; platform_system == \"Linux\" and platform_machine == \"x86_64\" in /home/jamal/.local/lib/python3.8/site-packages (from torch->encodec) (11.7.99)\n",
+ "Requirement already satisfied: nvidia-nvtx-cu11==11.7.91; platform_system == \"Linux\" and platform_machine == \"x86_64\" in /home/jamal/.local/lib/python3.8/site-packages (from torch->encodec) (11.7.91)\n",
+ "Requirement already satisfied: nvidia-cublas-cu11==11.10.3.66; platform_system == \"Linux\" and platform_machine == \"x86_64\" in /home/jamal/.local/lib/python3.8/site-packages (from torch->encodec) (11.10.3.66)\n",
+ "Requirement already satisfied: nvidia-curand-cu11==10.2.10.91; platform_system == \"Linux\" and platform_machine == \"x86_64\" in /home/jamal/.local/lib/python3.8/site-packages (from torch->encodec) (10.2.10.91)\n",
+ "Requirement already satisfied: nvidia-cufft-cu11==10.9.0.58; platform_system == \"Linux\" and platform_machine == \"x86_64\" in /home/jamal/.local/lib/python3.8/site-packages (from torch->encodec) (10.9.0.58)\n",
+ "Requirement already satisfied: nvidia-cusolver-cu11==11.4.0.1; platform_system == \"Linux\" and platform_machine == \"x86_64\" in /home/jamal/.local/lib/python3.8/site-packages (from torch->encodec) (11.4.0.1)\n",
+ "Requirement already satisfied: nvidia-cudnn-cu11==8.5.0.96; platform_system == \"Linux\" and platform_machine == \"x86_64\" in /home/jamal/.local/lib/python3.8/site-packages (from torch->encodec) (8.5.0.96)\n",
+ "Requirement already satisfied: networkx in /home/jamal/.local/lib/python3.8/site-packages (from torch->encodec) (3.1)\n",
+ "Requirement already satisfied: nvidia-cuda-cupti-cu11==11.7.101; platform_system == \"Linux\" and platform_machine == \"x86_64\" in /home/jamal/.local/lib/python3.8/site-packages (from torch->encodec) (11.7.101)\n",
+ "Requirement already satisfied: sympy in /home/jamal/.local/lib/python3.8/site-packages (from torch->encodec) (1.12)\n",
+ "Requirement already satisfied: filelock in /home/jamal/.local/lib/python3.8/site-packages (from torch->encodec) (3.12.2)\n",
+ "Requirement already satisfied: nvidia-cusparse-cu11==11.7.4.91; platform_system == \"Linux\" and platform_machine == \"x86_64\" in /home/jamal/.local/lib/python3.8/site-packages (from torch->encodec) (11.7.4.91)\n",
+ "Requirement already satisfied: triton==2.0.0; platform_system == \"Linux\" and platform_machine == \"x86_64\" in /home/jamal/.local/lib/python3.8/site-packages (from torch->encodec) (2.0.0)\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Requirement already satisfied: jinja2 in /home/jamal/.local/lib/python3.8/site-packages (from torch->encodec) (3.1.2)\n",
+ "Requirement already satisfied: typing-extensions in /home/jamal/.local/lib/python3.8/site-packages (from torch->encodec) (4.5.0)\n",
+ "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /home/jamal/.local/lib/python3.8/site-packages (from rich>=11.0.0->rich-argparse) (2.14.0)\n",
+ "Requirement already satisfied: markdown-it-py<3.0.0,>=2.2.0 in /home/jamal/.local/lib/python3.8/site-packages (from rich>=11.0.0->rich-argparse) (2.2.0)\n",
+ "Requirement already satisfied: importlib-metadata; python_version < \"3.9\" in /home/jamal/.local/lib/python3.8/site-packages (from numba>=0.43.0->librosa) (6.0.0)\n",
+ "Requirement already satisfied: llvmlite<0.41,>=0.40.0dev0 in /home/jamal/.local/lib/python3.8/site-packages (from numba>=0.43.0->librosa) (0.40.1)\n",
+ "Requirement already satisfied: cffi>=1.0 in /home/jamal/.local/lib/python3.8/site-packages (from soundfile>=0.10.2->librosa) (1.15.1)\n",
+ "Requirement already satisfied: platformdirs>=2.5.0 in /home/jamal/.local/lib/python3.8/site-packages (from pooch>=1.0->librosa) (2.6.2)\n",
+ "Requirement already satisfied: requests>=2.19.0 in /home/jamal/.local/lib/python3.8/site-packages (from pooch>=1.0->librosa) (2.28.2)\n",
+ "Requirement already satisfied: threadpoolctl>=2.0.0 in /home/jamal/.local/lib/python3.8/site-packages (from scikit-learn!=0.19.0,>=0.14.0->librosa) (3.2.0)\n",
+ "Requirement already satisfied: six in /usr/lib/python3/dist-packages (from asttokens<3.0.0,>=2.0.0->devtools) (1.14.0)\n",
+ "Requirement already satisfied: setuptools in /usr/lib/python3/dist-packages (from nvidia-cuda-runtime-cu11==11.7.99; platform_system == \"Linux\" and platform_machine == \"x86_64\"->torch->encodec) (45.2.0)\n",
+ "Requirement already satisfied: wheel in /usr/lib/python3/dist-packages (from nvidia-cuda-runtime-cu11==11.7.99; platform_system == \"Linux\" and platform_machine == \"x86_64\"->torch->encodec) (0.34.2)\n",
+ "Requirement already satisfied: mpmath>=0.19 in /home/jamal/.local/lib/python3.8/site-packages (from sympy->torch->encodec) (1.3.0)\n",
+ "Requirement already satisfied: lit in /home/jamal/.local/lib/python3.8/site-packages (from triton==2.0.0; platform_system == \"Linux\" and platform_machine == \"x86_64\"->torch->encodec) (16.0.6)\n",
+ "Requirement already satisfied: cmake in /home/jamal/.local/lib/python3.8/site-packages (from triton==2.0.0; platform_system == \"Linux\" and platform_machine == \"x86_64\"->torch->encodec) (3.27.0)\n",
+ "Requirement already satisfied: MarkupSafe>=2.0 in /home/jamal/.local/lib/python3.8/site-packages (from jinja2->torch->encodec) (2.1.2)\n",
+ "Requirement already satisfied: mdurl~=0.1 in /home/jamal/.local/lib/python3.8/site-packages (from markdown-it-py<3.0.0,>=2.2.0->rich>=11.0.0->rich-argparse) (0.1.2)\n",
+ "Requirement already satisfied: zipp>=0.5 in /usr/lib/python3/dist-packages (from importlib-metadata; python_version < \"3.9\"->numba>=0.43.0->librosa) (1.0.0)\n",
+ "Requirement already satisfied: pycparser in /home/jamal/.local/lib/python3.8/site-packages (from cffi>=1.0->soundfile>=0.10.2->librosa) (2.21)\n",
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/lib/python3/dist-packages (from requests>=2.19.0->pooch>=1.0->librosa) (2019.11.28)\n",
+ "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /home/jamal/.local/lib/python3.8/site-packages (from requests>=2.19.0->pooch>=1.0->librosa) (1.26.7)\n",
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /home/jamal/.local/lib/python3.8/site-packages (from requests>=2.19.0->pooch>=1.0->librosa) (3.0.1)\n",
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/lib/python3/dist-packages (from requests>=2.19.0->pooch>=1.0->librosa) (2.8)\n",
+ "Installing collected packages: devtools\n",
+ "Successfully installed devtools-0.11.0\n",
+ "fish: Unknown command: python\n",
+ "fish: \n",
+ "python bark_webui.py --share\n",
+ "^\n"
+ ]
+ }
+ ],
+ "source": [
+ "!pip install -r old_setup_files/requirements-pip.txt\n",
+ "!pip install encodec rich-argparse librosa pydub devtools"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "e6884c4a",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Collecting typing-extensions\n",
+ " Downloading typing_extensions-4.7.1-py3-none-any.whl (33 kB)\n",
+ "\u001b[31mERROR: tensorflow 2.13.0 has requirement numpy<=1.24.3,>=1.22, but you'll have numpy 1.20.3 which is incompatible.\u001b[0m\n",
+ "\u001b[31mERROR: tensorflow 2.13.0 has requirement typing-extensions<4.6.0,>=3.6.6, but you'll have typing-extensions 4.7.1 which is incompatible.\u001b[0m\n",
+ "Installing collected packages: typing-extensions\n",
+ " Attempting uninstall: typing-extensions\n",
+ " Found existing installation: typing-extensions 4.5.0\n",
+ " Uninstalling typing-extensions-4.5.0:\n",
+ " Successfully uninstalled typing-extensions-4.5.0\n",
+ "Successfully installed typing-extensions-4.7.1\n",
+ "Note: you may need to restart the kernel to use updated packages.\n"
+ ]
+ }
+ ],
+ "source": [
+ "pip install typing-extensions --upgrade"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "3a2e8312",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Traceback (most recent call last):\r\n",
+ " File \"bark_webui.py\", line 14, in \r\n",
+ " from bark_infinity import config\r\n",
+ " File \"/home/jamal/projects/bark/bark_infinity/__init__.py\", line 1, in \r\n",
+ " from .api import generate_audio, text_to_semantic, semantic_to_waveform, save_as_prompt\r\n",
+ " File \"/home/jamal/projects/bark/bark_infinity/api.py\", line 30, in \r\n",
+ " from .clonevoice import wav_to_semantics, generate_fine_from_wav, quick_clone\r\n",
+ " File \"/home/jamal/projects/bark/bark_infinity/clonevoice.py\", line 31, in \r\n",
+ " from bark_infinity.hubert.customtokenizer import CustomTokenizer\r\n",
+ " File \"/home/jamal/projects/bark/bark_infinity/hubert/customtokenizer.py\", line 159, in \r\n",
+ " def auto_train(data_path, save_path=\"model.pth\", load_model: str | None = None, save_epochs=1):\r\n",
+ "TypeError: unsupported operand type(s) for |: 'type' and 'NoneType'\r\n"
+ ]
+ }
+ ],
+ "source": [
+ "!python3 bark_webui.py --share"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.10"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
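Note on the final traceback in the notebook above: the `TypeError: unsupported operand type(s) for |: 'type' and 'NoneType'` comes from the PEP 604 union annotation `load_model: str | None = None` in bark_infinity/hubert/customtokenizer.py being evaluated at import time on the notebook's Python 3.8.10 kernel; `X | Y` annotations only work at runtime on Python 3.10+. A minimal sketch of a 3.8-compatible signature follows; the `Optional[str]` spelling is an assumption about the intended fix and is not a change contained in this diff (adding `from __future__ import annotations` at the top of that module would also work).

    # Hypothetical Python 3.8-compatible rewrite of the failing signature (assumption, not part of this diff).
    from typing import Optional

    def auto_train(data_path, save_path="model.pth", load_model: Optional[str] = None, save_epochs=1):
        ...
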
diff --git a/bark/__init__.py b/bark/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0b17c8b44869c554931c723446c65d3903821a9
--- /dev/null
+++ b/bark/__init__.py
@@ -0,0 +1,2 @@
+from .api import generate_audio, text_to_semantic, semantic_to_waveform, save_as_prompt
+from .generation import SAMPLE_RATE, preload_models
diff --git a/bark/__main__.py b/bark/__main__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc7ef830d61dd5ff7126a9217325c2b6e696ad4c
--- /dev/null
+++ b/bark/__main__.py
@@ -0,0 +1,4 @@
+from .cli import cli
+
+cli()
+
diff --git a/bark/api.py b/bark/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b646b7f03d060b52d0cd7063ee7aa955af65955
--- /dev/null
+++ b/bark/api.py
@@ -0,0 +1,125 @@
+from typing import Dict, Optional, Union
+
+import numpy as np
+
+from .generation import codec_decode, generate_coarse, generate_fine, generate_text_semantic
+
+
+def text_to_semantic(
+ text: str,
+ history_prompt: Optional[Union[Dict, str]] = None,
+ temp: float = 0.7,
+ silent: bool = False,
+):
+ """Generate semantic array from text.
+
+ Args:
+ text: text to be turned into audio
+ history_prompt: history choice for audio cloning
+ temp: generation temperature (1.0 more diverse, 0.0 more conservative)
+ silent: disable progress bar
+
+ Returns:
+ numpy semantic array to be fed into `semantic_to_waveform`
+ """
+ x_semantic = generate_text_semantic(
+ text,
+ history_prompt=history_prompt,
+ temp=temp,
+ silent=silent,
+ use_kv_caching=True
+ )
+ return x_semantic
+
+
+def semantic_to_waveform(
+ semantic_tokens: np.ndarray,
+ history_prompt: Optional[Union[Dict, str]] = None,
+ temp: float = 0.7,
+ silent: bool = False,
+ output_full: bool = False,
+):
+ """Generate audio array from semantic input.
+
+ Args:
+ semantic_tokens: semantic token output from `text_to_semantic`
+ history_prompt: history choice for audio cloning
+ temp: generation temperature (1.0 more diverse, 0.0 more conservative)
+ silent: disable progress bar
+ output_full: return full generation to be used as a history prompt
+
+ Returns:
+        numpy audio array at a 24 kHz sample rate
+ """
+ coarse_tokens = generate_coarse(
+ semantic_tokens,
+ history_prompt=history_prompt,
+ temp=temp,
+ silent=silent,
+ use_kv_caching=True
+ )
+ fine_tokens = generate_fine(
+ coarse_tokens,
+ history_prompt=history_prompt,
+ temp=0.5,
+ )
+ audio_arr = codec_decode(fine_tokens)
+ if output_full:
+ full_generation = {
+ "semantic_prompt": semantic_tokens,
+ "coarse_prompt": coarse_tokens,
+ "fine_prompt": fine_tokens,
+ }
+ return full_generation, audio_arr
+ return audio_arr
+
+
+def save_as_prompt(filepath, full_generation):
+ assert(filepath.endswith(".npz"))
+ assert(isinstance(full_generation, dict))
+ assert("semantic_prompt" in full_generation)
+ assert("coarse_prompt" in full_generation)
+ assert("fine_prompt" in full_generation)
+ np.savez(filepath, **full_generation)
+
+
+def generate_audio(
+ text: str,
+ history_prompt: Optional[Union[Dict, str]] = None,
+ text_temp: float = 0.7,
+ waveform_temp: float = 0.7,
+ silent: bool = False,
+ output_full: bool = False,
+):
+ """Generate audio array from input text.
+
+ Args:
+ text: text to be turned into audio
+ history_prompt: history choice for audio cloning
+ text_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
+ waveform_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
+ silent: disable progress bar
+ output_full: return full generation to be used as a history prompt
+
+ Returns:
+        numpy audio array at a 24 kHz sample rate
+ """
+ semantic_tokens = text_to_semantic(
+ text,
+ history_prompt=history_prompt,
+ temp=text_temp,
+ silent=silent,
+ )
+ out = semantic_to_waveform(
+ semantic_tokens,
+ history_prompt=history_prompt,
+ temp=waveform_temp,
+ silent=silent,
+ output_full=output_full,
+ )
+ if output_full:
+ full_generation, audio_arr = out
+ return full_generation, audio_arr
+ else:
+ audio_arr = out
+ return audio_arr
diff --git a/bark/assets/prompts/announcer.npz b/bark/assets/prompts/announcer.npz
new file mode 100644
index 0000000000000000000000000000000000000000..28e92eb5d6361c9322119ccc9acdc5c4d9183561
--- /dev/null
+++ b/bark/assets/prompts/announcer.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:26f2d1a9e3b6fe453cf5fc8191de26cbfae6276c5b0f7c376c6a0f3c35867f83
+size 16794
diff --git a/bark/assets/prompts/de_speaker_0.npz b/bark/assets/prompts/de_speaker_0.npz
new file mode 100644
index 0000000000000000000000000000000000000000..5b2eb4d730924729d53b33ecdd393bfeec76f90e
--- /dev/null
+++ b/bark/assets/prompts/de_speaker_0.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:008d7f3d0a52305a80c1abce26ccf4120181554a24055a0581894819b14f998d
+size 31940
diff --git a/bark/assets/prompts/de_speaker_1.npz b/bark/assets/prompts/de_speaker_1.npz
new file mode 100644
index 0000000000000000000000000000000000000000..cb6e76ac746434bfcc7826e55ff8fefb46f30d21
--- /dev/null
+++ b/bark/assets/prompts/de_speaker_1.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b5bb2ac34fa466f5d6804f48f51658d7b7d8d91ce7139d34c717c917578858fb
+size 31940
diff --git a/bark/assets/prompts/de_speaker_2.npz b/bark/assets/prompts/de_speaker_2.npz
new file mode 100644
index 0000000000000000000000000000000000000000..d0184c01c7ab4dfe5b5c10fae329933840e5d6d2
--- /dev/null
+++ b/bark/assets/prompts/de_speaker_2.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1dedc8ab1949653480223f0c0cf3ebd20406d39b52e19908d32275eb8cfaf4b9
+size 23516
diff --git a/bark/assets/prompts/de_speaker_3.npz b/bark/assets/prompts/de_speaker_3.npz
new file mode 100644
index 0000000000000000000000000000000000000000..c3fccd1ebf0a489a12d99ab14178f491806f66ec
--- /dev/null
+++ b/bark/assets/prompts/de_speaker_3.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c5abe325e6306a7f96725fcc6186c0eb147d2f068ce14b863e086cbf52b1986e
+size 29060
diff --git a/bark/assets/prompts/de_speaker_4.npz b/bark/assets/prompts/de_speaker_4.npz
new file mode 100644
index 0000000000000000000000000000000000000000..823611fd0f09582ab20b5b73a2e476f146586208
--- /dev/null
+++ b/bark/assets/prompts/de_speaker_4.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:91d102ad045aabc996f487d0d4f0b3fd289ef2da200d1df289cf5da298d23796
+size 20316
diff --git a/bark/assets/prompts/de_speaker_5.npz b/bark/assets/prompts/de_speaker_5.npz
new file mode 100644
index 0000000000000000000000000000000000000000..8fd40cba82a9c85dafacc83f96a4f5c9a583239d
--- /dev/null
+++ b/bark/assets/prompts/de_speaker_5.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8aa116b450c74c60ef43d1fd141fe961e23ebeafdcb57991b22ae4a08c62cf44
+size 35084
diff --git a/bark/assets/prompts/de_speaker_6.npz b/bark/assets/prompts/de_speaker_6.npz
new file mode 100644
index 0000000000000000000000000000000000000000..d232d37532510268e4d261751234ded7d3775870
--- /dev/null
+++ b/bark/assets/prompts/de_speaker_6.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0f95bd28bc7382b7294c0bb187b18873aa9c050b3fe5793166c547200c8e2da9
+size 31724
diff --git a/bark/assets/prompts/de_speaker_7.npz b/bark/assets/prompts/de_speaker_7.npz
new file mode 100644
index 0000000000000000000000000000000000000000..13173efc4c4ab9c45ab8aaba01784e50c112385d
--- /dev/null
+++ b/bark/assets/prompts/de_speaker_7.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:332c5aee851c0544e1ad587fbc477b8d4eb28e852192fcd969d97c894b028a2b
+size 59348
diff --git a/bark/assets/prompts/de_speaker_8.npz b/bark/assets/prompts/de_speaker_8.npz
new file mode 100644
index 0000000000000000000000000000000000000000..bdebbf4ea7e47f8dbed6893c9c05c0fb65d778c1
--- /dev/null
+++ b/bark/assets/prompts/de_speaker_8.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a0eefea2a0d702177f44df4b218b950119726c041cb505e1df36ab0fc0651018
+size 25116
diff --git a/bark/assets/prompts/de_speaker_9.npz b/bark/assets/prompts/de_speaker_9.npz
new file mode 100644
index 0000000000000000000000000000000000000000..2a9e6f86eb1980e1d328b4aa692134333a03773a
--- /dev/null
+++ b/bark/assets/prompts/de_speaker_9.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:189e941a182411853351c56e422d51a4a8fad20f1f8b8f396042bb2ada3cceb2
+size 22180
diff --git a/bark/assets/prompts/en_speaker_0.npz b/bark/assets/prompts/en_speaker_0.npz
new file mode 100644
index 0000000000000000000000000000000000000000..6253a25757a8c7107b448bb788bffd27401d07f4
--- /dev/null
+++ b/bark/assets/prompts/en_speaker_0.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eb130b14872cc53381bdb867cee71c26a6d116af81dbf2542f3f44d11b8aaf3f
+size 22396
diff --git a/bark/assets/prompts/en_speaker_1.npz b/bark/assets/prompts/en_speaker_1.npz
new file mode 100644
index 0000000000000000000000000000000000000000..8593e2a7f2d792ebace1d2d14e9826f0feb74779
--- /dev/null
+++ b/bark/assets/prompts/en_speaker_1.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3cdc113954acb3839e9112437a029d482925236bce91294803a42e3f1f493aea
+size 18396
diff --git a/bark/assets/prompts/en_speaker_2.npz b/bark/assets/prompts/en_speaker_2.npz
new file mode 100644
index 0000000000000000000000000000000000000000..af149463949d88744498c1003d1721c5316020da
--- /dev/null
+++ b/bark/assets/prompts/en_speaker_2.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c27653e7db430ba4518cb5306c62a228329f928bfa566f68334545f0949b5eea
+size 33860
diff --git a/bark/assets/prompts/en_speaker_3.npz b/bark/assets/prompts/en_speaker_3.npz
new file mode 100644
index 0000000000000000000000000000000000000000..55514b9f1abe3ec035bd973b38fa016066f7722e
--- /dev/null
+++ b/bark/assets/prompts/en_speaker_3.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:22de48d9414836a5337e483b256ed916d51ece916c36669371d9e92b1323047b
+size 38124
diff --git a/bark/assets/prompts/en_speaker_4.npz b/bark/assets/prompts/en_speaker_4.npz
new file mode 100644
index 0000000000000000000000000000000000000000..5ca5182e6526ae713cc033be26d49396db4404de
--- /dev/null
+++ b/bark/assets/prompts/en_speaker_4.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3481fe27c9ffc73b68783ebe122934e0430a888c199ade914e97433df73038c1
+size 21220
diff --git a/bark/assets/prompts/en_speaker_5.npz b/bark/assets/prompts/en_speaker_5.npz
new file mode 100644
index 0000000000000000000000000000000000000000..4655dfd3a4a72968f727ef06ad31e2d1babfcbe9
--- /dev/null
+++ b/bark/assets/prompts/en_speaker_5.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b661d1573ab2df0d89b4b51e79d727dd5bfccfe8d740a84594de4028e1a23057
+size 15516
diff --git a/bark/assets/prompts/en_speaker_6.npz b/bark/assets/prompts/en_speaker_6.npz
new file mode 100644
index 0000000000000000000000000000000000000000..4ffaca98c9bf76ee62c7693f4b6939d5f1fe4aab
--- /dev/null
+++ b/bark/assets/prompts/en_speaker_6.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d8f92a1ea0383453614d1c20c8cfbeaf9ad28d9f5778f718bf0e54eb18c0245
+size 13436
diff --git a/bark/assets/prompts/en_speaker_7.npz b/bark/assets/prompts/en_speaker_7.npz
new file mode 100644
index 0000000000000000000000000000000000000000..11835338c58dd52dd2c5875e5ce6344f94ae17d7
--- /dev/null
+++ b/bark/assets/prompts/en_speaker_7.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6fdbb2c04efb4e81d179369b614678adba1cac9da8cc76fe6c40396da681b3a3
+size 35084
diff --git a/bark/assets/prompts/en_speaker_8.npz b/bark/assets/prompts/en_speaker_8.npz
new file mode 100644
index 0000000000000000000000000000000000000000..359f5227b7cfffd7103805ea7dc3feb01b2eae3b
--- /dev/null
+++ b/bark/assets/prompts/en_speaker_8.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b4233571cfc24030c9c2ed823f6393d8f3c99e26fef20d744a2e5ff59b93f086
+size 18980
diff --git a/bark/assets/prompts/en_speaker_9.npz b/bark/assets/prompts/en_speaker_9.npz
new file mode 100644
index 0000000000000000000000000000000000000000..853a75dbeafe384b7adc30aa83eccd26657abc87
--- /dev/null
+++ b/bark/assets/prompts/en_speaker_9.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bb86c2ec884fcc906cb0d7342a9d84657f6d9abeac3c88c7b1bbfd1207ec09ca
+size 35940
diff --git a/bark/assets/prompts/es_speaker_0.npz b/bark/assets/prompts/es_speaker_0.npz
new file mode 100644
index 0000000000000000000000000000000000000000..f623f750873cf95a954083e6af9d60cb9c3e0ece
--- /dev/null
+++ b/bark/assets/prompts/es_speaker_0.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8a4849970528104040e0ed6a96f9c705b58c72b5eee538baed1fa2283873b331
+size 27620
diff --git a/bark/assets/prompts/es_speaker_1.npz b/bark/assets/prompts/es_speaker_1.npz
new file mode 100644
index 0000000000000000000000000000000000000000..8be6db7d61f6fe70b2def1ab10d8614c420ac3f4
--- /dev/null
+++ b/bark/assets/prompts/es_speaker_1.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c41ca11134138c1cb6108f643c686f0d0c72f376a13576cd9490721a0916d07a
+size 25436
diff --git a/bark/assets/prompts/es_speaker_2.npz b/bark/assets/prompts/es_speaker_2.npz
new file mode 100644
index 0000000000000000000000000000000000000000..1f5ce31fc85f4fea950bb0a4f4bce1387b131ddd
--- /dev/null
+++ b/bark/assets/prompts/es_speaker_2.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d9a6406ce99291a80f81bef895e1fd3d13b5204143d656cf0aa30c013f2974bd
+size 27620
diff --git a/bark/assets/prompts/es_speaker_3.npz b/bark/assets/prompts/es_speaker_3.npz
new file mode 100644
index 0000000000000000000000000000000000000000..bd4b5b9a0e18381e275938a8ab5c3cf12c4168f8
--- /dev/null
+++ b/bark/assets/prompts/es_speaker_3.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f9e43586d2a185df543444fe3f7e604bfe56c9f1364f59c9671be75e88b14d02
+size 26500
diff --git a/bark/assets/prompts/es_speaker_4.npz b/bark/assets/prompts/es_speaker_4.npz
new file mode 100644
index 0000000000000000000000000000000000000000..214ea50a1737c87a0585be790c3fbbcf34bdb888
--- /dev/null
+++ b/bark/assets/prompts/es_speaker_4.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:52b4c89d19199265d9347ff83550ceeb5bead49c2552df776ef292f851d3de33
+size 24420
diff --git a/bark/assets/prompts/es_speaker_5.npz b/bark/assets/prompts/es_speaker_5.npz
new file mode 100644
index 0000000000000000000000000000000000000000..1cb83ba796ef80847844b435c4a9098c36ce2fba
--- /dev/null
+++ b/bark/assets/prompts/es_speaker_5.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8c57dddcdf54e8e97813e887dc2e066efde628d17e10fad2a9824b552af485b2
+size 24900
diff --git a/bark/assets/prompts/es_speaker_6.npz b/bark/assets/prompts/es_speaker_6.npz
new file mode 100644
index 0000000000000000000000000000000000000000..2f87cf761f8d118f367ea7f17fa7c87b05d0587d
--- /dev/null
+++ b/bark/assets/prompts/es_speaker_6.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:22b2dc4980a17c3dcd5f2833cc0eaab5dec06e7233520885fa792f618606dc68
+size 34820
diff --git a/bark/assets/prompts/es_speaker_7.npz b/bark/assets/prompts/es_speaker_7.npz
new file mode 100644
index 0000000000000000000000000000000000000000..d5e98054d1d72e8d4a9d307457f43d1765626f61
--- /dev/null
+++ b/bark/assets/prompts/es_speaker_7.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c848b3561977abaed30f38fcda853283ae04c11457483347c8baaa2d5a5f94d3
+size 21596
diff --git a/bark/assets/prompts/es_speaker_8.npz b/bark/assets/prompts/es_speaker_8.npz
new file mode 100644
index 0000000000000000000000000000000000000000..e69d6c91831e5edfd06f6c074ba7452550b8c7f1
--- /dev/null
+++ b/bark/assets/prompts/es_speaker_8.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:691b4a12bbfd8f0e04df1ed793de2a4ada97ae04a7546e3bee12aaa094b7e156
+size 18660
diff --git a/bark/assets/prompts/es_speaker_9.npz b/bark/assets/prompts/es_speaker_9.npz
new file mode 100644
index 0000000000000000000000000000000000000000..5a823149f919a9b6131b7c4a9a149cb200572a56
--- /dev/null
+++ b/bark/assets/prompts/es_speaker_9.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5dda9f490517edf9447e2f02de3bec3877515a086e9668d7f0abb0d800d82ab6
+size 22660
diff --git a/bark/assets/prompts/fr_speaker_0.npz b/bark/assets/prompts/fr_speaker_0.npz
new file mode 100644
index 0000000000000000000000000000000000000000..4fb5c14e5de259906f040f48996954dc06fda09f
--- /dev/null
+++ b/bark/assets/prompts/fr_speaker_0.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f483b271820be529ffc95968a1b7cd5e5f63137c30649192b1e10a935a8b846c
+size 30604
diff --git a/bark/assets/prompts/fr_speaker_1.npz b/bark/assets/prompts/fr_speaker_1.npz
new file mode 100644
index 0000000000000000000000000000000000000000..211ac75f015f11bc4aca7d713442b4e7fb880438
--- /dev/null
+++ b/bark/assets/prompts/fr_speaker_1.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ba3805ef05a285f8501762900b1919631b2fd4274ee8d7cf4b4c432afd6a7635
+size 29324
diff --git a/bark/assets/prompts/fr_speaker_2.npz b/bark/assets/prompts/fr_speaker_2.npz
new file mode 100644
index 0000000000000000000000000000000000000000..3fd8b3a2114a63e20ba1bf683b6a900cf6f3481d
--- /dev/null
+++ b/bark/assets/prompts/fr_speaker_2.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b3e7654e74d80a7068745838b1640c72d3616fbb2fa8f88de997d252139f7b74
+size 51084
diff --git a/bark/assets/prompts/fr_speaker_3.npz b/bark/assets/prompts/fr_speaker_3.npz
new file mode 100644
index 0000000000000000000000000000000000000000..4c2a4885f4f41a55996c587db29a96a30b9ced5a
--- /dev/null
+++ b/bark/assets/prompts/fr_speaker_3.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e67de23fa486d091eaea3d276dcf640ed0d34079fc5e78ae9e4ab0f758341af2
+size 31460
diff --git a/bark/assets/prompts/fr_speaker_4.npz b/bark/assets/prompts/fr_speaker_4.npz
new file mode 100644
index 0000000000000000000000000000000000000000..f45f488d6f7edf2ff5ff5e9161c14050be5db5bb
--- /dev/null
+++ b/bark/assets/prompts/fr_speaker_4.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f0e02e7b5f98b834968a47b1dbbb7acb18b681152461ae08e16c4b5ee93cbbcd
+size 36364
diff --git a/bark/assets/prompts/fr_speaker_5.npz b/bark/assets/prompts/fr_speaker_5.npz
new file mode 100644
index 0000000000000000000000000000000000000000..d80e28998de5668d712b03b3474f7e592e01b108
--- /dev/null
+++ b/bark/assets/prompts/fr_speaker_5.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f771bcf5db66f2865a8023874291a6d706154853c9c9bdecd0ab0aeae3bd0a59
+size 44044
diff --git a/bark/assets/prompts/fr_speaker_6.npz b/bark/assets/prompts/fr_speaker_6.npz
new file mode 100644
index 0000000000000000000000000000000000000000..585152d9d23da138343c515d2c55b7276dd6755c
--- /dev/null
+++ b/bark/assets/prompts/fr_speaker_6.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:21906f0c2dc2578662cdc6359a03a96e02aa296c02d0cd3c50cb9dca4379ae9a
+size 43564
diff --git a/bark/assets/prompts/fr_speaker_7.npz b/bark/assets/prompts/fr_speaker_7.npz
new file mode 100644
index 0000000000000000000000000000000000000000..b2192d2e117b7e1d19878d7b7fa2a99ab7d5f0bb
--- /dev/null
+++ b/bark/assets/prompts/fr_speaker_7.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:51b48089d9a29cc2dc8db21393fb67558cfa75a6aa46d1b495d483d13fffa04d
+size 53908
diff --git a/bark/assets/prompts/fr_speaker_8.npz b/bark/assets/prompts/fr_speaker_8.npz
new file mode 100644
index 0000000000000000000000000000000000000000..0ccf49d404284cf135084dd7d7c048c7c04f8201
--- /dev/null
+++ b/bark/assets/prompts/fr_speaker_8.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1e949256eecd733f22eed7b27e61bcf9331108f88849b39882723a68dac9d8cf
+size 33060
diff --git a/bark/assets/prompts/fr_speaker_9.npz b/bark/assets/prompts/fr_speaker_9.npz
new file mode 100644
index 0000000000000000000000000000000000000000..50488e936dd37e2d5de8613424acbe18839b1693
--- /dev/null
+++ b/bark/assets/prompts/fr_speaker_9.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:064d376376a3986e9c576851c61679c26d82de9023cfd2bb5b4b58b49c89940f
+size 31244
diff --git a/bark/assets/prompts/hi_speaker_0.npz b/bark/assets/prompts/hi_speaker_0.npz
new file mode 100644
index 0000000000000000000000000000000000000000..570622903b1918a1c6a128fbfbbb6530186834f5
--- /dev/null
+++ b/bark/assets/prompts/hi_speaker_0.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e17c25e0974142d03a3e1faa9ad69d8c737e1a0ed69b190ccd6a6ede69f99665
+size 32580
diff --git a/bark/assets/prompts/hi_speaker_1.npz b/bark/assets/prompts/hi_speaker_1.npz
new file mode 100644
index 0000000000000000000000000000000000000000..9a1d363af7637994128de495fef17c4adf0a768f
--- /dev/null
+++ b/bark/assets/prompts/hi_speaker_1.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a426fc71fc502ac801f171ea1aad7d5e2b1466a2f959033fa6a6397ffb24aae2
+size 23036
diff --git a/bark/assets/prompts/hi_speaker_2.npz b/bark/assets/prompts/hi_speaker_2.npz
new file mode 100644
index 0000000000000000000000000000000000000000..bf4d9ad393306579daee5d3caaf1b7501c59d0d2
--- /dev/null
+++ b/bark/assets/prompts/hi_speaker_2.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c6c58938653d80a56381b63b0befd0b1efba59c304ccaa76cd3a3626f81a3207
+size 26820
diff --git a/bark/assets/prompts/hi_speaker_3.npz b/bark/assets/prompts/hi_speaker_3.npz
new file mode 100644
index 0000000000000000000000000000000000000000..501e53adc0729fbf4f81a295e2f28ac2429d8952
--- /dev/null
+++ b/bark/assets/prompts/hi_speaker_3.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:388ed364e507437d42947f9a2d44b5e59d975bf100fcdb1d32801a2607955046
+size 28684
diff --git a/bark/assets/prompts/hi_speaker_4.npz b/bark/assets/prompts/hi_speaker_4.npz
new file mode 100644
index 0000000000000000000000000000000000000000..317b52f0b0984fbda8910ee790c999f263c00801
--- /dev/null
+++ b/bark/assets/prompts/hi_speaker_4.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2995c7d84e750aedbed9a781b22227d630d8a29fd26d658815561f818e955d08
+size 24476
diff --git a/bark/assets/prompts/hi_speaker_5.npz b/bark/assets/prompts/hi_speaker_5.npz
new file mode 100644
index 0000000000000000000000000000000000000000..aa15bc55722df5844e67cd49d3e8a6754f16367d
--- /dev/null
+++ b/bark/assets/prompts/hi_speaker_5.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:80d8f948811ef8229e96c57bbc1450def49e2c8517a05ed15292419963df30ca
+size 33004
diff --git a/bark/assets/prompts/hi_speaker_6.npz b/bark/assets/prompts/hi_speaker_6.npz
new file mode 100644
index 0000000000000000000000000000000000000000..2cd59c258d988f3c643c383ed5896e24dca91668
--- /dev/null
+++ b/bark/assets/prompts/hi_speaker_6.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4209c9fce350e20df2729820ca2539d60a19b618cb4c23bc3ab1391840a4a6e9
+size 24900
diff --git a/bark/assets/prompts/hi_speaker_7.npz b/bark/assets/prompts/hi_speaker_7.npz
new file mode 100644
index 0000000000000000000000000000000000000000..dd12b21b026f63cf23b289e93ca1854188794003
--- /dev/null
+++ b/bark/assets/prompts/hi_speaker_7.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0fe296c0a623a12e0e93f6d0ccc1b7662f0988fdc56797be3494cea4e8dcf7e0
+size 30020
diff --git a/bark/assets/prompts/hi_speaker_8.npz b/bark/assets/prompts/hi_speaker_8.npz
new file mode 100644
index 0000000000000000000000000000000000000000..c0c17228d69e7dbe9c18bb383addd529d849c0e5
--- /dev/null
+++ b/bark/assets/prompts/hi_speaker_8.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0b0136ce35d5848e253738d7a8cbdb19c4ceb471b54c0a2886fec22192c48a5d
+size 24956
diff --git a/bark/assets/prompts/hi_speaker_9.npz b/bark/assets/prompts/hi_speaker_9.npz
new file mode 100644
index 0000000000000000000000000000000000000000..186e5f787263c3532fded47a2f710dc8920e234e
--- /dev/null
+++ b/bark/assets/prompts/hi_speaker_9.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:41d245ab7ebe601366e085138dddc72b8c06a23faf9f4763466d7413fce88995
+size 30180
diff --git a/bark/assets/prompts/it_speaker_0.npz b/bark/assets/prompts/it_speaker_0.npz
new file mode 100644
index 0000000000000000000000000000000000000000..7c602fa064907956d84ef72a50c2ba6c98edf19d
--- /dev/null
+++ b/bark/assets/prompts/it_speaker_0.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5f9c87011c846349276873815c9c95bb2cde9d80b781f8b349f87a186b12039f
+size 46604
diff --git a/bark/assets/prompts/it_speaker_1.npz b/bark/assets/prompts/it_speaker_1.npz
new file mode 100644
index 0000000000000000000000000000000000000000..85a0aae447ce2a4d135561dac7a9e012b5e88859
--- /dev/null
+++ b/bark/assets/prompts/it_speaker_1.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d6bc095f18f987bee31e85a2aceed8ef298bc093828b70c325b54e198f3463cc
+size 24900
diff --git a/bark/assets/prompts/it_speaker_2.npz b/bark/assets/prompts/it_speaker_2.npz
new file mode 100644
index 0000000000000000000000000000000000000000..68fe40537e5cfd1d1f38a349c0703ffc2be453ba
--- /dev/null
+++ b/bark/assets/prompts/it_speaker_2.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d2d1ed1c5a9937595f30c6955cfdebc8828db9d7ee40a86ae2c7409bbfc58839
+size 45268
diff --git a/bark/assets/prompts/it_speaker_3.npz b/bark/assets/prompts/it_speaker_3.npz
new file mode 100644
index 0000000000000000000000000000000000000000..c83d9757533d0cd8070c77be1566567632b8a8da
--- /dev/null
+++ b/bark/assets/prompts/it_speaker_3.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b9f519930f3aad2d5b9a826bb2cb33370f98d9630fa4c781419fbd8ad2faa979
+size 52684
diff --git a/bark/assets/prompts/it_speaker_4.npz b/bark/assets/prompts/it_speaker_4.npz
new file mode 100644
index 0000000000000000000000000000000000000000..fd54abc8a7095becf62ab344393f4b97ddfd91b6
--- /dev/null
+++ b/bark/assets/prompts/it_speaker_4.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:86586c07ac2139c17bc17487e240f75d834818e1df67e9cf6b855ee54bdb6f12
+size 22396
diff --git a/bark/assets/prompts/it_speaker_5.npz b/bark/assets/prompts/it_speaker_5.npz
new file mode 100644
index 0000000000000000000000000000000000000000..0cd1b38e7cda0b0635f5ccfda8987f9d2063706b
--- /dev/null
+++ b/bark/assets/prompts/it_speaker_5.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a64ae582db6a03be5044d3091f1ee9b21f58446dacabd308bb556ceb60c54001
+size 42764
diff --git a/bark/assets/prompts/it_speaker_6.npz b/bark/assets/prompts/it_speaker_6.npz
new file mode 100644
index 0000000000000000000000000000000000000000..e45025d2e14570cfe0e8357442cfd0444fc1a15e
--- /dev/null
+++ b/bark/assets/prompts/it_speaker_6.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a93e98ac9f725263ae677af7a9dc1693c9a318e1091bdead6e1028e9c92e683
+size 34180
diff --git a/bark/assets/prompts/it_speaker_7.npz b/bark/assets/prompts/it_speaker_7.npz
new file mode 100644
index 0000000000000000000000000000000000000000..d2abfa5fcac70f3eb522093589db6745e8f6c79b
--- /dev/null
+++ b/bark/assets/prompts/it_speaker_7.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eff33fcbf105889bc524a777fab002ec74e12de78396b312b002c4617789bcdc
+size 41268
diff --git a/bark/assets/prompts/it_speaker_8.npz b/bark/assets/prompts/it_speaker_8.npz
new file mode 100644
index 0000000000000000000000000000000000000000..a56572c43ec413c89bf10dbf17ea095f8b502874
--- /dev/null
+++ b/bark/assets/prompts/it_speaker_8.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c5e076ba04d147bfec35248e304ddfae522915d9781f8a02d3e1e67a9a40ea72
+size 29964
diff --git a/bark/assets/prompts/it_speaker_9.npz b/bark/assets/prompts/it_speaker_9.npz
new file mode 100644
index 0000000000000000000000000000000000000000..c577a115449c7d723271a562c73d14f5653523f8
--- /dev/null
+++ b/bark/assets/prompts/it_speaker_9.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bb8bbe06a881e48a7f7a6a0ef93dee65fa35b37a0b295d054f4cbf5df040f0a8
+size 35940
diff --git a/bark/assets/prompts/ja_speaker_0.npz b/bark/assets/prompts/ja_speaker_0.npz
new file mode 100644
index 0000000000000000000000000000000000000000..deef4d6749928f6fd55697c9b0133f946fbf7391
--- /dev/null
+++ b/bark/assets/prompts/ja_speaker_0.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1d9b793498fb04c1aef4355dac1ccf8dae0b8365f84d539d2d4374bfb4882267
+size 24900
diff --git a/bark/assets/prompts/ja_speaker_1.npz b/bark/assets/prompts/ja_speaker_1.npz
new file mode 100644
index 0000000000000000000000000000000000000000..da9dc373d9f28adf4d4cdcfad64bf4fdfbc1619b
--- /dev/null
+++ b/bark/assets/prompts/ja_speaker_1.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d7b55962d62c708e3446cd654962b558de803081b84dd9a074602fa224e66203
+size 25220
diff --git a/bark/assets/prompts/ja_speaker_2.npz b/bark/assets/prompts/ja_speaker_2.npz
new file mode 100644
index 0000000000000000000000000000000000000000..0d8ab3b1ac8a4956f6f850255d44515048d0133e
--- /dev/null
+++ b/bark/assets/prompts/ja_speaker_2.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6865c99d9cbf70ac3c75e3c9f083074cc5a1247771e4989bc0029e5b8265c3f4
+size 44148
diff --git a/bark/assets/prompts/ja_speaker_3.npz b/bark/assets/prompts/ja_speaker_3.npz
new file mode 100644
index 0000000000000000000000000000000000000000..ddac82e2abad3e958e002ba76ef33284d9725c57
--- /dev/null
+++ b/bark/assets/prompts/ja_speaker_3.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e1af7de8c66f05fa54d3aa5f8968d4f4a97857da8b3f9653b9e42c0d22d5e9f
+size 24796
diff --git a/bark/assets/prompts/ja_speaker_4.npz b/bark/assets/prompts/ja_speaker_4.npz
new file mode 100644
index 0000000000000000000000000000000000000000..d6f78caff3e5cd1d6141562b8afa95b4317df669
--- /dev/null
+++ b/bark/assets/prompts/ja_speaker_4.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3baff37c42adc35f7e3a81e518afd9c02d9c45792a350b10ff72c3c28672857a
+size 37964
diff --git a/bark/assets/prompts/ja_speaker_5.npz b/bark/assets/prompts/ja_speaker_5.npz
new file mode 100644
index 0000000000000000000000000000000000000000..3d5488e6c83dd2fcac741a97e33ba32a494f5820
--- /dev/null
+++ b/bark/assets/prompts/ja_speaker_5.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:39a1d7e383a57fa5325bd604f71ad51db42308e0461ff23f14e97b4d0c08c5a9
+size 22716
diff --git a/bark/assets/prompts/ja_speaker_6.npz b/bark/assets/prompts/ja_speaker_6.npz
new file mode 100644
index 0000000000000000000000000000000000000000..3e898497cd5b849c8aa1087b4e23126573532e42
--- /dev/null
+++ b/bark/assets/prompts/ja_speaker_6.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:259578cdb7eb4628b3d514a4bd7737ff3e76010f8a6f906241f24a4574e78b8c
+size 24580
diff --git a/bark/assets/prompts/ja_speaker_7.npz b/bark/assets/prompts/ja_speaker_7.npz
new file mode 100644
index 0000000000000000000000000000000000000000..97153367c600e5b4aa143dff679c2a973125ee30
--- /dev/null
+++ b/bark/assets/prompts/ja_speaker_7.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:de23f5343a3e7d4fb807ec5e20ba37ea3eec925554924f03043f17550eaf9237
+size 33380
diff --git a/bark/assets/prompts/ja_speaker_8.npz b/bark/assets/prompts/ja_speaker_8.npz
new file mode 100644
index 0000000000000000000000000000000000000000..8cc42d950eaa5d4c87fae6f63d0e2b82ebc0cfd6
--- /dev/null
+++ b/bark/assets/prompts/ja_speaker_8.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc2c1f34894cdd6ea6fc42686e7886da0b4f0782256afd811927ecc373715c63
+size 50548
diff --git a/bark/assets/prompts/ja_speaker_9.npz b/bark/assets/prompts/ja_speaker_9.npz
new file mode 100644
index 0000000000000000000000000000000000000000..5a25762ae7fad586283bdc3aa280fc8486878721
--- /dev/null
+++ b/bark/assets/prompts/ja_speaker_9.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:210a0a10d67480cebc8b4b5140d04f9c9e39ad85d1c1aec74cf59edbee4c0721
+size 29540
diff --git a/bark/assets/prompts/ko_speaker_0.npz b/bark/assets/prompts/ko_speaker_0.npz
new file mode 100644
index 0000000000000000000000000000000000000000..aa55581a2a38c245aec4aaee63eb68326baceef0
--- /dev/null
+++ b/bark/assets/prompts/ko_speaker_0.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b15efeee85aff5768722dcec6fb5fb440afabc91ecfc605c9fd02eddc4c4133d
+size 24156
diff --git a/bark/assets/prompts/ko_speaker_1.npz b/bark/assets/prompts/ko_speaker_1.npz
new file mode 100644
index 0000000000000000000000000000000000000000..9d8e19308d4c2e4df2db1588da32fb21061e753e
--- /dev/null
+++ b/bark/assets/prompts/ko_speaker_1.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9b1d1323b815260483e2c3da6a4b7c39091b7aa71d09e955351edf6f21ffe218
+size 26396
diff --git a/bark/assets/prompts/ko_speaker_2.npz b/bark/assets/prompts/ko_speaker_2.npz
new file mode 100644
index 0000000000000000000000000000000000000000..4d0e7a49bca4ccdd72c9c152ca6ae45745828171
--- /dev/null
+++ b/bark/assets/prompts/ko_speaker_2.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9081b57e01e7717284f530d66dfe0ef2e8e3a0f0e8b2064db7cb8afc04f04954
+size 31940
diff --git a/bark/assets/prompts/ko_speaker_3.npz b/bark/assets/prompts/ko_speaker_3.npz
new file mode 100644
index 0000000000000000000000000000000000000000..b953a2461fff793ad18779fe04ee2f09015a7692
--- /dev/null
+++ b/bark/assets/prompts/ko_speaker_3.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:49c9472b53b209e5d435b05a8b842908ce10865ac0553d284f9b434330043a7f
+size 56628
diff --git a/bark/assets/prompts/ko_speaker_4.npz b/bark/assets/prompts/ko_speaker_4.npz
new file mode 100644
index 0000000000000000000000000000000000000000..2d1281dd33f24a8c8fd6d52226a7e88215faead1
--- /dev/null
+++ b/bark/assets/prompts/ko_speaker_4.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25a9840ebf6b57af5ac0e3f4a8ce734f8926e0571eaeaf0dfd7dbbcfc5745626
+size 23356
diff --git a/bark/assets/prompts/ko_speaker_5.npz b/bark/assets/prompts/ko_speaker_5.npz
new file mode 100644
index 0000000000000000000000000000000000000000..9b1d783811226098e71d753257d9ed4c09852382
--- /dev/null
+++ b/bark/assets/prompts/ko_speaker_5.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5dc503a2f074f9df41cb407265c4a089179c7849d1dcd7774bc2439b616f25e8
+size 29004
diff --git a/bark/assets/prompts/ko_speaker_6.npz b/bark/assets/prompts/ko_speaker_6.npz
new file mode 100644
index 0000000000000000000000000000000000000000..3bb297dc00a4586b03b5923e43b620ef0bba0093
--- /dev/null
+++ b/bark/assets/prompts/ko_speaker_6.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:620d16320908c68ea72dd06ffcdcf61ba77f67a41a02cc4e537ff365b03fb519
+size 30500
diff --git a/bark/assets/prompts/ko_speaker_7.npz b/bark/assets/prompts/ko_speaker_7.npz
new file mode 100644
index 0000000000000000000000000000000000000000..82c3b45b4d8b87c1a1ea7782ef98cd19783fef74
--- /dev/null
+++ b/bark/assets/prompts/ko_speaker_7.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3d8fbf756cbe523a21ca7400e514fea8c3a3afb3537eec49a2d0e21112a275b0
+size 22180
diff --git a/bark/assets/prompts/ko_speaker_8.npz b/bark/assets/prompts/ko_speaker_8.npz
new file mode 100644
index 0000000000000000000000000000000000000000..9beb955d8682c4b0b77894c7ded8e8b6624d5d5b
--- /dev/null
+++ b/bark/assets/prompts/ko_speaker_8.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c883e76824b9b9a3ae1da4d7cdd6f5a23e868a1c73842ffd11739db067f9d5d2
+size 24476
diff --git a/bark/assets/prompts/ko_speaker_9.npz b/bark/assets/prompts/ko_speaker_9.npz
new file mode 100644
index 0000000000000000000000000000000000000000..efb35be697bced1c219ce1cd48b862b6d7a5d574
--- /dev/null
+++ b/bark/assets/prompts/ko_speaker_9.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:36efef182c0f3e11aaf62ea69dbf006d75a7e23ff5e2a28086c9e21d06c9948a
+size 21916
diff --git a/bark/assets/prompts/pl_speaker_0.npz b/bark/assets/prompts/pl_speaker_0.npz
new file mode 100644
index 0000000000000000000000000000000000000000..0fe56845b647bfa44c20d7f61148c923fe4af5d2
--- /dev/null
+++ b/bark/assets/prompts/pl_speaker_0.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fab5442ed0f4c23cd17613ca5bf321d2d2908e94cded6a529554e3e695f33eb5
+size 39780
diff --git a/bark/assets/prompts/pl_speaker_1.npz b/bark/assets/prompts/pl_speaker_1.npz
new file mode 100644
index 0000000000000000000000000000000000000000..8f6e4e4297abf87be21a2c4dc7f84c1961c09a16
--- /dev/null
+++ b/bark/assets/prompts/pl_speaker_1.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c454b61f7762c08d6f1d8f2519b295c230533967aa41ca7376582160705434b6
+size 26500
diff --git a/bark/assets/prompts/pl_speaker_2.npz b/bark/assets/prompts/pl_speaker_2.npz
new file mode 100644
index 0000000000000000000000000000000000000000..9ed3c7e2edaba9c1b130228b80666c316f975add
--- /dev/null
+++ b/bark/assets/prompts/pl_speaker_2.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1485325fbaddbb5840e6f4f2232ae65a79740496824f5bbea5e6b06538577749
+size 43084
diff --git a/bark/assets/prompts/pl_speaker_3.npz b/bark/assets/prompts/pl_speaker_3.npz
new file mode 100644
index 0000000000000000000000000000000000000000..f5d41103a0a6316291e067dc5c149b9535c16752
--- /dev/null
+++ b/bark/assets/prompts/pl_speaker_3.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b01909eb0b9d18e9558ab54ec4b3a89ee74fe9001cb48b425fe996969ec84129
+size 42284
diff --git a/bark/assets/prompts/pl_speaker_4.npz b/bark/assets/prompts/pl_speaker_4.npz
new file mode 100644
index 0000000000000000000000000000000000000000..f2a0666e42263c97fd5ddcf10d9c6057afd13248
--- /dev/null
+++ b/bark/assets/prompts/pl_speaker_4.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bbfb1f4acd490e7621fc37492c9bba7f108d8285e6210ab1108dd9cb8326f831
+size 42548
diff --git a/bark/assets/prompts/pl_speaker_5.npz b/bark/assets/prompts/pl_speaker_5.npz
new file mode 100644
index 0000000000000000000000000000000000000000..5a458f59471a11c07e85fa9025ba5539c0934312
--- /dev/null
+++ b/bark/assets/prompts/pl_speaker_5.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:543862aff2a35d6bf6592535369dfff8cae30e4cbafbaf2753e8041bab782d78
+size 34020
diff --git a/bark/assets/prompts/pl_speaker_6.npz b/bark/assets/prompts/pl_speaker_6.npz
new file mode 100644
index 0000000000000000000000000000000000000000..bd8d79c6293c7308a31cba38f721faf38ac8b780
--- /dev/null
+++ b/bark/assets/prompts/pl_speaker_6.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0648515f19e66cc1da1ae08144a0df14c7b3e0df2ad05b6ff869b0b4a4619573
+size 45324
diff --git a/bark/assets/prompts/pl_speaker_7.npz b/bark/assets/prompts/pl_speaker_7.npz
new file mode 100644
index 0000000000000000000000000000000000000000..27a05601ecf8c73af506c91d4d4be7b419013949
--- /dev/null
+++ b/bark/assets/prompts/pl_speaker_7.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3bff704fa071311dbc4b808729303b81b822b8d7c1293d10ad8d7398796350fe
+size 37380
diff --git a/bark/assets/prompts/pl_speaker_8.npz b/bark/assets/prompts/pl_speaker_8.npz
new file mode 100644
index 0000000000000000000000000000000000000000..319123cfcd8a389597d37c5f221a4c1bda06aadd
--- /dev/null
+++ b/bark/assets/prompts/pl_speaker_8.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f064caee991852fe1a1cec68fbf26dfef3fd988666ccee78b9ef61e7ebe84d5b
+size 33380
diff --git a/bark/assets/prompts/pl_speaker_9.npz b/bark/assets/prompts/pl_speaker_9.npz
new file mode 100644
index 0000000000000000000000000000000000000000..6f5d04b4d425c6960786ad3ad52c036230a7ae20
--- /dev/null
+++ b/bark/assets/prompts/pl_speaker_9.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d75c2d6e465bbdeeba5c6dccf83589f7eceb3b834d1f32b17b210049fa535df5
+size 36364
diff --git a/bark/assets/prompts/pt_speaker_0.npz b/bark/assets/prompts/pt_speaker_0.npz
new file mode 100644
index 0000000000000000000000000000000000000000..aa7bcaa2e66cfca6068b3d709d218fa53ddea914
--- /dev/null
+++ b/bark/assets/prompts/pt_speaker_0.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b5002193665e3baccde6ae281e156af974080ad86e535380c736f5bcc72b2435
+size 32420
diff --git a/bark/assets/prompts/pt_speaker_1.npz b/bark/assets/prompts/pt_speaker_1.npz
new file mode 100644
index 0000000000000000000000000000000000000000..4811ebf7cb4acf9c21650b7404b0ecfa227bffbd
--- /dev/null
+++ b/bark/assets/prompts/pt_speaker_1.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b32252732efe10a7373d17f7272fe68c7009a129fed5359e75675d5cbb62930e
+size 58492
diff --git a/bark/assets/prompts/pt_speaker_2.npz b/bark/assets/prompts/pt_speaker_2.npz
new file mode 100644
index 0000000000000000000000000000000000000000..422d9de9c76d3005cb4fa3f2cc384970260c9148
--- /dev/null
+++ b/bark/assets/prompts/pt_speaker_2.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c6ec426f1d138a57579c126cd7e69d4123bbe8d1c1587b9f83db7cb3b3cf963d
+size 21596
diff --git a/bark/assets/prompts/pt_speaker_3.npz b/bark/assets/prompts/pt_speaker_3.npz
new file mode 100644
index 0000000000000000000000000000000000000000..45cfcaa85992e1f0e1ca5ae6042cf778a185827a
--- /dev/null
+++ b/bark/assets/prompts/pt_speaker_3.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e41a05a94c01fd7810fa0b3986acd83cfebb55b76ac42d31e801113a9f3c599a
+size 35300
diff --git a/bark/assets/prompts/pt_speaker_4.npz b/bark/assets/prompts/pt_speaker_4.npz
new file mode 100644
index 0000000000000000000000000000000000000000..0fdcbc0286b5bc99886fa13f11e20c9b615ca009
--- /dev/null
+++ b/bark/assets/prompts/pt_speaker_4.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8df852db686b39f17eeec6e00c794eb0d28254c2f3bd5b7659f84238df09d642
+size 49004
diff --git a/bark/assets/prompts/pt_speaker_5.npz b/bark/assets/prompts/pt_speaker_5.npz
new file mode 100644
index 0000000000000000000000000000000000000000..aa65f95d218ab1e39b7864ec8409a42ea12ca557
--- /dev/null
+++ b/bark/assets/prompts/pt_speaker_5.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9f4b46e6a8483c8cacc9afaff6ef91f3be5b2059275853ff990911179f0f2112
+size 34444
diff --git a/bark/assets/prompts/pt_speaker_6.npz b/bark/assets/prompts/pt_speaker_6.npz
new file mode 100644
index 0000000000000000000000000000000000000000..76d3d0b304674e7b27751a578cc751127ad84862
--- /dev/null
+++ b/bark/assets/prompts/pt_speaker_6.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bb3b9bc612d61196ac456145e7a571c4304f7643cef3b004934a0513117ed5c7
+size 56628
diff --git a/bark/assets/prompts/pt_speaker_7.npz b/bark/assets/prompts/pt_speaker_7.npz
new file mode 100644
index 0000000000000000000000000000000000000000..672e65b14dc5ca3473bb26546ca986070e3f9b2f
--- /dev/null
+++ b/bark/assets/prompts/pt_speaker_7.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dd9c8fab73e4d2d1d6170c23a78340c4e4ad4ddb9ed6864127474c38ca2907e1
+size 34020
diff --git a/bark/assets/prompts/pt_speaker_8.npz b/bark/assets/prompts/pt_speaker_8.npz
new file mode 100644
index 0000000000000000000000000000000000000000..a9c89b45b9fbffb02dd4a9038bb0f0e560a6976f
--- /dev/null
+++ b/bark/assets/prompts/pt_speaker_8.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:584d7eb0ffd6552fd749718ccdd2422cc23cc04c4aab38725756335142914aff
+size 30284
diff --git a/bark/assets/prompts/pt_speaker_9.npz b/bark/assets/prompts/pt_speaker_9.npz
new file mode 100644
index 0000000000000000000000000000000000000000..4303251b30bed8a332217d1009b14c4a324b5192
--- /dev/null
+++ b/bark/assets/prompts/pt_speaker_9.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:21724f6ac25a3aef785875a0cbd98c6d72fabd9e60aa982a8afa6608b59388ae
+size 58652
diff --git a/bark/assets/prompts/readme.md b/bark/assets/prompts/readme.md
new file mode 100644
index 0000000000000000000000000000000000000000..b01ae915d015f80c164253ac79e5e97e9b6e04b5
--- /dev/null
+++ b/bark/assets/prompts/readme.md
@@ -0,0 +1,30 @@
+# Example Prompts Data
+
+## Version Two
+The `v2` prompts are engineered to follow the input text more closely while keeping a consistent voice.
+To use them, include the `v2/` prefix in the `history_prompt` name. For example:
+```python
+from bark import generate_audio
+text_prompt = "madam I'm adam"
+audio_array = generate_audio(text_prompt, history_prompt="v2/en_speaker_1")
+```
+
+## Prompt Format
+The provided data is stored in the .npz format, NumPy's zipped archive format for saving multiple arrays in a single file. Each prompt file contains three arrays: semantic_prompt, coarse_prompt, and fine_prompt.
+
+### `semantic_prompt`
+
+The semantic_prompt array contains a sequence of token IDs generated by the BERT tokenizer from Hugging Face. These tokens encode the text input and are used as an input to generate the audio output. The shape of this array is (n,), where n is the number of tokens in the input text.
+
+### `coarse_prompt`
+
+The coarse_prompt array is an intermediate output of the text-to-speech pipeline and contains token IDs for the first two codebooks of the EnCodec codec from Facebook. This step converts the semantic tokens into a coarse acoustic representation that the next stage refines. The shape of this array is (2, m), where m is the number of tokens per codebook after this conversion.
+
+### `fine_prompt`
+
+The fine_prompt array is the most refined output of the pipeline and contains token IDs for all 8 EnCodec codebooks. These tokens represent the final stage of tokenization, and they are decoded to produce the audio output. The shape of this array is (8, p), where p is the number of tokens per codebook after this final step.
+
+Overall, these arrays represent different stages of a text-to-speech pipeline that converts text input into synthesized audio output. The semantic_prompt array represents the input text, while coarse_prompt and fine_prompt represent intermediate and final stages of tokenization, respectively.
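+
+As a quick sanity check, a prompt file can be inspected directly with NumPy. The snippet below is a minimal sketch (any of the shipped prompt files works as the path); it loads one prompt and prints the three arrays together with their shapes.
+
+```python
+import numpy as np
+
+# Any prompt shipped under bark/assets/prompts/ can be substituted here.
+prompt = np.load("bark/assets/prompts/v2/en_speaker_1.npz")
+
+for name in ("semantic_prompt", "coarse_prompt", "fine_prompt"):
+    arr = prompt[name]
+    print(name, arr.shape, arr.dtype)  # expected shapes: (n,), (2, m), (8, p)
+```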
+
+
+
diff --git a/bark/assets/prompts/ru_speaker_0.npz b/bark/assets/prompts/ru_speaker_0.npz
new file mode 100644
index 0000000000000000000000000000000000000000..4d7fbd1b61390cb294e4f211f8dd9445936d14fa
--- /dev/null
+++ b/bark/assets/prompts/ru_speaker_0.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f832edfe62de54ab56cd09862af428927f8e82ddbc371365c6a19db3b4fc1ab6
+size 57852
diff --git a/bark/assets/prompts/ru_speaker_1.npz b/bark/assets/prompts/ru_speaker_1.npz
new file mode 100644
index 0000000000000000000000000000000000000000..4dee2ba3ce87f3ad5903e364d7c11ccee58cc7e4
--- /dev/null
+++ b/bark/assets/prompts/ru_speaker_1.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c72519f5060c8896d8131671e047e347614f555471a5f30da76fbc77acb5e8ee
+size 24260
diff --git a/bark/assets/prompts/ru_speaker_2.npz b/bark/assets/prompts/ru_speaker_2.npz
new file mode 100644
index 0000000000000000000000000000000000000000..c52be3b31ea18c3817f715f8199fdd9f5c51abc9
--- /dev/null
+++ b/bark/assets/prompts/ru_speaker_2.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:87db3f55be72596b53afc7d2166ae38a1b6e5ba04880f4c392ff662ff14e41f4
+size 51668
diff --git a/bark/assets/prompts/ru_speaker_3.npz b/bark/assets/prompts/ru_speaker_3.npz
new file mode 100644
index 0000000000000000000000000000000000000000..d6c1d1e8362ec8ac94bc3149d3705090c117b0ad
--- /dev/null
+++ b/bark/assets/prompts/ru_speaker_3.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b68d63b8ae2d46b68a67be76ef4cb7823c7640f2e855da05648bdea9a7c0871b
+size 29164
diff --git a/bark/assets/prompts/ru_speaker_4.npz b/bark/assets/prompts/ru_speaker_4.npz
new file mode 100644
index 0000000000000000000000000000000000000000..776cc437ca9a34349a6079a6ed1c76ba1c2766c3
--- /dev/null
+++ b/bark/assets/prompts/ru_speaker_4.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6c15a3c0cb477b01ab4baecadc9781a398c9e82e1db6cc00f98c78d165af0e6b
+size 27940
diff --git a/bark/assets/prompts/ru_speaker_5.npz b/bark/assets/prompts/ru_speaker_5.npz
new file mode 100644
index 0000000000000000000000000000000000000000..12fa85e005492c85bc28589a43551999e03c7c17
--- /dev/null
+++ b/bark/assets/prompts/ru_speaker_5.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3bf201c1ea1ea44c77c0264f33dfeeee99d27d02498e77f23d63a56de4ebdeeb
+size 23356
diff --git a/bark/assets/prompts/ru_speaker_6.npz b/bark/assets/prompts/ru_speaker_6.npz
new file mode 100644
index 0000000000000000000000000000000000000000..0046bb3967ea771370fff4da6fe8fe7d3060bd55
--- /dev/null
+++ b/bark/assets/prompts/ru_speaker_6.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a048c4676d46fbc86492813145e018ecf8790f00153e69bf080926f2a5ba594e
+size 45748
diff --git a/bark/assets/prompts/ru_speaker_7.npz b/bark/assets/prompts/ru_speaker_7.npz
new file mode 100644
index 0000000000000000000000000000000000000000..da5c1e4d7e3db00dead674973bc520a7b3025f5b
--- /dev/null
+++ b/bark/assets/prompts/ru_speaker_7.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:16078f0e920479b090000cba9fe6cd47be53f8ced6441ad0452267dd5b170870
+size 25380
diff --git a/bark/assets/prompts/ru_speaker_8.npz b/bark/assets/prompts/ru_speaker_8.npz
new file mode 100644
index 0000000000000000000000000000000000000000..549034bbcc1aaec1c9bcda7082f75d7a028e808b
--- /dev/null
+++ b/bark/assets/prompts/ru_speaker_8.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d4c8abbf2a202ccbce4f569233f13adad49cbec45dc9f5029c1e357882c4dbc7
+size 42924
diff --git a/bark/assets/prompts/ru_speaker_9.npz b/bark/assets/prompts/ru_speaker_9.npz
new file mode 100644
index 0000000000000000000000000000000000000000..af16ace1e06603ffa3e317d8dbb80a19bee69b65
--- /dev/null
+++ b/bark/assets/prompts/ru_speaker_9.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:756000ceb9eea65fa8a257cdee25ba7ec03e2c653c3d5913e0082540811f791d
+size 38500
diff --git a/bark/assets/prompts/speaker_0.npz b/bark/assets/prompts/speaker_0.npz
new file mode 100644
index 0000000000000000000000000000000000000000..4c531fb26cfafea44a9a1e90b4efe0ee4a79dc4e
--- /dev/null
+++ b/bark/assets/prompts/speaker_0.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55bc30061b5c5928454e4c7a1d6206e359a25ca38fec3ca96de0a625fa96c572
+size 19620
diff --git a/bark/assets/prompts/speaker_1.npz b/bark/assets/prompts/speaker_1.npz
new file mode 100644
index 0000000000000000000000000000000000000000..d858600f97683c44cd72ccfa8badfa3b189f0467
--- /dev/null
+++ b/bark/assets/prompts/speaker_1.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d5d5531998bd91684806eb64a2ac659d8c242f4112d6216697d3cae0b99b978
+size 21380
diff --git a/bark/assets/prompts/speaker_2.npz b/bark/assets/prompts/speaker_2.npz
new file mode 100644
index 0000000000000000000000000000000000000000..2452ed7bcab190bedde76dc7d7d3fe4d82643278
--- /dev/null
+++ b/bark/assets/prompts/speaker_2.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c3001ff8a04e64e0687b0ad145c92684c8758ce7af68fb330dcfee4739fd896b
+size 19460
diff --git a/bark/assets/prompts/speaker_3.npz b/bark/assets/prompts/speaker_3.npz
new file mode 100644
index 0000000000000000000000000000000000000000..29d23b0a1795126c86f735f3e5f8af17de9184b5
--- /dev/null
+++ b/bark/assets/prompts/speaker_3.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:08b20f307ff4a1e5a947f4394ce2f2c3c5e0e6a9f78e0fd77604fb08359ab90d
+size 32740
diff --git a/bark/assets/prompts/speaker_4.npz b/bark/assets/prompts/speaker_4.npz
new file mode 100644
index 0000000000000000000000000000000000000000..d12adb15a7bc72de351c53b046a6edbb46713cd4
--- /dev/null
+++ b/bark/assets/prompts/speaker_4.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5b6acddfa41ce84e558e09e91fae5fbb01704bc1cef0f000bcc7f30d05e51afc
+size 19676
diff --git a/bark/assets/prompts/speaker_5.npz b/bark/assets/prompts/speaker_5.npz
new file mode 100644
index 0000000000000000000000000000000000000000..1662063711535dffe2ec4c0711e940ca0bd78a7b
--- /dev/null
+++ b/bark/assets/prompts/speaker_5.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:048c7362b237c43ceb0c3a4986b5c42c21ef013cadaf7c77b6348419f801dc93
+size 54548
diff --git a/bark/assets/prompts/speaker_6.npz b/bark/assets/prompts/speaker_6.npz
new file mode 100644
index 0000000000000000000000000000000000000000..9877675833fe910b4fd15b6938e35a8bf1434073
--- /dev/null
+++ b/bark/assets/prompts/speaker_6.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8d7359be4a984930a81103043409b695e383d493f4edd6d4786537b1730a95c0
+size 23516
diff --git a/bark/assets/prompts/speaker_7.npz b/bark/assets/prompts/speaker_7.npz
new file mode 100644
index 0000000000000000000000000000000000000000..f83e4af9176bc23fb0dbafaeadd0c3f24dcb14e4
--- /dev/null
+++ b/bark/assets/prompts/speaker_7.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:560ccbd20b16a2313cdc44ed578c8fb4dcbe51c2d1c57756dc242d185a6b88d3
+size 22556
diff --git a/bark/assets/prompts/speaker_8.npz b/bark/assets/prompts/speaker_8.npz
new file mode 100644
index 0000000000000000000000000000000000000000..dff9d012159fd857ba4070c99fb96a66a8c8de41
--- /dev/null
+++ b/bark/assets/prompts/speaker_8.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:26eb3e2589f21f88aa963f052cc5134c6510b1cdb0033be277733bc7dc77157c
+size 20580
diff --git a/bark/assets/prompts/speaker_9.npz b/bark/assets/prompts/speaker_9.npz
new file mode 100644
index 0000000000000000000000000000000000000000..98fc91445386fe8ea4aabe7a9172d10e4298b557
--- /dev/null
+++ b/bark/assets/prompts/speaker_9.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:15ab7bbb47bf326e454cc1d299f4069d0fa9ea8e934273dbed4cbf1116404322
+size 18396
diff --git a/bark/assets/prompts/tr_speaker_0.npz b/bark/assets/prompts/tr_speaker_0.npz
new file mode 100644
index 0000000000000000000000000000000000000000..8d037093d14edd7fb770878a545573d11648fd56
--- /dev/null
+++ b/bark/assets/prompts/tr_speaker_0.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:21c8f2c4e8b31b0a11c1565ba5ee104e11db4e3f83c6d8b44d52385692322d3b
+size 26020
diff --git a/bark/assets/prompts/tr_speaker_1.npz b/bark/assets/prompts/tr_speaker_1.npz
new file mode 100644
index 0000000000000000000000000000000000000000..0eb88054072445f64d62370aabf7256505453acf
--- /dev/null
+++ b/bark/assets/prompts/tr_speaker_1.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:578ca20688ff603c6365a9f53076300cd17dec784532b4bb2e75de8a25f4781c
+size 24156
diff --git a/bark/assets/prompts/tr_speaker_2.npz b/bark/assets/prompts/tr_speaker_2.npz
new file mode 100644
index 0000000000000000000000000000000000000000..324a432c1456a37f490bd3358e440a5f8e4b07d2
--- /dev/null
+++ b/bark/assets/prompts/tr_speaker_2.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:97013a34b28feb95881e5bcd4bea53e81acfcb5c4c896a6733e2a5e351242e6c
+size 32740
diff --git a/bark/assets/prompts/tr_speaker_3.npz b/bark/assets/prompts/tr_speaker_3.npz
new file mode 100644
index 0000000000000000000000000000000000000000..741c83a19f8867f98151203b043c9af46885de4e
--- /dev/null
+++ b/bark/assets/prompts/tr_speaker_3.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:117cd3cf2367f009d86849c75f85709cd227628b6b26ce7074b6196c2bb12132
+size 20100
diff --git a/bark/assets/prompts/tr_speaker_4.npz b/bark/assets/prompts/tr_speaker_4.npz
new file mode 100644
index 0000000000000000000000000000000000000000..f06c02cf18b9a686aaa6a1c24b8dd402f8a1a4c2
--- /dev/null
+++ b/bark/assets/prompts/tr_speaker_4.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:102a94642852e99a171875a53a3f219196407d75bbec62191dcd3bd542aa9c64
+size 16100
diff --git a/bark/assets/prompts/tr_speaker_5.npz b/bark/assets/prompts/tr_speaker_5.npz
new file mode 100644
index 0000000000000000000000000000000000000000..4395256a8e99798ed3c0733299ef3166c109365c
--- /dev/null
+++ b/bark/assets/prompts/tr_speaker_5.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c61bf6adc04f81f1a5fbcbac1c0257fd76769f122ab083f16b3e29e2a7eeae7a
+size 29220
diff --git a/bark/assets/prompts/tr_speaker_6.npz b/bark/assets/prompts/tr_speaker_6.npz
new file mode 100644
index 0000000000000000000000000000000000000000..34723602614bf90e80ab0cfd986e57af7513cafe
--- /dev/null
+++ b/bark/assets/prompts/tr_speaker_6.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6250cb26f5b4563e9be8e5ae24f3c3af386c5b52ea21ad99237edc08296e3b6d
+size 21596
diff --git a/bark/assets/prompts/tr_speaker_7.npz b/bark/assets/prompts/tr_speaker_7.npz
new file mode 100644
index 0000000000000000000000000000000000000000..b240fa58e2a72cd390f0afa813b55409c473a1d4
--- /dev/null
+++ b/bark/assets/prompts/tr_speaker_7.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1eeca993d97dd24a1115494872c062c35297f462270ed4062f3158b0f8af08ac
+size 21276
diff --git a/bark/assets/prompts/tr_speaker_8.npz b/bark/assets/prompts/tr_speaker_8.npz
new file mode 100644
index 0000000000000000000000000000000000000000..ebd583be48327212ea90e34e3aa5b5f1493333fe
--- /dev/null
+++ b/bark/assets/prompts/tr_speaker_8.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cba4b642845725653e6d18b55b892208b33878e3914daeb1bd86e9c2d6383e33
+size 35724
diff --git a/bark/assets/prompts/tr_speaker_9.npz b/bark/assets/prompts/tr_speaker_9.npz
new file mode 100644
index 0000000000000000000000000000000000000000..45b093169b0c0338303052a3c82a584710a26658
--- /dev/null
+++ b/bark/assets/prompts/tr_speaker_9.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5d493f6328ba149b7680e55cd6a9b7419b88df871981040a0cc4a51493b210b6
+size 19460
diff --git a/bark/assets/prompts/v2/de_speaker_0.npz b/bark/assets/prompts/v2/de_speaker_0.npz
new file mode 100644
index 0000000000000000000000000000000000000000..430a12b87d7d962ab47a562e8a93952f01c8ed5b
--- /dev/null
+++ b/bark/assets/prompts/v2/de_speaker_0.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:82c8d443f71a46bca90e9323e0fd14c8beaaa55dbc690eb14b75b6b14497005a
+size 39620
diff --git a/bark/assets/prompts/v2/de_speaker_1.npz b/bark/assets/prompts/v2/de_speaker_1.npz
new file mode 100644
index 0000000000000000000000000000000000000000..edce987cc081817c674a0f0e98c8159e1d2982d6
--- /dev/null
+++ b/bark/assets/prompts/v2/de_speaker_1.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ed1c4324e3f989d484d7ed433efa2082f87c845f8688e18417624210e979335d
+size 27460
diff --git a/bark/assets/prompts/v2/de_speaker_2.npz b/bark/assets/prompts/v2/de_speaker_2.npz
new file mode 100644
index 0000000000000000000000000000000000000000..643d1533ad5bacce08a9c42a013b28aa53c38faa
--- /dev/null
+++ b/bark/assets/prompts/v2/de_speaker_2.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:90f08869a1377c86ec525f0ad7aed10b4dcf7d75717b47a34d4b677d7e33e921
+size 24740
diff --git a/bark/assets/prompts/v2/de_speaker_3.npz b/bark/assets/prompts/v2/de_speaker_3.npz
new file mode 100644
index 0000000000000000000000000000000000000000..809646482f87622cf2bd2cc07989c4b6a48d04c8
--- /dev/null
+++ b/bark/assets/prompts/v2/de_speaker_3.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7da1e9457f8a5e082988652f202a5cc5320ac362f81ecfce5b1ce6edce2342d1
+size 31300
diff --git a/bark/assets/prompts/v2/de_speaker_4.npz b/bark/assets/prompts/v2/de_speaker_4.npz
new file mode 100644
index 0000000000000000000000000000000000000000..a806a4c65d5543a95e0f25744267a7eb7ac7594b
--- /dev/null
+++ b/bark/assets/prompts/v2/de_speaker_4.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c53c565cedaa683bc2bf5577c0ad70c4d435b66055641234857dff5e743b2b5a
+size 30660
diff --git a/bark/assets/prompts/v2/de_speaker_5.npz b/bark/assets/prompts/v2/de_speaker_5.npz
new file mode 100644
index 0000000000000000000000000000000000000000..e5d6f6bd57fac5a6c30edf4849eab83f70451310
--- /dev/null
+++ b/bark/assets/prompts/v2/de_speaker_5.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a798e48483c89702c316478336939a5c5a579cb0dd9e76943eca1ece914e3bdc
+size 31300
diff --git a/bark/assets/prompts/v2/de_speaker_6.npz b/bark/assets/prompts/v2/de_speaker_6.npz
new file mode 100644
index 0000000000000000000000000000000000000000..c11dbdad84dec46ffdb302b699474875ca404ce2
--- /dev/null
+++ b/bark/assets/prompts/v2/de_speaker_6.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9d668bf7735343ca059cfc35c0d796a422cb05ae6172a244dfd7320958943304
+size 23196
diff --git a/bark/assets/prompts/v2/de_speaker_7.npz b/bark/assets/prompts/v2/de_speaker_7.npz
new file mode 100644
index 0000000000000000000000000000000000000000..f442fb9e896dc1f7ee40672bfc742e4a158a05c5
--- /dev/null
+++ b/bark/assets/prompts/v2/de_speaker_7.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7aec16132be2de475b9d8889ff281cce60efa06f7910f8d2701aac75d119d9b4
+size 40100
diff --git a/bark/assets/prompts/v2/de_speaker_8.npz b/bark/assets/prompts/v2/de_speaker_8.npz
new file mode 100644
index 0000000000000000000000000000000000000000..9f7efdeb0be1be9e287f3094b335ff55b9fa6e1a
--- /dev/null
+++ b/bark/assets/prompts/v2/de_speaker_8.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3a81d5f6c95d347cc269679bc41bf5dc50fe644e01b472985f6dd46c9b578937
+size 28524
diff --git a/bark/assets/prompts/v2/de_speaker_9.npz b/bark/assets/prompts/v2/de_speaker_9.npz
new file mode 100644
index 0000000000000000000000000000000000000000..afeab53e486cbe349a9b4924077404eda3b53960
--- /dev/null
+++ b/bark/assets/prompts/v2/de_speaker_9.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:73667c2a678d5264d583085772297aa7451c20d48286dba57ebc43d78767de38
+size 51084
diff --git a/bark/assets/prompts/v2/en_speaker_0.npz b/bark/assets/prompts/v2/en_speaker_0.npz
new file mode 100644
index 0000000000000000000000000000000000000000..2ccc5a8a08be9765800958b93858b5720b594665
--- /dev/null
+++ b/bark/assets/prompts/v2/en_speaker_0.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:932f40d879ba8659f1ca26319ba64ea3b0647b2050fe24313bf42b0dff1fe241
+size 28100
diff --git a/bark/assets/prompts/v2/en_speaker_1.npz b/bark/assets/prompts/v2/en_speaker_1.npz
new file mode 100644
index 0000000000000000000000000000000000000000..773451dd1073938fccf73895ec049042c9609bc0
--- /dev/null
+++ b/bark/assets/prompts/v2/en_speaker_1.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e7f18015e1ab9b6302ded1e28a971af5306a72f193bb6c411f1948a083c8578
+size 25220
diff --git a/bark/assets/prompts/v2/en_speaker_2.npz b/bark/assets/prompts/v2/en_speaker_2.npz
new file mode 100644
index 0000000000000000000000000000000000000000..8a2f9e4366031f67781097371e08a36342635ff4
--- /dev/null
+++ b/bark/assets/prompts/v2/en_speaker_2.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d218990680ece5f2d4fc18ea4783b016b3ae353ec413eaee2058f2d57263c9b3
+size 26236
diff --git a/bark/assets/prompts/v2/en_speaker_3.npz b/bark/assets/prompts/v2/en_speaker_3.npz
new file mode 100644
index 0000000000000000000000000000000000000000..103cfb362b1ede1b67145d4c2384c7797e8d5ea4
--- /dev/null
+++ b/bark/assets/prompts/v2/en_speaker_3.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:92c2e2a29145c83738e9b63f082fd1c873d9422468a155463cb27f814aeaea66
+size 34980
diff --git a/bark/assets/prompts/v2/en_speaker_4.npz b/bark/assets/prompts/v2/en_speaker_4.npz
new file mode 100644
index 0000000000000000000000000000000000000000..123777ca72c8bbd4d4548b48d6e0cae91b13ab0d
--- /dev/null
+++ b/bark/assets/prompts/v2/en_speaker_4.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:992f91991a9a5359d72f00b09a11a550e71bb8ebfc0cfd877e39d7d41f98b714
+size 23780
diff --git a/bark/assets/prompts/v2/en_speaker_5.npz b/bark/assets/prompts/v2/en_speaker_5.npz
new file mode 100644
index 0000000000000000000000000000000000000000..dcf05979f75c24b11888ab53da02ddb118c91459
--- /dev/null
+++ b/bark/assets/prompts/v2/en_speaker_5.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:18831c3f6014e4a2ff60ad5169b1fae06e28ed07f43f8a3616aafb84515091bf
+size 24740
diff --git a/bark/assets/prompts/v2/en_speaker_6.npz b/bark/assets/prompts/v2/en_speaker_6.npz
new file mode 100644
index 0000000000000000000000000000000000000000..090f03f886a4eba3105a0d28e7b739fb600c2cd8
--- /dev/null
+++ b/bark/assets/prompts/v2/en_speaker_6.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fab38dc6b6bc9226bcc414f4c5a9524bc1b2441865a586153fb620127a8faa4e
+size 25540
diff --git a/bark/assets/prompts/v2/en_speaker_7.npz b/bark/assets/prompts/v2/en_speaker_7.npz
new file mode 100644
index 0000000000000000000000000000000000000000..d5d9068bff806b7c6e1025720c5a2c1636ba8b36
--- /dev/null
+++ b/bark/assets/prompts/v2/en_speaker_7.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8f4c4eb33f5994be8de5cfd1744ebce13da1618a6da3a7d244514178c61ef7db
+size 22716
diff --git a/bark/assets/prompts/v2/en_speaker_8.npz b/bark/assets/prompts/v2/en_speaker_8.npz
new file mode 100644
index 0000000000000000000000000000000000000000..99bdf0061c5d3377aa1aebe5759faa3f41aa27e1
--- /dev/null
+++ b/bark/assets/prompts/v2/en_speaker_8.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8fc9f11b539588f51bbf78150a73e0365c49b2306bd72e5a22b28ef09c4fb15d
+size 23300
diff --git a/bark/assets/prompts/v2/en_speaker_9.npz b/bark/assets/prompts/v2/en_speaker_9.npz
new file mode 100644
index 0000000000000000000000000000000000000000..2439d40fb6cf3a754c4ce305d3c95e8c463690d1
--- /dev/null
+++ b/bark/assets/prompts/v2/en_speaker_9.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:78b3ba32eb9aeb9ed34556856c40633ecc8332d1c3ae3c81e6f5015ac3eefbd5
+size 30180
diff --git a/bark/assets/prompts/v2/es_speaker_0.npz b/bark/assets/prompts/v2/es_speaker_0.npz
new file mode 100644
index 0000000000000000000000000000000000000000..1dc7eeefbfc994e558c051b68ea4ff054890732f
--- /dev/null
+++ b/bark/assets/prompts/v2/es_speaker_0.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:753ccfffe8b5f1a8dbb44bf6fb7bb66f39d11adedb37204256f194f6c8bf0205
+size 22020
diff --git a/bark/assets/prompts/v2/es_speaker_1.npz b/bark/assets/prompts/v2/es_speaker_1.npz
new file mode 100644
index 0000000000000000000000000000000000000000..778130eb7e527c3724a952bdfcc4fc5cb33c2c0d
--- /dev/null
+++ b/bark/assets/prompts/v2/es_speaker_1.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:68574669097c67e420bc587c2043310d5c1de3f8a65280207f9c08bd577a5906
+size 25116
diff --git a/bark/assets/prompts/v2/es_speaker_2.npz b/bark/assets/prompts/v2/es_speaker_2.npz
new file mode 100644
index 0000000000000000000000000000000000000000..9fafdaaf3d1bf1de2fb685c79c7eafa7f06843cf
--- /dev/null
+++ b/bark/assets/prompts/v2/es_speaker_2.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8d1c4292dc08db834668d3f93b98d7b2589242408f8027c344c29081d7304da6
+size 26236
diff --git a/bark/assets/prompts/v2/es_speaker_3.npz b/bark/assets/prompts/v2/es_speaker_3.npz
new file mode 100644
index 0000000000000000000000000000000000000000..2757fb847e0f1755bf2012f5a5ac854ff4b2c92d
--- /dev/null
+++ b/bark/assets/prompts/v2/es_speaker_3.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:57ae62de759e42f45b3e46095a023f5a242f028b856ac6d8bfdb9f61edaf2089
+size 23780
diff --git a/bark/assets/prompts/v2/es_speaker_4.npz b/bark/assets/prompts/v2/es_speaker_4.npz
new file mode 100644
index 0000000000000000000000000000000000000000..cd59ad463c97266762660b716bcc2575a7d42c30
--- /dev/null
+++ b/bark/assets/prompts/v2/es_speaker_4.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61c863e864973d7c4b30b97af1bfcfdf90a037a576f58c88890e8ff603f2c157
+size 23356
diff --git a/bark/assets/prompts/v2/es_speaker_5.npz b/bark/assets/prompts/v2/es_speaker_5.npz
new file mode 100644
index 0000000000000000000000000000000000000000..04bf8f6ac5521d653c8d71a315b7a240fa14379d
--- /dev/null
+++ b/bark/assets/prompts/v2/es_speaker_5.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:71566bd17e787b472c6705da4b09b5017b3d9e68b274b593bc620bbe7beed6bc
+size 25700
diff --git a/bark/assets/prompts/v2/es_speaker_6.npz b/bark/assets/prompts/v2/es_speaker_6.npz
new file mode 100644
index 0000000000000000000000000000000000000000..39a0e5d20674ed69a3ca9e8bdbc4f296661bbbc8
--- /dev/null
+++ b/bark/assets/prompts/v2/es_speaker_6.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b866852d37c40d526b2f0f19549038cd000d5d33895b4a4ef391494efeb681b2
+size 20580
diff --git a/bark/assets/prompts/v2/es_speaker_7.npz b/bark/assets/prompts/v2/es_speaker_7.npz
new file mode 100644
index 0000000000000000000000000000000000000000..bf00317a9b7303962d9bbc5da126fe507c752a6a
--- /dev/null
+++ b/bark/assets/prompts/v2/es_speaker_7.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d02d4d65e70b4469152ee0ae359ca83d7c8c9824c666047f84fa20889f261cb4
+size 22020
diff --git a/bark/assets/prompts/v2/es_speaker_8.npz b/bark/assets/prompts/v2/es_speaker_8.npz
new file mode 100644
index 0000000000000000000000000000000000000000..0d38cec3b9626904404edcb43f97ffe1927dc47f
--- /dev/null
+++ b/bark/assets/prompts/v2/es_speaker_8.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c097c1ee632612aa9c6c839dc0c885832b040a0d8d2bd8749b891429bb609a0f
+size 25436
diff --git a/bark/assets/prompts/v2/es_speaker_9.npz b/bark/assets/prompts/v2/es_speaker_9.npz
new file mode 100644
index 0000000000000000000000000000000000000000..1c4359d68f2d2cd2d443feaed47d77292c8067fd
--- /dev/null
+++ b/bark/assets/prompts/v2/es_speaker_9.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c4b77f52b0c18619e948d711e70b76fa57597126d2c22e6bbe409055d4abdec0
+size 19940
diff --git a/bark/assets/prompts/v2/fr_speaker_0.npz b/bark/assets/prompts/v2/fr_speaker_0.npz
new file mode 100644
index 0000000000000000000000000000000000000000..a0675f8dc5b0408e84517ffc1b77950327226c72
--- /dev/null
+++ b/bark/assets/prompts/v2/fr_speaker_0.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:68240e79c5b11d6c1a3d6c0cc47de60fd92040229ffe12b2770a0ded35a62161
+size 45804
diff --git a/bark/assets/prompts/v2/fr_speaker_1.npz b/bark/assets/prompts/v2/fr_speaker_1.npz
new file mode 100644
index 0000000000000000000000000000000000000000..c33f7e13b693439c32acf3cbeef0c143f90370d4
--- /dev/null
+++ b/bark/assets/prompts/v2/fr_speaker_1.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d87cc3e81a3c166138a80f23614b11ba16f03c662a7974e9f9f3e419203a4228
+size 25700
diff --git a/bark/assets/prompts/v2/fr_speaker_2.npz b/bark/assets/prompts/v2/fr_speaker_2.npz
new file mode 100644
index 0000000000000000000000000000000000000000..259e17217b278da528a49fff0037171687f4db21
--- /dev/null
+++ b/bark/assets/prompts/v2/fr_speaker_2.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:265004e0a094d846d655f9f6b8802caee76e1cd4721a53b2c5201f99c9b87edf
+size 52204
diff --git a/bark/assets/prompts/v2/fr_speaker_3.npz b/bark/assets/prompts/v2/fr_speaker_3.npz
new file mode 100644
index 0000000000000000000000000000000000000000..b48aeda87afa3f1252f04f9512937817993576da
--- /dev/null
+++ b/bark/assets/prompts/v2/fr_speaker_3.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4f5438a83c768b722b75798d3a2696950aa5f86628d522075c7ca13047a0a166
+size 50764
diff --git a/bark/assets/prompts/v2/fr_speaker_4.npz b/bark/assets/prompts/v2/fr_speaker_4.npz
new file mode 100644
index 0000000000000000000000000000000000000000..ddb90bbcb3a8e21835e9d39c522286d6a8d2ab18
--- /dev/null
+++ b/bark/assets/prompts/v2/fr_speaker_4.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2e07317223ef83c23b380029043a3897487baef441351dc6982d01f6fd0079b9
+size 49908
diff --git a/bark/assets/prompts/v2/fr_speaker_5.npz b/bark/assets/prompts/v2/fr_speaker_5.npz
new file mode 100644
index 0000000000000000000000000000000000000000..0b07103e6488b2d8daeec71a6b4cc82bf408bd12
--- /dev/null
+++ b/bark/assets/prompts/v2/fr_speaker_5.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:acf899fe1f544a49d5d2acb6d7d770a89f7efc068355e272a8204a343f89b5ce
+size 45108
diff --git a/bark/assets/prompts/v2/fr_speaker_6.npz b/bark/assets/prompts/v2/fr_speaker_6.npz
new file mode 100644
index 0000000000000000000000000000000000000000..b0bd5dea4b081188a8056f922e4877df559611db
--- /dev/null
+++ b/bark/assets/prompts/v2/fr_speaker_6.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:224a9b8eeda606850388911d3a82630ba7fa32a276cbfb8cd06399712c7f7ca8
+size 55932
diff --git a/bark/assets/prompts/v2/fr_speaker_7.npz b/bark/assets/prompts/v2/fr_speaker_7.npz
new file mode 100644
index 0000000000000000000000000000000000000000..05718606139d0b5c8bd9df1e92d2a3cc34343420
--- /dev/null
+++ b/bark/assets/prompts/v2/fr_speaker_7.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:422d55e1d59cd761cc7386c71b832bfb6a770fe51352d7313785612460c475e8
+size 32524
diff --git a/bark/assets/prompts/v2/fr_speaker_8.npz b/bark/assets/prompts/v2/fr_speaker_8.npz
new file mode 100644
index 0000000000000000000000000000000000000000..14f6f4863ca68e491fb438f808e37378b493541b
--- /dev/null
+++ b/bark/assets/prompts/v2/fr_speaker_8.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:929a8e2303508c7602e8f65942844f453a719ede39f5d7357bbee859214ea145
+size 43244
diff --git a/bark/assets/prompts/v2/fr_speaker_9.npz b/bark/assets/prompts/v2/fr_speaker_9.npz
new file mode 100644
index 0000000000000000000000000000000000000000..625fceeff84119d1c464a992864249f493bb7988
--- /dev/null
+++ b/bark/assets/prompts/v2/fr_speaker_9.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:09cc3b4d0cb1d7fc86caa65a36f6cee615d947c367adfe226f947917ec7b08b2
+size 32100
diff --git a/bark/assets/prompts/v2/hi_speaker_0.npz b/bark/assets/prompts/v2/hi_speaker_0.npz
new file mode 100644
index 0000000000000000000000000000000000000000..1a481c40ff390cbfa7c61df81d966ff6df7a28bb
--- /dev/null
+++ b/bark/assets/prompts/v2/hi_speaker_0.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:12a32c0f2ade5948d850b06069f31dea88c70be841dc2dfbc048af3024d0fd87
+size 32580
diff --git a/bark/assets/prompts/v2/hi_speaker_1.npz b/bark/assets/prompts/v2/hi_speaker_1.npz
new file mode 100644
index 0000000000000000000000000000000000000000..9a22578290b9a23d0b9b107d7417ced7bf0cfd52
--- /dev/null
+++ b/bark/assets/prompts/v2/hi_speaker_1.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6fea5c5f9555e37c782b88e87db73fa3a1adabf287625cf72c289aeef139a938
+size 25860
diff --git a/bark/assets/prompts/v2/hi_speaker_2.npz b/bark/assets/prompts/v2/hi_speaker_2.npz
new file mode 100644
index 0000000000000000000000000000000000000000..1b65cf9976f0ffeb4ec011f03e650d155b88f076
--- /dev/null
+++ b/bark/assets/prompts/v2/hi_speaker_2.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5904ab83736410ce52588efa27f61fff59df21e186508aabbe015b3db78d4d40
+size 27780
diff --git a/bark/assets/prompts/v2/hi_speaker_3.npz b/bark/assets/prompts/v2/hi_speaker_3.npz
new file mode 100644
index 0000000000000000000000000000000000000000..01a2bab3858d106b0cef25a04020d5afd6cdcc53
--- /dev/null
+++ b/bark/assets/prompts/v2/hi_speaker_3.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d40a8eda9ed64d9294b4bb85922d2dde48d8be5e5bddfcfc400bc5f6442b5178
+size 29804
diff --git a/bark/assets/prompts/v2/hi_speaker_4.npz b/bark/assets/prompts/v2/hi_speaker_4.npz
new file mode 100644
index 0000000000000000000000000000000000000000..427927394b79e6b9586146b655ea8b35ae381277
--- /dev/null
+++ b/bark/assets/prompts/v2/hi_speaker_4.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:09dbe02c981d79ea9b4794553fb54d6a05d573a5e8f2bace8a78f7ebee65878a
+size 25380
diff --git a/bark/assets/prompts/v2/hi_speaker_5.npz b/bark/assets/prompts/v2/hi_speaker_5.npz
new file mode 100644
index 0000000000000000000000000000000000000000..6b6197a9494cd656a8072766bb2344039019b88d
--- /dev/null
+++ b/bark/assets/prompts/v2/hi_speaker_5.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cba5c6f62757ddf29502e5cdb7ad184edc0bb7b6b05ec6020ee3fb2c404d7642
+size 51404
diff --git a/bark/assets/prompts/v2/hi_speaker_6.npz b/bark/assets/prompts/v2/hi_speaker_6.npz
new file mode 100644
index 0000000000000000000000000000000000000000..62d8127e826c11c2710f9ebe8138a40360980512
--- /dev/null
+++ b/bark/assets/prompts/v2/hi_speaker_6.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f098561145b8f2755089b94753db97525e1427efd3b5a9daf923990abd04828
+size 26396
diff --git a/bark/assets/prompts/v2/hi_speaker_7.npz b/bark/assets/prompts/v2/hi_speaker_7.npz
new file mode 100644
index 0000000000000000000000000000000000000000..aaaae9306427e67be00c4b1d0b0f34a455344b3e
--- /dev/null
+++ b/bark/assets/prompts/v2/hi_speaker_7.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b92ed8ca28d71a3fc720a99386467447fc8753e7bc8b4d8ff7cd765c835b467
+size 29380
diff --git a/bark/assets/prompts/v2/hi_speaker_8.npz b/bark/assets/prompts/v2/hi_speaker_8.npz
new file mode 100644
index 0000000000000000000000000000000000000000..7d4d8d7c3f24de34791a9ad98a76f75ebf2db131
--- /dev/null
+++ b/bark/assets/prompts/v2/hi_speaker_8.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc8e094f5d6b3109b6a8a5f040db0f2e3f214f90ebd7708e19c6db88aabdaeca
+size 39404
diff --git a/bark/assets/prompts/v2/hi_speaker_9.npz b/bark/assets/prompts/v2/hi_speaker_9.npz
new file mode 100644
index 0000000000000000000000000000000000000000..0af99a71f8a65d4c2dc0df15f820d59be8d59db3
--- /dev/null
+++ b/bark/assets/prompts/v2/hi_speaker_9.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2dbf6c82c5eac9d412c50ae76a42525d28c64c08900d191e1838cbeca4b133a1
+size 23516
diff --git a/bark/assets/prompts/v2/it_speaker_0.npz b/bark/assets/prompts/v2/it_speaker_0.npz
new file mode 100644
index 0000000000000000000000000000000000000000..debaa79a85e3ec8592da7e8464867bdb6331ac4e
--- /dev/null
+++ b/bark/assets/prompts/v2/it_speaker_0.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ad1d1194448c69b8b6e5e6a56cbbd7ebf1d2f654e89a0b773702124b8dcec799
+size 28740
diff --git a/bark/assets/prompts/v2/it_speaker_1.npz b/bark/assets/prompts/v2/it_speaker_1.npz
new file mode 100644
index 0000000000000000000000000000000000000000..72e1d748940bdc3c11818af62f28bfd6e59b0fab
--- /dev/null
+++ b/bark/assets/prompts/v2/it_speaker_1.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:838b990eb7e86d463ebbec70926aad5bea178c902065f2a48818c86c9a2056be
+size 33804
diff --git a/bark/assets/prompts/v2/it_speaker_2.npz b/bark/assets/prompts/v2/it_speaker_2.npz
new file mode 100644
index 0000000000000000000000000000000000000000..88ce1d8ec7773380a5bf03ed9387cffdf0077129
--- /dev/null
+++ b/bark/assets/prompts/v2/it_speaker_2.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:26cefcec88aab6adc0d159e0ca34e1e97fec8a1f240af11fa6f3f321f789e787
+size 40788
diff --git a/bark/assets/prompts/v2/it_speaker_3.npz b/bark/assets/prompts/v2/it_speaker_3.npz
new file mode 100644
index 0000000000000000000000000000000000000000..83100eda7b2af732dab73f160f1266e8671b3051
--- /dev/null
+++ b/bark/assets/prompts/v2/it_speaker_3.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fc35eea4020470fbddfea5c3948115fdc7eb122d4f88ea20f641bde5f71a1133
+size 30764
diff --git a/bark/assets/prompts/v2/it_speaker_4.npz b/bark/assets/prompts/v2/it_speaker_4.npz
new file mode 100644
index 0000000000000000000000000000000000000000..1621505e856d21a72b5d56535cdd355de4052694
--- /dev/null
+++ b/bark/assets/prompts/v2/it_speaker_4.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9fe9d9de363305e7dbe8d2aa5311c552260e42c4a94a4b5a40c78a1b47d44689
+size 28740
diff --git a/bark/assets/prompts/v2/it_speaker_5.npz b/bark/assets/prompts/v2/it_speaker_5.npz
new file mode 100644
index 0000000000000000000000000000000000000000..a5306af173c1319f1de9f4e615ee525c2a29f6b0
--- /dev/null
+++ b/bark/assets/prompts/v2/it_speaker_5.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:24314dc0e83d78278eccb55cdb8fab18b80019fcf8b9e819a21916dbe20e61cd
+size 30444
diff --git a/bark/assets/prompts/v2/it_speaker_6.npz b/bark/assets/prompts/v2/it_speaker_6.npz
new file mode 100644
index 0000000000000000000000000000000000000000..94e1949c29255fc401a107af97d6a9bb54268b32
--- /dev/null
+++ b/bark/assets/prompts/v2/it_speaker_6.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:955873c0b1f5736d5b5597d70a9520bc61002c472c4714a2821714c5f4ef3b70
+size 29644
diff --git a/bark/assets/prompts/v2/it_speaker_7.npz b/bark/assets/prompts/v2/it_speaker_7.npz
new file mode 100644
index 0000000000000000000000000000000000000000..ea271eaf5d6c3471585b66ed41fe3cde358f3c9a
--- /dev/null
+++ b/bark/assets/prompts/v2/it_speaker_7.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0809a63254b1f8ffa4f948d400e8be7c8cd8e728da5fc25e8add8120f2af5533
+size 43724
diff --git a/bark/assets/prompts/v2/it_speaker_8.npz b/bark/assets/prompts/v2/it_speaker_8.npz
new file mode 100644
index 0000000000000000000000000000000000000000..84a45070bca2f3e20935a14374fb8559f469b465
--- /dev/null
+++ b/bark/assets/prompts/v2/it_speaker_8.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0346a9e3f1772be3dbd656fda5fc0768eb1e9623e2afaf7e37acc0f88c5c266b
+size 42708
diff --git a/bark/assets/prompts/v2/it_speaker_9.npz b/bark/assets/prompts/v2/it_speaker_9.npz
new file mode 100644
index 0000000000000000000000000000000000000000..0711fa95af43a4f8f71fcd529b36e37cbb251b5e
--- /dev/null
+++ b/bark/assets/prompts/v2/it_speaker_9.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55cd34677e80f3ee87ae089e6a6c5378c5440039ca9a898f631dd62d21192b09
+size 37644
diff --git a/bark/assets/prompts/v2/ja_speaker_0.npz b/bark/assets/prompts/v2/ja_speaker_0.npz
new file mode 100644
index 0000000000000000000000000000000000000000..3efa1e84712165e47f9f566a60abdafd5698b0ae
--- /dev/null
+++ b/bark/assets/prompts/v2/ja_speaker_0.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9fed62585fb3e36b8f9bc390c7e897f7abe2dd4b13308ef37bd8f83b4fd13c4a
+size 24420
diff --git a/bark/assets/prompts/v2/ja_speaker_1.npz b/bark/assets/prompts/v2/ja_speaker_1.npz
new file mode 100644
index 0000000000000000000000000000000000000000..a7846bdedcda1157f741882dcb7a77f5028715fe
--- /dev/null
+++ b/bark/assets/prompts/v2/ja_speaker_1.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7fb54f924f2a14de55b616d466ff795cef7b18709bc9091cb2c2dd10ec5060d3
+size 31244
diff --git a/bark/assets/prompts/v2/ja_speaker_2.npz b/bark/assets/prompts/v2/ja_speaker_2.npz
new file mode 100644
index 0000000000000000000000000000000000000000..030b2a7b3b3443f1a73ee3acbba63e3bd3dadc6d
--- /dev/null
+++ b/bark/assets/prompts/v2/ja_speaker_2.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:52dac33a71598d8c0e5044233f589f97cf7d93b09d46974b4351e0bfaf425d73
+size 24100
diff --git a/bark/assets/prompts/v2/ja_speaker_3.npz b/bark/assets/prompts/v2/ja_speaker_3.npz
new file mode 100644
index 0000000000000000000000000000000000000000..31b217397f2079dfeab46dab70c3d6202a4a1bb7
--- /dev/null
+++ b/bark/assets/prompts/v2/ja_speaker_3.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:615194f254981be6d55130a0e557092c6850a80ed36ca8cbcf835e73bfaf8036
+size 24476
diff --git a/bark/assets/prompts/v2/ja_speaker_4.npz b/bark/assets/prompts/v2/ja_speaker_4.npz
new file mode 100644
index 0000000000000000000000000000000000000000..2f0c9d95e052d04adc545efb4ecc4093acfbcbf8
--- /dev/null
+++ b/bark/assets/prompts/v2/ja_speaker_4.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:13dacf5605b1a94bf6a739523e3d5f771d793d401c699cce259e5b7560fb1986
+size 26716
diff --git a/bark/assets/prompts/v2/ja_speaker_5.npz b/bark/assets/prompts/v2/ja_speaker_5.npz
new file mode 100644
index 0000000000000000000000000000000000000000..a6a8032f4876fa7801ff875ce8d604cb92fb3968
--- /dev/null
+++ b/bark/assets/prompts/v2/ja_speaker_5.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:42d80ce210df3f10373652ef35a554e14038ee09d8d7367b4aebbf2fbc45ef95
+size 24956
diff --git a/bark/assets/prompts/v2/ja_speaker_6.npz b/bark/assets/prompts/v2/ja_speaker_6.npz
new file mode 100644
index 0000000000000000000000000000000000000000..fb29cbc5e082a1e0acd5e13bbe843bebf703d65c
--- /dev/null
+++ b/bark/assets/prompts/v2/ja_speaker_6.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8654c0f4a6331a40586b3b265b7ad3224ed9884362b96b3ce8d16fc009ea56f4
+size 40788
diff --git a/bark/assets/prompts/v2/ja_speaker_7.npz b/bark/assets/prompts/v2/ja_speaker_7.npz
new file mode 100644
index 0000000000000000000000000000000000000000..e56891982dc0963f35ba811628e66e37c20aea03
--- /dev/null
+++ b/bark/assets/prompts/v2/ja_speaker_7.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d892d981d73e5a7b388d388bb68266ca7aca158ab5182e82108f454f1e0e7d07
+size 25060
diff --git a/bark/assets/prompts/v2/ja_speaker_8.npz b/bark/assets/prompts/v2/ja_speaker_8.npz
new file mode 100644
index 0000000000000000000000000000000000000000..a8cad1b14001c773b92a4022a245236718030bbd
--- /dev/null
+++ b/bark/assets/prompts/v2/ja_speaker_8.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:03a17b4f638f01d3bb4c224041c243e747d2c4bf10ac8d03f6daff5475a1a99c
+size 20260
diff --git a/bark/assets/prompts/v2/ja_speaker_9.npz b/bark/assets/prompts/v2/ja_speaker_9.npz
new file mode 100644
index 0000000000000000000000000000000000000000..bb13eca0c7c51749477afd0221cc4daf89d64958
--- /dev/null
+++ b/bark/assets/prompts/v2/ja_speaker_9.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3c7547365d4d3a27572843ed3899ea28a43fb05239c1bce9d005db6da143cd8a
+size 31140
diff --git a/bark/assets/prompts/v2/ko_speaker_0.npz b/bark/assets/prompts/v2/ko_speaker_0.npz
new file mode 100644
index 0000000000000000000000000000000000000000..5404666dbd515dbf23df66450d352757733eb536
--- /dev/null
+++ b/bark/assets/prompts/v2/ko_speaker_0.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fe31a6881986dcc0d75d3142825aa98b5f642ec3d3ae4385bfa7accbf20c4a26
+size 26556
diff --git a/bark/assets/prompts/v2/ko_speaker_1.npz b/bark/assets/prompts/v2/ko_speaker_1.npz
new file mode 100644
index 0000000000000000000000000000000000000000..379c1f9c4135d05913b7a797256d647bd1e6bebe
--- /dev/null
+++ b/bark/assets/prompts/v2/ko_speaker_1.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7f987c09b64025759da9af1a4443fb5b6c9675a7b0dd50f5de4231e57ed9e81e
+size 26340
diff --git a/bark/assets/prompts/v2/ko_speaker_2.npz b/bark/assets/prompts/v2/ko_speaker_2.npz
new file mode 100644
index 0000000000000000000000000000000000000000..1df0bcf6b8283ce260470ee3810f24b1ab2837b9
--- /dev/null
+++ b/bark/assets/prompts/v2/ko_speaker_2.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e3aac1bf04c2c8aae49dbd4dd4f2ccac346e6726b22563b23f87d2506bd930f
+size 19196
diff --git a/bark/assets/prompts/v2/ko_speaker_3.npz b/bark/assets/prompts/v2/ko_speaker_3.npz
new file mode 100644
index 0000000000000000000000000000000000000000..6e3b48080c53a74cd4c367716f16c41c194b8d85
--- /dev/null
+++ b/bark/assets/prompts/v2/ko_speaker_3.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4974e0659fcd9aaa72514ac146b7ad5e9696773c18549a3a0797802f5db70955
+size 39564
diff --git a/bark/assets/prompts/v2/ko_speaker_4.npz b/bark/assets/prompts/v2/ko_speaker_4.npz
new file mode 100644
index 0000000000000000000000000000000000000000..d5e068b5443d6912c69c1521b8fdd6ee8d326dde
--- /dev/null
+++ b/bark/assets/prompts/v2/ko_speaker_4.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eeb9837e9ad0214859a1fcb71c5d7413ca60be55a4e611228f28aa434feea3a7
+size 23140
diff --git a/bark/assets/prompts/v2/ko_speaker_5.npz b/bark/assets/prompts/v2/ko_speaker_5.npz
new file mode 100644
index 0000000000000000000000000000000000000000..d98b54b310de4b60a008f9d0265bc29016207739
--- /dev/null
+++ b/bark/assets/prompts/v2/ko_speaker_5.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c1fd5319c1f6366cc52c99517c2f951adbd5ac67b76829fc06f9be367b79132d
+size 23196
diff --git a/bark/assets/prompts/v2/ko_speaker_6.npz b/bark/assets/prompts/v2/ko_speaker_6.npz
new file mode 100644
index 0000000000000000000000000000000000000000..7985bdf3fa3e30d046b62ac354cd426d6dd2da7e
--- /dev/null
+++ b/bark/assets/prompts/v2/ko_speaker_6.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:328d17d8918e9a732d9b3f07228c2f875843097632a6e06f822613c8bc5b48df
+size 26396
diff --git a/bark/assets/prompts/v2/ko_speaker_7.npz b/bark/assets/prompts/v2/ko_speaker_7.npz
new file mode 100644
index 0000000000000000000000000000000000000000..deb134e30a69034868ce49de85ac557aebbddb15
--- /dev/null
+++ b/bark/assets/prompts/v2/ko_speaker_7.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e79f5aa446e1cdb3a2c3c752a76f68e0a6a890d5605bda325bfeff9dd69cf887
+size 27884
diff --git a/bark/assets/prompts/v2/ko_speaker_8.npz b/bark/assets/prompts/v2/ko_speaker_8.npz
new file mode 100644
index 0000000000000000000000000000000000000000..57f2487e7da571930ce97c39ac1c210324673abc
--- /dev/null
+++ b/bark/assets/prompts/v2/ko_speaker_8.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:99ed04ad3deb6b981976e14df0abc4933c663fb0fb3eb015acbceb450a48da0d
+size 31140
diff --git a/bark/assets/prompts/v2/ko_speaker_9.npz b/bark/assets/prompts/v2/ko_speaker_9.npz
new file mode 100644
index 0000000000000000000000000000000000000000..267053615c88bf0ec0271dfb604d4de9a07ee3ea
--- /dev/null
+++ b/bark/assets/prompts/v2/ko_speaker_9.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4b7f91e06ecf5886a52f73801ea5be894f1b6cb3d67da7738fa8873b618b7535
+size 23676
diff --git a/bark/assets/prompts/v2/pl_speaker_0.npz b/bark/assets/prompts/v2/pl_speaker_0.npz
new file mode 100644
index 0000000000000000000000000000000000000000..328efc7d637d2ac47e72bed3bf6ae0fd554bbb8c
--- /dev/null
+++ b/bark/assets/prompts/v2/pl_speaker_0.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb026490fef1bad75b5f193eacbe8c99647a0faf17e9583827f4a93c71817439
+size 24900
diff --git a/bark/assets/prompts/v2/pl_speaker_1.npz b/bark/assets/prompts/v2/pl_speaker_1.npz
new file mode 100644
index 0000000000000000000000000000000000000000..f1745a236e6950d4ab4ede58b47bd8eb8b29e664
--- /dev/null
+++ b/bark/assets/prompts/v2/pl_speaker_1.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8daa9e0e55537b05756136e984f9484a21239e9d49a4567e9e695cc281ce9f6a
+size 34660
diff --git a/bark/assets/prompts/v2/pl_speaker_2.npz b/bark/assets/prompts/v2/pl_speaker_2.npz
new file mode 100644
index 0000000000000000000000000000000000000000..aad28a11aa36f16e47310c70d77823a4cf4f0588
--- /dev/null
+++ b/bark/assets/prompts/v2/pl_speaker_2.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:88fd57a7a0c9a5d7875cd91d28b672085dde12109347b49b095adcf7658d52c4
+size 28580
diff --git a/bark/assets/prompts/v2/pl_speaker_3.npz b/bark/assets/prompts/v2/pl_speaker_3.npz
new file mode 100644
index 0000000000000000000000000000000000000000..4ba807fc90397b43b9508df585b2e0bafb26ac3b
--- /dev/null
+++ b/bark/assets/prompts/v2/pl_speaker_3.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:995f847a79b8dce9bf2009e0cef74e660d4697e90ee9b445f21ca1d817fa7ba9
+size 41428
diff --git a/bark/assets/prompts/v2/pl_speaker_4.npz b/bark/assets/prompts/v2/pl_speaker_4.npz
new file mode 100644
index 0000000000000000000000000000000000000000..1d74cf49cdf7989c4b0bc8f273990cff60e1c976
--- /dev/null
+++ b/bark/assets/prompts/v2/pl_speaker_4.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:23635e04802c46eaadf86295d21d61e19ecb0969402d8ad78e141f04fb2eb1c9
+size 30764
diff --git a/bark/assets/prompts/v2/pl_speaker_5.npz b/bark/assets/prompts/v2/pl_speaker_5.npz
new file mode 100644
index 0000000000000000000000000000000000000000..437a96776e8ba20c68ca0d9a3516992a107e5b4c
--- /dev/null
+++ b/bark/assets/prompts/v2/pl_speaker_5.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b294acb6154c9f145535b5a23602c34e635e63755a7e76e8ce672e8fa8901774
+size 38180
diff --git a/bark/assets/prompts/v2/pl_speaker_6.npz b/bark/assets/prompts/v2/pl_speaker_6.npz
new file mode 100644
index 0000000000000000000000000000000000000000..b1bd86a276cd619658fa868f5b699a89144c7b35
--- /dev/null
+++ b/bark/assets/prompts/v2/pl_speaker_6.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:60853df4aec0a0365c8515ba4d498a131abefab3f36aadcdb198048423b3ae45
+size 38820
diff --git a/bark/assets/prompts/v2/pl_speaker_7.npz b/bark/assets/prompts/v2/pl_speaker_7.npz
new file mode 100644
index 0000000000000000000000000000000000000000..faace4b209511c6ea096cd80c6584573322eb9da
--- /dev/null
+++ b/bark/assets/prompts/v2/pl_speaker_7.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b52e278832532ee7a5260e51db6337971231637d81f8e258834884a845b93f67
+size 29060
diff --git a/bark/assets/prompts/v2/pl_speaker_8.npz b/bark/assets/prompts/v2/pl_speaker_8.npz
new file mode 100644
index 0000000000000000000000000000000000000000..ffea7377274b9cf82aaaad1fe6f952a73a74c7db
--- /dev/null
+++ b/bark/assets/prompts/v2/pl_speaker_8.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4b65bbcbad148b24ff25625cc952d527c4143614c0bcc5176689ef2991971fe2
+size 19460
diff --git a/bark/assets/prompts/v2/pl_speaker_9.npz b/bark/assets/prompts/v2/pl_speaker_9.npz
new file mode 100644
index 0000000000000000000000000000000000000000..3cb6ca960151ede664e43a74675df0c972430eb7
--- /dev/null
+++ b/bark/assets/prompts/v2/pl_speaker_9.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:34d9f70e6118e07822d8413cf6c88c2c9a43efaa5f581d855b71747ff3c0479d
+size 30980
diff --git a/bark/assets/prompts/v2/pt_speaker_0.npz b/bark/assets/prompts/v2/pt_speaker_0.npz
new file mode 100644
index 0000000000000000000000000000000000000000..9255b6e2db1251052550a2992afbe0c0aa0ee823
--- /dev/null
+++ b/bark/assets/prompts/v2/pt_speaker_0.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:91642af464c5ad73a480907a2b8f85fd5755d4b2127586a23e916204a37e6330
+size 27724
diff --git a/bark/assets/prompts/v2/pt_speaker_1.npz b/bark/assets/prompts/v2/pt_speaker_1.npz
new file mode 100644
index 0000000000000000000000000000000000000000..e41be26e61abe3bdc8952139cb8819bad6b80f50
--- /dev/null
+++ b/bark/assets/prompts/v2/pt_speaker_1.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fbaba67cbfc0fc2b0ded04b533e6c67d2b0f77670b81afa3bb6391360c7b5834
+size 34500
diff --git a/bark/assets/prompts/v2/pt_speaker_2.npz b/bark/assets/prompts/v2/pt_speaker_2.npz
new file mode 100644
index 0000000000000000000000000000000000000000..d9814fdc19ffeaa100038bc69e8b174e7312ee46
--- /dev/null
+++ b/bark/assets/prompts/v2/pt_speaker_2.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:196def1c1c743569b62b58ce6d5d3f36f43ee3c7e7ceef5548ac2b1a512e610b
+size 36844
diff --git a/bark/assets/prompts/v2/pt_speaker_3.npz b/bark/assets/prompts/v2/pt_speaker_3.npz
new file mode 100644
index 0000000000000000000000000000000000000000..c18d03eeb88da2d4ecb2f7ce7c5195ff81be3810
--- /dev/null
+++ b/bark/assets/prompts/v2/pt_speaker_3.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ebae3bec99e8ba38df8afc1727f4f54ec331e3da6ce5d3a1c0f9b35bbb1a2844
+size 26980
diff --git a/bark/assets/prompts/v2/pt_speaker_4.npz b/bark/assets/prompts/v2/pt_speaker_4.npz
new file mode 100644
index 0000000000000000000000000000000000000000..3365618d8bcf3b32a1ab7d3d22b623d4e0cde8f4
--- /dev/null
+++ b/bark/assets/prompts/v2/pt_speaker_4.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c93c38105050f658b93d3d076c3e0bb7088e0b2f4550f7b27d5fe5cce8c35bf1
+size 26396
diff --git a/bark/assets/prompts/v2/pt_speaker_5.npz b/bark/assets/prompts/v2/pt_speaker_5.npz
new file mode 100644
index 0000000000000000000000000000000000000000..2a3c339918a505c339e1724dd00f1a8a0d134547
--- /dev/null
+++ b/bark/assets/prompts/v2/pt_speaker_5.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf4984719aad20c9ef021657c83b21233584a9854981152b63defa3f2936401c
+size 28260
diff --git a/bark/assets/prompts/v2/pt_speaker_6.npz b/bark/assets/prompts/v2/pt_speaker_6.npz
new file mode 100644
index 0000000000000000000000000000000000000000..d29a56ae7775d820dbdf35c77fd9e797b4a32142
--- /dev/null
+++ b/bark/assets/prompts/v2/pt_speaker_6.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6a4169f290ef7601d2c586975ced73d8d6545ac1f62ec78a3c8c1ac5f6535eaf
+size 30764
diff --git a/bark/assets/prompts/v2/pt_speaker_7.npz b/bark/assets/prompts/v2/pt_speaker_7.npz
new file mode 100644
index 0000000000000000000000000000000000000000..bb6cbe9389e18540a2ccb2b04530273458949862
--- /dev/null
+++ b/bark/assets/prompts/v2/pt_speaker_7.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a65441bcee2bf83e0c34260bcd11bb83a5c147bf4da8692a3a6392fe643aa8b8
+size 28100
diff --git a/bark/assets/prompts/v2/pt_speaker_8.npz b/bark/assets/prompts/v2/pt_speaker_8.npz
new file mode 100644
index 0000000000000000000000000000000000000000..45835e8ac2efc24068143f8dc6d4d776c8340bb8
--- /dev/null
+++ b/bark/assets/prompts/v2/pt_speaker_8.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:03164847ec1161b354827950ccbb7e7082d638f8fa28c7d3ffbe936fff44d7de
+size 28524
diff --git a/bark/assets/prompts/v2/pt_speaker_9.npz b/bark/assets/prompts/v2/pt_speaker_9.npz
new file mode 100644
index 0000000000000000000000000000000000000000..7e7a299428cefcb4ae2ed92e6529abd789bd3b97
--- /dev/null
+++ b/bark/assets/prompts/v2/pt_speaker_9.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8558c022cd0aef90b214085ec098e2350261059ce0c1fe199935237421f1ade2
+size 39780
diff --git a/bark/assets/prompts/v2/ru_speaker_0.npz b/bark/assets/prompts/v2/ru_speaker_0.npz
new file mode 100644
index 0000000000000000000000000000000000000000..66447e1e52f0249dd8fb469bd0ef9ae00b0705a3
--- /dev/null
+++ b/bark/assets/prompts/v2/ru_speaker_0.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6405a5dc1746dfd97aaa51b2a3cea4853dd4fae0dcb3f378a4734c29c50930bd
+size 39884
diff --git a/bark/assets/prompts/v2/ru_speaker_1.npz b/bark/assets/prompts/v2/ru_speaker_1.npz
new file mode 100644
index 0000000000000000000000000000000000000000..123d48373801a84dda4cb4311a606a5599708f2f
--- /dev/null
+++ b/bark/assets/prompts/v2/ru_speaker_1.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:01fa66fd2ec3caf72e7a7c3db78f42690c53c175599d00d4ea72694d35d5fa61
+size 56628
diff --git a/bark/assets/prompts/v2/ru_speaker_2.npz b/bark/assets/prompts/v2/ru_speaker_2.npz
new file mode 100644
index 0000000000000000000000000000000000000000..b835c6629f143e9423ae1de96febee41c202be7b
--- /dev/null
+++ b/bark/assets/prompts/v2/ru_speaker_2.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a49d5dbe3d688232ec91803f184004abff8903dd550c24800b86df82547ec31f
+size 29220
diff --git a/bark/assets/prompts/v2/ru_speaker_3.npz b/bark/assets/prompts/v2/ru_speaker_3.npz
new file mode 100644
index 0000000000000000000000000000000000000000..eaf483f3cb7a3d7575731b2336d0027cd23135d1
--- /dev/null
+++ b/bark/assets/prompts/v2/ru_speaker_3.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:613b64d5e1296e46a2250c1ddb3b07264edfff91380871d418dd729eaf223706
+size 19940
diff --git a/bark/assets/prompts/v2/ru_speaker_4.npz b/bark/assets/prompts/v2/ru_speaker_4.npz
new file mode 100644
index 0000000000000000000000000000000000000000..5212b894a12f68cf22477bd0d9dc394032cffc11
--- /dev/null
+++ b/bark/assets/prompts/v2/ru_speaker_4.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4a96d957516c6fb0e554b1a8fae548c6bc21646f202fd0d4c540ea421dc0b0c7
+size 28204
diff --git a/bark/assets/prompts/v2/ru_speaker_5.npz b/bark/assets/prompts/v2/ru_speaker_5.npz
new file mode 100644
index 0000000000000000000000000000000000000000..0db2d67581b807f75b3156cb72d690c6698d164b
--- /dev/null
+++ b/bark/assets/prompts/v2/ru_speaker_5.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:72d472cff7af811dd3c3ae18b46c1ad5ead70b28acba00c3b9bd7d117fe67624
+size 44628
diff --git a/bark/assets/prompts/v2/ru_speaker_6.npz b/bark/assets/prompts/v2/ru_speaker_6.npz
new file mode 100644
index 0000000000000000000000000000000000000000..123b4639e3a1ac431444d9b148d32381124c0485
--- /dev/null
+++ b/bark/assets/prompts/v2/ru_speaker_6.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2160d4938f61921405bff3ac69c0168a65722197b8ee41379685f415f7fb40cd
+size 20476
diff --git a/bark/assets/prompts/v2/ru_speaker_7.npz b/bark/assets/prompts/v2/ru_speaker_7.npz
new file mode 100644
index 0000000000000000000000000000000000000000..c99218eee3920194b748ac4816be5f0d2ee9818b
--- /dev/null
+++ b/bark/assets/prompts/v2/ru_speaker_7.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:941a776f7b1d50d17d1ee23373d4dfc7a9f1a5395177301f847e0a22a5f00628
+size 26020
diff --git a/bark/assets/prompts/v2/ru_speaker_8.npz b/bark/assets/prompts/v2/ru_speaker_8.npz
new file mode 100644
index 0000000000000000000000000000000000000000..8932a9c04a4e16ba5cb029846f2a76a6114a3c92
--- /dev/null
+++ b/bark/assets/prompts/v2/ru_speaker_8.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1cc916253b1e226616d5b81e0d3fec66c0f26a2ba1ae956f30351f12a9b7a2f1
+size 39084
diff --git a/bark/assets/prompts/v2/ru_speaker_9.npz b/bark/assets/prompts/v2/ru_speaker_9.npz
new file mode 100644
index 0000000000000000000000000000000000000000..1f060fca4857396b7fa6e100eb23dd4f772b1acb
--- /dev/null
+++ b/bark/assets/prompts/v2/ru_speaker_9.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:abd862b60db01516bcf224033a15c207c25fbdaae994b4748163ad0af697059f
+size 34660
diff --git a/bark/assets/prompts/v2/tr_speaker_0.npz b/bark/assets/prompts/v2/tr_speaker_0.npz
new file mode 100644
index 0000000000000000000000000000000000000000..5b404c97118a2546d5bc7be0d76677bf96294d2a
--- /dev/null
+++ b/bark/assets/prompts/v2/tr_speaker_0.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6985d2931a9310cf86b1938a6f06d23b0faf8579186f2ecf3076bb275881064e
+size 22076
diff --git a/bark/assets/prompts/v2/tr_speaker_1.npz b/bark/assets/prompts/v2/tr_speaker_1.npz
new file mode 100644
index 0000000000000000000000000000000000000000..2064b62a0265e616fd4a5a1e8fab4e47bc2e264d
--- /dev/null
+++ b/bark/assets/prompts/v2/tr_speaker_1.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e34fac6bd9b18f28913d5c55f544caf10ad449ddada3d7c96556d11207569cfa
+size 24476
diff --git a/bark/assets/prompts/v2/tr_speaker_2.npz b/bark/assets/prompts/v2/tr_speaker_2.npz
new file mode 100644
index 0000000000000000000000000000000000000000..6c608086a22e2931cd31e6a90586b32b9cece557
--- /dev/null
+++ b/bark/assets/prompts/v2/tr_speaker_2.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d8822c6c75198b511e682d6235493e559c4b52a0324d6ac5f5a2d253a78dd019
+size 24956
diff --git a/bark/assets/prompts/v2/tr_speaker_3.npz b/bark/assets/prompts/v2/tr_speaker_3.npz
new file mode 100644
index 0000000000000000000000000000000000000000..44415d5bc19983f6e9a9cd350c38b00161617a0f
--- /dev/null
+++ b/bark/assets/prompts/v2/tr_speaker_3.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d00f600343a9b7f3c556aa7778f29b573455b0611dfc1a194dc46304908839fc
+size 28684
diff --git a/bark/assets/prompts/v2/tr_speaker_4.npz b/bark/assets/prompts/v2/tr_speaker_4.npz
new file mode 100644
index 0000000000000000000000000000000000000000..f2b76f974c0bcc98dd81add7a796dc7bf7faffa2
--- /dev/null
+++ b/bark/assets/prompts/v2/tr_speaker_4.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:46ecfff5873b47e4799e9265468c2d15e3737caec7c370a13c37a12d255ff11f
+size 33164
diff --git a/bark/assets/prompts/v2/tr_speaker_5.npz b/bark/assets/prompts/v2/tr_speaker_5.npz
new file mode 100644
index 0000000000000000000000000000000000000000..64cfd45c0ee36c1d471bda53e677ee1718accd49
--- /dev/null
+++ b/bark/assets/prompts/v2/tr_speaker_5.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2f9bbc5d67bc37d21e02f4620ecd0aaddbe6b0e7a29560e759b042d2823ec21b
+size 17220
diff --git a/bark/assets/prompts/v2/tr_speaker_6.npz b/bark/assets/prompts/v2/tr_speaker_6.npz
new file mode 100644
index 0000000000000000000000000000000000000000..398e50ea6d26267f6a4eae3d509ced69c15b2c1a
--- /dev/null
+++ b/bark/assets/prompts/v2/tr_speaker_6.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b46f0589a78a5e40877e3fc017974dfb679068690d49a82824e6847952510732
+size 25276
diff --git a/bark/assets/prompts/v2/tr_speaker_7.npz b/bark/assets/prompts/v2/tr_speaker_7.npz
new file mode 100644
index 0000000000000000000000000000000000000000..6820267776e1560f638b4cc4f89d477ff236417a
--- /dev/null
+++ b/bark/assets/prompts/v2/tr_speaker_7.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8c2eac63412873fa0d0e65c4018f7ff7df02f691bc6f389c5675af94cdea3623
+size 20260
diff --git a/bark/assets/prompts/v2/tr_speaker_8.npz b/bark/assets/prompts/v2/tr_speaker_8.npz
new file mode 100644
index 0000000000000000000000000000000000000000..acd6321658c55069079a840c261d0f38ba87f662
--- /dev/null
+++ b/bark/assets/prompts/v2/tr_speaker_8.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a8af6e8790b0cb2622edd1ff31df98f4c32f06bb1ee60a41ed6dce69f2b1b48d
+size 20580
diff --git a/bark/assets/prompts/v2/tr_speaker_9.npz b/bark/assets/prompts/v2/tr_speaker_9.npz
new file mode 100644
index 0000000000000000000000000000000000000000..1f652e09362947e730f754ec36e02c3b16a017f7
--- /dev/null
+++ b/bark/assets/prompts/v2/tr_speaker_9.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d5faf0d1cf51b1ab8e6138bb8dd70f5e5122c41bc6fbf8a0536bcbee4f1963ee
+size 28204
diff --git a/bark/assets/prompts/v2/zh_speaker_0.npz b/bark/assets/prompts/v2/zh_speaker_0.npz
new file mode 100644
index 0000000000000000000000000000000000000000..c0da0dd19dee7ea7045b24af8b5ef979b3967d99
--- /dev/null
+++ b/bark/assets/prompts/v2/zh_speaker_0.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bd7ac118a3e944b3f20c89f2446056a00850a630ee16318922acc6572ce80929
+size 20636
diff --git a/bark/assets/prompts/v2/zh_speaker_1.npz b/bark/assets/prompts/v2/zh_speaker_1.npz
new file mode 100644
index 0000000000000000000000000000000000000000..a41097e8fadddf15777cf8e4433602eeaee81e52
--- /dev/null
+++ b/bark/assets/prompts/v2/zh_speaker_1.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0eacf5c862dfd3c5ac825f2ebb26f323e64309cb712e7e264cbd31c5bca3f038
+size 19836
diff --git a/bark/assets/prompts/v2/zh_speaker_2.npz b/bark/assets/prompts/v2/zh_speaker_2.npz
new file mode 100644
index 0000000000000000000000000000000000000000..4fca832724ff2da321f2ef129e224d524075690d
--- /dev/null
+++ b/bark/assets/prompts/v2/zh_speaker_2.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e324b47f8250e5798c314f395d4e049575e7ca369d0b6074e91c7bba70e9f26d
+size 21060
diff --git a/bark/assets/prompts/v2/zh_speaker_3.npz b/bark/assets/prompts/v2/zh_speaker_3.npz
new file mode 100644
index 0000000000000000000000000000000000000000..cd1d101a472fd9dcfa3c6d374f5099e42a002e73
--- /dev/null
+++ b/bark/assets/prompts/v2/zh_speaker_3.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:98c476abc7bf634ffb2d71d363284e7bd8c8abd5e33ec5ca21d4aa5b15730d18
+size 31300
diff --git a/bark/assets/prompts/v2/zh_speaker_4.npz b/bark/assets/prompts/v2/zh_speaker_4.npz
new file mode 100644
index 0000000000000000000000000000000000000000..8c2c94f8f02f8fc8ee490fd1174195634a28ab67
--- /dev/null
+++ b/bark/assets/prompts/v2/zh_speaker_4.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1fa8673a9895ad3302d13ac94193b5ad5da481f1cc276e6181fa895acaae133b
+size 29964
diff --git a/bark/assets/prompts/v2/zh_speaker_5.npz b/bark/assets/prompts/v2/zh_speaker_5.npz
new file mode 100644
index 0000000000000000000000000000000000000000..f2269a6bc79a059214486a5a346e2890bb355b95
--- /dev/null
+++ b/bark/assets/prompts/v2/zh_speaker_5.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:226edfe5fabc72eeb83a13e350599bc8babe5adc2264b3cdb661fd1258dc4044
+size 17436
diff --git a/bark/assets/prompts/v2/zh_speaker_6.npz b/bark/assets/prompts/v2/zh_speaker_6.npz
new file mode 100644
index 0000000000000000000000000000000000000000..76a4891df92e084fbd3c1e7c19682ad155694efe
--- /dev/null
+++ b/bark/assets/prompts/v2/zh_speaker_6.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:285d51fbe81cc263636b5b487fbb6633e6f3cf92c53ca9ab8e6b7f55d4b4a31d
+size 16900
diff --git a/bark/assets/prompts/v2/zh_speaker_7.npz b/bark/assets/prompts/v2/zh_speaker_7.npz
new file mode 100644
index 0000000000000000000000000000000000000000..7d4d635ffe13e4f9a21e9d5b8f514f9db4f1ebab
--- /dev/null
+++ b/bark/assets/prompts/v2/zh_speaker_7.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0967cdb14ffa79895747b0d52df9f15bdad80d6c55b7630894345c9a7ec87c91
+size 21060
diff --git a/bark/assets/prompts/v2/zh_speaker_8.npz b/bark/assets/prompts/v2/zh_speaker_8.npz
new file mode 100644
index 0000000000000000000000000000000000000000..1ea29786a479ff5fe94822fee1e00a6484c8bec3
--- /dev/null
+++ b/bark/assets/prompts/v2/zh_speaker_8.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c028f78530013f29ab8c0c1cf4fe2138106fbe5252951f5f36e0168056779549
+size 19300
diff --git a/bark/assets/prompts/v2/zh_speaker_9.npz b/bark/assets/prompts/v2/zh_speaker_9.npz
new file mode 100644
index 0000000000000000000000000000000000000000..caf80d75d736fd7a8c0a8febdd23d2e99449896b
--- /dev/null
+++ b/bark/assets/prompts/v2/zh_speaker_9.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6265bb827008d7af8a45a8e057fe3e91efb347d56208180a9ed990ad54e4d75e
+size 16156
diff --git a/bark/assets/prompts/zh_speaker_0.npz b/bark/assets/prompts/zh_speaker_0.npz
new file mode 100644
index 0000000000000000000000000000000000000000..4c531fb26cfafea44a9a1e90b4efe0ee4a79dc4e
--- /dev/null
+++ b/bark/assets/prompts/zh_speaker_0.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55bc30061b5c5928454e4c7a1d6206e359a25ca38fec3ca96de0a625fa96c572
+size 19620
diff --git a/bark/assets/prompts/zh_speaker_1.npz b/bark/assets/prompts/zh_speaker_1.npz
new file mode 100644
index 0000000000000000000000000000000000000000..d858600f97683c44cd72ccfa8badfa3b189f0467
--- /dev/null
+++ b/bark/assets/prompts/zh_speaker_1.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6d5d5531998bd91684806eb64a2ac659d8c242f4112d6216697d3cae0b99b978
+size 21380
diff --git a/bark/assets/prompts/zh_speaker_2.npz b/bark/assets/prompts/zh_speaker_2.npz
new file mode 100644
index 0000000000000000000000000000000000000000..2452ed7bcab190bedde76dc7d7d3fe4d82643278
--- /dev/null
+++ b/bark/assets/prompts/zh_speaker_2.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c3001ff8a04e64e0687b0ad145c92684c8758ce7af68fb330dcfee4739fd896b
+size 19460
diff --git a/bark/assets/prompts/zh_speaker_3.npz b/bark/assets/prompts/zh_speaker_3.npz
new file mode 100644
index 0000000000000000000000000000000000000000..29d23b0a1795126c86f735f3e5f8af17de9184b5
--- /dev/null
+++ b/bark/assets/prompts/zh_speaker_3.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:08b20f307ff4a1e5a947f4394ce2f2c3c5e0e6a9f78e0fd77604fb08359ab90d
+size 32740
diff --git a/bark/assets/prompts/zh_speaker_4.npz b/bark/assets/prompts/zh_speaker_4.npz
new file mode 100644
index 0000000000000000000000000000000000000000..d12adb15a7bc72de351c53b046a6edbb46713cd4
--- /dev/null
+++ b/bark/assets/prompts/zh_speaker_4.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5b6acddfa41ce84e558e09e91fae5fbb01704bc1cef0f000bcc7f30d05e51afc
+size 19676
diff --git a/bark/assets/prompts/zh_speaker_5.npz b/bark/assets/prompts/zh_speaker_5.npz
new file mode 100644
index 0000000000000000000000000000000000000000..1662063711535dffe2ec4c0711e940ca0bd78a7b
--- /dev/null
+++ b/bark/assets/prompts/zh_speaker_5.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:048c7362b237c43ceb0c3a4986b5c42c21ef013cadaf7c77b6348419f801dc93
+size 54548
diff --git a/bark/assets/prompts/zh_speaker_6.npz b/bark/assets/prompts/zh_speaker_6.npz
new file mode 100644
index 0000000000000000000000000000000000000000..9877675833fe910b4fd15b6938e35a8bf1434073
--- /dev/null
+++ b/bark/assets/prompts/zh_speaker_6.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8d7359be4a984930a81103043409b695e383d493f4edd6d4786537b1730a95c0
+size 23516
diff --git a/bark/assets/prompts/zh_speaker_7.npz b/bark/assets/prompts/zh_speaker_7.npz
new file mode 100644
index 0000000000000000000000000000000000000000..f83e4af9176bc23fb0dbafaeadd0c3f24dcb14e4
--- /dev/null
+++ b/bark/assets/prompts/zh_speaker_7.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:560ccbd20b16a2313cdc44ed578c8fb4dcbe51c2d1c57756dc242d185a6b88d3
+size 22556
diff --git a/bark/assets/prompts/zh_speaker_8.npz b/bark/assets/prompts/zh_speaker_8.npz
new file mode 100644
index 0000000000000000000000000000000000000000..dff9d012159fd857ba4070c99fb96a66a8c8de41
--- /dev/null
+++ b/bark/assets/prompts/zh_speaker_8.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:26eb3e2589f21f88aa963f052cc5134c6510b1cdb0033be277733bc7dc77157c
+size 20580
diff --git a/bark/assets/prompts/zh_speaker_9.npz b/bark/assets/prompts/zh_speaker_9.npz
new file mode 100644
index 0000000000000000000000000000000000000000..98fc91445386fe8ea4aabe7a9172d10e4298b557
--- /dev/null
+++ b/bark/assets/prompts/zh_speaker_9.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:15ab7bbb47bf326e454cc1d299f4069d0fa9ea8e934273dbed4cbf1116404322
+size 18396
diff --git a/bark/cli.py b/bark/cli.py
new file mode 100644
index 0000000000000000000000000000000000000000..687e5655f85a9f4a77bb07c76ed9c3f96bac4854
--- /dev/null
+++ b/bark/cli.py
@@ -0,0 +1,71 @@
+import argparse
+from typing import Dict, Optional, Union
+import os
+
+from scipy.io.wavfile import write as write_wav
+from .api import generate_audio
+from .generation import SAMPLE_RATE
+
+
+def cli():
+ """Commandline interface."""
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument("--text", type=str, help="text to be turned into audio")
+ parser.add_argument(
+ "--output_filename",
+ type=str,
+ default="bark_generation.wav",
+ help="output audio file name",
+ )
+ parser.add_argument("--output_dir", type=str, default=".", help="directory to save the outputs")
+    parser.add_argument(
+        "--history_prompt",
+        type=str,
+        default=None,
+        help="history prompt (built-in speaker name or path to an .npz file) for voice cloning",
+    )
+ parser.add_argument(
+ "--text_temp",
+ default=0.7,
+ type=float,
+ help="generation temperature (1.0 more diverse, 0.0 more conservative)",
+ )
+ parser.add_argument(
+ "--waveform_temp",
+ default=0.7,
+ type=float,
+ help="generation temperature (1.0 more diverse, 0.0 more conservative)",
+ )
+ parser.add_argument("--silent", default=False, type=bool, help="disable progress bar")
+    parser.add_argument(
+        "--output_full",
+        # argparse's type=bool treats any non-empty string as True, so expose this as a flag
+        action="store_true",
+        help="return full generation to be used as a history prompt",
+    )
+
+ args = vars(parser.parse_args())
+ input_text: str = args.get("text")
+ output_filename: str = args.get("output_filename")
+ output_dir: str = args.get("output_dir")
+ history_prompt: Optional[Union[Dict, str]] = args.get("history_prompt")
+ text_temp: float = args.get("text_temp")
+ waveform_temp: float = args.get("waveform_temp")
+ silent: bool = args.get("silent")
+ output_full: bool = args.get("output_full")
+
+ try:
+ os.makedirs(output_dir, exist_ok=True)
+ generated_audio = generate_audio(
+ input_text,
+ history_prompt=history_prompt,
+ text_temp=text_temp,
+ waveform_temp=waveform_temp,
+ silent=silent,
+ output_full=output_full,
+ )
+ output_file_path = os.path.join(output_dir, output_filename)
+ write_wav(output_file_path, SAMPLE_RATE, generated_audio)
+ print(f"Done! Output audio file is saved at: '{output_file_path}'")
+ except Exception as e:
+ print(f"Oops, an error occurred: {e}")
diff --git a/bark/generation.py b/bark/generation.py
new file mode 100644
index 0000000000000000000000000000000000000000..7efbd86240f4de82f74de663c6ac2f54d061c06b
--- /dev/null
+++ b/bark/generation.py
@@ -0,0 +1,857 @@
+import contextlib
+import gc
+import os
+import re
+
+from encodec import EncodecModel
+import funcy
+import logging
+import numpy as np
+from scipy.special import softmax
+import torch
+import torch.nn.functional as F
+import tqdm
+from transformers import BertTokenizer
+from huggingface_hub import hf_hub_download
+
+from .model import GPTConfig, GPT
+from .model_fine import FineGPT, FineGPTConfig
+
+if (
+ torch.cuda.is_available()
+ and hasattr(torch.cuda, "amp")
+ and hasattr(torch.cuda.amp, "autocast")
+ and hasattr(torch.cuda, "is_bf16_supported")
+ and torch.cuda.is_bf16_supported()
+):
+ autocast = funcy.partial(torch.cuda.amp.autocast, dtype=torch.bfloat16)
+else:
+
+ @contextlib.contextmanager
+ def autocast():
+ yield
+
+
+# hold models in global scope to lazy load
+global models
+models = {}
+
+global models_devices
+models_devices = {}
+
+
+CONTEXT_WINDOW_SIZE = 1024
+
+SEMANTIC_RATE_HZ = 49.9
+SEMANTIC_VOCAB_SIZE = 10_000
+
+CODEBOOK_SIZE = 1024
+N_COARSE_CODEBOOKS = 2
+N_FINE_CODEBOOKS = 8
+COARSE_RATE_HZ = 75
+
+SAMPLE_RATE = 24_000
+
+
+SUPPORTED_LANGS = [
+ ("English", "en"),
+ ("German", "de"),
+ ("Spanish", "es"),
+ ("French", "fr"),
+ ("Hindi", "hi"),
+ ("Italian", "it"),
+ ("Japanese", "ja"),
+ ("Korean", "ko"),
+ ("Polish", "pl"),
+ ("Portuguese", "pt"),
+ ("Russian", "ru"),
+ ("Turkish", "tr"),
+ ("Chinese", "zh"),
+]
+
+ALLOWED_PROMPTS = {"announcer"}
+for _, lang in SUPPORTED_LANGS:
+ for prefix in ("", f"v2{os.path.sep}"):
+ for n in range(10):
+ ALLOWED_PROMPTS.add(f"{prefix}{lang}_speaker_{n}")
+
+
+logger = logging.getLogger(__name__)
+
+
+CUR_PATH = os.path.dirname(os.path.abspath(__file__))
+
+
+default_cache_dir = os.path.join(os.path.expanduser("~"), ".cache")
+CACHE_DIR = os.path.join(
+ os.getenv("XDG_CACHE_HOME", default_cache_dir), "suno", "bark_v0"
+)
+
+
+def _cast_bool_env_var(s):
+ return s.lower() in ("true", "1", "t")
+
+
+USE_SMALL_MODELS = _cast_bool_env_var(os.environ.get("SUNO_USE_SMALL_MODELS", "False"))
+GLOBAL_ENABLE_MPS = _cast_bool_env_var(os.environ.get("SUNO_ENABLE_MPS", "False"))
+OFFLOAD_CPU = _cast_bool_env_var(os.environ.get("SUNO_OFFLOAD_CPU", "False"))
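These three switches are parsed once, at import time, via _cast_bool_env_var, so they have to be set in the environment before bark.generation is first imported. A small illustrative sketch:

import os

# must be set before importing bark.generation, which reads these at import time
os.environ["SUNO_USE_SMALL_MODELS"] = "True"   # load the *_small checkpoints
os.environ["SUNO_OFFLOAD_CPU"] = "True"        # park idle models on CPU between calls
os.environ["SUNO_ENABLE_MPS"] = "False"        # opt-in switch for Apple's MPS backend

from bark.generation import preload_models

preload_models()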
+
+
+REMOTE_MODEL_PATHS = {
+ "text_small": {
+ "repo_id": "suno/bark",
+ "file_name": "text.pt",
+ },
+ "coarse_small": {
+ "repo_id": "suno/bark",
+ "file_name": "coarse.pt",
+ },
+ "fine_small": {
+ "repo_id": "suno/bark",
+ "file_name": "fine.pt",
+ },
+ "text": {
+ "repo_id": "suno/bark",
+ "file_name": "text_2.pt",
+ },
+ "coarse": {
+ "repo_id": "suno/bark",
+ "file_name": "coarse_2.pt",
+ },
+ "fine": {
+ "repo_id": "suno/bark",
+ "file_name": "fine_2.pt",
+ },
+}
+
+
+if (
+ not hasattr(torch.nn.functional, "scaled_dot_product_attention")
+ and torch.cuda.is_available()
+):
+ logger.warning(
+ "torch version does not support flash attention. You will get faster"
+ + " inference speed by upgrade torch to newest nightly version."
+ )
+
+
+def _grab_best_device(use_gpu=True):
+ if torch.cuda.device_count() > 0 and use_gpu:
+ device = "cuda"
+ elif torch.backends.mps.is_available() and use_gpu and GLOBAL_ENABLE_MPS:
+ device = "mps"
+ else:
+ device = "cpu"
+ return device
+
+
+def _get_ckpt_path(model_type, use_small=False):
+ key = model_type
+ if use_small or USE_SMALL_MODELS:
+ key += "_small"
+ return os.path.join(CACHE_DIR, REMOTE_MODEL_PATHS[key]["file_name"])
+
+
+def _download(from_hf_path, file_name):
+ os.makedirs(CACHE_DIR, exist_ok=True)
+ hf_hub_download(repo_id=from_hf_path, filename=file_name, local_dir=CACHE_DIR)
+
+
+class InferenceContext:
+ def __init__(self, benchmark=False):
+ # we can't expect inputs to be the same length, so disable benchmarking by default
+ self._chosen_cudnn_benchmark = benchmark
+ self._cudnn_benchmark = None
+
+ def __enter__(self):
+ self._cudnn_benchmark = torch.backends.cudnn.benchmark
+ torch.backends.cudnn.benchmark = self._chosen_cudnn_benchmark
+
+ def __exit__(self, exc_type, exc_value, exc_traceback):
+ torch.backends.cudnn.benchmark = self._cudnn_benchmark
+
+
+if torch.cuda.is_available():
+ torch.backends.cuda.matmul.allow_tf32 = True
+ torch.backends.cudnn.allow_tf32 = True
+
+
+@contextlib.contextmanager
+def _inference_mode():
+ with InferenceContext(), torch.inference_mode(), torch.no_grad(), autocast():
+ yield
+
+
+def _clear_cuda_cache():
+ if torch.cuda.is_available():
+ torch.cuda.empty_cache()
+ torch.cuda.synchronize()
+
+
+def clean_models(model_key=None):
+ global models
+ model_keys = [model_key] if model_key is not None else models.keys()
+ for k in model_keys:
+ if k in models:
+ del models[k]
+ _clear_cuda_cache()
+ gc.collect()
+
+
+def _load_model(ckpt_path, device, use_small=False, model_type="text"):
+ if model_type == "text":
+ ConfigClass = GPTConfig
+ ModelClass = GPT
+ elif model_type == "coarse":
+ ConfigClass = GPTConfig
+ ModelClass = GPT
+ elif model_type == "fine":
+ ConfigClass = FineGPTConfig
+ ModelClass = FineGPT
+ else:
+ raise NotImplementedError()
+ model_key = f"{model_type}_small" if use_small or USE_SMALL_MODELS else model_type
+ model_info = REMOTE_MODEL_PATHS[model_key]
+ if not os.path.exists(ckpt_path):
+ logger.info(f"{model_type} model not found, downloading into `{CACHE_DIR}`.")
+ _download(model_info["repo_id"], model_info["file_name"])
+ checkpoint = torch.load(ckpt_path, map_location=device)
+    # this is a hack: some checkpoints store a single vocab_size, which is split here into the input/output vocab sizes the config expects
+ model_args = checkpoint["model_args"]
+ if "input_vocab_size" not in model_args:
+ model_args["input_vocab_size"] = model_args["vocab_size"]
+ model_args["output_vocab_size"] = model_args["vocab_size"]
+ del model_args["vocab_size"]
+ gptconf = ConfigClass(**checkpoint["model_args"])
+ model = ModelClass(gptconf)
+ state_dict = checkpoint["model"]
+ # fixup checkpoint
+ unwanted_prefix = "_orig_mod."
+ for k, v in list(state_dict.items()):
+ if k.startswith(unwanted_prefix):
+ state_dict[k[len(unwanted_prefix) :]] = state_dict.pop(k)
+ extra_keys = set(state_dict.keys()) - set(model.state_dict().keys())
+ extra_keys = set([k for k in extra_keys if not k.endswith(".attn.bias")])
+ missing_keys = set(model.state_dict().keys()) - set(state_dict.keys())
+ missing_keys = set([k for k in missing_keys if not k.endswith(".attn.bias")])
+ if len(extra_keys) != 0:
+ raise ValueError(f"extra keys found: {extra_keys}")
+ if len(missing_keys) != 0:
+ raise ValueError(f"missing keys: {missing_keys}")
+ model.load_state_dict(state_dict, strict=False)
+ n_params = model.get_num_params()
+ val_loss = checkpoint["best_val_loss"].item()
+ logger.info(
+ f"model loaded: {round(n_params/1e6,1)}M params, {round(val_loss,3)} loss"
+ )
+ model.eval()
+ model.to(device)
+ del checkpoint, state_dict
+ _clear_cuda_cache()
+ if model_type == "text":
+ tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
+ return {
+ "model": model,
+ "tokenizer": tokenizer,
+ }
+ return model
+
+
+def _load_codec_model(device):
+ model = EncodecModel.encodec_model_24khz()
+ model.set_target_bandwidth(6.0)
+ model.eval()
+ model.to(device)
+ _clear_cuda_cache()
+ return model
+
+
+def load_model(use_gpu=True, use_small=False, force_reload=False, model_type="text"):
+ _load_model_f = funcy.partial(
+ _load_model, model_type=model_type, use_small=use_small
+ )
+ if model_type not in ("text", "coarse", "fine"):
+ raise NotImplementedError()
+ global models
+ global models_devices
+ device = _grab_best_device(use_gpu=use_gpu)
+ model_key = f"{model_type}"
+ if OFFLOAD_CPU:
+ models_devices[model_key] = device
+ device = "cpu"
+ if model_key not in models or force_reload:
+ ckpt_path = _get_ckpt_path(model_type, use_small=use_small)
+ clean_models(model_key=model_key)
+ model = _load_model_f(ckpt_path, device)
+ models[model_key] = model
+ if model_type == "text":
+ models[model_key]["model"].to(device)
+ else:
+ models[model_key].to(device)
+ return models[model_key]
+
+
+def load_codec_model(use_gpu=True, force_reload=False):
+ global models
+ global models_devices
+ device = _grab_best_device(use_gpu=use_gpu)
+ if device == "mps":
+ # encodec doesn't support mps
+ device = "cpu"
+ model_key = "codec"
+ if OFFLOAD_CPU:
+ models_devices[model_key] = device
+ device = "cpu"
+ if model_key not in models or force_reload:
+ clean_models(model_key=model_key)
+ model = _load_codec_model(device)
+ models[model_key] = model
+ models[model_key].to(device)
+ return models[model_key]
+
+
+def preload_models(
+ text_use_gpu=True,
+ text_use_small=False,
+ coarse_use_gpu=True,
+ coarse_use_small=False,
+ fine_use_gpu=True,
+ fine_use_small=False,
+ codec_use_gpu=True,
+ force_reload=False,
+):
+ """Load all the necessary models for the pipeline."""
+ if _grab_best_device() == "cpu" and (
+ text_use_gpu or coarse_use_gpu or fine_use_gpu or codec_use_gpu
+ ):
+ logger.warning("No GPU being used. Careful, inference might be very slow!")
+ _ = load_model(
+ model_type="text",
+ use_gpu=text_use_gpu,
+ use_small=text_use_small,
+ force_reload=force_reload,
+ )
+ _ = load_model(
+ model_type="coarse",
+ use_gpu=coarse_use_gpu,
+ use_small=coarse_use_small,
+ force_reload=force_reload,
+ )
+ _ = load_model(
+ model_type="fine",
+ use_gpu=fine_use_gpu,
+ use_small=fine_use_small,
+ force_reload=force_reload,
+ )
+ _ = load_codec_model(use_gpu=codec_use_gpu, force_reload=force_reload)
+
+
+####
+# Generation Functionality
+####
+
+
+def _tokenize(tokenizer, text):
+ return tokenizer.encode(text, add_special_tokens=False)
+
+
+def _detokenize(tokenizer, enc_text):
+ return tokenizer.decode(enc_text)
+
+
+def _normalize_whitespace(text):
+ return re.sub(r"\s+", " ", text).strip()
+
+
+TEXT_ENCODING_OFFSET = 10_048
+SEMANTIC_PAD_TOKEN = 10_000
+TEXT_PAD_TOKEN = 129_595
+SEMANTIC_INFER_TOKEN = 129_599
+
+
+def _load_history_prompt(history_prompt_input):
+ if isinstance(history_prompt_input, str) and history_prompt_input.endswith(".npz"):
+ history_prompt = np.load(history_prompt_input)
+ elif isinstance(history_prompt_input, str):
+        # normalize "/" to the local path separator so built-in prompt names also resolve on Windows
+ history_prompt_input = os.path.join(*history_prompt_input.split("/"))
+ if history_prompt_input not in ALLOWED_PROMPTS:
+ raise ValueError("history prompt not found")
+ history_prompt = np.load(
+ os.path.join(CUR_PATH, "assets", "prompts", f"{history_prompt_input}.npz")
+ )
+ elif isinstance(history_prompt_input, dict):
+ assert "semantic_prompt" in history_prompt_input
+ assert "coarse_prompt" in history_prompt_input
+ assert "fine_prompt" in history_prompt_input
+ history_prompt = history_prompt_input
+ else:
+ raise ValueError("history prompt format unrecognized")
+ return history_prompt
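_load_history_prompt accepts the same values callers may pass as history_prompt to the generators below: a built-in prompt name from ALLOWED_PROMPTS, a path to an .npz file, or an already-loaded dict with the three prompt arrays. An illustrative sketch, where the .npz path and toy array values are made-up examples:

import numpy as np

from bark.generation import _load_history_prompt

# 1) a built-in prompt name
history = _load_history_prompt("v2/en_speaker_3")

# 2) an explicit .npz file (arbitrary example path)
history = _load_history_prompt("custom_speakers/my_voice.npz")

# 3) a dict carrying the three required arrays (toy values)
history = _load_history_prompt({
    "semantic_prompt": np.array([12, 345, 678]),
    "coarse_prompt": np.zeros((2, 8), dtype=np.int64),
    "fine_prompt": np.zeros((8, 8), dtype=np.int64),
})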
+
+
+def generate_text_semantic(
+ text,
+ history_prompt=None,
+ temp=0.7,
+ top_k=None,
+ top_p=None,
+ silent=False,
+ min_eos_p=0.2,
+ max_gen_duration_s=None,
+ allow_early_stop=True,
+ use_kv_caching=False,
+):
+ """Generate semantic tokens from text."""
+ assert isinstance(text, str)
+ text = _normalize_whitespace(text)
+ assert len(text.strip()) > 0
+ if history_prompt is not None:
+ history_prompt = _load_history_prompt(history_prompt)
+ semantic_history = history_prompt["semantic_prompt"]
+ assert (
+ isinstance(semantic_history, np.ndarray)
+ and len(semantic_history.shape) == 1
+ and len(semantic_history) > 0
+ and semantic_history.min() >= 0
+ and semantic_history.max() <= SEMANTIC_VOCAB_SIZE - 1
+ )
+ else:
+ semantic_history = None
+ # load models if not yet exist
+ global models
+ global models_devices
+ if "text" not in models:
+ preload_models()
+ model_container = models["text"]
+ model = model_container["model"]
+ tokenizer = model_container["tokenizer"]
+ encoded_text = np.array(_tokenize(tokenizer, text)) + TEXT_ENCODING_OFFSET
+ if OFFLOAD_CPU:
+ model.to(models_devices["text"])
+ device = next(model.parameters()).device
+ if len(encoded_text) > 256:
+ p = round((len(encoded_text) - 256) / len(encoded_text) * 100, 1)
+ logger.warning(f"warning, text too long, lopping of last {p}%")
+ encoded_text = encoded_text[:256]
+ encoded_text = np.pad(
+ encoded_text,
+ (0, 256 - len(encoded_text)),
+ constant_values=TEXT_PAD_TOKEN,
+ mode="constant",
+ )
+ if semantic_history is not None:
+ semantic_history = semantic_history.astype(np.int64)
+ # lop off if history is too long, pad if needed
+ semantic_history = semantic_history[-256:]
+ semantic_history = np.pad(
+ semantic_history,
+ (0, 256 - len(semantic_history)),
+ constant_values=SEMANTIC_PAD_TOKEN,
+ mode="constant",
+ )
+ else:
+ semantic_history = np.array([SEMANTIC_PAD_TOKEN] * 256)
+ x = torch.from_numpy(
+ np.hstack(
+ [encoded_text, semantic_history, np.array([SEMANTIC_INFER_TOKEN])]
+ ).astype(np.int64)
+ )[None]
+ assert x.shape[1] == 256 + 256 + 1
+ with _inference_mode():
+ x = x.to(device)
+ n_tot_steps = 768
+ # custom tqdm updates since we don't know when eos will occur
+ pbar = tqdm.tqdm(disable=silent, total=100)
+ pbar_state = 0
+ tot_generated_duration_s = 0
+ kv_cache = None
+ for n in range(n_tot_steps):
+ if use_kv_caching and kv_cache is not None:
+ x_input = x[:, [-1]]
+ else:
+ x_input = x
+ logits, kv_cache = model(
+ x_input, merge_context=True, use_cache=use_kv_caching, past_kv=kv_cache
+ )
+ relevant_logits = logits[0, 0, :SEMANTIC_VOCAB_SIZE]
+ if allow_early_stop:
+ relevant_logits = torch.hstack(
+ (relevant_logits, logits[0, 0, [SEMANTIC_PAD_TOKEN]]) # eos
+ )
+ if top_p is not None:
+ # faster to convert to numpy
+ original_device = relevant_logits.device
+ relevant_logits = (
+ relevant_logits.detach().cpu().type(torch.float32).numpy()
+ )
+ sorted_indices = np.argsort(relevant_logits)[::-1]
+ sorted_logits = relevant_logits[sorted_indices]
+ cumulative_probs = np.cumsum(softmax(sorted_logits))
+ sorted_indices_to_remove = cumulative_probs > top_p
+ sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].copy()
+ sorted_indices_to_remove[0] = False
+ relevant_logits[sorted_indices[sorted_indices_to_remove]] = -np.inf
+ relevant_logits = torch.from_numpy(relevant_logits)
+ relevant_logits = relevant_logits.to(original_device)
+ if top_k is not None:
+ v, _ = torch.topk(relevant_logits, min(top_k, relevant_logits.size(-1)))
+ relevant_logits[relevant_logits < v[-1]] = -float("Inf")
+ probs = F.softmax(relevant_logits / temp, dim=-1)
+ item_next = torch.multinomial(probs, num_samples=1).to(torch.int32)
+ if allow_early_stop and (
+ item_next == SEMANTIC_VOCAB_SIZE
+ or (min_eos_p is not None and probs[-1] >= min_eos_p)
+ ):
+ # eos found, so break
+ pbar.update(100 - pbar_state)
+ break
+ x = torch.cat((x, item_next[None]), dim=1)
+ tot_generated_duration_s += 1 / SEMANTIC_RATE_HZ
+ if (
+ max_gen_duration_s is not None
+ and tot_generated_duration_s > max_gen_duration_s
+ ):
+ pbar.update(100 - pbar_state)
+ break
+ if n == n_tot_steps - 1:
+ pbar.update(100 - pbar_state)
+ break
+ del logits, relevant_logits, probs, item_next
+ req_pbar_state = np.min([100, int(round(100 * n / n_tot_steps))])
+ if req_pbar_state > pbar_state:
+ pbar.update(req_pbar_state - pbar_state)
+ pbar_state = req_pbar_state
+ pbar.close()
+ out = x.detach().cpu().numpy().squeeze()[256 + 256 + 1 :]
+ if OFFLOAD_CPU:
+ model.to("cpu")
+ assert all(0 <= out) and all(out < SEMANTIC_VOCAB_SIZE)
+ _clear_cuda_cache()
+ return out
+
+
+def _flatten_codebooks(arr, offset_size=CODEBOOK_SIZE):
+ assert len(arr.shape) == 2
+ arr = arr.copy()
+ if offset_size is not None:
+ for n in range(1, arr.shape[0]):
+ arr[n, :] += offset_size * n
+ flat_arr = arr.ravel("F")
+ return flat_arr
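_flatten_codebooks shifts codebook row n by n * offset_size and then interleaves the rows column by column (Fortran-order ravel), producing one token stream in which each codebook occupies a disjoint ID range. A toy illustration:

import numpy as np

from bark.generation import _flatten_codebooks

arr = np.array([[1, 2, 3],
                [4, 5, 6]])

# row 1 is offset by 1024 -> [1028, 1029, 1030]; ravel("F") interleaves columns
print(_flatten_codebooks(arr, offset_size=1024))  # -> [1, 1028, 2, 1029, 3, 1030]

# offset_size=None skips the shift
print(_flatten_codebooks(arr, offset_size=None))  # -> [1, 4, 2, 5, 3, 6]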
+
+
+COARSE_SEMANTIC_PAD_TOKEN = 12_048
+COARSE_INFER_TOKEN = 12_050
+
+
+def generate_coarse(
+ x_semantic,
+ history_prompt=None,
+ temp=0.7,
+ top_k=None,
+ top_p=None,
+ silent=False,
+ max_coarse_history=630, # min 60 (faster), max 630 (more context)
+ sliding_window_len=60,
+ use_kv_caching=False,
+):
+ """Generate coarse audio codes from semantic tokens."""
+ assert (
+ isinstance(x_semantic, np.ndarray)
+ and len(x_semantic.shape) == 1
+ and len(x_semantic) > 0
+ and x_semantic.min() >= 0
+ and x_semantic.max() <= SEMANTIC_VOCAB_SIZE - 1
+ )
+ assert 60 <= max_coarse_history <= 630
+ assert max_coarse_history + sliding_window_len <= 1024 - 256
+ semantic_to_coarse_ratio = COARSE_RATE_HZ / SEMANTIC_RATE_HZ * N_COARSE_CODEBOOKS
+ max_semantic_history = int(np.floor(max_coarse_history / semantic_to_coarse_ratio))
+ if history_prompt is not None:
+ history_prompt = _load_history_prompt(history_prompt)
+ x_semantic_history = history_prompt["semantic_prompt"]
+ x_coarse_history = history_prompt["coarse_prompt"]
+ assert (
+ isinstance(x_semantic_history, np.ndarray)
+ and len(x_semantic_history.shape) == 1
+ and len(x_semantic_history) > 0
+ and x_semantic_history.min() >= 0
+ and x_semantic_history.max() <= SEMANTIC_VOCAB_SIZE - 1
+ and isinstance(x_coarse_history, np.ndarray)
+ and len(x_coarse_history.shape) == 2
+ and x_coarse_history.shape[0] == N_COARSE_CODEBOOKS
+ and x_coarse_history.shape[-1] >= 0
+ and x_coarse_history.min() >= 0
+ and x_coarse_history.max() <= CODEBOOK_SIZE - 1
+ and (
+ round(x_coarse_history.shape[-1] / len(x_semantic_history), 1)
+ == round(semantic_to_coarse_ratio / N_COARSE_CODEBOOKS, 1)
+ )
+ )
+ x_coarse_history = _flatten_codebooks(x_coarse_history) + SEMANTIC_VOCAB_SIZE
+ # trim histories correctly
+ n_semantic_hist_provided = np.min(
+ [
+ max_semantic_history,
+ len(x_semantic_history) - len(x_semantic_history) % 2,
+ int(np.floor(len(x_coarse_history) / semantic_to_coarse_ratio)),
+ ]
+ )
+ n_coarse_hist_provided = int(
+ round(n_semantic_hist_provided * semantic_to_coarse_ratio)
+ )
+ x_semantic_history = x_semantic_history[-n_semantic_hist_provided:].astype(
+ np.int32
+ )
+ x_coarse_history = x_coarse_history[-n_coarse_hist_provided:].astype(np.int32)
+ # TODO: bit of a hack for time alignment (sounds better)
+ x_coarse_history = x_coarse_history[:-2]
+ else:
+ x_semantic_history = np.array([], dtype=np.int32)
+ x_coarse_history = np.array([], dtype=np.int32)
+ # load models if not yet exist
+ global models
+ global models_devices
+ if "coarse" not in models:
+ preload_models()
+ model = models["coarse"]
+ if OFFLOAD_CPU:
+ model.to(models_devices["coarse"])
+ device = next(model.parameters()).device
+ # start loop
+ n_steps = int(
+ round(
+ np.floor(len(x_semantic) * semantic_to_coarse_ratio / N_COARSE_CODEBOOKS)
+ * N_COARSE_CODEBOOKS
+ )
+ )
+ assert n_steps > 0 and n_steps % N_COARSE_CODEBOOKS == 0
+ x_semantic = np.hstack([x_semantic_history, x_semantic]).astype(np.int32)
+ x_coarse = x_coarse_history.astype(np.int32)
+ base_semantic_idx = len(x_semantic_history)
+ with _inference_mode():
+ x_semantic_in = torch.from_numpy(x_semantic)[None].to(device)
+ x_coarse_in = torch.from_numpy(x_coarse)[None].to(device)
+ n_window_steps = int(np.ceil(n_steps / sliding_window_len))
+ n_step = 0
+ for _ in tqdm.tqdm(range(n_window_steps), total=n_window_steps, disable=silent):
+ semantic_idx = base_semantic_idx + int(
+ round(n_step / semantic_to_coarse_ratio)
+ )
+ # pad from right side
+ x_in = x_semantic_in[:, np.max([0, semantic_idx - max_semantic_history]) :]
+ x_in = x_in[:, :256]
+ x_in = F.pad(
+ x_in,
+ (0, 256 - x_in.shape[-1]),
+ "constant",
+ COARSE_SEMANTIC_PAD_TOKEN,
+ )
+ x_in = torch.hstack(
+ [
+ x_in,
+ torch.tensor([COARSE_INFER_TOKEN])[None].to(device),
+ x_coarse_in[:, -max_coarse_history:],
+ ]
+ )
+ kv_cache = None
+ for _ in range(sliding_window_len):
+ if n_step >= n_steps:
+ continue
+ is_major_step = n_step % N_COARSE_CODEBOOKS == 0
+
+ if use_kv_caching and kv_cache is not None:
+ x_input = x_in[:, [-1]]
+ else:
+ x_input = x_in
+
+ logits, kv_cache = model(
+ x_input, use_cache=use_kv_caching, past_kv=kv_cache
+ )
+ logit_start_idx = (
+ SEMANTIC_VOCAB_SIZE + (1 - int(is_major_step)) * CODEBOOK_SIZE
+ )
+ logit_end_idx = (
+ SEMANTIC_VOCAB_SIZE + (2 - int(is_major_step)) * CODEBOOK_SIZE
+ )
+ relevant_logits = logits[0, 0, logit_start_idx:logit_end_idx]
+ if top_p is not None:
+ # faster to convert to numpy
+ original_device = relevant_logits.device
+ relevant_logits = (
+ relevant_logits.detach().cpu().type(torch.float32).numpy()
+ )
+ sorted_indices = np.argsort(relevant_logits)[::-1]
+ sorted_logits = relevant_logits[sorted_indices]
+ cumulative_probs = np.cumsum(softmax(sorted_logits))
+ sorted_indices_to_remove = cumulative_probs > top_p
+ sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].copy()
+ sorted_indices_to_remove[0] = False
+ relevant_logits[sorted_indices[sorted_indices_to_remove]] = -np.inf
+ relevant_logits = torch.from_numpy(relevant_logits)
+ relevant_logits = relevant_logits.to(original_device)
+ if top_k is not None:
+ v, _ = torch.topk(
+ relevant_logits, min(top_k, relevant_logits.size(-1))
+ )
+ relevant_logits[relevant_logits < v[-1]] = -float("Inf")
+ probs = F.softmax(relevant_logits / temp, dim=-1)
+ item_next = torch.multinomial(probs, num_samples=1).to(torch.int32)
+ item_next += logit_start_idx
+ x_coarse_in = torch.cat((x_coarse_in, item_next[None]), dim=1)
+ x_in = torch.cat((x_in, item_next[None]), dim=1)
+ del logits, relevant_logits, probs, item_next
+ n_step += 1
+ del x_in
+ del x_semantic_in
+ if OFFLOAD_CPU:
+ model.to("cpu")
+ gen_coarse_arr = (
+ x_coarse_in.detach().cpu().numpy().squeeze()[len(x_coarse_history) :]
+ )
+ del x_coarse_in
+ assert len(gen_coarse_arr) == n_steps
+ gen_coarse_audio_arr = (
+ gen_coarse_arr.reshape(-1, N_COARSE_CODEBOOKS).T - SEMANTIC_VOCAB_SIZE
+ )
+ for n in range(1, N_COARSE_CODEBOOKS):
+ gen_coarse_audio_arr[n, :] -= n * CODEBOOK_SIZE
+ _clear_cuda_cache()
+ return gen_coarse_audio_arr
+
+
+def generate_fine(
+ x_coarse_gen,
+ history_prompt=None,
+ temp=0.5,
+ silent=True,
+):
+ """Generate full audio codes from coarse audio codes."""
+ assert (
+ isinstance(x_coarse_gen, np.ndarray)
+ and len(x_coarse_gen.shape) == 2
+ and 1 <= x_coarse_gen.shape[0] <= N_FINE_CODEBOOKS - 1
+ and x_coarse_gen.shape[1] > 0
+ and x_coarse_gen.min() >= 0
+ and x_coarse_gen.max() <= CODEBOOK_SIZE - 1
+ )
+ if history_prompt is not None:
+ history_prompt = _load_history_prompt(history_prompt)
+ x_fine_history = history_prompt["fine_prompt"]
+ assert (
+ isinstance(x_fine_history, np.ndarray)
+ and len(x_fine_history.shape) == 2
+ and x_fine_history.shape[0] == N_FINE_CODEBOOKS
+ and x_fine_history.shape[1] >= 0
+ and x_fine_history.min() >= 0
+ and x_fine_history.max() <= CODEBOOK_SIZE - 1
+ )
+ else:
+ x_fine_history = None
+ n_coarse = x_coarse_gen.shape[0]
+ # load models if not yet exist
+ global models
+ global models_devices
+ if "fine" not in models:
+ preload_models()
+ model = models["fine"]
+ if OFFLOAD_CPU:
+ model.to(models_devices["fine"])
+ device = next(model.parameters()).device
+ # make input arr
+ in_arr = np.vstack(
+ [
+ x_coarse_gen,
+ np.zeros((N_FINE_CODEBOOKS - n_coarse, x_coarse_gen.shape[1]))
+ + CODEBOOK_SIZE, # padding
+ ]
+ ).astype(np.int32)
+ # prepend history if available (max 512)
+ if x_fine_history is not None:
+ x_fine_history = x_fine_history.astype(np.int32)
+ in_arr = np.hstack(
+ [
+ x_fine_history[:, -512:].astype(np.int32),
+ in_arr,
+ ]
+ )
+ n_history = x_fine_history[:, -512:].shape[1]
+ else:
+ n_history = 0
+ n_remove_from_end = 0
+ # need to pad if too short (since non-causal model)
+ if in_arr.shape[1] < 1024:
+ n_remove_from_end = 1024 - in_arr.shape[1]
+ in_arr = np.hstack(
+ [
+ in_arr,
+ np.zeros((N_FINE_CODEBOOKS, n_remove_from_end), dtype=np.int32)
+ + CODEBOOK_SIZE,
+ ]
+ )
+ # we can be lazy about fractional loop and just keep overwriting codebooks
+ n_loops = (
+ np.max([0, int(np.ceil((x_coarse_gen.shape[1] - (1024 - n_history)) / 512))])
+ + 1
+ )
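+ # the non-causal fine model sees fixed 1024-token windows; each loop advances the
+ # window by 512 tokens and re-predicts codebooks n_coarse..N_FINE_CODEBOOKS-1 for
+ # the newly exposed positions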
+ with _inference_mode():
+ in_arr = torch.tensor(in_arr.T).to(device)
+ for n in tqdm.tqdm(range(n_loops), disable=silent):
+ start_idx = np.min([n * 512, in_arr.shape[0] - 1024])
+ start_fill_idx = np.min([n_history + n * 512, in_arr.shape[0] - 512])
+ rel_start_fill_idx = start_fill_idx - start_idx
+ in_buffer = in_arr[start_idx : start_idx + 1024, :][None]
+ for nn in range(n_coarse, N_FINE_CODEBOOKS):
+ logits = model(nn, in_buffer)
+ if temp is None:
+ relevant_logits = logits[0, rel_start_fill_idx:, :CODEBOOK_SIZE]
+ codebook_preds = torch.argmax(relevant_logits, -1)
+ else:
+ relevant_logits = logits[0, :, :CODEBOOK_SIZE] / temp
+ probs = F.softmax(relevant_logits, dim=-1)
+ codebook_preds = torch.multinomial(
+ probs[rel_start_fill_idx:1024], num_samples=1
+ ).reshape(-1)
+ codebook_preds = codebook_preds.to(torch.int32)
+ in_buffer[0, rel_start_fill_idx:, nn] = codebook_preds
+ del logits, codebook_preds
+ # transfer over info into model_in and convert to numpy
+ for nn in range(n_coarse, N_FINE_CODEBOOKS):
+ in_arr[
+ start_fill_idx : start_fill_idx + (1024 - rel_start_fill_idx), nn
+ ] = in_buffer[0, rel_start_fill_idx:, nn]
+ del in_buffer
+ gen_fine_arr = in_arr.detach().cpu().numpy().squeeze().T
+ del in_arr
+ if OFFLOAD_CPU:
+ model.to("cpu")
+ gen_fine_arr = gen_fine_arr[:, n_history:]
+ if n_remove_from_end > 0:
+ gen_fine_arr = gen_fine_arr[:, :-n_remove_from_end]
+ assert gen_fine_arr.shape[-1] == x_coarse_gen.shape[-1]
+ _clear_cuda_cache()
+ return gen_fine_arr
+
+
+def codec_decode(fine_tokens):
+ """Turn quantized audio codes into audio array using encodec."""
+ # load models if not yet exist
+ global models
+ global models_devices
+ if "codec" not in models:
+ preload_models()
+ model = models["codec"]
+ if OFFLOAD_CPU:
+ model.to(models_devices["codec"])
+ device = next(model.parameters()).device
+ arr = torch.from_numpy(fine_tokens)[None]
+ arr = arr.to(device)
+ arr = arr.transpose(0, 1)
+ emb = model.quantizer.decode(arr)
+ out = model.decoder(emb)
+ audio_arr = out.detach().cpu().numpy().squeeze()
+ del arr, emb, out
+ if OFFLOAD_CPU:
+ model.to("cpu")
+ return audio_arr
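+
+
+# A minimal, illustrative sketch of how these low-level stages chain together
+# (the higher-level API helpers wrap this same flow); all names below are from
+# this module:
+#
+# semantic_tokens = generate_text_semantic("Hello world", history_prompt=None)
+# coarse_tokens = generate_coarse(semantic_tokens, use_kv_caching=True)
+# fine_tokens = generate_fine(coarse_tokens, temp=0.5)
+# audio_arr = codec_decode(fine_tokens)  # mono float array at SAMPLE_RATE (24 kHz)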
diff --git a/bark/model.py b/bark/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..457b49e749f396c47c6b35f44955fd512d233d79
--- /dev/null
+++ b/bark/model.py
@@ -0,0 +1,218 @@
+"""
+Much of this code is adapted from Andrej Karpathy's NanoGPT
+(https://github.com/karpathy/nanoGPT)
+"""
+import math
+from dataclasses import dataclass
+
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+
+class LayerNorm(nn.Module):
+ """ LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False """
+
+ def __init__(self, ndim, bias):
+ super().__init__()
+ self.weight = nn.Parameter(torch.ones(ndim))
+ self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None
+
+ def forward(self, input):
+ return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)
+
+class CausalSelfAttention(nn.Module):
+
+ def __init__(self, config):
+ super().__init__()
+ assert config.n_embd % config.n_head == 0
+ # key, query, value projections for all heads, but in a batch
+ self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
+ # output projection
+ self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
+ # regularization
+ self.attn_dropout = nn.Dropout(config.dropout)
+ self.resid_dropout = nn.Dropout(config.dropout)
+ self.n_head = config.n_head
+ self.n_embd = config.n_embd
+ self.dropout = config.dropout
+ # flash attention make GPU go brrrrr but support is only in PyTorch nightly and still a bit scary
+ self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
+ if not self.flash:
+ # print("WARNING: using slow attention. Flash Attention atm needs PyTorch nightly and dropout=0.0")
+ # causal mask to ensure that attention is only applied to the left in the input sequence
+ self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
+ .view(1, 1, config.block_size, config.block_size))
+
+ def forward(self, x, past_kv=None, use_cache=False):
+ B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
+
+ # calculate query, key, values for all heads in batch and move head forward to be the batch dim
+ q, k ,v = self.c_attn(x).split(self.n_embd, dim=2)
+ k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+ q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+ v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+
+ if past_kv is not None:
+ past_key = past_kv[0]
+ past_value = past_kv[1]
+ k = torch.cat((past_key, k), dim=-2)
+ v = torch.cat((past_value, v), dim=-2)
+
+ FULL_T = k.shape[-2]
+
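+ # when use_cache is set, hand back this layer's full (k, v) so the next decoding
+ # step can feed only its newest token and reuse these keys/values via past_kv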
+ if use_cache is True:
+ present = (k, v)
+ else:
+ present = None
+
+ # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
+ if self.flash:
+ # efficient attention using Flash Attention CUDA kernels
+ if past_kv is not None:
+ # When `past_kv` is provided, we're doing incremental decoding and `q.shape[2] == 1`: q only contains
+ # the query for the last token. scaled_dot_product_attention interprets this as the first token in the
+ # sequence, so if is_causal=True it will mask out all attention from it. This is not what we want, so
+ # to work around this we set is_causal=False.
+ is_causal = False
+ else:
+ is_causal = True
+
+ y = torch.nn.functional.scaled_dot_product_attention(q, k, v, dropout_p=self.dropout, is_causal=is_causal)
+ else:
+ # manual implementation of attention
+ att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
+ att = att.masked_fill(self.bias[:,:,FULL_T-T:FULL_T,:FULL_T] == 0, float('-inf'))
+ att = F.softmax(att, dim=-1)
+ att = self.attn_dropout(att)
+ y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
+ y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
+
+ # output projection
+ y = self.resid_dropout(self.c_proj(y))
+ return (y, present)
+
+class MLP(nn.Module):
+
+ def __init__(self, config):
+ super().__init__()
+ self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
+ self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
+ self.dropout = nn.Dropout(config.dropout)
+ self.gelu = nn.GELU()
+
+ def forward(self, x):
+ x = self.c_fc(x)
+ x = self.gelu(x)
+ x = self.c_proj(x)
+ x = self.dropout(x)
+ return x
+
+class Block(nn.Module):
+
+ def __init__(self, config, layer_idx):
+ super().__init__()
+ self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
+ self.attn = CausalSelfAttention(config)
+ self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
+ self.mlp = MLP(config)
+ self.layer_idx = layer_idx
+
+ def forward(self, x, past_kv=None, use_cache=False):
+ attn_output, prev_kvs = self.attn(self.ln_1(x), past_kv=past_kv, use_cache=use_cache)
+ x = x + attn_output
+ x = x + self.mlp(self.ln_2(x))
+ return (x, prev_kvs)
+
+@dataclass
+class GPTConfig:
+ block_size: int = 1024
+ input_vocab_size: int = 10_048
+ output_vocab_size: int = 10_048
+ n_layer: int = 12
+ n_head: int = 12
+ n_embd: int = 768
+ dropout: float = 0.0
+ bias: bool = True # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
+
+class GPT(nn.Module):
+
+ def __init__(self, config):
+ super().__init__()
+ assert config.input_vocab_size is not None
+ assert config.output_vocab_size is not None
+ assert config.block_size is not None
+ self.config = config
+
+ self.transformer = nn.ModuleDict(dict(
+ wte = nn.Embedding(config.input_vocab_size, config.n_embd),
+ wpe = nn.Embedding(config.block_size, config.n_embd),
+ drop = nn.Dropout(config.dropout),
+ h = nn.ModuleList([Block(config, idx) for idx in range(config.n_layer)]),
+ ln_f = LayerNorm(config.n_embd, bias=config.bias),
+ ))
+ self.lm_head = nn.Linear(config.n_embd, config.output_vocab_size, bias=False)
+
+ def get_num_params(self, non_embedding=True):
+ """
+ Return the number of parameters in the model.
+ For non-embedding count (default), the position embeddings get subtracted.
+ The token embeddings would too, except due to the parameter sharing these
+ params are actually used as weights in the final layer, so we include them.
+ """
+ n_params = sum(p.numel() for p in self.parameters())
+ if non_embedding:
+ n_params -= self.transformer.wte.weight.numel()
+ n_params -= self.transformer.wpe.weight.numel()
+ return n_params
+
+ def forward(self, idx, merge_context=False, past_kv=None, position_ids=None, use_cache=False):
+ device = idx.device
+ b, t = idx.size()
+ if past_kv is not None:
+ assert t == 1
+ tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
+ else:
+ if merge_context:
+ assert(idx.shape[1] >= 256+256+1)
+ t = idx.shape[1] - 256
+ else:
+ assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
+
+ # forward the GPT model itself
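+ # merge_context: the first two 256-token blocks of idx are embedded and summed
+ # element-wise, so the packed text/semantic-history context occupies 256
+ # positions instead of 512; the remaining tokens are embedded normally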
+ if merge_context:
+ tok_emb = torch.cat([
+ self.transformer.wte(idx[:,:256]) + self.transformer.wte(idx[:,256:256+256]),
+ self.transformer.wte(idx[:,256+256:])
+ ], dim=1)
+ else:
+ tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
+
+ if past_kv is None:
+ past_length = 0
+ past_kv = tuple([None] * len(self.transformer.h))
+ else:
+ past_length = past_kv[0][0].size(-2)
+
+ if position_ids is None:
+ position_ids = torch.arange(past_length, t + past_length, dtype=torch.long, device=device)
+ position_ids = position_ids.unsqueeze(0) # shape (1, t)
+ assert position_ids.shape == (1, t)
+
+ pos_emb = self.transformer.wpe(position_ids) # position embeddings of shape (1, t, n_embd)
+
+ x = self.transformer.drop(tok_emb + pos_emb)
+
+ new_kv = () if use_cache else None
+
+ for i, (block, past_layer_kv) in enumerate(zip(self.transformer.h, past_kv)):
+ x, kv = block(x, past_kv=past_layer_kv, use_cache=use_cache)
+
+ if use_cache:
+ new_kv = new_kv + (kv,)
+
+ x = self.transformer.ln_f(x)
+
+ # inference-time mini-optimization: only forward the lm_head on the very last position
+ logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
+
+ return (logits, new_kv)
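+
+
+# A minimal, illustrative sketch of driving this model directly (sizes are the
+# GPTConfig defaults, not any particular Bark checkpoint):
+#
+# config = GPTConfig()
+# model = GPT(config)
+# idx = torch.zeros(1, 16, dtype=torch.long)
+# logits, kv = model(idx, use_cache=True)  # logits only cover the last position
+# logits, kv = model(idx[:, -1:], past_kv=kv, use_cache=True)  # incremental decode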
diff --git a/bark/model_fine.py b/bark/model_fine.py
new file mode 100644
index 0000000000000000000000000000000000000000..6179a851319692b10df0d69b00910ad36cee8685
--- /dev/null
+++ b/bark/model_fine.py
@@ -0,0 +1,149 @@
+"""
+Much of this code is adapted from Andrej Karpathy's NanoGPT
+(https://github.com/karpathy/nanoGPT)
+"""
+from dataclasses import dataclass
+import math
+
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+
+from .model import GPT, GPTConfig, MLP
+
+
+class NonCausalSelfAttention(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ assert config.n_embd % config.n_head == 0
+ # key, query, value projections for all heads, but in a batch
+ self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
+ # output projection
+ self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
+ # regularization
+ self.attn_dropout = nn.Dropout(config.dropout)
+ self.resid_dropout = nn.Dropout(config.dropout)
+ self.n_head = config.n_head
+ self.n_embd = config.n_embd
+ self.dropout = config.dropout
+ # flash attention make GPU go brrrrr but support is only in PyTorch nightly and still a bit scary
+ self.flash = (
+ hasattr(torch.nn.functional, "scaled_dot_product_attention") and self.dropout == 0.0
+ )
+
+ def forward(self, x):
+ B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
+
+ # calculate query, key, values for all heads in batch and move head forward to be the batch dim
+ q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
+ k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+ q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+ v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+
+ # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
+ if self.flash:
+ # efficient attention using Flash Attention CUDA kernels
+ y = torch.nn.functional.scaled_dot_product_attention(
+ q, k, v, attn_mask=None, dropout_p=self.dropout, is_causal=False
+ )
+ else:
+ # manual implementation of attention
+ att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
+ att = F.softmax(att, dim=-1)
+ att = self.attn_dropout(att)
+ y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
+ y = (
+ y.transpose(1, 2).contiguous().view(B, T, C)
+ ) # re-assemble all head outputs side by side
+
+ # output projection
+ y = self.resid_dropout(self.c_proj(y))
+ return y
+
+
+class FineBlock(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.ln_1 = nn.LayerNorm(config.n_embd)
+ self.attn = NonCausalSelfAttention(config)
+ self.ln_2 = nn.LayerNorm(config.n_embd)
+ self.mlp = MLP(config)
+
+ def forward(self, x):
+ x = x + self.attn(self.ln_1(x))
+ x = x + self.mlp(self.ln_2(x))
+ return x
+
+
+class FineGPT(GPT):
+ def __init__(self, config):
+ super().__init__(config)
+ del self.lm_head
+ self.config = config
+ self.n_codes_total = config.n_codes_total
+ self.transformer = nn.ModuleDict(
+ dict(
+ wtes=nn.ModuleList(
+ [
+ nn.Embedding(config.input_vocab_size, config.n_embd)
+ for _ in range(config.n_codes_total)
+ ]
+ ),
+ wpe=nn.Embedding(config.block_size, config.n_embd),
+ drop=nn.Dropout(config.dropout),
+ h=nn.ModuleList([FineBlock(config) for _ in range(config.n_layer)]),
+ ln_f=nn.LayerNorm(config.n_embd),
+ )
+ )
+ self.lm_heads = nn.ModuleList(
+ [
+ nn.Linear(config.n_embd, config.output_vocab_size, bias=False)
+ for _ in range(config.n_codes_given, self.n_codes_total)
+ ]
+ )
+ for i in range(self.n_codes_total - config.n_codes_given):
+ self.transformer.wtes[i + 1].weight = self.lm_heads[i].weight
+
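+ # forward(pred_idx, idx) predicts codebook pred_idx from idx of shape
+ # (b, t, n_codes_total): embeddings of codebooks 0..pred_idx are summed as the
+ # input and the matching lm_head scores the output_vocab_size candidates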
+ def forward(self, pred_idx, idx):
+ device = idx.device
+ b, t, codes = idx.size()
+ assert (
+ t <= self.config.block_size
+ ), f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
+ assert pred_idx > 0, "cannot predict 0th codebook"
+ assert codes == self.n_codes_total, (b, t, codes)
+ pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) # shape (1, t)
+
+ # forward the GPT model itself
+ tok_embs = [
+ wte(idx[:, :, i]).unsqueeze(-1) for i, wte in enumerate(self.transformer.wtes)
+ ] # token embeddings of shape (b, t, n_embd)
+ tok_emb = torch.cat(tok_embs, dim=-1)
+ pos_emb = self.transformer.wpe(pos) # position embeddings of shape (1, t, n_embd)
+ x = tok_emb[:, :, :, : pred_idx + 1].sum(dim=-1)
+ x = self.transformer.drop(x + pos_emb)
+ for block in self.transformer.h:
+ x = block(x)
+ x = self.transformer.ln_f(x)
+ logits = self.lm_heads[pred_idx - self.config.n_codes_given](x)
+ return logits
+
+ def get_num_params(self, non_embedding=True):
+ """
+ Return the number of parameters in the model.
+ For non-embedding count (default), the position embeddings get subtracted.
+ The token embeddings would too, except due to the parameter sharing these
+ params are actually used as weights in the final layer, so we include them.
+ """
+ n_params = sum(p.numel() for p in self.parameters())
+ if non_embedding:
+ for wte in self.transformer.wtes:
+ n_params -= wte.weight.numel()
+ n_params -= self.transformer.wpe.weight.numel()
+ return n_params
+
+
+@dataclass
+class FineGPTConfig(GPTConfig):
+ n_codes_total: int = 8
+ n_codes_given: int = 1
diff --git a/bark_infinity/HFCacheInfo.txt b/bark_infinity/HFCacheInfo.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9269acd8610945110453fb7bdd62c71c1f30680c
--- /dev/null
+++ b/bark_infinity/HFCacheInfo.txt
@@ -0,0 +1,72 @@
+HFCacheInfo(
+ size_on_disk=996180,
+ repos=frozenset({
+ CachedRepoInfo(
+ repo_id='bert-base-multilingual-cased',
+ repo_type='model',
+ repo_path=PosixPath('/home/jon/mamba_projects/june26_barki/bark/bark_infinity/data/models/unclassified/hub/models--bert-base-multilingual-cased'),
+ size_on_disk=996180,
+ nb_files=3,
+ revisions=frozenset({
+ CachedRevisionInfo(
+ commit_hash='fdfce55e83dbed325647a63e7e1f5de19f0382ba',
+ snapshot_path=PosixPath('/home/jon/mamba_projects/june26_barki/bark/bark_infinity/data/models/unclassified/hub/models--bert-base-multilingual-cased/snapshots/fdfce55e83dbed325647a63e7e1f5de19f0382ba'),
+ size_on_disk=996180,
+ files=frozenset({
+ CachedFileInfo(
+ file_name='vocab.txt',
+ file_path=PosixPath('/home/jon/mamba_projects/june26_barki/bark/bark_infinity/data/models/unclassified/hub/models--bert-base-multilingual-cased/snapshots/fdfce55e83dbed325647a63e7e1f5de19f0382ba/vocab.txt'),
+ blob_path=PosixPath('/home/jon/mamba_projects/june26_barki/bark/bark_infinity/data/models/unclassified/hub/models--bert-base-multilingual-cased/blobs/e837bab60a5d204e29622d127c2dafe508aa0731'),
+ size_on_disk=995526,
+ blob_last_accessed=1687920080.6168203,
+ blob_last_modified=1687832408.172939
+ ),
+ CachedFileInfo(
+ file_name='config.json',
+ file_path=PosixPath('/home/jon/mamba_projects/june26_barki/bark/bark_infinity/data/models/unclassified/hub/models--bert-base-multilingual-cased/snapshots/fdfce55e83dbed325647a63e7e1f5de19f0382ba/config.json'),
+ blob_path=PosixPath('/home/jon/mamba_projects/june26_barki/bark/bark_infinity/data/models/unclassified/hub/models--bert-base-multilingual-cased/blobs/b122e74db13b415ea824c074da33c1c44f0d13a3'),
+ size_on_disk=625,
+ blob_last_accessed=1687950514.9967504,
+ blob_last_modified=1687832408.4329393
+ ),
+ CachedFileInfo(
+ file_name='tokenizer_config.json',
+ file_path=PosixPath('/home/jon/mamba_projects/june26_barki/bark/bark_infinity/data/models/unclassified/hub/models--bert-base-multilingual-cased/snapshots/fdfce55e83dbed325647a63e7e1f5de19f0382ba/tokenizer_config.json'),
+ blob_path=PosixPath('/home/jon/mamba_projects/june26_barki/bark/bark_infinity/data/models/unclassified/hub/models--bert-base-multilingual-cased/blobs/e3c6d456fb2616f01a9a6cd01a1be1a36353ed22'),
+ size_on_disk=29,
+ blob_last_accessed=1687950514.9967504,
+ blob_last_modified=1687832408.3329394
+ )
+ }),
+ refs=frozenset({'main'}),
+ last_modified=1687832408.4329393
+ )
+ }),
+ last_accessed=1687950514.9967504,
+ last_modified=1687832408.4329393
+ ),
+ CachedRepoInfo(
+ repo_id='GitMylo/bark-voice-cloning',
+ repo_type='model',
+ repo_path=PosixPath('/home/jon/mamba_projects/june26_barki/bark/bark_infinity/data/models/unclassified/hub/models--GitMylo--bark-voice-cloning'),
+ size_on_disk=0,
+ nb_files=0,
+ revisions=frozenset({
+ CachedRevisionInfo(
+ commit_hash='c26e70f3311c6973ca86511dd18b6a8ee073e830',
+ snapshot_path=PosixPath('/home/jon/mamba_projects/june26_barki/bark/bark_infinity/data/models/unclassified/hub/models--GitMylo--bark-voice-cloning/snapshots/c26e70f3311c6973ca86511dd18b6a8ee073e830'),
+ size_on_disk=0,
+ files=frozenset(),
+ refs=frozenset({'main'}),
+ last_modified=1687857042.1168284
+ )
+ }),
+ last_accessed=1687883437.046821,
+ last_modified=1687857042.1168284
+ )
+ }),
+ warnings=[
+ CorruptedCacheException('Repo path is not a directory:
+/home/jon/mamba_projects/june26_barki/bark/bark_infinity/data/models/unclassified/hub/version.txt')
+ ]
+)
\ No newline at end of file
diff --git a/bark_infinity/__init__.py b/bark_infinity/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f3b4ba9f905ec7061af935e23c51c5471420ad7
--- /dev/null
+++ b/bark_infinity/__init__.py
@@ -0,0 +1,7 @@
+from .api import generate_audio, text_to_semantic, semantic_to_waveform, save_as_prompt
+from .generation import SAMPLE_RATE, preload_models
+
+
+from .api import generate_audio_long, render_npz_samples, list_speakers
+from .config import logger, console, get_default_values, load_all_defaults, VALID_HISTORY_PROMPT_DIRS
+
diff --git a/bark_infinity/__pycache__/__init__.cpython-310.pyc b/bark_infinity/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6ff76dc4b1380a41cb1cbd2a5556c3967721f99b
Binary files /dev/null and b/bark_infinity/__pycache__/__init__.cpython-310.pyc differ
diff --git a/bark_infinity/__pycache__/__init__.cpython-38.pyc b/bark_infinity/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0f5e803c178867231772b5c6307ced6daad6c191
Binary files /dev/null and b/bark_infinity/__pycache__/__init__.cpython-38.pyc differ
diff --git a/bark_infinity/__pycache__/api.cpython-310.pyc b/bark_infinity/__pycache__/api.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7bfe0c8e2a3353969ad1f746c49b413f3598a024
Binary files /dev/null and b/bark_infinity/__pycache__/api.cpython-310.pyc differ
diff --git a/bark_infinity/__pycache__/api.cpython-38.pyc b/bark_infinity/__pycache__/api.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..56313e5c912c3efab8441611d59d61d19e102b00
Binary files /dev/null and b/bark_infinity/__pycache__/api.cpython-38.pyc differ
diff --git a/bark_infinity/__pycache__/clonevoice.cpython-310.pyc b/bark_infinity/__pycache__/clonevoice.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..de854f5d20583f2166331465c989db5b16492696
Binary files /dev/null and b/bark_infinity/__pycache__/clonevoice.cpython-310.pyc differ
diff --git a/bark_infinity/__pycache__/clonevoice.cpython-38.pyc b/bark_infinity/__pycache__/clonevoice.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b1e6ef622f9a8bfb408475392201b343b2d78cc0
Binary files /dev/null and b/bark_infinity/__pycache__/clonevoice.cpython-38.pyc differ
diff --git a/bark_infinity/__pycache__/config.cpython-310.pyc b/bark_infinity/__pycache__/config.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3e735d154e4f50cde546531a259202fcdf0e8f65
Binary files /dev/null and b/bark_infinity/__pycache__/config.cpython-310.pyc differ
diff --git a/bark_infinity/__pycache__/config.cpython-38.pyc b/bark_infinity/__pycache__/config.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dca4dee01edccc3782c1f0ebe01af66c4a9eb101
Binary files /dev/null and b/bark_infinity/__pycache__/config.cpython-38.pyc differ
diff --git a/bark_infinity/__pycache__/debug.cpython-310.pyc b/bark_infinity/__pycache__/debug.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..528053a18bfee312c1f68f22cb5e6e8a78788cf8
Binary files /dev/null and b/bark_infinity/__pycache__/debug.cpython-310.pyc differ
diff --git a/bark_infinity/__pycache__/generation.cpython-310.pyc b/bark_infinity/__pycache__/generation.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..96d8395a12c92654d500b626a3ca901c5de658dd
Binary files /dev/null and b/bark_infinity/__pycache__/generation.cpython-310.pyc differ
diff --git a/bark_infinity/__pycache__/generation.cpython-38.pyc b/bark_infinity/__pycache__/generation.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..26d4f035e5e99c6639cf94298855d255179885e9
Binary files /dev/null and b/bark_infinity/__pycache__/generation.cpython-38.pyc differ
diff --git a/bark_infinity/__pycache__/model.cpython-310.pyc b/bark_infinity/__pycache__/model.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a29b675cefd61896038f36dcec3b15ed283ef0da
Binary files /dev/null and b/bark_infinity/__pycache__/model.cpython-310.pyc differ
diff --git a/bark_infinity/__pycache__/model.cpython-38.pyc b/bark_infinity/__pycache__/model.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..66e919a9d761c3860b2dbef6a25b228df07e278e
Binary files /dev/null and b/bark_infinity/__pycache__/model.cpython-38.pyc differ
diff --git a/bark_infinity/__pycache__/model_fine.cpython-310.pyc b/bark_infinity/__pycache__/model_fine.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..46d95c843b69e5b562adda03dd6a4be24797f824
Binary files /dev/null and b/bark_infinity/__pycache__/model_fine.cpython-310.pyc differ
diff --git a/bark_infinity/__pycache__/model_fine.cpython-38.pyc b/bark_infinity/__pycache__/model_fine.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..511850f96fa84f2510d7a2dbf9ba4219d1cc05a9
Binary files /dev/null and b/bark_infinity/__pycache__/model_fine.cpython-38.pyc differ
diff --git a/bark_infinity/__pycache__/text_processing.cpython-310.pyc b/bark_infinity/__pycache__/text_processing.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3cd814bd4e60c60885da7c2f4aeafd2268478c5a
Binary files /dev/null and b/bark_infinity/__pycache__/text_processing.cpython-310.pyc differ
diff --git a/bark_infinity/api.py b/bark_infinity/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..03c290086ae7ef5a6bc1371825c9b28e0ae50945
--- /dev/null
+++ b/bark_infinity/api.py
@@ -0,0 +1,2418 @@
+import os
+
+os.environ["HF_HOME"] = os.getenv(
+ "HF_HOME",
+ os.path.join(os.path.dirname(os.path.realpath(__file__)), "data", "models", "unclassified"),
+)
+
+from typing import Dict, Optional, Union
+
+import numpy as np
+
+from .generation import get_SUNO_USE_DIRECTML
+
+if get_SUNO_USE_DIRECTML() is True:
+ from .generation import (
+ codec_decode,
+ generate_coarse_amd_directml as generate_coarse,
+ generate_fine,
+ generate_text_semantic,
+ SAMPLE_RATE,
+ )
+else:
+ from .generation import (
+ codec_decode,
+ generate_coarse,
+ generate_fine,
+ generate_text_semantic,
+ SAMPLE_RATE,
+ )
+ from .clonevoice import wav_to_semantics, generate_fine_from_wav, quick_clone
+from .config import (
+ logger,
+ console,
+ console_file,
+ get_default_values,
+ load_all_defaults,
+ VALID_HISTORY_PROMPT_DIRS,
+)
+from scipy.io.wavfile import write as write_wav
+from huggingface_hub import scan_cache_dir
+import scipy
+
+
+import tempfile
+import copy
+
+
+import re
+import torch
+import datetime
+import random
+import sys
+
+
+from torch.utils import collect_env
+
+
+import time
+from bark_infinity import generation
+
+from pathvalidate import sanitize_filename, sanitize_filepath
+
+from rich.pretty import pprint
+from rich.table import Table
+
+from collections import defaultdict
+from tqdm import tqdm
+
+from bark_infinity import text_processing
+
+
+import ctypes
+from pydub import AudioSegment
+
+import ffmpeg_downloader as ffdl
+
+
+global gradio_try_to_cancel
+global done_cancelling
+
+
+gradio_try_to_cancel = False
+done_cancelling = False
+from devtools import debug
+
+
+def numpy_report():
+ os.environ["MKL_VERBOSE"] = "1"
+
+ import io
+ import contextlib
+
+ import numpy as np
+
+ # np.show_config() prints its report to stdout and returns None, so capture the output
+ buffer = io.StringIO()
+ with contextlib.redirect_stdout(buffer):
+ np.show_config()
+ status_report_string = buffer.getvalue()
+
+ del os.environ["MKL_VERBOSE"]
+
+ return status_report_string
+
+
+def cuda_status_report():
+ # print(torch.__config__.show(), torch.cuda.get_device_properties(0))
+ status_report_string = "=== torch.__config__.show() ===\n"
+ status_report_string += torch.__config__.show()
+ status_report_string += "\n=== torch.cuda.get_device_properties(0) ===\n"
+ status_report_string += str(torch.cuda.get_device_properties(0))
+
+ # pytorch/torch/utils/collect_env.py get_pretty_env_info()
+ status_report_string += "\n=== torch.utils.collect_env.get_pretty_env_info() ===\n"
+ status_report_string += collect_env.get_pretty_env_info()
+
+ return status_report_string
+
+
+def gpu_status_report(quick=False, gpu_no_details=False):
+ status_report_string = ""
+
+ if torch.cuda.is_available():
+ device = torch.device("cuda")
+
+ if gpu_no_details:
+ status_report_string += f"{torch.cuda.get_device_name(device)}\n"
+ else:
+ status_report_string += "=== GPU Information ===\n"
+ status_report_string += f"GPU Device: {torch.cuda.get_device_name(device)}\n"
+ if not quick:
+ status_report_string += f"Number of GPUs: {torch.cuda.device_count()}\n"
+ status_report_string += f"Current GPU id: {torch.cuda.current_device()}\n"
+ status_report_string += (
+ f"GPU Capability: {torch.cuda.get_device_capability(device)}\n"
+ )
+ status_report_string += f"Supports Tensor Cores: {torch.cuda.get_device_properties(device).major >= 7}\n"
+
+ props = torch.cuda.get_device_properties(device)
+ status_report_string += f"Total memory: {props.total_memory / (1024 ** 3)} GB\n"
+
+ if not quick:
+ status_report_string += f"GPU Cores: {props.multi_processor_count}\n"
+
+ status_report_string += "\n=== Current GPU Memory ===\n"
+
+ current_memory_allocated = torch.cuda.memory_allocated(device) / 1e9
+ status_report_string += f"Current memory allocated: {current_memory_allocated} GB\n"
+
+ max_memory_allocated = torch.cuda.max_memory_allocated(device) / 1e9
+ status_report_string += (
+ f"Max memory allocated during run: {max_memory_allocated} GB\n"
+ )
+
+ status_report_string += f"CUDA Version: {torch.version.cuda}\n"
+ status_report_string += f"PyTorch Version: {torch.__version__}\n"
+
+ else:
+ if gpu_no_details:
+ status_report_string += "CPU or non CUDA device.\n"
+ else:
+ status_report_string += "No CUDA device is detected.\n"
+
+ return status_report_string
+
+
+def gpu_memory_report(quick=False):
+ status_report_string = ""
+
+ if torch.cuda.is_available():
+ device = torch.device("cuda")
+
+ status_report_string += "=== CUDA Memory Summary ===\n"
+ status_report_string += torch.cuda.memory_summary(device)
+
+ else:
+ status_report_string += "No CUDA device is detected.\n"
+
+ return status_report_string
+
+
+def gpu_max_memory():
+ if torch.cuda.is_available():
+ device = torch.device("cuda")
+ props = torch.cuda.get_device_properties(device)
+ return props.total_memory / (1024**3)
+
+ else:
+ return None
+
+
+def text_to_semantic(
+ text: str,
+ history_prompt: Optional[Union[Dict, str]] = None,
+ temp: float = 0.7,
+ silent: bool = False,
+):
+ """Generate semantic array from text.
+
+ Args:
+ text: text to be turned into audio
+ history_prompt: history choice for audio cloning
+ temp: generation temperature (1.0 more diverse, 0.0 more conservative)
+ silent: disable progress bar
+
+ Returns:
+ numpy semantic array to be fed into `semantic_to_waveform`
+ """
+
+ x_semantic = generate_text_semantic(
+ text,
+ history_prompt=history_prompt,
+ temp=temp,
+ silent=silent,
+ use_kv_caching=True,
+ )
+
+ return x_semantic
+
+
+def semantic_to_waveform(
+ semantic_tokens: np.ndarray,
+ history_prompt: Optional[Union[Dict, str]] = None,
+ temp: float = 0.7,
+ silent: bool = False,
+ output_full: bool = False,
+):
+ """Generate audio array from semantic input.
+
+ Args:
+ semantic_tokens: semantic token output from `text_to_semantic`
+ history_prompt: history choice for audio cloning
+ temp: generation temperature (1.0 more diverse, 0.0 more conservative)
+ silent: disable progress bar
+ output_full: return full generation to be used as a history prompt
+
+ Returns:
+ numpy audio array at sample frequency 24khz
+ """
+
+ coarse_tokens = generate_coarse(
+ semantic_tokens,
+ history_prompt=history_prompt,
+ temp=temp,
+ silent=silent,
+ use_kv_caching=True,
+ )
+ bark_coarse_tokens = coarse_tokens
+
+ fine_tokens = generate_fine(
+ coarse_tokens,
+ history_prompt=history_prompt,
+ temp=0.5,
+ )
+ bark_fine_tokens = fine_tokens
+
+ audio_arr = codec_decode(fine_tokens)
+ if output_full:
+ full_generation = {
+ "semantic_prompt": semantic_tokens,
+ "coarse_prompt": coarse_tokens,
+ "fine_prompt": fine_tokens,
+ }
+ return full_generation, audio_arr
+ return audio_arr
+
+
+def save_as_prompt(filepath, full_generation):
+ assert filepath.endswith(".npz")
+ assert isinstance(full_generation, dict)
+ assert "semantic_prompt" in full_generation
+ assert "coarse_prompt" in full_generation
+ assert "fine_prompt" in full_generation
+ np.savez(filepath, **full_generation)
+
+
+def generate_audio(
+ text: str,
+ history_prompt: Optional[Union[Dict, str]] = None,
+ text_temp: float = 0.7,
+ waveform_temp: float = 0.7,
+ silent: bool = False,
+ output_full: bool = False,
+):
+ """Generate audio array from input text.
+
+ Args:
+ text: text to be turned into audio
+ history_prompt: history choice for audio cloning
+ text_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
+ waveform_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
+ silent: disable progress bar
+ output_full: return full generation to be used as a history prompt
+
+ Returns:
+ numpy audio array at sample frequency 24khz
+ """
+ semantic_tokens = text_to_semantic(
+ text,
+ history_prompt=history_prompt,
+ temp=text_temp,
+ silent=silent,
+ )
+ out = semantic_to_waveform(
+ semantic_tokens,
+ history_prompt=history_prompt,
+ temp=waveform_temp,
+ silent=silent,
+ output_full=output_full,
+ )
+ if output_full:
+ full_generation, audio_arr = out
+ return full_generation, audio_arr
+ else:
+ audio_arr = out
+ return audio_arr
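+
+# Minimal usage sketch of the public API above (text and filename are illustrative;
+# write_wav and SAMPLE_RATE are already imported at the top of this module):
+#
+# audio = generate_audio("Hello, world!", history_prompt=None, text_temp=0.7, waveform_temp=0.7)
+# write_wav("hello.wav", SAMPLE_RATE, audio)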
+
+
+## ADDED BELOW
+
+
+def set_seed(seed: int = 0):
+ """Set the seed
+ seed = 0 Generate a random seed
+ seed = -1 Disable deterministic algorithms
+ 0 < seed < 2**32 Set the seed
+ Args:
+ seed: integer to use as seed
+ Returns:
+ integer used as seed
+ """
+
+ original_seed = seed
+
+ # See for more information: https://pytorch.org/docs/stable/notes/randomness.html
+ if seed == -1:
+ # Disable deterministic
+
+ print("Disabling deterministic algorithms")
+
+ torch.backends.cudnn.deterministic = False
+ torch.backends.cudnn.benchmark = True
+
+ if "CUBLAS_WORKSPACE_CONFIG" in os.environ:
+ del os.environ["CUBLAS_WORKSPACE_CONFIG"]
+
+ torch.use_deterministic_algorithms(False)
+
+ else:
+ print("Enabling deterministic algorithms")
+
+ torch.backends.cudnn.deterministic = True
+ torch.backends.cudnn.benchmark = False
+
+ os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
+ torch.use_deterministic_algorithms(True)
+
+ if seed <= 0:
+ # Generate random seed
+ # Use default_rng() because it is independent of np.random.seed()
+ seed = np.random.default_rng().integers(1, 2**32 - 1)
+
+ assert 0 < seed < 2**32
+
+ np.random.seed(seed)
+ random.seed(seed)
+ torch.manual_seed(seed)
+ torch.cuda.manual_seed_all(seed)
+ os.environ["PYTHONHASHSEED"] = str(seed)
+
+ print(f"Set seed to {seed}")
+
+ return original_seed if original_seed != 0 else seed
+
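+# Usage sketch (illustrative): set_seed(0) picks and applies a random seed and
+# returns it, set_seed(1234) makes generation deterministic with that seed, and
+# set_seed(-1) turns deterministic algorithms back off.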
+
+# mostly just looks in different directories and handles fuzzier matching like not including the extension
+def process_history_prompt(user_history_prompt):
+ valid_directories_to_check = VALID_HISTORY_PROMPT_DIRS
+
+ if user_history_prompt is None:
+ return None
+
+ file_name, file_extension = os.path.splitext(user_history_prompt)
+ if not file_extension:
+ file_extension = ".npz"
+
+ full_path = f"{file_name}{file_extension}"
+
+ history_prompt_returned = None
+ if os.path.dirname(full_path): # Check if a directory is specified
+ if os.path.exists(full_path):
+ history_prompt_returned = full_path
+ else:
+ logger.error(f" >> Can't find speaker file at: {full_path}")
+ else:
+ for directory in valid_directories_to_check:
+ full_path_in_dir = os.path.join(directory, f"{file_name}{file_extension}")
+ if os.path.exists(full_path_in_dir):
+ history_prompt_returned = full_path_in_dir
+
+ if history_prompt_returned is None:
+ logger.error(f" >>! Can't find speaker file: {full_path} in: {valid_directories_to_check}")
+ return None
+
+ if not history_prompt_is_valid(history_prompt_returned):
+ logger.error(f" >>! Speaker file: {history_prompt_returned} is invalid, skipping.")
+ return None
+
+ return history_prompt_returned
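+
+# e.g. process_history_prompt("en_speaker_6") (an illustrative name) would search
+# VALID_HISTORY_PROMPT_DIRS for "en_speaker_6.npz" and return a matching, valid
+# path, or None if nothing is found.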
+
+
+def log_params(log_filepath, **kwargs):
+ from rich.console import Console
+ import os
+
+ if not isinstance(log_filepath, str) or not os.path.isdir(os.path.dirname(log_filepath)):
+ print(f"Invalid log_filepath: {log_filepath}. Log file was not created.")
+ return
+
+ file_console = Console(color_system=None)
+ with file_console.capture() as capture:
+ kwargs["history_prompt"] = kwargs.get("history_prompt_string", None)
+ kwargs["history_prompt_string"] = None
+
+ file_console.print(kwargs)
+ str_output = capture.get()
+
+ try:
+ log_filepath = generate_unique_filepath(log_filepath)
+ with open(log_filepath, "wt", encoding="utf-8") as log_file:
+ log_file.write(str_output)
+ except Exception as e:
+ print(f"An error occurred while trying to log generation parameters: {e}")
+
+
+def determine_output_filename(special_one_off_path=None, **kwargs):
+ if special_one_off_path:
+ return sanitize_filepath(special_one_off_path)
+
+ # normally generate a filename
+ output_dir = kwargs.get("output_dir", None)
+ output_filename = kwargs.get("output_filename", None)
+
+ # TODO: Offer a config for long clips to show only the original starting prompt. I prefer each clip named separately for easy reference.
+ text_prompt = kwargs.get("text_prompt", None) or kwargs.get("text", None) or ""
+ history_prompt = kwargs.get("history_prompt_string", None) or "random"
+ text_prompt = text_prompt.strip()
+ history_prompt = os.path.basename(history_prompt).replace(".npz", "")
+
+ history_prompt = history_prompt[:15].strip()
+
+ # A lot of characters pass the sanitize check that we still don't want in the filename
+ text_prompt = re.sub(r" ", "_", text_prompt) # replace spaces with underscores
+ # strip quotes, colons, semicolons, and other punctuation
+ text_prompt = re.sub(r'[^\w\s]|[:;\'"]', "", text_prompt)
+ text_prompt = re.sub(
+ r"[\U00010000-\U0010ffff]", "", text_prompt, flags=re.UNICODE
+ ) # Remove emojis
+ segment_number_text = None
+ hoarder_mode = kwargs.get("hoarder_mode", False)
+ if hoarder_mode:
+ segment_number = kwargs.get("segment_number")
+ if segment_number and kwargs.get("total_segments", 1) > 1:
+ segment_number_text = f"{str(segment_number).zfill(3)}_"
+
+ if output_filename is not None and output_filename.strip() != "":
+ base_output_filename = f"{output_filename}"
+ else:
+ # makes the filename unique which is good when just browsing via search
+ date_str = datetime.datetime.now().strftime("%y-%m%d-%H%M-%S")
+
+ truncated_text = re.sub(
+ r"[^a-zA-Z0-9]", "", text_prompt
+ ) # this is brutal but I'm sick of weird filename problems.
+ truncated_text = truncated_text[:15].strip()
+
+ base_output_filename = f"{truncated_text}-{date_str}-SPK-{history_prompt}"
+
+ if segment_number_text is not None:
+ base_output_filename = f"{segment_number_text}{base_output_filename}"
+
+ output_format = kwargs.get("output_format", None)
+
+ npz_only = kwargs.get("npz_only", False)
+
+ # print(f"output_format is {output_format}")
+ if output_format is not None and not npz_only:
+ if output_format in ["ogg", "flac", "mp4", "wav"]:
+ base_output_filename = f"{base_output_filename}.{output_format}"
+ else:
+ base_output_filename = f"{base_output_filename}.mp3"
+ elif npz_only:
+ base_output_filename = f"{base_output_filename}"
+
+ output_filepath = os.path.join(output_dir, base_output_filename)
+
+ os.makedirs(output_dir, exist_ok=True)
+
+ output_filepath = generate_unique_filepath(output_filepath)
+
+ return output_filepath
+
+
+def write_one_segment(audio_arr=None, full_generation=None, **kwargs):
+ filepath = determine_output_filename(**kwargs)
+ # print(f"Looks like filepath is {filepath} is okay?")
+ if full_generation is not None:
+ write_seg_npz(filepath, full_generation, **kwargs)
+ if audio_arr is not None and kwargs.get("segment_number", 1) != "base_history":
+ write_seg_wav(filepath, audio_arr, **kwargs)
+
+ hoarder_mode = kwargs.get("hoarder_mode", False)
+ dry_run = kwargs.get("dry_run", False)
+ # if hoarder_mode and not dry_run:
+ if not dry_run:
+ log_params(f"{filepath}_info.txt", **kwargs)
+ return filepath
+
+
+def generate_unique_dirpath(dirpath):
+ unique_dirpath = sanitize_filepath(dirpath)
+ base_name = os.path.basename(dirpath)
+ parent_dir = os.path.dirname(dirpath)
+ counter = 1
+ while os.path.exists(unique_dirpath):
+ unique_dirpath = os.path.join(parent_dir, f"{base_name}_{counter}")
+ counter += 1
+ return unique_dirpath
+
+
+def generate_unique_filepath(filepath):
+ unique_filename = sanitize_filepath(filepath)
+ name, ext = os.path.splitext(filepath)
+ counter = 1
+ while os.path.exists(unique_filename):
+ unique_filename = f"{name}_{counter}{ext}"
+ counter += 1
+ return unique_filename
+
+
+def write_seg_npz(filepath, full_generation, **kwargs):
+ # logger.debug(kwargs)
+
+ if kwargs.get("segment_number", 1) == "base_history":
+ filepath = f"{filepath}_orig_speaker.npz"
+
+ if not kwargs.get("dry_run", False):
+ filepath = generate_unique_filepath(filepath)
+ # np.savez_compressed(filepath, semantic_prompt = full_generation["semantic_prompt"], coarse_prompt = full_generation["coarse_prompt"], fine_prompt = full_generation["fine_prompt"])
+ if "semantic_prompt" in full_generation:
+ np.savez(
+ filepath,
+ semantic_prompt=full_generation["semantic_prompt"],
+ coarse_prompt=full_generation["coarse_prompt"],
+ fine_prompt=full_generation["fine_prompt"],
+ )
+ else:
+ print("No semantic prompt to save")
+ return filepath
+
+
+def write_seg_wav(filepath, audio_arr, **kwargs):
+ dry_run = kwargs.get("dry_run", False)
+ dry_text = "(dry run)" if dry_run else ""
+ if dry_run is not True:
+ filepath = generate_unique_filepath(filepath)
+ write_audiofile(filepath, audio_arr, **kwargs)
+
+
+def write_audiofile_old(output_filepath, audio_arr, **kwargs):
+ output_filepath = generate_unique_filepath(output_filepath)
+
+ dry_run = kwargs.get("dry_run", False)
+ dry_text = "(dry run)" if dry_run else ""
+
+ output_format = kwargs.get("output_format", None)
+
+ output_format_bitrate = kwargs.get("output_format_bitrate", None)
+
+ output_format_ffmpeg_parameters = kwargs.get("output_format_ffmpeg_parameters", None)
+
+ if output_format is None or output_format == "":
+ output_format = "mp3"
+
+ if output_format_bitrate is None or output_format_bitrate == "":
+ output_format_bitrate = "64k"
+
+ ffmpeg_parameters = None
+ if output_format_ffmpeg_parameters is not None and output_format_ffmpeg_parameters != "":
+ ffmpeg_parameters = output_format_ffmpeg_parameters
+
+ if output_format in ["mp3", "ogg", "flac", "mp4"]:
+ temp_wav = f"{output_filepath}.tmp.wav"
+ # print(f"temp_wav is {temp_wav}")
+ write_wav(temp_wav, SAMPLE_RATE, audio_arr) if not dry_run else None
+ if dry_run is not True:
+ audio = AudioSegment.from_wav(temp_wav)
+
+ # sample_rate, wav_sample = scipy_wavfile.read(temp_wav)
+ # print(f"sample_rate is {sample_rate}")
+ # audio = AudioSegment(data=wav_sample.tobytes(),
+ # sample_width=2,
+ # frame_rate=sample_rate, channels=1)
+
+ if output_format == "mp4":
+ audio.export(
+ output_filepath,
+ format="mp4",
+ codec="aac",
+ bitrate=output_format_bitrate,
+ )
+ else:
+ audio.export(output_filepath, format=output_format)
+ os.remove(temp_wav)
+ else:
+ write_wav(output_filepath, SAMPLE_RATE, audio_arr) if not dry_run else None
+
+ logger.info(f" .{output_format} saved to {output_filepath} {dry_text}")
+
+ """
+ if output_format in ['mp3', 'ogg', 'flac', 'mp4']:
+ with tempfile.NamedTemporaryFile(suffix=".tmp.wav") as temp:
+ temp_wav = temp.name
+ write_wav(temp_wav, SAMPLE_RATE, audio_arr) if not dry_run else None
+ if dry_run is not True:
+ audio = AudioSegment.from_wav(temp_wav)
+
+ # sample_rate, wav_sample = scipy.io.wavfile.read(temp_wav)
+ # audio = AudioSegment(data=wav_sample.tobytes(),
+ sample_width=2,
+ frame_rate=sample_rate, channels=1)
+
+ if output_format == 'mp4':
+ audio.export(output_filepath, format="mp4", codec="aac")
+ else:
+ audio.export(output_filepath, format=output_format)
+ else:
+ write_wav(output_filepath, SAMPLE_RATE, audio_arr) if not dry_run else None
+
+ logger.info(f" .{output_format} saved to {output_filepath} {dry_text}")
+ """
+
+
+def parse_ffmpeg_parameters(parameters):
+ # Split the parameters string based on 'QQQQQ'
+ parsed_parameters = parameters.split("QQQQQ")
+
+ # Replace 'DDDDD' with '-'
+ parsed_parameters = [param.replace("DDDDD", "-") for param in parsed_parameters]
+
+ # Strip leading/trailing white spaces from each parameter
+ parsed_parameters = [param.strip() for param in parsed_parameters]
+
+ # Print debug information
+ # print("Final command for ffmpeg (without QQQQQ, DDDDD replaced by -):")
+ print(" ".join(parsed_parameters))
+
+ return parsed_parameters
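+
+# Example: "DDDDDq:aQQQQQ2" is parsed to ["-q:a", "2"] -- 'QQQQQ' separates
+# arguments and 'DDDDD' stands in for a leading '-'.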
+
+
+def write_audiofile(output_filepath, audio_arr, **kwargs):
+ output_filepath = generate_unique_filepath(output_filepath)
+
+ dry_run = kwargs.get("dry_run", False)
+ dry_text = "(dry run)" if dry_run else ""
+
+ output_format = kwargs.get("output_format", "mp3")
+ output_format_bitrate = kwargs.get("output_format_bitrate", "64k")
+ output_format_ffmpeg_parameters = kwargs.get("output_format_ffmpeg_parameters")
+
+ ffmpeg_parameters = None
+ if output_format_ffmpeg_parameters is not None and output_format_ffmpeg_parameters != "":
+ # parse the user-supplied flags into a list pydub can pass straight to ffmpeg
+ ffmpeg_parameters = parse_ffmpeg_parameters(output_format_ffmpeg_parameters)
+
+ if output_format in ["mp3", "ogg", "flac", "mp4"]:
+ temp_wav = f"{output_filepath}.tmp.wav"
+ if not dry_run:
+ write_wav(temp_wav, SAMPLE_RATE, audio_arr)
+ audio = AudioSegment.from_wav(temp_wav)
+ if output_format == "mp4":
+ audio.export(
+ output_filepath,
+ format="mp4",
+ codec="aac",
+ bitrate=output_format_bitrate,
+ )
+ elif output_format_ffmpeg_parameters:
+ audio.export(
+ output_filepath,
+ format=output_format,
+ bitrate=output_format_bitrate,
+ parameters=ffmpeg_parameters,
+ )
+ else:
+ audio.export(output_filepath, format=output_format, bitrate=output_format_bitrate)
+ os.remove(temp_wav)
+ elif not dry_run:
+ write_wav(output_filepath, SAMPLE_RATE, audio_arr)
+
+ logger.info(f" .{output_format} saved to {output_filepath} {dry_text}")
+
+
+def call_with_non_none_params(func, **kwargs):
+ non_none_params = {key: value for key, value in kwargs.items() if value is not None}
+ return func(**non_none_params)
+
+
+def generate_audio_barki(
+ text: str,
+ **kwargs,
+):
+ """Generate audio array from input text.
+
+ Args:
+ text: text to be turned into audio
+ history_prompt: history choice for audio cloning
+ text_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
+ waveform_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
+ silent: disable progress bar
+ output_full: return full generation to be used as a history prompt
+
+
+ Returns:
+ numpy audio array at sample frequency 24khz
+ """
+ logger.debug(locals())
+ kwargs = load_all_defaults(**kwargs)
+
+ history_prompt = kwargs.get("history_prompt", None)
+ text_temp = kwargs.get("text_temp", None)
+ waveform_temp = kwargs.get("waveform_temp", None)
+ silent = kwargs.get("silent", None)
+ output_full = kwargs.get("output_full", None)
+
+ global gradio_try_to_cancel
+ global done_cancelling
+
+ seed = kwargs.get("seed", None)
+ if seed is not None:
+ set_seed(seed)
+
+ ## Semantic Options
+ semantic_temp = text_temp
+ if kwargs.get("semantic_temp", None):
+ semantic_temp = kwargs.get("semantic_temp")
+
+ semantic_seed = kwargs.get("semantic_seed", None)
+ if semantic_seed is not None:
+ set_seed(semantic_seed)
+
+ if gradio_try_to_cancel:
+ done_cancelling = True
+ return None, None
+
+ confused_travolta_mode = kwargs.get("confused_travolta_mode", False)
+ if confused_travolta_mode:
+ kwargs["semantic_allow_early_stop"] = False
+
+ semantic_tokens = None
+ bark_speaker_as_the_prompt = kwargs.get("bark_speaker_as_the_prompt", None)
+
+ if bark_speaker_as_the_prompt is not None:
+ bark_speaker_as_the_prompt = kwargs.get("bark_speaker_as_the_prompt")
+ bark_speaker_as_the_prompt = load_npz(bark_speaker_as_the_prompt)
+
+ if "semantic_prompt" in bark_speaker_as_the_prompt:
+ semantic_tokens = bark_speaker_as_the_prompt["semantic_prompt"]
+ else:
+ print(f"That voice file does not have semantic tokens.")
+
+ semantic_use_mirostat_sampling = kwargs.get("semantic_use_mirostat_sampling", None)
+ semantic_mirostat_tau = kwargs.get("semantic_mirostat_tau", None)
+ semantic_mirostat_learning_rate = kwargs.get("semantic_mirostat_learning_rate", None)
+ semantic_token_repeat_penalty = kwargs.get("semantic_token_repeat_penalty", None)
+ semantic_inverted_p = kwargs.get("semantic_inverted_p", None)
+ semantic_bottom_k = kwargs.get("semantic_bottom_k", None)
+
+ negative_text_prompt = kwargs.get("negative_text_prompt", None)
+ specific_npz_file_negative_prompt = kwargs.get("specific_npz_file_negative_prompt", None)
+
+ semantic_tokens = None
+
+ negative_tokens = None
+ negative_logits = None
+ negative_text_prompt_logits_scale = None
+ negative_text_prompt_divergence_scale = None
+
+ if negative_text_prompt is not None or specific_npz_file_negative_prompt is not None:
+ if negative_text_prompt is not None:
+ negative_text_prompt = negative_text_prompt.strip()
+ # print(f"---->\nnegative_text_prompt: {negative_text_prompt}")
+ # print(f"specific_npz_file_negative_prompt: {specific_npz_file_negative_prompt}")
+
+ negative_text_prompt_logits_scale = kwargs.get("negative_text_prompt_logits_scale", None)
+ negative_text_prompt_divergence_scale = kwargs.get(
+ "negative_text_prompt_divergence_scale", None
+ )
+
+ # print(f"negative_text_prompt_logits_scale: {negative_text_prompt_logits_scale}")
+ # print(f"negative_text_prompt_divergence_scale: {negative_text_prompt_divergence_scale}")
+
+ # negative_text_prompt_to_use = text
+ negative_text_prompt_to_use = ""
+ if (
+ negative_text_prompt is not None
+ and negative_text_prompt != ""
+ and len(negative_text_prompt) > 1
+ ):
+ negative_text_prompt_to_use = negative_text_prompt
+
+ # negative_history_prompt_to_use = history_prompt
+
+ negative_history_prompt_to_use = None
+
+ if (
+ specific_npz_file_negative_prompt is not None
+ and specific_npz_file_negative_prompt != ""
+ and len(specific_npz_file_negative_prompt) > 1
+ ):
+ negative_history_prompt_to_use = specific_npz_file_negative_prompt
+
+ negative_tokens, negative_logits = call_with_non_none_params(
+ generate_text_semantic,
+ text=negative_text_prompt_to_use,
+ history_prompt=negative_history_prompt_to_use,
+ temp=semantic_temp,
+ top_k=kwargs.get("semantic_top_k", None),
+ top_p=kwargs.get("semantic_top_p", None),
+ silent=silent,
+ min_eos_p=kwargs.get("semantic_min_eos_p", None),
+ max_gen_duration_s=kwargs.get("semantic_max_gen_duration_s", None),
+ allow_early_stop=kwargs.get("semantic_allow_early_stop", True),
+ # use_kv_caching=kwargs.get("semantic_use_kv_caching", True),
+ use_kv_caching=True,
+ semantic_use_mirostat_sampling=semantic_use_mirostat_sampling,
+ semantic_mirostat_tau=semantic_mirostat_tau,
+ semantic_mirostat_learning_rate=semantic_mirostat_learning_rate,
+ semantic_token_repeat_penalty=semantic_token_repeat_penalty,
+ semantic_inverted_p=semantic_inverted_p,
+ semantic_bottom_k=semantic_bottom_k,
+ return_logits=True,
+ )
+ # debug(f"negative_tokens: {negative_tokens}")
+ # debug(f"negative_logits: {negative_logits}")
+ else:
+ pass
+ # print(f"Not using negative_text_prompt or specific_npz_file_negative_prompt.")
+
+ if semantic_tokens is None:
+ semantic_tokens = call_with_non_none_params(
+ generate_text_semantic,
+ text=text,
+ history_prompt=history_prompt,
+ temp=semantic_temp,
+ top_k=kwargs.get("semantic_top_k", None),
+ top_p=kwargs.get("semantic_top_p", None),
+ silent=silent,
+ min_eos_p=kwargs.get("semantic_min_eos_p", None),
+ max_gen_duration_s=kwargs.get("semantic_max_gen_duration_s", None),
+ allow_early_stop=kwargs.get("semantic_allow_early_stop", True),
+ # use_kv_caching=kwargs.get("semantic_use_kv_caching", True),
+ use_kv_caching=True,
+ semantic_use_mirostat_sampling=semantic_use_mirostat_sampling,
+ semantic_mirostat_tau=semantic_mirostat_tau,
+ semantic_mirostat_learning_rate=semantic_mirostat_learning_rate,
+ semantic_token_repeat_penalty=semantic_token_repeat_penalty,
+ semantic_inverted_p=semantic_inverted_p,
+ semantic_bottom_k=semantic_bottom_k,
+ return_logits=False,
+ negative_tokens=negative_tokens,
+ negative_logits=negative_logits,
+ negative_text_prompt_logits_scale=negative_text_prompt_logits_scale,
+ negative_text_prompt_divergence_scale=negative_text_prompt_divergence_scale,
+ )
+
+ if generation.get_SUNO_USE_DIRECTML() is True:
+ generation.clean_models()
+
+ # print(f"semantic_tokens is {semantic_tokens}")
+
+ if gradio_try_to_cancel:
+ done_cancelling = True
+ return None, None
+
+ ## Coarse Options
+ coarse_temp = waveform_temp
+ if kwargs.get("coarse_temp", None):
+ coarse_temp = kwargs.get("coarse_temp")
+
+ coarse_seed = kwargs.get("coarse_seed", None)
+ if coarse_seed is not None:
+ set_seed(coarse_seed)
+
+ if gradio_try_to_cancel:
+ done_cancelling = True
+ return None, None
+
+ semantic_history_only = kwargs.get("semantic_history_only", False)
+ previous_segment_type = kwargs.get("previous_segment_type", "")
+ if previous_segment_type == "base_history" and semantic_history_only:
+ print(
+ f"previous_segment_type is base_history and semantic_history_only is True. Not forwarding history for for coarse and fine"
+ )
+ history_prompt = None
+
+ absolute_semantic_history_only = kwargs.get("absolute_semantic_history_only", False)
+ if absolute_semantic_history_only:
+ print(
+ f"absolute_semantic_history_only is True. Not forwarding history for for coarse and fine"
+ )
+ history_prompt = None
+
+ absolute_semantic_history_only_every_x = kwargs.get(
+ "absolute_semantic_history_only_every_x", None
+ )
+ if (
+ absolute_semantic_history_only_every_x is not None
+ and absolute_semantic_history_only_every_x > 0
+ ):
+ segment_number = kwargs.get("segment_number", None)
+ if segment_number is not None:
+ if segment_number % absolute_semantic_history_only_every_x == 0:
+ print(
+ f"segment_number {segment_number} is divisible by {absolute_semantic_history_only_every_x}. Not forwarding history for for coarse and fine"
+ )
+ history_prompt = None
+
+ coarse_tokens = call_with_non_none_params(
+ generate_coarse,
+ x_semantic=semantic_tokens,
+ history_prompt=history_prompt,
+ temp=coarse_temp,
+ top_k=kwargs.get("coarse_top_k", None),
+ top_p=kwargs.get("coarse_top_p", None),
+ silent=silent,
+ max_coarse_history=kwargs.get("coarse_max_coarse_history", None),
+ sliding_window_len=kwargs.get("coarse_sliding_window_len", None),
+ # use_kv_caching=kwargs.get("coarse_kv_caching", True),
+ use_kv_caching=True,
+ )
+
+ if generation.get_SUNO_USE_DIRECTML() is True:
+ generation.clean_models()
+
+ fine_temp = kwargs.get("fine_temp", 0.5)
+
+ fine_seed = kwargs.get("fine_seed", None)
+ if fine_seed is not None:
+ set_seed(fine_seed)
+
+ if gradio_try_to_cancel:
+ done_cancelling = True
+ return None, None
+ fine_tokens = call_with_non_none_params(
+ generate_fine,
+ x_coarse_gen=coarse_tokens,
+ history_prompt=history_prompt,
+ temp=fine_temp,
+ silent=silent,
+ )
+ if generation.get_SUNO_USE_DIRECTML() is True:
+ generation.clean_models()
+
+ if gradio_try_to_cancel:
+ done_cancelling = True
+ return None, None
+ audio_arr = codec_decode(fine_tokens)
+ full_generation = {
+ "semantic_prompt": semantic_tokens,
+ "coarse_prompt": coarse_tokens,
+ "fine_prompt": fine_tokens,
+ }
+ if generation.get_SUNO_USE_DIRECTML() is True:
+ generation.clean_models()
+ if gradio_try_to_cancel:
+ done_cancelling = True
+ return None, None
+
+ hoarder_mode = kwargs.get("hoarder_mode", None)
+ total_segments = kwargs.get("total_segments", 1)
+ if hoarder_mode and (total_segments > 1):
+ kwargs["text"] = text
+ write_one_segment(audio_arr, full_generation, **kwargs)
+
+ if output_full:
+ return full_generation, audio_arr
+
+ return audio_arr
+
+
+def generate_audio_sampling_mods_old(
+ text: str,
+ **kwargs,
+):
+ """Generate audio array from input text.
+
+ Args:
+ text: text to be turned into audio
+ history_prompt: history choice for audio cloning
+ text_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
+ waveform_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
+ silent: disable progress bar
+ output_full: return full generation to be used as a history prompt
+
+
+ Returns:
+ numpy audio array at sample frequency 24khz
+ """
+ logger.debug(locals())
+ kwargs = load_all_defaults(**kwargs)
+
+ history_prompt = kwargs.get("history_prompt", None)
+ text_temp = kwargs.get("text_temp", None)
+ waveform_temp = kwargs.get("waveform_temp", None)
+ silent = kwargs.get("silent", None)
+ output_full = kwargs.get("output_full", None)
+
+ global gradio_try_to_cancel
+ global done_cancelling
+
+ seed = kwargs.get("seed", None)
+ if seed is not None:
+ set_seed(seed)
+
+ ## Semantic Options
+ semantic_temp = text_temp
+ if kwargs.get("semantic_temp", None):
+ semantic_temp = kwargs.get("semantic_temp")
+
+ semantic_seed = kwargs.get("semantic_seed", None)
+ if semantic_seed is not None:
+ set_seed(semantic_seed)
+
+ if gradio_try_to_cancel:
+ done_cancelling = True
+ return None, None
+
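+ # Experimental negative prompting: first generate tokens and logits for a
+ # negative text prompt and/or negative speaker .npz, then pass them into the
+ # main semantic generation call below to steer the output away from them.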
+ negative_text_prompt = kwargs.get("negative_text_prompt", None)
+
+ negative_text_prompt = negative_text_prompt.strip()
+
+ specific_npz_file_negative_prompt = kwargs.get("specific_npz_file_negative_prompt", None)
+
+ semantic_tokens = None
+
+ return_logits = False
+ negative_tokens = None
+ negative_logits = None
+ negative_text_prompt_logits_scale = None
+ negative_text_prompt_divergence_scale = None
+
+ if negative_text_prompt is not None or specific_npz_file_negative_prompt is not None:
+ print(f"negative_text_prompt: {negative_text_prompt}")
+ print(f"specific_npz_file_negative_prompt: {specific_npz_file_negative_prompt}")
+
+ negative_text_prompt_logits_scale = kwargs.get("negative_text_prompt_logits_scale", None)
+ negative_text_prompt_divergence_scale = kwargs.get(
+ "negative_text_prompt_divergence_scale", None
+ )
+
+ print(f"negative_text_prompt_logits_scale: {negative_text_prompt_logits_scale}")
+ print(f"negative_text_prompt_divergence_scale: {negative_text_prompt_divergence_scale}")
+
+ negative_text_prompt_to_use = text
+ if (
+ negative_text_prompt is not None
+ and negative_text_prompt != ""
+ and len(negative_text_prompt) > 1
+ ):
+ negative_text_prompt_to_use = negative_text_prompt
+
+ negative_history_prompt_to_use = history_prompt
+
+ if (
+ specific_npz_file_negative_prompt is not None
+ and specific_npz_file_negative_prompt != ""
+ and len(specific_npz_file_negative_prompt) > 1
+ ):
+ negative_history_prompt_to_use = specific_npz_file_negative_prompt
+
+ negative_tokens, negative_logits = call_with_non_none_params(
+ generate_text_semantic,
+ text=negative_text_prompt_to_use,
+ history_prompt=negative_history_prompt_to_use,
+ temp=semantic_temp,
+ top_k=kwargs.get("semantic_top_k", None),
+ top_p=kwargs.get("semantic_top_p", None),
+ silent=silent,
+ min_eos_p=kwargs.get("semantic_min_eos_p", None),
+ max_gen_duration_s=kwargs.get("semantic_max_gen_duration_s", None),
+ allow_early_stop=kwargs.get("semantic_allow_early_stop", True),
+ # use_kv_caching=kwargs.get("semantic_use_kv_caching", True),
+ use_kv_caching=True,
+ banned_tokens=kwargs.get("semantic_banned_tokens", None),
+ absolute_banned_tokens=kwargs.get("semantic_absolute_banned_tokens", None),
+ outside_banned_penalty=kwargs.get("semantic_outside_banned_penalty", None),
+ target_distribution=kwargs.get("semantic_target_distribution", None),
+ target_k_smoothing_factor=kwargs.get("semantic_target_k_smoothing_factor", None),
+ target_scaling_factor=kwargs.get("semantic_target_scaling_factor", None),
+ history_prompt_distribution=kwargs.get("semantic_history_prompt_distribution", None),
+ history_prompt_k_smoothing_factor=kwargs.get(
+ "semantic_history_prompt_k_smoothing_factor", None
+ ),
+ history_prompt_scaling_factor=kwargs.get(
+ "semantic_history_prompt_scaling_factor", None
+ ),
+ history_prompt_average_distribution=kwargs.get(
+ "semantic_history_prompt_average_distribution", None
+ ),
+ history_prompt_average_k_smoothing_factor=kwargs.get(
+ "semantic_history_prompt_average_k_smoothing_factor", None
+ ),
+ history_prompt_average_scaling_factor=kwargs.get(
+ "semantic_history_prompt_average_scaling_factor", None
+ ),
+ target_outside_default_penalty=kwargs.get(
+ "semantic_target_outside_default_penalty", None
+ ),
+ target_outside_outlier_penalty=kwargs.get(
+ "semantic_target_outside_outlier_penalty", None
+ ),
+ history_prompt_unique_voice_penalty=kwargs.get(
+ "semantic_history_prompt_unique_voice_penalty", None
+ ),
+ consider_common_threshold=kwargs.get("semantic_consider_common_threshold", None),
+ history_prompt_unique_voice_threshold=kwargs.get(
+ "semantic_history_prompt_unique_voice_threshold", None
+ ),
+ return_logits=True,
+ )
+ else:
+ print(f"no negative_text_prompt or specific_npz_file_negative_prompt")
+
+ semantic_tokens = call_with_non_none_params(
+ generate_text_semantic,
+ text=text,
+ history_prompt=history_prompt,
+ temp=semantic_temp,
+ top_k=kwargs.get("semantic_top_k", None),
+ top_p=kwargs.get("semantic_top_p", None),
+ silent=silent,
+ min_eos_p=kwargs.get("semantic_min_eos_p", None),
+ max_gen_duration_s=kwargs.get("semantic_max_gen_duration_s", None),
+ allow_early_stop=kwargs.get("semantic_allow_early_stop", True),
+ # use_kv_caching=kwargs.get("semantic_use_kv_caching", True),
+ use_kv_caching=True,
+ banned_tokens=kwargs.get("semantic_banned_tokens", None),
+ absolute_banned_tokens=kwargs.get("semantic_absolute_banned_tokens", None),
+ outside_banned_penalty=kwargs.get("semantic_outside_banned_penalty", None),
+ target_distribution=kwargs.get("semantic_target_distribution", None),
+ target_k_smoothing_factor=kwargs.get("semantic_target_k_smoothing_factor", None),
+ target_scaling_factor=kwargs.get("semantic_target_scaling_factor", None),
+ history_prompt_distribution=kwargs.get("semantic_history_prompt_distribution", None),
+ history_prompt_k_smoothing_factor=kwargs.get(
+ "semantic_history_prompt_k_smoothing_factor", None
+ ),
+ history_prompt_scaling_factor=kwargs.get("semantic_history_prompt_scaling_factor", None),
+ history_prompt_average_distribution=kwargs.get(
+ "semantic_history_prompt_average_distribution", None
+ ),
+ history_prompt_average_k_smoothing_factor=kwargs.get(
+ "semantic_history_prompt_average_k_smoothing_factor", None
+ ),
+ history_prompt_average_scaling_factor=kwargs.get(
+ "semantic_history_prompt_average_scaling_factor", None
+ ),
+ target_outside_default_penalty=kwargs.get("semantic_target_outside_default_penalty", None),
+ target_outside_outlier_penalty=kwargs.get("semantic_target_outside_outlier_penalty", None),
+ history_prompt_unique_voice_penalty=kwargs.get(
+ "semantic_history_prompt_unique_voice_penalty", None
+ ),
+ consider_common_threshold=kwargs.get("semantic_consider_common_threshold", None),
+ history_prompt_unique_voice_threshold=kwargs.get(
+ "semantic_history_prompt_unique_voice_threshold", None
+ ),
+ return_logits=False,
+ negative_tokens=negative_tokens,
+ negative_logits=negative_logits,
+ negative_text_prompt_logits_scale=negative_text_prompt_logits_scale,
+ negative_text_prompt_divergence_scale=negative_text_prompt_divergence_scale,
+ )
+
+ if gradio_try_to_cancel:
+ done_cancelling = True
+ return None, None
+
+ ## Coarse Options
+ coarse_temp = waveform_temp
+ if kwargs.get("coarse_temp", None):
+ coarse_temp = kwargs.get("coarse_temp")
+
+ coarse_seed = kwargs.get("coarse_seed", None)
+ if coarse_seed is not None:
+ set_seed(coarse_seed)
+
+ if gradio_try_to_cancel:
+ done_cancelling = True
+ return None, None
+
+ semantic_history_only = kwargs.get("semantic_history_only", False)
+ previous_segment_type = kwargs.get("previous_segment_type", "")
+ if previous_segment_type == "base_history" and semantic_history_only is True:
+ print(
+ f"previous_segment_type is base_history and semantic_history_only is True. Not forwarding history for for coarse and fine"
+ )
+ history_prompt = None
+
+ absolute_semantic_history_only = kwargs.get("absolute_semantic_history_only", False)
+ if absolute_semantic_history_only:
+ print(
+ f"absolute_semantic_history_only is True. Not forwarding history for for coarse and fine"
+ )
+ history_prompt = None
+
+ absolute_semantic_history_only_every_x = kwargs.get(
+ "absolute_semantic_history_only_every_x", None
+ )
+ if (
+ absolute_semantic_history_only_every_x is not None
+ and absolute_semantic_history_only_every_x > 0
+ ):
+ segment_number = kwargs.get("segment_number", None)
+ if segment_number is not None:
+ if segment_number % absolute_semantic_history_only_every_x == 0:
+ print(
+ f"segment_number {segment_number} is divisible by {absolute_semantic_history_only_every_x}. Not forwarding history for for coarse and fine"
+ )
+ history_prompt = None
+
+ coarse_tokens = call_with_non_none_params(
+ generate_coarse,
+ x_semantic=semantic_tokens,
+ history_prompt=history_prompt,
+ temp=coarse_temp,
+ top_k=kwargs.get("coarse_top_k", None),
+ top_p=kwargs.get("coarse_top_p", None),
+ silent=silent,
+ max_coarse_history=kwargs.get("coarse_max_coarse_history", None),
+ sliding_window_len=kwargs.get("coarse_sliding_window_len", None),
+ # use_kv_caching=kwargs.get("coarse_kv_caching", True),
+ use_kv_caching=True,
+ )
+
+ fine_temp = kwargs.get("fine_temp", 0.5)
+
+ fine_seed = kwargs.get("fine_seed", None)
+ if fine_seed is not None:
+ set_seed(fine_seed)
+
+ if gradio_try_to_cancel:
+ done_cancelling = True
+ return None, None
+ fine_tokens = call_with_non_none_params(
+ generate_fine,
+ x_coarse_gen=coarse_tokens,
+ history_prompt=history_prompt,
+ temp=fine_temp,
+ silent=silent,
+ )
+
+ if gradio_try_to_cancel:
+ done_cancelling = True
+ return None, None
+ audio_arr = codec_decode(fine_tokens)
+ full_generation = {
+ "semantic_prompt": semantic_tokens,
+ "coarse_prompt": coarse_tokens,
+ "fine_prompt": fine_tokens,
+ }
+
+ if gradio_try_to_cancel:
+ done_cancelling = True
+ return None, None
+
+ hoarder_mode = kwargs.get("hoarder_mode", None)
+
+ force_write_segment = kwargs.get("force_write_segment", False)
+
+ total_segments = kwargs.get("total_segments", 1)
+ if (hoarder_mode and (total_segments > 1)) or force_write_segment:
+ kwargs["text"] = text
+ write_one_segment(audio_arr, full_generation, **kwargs)
+
+ if output_full:
+ return full_generation, audio_arr
+
+ return audio_arr
+
+
+def generate_audio_long_from_gradio(**kwargs):
+ full_generation_segments, audio_arr_segments, final_filename_will_be, clone_created_filepath = (
+ [],
+ [],
+ None,
+ None,
+ )
+
+ (
+ full_generation_segments,
+ audio_arr_segments,
+ final_filename_will_be,
+ clone_created_filepath,
+ ) = generate_audio_long(**kwargs)
+
+ # if generation.OFFLOAD_CPU:
+ # generation.clean_models()
+
+ return (
+ full_generation_segments,
+ audio_arr_segments,
+ final_filename_will_be,
+ clone_created_filepath,
+ )
+
+
+def generate_audio_long(
+ **kwargs,
+):
+ global gradio_try_to_cancel
+ global done_cancelling
+
+ kwargs = load_all_defaults(**kwargs)
+ logger.debug(locals())
+
+ history_prompt = None
+ history_prompt = kwargs.get("history_prompt", None)
+ kwargs["history_prompt"] = None
+
+ audio_file_as_history_prompt = None
+ audio_file_as_history_prompt = kwargs.get("audio_file_as_history_prompt", None)
+
+ clone_created_filepaths = []
+
+ audio_file_as_history_prompt_clone_only = kwargs.get(
+ "audio_file_as_history_prompt_clone_only", None
+ )
+
+ if audio_file_as_history_prompt_clone_only is not None:
+ audio_file_as_history_prompt = audio_file_as_history_prompt_clone_only
+
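+ # Quick voice clone: turn a raw audio file into a speaker .npz and use it as
+ # the history prompt; with the clone-only flag, return right after writing it.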
+ if audio_file_as_history_prompt is not None:
+ print(f"Audio File as the history_prompt: {audio_file_as_history_prompt}")
+ quick_voice_clone = quick_clone(audio_file_as_history_prompt)
+ kwargs_clone = copy.deepcopy(kwargs)
+ kwargs_clone["output_filename"] = os.path.basename(audio_file_as_history_prompt)
+ kwargs_clone["npz_only"] = "True"
+ clone_filepath = f"{determine_output_filename(**kwargs_clone)}_quick_clone"
+
+ quick_clone_filename = write_seg_npz(clone_filepath, quick_voice_clone, **kwargs_clone)
+ history_prompt = f"{quick_clone_filename}.npz"
+ kwargs["history_prompt_string"] = history_prompt
+ clone_created_filepaths = [history_prompt]
+
+ if audio_file_as_history_prompt_clone_only is not None:
+ return [], [], None, clone_created_filepaths
+
+ print(f"history_prompt: {history_prompt}")
+
+ silent = kwargs.get("silent", None)
+
+ full_generation_segments = []
+ audio_arr_segments = []
+
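+ # stable_mode_interval: 0 chains each segment's full generation as the next
+ # history prompt, 1 always reuses the original speaker, and >= 2 chains for
+ # that many segments before resetting back to the original speaker.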
+ stable_mode_interval = kwargs.get("stable_mode_interval", None)
+ if stable_mode_interval is None:
+ stable_mode_interval = 1
+
+ if stable_mode_interval < 0:
+ stable_mode_interval = 0
+
+ stable_mode_interval_counter = None
+
+ if stable_mode_interval >= 2:
+ stable_mode_interval_counter = stable_mode_interval
+
+ dry_run = kwargs.get("dry_run", False)
+
+ text_splits_only = kwargs.get("text_splits_only", False)
+
+ if text_splits_only:
+ dry_run = True
+
+ # yanked for now,
+ extra_confused_travolta_mode = kwargs.get("extra_confused_travolta_mode", None)
+
+ confused_travolta_mode = kwargs.get("confused_travolta_mode", None)
+
+ hoarder_mode = kwargs.get("hoarder_mode", None)
+
+ single_starting_seed = kwargs.get("single_starting_seed", None)
+ if single_starting_seed is not None:
+ kwargs["seed_return_value"] = set_seed(single_starting_seed)
+
+ # the old way of doing this
+ process_text_by_each = kwargs.get("process_text_by_each", None)
+ group_text_by_counting = kwargs.get("group_text_by_counting", None)
+
+ history_prompt_for_next_segment = None
+ base_history = None
+ if history_prompt is not None:
+ history_prompt_string = history_prompt
+ history_prompt = process_history_prompt(history_prompt)
+ if history_prompt is not None:
+ # base_history = np.load(history_prompt)
+ base_history = load_npz(history_prompt)
+
+ base_history = {key: base_history[key] for key in base_history.keys()}
+ kwargs["history_prompt_string"] = history_prompt_string
+ kwargs["previous_segment_type"] = "base_history"
+ history_prompt_for_next_segment = copy.deepcopy(
+ base_history
+ ) # just start from a dict for consistency
+ else:
+ logger.error(
+ f"Speaker {history_prompt} could not be found, looking in{VALID_HISTORY_PROMPT_DIRS}"
+ )
+
+ gradio_try_to_cancel = True
+ done_cancelling = True
+
+ return None, None, None, None
+
+ if group_text_by_counting is not None and process_text_by_each is not None:
+ audio_segments = chunk_up_text_prev(**kwargs)
+ else:
+ audio_segments = chunk_up_text(**kwargs)
+
+ if text_splits_only:
+ print("Nothing was generated, this is just text the splits!")
+ return None, None, None, None
+
+ # hoarder_mode writes a lot of files, so give every sample its own directory
+ if hoarder_mode and len(audio_segments) > 1:
+ output_dir = kwargs.get("output_dir", "bark_samples")
+ output_filename_will_be = determine_output_filename(**kwargs)
+ file_name, file_extension = os.path.splitext(output_filename_will_be)
+ output_dir_sub = os.path.basename(file_name)
+ output_dir = os.path.join(output_dir, output_dir_sub)
+ output_dir = generate_unique_dirpath(output_dir)
+ kwargs["output_dir"] = output_dir
+
+ if hoarder_mode and kwargs.get("history_prompt_string", False):
+ kwargs["segment_number"] = "base_history"
+ write_one_segment(audio_arr=None, full_generation=base_history, **kwargs)
+
+ full_generation, audio_arr = (None, None)
+
+ kwargs["output_full"] = True
+
+ # TODO MAKE THIS A PARAM
+ # doubled_audio_segments = []
+ # doubled_audio_segments = [item for item in audio_segments for _ in range(2)]
+ # audio_segments = doubled_audio_segments
+
+ kwargs["total_segments"] = len(audio_segments)
+
+ show_generation_times = kwargs.get("show_generation_times", None)
+
+ all_segments_start_time = time.time()
+
+ history_prompt_flipper = False
+ if len(audio_segments) < 1:
+ audio_segments.append("")
+
+ for i, segment_text in enumerate(audio_segments):
+ estimated_time = estimate_spoken_time(segment_text)
+ print(f"segment_text: {segment_text}")
+
+ prompt_text_prefix = kwargs.get("prompt_text_prefix", None)
+ if prompt_text_prefix is not None:
+ segment_text = f"{prompt_text_prefix} {segment_text}"
+
+ prompt_text_suffix = kwargs.get("prompt_text_suffix", None)
+ if prompt_text_suffix is not None:
+ segment_text = f"{segment_text} {prompt_text_suffix}"
+
+ kwargs["text_prompt"] = segment_text
+ timeest = f"{estimated_time:.2f}"
+ if estimated_time > 14 or estimated_time < 3:
+ timeest = f"[bold red]{estimated_time:.2f}[/bold red]"
+
+ current_iteration = (
+ str(kwargs["current_iteration"]) if "current_iteration" in kwargs else ""
+ )
+
+ output_iterations = kwargs.get("output_iterations", "")
+ iteration_text = ""
+ if len(audio_segments) == 1:
+ iteration_text = f"{current_iteration} of {output_iterations} iterations"
+
+ segment_number = i + 1
+ console.print(
+ f"--Segment {segment_number}/{len(audio_segments)}: est. {timeest}s ({iteration_text})"
+ )
+ # tqdm.write(f"--Segment {segment_number}/{len(audio_segments)}: est. {timeest}s")
+ # tqdm.set_postfix_str(f"--Segment {segment_number}/{len(audio_segments)}: est. {timeest}s")
+
+ if not silent:
+ print(f"{segment_text}")
+ kwargs["segment_number"] = segment_number
+
+ if dry_run is True:
+ full_generation, audio_arr = [], []
+ else:
+ separate_prompts = kwargs.get("separate_prompts", False)
+ separate_prompts_flipper = kwargs.get("separate_prompts_flipper", False)
+
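+ # separate_prompts is meant to give each segment a fresh voice; the flipper
+ # variant alternates between the current history and no history.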
+ if separate_prompts_flipper is True:
+ if separate_prompts is True:
+ # nice to get actual generation from each speaker
+ if history_prompt_flipper is True:
+ kwargs["history_prompt"] = None
+ history_prompt_for_next_segment = None
+ history_prompt_flipper = False
+ print(" ")
+ else:
+ kwargs["history_prompt"] = history_prompt_for_next_segment
+ history_prompt_flipper = True
+ else:
+ kwargs["history_prompt"] = history_prompt_for_next_segment
+
+ else:
+ if separate_prompts is True:
+ history_prompt_for_next_segment = None
+ print(" ")
+ else:
+ kwargs["history_prompt"] = history_prompt_for_next_segment
+
+ if gradio_try_to_cancel:
+ done_cancelling = True
+ print(" ")
+ return None, None, None, None
+
+ this_segment_start_time = time.time()
+
+ full_generation, audio_arr = generate_audio_barki(text=segment_text, **kwargs)
+
+ if gradio_try_to_cancel or full_generation is None or audio_arr is None:
+ # Hmn, cancelling and restarting seems to be a bit buggy
+ # let's try clearing out stuff
+ kwargs = {}
+ history_prompt_for_next_segment = None
+ base_history = None
+ full_generation = None
+ done_cancelling = True
+ print(" -----Bark Infinity Cancelled.>")
+ return None, None, None, None
+
+ if show_generation_times:
+ this_segment_end_time = time.time()
+ elapsed_time = this_segment_end_time - this_segment_start_time
+
+ time_finished = f"Segment Finished at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(this_segment_end_time))}"
+ time_taken = f"in {elapsed_time:.2f} seconds"
+ print(f" -->{time_finished} {time_taken}")
+
+ if base_history is None:
+ base_history = copy.deepcopy(full_generation)
+
+ logger.debug(
+ f"stable_mode_interval: {stable_mode_interval_counter} of {stable_mode_interval}"
+ )
+
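+ # Choose the history prompt for the next segment based on stable_mode_interval
+ # (see the notes where it is read from kwargs above).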
+ if stable_mode_interval == 0:
+ kwargs["previous_segment_type"] = "full_generation"
+ history_prompt_for_next_segment = copy.deepcopy(full_generation)
+
+ elif stable_mode_interval == 1:
+ kwargs["previous_segment_type"] = "base_history"
+ history_prompt_for_next_segment = copy.deepcopy(base_history)
+
+ elif stable_mode_interval >= 2:
+ if stable_mode_interval_counter == 1:
+ # reset to base history
+ stable_mode_interval_counter = stable_mode_interval
+ kwargs["previous_segment_type"] = "base_history"
+ history_prompt_for_next_segment = copy.deepcopy(base_history)
+ logger.info(
+ f"resetting to base history_prompt, again in {stable_mode_interval} chunks"
+ )
+ else:
+ stable_mode_interval_counter -= 1
+ kwargs["previous_segment_type"] = "full_generation"
+ history_prompt_for_next_segment = copy.deepcopy(full_generation)
+ else:
+ logger.error(
+ f"stable_mode_interval is {stable_mode_interval} and something has gone wrong."
+ )
+
+ return None, None, None, None
+
+ full_generation_segments.append(full_generation)
+ audio_arr_segments.append(audio_arr)
+
+ add_silence_between_segments = kwargs.get("add_silence_between_segments", 0.0)
+ if add_silence_between_segments > 0.0:
+ print(f"Adding {add_silence_between_segments} seconds of silence between segments.")
+ # silence = np.zeros(int(add_silence_between_segments * SAMPLE_RATE))
+ silence = np.zeros(int(add_silence_between_segments * SAMPLE_RATE), dtype=np.int16)
+
+ audio_arr_segments.append(silence)
+
+ if show_generation_times or True:  # note: "or True" makes this always run
+ all_segments_end_time = time.time()
+ elapsed_time = all_segments_end_time - all_segments_start_time
+
+ time_finished = f"All Audio Sections Finished at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(all_segments_end_time))}"
+ time_taken = f"in {elapsed_time:.2f} seconds"
+ print(f" -->{time_finished} {time_taken}")
+
+ if gradio_try_to_cancel:
+ done_cancelling = True
+ print("< Cancelled >")
+ return None, None, None, None
+
+ kwargs["segment_number"] = "final"
+ final_filename_will_be = determine_output_filename(**kwargs)
+ dry_run = kwargs.get("dry_run", None)
+ if not dry_run:
+ if len(audio_arr_segments) > 0:
+ write_one_segment(
+ audio_arr=np.concatenate(audio_arr_segments),
+ full_generation=full_generation_segments[0],
+ **kwargs,
+ )
+ else:
+ print("No audio to write. Something may have gone wrong.")
+ print(f"Saved to {final_filename_will_be}")
+
+ return (
+ full_generation_segments,
+ audio_arr_segments,
+ final_filename_will_be,
+ clone_created_filepaths,
+ )
+
+
+def play_superpack_track(superpack_filepath=None, one_random=True):
+ try:
+ npz_file = np.load(superpack_filepath, allow_pickle=True)
+
+ keys = list(npz_file.keys())
+ random_key = random.choice(keys)
+ random_prompt = npz_file[random_key].item()
+ coarse_tokens = random_prompt["coarse_prompt"]
+ fine_tokens = generate_fine(coarse_tokens)
+ audio_arr = codec_decode(fine_tokens)
+
+ return audio_arr
+ except Exception:
+ return None
+
+
+## TODO can I port the notebook tools somehow?
+
+
+def doctor_random_speaker_surgery(npz_filepath, gen_minor_variants=5):
+ # get directory and filename from npz_filepath
+ npz_file_directory, npz_filename = os.path.split(npz_filepath)
+
+ original_history_prompt = np.load(npz_filepath, allow_pickle=True)
+ semantic_prompt = original_history_prompt["semantic_prompt"]
+ original_semantic_prompt = copy.deepcopy(semantic_prompt)
+
+ starting_point = 128
+ ending_point = len(original_semantic_prompt) - starting_point
+
+ points = np.linspace(starting_point, ending_point, gen_minor_variants)
+
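+ # Split the original semantic prompt at several points and re-render coarse
+ # and fine tokens with randomized sampling settings to produce minor variants.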
+ i = 0
+ for starting_point in points:
+ starting_point = int(starting_point)
+ i += 1
+
+ new_semantic_from_beginning = copy.deepcopy(
+ original_semantic_prompt[:starting_point].astype(np.int32)
+ )
+ new_semantic_from_ending = copy.deepcopy(
+ original_semantic_prompt[starting_point:].astype(np.int32)
+ )
+
+ # worse than generating brand new random samples, typically
+ for semantic_prompt in [new_semantic_from_beginning, new_semantic_from_ending]:
+ # print(f"len(semantic_prompt): {len(semantic_prompt)}")
+ # print(f"starting_point: {starting_point}, ending_poinst: {ending_point}")
+
+ temp_coarse = random.uniform(0.3, 0.90)
+ top_k_coarse = None if random.random() < 1 / 3 else random.randint(25, 400)
+ top_p_coarse = None if random.random() < 1 / 3 else random.uniform(0.90, 0.97)
+
+ max_coarse_history_options = [
+ 630,
+ random.randint(500, 630),
+ random.randint(60, 500),
+ ]
+ max_coarse_history = random.choice(max_coarse_history_options)
+
+ coarse_tokens = generation.generate_coarse(
+ semantic_prompt,
+ temp=temp_coarse,
+ top_k=top_k_coarse,
+ top_p=top_p_coarse,
+ max_coarse_history=max_coarse_history,
+ )
+
+ temp_fine = random.uniform(0.3, 0.8)
+ fine_tokens = generation.generate_fine(coarse_tokens, temp=temp_fine)
+
+ history_prompt_render_variant = {
+ "semantic_prompt": semantic_prompt,
+ "coarse_prompt": coarse_tokens,
+ "fine_prompt": fine_tokens,
+ }
+
+ try:
+ audio_arr = generation.codec_decode(fine_tokens)
+ base_output_filename = os.path.splitext(npz_filename)[0] + f"_var_{i}.wav"
+ output_filepath = os.path.join(npz_file_directory, base_output_filename)
+ output_filepath = generate_unique_filepath(output_filepath)
+ print(f"output_filepath {output_filepath}")
+ print(
+ f" Rendering minor variant voice audio for {npz_filepath} to {output_filepath}"
+ )
+ write_seg_wav(output_filepath, audio_arr)
+
+ write_seg_npz(output_filepath, history_prompt_render_variant)
+ except Exception as e:
+ # show error
+ print(f" Error rendering variant audio: {e}")
+
+
+def load_npz(filename):
+ npz_data = np.load(filename, allow_pickle=True)
+
+ data_dict = {
+ "semantic_prompt": npz_data["semantic_prompt"],
+ "coarse_prompt": npz_data["coarse_prompt"],
+ "fine_prompt": npz_data["fine_prompt"],
+ }
+
+ npz_data.close()
+
+ return data_dict
+
+
+def render_npz_samples(
+ npz_directory="bark_infinity/assets/prompts/",
+ start_from=None,
+ double_up_history=False,
+ save_npz=False,
+ compression_mode=False,
+ gen_minor_variants=None,
+):
+ # Find all the .npz files
+
+ print(f"Rendering samples for speakers in: {npz_directory}")
+ npz_files = [f for f in os.listdir(npz_directory) if f.endswith(".npz")]
+
+ if start_from is None:
+ start_from = "fine_prompt"
+ compress_mode_data = []
+
+ for npz_file in npz_files:
+ npz_filepath = os.path.join(npz_directory, npz_file)
+
+ history_prompt = load_npz(npz_filepath)
+
+ if not history_prompt_is_valid(history_prompt):
+ print(f"Skipping invalid history prompt: {npz_filepath}")
+ print(history_prompt_detailed_report(history_prompt))
+ continue
+
+ semantic_tokens = history_prompt["semantic_prompt"]
+ coarse_tokens = history_prompt["coarse_prompt"]
+ fine_tokens = history_prompt["fine_prompt"]
+
+ # print(f"semantic_tokens.shape: {semantic_tokens.shape}")
+ # print(f"coarse_tokens.shape: {coarse_tokens.shape}")
+ # print(f"fine_tokens.shape: {fine_tokens.shape}")
+
+ # this is old and kind of useless, but I'll leave this in UI until I port the better stuff
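+ # start_from controls how much is regenerated: "pure_semantic" redoes everything,
+ # "semantic_prompt" regenerates coarse and fine, "coarse_prompt" regenerates only
+ # fine, and "fine_prompt" just decodes the stored fine tokens.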
+ if gen_minor_variants is None:
+ if start_from == "pure_semantic":
+ # code removed for now
+ semantic_tokens = generate_text_semantic(text=None, history_prompt=history_prompt)
+ coarse_tokens = generate_coarse(semantic_tokens, use_kv_caching=True)
+ fine_tokens = generate_fine(coarse_tokens)
+
+ elif start_from == "semantic_prompt":
+ coarse_tokens = generate_coarse(semantic_tokens, use_kv_caching=True)
+ fine_tokens = generate_fine(coarse_tokens)
+
+ elif start_from == "coarse_prompt":
+ fine_tokens = generate_fine(coarse_tokens)
+
+ elif start_from == "coarse_prompt_first_two_quantizers_decoded":
+ # just decode existing fine tokens
+ pass
+ elif start_from == "fine_prompt":
+ # just decode existing fine tokens
+ pass
+
+ history_prompt_render_variant = {
+ "semantic_prompt": semantic_tokens,
+ "coarse_prompt": coarse_tokens,
+ "fine_prompt": fine_tokens,
+ }
+
+ # Not great, but it's hooked up to the Gradio UI and does do something, so leave it for now
+ elif gen_minor_variants > 0: # gen_minor_variants quick and simple
+ print(f"Generating {gen_minor_variants} minor variants for {npz_file}")
+ gen_minor_variants = gen_minor_variants or 1
+ for i in range(gen_minor_variants):
+ temp_coarse = random.uniform(0.3, 0.9)
+ top_k_coarse = None if random.random() < 1 / 3 else random.randint(25, 400)
+ top_p_coarse = None if random.random() < 1 / 3 else random.uniform(0.8, 0.95)
+
+ max_coarse_history_options = [
+ 630,
+ random.randint(500, 630),
+ random.randint(60, 500),
+ ]
+ max_coarse_history = random.choice(max_coarse_history_options)
+
+ coarse_tokens = generate_coarse(
+ semantic_tokens,
+ temp=temp_coarse,
+ top_k=top_k_coarse,
+ top_p=top_p_coarse,
+ max_coarse_history=max_coarse_history,
+ )
+
+ temp_fine = random.uniform(0.3, 0.7)
+ fine_tokens = generate_fine(coarse_tokens, temp=temp_fine)
+
+ history_prompt_render_variant = {
+ "semantic_prompt": semantic_tokens,
+ "coarse_prompt": coarse_tokens,
+ "fine_prompt": fine_tokens,
+ }
+
+ try:
+ audio_arr = codec_decode(fine_tokens)
+ base_output_filename = os.path.splitext(npz_file)[0] + f"_var_{i}.wav"
+ output_filepath = os.path.join(npz_directory, base_output_filename)
+ output_filepath = generate_unique_filepath(output_filepath)
+ print(
+ f" Rendering minor variant voice audio for {npz_filepath} to {output_filepath}"
+ )
+ write_seg_wav(output_filepath, audio_arr)
+
+ write_seg_npz(output_filepath, history_prompt_render_variant)
+ except Exception as e:
+ print(f" Error rendering minor variant: {e}")
+
+ if not compression_mode:
+ start_from_txt = ""
+
+ if start_from == "semantic_prompt":
+ start_from_txt = "_W"
+ elif start_from == "coarse_prompt":
+ start_from_txt = "_S"
+ try:
+ # print(f"fine_tokens.shape final: {fine_tokens.shape}")
+ if start_from == "coarse_prompt_first_two_quantizers_decoded":
+ audio_arr = codec_decode(coarse_tokens)
+ else:
+ audio_arr = codec_decode(fine_tokens)
+ base_output_filename = os.path.splitext(npz_file)[0] + f"_{start_from_txt}_.wav"
+ output_filepath = os.path.join(npz_directory, base_output_filename)
+ output_filepath = generate_unique_filepath(output_filepath)
+ print(f" Rendering audio for {npz_filepath} to {output_filepath}")
+ write_seg_wav(output_filepath, audio_arr)
+ if save_npz and start_from != "fine_prompt":
+ write_seg_npz(output_filepath, history_prompt_render_variant)
+ except Exception as e:
+ print(f" ")
+ print(f" Error details: {e}")
+ elif compression_mode:
+ just_record_it = {
+ "semantic_prompt": None,
+ "coarse_prompt": coarse_tokens,
+ "fine_prompt": None,
+ }
+ compress_mode_data.append(just_record_it)
+ # compress_mode_data.append(history_prompt_render_variant)
+
+ # defunct
+ if compression_mode:
+ print(f"have {len(compress_mode_data)} samples")
+ output_filepath = os.path.join(npz_directory, "superpack.npz")
+ output_filepath = generate_unique_filepath(output_filepath)
+ with open(f"{output_filepath}", "wb") as f:
+ np.savez_compressed(
+ f,
+ **{f"dict_{i}": np.array([d]) for i, d in enumerate(compress_mode_data)},
+ )
+
+
+def resize_semantic_history(semantic_history, weight, max_len=256):
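+ # Keep the most recent tokens up to max_len * weight, padding with the
+ # semantic pad token when the history is shorter than the target length.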
+ new_len = int(max_len * weight)
+
+ semantic_history = semantic_history.astype(np.int64)
+ # Trim
+ if len(semantic_history) > new_len:
+ semantic_history = semantic_history[-new_len:]
+ # Pad
+ else:
+ semantic_history = np.pad(
+ semantic_history,
+ (0, new_len - len(semantic_history)),
+ constant_values=SEMANTIC_PAD_TOKEN,
+ mode="constant",
+ )
+
+ return semantic_history
+
+
+def estimate_spoken_time(text, wpm=150, threshold=15):
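+ # Rough estimate: strip [bracketed] tags, then assume `wpm` words per minute.
+ # The `threshold` parameter is currently unused.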
+ text_without_brackets = re.sub(r"\[.*?\]", "", text)
+
+ words = text_without_brackets.split()
+ word_count = len(words)
+ time_in_seconds = (word_count / wpm) * 60
+ return time_in_seconds
+
+
+def chunk_up_text(**kwargs):
+ text_prompt = kwargs["text_prompt"]
+ split_character_goal_length = kwargs["split_character_goal_length"]
+ split_character_max_length = kwargs["split_character_max_length"]
+ silent = kwargs.get("silent")
+
+ split_character_jitter = kwargs.get("split_character_jitter") or 0
+
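+ # Jitter randomizes the split sizes a little so repeated runs produce
+ # slightly different segment boundaries.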
+ if split_character_jitter > 0:
+ split_character_goal_length = random.randint(
+ split_character_goal_length - split_character_jitter,
+ split_character_goal_length + split_character_jitter,
+ )
+ split_character_max_length = random.randint(
+ split_character_max_length - split_character_jitter,
+ split_character_max_length + split_character_jitter,
+ )
+
+ audio_segments = text_processing.split_general_purpose(
+ text_prompt,
+ split_character_goal_length=split_character_goal_length,
+ split_character_max_length=split_character_max_length,
+ )
+
+ split_desc = f"Splitting long text aiming for {split_character_goal_length} chars max {split_character_max_length}"
+
+ if len(audio_segments) > 0 and not silent:
+ print_chunks_table(
+ audio_segments,
+ left_column_header="Words",
+ right_column_header=split_desc,
+ **kwargs,
+ )
+ return audio_segments
+
+
+def chunk_up_text_prev(**kwargs):
+ text_prompt = kwargs["text_prompt"]
+ process_text_by_each = kwargs["process_text_by_each"]
+ in_groups_of_size = kwargs["in_groups_of_size"]
+ group_text_by_counting = kwargs.get("group_text_by_counting", None)
+ split_type_string = kwargs.get("split_type_string", "")
+
+ silent = kwargs.get("silent")
+
+ audio_segments = text_processing.split_text(
+ text_prompt,
+ split_type=process_text_by_each,
+ split_type_quantity=in_groups_of_size,
+ split_type_string=split_type_string,
+ split_type_value_type=group_text_by_counting,
+ )
+
+ split_desc = f"Processing text by {process_text_by_each} grouping by {group_text_by_counting} in {in_groups_of_size}, str: {split_type_string} "
+
+ if len(audio_segments) > 0 and not silent:
+ print_chunks_table(
+ audio_segments,
+ left_column_header="Words",
+ right_column_header=split_desc,
+ **kwargs,
+ )
+ return audio_segments
+
+
+def print_chunks_table(
+ chunks: list,
+ left_column_header: str = "Words",
+ right_column_header: str = "Segment Text",
+ **kwargs,
+):
+ output_iterations = kwargs.get("output_iterations", "")
+ history_prompt_string = kwargs.get("history_prompt_string", "random")
+ current_iteration = str(kwargs["current_iteration"]) if "current_iteration" in kwargs else ""
+
+ iteration_text = ""
+ if output_iterations and current_iteration:
+ iteration_text = f"{current_iteration} of {output_iterations} iterations"
+
+ table = Table(
+ title=f" ({iteration_text}) Segment Breakdown (Speaker: {history_prompt_string})",
+ show_lines=True,
+ title_justify="left",
+ )
+ table.add_column("#", justify="right", style="magenta", no_wrap=True)
+ table.add_column(left_column_header, style="green")
+ table.add_column("Time Est", style="green")
+ table.add_column(right_column_header)
+
+ for i, chunk in enumerate(chunks, start=1):
+ time_est = estimate_spoken_time(chunk)
+ timeest = f"{time_est:.2f} s"
+ if time_est > 14:
+ timeest = f"!{timeest}!"
+ wordcount = str(len(chunk.split()))
+ charcount = str(len(chunk))
+ table.add_row(str(i), wordcount, f"{timeest}\n{charcount} chars", chunk)
+ console.print(table)
+
+
+LANG_CODE_DICT = {code: lang for lang, code in generation.SUPPORTED_LANGS}
+
+
+def gather_speakers(directory):
+ speakers = defaultdict(list)
+ unsupported_files = []
+
+ for root, dirs, files in os.walk(directory):
+ for filename in files:
+ if filename.endswith(".npz"):
+ match = re.match(r"^([a-z]{2})_.*", filename)
+ if match and match.group(1) in LANG_CODE_DICT:
+ speakers[match.group(1)].append(os.path.join(root, filename))
+ else:
+ unsupported_files.append(os.path.join(root, filename))
+
+ return speakers, unsupported_files
+
+
+def list_speakers():
+ all_speakers = defaultdict(list)
+ all_unsupported_files = []
+
+ for directory in VALID_HISTORY_PROMPT_DIRS:
+ speakers, unsupported_files = gather_speakers(directory)
+ all_speakers.update(speakers)
+ all_unsupported_files.extend(unsupported_files)
+
+ print_speakers(all_speakers, all_unsupported_files)
+
+ return all_speakers, all_unsupported_files
+
+
+def print_speakers(speakers, unsupported_files):
+ # Print speakers grouped by language code
+ for lang_code, files in speakers.items():
+ print(LANG_CODE_DICT[lang_code] + ":")
+ for file in files:
+ print(" " + file)
+
+ # Print unsupported files
+ print("Other:")
+ for file in unsupported_files:
+ print(" " + file)
+
+
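+ # Constants below mirror bark.generation so the report helpers can validate
+ # history prompts without reaching into that module's internals.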
+from collections import Counter
+
+CONTEXT_WINDOW_SIZE = 1024
+
+SEMANTIC_RATE_HZ = 49.9
+SEMANTIC_VOCAB_SIZE = 10_000
+
+CODEBOOK_SIZE = 1024
+N_COARSE_CODEBOOKS = 2
+N_FINE_CODEBOOKS = 8
+COARSE_RATE_HZ = 75
+
+SAMPLE_RATE = 24_000
+
+TEXT_ENCODING_OFFSET = 10_048
+SEMANTIC_PAD_TOKEN = 10_000
+TEXT_PAD_TOKEN = 129_595
+SEMANTIC_INFER_TOKEN = 129_599
+
+
+def generate_text_semantic_report(history_prompt, token_samples=3):
+ semantic_history = history_prompt["semantic_prompt"]
+
+ report = {"valid": True, "messages": []}
+
+ if not isinstance(semantic_history, np.ndarray) and not isinstance(
+ semantic_history, torch.Tensor
+ ):
+ report["valid"] = False
+ report["messages"].append(f"should be a numpy array but was {type(semantic_history)}.")
+
+ elif len(semantic_history.shape) != 1:
+ report["valid"] = False
+ report["messages"].append(
+ f"should be a 1d numpy array but shape was {semantic_history.shape}."
+ )
+
+ elif len(semantic_history) == 0:
+ report["valid"] = False
+ report["messages"].append("should not be empty.")
+
+ else:
+ if semantic_history.min() < 0:
+ report["valid"] = False
+ report["messages"].append(f"minimum value of 0, but it was {semantic_history.min()}.")
+ index = np.argmin(semantic_history)
+ surrounding = semantic_history[
+ max(0, index - token_samples) : min(len(semantic_history), index + token_samples)
+ ]
+ report["messages"].append(f"Surrounding tokens: {surrounding}")
+
+ elif semantic_history.max() >= SEMANTIC_VOCAB_SIZE + 1:
+ report["valid"] = False
+ report["messages"].append(
+ f"should have a maximum value less than {SEMANTIC_VOCAB_SIZE}, but it was {semantic_history.max()}."
+ )
+ index = np.argmax(semantic_history)
+ surrounding = semantic_history[
+ max(0, index - token_samples) : min(len(semantic_history), index + token_samples)
+ ]
+ report["messages"].append(f"Surrounding tokens: {surrounding}")
+
+ return report
+
+
+def generate_coarse_report(history_prompt, token_samples=3):
+ semantic_to_coarse_ratio = COARSE_RATE_HZ / SEMANTIC_RATE_HZ * N_COARSE_CODEBOOKS
+
+ semantic_history = history_prompt["semantic_prompt"]
+ coarse_history = history_prompt["coarse_prompt"]
+
+ report = {"valid": True, "messages": []}
+
+ if not isinstance(semantic_history, np.ndarray) and not isinstance(
+ semantic_history, torch.Tensor
+ ):
+ report["valid"] = False
+ report["messages"].append(f"should be a numpy array but it's a {type(semantic_history)}.")
+
+ elif len(semantic_history.shape) != 1:
+ report["valid"] = False
+ report["messages"].append(
+ f"should be a 1d numpy array but shape is {semantic_history.shape}."
+ )
+
+ elif len(semantic_history) == 0:
+ report["valid"] = False
+ report["messages"].append("should not be empty.")
+ else:
+ if semantic_history.min() < 0:
+ report["valid"] = False
+ report["messages"].append(
+ f"should have a minimum value of 0, but it was {semantic_history.min()}."
+ )
+ index = np.argmin(semantic_history)
+ surrounding = semantic_history[
+ max(0, index - token_samples) : min(len(semantic_history), index + token_samples)
+ ]
+ report["messages"].append(f"Surrounding tokens: {surrounding}")
+
+ elif semantic_history.max() >= SEMANTIC_VOCAB_SIZE:
+ report["valid"] = False
+ report["messages"].append(
+ f"should have a maximum value less than {SEMANTIC_VOCAB_SIZE}, but it was {semantic_history.max()}."
+ )
+ index = np.argmax(semantic_history)
+ surrounding = semantic_history[
+ max(0, index - token_samples) : min(len(semantic_history), index + token_samples)
+ ]
+ report["messages"].append(f"Surrounding tokens: {surrounding}")
+
+ if not isinstance(coarse_history, np.ndarray):
+ report["valid"] = False
+ report["messages"].append(f"should be a numpy array but it's a {type(coarse_history)}.")
+
+ elif len(coarse_history.shape) != 2:
+ report["valid"] = False
+ report["messages"].append(
+ f"should be a 2-dimensional numpy array but shape is {coarse_history.shape}."
+ )
+
+ elif coarse_history.shape[0] != N_COARSE_CODEBOOKS:
+ report["valid"] = False
+ report["messages"].append(
+ f"should have {N_COARSE_CODEBOOKS} rows, but it has {coarse_history.shape[0]}."
+ )
+
+ elif coarse_history.size == 0:
+ report["valid"] = False
+ report["messages"].append("The coarse history should not be empty.")
+
+ else:
+ if coarse_history.min() < 0:
+ report["valid"] = False
+ report["messages"].append(
+ f"should have a minimum value of 0, but it was {coarse_history.min()}."
+ )
+ indices = np.unravel_index(coarse_history.argmin(), coarse_history.shape)
+ surrounding = coarse_history[
+ max(0, indices[1] - token_samples) : min(
+ coarse_history.shape[1], indices[1] + token_samples
+ )
+ ]
+ report["messages"].append(f"Surrounding tokens in row {indices[0]}: {surrounding}")
+
+ elif coarse_history.max() >= CODEBOOK_SIZE:
+ report["valid"] = False
+ report["messages"].append(
+ f"should have a maximum value less than {CODEBOOK_SIZE}, but it was {coarse_history.max()}."
+ )
+ indices = np.unravel_index(coarse_history.argmax(), coarse_history.shape)
+ surrounding = coarse_history[
+ max(0, indices[1] - token_samples) : min(
+ coarse_history.shape[1], indices[1] + token_samples
+ )
+ ]
+ report["messages"].append(f"Surrounding tokens in row {indices[0]}: {surrounding}")
+
+ ratio = round(coarse_history.shape[1] / len(semantic_history), 1)
+ if ratio != round(semantic_to_coarse_ratio / N_COARSE_CODEBOOKS, 1):
+ report["valid"] = False
+ report["messages"].append(
+ f"ratio should be {round(semantic_to_coarse_ratio / N_COARSE_CODEBOOKS, 1)}, but it was {ratio}."
+ )
+
+ return report
+
+
+def generate_fine_report(history_prompt, token_samples=3):
+ fine_history = history_prompt["fine_prompt"]
+
+ report = {"valid": True, "messages": []}
+
+ if not isinstance(fine_history, np.ndarray):
+ report["valid"] = False
+ report["messages"].append(
+ f"fine_prompt should be a numpy array but it's a {type(fine_history)}."
+ )
+
+ elif len(fine_history.shape) != 2:
+ report["valid"] = False
+ report["messages"].append(
+ f"fine_prompt should be a 2-dimensional numpy array but shape is {fine_history.shape}."
+ )
+
+ elif fine_history.size == 0:
+ report["valid"] = False
+ report["messages"].append("fine_prompt should not be empty.")
+
+ else:
+ if fine_history.shape[0] != N_FINE_CODEBOOKS:
+ report["valid"] = False
+ report["messages"].append(
+ f"fine_prompt should have {N_FINE_CODEBOOKS} rows, but it has {fine_history.shape[0]}."
+ )
+
+ elif fine_history.min() < 0:
+ report["valid"] = False
+ report["messages"].append(
+ f"fine_prompt should have a minimum value of 0, but it was {fine_history.min()}."
+ )
+ indices = np.unravel_index(fine_history.argmin(), fine_history.shape)
+ surrounding = fine_history[
+ max(0, indices[1] - token_samples) : min(
+ fine_history.shape[1], indices[1] + token_samples
+ )
+ ]
+ report["messages"].append(f"Surrounding tokens in row {indices[0]}: {surrounding}")
+
+ elif fine_history.max() >= CODEBOOK_SIZE:
+ report["valid"] = False
+ report["messages"].append(
+ f"fine_prompt should have a maximum value less than {CODEBOOK_SIZE}, but it was {fine_history.max()}."
+ )
+ indices = np.unravel_index(fine_history.argmax(), fine_history.shape)
+ surrounding = fine_history[
+ max(0, indices[1] - token_samples) : min(
+ fine_history.shape[1], indices[1] + token_samples
+ )
+ ]
+ report["messages"].append(f"Surrounding tokens in row {indices[0]}: {surrounding}")
+
+ return report
+
+
+def display_history_prompt_report(report):
+ if report["valid"]:
+ print("valid")
+ else:
+ print("history_prompt failed the following checks:")
+ for i, message in enumerate(report["messages"], start=1):
+ print(f" Error {i}: {message}")
+
+
+def history_prompt_is_valid(history_prompt):
+ try:
+ history_prompt = generation._load_history_prompt(history_prompt)
+ except Exception as e:
+ print(f"Error: {str(e)}")
+ return False
+
+ semantic_report = generate_text_semantic_report(history_prompt)
+ coarse_report = generate_coarse_report(history_prompt)
+ fine_report = generate_fine_report(history_prompt)
+ return semantic_report["valid"] and coarse_report["valid"] and fine_report["valid"]
+
+
+def history_prompt_detailed_report(history_prompt, token_samples=3):
+ try:
+ history_prompt = generation._load_history_prompt(history_prompt)
+ except Exception as e:
+ print(f"Error: {str(e)}")
+ return
+
+ file_name = None
+ if isinstance(history_prompt, str):
+ file_name = history_prompt
+
+ if file_name:
+ print(f"\n>>{file_name}")
+
+ try:
+ text_semantic_report = generate_text_semantic_report(history_prompt, token_samples)
+ print("\n Semantic:")
+ display_history_prompt_report(text_semantic_report)
+ except Exception as e:
+ print(f"Error generating Text Semantic Report: {str(e)}")
+
+ try:
+ coarse_report = generate_coarse_report(history_prompt, token_samples)
+ print("\n Coarse:")
+ display_history_prompt_report(coarse_report)
+ except Exception as e:
+ print(f"Error generating Coarse Report: {str(e)}")
+
+ try:
+ fine_report = generate_fine_report(history_prompt, token_samples)
+ print("\n Fine:")
+ display_history_prompt_report(fine_report)
+ except Exception as e:
+ print(f"Error generating Fine Report: {str(e)}")
+
+
+def startup_status_report(quick=True, gpu_no_details=False):
+ status = gpu_status_report(quick=quick, gpu_no_details=gpu_no_details)
+
+ status += f"\nOFFLOAD_CPU: {generation.OFFLOAD_CPU} (Default is True)"
+ status += f"\nUSE_SMALL_MODELS: {generation.USE_SMALL_MODELS} (Default is False)"
+ status += f"\nGLOBAL_ENABLE_MPS (Apple): {generation.GLOBAL_ENABLE_MPS} (Default is False)"
+
+ gpu_memory = gpu_max_memory()
+ status += f"\nGPU Memory: {gpu_memory} GB"
+
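+ # On GPUs with roughly 2-4 GB of memory, warn the user and automatically
+ # enable half precision to reduce memory use.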
+ if gpu_memory is not None and gpu_memory < 4.1 and gpu_memory > 2.0:
+ status += f"\n WARNING: Your GPU memory is only {gpu_memory} GB. This is OK: enabling SUNO_HALF_PRECISION to save memory."
+ status += f"\n However, if your GPU does have > 6GB of memory, Bark may be using your integrated GPU instead of your main GPU."
+ status += f"\n Recommend using smaller/faster coarse model to increase speed on a weaker GPU, with only minor quality loss."
+ status += f"\n (Go to Setting Tab, then click Apply Settings, coarse_use_small should have defaulted to checked)."
+ status += f"\n If you are still getting memory errors, try closing all other applications. Bark can fit in 4GB, but it can be tight. If that fails you can use still use small text model (text_use_small parameter) but that does have a larger reduction in quality."
+ generation.SUNO_HALF_PRECISION = True
+
+ status += f"\nSUNO_HALF_PRECISION: {generation.SUNO_HALF_PRECISION} (Default is False)"
+ status += f"\nSUNO_HALF_BFLOAT16: {generation.SUNO_HALF_BFLOAT16} (Default is False)"
+ status += f"\nSUNO_DISABLE_COMPILE: {generation.SUNO_DISABLE_COMPILE} (Default is False)"
+
+ # generation.get_SUNO_USE_DIRECTML()
+ status += f"\nSUNO_USE_DIRECTML (AMD): {generation.SUNO_USE_DIRECTML} (Default is False)"
+ num_threads = torch.get_num_threads()
+ status += f"\nTorch Num CPU Threads: {num_threads}"
+
+ XDG = os.getenv("XDG_CACHE_HOME")
+ if XDG is not None:
+ status += f"\nXDG_CACHE_HOME (Model Override Directory) {os.getenv('XDG_CACHE_HOME')}"
+ status += (
+ f"\nBark Model Location: {generation.CACHE_DIR} (Env var 'XDG_CACHE_HOME' to override)"
+ )
+
+ hugging_face_home = os.getenv("HF_HOME")
+ if hugging_face_home:
+ status += f"\nHF_HOME: {hugging_face_home}"
+
+ # print ffmpeg variable status
+ status += f"\n\nFFmpeg status, this should say version 6.0"
+ try:
+ status += f"\nFFmpeg binaries directory: {ffdl.ffmpeg_version}"
+ status += f"\nFFmpeg Version: {ffdl.ffmpeg_version}"
+ status += f"\nFFmpeg Path: {ffdl.ffmpeg_path}"
+ status += f"\nFFprobe Path: {ffdl.ffprobe_path}"
+ status += f"\nFFplay Path: {ffdl.ffplay_path}\n"
+ except Exception as e:
+ status += f"\nError finding FFmpeg: {str(e)}\n"
+ status += """
+ Bark can't find ffmpeg. Try typing this in a command prompt:
+
+ ffdl install -U --add-path
+
+ You can also install ffmpeg.exe as a regular Windows program, and make sure the file ffmpeg.exe is in your PATH environment variable.
+ Basically, you want to be able to type 'ffmpeg -version' in a command prompt, in the same place you type 'python bark_webui.py'.
+ """
+
+ return status
+
+
+def hugging_face_cache_report():
+ hf_cache_info = scan_cache_dir()
+ return hf_cache_info
diff --git a/bark_infinity/api_in_dev.py b/bark_infinity/api_in_dev.py
new file mode 100644
index 0000000000000000000000000000000000000000..d10f0242fb74c60e43923c584fc4b6d69fdb5307
--- /dev/null
+++ b/bark_infinity/api_in_dev.py
@@ -0,0 +1,1105 @@
+from typing import Dict, Optional, Union
+
+import numpy as np
+from .generation import codec_decode, generate_coarse, generate_fine, generate_text_semantic, SAMPLE_RATE
+from .config import logger, console, console_file, get_default_values, load_all_defaults, VALID_HISTORY_PROMPT_DIRS
+from scipy.io.wavfile import write as write_wav
+
+import copy
+## ADDED
+import os
+import re
+import datetime
+import random
+
+import time
+from bark_infinity import generation
+
+from pathvalidate import sanitize_filename, sanitize_filepath
+
+from rich.pretty import pprint
+from rich.table import Table
+
+from collections import defaultdict
+from tqdm import tqdm
+
+from bark_infinity import text_processing
+
+global gradio_try_to_cancel
+global done_cancelling
+
+
+
+
+gradio_try_to_cancel = False
+done_cancelling = False
+
+def text_to_semantic(
+ text: str,
+ history_prompt: Optional[Union[Dict, str]] = None,
+ temp: float = 0.7,
+ silent: bool = False,
+):
+ """Generate semantic array from text.
+
+ Args:
+ text: text to be turned into audio
+ history_prompt: history choice for audio cloning
+ temp: generation temperature (1.0 more diverse, 0.0 more conservative)
+ silent: disable progress bar
+
+ Returns:
+ numpy semantic array to be fed into `semantic_to_waveform`
+ """
+
+
+ x_semantic = generate_text_semantic(
+ text,
+ history_prompt=history_prompt,
+ temp=temp,
+ silent=silent,
+ use_kv_caching=True
+ )
+
+ return x_semantic
+
+
+def semantic_to_waveform(
+ semantic_tokens: np.ndarray,
+ history_prompt: Optional[Union[Dict, str]] = None,
+ temp: float = 0.7,
+ silent: bool = False,
+ output_full: bool = False,
+):
+ """Generate audio array from semantic input.
+
+ Args:
+ semantic_tokens: semantic token output from `text_to_semantic`
+ history_prompt: history choice for audio cloning
+ temp: generation temperature (1.0 more diverse, 0.0 more conservative)
+ silent: disable progress bar
+ output_full: return full generation to be used as a history prompt
+
+ Returns:
+ numpy audio array at sample frequency 24khz
+ """
+
+ coarse_tokens = generate_coarse(
+ semantic_tokens,
+ history_prompt=history_prompt,
+ temp=temp,
+ silent=silent,
+ use_kv_caching=True
+ )
+ bark_coarse_tokens = coarse_tokens
+
+ fine_tokens = generate_fine(
+ coarse_tokens,
+ history_prompt=history_prompt,
+ temp=0.5,
+ )
+ bark_fine_tokens = fine_tokens
+
+ audio_arr = codec_decode(fine_tokens)
+ if output_full:
+ full_generation = {
+ "semantic_prompt": semantic_tokens,
+ "coarse_prompt": coarse_tokens,
+ "fine_prompt": fine_tokens,
+ }
+ return full_generation, audio_arr
+ return audio_arr
+
+
+def save_as_prompt(filepath, full_generation):
+ assert(filepath.endswith(".npz"))
+ assert(isinstance(full_generation, dict))
+ assert("semantic_prompt" in full_generation)
+ assert("coarse_prompt" in full_generation)
+ assert("fine_prompt" in full_generation)
+ np.savez(filepath, **full_generation)
+
+
+def generate_audio(
+ text: str,
+ history_prompt: Optional[Union[Dict, str]] = None,
+ text_temp: float = 0.7,
+ waveform_temp: float = 0.7,
+ silent: bool = False,
+ output_full: bool = False,
+):
+ """Generate audio array from input text.
+
+ Args:
+ text: text to be turned into audio
+ history_prompt: history choice for audio cloning
+ text_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
+ waveform_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
+ silent: disable progress bar
+ output_full: return full generation to be used as a history prompt
+
+ Returns:
+ numpy audio array at sample frequency 24khz
+ """
+ semantic_tokens = text_to_semantic(
+ text,
+ history_prompt=history_prompt,
+ temp=text_temp,
+ silent=silent,
+ )
+ out = semantic_to_waveform(
+ semantic_tokens,
+ history_prompt=history_prompt,
+ temp=waveform_temp,
+ silent=silent,
+ output_full=output_full,
+ )
+ if output_full:
+ full_generation, audio_arr = out
+ return full_generation, audio_arr
+ else:
+ audio_arr = out
+ return audio_arr
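+ # Minimal usage sketch (assumes this module is importable and that
+ # history_prompt=None picks a random voice; write_wav and SAMPLE_RATE are
+ # imported at the top of this file):
+ #
+ #   audio_arr = generate_audio("Hello from Bark.", history_prompt=None)
+ #   write_wav("hello.wav", SAMPLE_RATE, audio_arr)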
+
+## ADDED BELOW
+
+def process_history_prompt(user_history_prompt):
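+ # Resolve a speaker name to an .npz path: use an explicit path if one is given,
+ # otherwise search the valid history prompt directories; returns None if not found.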
+
+ valid_directories_to_check = VALID_HISTORY_PROMPT_DIRS
+
+ if user_history_prompt is None:
+ return None
+
+ file_name, file_extension = os.path.splitext(user_history_prompt)
+ if not file_extension:
+ file_extension = '.npz'
+
+ full_path = f"{file_name}{file_extension}"
+
+ if os.path.dirname(full_path): # Check if a directory is specified
+ if os.path.exists(full_path):
+ return full_path
+ else:
+ logger.error(f" >> Can't find speaker file at: {full_path}")
+ else:
+ for directory in valid_directories_to_check:
+ full_path_in_dir = os.path.join(directory, f"{file_name}{file_extension}")
+ if os.path.exists(full_path_in_dir):
+ return full_path_in_dir
+
+ logger.error(f" >>! Can't find speaker file: {full_path} in: {valid_directories_to_check}")
+
+ return None
+
+def log_params(log_filepath, **kwargs):
+
+
+ from rich.console import Console
+ file_console = Console(color_system=None)
+ with file_console.capture() as capture:
+ kwargs['history_prompt'] = kwargs.get('history_prompt_string',None)
+ kwargs['history_prompt_string'] = None
+
+ file_console.print(kwargs)
+ str_output = capture.get()
+
+
+ log_filepath = generate_unique_filepath(log_filepath)
+ with open(log_filepath, "wt") as log_file:
+ log_file.write(str_output)
+
+ return
+
+
+def determine_output_filename(special_one_off_path = None, **kwargs):
+ if special_one_off_path:
+ return sanitize_filepath(special_one_off_path)
+
+ # normally generate a filename
+ output_dir = kwargs.get('output_dir',None)
+ output_filename = kwargs.get('output_filename',None)
+
+
+ # TODO: Offer a config for long clips to show only the original starting prompt. I prefer seeing each clip separately named for easy reference myself.
+ text_prompt = kwargs.get('text_prompt',None) or kwargs.get('text',None) or ''
+ history_prompt = kwargs.get('history_prompt_string',None) or 'random'
+ text_prompt = text_prompt.strip()
+ history_prompt = os.path.basename(history_prompt).replace('.npz', '')
+
+ # There's a Lot of stuff that passes that sanitize check that we don't want in the filename
+ text_prompt = re.sub(r' ', '_', text_prompt) # spaces with underscores
+ # quotes, colons, and semicolons
+ text_prompt = re.sub(r'[^\w\s]|[:;\'"]', '', text_prompt)
+ text_prompt = re.sub(r'[\U00010000-\U0010ffff]', '',
+ text_prompt, flags=re.UNICODE) # Remove emojis
+ segment_number_text = None
+ hoarder_mode = kwargs.get('hoarder_mode', False)
+ if hoarder_mode:
+ segment_number = kwargs.get("segment_number")
+ if segment_number and kwargs.get("total_segments", 1) > 1:
+ segment_number_text = f"{str(segment_number).zfill(3)}_"
+
+ if output_filename:
+ base_output_filename = f"{output_filename}"
+ else:
+ # didn't seem to add value, ripped out
+ """
+ extra_stats = ''
+ extra_stats = kwargs.get('extra_stats', False)
+ if extra_stats:
+ token_probs_history = kwargs['token_probs_history']
+ if token_probs_history is not None:
+ token_probs_history_entropy = average_entropy(token_probs_history)
+ token_probs_history_perplexity = perplexity(token_probs_history)
+ token_probs_history_entropy_std = entropy_std(token_probs_history)
+ extra_stats = f"ent-{token_probs_history_entropy:.2f}_perp-{token_probs_history_perplexity:.2f}_entstd-{token_probs_history_entropy_std:.2f}"
+ """
+ date_str = datetime.datetime.now().strftime("%y-%m%d-%H%M-%S")
+
+ truncated_text = text_prompt[:15].strip()
+ base_output_filename = f"{truncated_text}-SPK-{history_prompt}"
+
+ if segment_number_text is not None:
+ base_output_filename = f"{segment_number_text}{base_output_filename}"
+
+
+ base_output_filename = f"{base_output_filename}.wav"
+
+ output_filepath = (
+ os.path.join(output_dir, base_output_filename))
+
+ os.makedirs(output_dir, exist_ok=True)
+
+ output_filepath = generate_unique_filepath(output_filepath)
+
+ return output_filepath
+
+
+def write_one_segment(audio_arr = None, full_generation = None, **kwargs):
+ filepath = determine_output_filename(**kwargs)
+ #print(f"Looks like filepath is {filepath} is okay?")
+ if full_generation is not None:
+ write_seg_npz(filepath, full_generation, **kwargs)
+ if audio_arr is not None and kwargs.get("segment_number", 1) != "base_history":
+ write_seg_wav(filepath, audio_arr, **kwargs)
+
+ hoarder_mode = kwargs.get('hoarder_mode', False)
+ dry_run = kwargs.get('dry_run', False)
+ if hoarder_mode and not dry_run:
+ log_params(f"{filepath}_info.txt",**kwargs)
+
+
+def generate_unique_dirpath(dirpath):
+ unique_dirpath = sanitize_filepath(dirpath)
+ base_name = os.path.basename(dirpath)
+ parent_dir = os.path.dirname(dirpath)
+ counter = 1
+ while os.path.exists(unique_dirpath):
+ unique_dirpath = os.path.join(parent_dir, f"{base_name}_{counter}")
+ counter += 1
+ return unique_dirpath
+
+def generate_unique_filepath(filepath):
+    unique_filename = sanitize_filepath(filepath)
+    name, ext = os.path.splitext(unique_filename)
+    counter = 1
+    while os.path.exists(unique_filename):
+        unique_filename = f"{name}_{counter}{ext}"
+        counter += 1
+    return unique_filename
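+
+# Illustrative sketch (not executed): if "out/clip.wav" and "out/clip_1.wav" already
+# exist on disk, generate_unique_filepath("out/clip.wav") returns "out/clip_2.wav".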
+
+def write_seg_npz(filepath, full_generation, **kwargs):
+
+ #logger.debug(kwargs)
+
+ if kwargs.get("segment_number", 1) == "base_history":
+ filepath = f"{filepath}_initial_prompt.npz"
+ dry_text = '(dry run)' if kwargs.get('dry_run', False) else ''
+
+ if not kwargs.get('dry_run', False) and kwargs.get('always_save_speaker', True):
+ filepath = generate_unique_filepath(filepath)
+ np.savez_compressed(filepath, semantic_prompt = full_generation["semantic_prompt"], coarse_prompt = full_generation["coarse_prompt"], fine_prompt = full_generation["fine_prompt"])
+
+
+ logger.info(f" .npz saved to {filepath} {dry_text}")
+
+def write_seg_wav(filepath, audio_arr, **kwargs):
+ dry_run = kwargs.get('dry_run', False)
+ dry_text = '(dry run)' if dry_run else ''
+ if dry_run is not True:
+ filepath = generate_unique_filepath(filepath)
+ write_audiofile(filepath, audio_arr)
+
+ logger.info(f" .wav saved to {filepath} {dry_text}")
+
+
+
+def write_audiofile(output_filepath, audio_arr):
+ output_filepath = generate_unique_filepath(output_filepath)
+ write_wav(output_filepath, SAMPLE_RATE, audio_arr)
+
+ #sample_rate = 24000
+ #soundfile.write(output_filepath, audio_arr, sample_rate,format='WAV', subtype='PCM_16')
+ # print(f"[green] ")
+
+
+
+def call_with_non_none_params(func, **kwargs):
+ non_none_params = {key: value for key, value in kwargs.items() if value is not None}
+ return func(**non_none_params)
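+
+# Illustrative sketch (not executed): keys whose value is None are dropped before the call,
+# so the callee falls back to its own defaults. The variable coarse_tokens below is just a
+# placeholder for whatever generate_coarse returned.
+#
+#   fine_tokens = call_with_non_none_params(
+#       generate_fine,
+#       x_coarse_gen=coarse_tokens,
+#       temp=None,      # dropped, so generate_fine uses its own default temperature
+#       silent=True,
+#   )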
+
+
+def generate_audio_barki(
+ text: str,
+ **kwargs,
+):
+ """Generate audio array from input text.
+
+ Args:
+ text: text to be turned into audio
+ history_prompt: history choice for audio cloning
+ text_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
+ waveform_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
+ silent: disable progress bar
+ output_full: return full generation to be used as a history prompt
+
+    Returns:
+        numpy audio array at sample frequency 24 kHz
+ """
+ logger.debug(locals())
+ kwargs = load_all_defaults(**kwargs)
+
+ history_prompt = kwargs.get("history_prompt", None)
+ text_temp = kwargs.get("text_temp", None)
+ waveform_temp = kwargs.get("waveform_temp", None)
+ silent = kwargs.get("silent", None)
+ output_full = kwargs.get("output_full", None)
+
+ global gradio_try_to_cancel
+ global done_cancelling
+
+ seed = kwargs.get("seed",None)
+ if seed is not None:
+ generation.set_seed(seed)
+
+
+ ## Semantic Options
+ semantic_temp = text_temp
+ if kwargs.get("semantic_temp", None):
+ semantic_temp = kwargs.get("semantic_temp")
+
+ semantic_seed = kwargs.get("semantic_seed",None)
+ if semantic_seed is not None:
+ generation.set_seed(semantic_seed)
+
+ if gradio_try_to_cancel:
+ done_cancelling = True
+ return None, None
+
+
+    # this has to be bugged? But when I logged generate_text_semantic inputs they were exactly the same as raw generate_audio...
+
+    # I must be messing up some values somewhere
+
+ semantic_tokens = call_with_non_none_params(
+ generate_text_semantic,
+ text=text,
+ history_prompt=history_prompt,
+ temp=semantic_temp,
+ top_k=kwargs.get("semantic_top_k", None),
+ top_p=kwargs.get("semantic_top_p", None),
+ silent=silent,
+ min_eos_p = kwargs.get("semantic_min_eos_p", None),
+ max_gen_duration_s = kwargs.get("semantic_max_gen_duration_s", None),
+ allow_early_stop = kwargs.get("semantic_allow_early_stop", True),
+ use_kv_caching=kwargs.get("semantic_use_kv_caching", True),
+ )
+
+
+ if gradio_try_to_cancel:
+ done_cancelling = True
+ return None, None
+
+ ## Coarse Options
+ coarse_temp = waveform_temp
+ if kwargs.get("coarse_temp", None):
+ coarse_temp = kwargs.get("coarse_temp")
+
+ coarse_seed = kwargs.get("coarse_seed",None)
+ if coarse_seed is not None:
+ generation.set_seed(coarse_seed)
+
+
+ if gradio_try_to_cancel:
+ done_cancelling = True
+ return None, None
+
+ coarse_tokens = call_with_non_none_params(
+ generate_coarse,
+ x_semantic=semantic_tokens,
+ history_prompt=history_prompt,
+ temp=coarse_temp,
+ top_k=kwargs.get("coarse_top_k", None),
+ top_p=kwargs.get("coarse_top_p", None),
+ silent=silent,
+ max_coarse_history=kwargs.get("coarse_max_coarse_history", None),
+ sliding_window_len=kwargs.get("coarse_sliding_window_len", None),
+ use_kv_caching=kwargs.get("coarse_kv_caching", True),
+ )
+
+ fine_temp = kwargs.get("fine_temp", 0.5)
+
+ fine_seed = kwargs.get("fine_seed",None)
+ if fine_seed is not None:
+ generation.set_seed(fine_seed)
+
+ if gradio_try_to_cancel:
+ done_cancelling = True
+ return None, None
+ fine_tokens = call_with_non_none_params(
+ generate_fine,
+ x_coarse_gen=coarse_tokens,
+ history_prompt=history_prompt,
+ temp=fine_temp,
+ silent=silent,
+ )
+
+ if gradio_try_to_cancel:
+ done_cancelling = True
+ return None, None
+ audio_arr = codec_decode(fine_tokens)
+ full_generation = {
+ "semantic_prompt": semantic_tokens,
+ "coarse_prompt": coarse_tokens,
+ "fine_prompt": fine_tokens,
+ }
+
+ if gradio_try_to_cancel:
+ done_cancelling = True
+ return None, None
+
+ hoarder_mode = kwargs.get("hoarder_mode", None)
+ total_segments = kwargs.get("total_segments", 1)
+ if hoarder_mode and (total_segments > 1):
+ kwargs["text"] = text
+ write_one_segment(audio_arr, full_generation, **kwargs)
+
+ if output_full:
+ return full_generation, audio_arr
+
+ return audio_arr
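+
+# Illustrative usage sketch (not executed). It assumes the models are already loaded and
+# relies on the defaults filled in by load_all_defaults; with no history_prompt given,
+# Bark picks a random speaker.
+#
+#   full_generation, audio_arr = generate_audio_barki(
+#       "Hello from Bark Infinity.",
+#       text_temp=0.7,
+#       waveform_temp=0.7,
+#       output_full=True,
+#   )
+#   write_audiofile("hello.wav", audio_arr)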
+
+
+
+
+def generate_audio_long_from_gradio(**kwargs):
+
+    full_generation_segments, audio_arr_segments, final_filename_will_be = generate_audio_long(**kwargs)
+
+    return full_generation_segments, audio_arr_segments, final_filename_will_be
+
+
+def generate_audio_long(
+ **kwargs,
+):
+
+ global gradio_try_to_cancel
+ global done_cancelling
+
+ kwargs = load_all_defaults(**kwargs)
+ logger.debug(locals())
+
+
+    history_prompt = kwargs.get("history_prompt", None)
+    kwargs["history_prompt"] = None
+
+ silent = kwargs.get("silent", None)
+
+ full_generation_segments = []
+ audio_arr_segments = []
+
+
+
+ stable_mode_interval = kwargs.get('stable_mode_interval', None)
+ if stable_mode_interval is None:
+ stable_mode_interval = 1
+
+ if stable_mode_interval < 0:
+ stable_mode_interval = 0
+
+ stable_mode_interval_counter = None
+
+ if stable_mode_interval >= 2:
+ stable_mode_interval_counter = stable_mode_interval
+
+ dry_run = kwargs.get('dry_run', False)
+
+ text_splits_only = kwargs.get('text_splits_only', False)
+
+ if text_splits_only:
+ dry_run = True
+
+
+
+
+ # yanked for now, required too many mods to core Bark code
+ extra_confused_travolta_mode = kwargs.get('extra_confused_travolta_mode', None)
+
+ hoarder_mode = kwargs.get('hoarder_mode', None)
+
+ single_starting_seed = kwargs.get("single_starting_seed",None)
+ if single_starting_seed is not None:
+ kwargs["seed_return_value"] = generation.set_seed(single_starting_seed)
+
+ # the old way of doing this
+ split_each_text_prompt_by = kwargs.get("split_each_text_prompt_by",None)
+ split_each_text_prompt_by_value = kwargs.get("split_each_text_prompt_by_value",None)
+
+ if split_each_text_prompt_by is not None and split_each_text_prompt_by_value is not None:
+ audio_segments = chunk_up_text_prev(**kwargs)
+ else:
+ audio_segments = chunk_up_text(**kwargs)
+
+ if text_splits_only:
+ print("Nothing was generated, this is just text the splits!")
+ return None, None, None
+
+ history_prompt_for_next_segment = None
+ base_history = None
+ if history_prompt is not None:
+ history_prompt_string = history_prompt
+ history_prompt = process_history_prompt(history_prompt)
+ if history_prompt is not None:
+ base_history = np.load(history_prompt)
+ base_history = {key: base_history[key] for key in base_history.keys()}
+ kwargs['history_prompt_string'] = history_prompt_string
+ history_prompt_for_next_segment = copy.deepcopy(base_history) # just start from a dict for consistency
+ else:
+ logger.error(f"Speaker {history_prompt} could not be found, looking in{VALID_HISTORY_PROMPT_DIRS}")
+
+ gradio_try_to_cancel = False
+ done_cancelling = True
+
+ return None, None, None
+
+    # way too many files; for hoarder_mode every sample goes in its own dir
+ if hoarder_mode and len(audio_segments) > 1:
+ output_dir = kwargs.get('output_dir', "bark_samples")
+ output_filename_will_be = determine_output_filename(**kwargs)
+ file_name, file_extension = os.path.splitext(output_filename_will_be)
+ output_dir_sub = os.path.basename(file_name)
+ output_dir = os.path.join(output_dir, output_dir_sub)
+ output_dir = generate_unique_dirpath(output_dir)
+ kwargs['output_dir'] = output_dir
+
+
+ if hoarder_mode and kwargs.get("history_prompt_string", False):
+ kwargs['segment_number'] = "base_history"
+ write_one_segment(audio_arr = None, full_generation = base_history, **kwargs)
+
+ full_generation, audio_arr = (None, None)
+
+ kwargs["output_full"] = True
+ kwargs["total_segments"] = len(audio_segments)
+
+
+
+
+ for i, segment_text in enumerate(audio_segments):
+ estimated_time = estimate_spoken_time(segment_text)
+ print(f"segment_text: {segment_text}")
+ kwargs["text_prompt"] = segment_text
+ timeest = f"{estimated_time:.2f}"
+ if estimated_time > 14 or estimated_time < 3:
+ timeest = f"[bold red]{estimated_time:.2f}[/bold red]"
+
+ current_iteration = str(
+ kwargs['current_iteration']) if 'current_iteration' in kwargs else ''
+
+ output_iterations = kwargs.get('output_iterations', '')
+ iteration_text = ''
+ if len(audio_segments) == 1:
+ iteration_text = f"{current_iteration} of {output_iterations} iterations"
+
+ segment_number = i + 1
+ console.print(f"--Segment {segment_number}/{len(audio_segments)}: est. {timeest}s ({iteration_text})")
+ #tqdm.write(f"--Segment {segment_number}/{len(audio_segments)}: est. {timeest}s")
+ #tqdm.set_postfix_str(f"--Segment {segment_number}/{len(audio_segments)}: est. {timeest}s")
+
+
+
+ if not silent:
+ print(f"{segment_text}")
+ kwargs['segment_number'] = segment_number
+
+ if dry_run is True:
+ full_generation, audio_arr = [], []
+ else:
+
+ kwargs['history_prompt'] = history_prompt_for_next_segment
+
+
+ if gradio_try_to_cancel:
+ done_cancelling = True
+ print("<<<>>>")
+ return None, None, None
+
+ full_generation, audio_arr = generate_audio_barki(text=segment_text, **kwargs)
+
+ # if we weren't given a history prompt, save first segment instead
+
+
+
+ if gradio_try_to_cancel or full_generation is None or audio_arr is None:
+ # Hmn, cancelling and restarting seems to be a bit buggy
+ # let's try clearing out stuff
+ kwargs = {}
+ history_prompt_for_next_segment = None
+ base_history = None
+ full_generation = None
+ done_cancelling = True
+ print("<<<>>>")
+ return None, None, None
+
+ # we shouldn't need deepcopy but i'm just throwing darts at the bug
+ if base_history is None:
+ #print(f"Saving base history for {segment_text}")
+ base_history = copy.deepcopy(full_generation)
+
+ logger.debug(f"stable_mode_interval: {stable_mode_interval_counter} of {stable_mode_interval}")
+
+
+
+
+
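+            # stable_mode_interval semantics:
+            #   0   -> chain each segment off the previous segment's full_generation
+            #   1   -> always reset to the original base history (most stable)
+            #   >=2 -> chain off the previous segment, but reset to base every N segments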
+ if stable_mode_interval == 0:
+ history_prompt_for_next_segment = copy.deepcopy(full_generation)
+
+
+ elif stable_mode_interval == 1:
+
+ history_prompt_for_next_segment = copy.deepcopy(base_history)
+
+ elif stable_mode_interval >= 2:
+ if stable_mode_interval_counter == 1:
+ # reset to base history
+ stable_mode_interval_counter = stable_mode_interval
+ history_prompt_for_next_segment = copy.deepcopy(base_history)
+ logger.info(f"resetting to base history_prompt, again in {stable_mode_interval} chunks")
+ else:
+ stable_mode_interval_counter -= 1
+ history_prompt_for_next_segment = copy.deepcopy(full_generation)
+ else:
+ logger.error(f"stable_mode_interval is {stable_mode_interval} and something has gone wrong.")
+
+ return None, None, None
+
+
+
+ full_generation_segments.append(full_generation)
+ audio_arr_segments.append(audio_arr)
+
+ add_silence_between_segments = kwargs.get("add_silence_between_segments", 0.0)
+ if add_silence_between_segments > 0.0:
+ silence = np.zeros(int(add_silence_between_segments * SAMPLE_RATE))
+ audio_arr_segments.append(silence)
+
+
+ if gradio_try_to_cancel:
+ done_cancelling = True
+ print("< Cancelled >")
+ return None, None, None
+
+ kwargs['segment_number'] = "final"
+ final_filename_will_be = determine_output_filename(**kwargs)
+ dry_run = kwargs.get('dry_run', None)
+ if not dry_run:
+ write_one_segment(audio_arr = np.concatenate(audio_arr_segments), full_generation = full_generation_segments[0], **kwargs)
+ print(f"Saved to {final_filename_will_be}")
+
+ return full_generation_segments, audio_arr_segments, final_filename_will_be
+
+
+def play_superpack_track(superpack_filepath = None, one_random=True):
+
+ try:
+ npz_file = np.load(superpack_filepath)
+
+ keys = list(npz_file.keys())
+ random_key = random.choice(keys)
+ random_prompt = npz_file[random_key].item()
+ coarse_tokens = random_prompt["coarse_prompt"]
+ fine_tokens = generate_fine(coarse_tokens)
+ audio_arr = codec_decode(fine_tokens)
+
+ return audio_arr
+    except Exception:
+        return None
+
+
+
+def doctor_random_speaker_surgery(npz_filepath, gen_minor_variants=5):
+
+
+ # get directory and filename from npz_filepath
+ npz_file_directory, npz_filename = os.path.split(npz_filepath)
+
+ original_history_prompt = np.load(npz_filepath)
+ semantic_prompt = original_history_prompt["semantic_prompt"]
+ original_semantic_prompt = copy.deepcopy(semantic_prompt)
+
+ starting_point = 128
+ starting_point = 64
+ ending_point = len(original_semantic_prompt) - starting_point
+
+ points = np.linspace(starting_point, ending_point, gen_minor_variants)
+
+ i = 0
+ for starting_point in points:
+ starting_point = int(starting_point)
+ i += 1
+        #chop off the front and take the back, chop off the back and take the front
+        #is it worth doing something with the middle? nah, it's worth doing something more sophisticated later
+
+ new_semantic_from_beginning = copy.deepcopy(original_semantic_prompt[:starting_point].astype(np.int32))
+ new_semantic_from_ending = copy.deepcopy(original_semantic_prompt[starting_point:].astype(np.int32))
+
+
+ ## TODO: port over the good magic from experiments
+
+ for semantic_prompt in [new_semantic_from_beginning, new_semantic_from_ending]:
+
+ print(f"len(semantic_prompt): {len(semantic_prompt)}")
+ print(f"starting_point: {starting_point}, ending_poinst: {ending_point}")
+
+
+ # FAST TALKING SURGERY IS A SUCCESS HOW IN THE HECK DOES THIS
+ # STUPID IDEA JUST ACTUALLY WORK!?!??!?!
+ """
+ print(f"length bfore {len(semantic_prompt)}")
+ X = 2
+ total_elements = len(semantic_prompt)
+ indices = np.arange(0, total_elements, X)
+ semantic_prompt = semantic_prompt[indices]
+ print(f"length after {len(semantic_prompt)}")
+ """
+            # END FAST TALKER SURGERY
+
+ # SLOW TALKING SURGERY?
+ print(f"length before {len(semantic_prompt)}")
+
+ X = 2
+ total_elements = len(semantic_prompt)
+ duplicated_elements = []
+
+            # use a separate loop variable so we don't clobber the outer variant counter i
+            for token_index, element in enumerate(semantic_prompt):
+                duplicated_elements.append(element)
+                if (token_index + 1) % X == 0:
+                    duplicated_elements.append(element)
+
+ duplicated_semantic_prompt = np.array(duplicated_elements)
+
+ semantic_prompt = duplicated_semantic_prompt
+ print(f"length after slow surgery {len(semantic_prompt)}")
+
+
+ temp_coarse = random.uniform(0.50, 0.90)
+ top_k_coarse = None if random.random() < 1/3 else random.randint(50, 150)
+ top_p_coarse = None if random.random() < 1/3 else random.uniform(0.90, 0.97)
+
+ max_coarse_history_options = [630, random.randint(500, 630), random.randint(60, 500)]
+ max_coarse_history = random.choice(max_coarse_history_options)
+
+ coarse_tokens = generation.generate_coarse(semantic_prompt, temp=temp_coarse, top_k=top_k_coarse, top_p=top_p_coarse, max_coarse_history=max_coarse_history)
+
+ temp_fine = random.uniform(0.4, 0.6)
+ fine_tokens = generation.generate_fine(coarse_tokens, temp=temp_fine)
+
+ history_prompt_render_variant = {"semantic_prompt": semantic_prompt, "coarse_prompt": coarse_tokens, "fine_prompt": fine_tokens}
+
+ try:
+ audio_arr = generation.codec_decode(fine_tokens)
+ base_output_filename = os.path.splitext(npz_filename)[0] + f"_var_{i}.wav"
+ output_filepath = os.path.join(npz_file_directory, base_output_filename)
+ output_filepath = generate_unique_filepath(output_filepath)
+ print(f"output_filepath {output_filepath}")
+ print(f" Rendering minor variant voice audio for {npz_filepath} to {output_filepath}")
+ write_seg_wav(output_filepath, audio_arr)
+
+ write_seg_npz(output_filepath, history_prompt_render_variant)
+            except Exception as e:
+                print(f"  Error rendering variant audio: {e}")
+
+
+
+def render_npz_samples(npz_directory="bark_infinity/assets/prompts/", start_from=None, double_up_history=False, save_npz=False, compression_mode=False, gen_minor_variants=None):
+ # Find all the .npz files
+    # interesting results when you double up and use the tokens in both the history and the current model input
+
+ print(f"Rendering samples for speakers in: {npz_directory}")
+ npz_files = [f for f in os.listdir(npz_directory) if f.endswith(".npz")]
+
+
+ if start_from is None:
+ start_from = "fine_prompt"
+ compress_mode_data = []
+
+ for npz_file in npz_files:
+ npz_filepath = os.path.join(npz_directory, npz_file)
+
+ history_prompt = np.load(npz_filepath)
+
+ semantic_tokens = history_prompt["semantic_prompt"]
+ coarse_tokens = history_prompt["coarse_prompt"]
+ fine_tokens = history_prompt["fine_prompt"]
+
+ if gen_minor_variants is None:
+
+ if start_from == "pure_semantic":
+ # this required my mod generate_text_semantic, need to pretend it's two prompts
+ semantic_tokens = generate_text_semantic(text=None, history_prompt = history_prompt)
+ coarse_tokens = generate_coarse(semantic_tokens)
+ fine_tokens = generate_fine(coarse_tokens)
+
+ elif start_from == "semantic_prompt":
+ coarse_tokens = generate_coarse(semantic_tokens)
+ fine_tokens = generate_fine(coarse_tokens)
+
+ elif start_from == "coarse_prompt":
+ fine_tokens = generate_fine(coarse_tokens)
+
+ elif start_from == "fine_prompt":
+ # just decode existing fine tokens
+ pass
+
+ history_prompt_render_variant = {"semantic_prompt": semantic_tokens, "coarse_prompt": coarse_tokens, "fine_prompt": fine_tokens}
+
+
+ elif gen_minor_variants > 0: # gen_minor_variants quick and simple
+ print(f"Generating {gen_minor_variants} minor variants for {npz_file}")
+ gen_minor_variants = gen_minor_variants or 1
+ for i in range(gen_minor_variants):
+ temp_coarse = random.uniform(0.5, 0.9)
+ top_k_coarse = None if random.random() < 1/3 else random.randint(50, 100)
+ top_p_coarse = None if random.random() < 1/3 else random.uniform(0.8, 0.95)
+
+ max_coarse_history_options = [630, random.randint(500, 630), random.randint(60, 500)]
+ max_coarse_history = random.choice(max_coarse_history_options)
+
+ coarse_tokens = generate_coarse(semantic_tokens, temp=temp_coarse, top_k=top_k_coarse, top_p=top_p_coarse, max_coarse_history=max_coarse_history)
+
+ temp_fine = random.uniform(0.3, 0.7)
+ fine_tokens = generate_fine(coarse_tokens, temp=temp_fine)
+
+ history_prompt_render_variant = {"semantic_prompt": semantic_tokens, "coarse_prompt": coarse_tokens, "fine_prompt": fine_tokens}
+
+ try:
+ audio_arr = codec_decode(fine_tokens)
+ base_output_filename = os.path.splitext(npz_file)[0] + f"_var_{i}.wav"
+ output_filepath = os.path.join(npz_directory, base_output_filename)
+ output_filepath = generate_unique_filepath(output_filepath)
+ print(f" Rendering minor variant voice audio for {npz_filepath} to {output_filepath}")
+ write_seg_wav(output_filepath, audio_arr)
+
+ write_seg_npz(output_filepath, history_prompt_render_variant)
+                except Exception as e:
+                    print(f"  Error rendering minor variant for {npz_file}: {e}")
+
+
+ if not compression_mode:
+ try:
+ audio_arr = codec_decode(fine_tokens)
+ base_output_filename = os.path.splitext(npz_file)[0] + ".wav"
+ output_filepath = os.path.join(npz_directory, base_output_filename)
+ output_filepath = generate_unique_filepath(output_filepath)
+ print(f" Rendering audio for {npz_filepath} to {output_filepath}")
+ write_seg_wav(output_filepath, audio_arr)
+ if save_npz:
+ write_seg_npz(output_filepath, history_prompt_render_variant)
+            except Exception as e:
+                print(f"  Error rendering audio for {npz_file}: {e}")
+ elif compression_mode:
+ just_record_it = {"semantic_prompt": None, "coarse_prompt": coarse_tokens, "fine_prompt": None}
+ compress_mode_data.append(just_record_it)
+ #compress_mode_data.append(history_prompt_render_variant)
+
+ if compression_mode:
+ print(f"have {len(compress_mode_data)} samples")
+ output_filepath = os.path.join(npz_directory, "superpack.npz")
+ output_filepath = generate_unique_filepath(output_filepath)
+ with open(f"{output_filepath}", 'wb') as f:
+ np.savez_compressed(f, **{f"dict_{i}": np.array([d]) for i, d in enumerate(compress_mode_data)})
+
+
+
+
+
+def resize_semantic_history(semantic_history, weight, max_len=256):
+
+ new_len = int(max_len * weight)
+
+ semantic_history = semantic_history.astype(np.int64)
+ # Trim
+ if len(semantic_history) > new_len:
+ semantic_history = semantic_history[-new_len:]
+ # Pad
+ else:
+ semantic_history = np.pad(
+ semantic_history,
+ (0, new_len - len(semantic_history)),
+ constant_values=SEMANTIC_PAD_TOKEN,
+ mode="constant",
+ )
+
+ return semantic_history
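+
+# Illustrative sketch (not executed): with max_len=256 and weight=0.5, new_len is 128,
+# so a 300-token history keeps only its last 128 tokens, while a 100-token history is
+# right-padded with SEMANTIC_PAD_TOKEN up to 128 tokens.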
+
+
+
+def estimate_spoken_time(text, wpm=150, threshold=15):
+ text_without_brackets = re.sub(r'\[.*?\]', '', text)
+
+ words = text_without_brackets.split()
+ word_count = len(words)
+ time_in_seconds = (word_count / wpm) * 60
+ return time_in_seconds
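+
+# Illustrative sketch (not executed): at the default 150 wpm, a 30-word chunk
+# (with bracketed tags like [laughs] stripped first) estimates to (30 / 150) * 60 = 12 s.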
+
+
+
+def chunk_up_text(**kwargs):
+
+ text_prompt = kwargs['text_prompt']
+ split_character_goal_length = kwargs['split_character_goal_length']
+ split_character_max_length = kwargs['split_character_max_length']
+ silent = kwargs.get('silent')
+
+
+ split_character_jitter = kwargs.get('split_character_jitter') or 0
+
+ if split_character_jitter > 0:
+ split_character_goal_length = random.randint(split_character_goal_length - split_character_jitter, split_character_goal_length + split_character_jitter)
+ split_character_max_length = random.randint(split_character_max_length - split_character_jitter, split_character_max_length + split_character_jitter)
+
+
+
+
+ audio_segments = text_processing.split_general_purpose(text_prompt, split_character_goal_length=split_character_goal_length, split_character_max_length=split_character_max_length)
+
+
+ split_desc = f"Splitting long text aiming for {split_character_goal_length} chars max {split_character_max_length}"
+
+    if len(audio_segments) > 0 and not silent:
+        print_chunks_table(audio_segments, left_column_header="Words",
+                           right_column_header=split_desc, **kwargs)
+ return audio_segments
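+
+# Illustrative sketch (not executed): the three required kwargs are shown below; the numbers
+# are arbitrary example values (not the project defaults) and long_text is a placeholder.
+#
+#   segments = chunk_up_text(
+#       text_prompt=long_text,
+#       split_character_goal_length=165,
+#       split_character_max_length=205,
+#       silent=True,
+#   )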
+
+
+
+def chunk_up_text_prev(**kwargs):
+
+ text_prompt = kwargs['text_prompt']
+ split_by = kwargs['split_each_text_prompt_by']
+ split_by_value = kwargs['split_each_text_prompt_by_value']
+ split_by_value_type = kwargs['split_each_text_prompt_by_value_type']
+ silent = kwargs.get('silent')
+
+ audio_segments = text_processing.split_text(text_prompt, split_by, split_by_value, split_by_value_type)
+
+ if split_by == 'phrase':
+ split_desc = f"Splitting long text by *{split_by}* (min_duration=8, max_duration=18, words_per_second=2.3)"
+ else:
+ split_desc = f"Splitting long text by '{split_by}' in groups of {split_by_value}"
+
+    if len(audio_segments) > 0 and not silent:
+        print_chunks_table(audio_segments, left_column_header="Words",
+                           right_column_header=split_desc, **kwargs)
+ return audio_segments
+
+
+
+def print_chunks_table(chunks: list, left_column_header: str = "Words", right_column_header: str = "Segment Text", **kwargs):
+
+ output_iterations = kwargs.get('output_iterations', '')
+
+ current_iteration = str(
+ kwargs['current_iteration']) if 'current_iteration' in kwargs else ''
+
+ iteration_text = ''
+ if output_iterations and current_iteration:
+
+ iteration_text = f"{current_iteration} of {output_iterations} iterations"
+
+ table = Table(
+ title=f" ({iteration_text}) Segment Breakdown", show_lines=True, title_justify = "left")
+ table.add_column('#', justify="right", style="magenta", no_wrap=True)
+ table.add_column(left_column_header, style="green")
+ table.add_column("Time Est", style="green")
+ table.add_column(right_column_header)
+    for i, chunk in enumerate(chunks, start=1):
+        time_est = estimate_spoken_time(chunk)
+        timeest = f"{time_est:.2f} s"
+        if time_est > 14:
+            timeest = f"!{timeest}!"
+        wordcount = str(len(chunk.split()))
+        charcount = str(len(chunk))
+        table.add_row(str(i), wordcount, f"{timeest}\n{charcount} chars", chunk)
+    console.print(table)
+
+
+
+
+LANG_CODE_DICT = {code: lang for lang, code in generation.SUPPORTED_LANGS}
+
+
+def gather_speakers(directory):
+ speakers = defaultdict(list)
+ unsupported_files = []
+
+ for root, dirs, files in os.walk(directory):
+ for filename in files:
+ if filename.endswith('.npz'):
+ match = re.match(r"^([a-z]{2})_.*", filename)
+ if match and match.group(1) in LANG_CODE_DICT:
+ speakers[match.group(1)].append(os.path.join(root, filename))
+ else:
+ unsupported_files.append(os.path.join(root, filename))
+
+ return speakers, unsupported_files
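+
+# Illustrative sketch (not executed): "en_fiery.npz" matches the ^([a-z]{2})_ pattern and is
+# grouped under "en" (English), while "talkradio.npz" has no language prefix and ends up in
+# unsupported_files, printed under "Other:".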
+
+def list_speakers():
+ all_speakers = defaultdict(list)
+ all_unsupported_files = []
+
+ for directory in VALID_HISTORY_PROMPT_DIRS:
+ speakers, unsupported_files = gather_speakers(directory)
+ all_speakers.update(speakers)
+ all_unsupported_files.extend(unsupported_files)
+
+ print_speakers(all_speakers, all_unsupported_files)
+
+ return all_speakers, all_unsupported_files
+
+
+def print_speakers(speakers, unsupported_files):
+ # Print speakers grouped by language code
+ for lang_code, files in speakers.items():
+ print(LANG_CODE_DICT[lang_code] + ":")
+ for file in files:
+ print(" " + file)
+
+ # Print unsupported files
+ print("Other:")
+ for file in unsupported_files:
+ print(" " + file)
+
+
+
+
diff --git a/bark_infinity/assets/prompts/add_name_and_desc_to_speaker.py b/bark_infinity/assets/prompts/add_name_and_desc_to_speaker.py
new file mode 100644
index 0000000000000000000000000000000000000000..a585ca2cea60f8e0f66214dc77292047ee9f94ac
--- /dev/null
+++ b/bark_infinity/assets/prompts/add_name_and_desc_to_speaker.py
@@ -0,0 +1,33 @@
+import argparse
+import numpy as np
+import os
+
+def load_npz_file(filepath):
+ with np.load(filepath) as data:
+ return dict(data)
+
+def save_npz_file(filepath, data):
+ np.savez(filepath, **data)
+
+def update_metadata(filepath, metadata):
+ data = load_npz_file(filepath)
+
+ for key, value in metadata.items():
+ data[key] = value
+
+ save_npz_file(filepath, data)
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="Add or update metadata in a .npz file. Shows up in --list_speakers output I found myself just using filenames for simplicity.")
+ parser.add_argument("filepath", help="Path to the .npz file.")
+ parser.add_argument("--name", type=str, help="Short Name of the speaker file.")
+ parser.add_argument("--desc", type=str, help="Longer Descriptio of the .npz file.")
+ args = parser.parse_args()
+
+ metadata = {}
+ if args.name:
+ metadata["name"] = args.name
+ if args.desc:
+ metadata["desc"] = args.desc
+
+ update_metadata(args.filepath, metadata)
diff --git a/bark_infinity/assets/prompts/cartoon_extreme.npz b/bark_infinity/assets/prompts/cartoon_extreme.npz
new file mode 100644
index 0000000000000000000000000000000000000000..19893328f8a4282bd1501e8a67c4741afbefb9fb
--- /dev/null
+++ b/bark_infinity/assets/prompts/cartoon_extreme.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:089a455b4d7fbad572c9755271dd78ebb4c8327890cc44f0589edbb06d0be760
+size 16580
diff --git a/bark_infinity/assets/prompts/classic_robot_tts.npz b/bark_infinity/assets/prompts/classic_robot_tts.npz
new file mode 100644
index 0000000000000000000000000000000000000000..9302b1a41f3d20da99f25c41967dbb5cfdcd645c
--- /dev/null
+++ b/bark_infinity/assets/prompts/classic_robot_tts.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:50f2bc9ac0b739e7a6ad33096eadc99af66621de2c9f7d98e5a010000cd45203
+size 37380
diff --git a/bark_infinity/assets/prompts/cool_duo.npz b/bark_infinity/assets/prompts/cool_duo.npz
new file mode 100644
index 0000000000000000000000000000000000000000..d960f5a8edd0d89e9689ff39172921a3873abe44
--- /dev/null
+++ b/bark_infinity/assets/prompts/cool_duo.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9b441f303c4d660616b14a05d1cacd8ee267642f0d8b4adc656885be6e8bd53f
+size 62332
diff --git a/bark_infinity/assets/prompts/en_british.npz b/bark_infinity/assets/prompts/en_british.npz
new file mode 100644
index 0000000000000000000000000000000000000000..39568010dafe87d69d66ca5cc92f858b98c8f440
--- /dev/null
+++ b/bark_infinity/assets/prompts/en_british.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c677c135a0f95a0015af31dd3dfb8212b85757338beb230f2ecf8ccb634cc0c8
+size 56572
diff --git a/bark_infinity/assets/prompts/en_deadpan.npz b/bark_infinity/assets/prompts/en_deadpan.npz
new file mode 100644
index 0000000000000000000000000000000000000000..1fc37b0a89265b082aabcf8c6a27affe662fd791
--- /dev/null
+++ b/bark_infinity/assets/prompts/en_deadpan.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:445bbc5f5458f6bc6a8e1980a11ce63b89a9d2c77bbd82c70f61425c022129f3
+size 45908
diff --git a/bark_infinity/assets/prompts/en_female_intense.npz b/bark_infinity/assets/prompts/en_female_intense.npz
new file mode 100644
index 0000000000000000000000000000000000000000..336e1b7daa738085147b21acd44a21645ef880a5
--- /dev/null
+++ b/bark_infinity/assets/prompts/en_female_intense.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7afedd06278c7a3cf816dd99c2959827edaa6b88ec67f91c71781cc4dab8f8d7
+size 54492
diff --git a/bark_infinity/assets/prompts/en_female_performing_play_awesome_but_noisy.npz b/bark_infinity/assets/prompts/en_female_performing_play_awesome_but_noisy.npz
new file mode 100644
index 0000000000000000000000000000000000000000..664761600eecef8757c8f9f16375a19f2377b819
--- /dev/null
+++ b/bark_infinity/assets/prompts/en_female_performing_play_awesome_but_noisy.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d7c814fd52cf7f3dc2da05d116fef3ad45bd78174f2abfe9031e100af882e98d
+size 56732
diff --git a/bark_infinity/assets/prompts/en_female_professional_reader.npz b/bark_infinity/assets/prompts/en_female_professional_reader.npz
new file mode 100644
index 0000000000000000000000000000000000000000..9d63f706fd0dbaad235d159bdcb1fca5a0ef6c87
--- /dev/null
+++ b/bark_infinity/assets/prompts/en_female_professional_reader.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fe4388a8ab6bd44035f6f922a7d59148de08e35183104274322b23523bf4bc03
+size 52044
diff --git a/bark_infinity/assets/prompts/en_female_slow_talker.npz b/bark_infinity/assets/prompts/en_female_slow_talker.npz
new file mode 100644
index 0000000000000000000000000000000000000000..b325330639137d7019be51889b2acb8bf6ffd358
--- /dev/null
+++ b/bark_infinity/assets/prompts/en_female_slow_talker.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9a44fba632e8b6c33e1b31cc4b0d1408e9313338ce5c59069bf1977646c2308b
+size 57908
diff --git a/bark_infinity/assets/prompts/en_female_storyteller.npz b/bark_infinity/assets/prompts/en_female_storyteller.npz
new file mode 100644
index 0000000000000000000000000000000000000000..87dc0a13fa15b8206aa6d5473c3dfe59267365ce
--- /dev/null
+++ b/bark_infinity/assets/prompts/en_female_storyteller.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3a2f4e8cbcd6ebb5d508231a9735b0e871ecc72e6e971c4f7231c93e315c2c5
+size 56628
diff --git a/bark_infinity/assets/prompts/en_fiery.npz b/bark_infinity/assets/prompts/en_fiery.npz
new file mode 100644
index 0000000000000000000000000000000000000000..1e2d5f905e7678e1eacd0b0c0755f4f8ac4a0687
--- /dev/null
+++ b/bark_infinity/assets/prompts/en_fiery.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:98ec6ae353dac99cc5925e8045a56d1164c8be39a0515d1e43a0d74a8f9f1e78
+size 56060
diff --git a/bark_infinity/assets/prompts/en_german_professor.npz b/bark_infinity/assets/prompts/en_german_professor.npz
new file mode 100644
index 0000000000000000000000000000000000000000..9830154a7514d13b092e9bc406066cb96ceaaf14
--- /dev/null
+++ b/bark_infinity/assets/prompts/en_german_professor.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b4d44e97b171c8b603a70b0857321886a90dc5d24c324776d6f989571a9be30
+size 50604
diff --git a/bark_infinity/assets/prompts/en_guitar.npz b/bark_infinity/assets/prompts/en_guitar.npz
new file mode 100644
index 0000000000000000000000000000000000000000..6da14b2c871940259787e631fcb5d040cc4f53fa
--- /dev/null
+++ b/bark_infinity/assets/prompts/en_guitar.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3b57682f552fc98248f30d0e32a408eb1c1413fa3d5caa0c99e6581cb9d9b1c7
+size 62332
diff --git a/bark_infinity/assets/prompts/en_interesting_tone.npz b/bark_infinity/assets/prompts/en_interesting_tone.npz
new file mode 100644
index 0000000000000000000000000000000000000000..97035d0e220b6369d6f0f154a2b4584b59f22c2b
--- /dev/null
+++ b/bark_infinity/assets/prompts/en_interesting_tone.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7987916c7ae88d5dd0232384cff085d3e1ac591b0fb547d4a5ee231ad5bc0e18
+size 44524
diff --git a/bark_infinity/assets/prompts/en_male_nervous_subdued.npz b/bark_infinity/assets/prompts/en_male_nervous_subdued.npz
new file mode 100644
index 0000000000000000000000000000000000000000..5a62b3b6360ebf34c381e4011edec43980236f7a
--- /dev/null
+++ b/bark_infinity/assets/prompts/en_male_nervous_subdued.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f52822d3a5ad4b5b0a438af077a30e03d2166810f61b39ac5102323576c8ddbb
+size 57852
diff --git a/bark_infinity/assets/prompts/en_male_professional_reader.npz b/bark_infinity/assets/prompts/en_male_professional_reader.npz
new file mode 100644
index 0000000000000000000000000000000000000000..77c8c067a535fb3b5eef8ccf6469a4ec5befb825
--- /dev/null
+++ b/bark_infinity/assets/prompts/en_male_professional_reader.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:409fb3cb35f420a136aee63d46d5428e6f6a8c3001f518d4a7e7b6395e1dd4b5
+size 40260
diff --git a/bark_infinity/assets/prompts/en_man_giving_ted_talk.npz b/bark_infinity/assets/prompts/en_man_giving_ted_talk.npz
new file mode 100644
index 0000000000000000000000000000000000000000..72e9437682c686d23fe6b916bc955a6c5a0d330b
--- /dev/null
+++ b/bark_infinity/assets/prompts/en_man_giving_ted_talk.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf74c736a0f7a4db125dcee35ebec94c1b2638f7921ad0e43e73dabc2130ef98
+size 55932
diff --git a/bark_infinity/assets/prompts/en_narrator_deep.npz b/bark_infinity/assets/prompts/en_narrator_deep.npz
new file mode 100644
index 0000000000000000000000000000000000000000..fa9c713139fd365f7efd0c80ac5f108d1d38aeb2
--- /dev/null
+++ b/bark_infinity/assets/prompts/en_narrator_deep.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7d4d21fbe661dd0081499b2fe0d5df5171257a497e5b5e4bce8e6315ed9929f8
+size 55988
diff --git a/bark_infinity/assets/prompts/en_narrator_light_bg.npz b/bark_infinity/assets/prompts/en_narrator_light_bg.npz
new file mode 100644
index 0000000000000000000000000000000000000000..1ad0b69b83a1b1347aef5dfafa53e887a9608c6b
--- /dev/null
+++ b/bark_infinity/assets/prompts/en_narrator_light_bg.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8d0ee9e3585bb93d3a37bd7dfeee49e9d059b722cb8f12a2fbf49b4850e046e1
+size 40100
diff --git a/bark_infinity/assets/prompts/en_old_movie_actor.npz b/bark_infinity/assets/prompts/en_old_movie_actor.npz
new file mode 100644
index 0000000000000000000000000000000000000000..495260eb857a25e3e6d88dd7f63b8f98894053eb
--- /dev/null
+++ b/bark_infinity/assets/prompts/en_old_movie_actor.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4cfe0e3f58cfe6d38b3940850ad2add20f399f9ce8abcc946351f74993e704c3
+size 59452
diff --git a/bark_infinity/assets/prompts/en_public_speaker.npz b/bark_infinity/assets/prompts/en_public_speaker.npz
new file mode 100644
index 0000000000000000000000000000000000000000..1791a1a3eb3fedea56888fc9e4145236e75c90ba
--- /dev/null
+++ b/bark_infinity/assets/prompts/en_public_speaker.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6fa7b738db8b025ecc2ced069ba979df42b4821e19e51cb6774829e186741d39
+size 57748
diff --git a/bark_infinity/assets/prompts/en_public_speaker_2.npz b/bark_infinity/assets/prompts/en_public_speaker_2.npz
new file mode 100644
index 0000000000000000000000000000000000000000..61e380fdeb632b33b027ae4f929127ba7ebe37f5
--- /dev/null
+++ b/bark_infinity/assets/prompts/en_public_speaker_2.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a5561b641ca6f2922536c53965de51bb362fe9405bdb427ea9c33aba95a31876
+size 55508
diff --git a/bark_infinity/assets/prompts/en_quiet_intense.npz b/bark_infinity/assets/prompts/en_quiet_intense.npz
new file mode 100644
index 0000000000000000000000000000000000000000..be34b1202fc974df9add9fefb5779bf3a5160076
--- /dev/null
+++ b/bark_infinity/assets/prompts/en_quiet_intense.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9504ad903317dce7344048d0d4e827d1cad252bbdca92ef1b41b1ecb1c4df19c
+size 60732
diff --git a/bark_infinity/assets/prompts/en_sharp_tone_but_noisy.npz b/bark_infinity/assets/prompts/en_sharp_tone_but_noisy.npz
new file mode 100644
index 0000000000000000000000000000000000000000..b5532392d345f34cf0dcdc6d630fc3c8092faade
--- /dev/null
+++ b/bark_infinity/assets/prompts/en_sharp_tone_but_noisy.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:454926c8d92df0386b165f168f0867f5806faaf15b21722ff17d12c716dd573a
+size 58388
diff --git a/bark_infinity/assets/prompts/en_smooth_gruff.npz b/bark_infinity/assets/prompts/en_smooth_gruff.npz
new file mode 100644
index 0000000000000000000000000000000000000000..4d5c7e92048677c612b71dde2222e275ebf95b34
--- /dev/null
+++ b/bark_infinity/assets/prompts/en_smooth_gruff.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:208a003d30eacc63d1b6c3a537d58afb656a99d2299a3af0d0392e8f5085cf21
+size 58172
diff --git a/bark_infinity/assets/prompts/en_solo_singer.npz b/bark_infinity/assets/prompts/en_solo_singer.npz
new file mode 100644
index 0000000000000000000000000000000000000000..a9ce1a2e76c7a1a0abd3e5d0fbde1cb3749e1538
--- /dev/null
+++ b/bark_infinity/assets/prompts/en_solo_singer.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:09ac1bc9245dac2fb6ae408f2df9910688de76a0386c04d7e7ca05ebebdcffc0
+size 32580
diff --git a/bark_infinity/assets/prompts/en_tv_commercial.npz b/bark_infinity/assets/prompts/en_tv_commercial.npz
new file mode 100644
index 0000000000000000000000000000000000000000..447a19e4835ded53dc6c5fb69d692ad56c25ff6f
--- /dev/null
+++ b/bark_infinity/assets/prompts/en_tv_commercial.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0650c34312c156587e74f1f5f4a13904f3d78aa0beb676a074d95bc84d291993
+size 40844
diff --git a/bark_infinity/assets/prompts/music_off_the_rails.npz b/bark_infinity/assets/prompts/music_off_the_rails.npz
new file mode 100644
index 0000000000000000000000000000000000000000..14ffbcd7a15aae0cf024f5e007aed4dfbedde56f
--- /dev/null
+++ b/bark_infinity/assets/prompts/music_off_the_rails.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4a33f48bd13dd0fe8ab55ba01dca84bdaabd69272443d18fde27ec9ffa10af04
+size 59988
diff --git a/bark_infinity/assets/prompts/rock_maybe.npz b/bark_infinity/assets/prompts/rock_maybe.npz
new file mode 100644
index 0000000000000000000000000000000000000000..90a1376af0a51b706267ad0672882e383910138d
--- /dev/null
+++ b/bark_infinity/assets/prompts/rock_maybe.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ee61636247b8aaf95efebf1d6f7853a3eb5734592cbda8d70568c3ac9181a67d
+size 60628
diff --git a/bark_infinity/assets/prompts/sing1.npz b/bark_infinity/assets/prompts/sing1.npz
new file mode 100644
index 0000000000000000000000000000000000000000..4d2c185499a922d0592115f6c1fedd3e7119919c
--- /dev/null
+++ b/bark_infinity/assets/prompts/sing1.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e04f7b52b4c9d3ba211e6ab4b9db34ab79e865bfd23581f1335d911a75c21f37
+size 60732
diff --git a/bark_infinity/assets/prompts/sing2.npz b/bark_infinity/assets/prompts/sing2.npz
new file mode 100644
index 0000000000000000000000000000000000000000..4e8ec625f15e38c812c0049ae8f8a971b5e6c5f1
--- /dev/null
+++ b/bark_infinity/assets/prompts/sing2.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fa6c04f57b3f04b0ccaa480b518fe0fd157385aa8ef6514b2bfc179a27b5466f
+size 56308
diff --git a/bark_infinity/assets/prompts/sing_3.npz b/bark_infinity/assets/prompts/sing_3.npz
new file mode 100644
index 0000000000000000000000000000000000000000..90a1376af0a51b706267ad0672882e383910138d
--- /dev/null
+++ b/bark_infinity/assets/prompts/sing_3.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ee61636247b8aaf95efebf1d6f7853a3eb5734592cbda8d70568c3ac9181a67d
+size 60628
diff --git a/bark_infinity/assets/prompts/snarky_but_noisy.npz b/bark_infinity/assets/prompts/snarky_but_noisy.npz
new file mode 100644
index 0000000000000000000000000000000000000000..ec7d197bc30e247abb4327928038c8025b2c922b
--- /dev/null
+++ b/bark_infinity/assets/prompts/snarky_but_noisy.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0cde75af58006188bfe7db7cb0d9e979e3b6bf378677d7242c379a665e7bd018
+size 62332
diff --git a/bark_infinity/assets/prompts/snarky_narrator_but_noisy.npz b/bark_infinity/assets/prompts/snarky_narrator_but_noisy.npz
new file mode 100644
index 0000000000000000000000000000000000000000..fce59a7eb32d387b62c5c69d53563aa854d0dbae
--- /dev/null
+++ b/bark_infinity/assets/prompts/snarky_narrator_but_noisy.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55576eda26c6e38f5324bd99aadeb985f98a91e54fec53de95785479912caa16
+size 44788
diff --git a/bark_infinity/assets/prompts/talkradio.npz b/bark_infinity/assets/prompts/talkradio.npz
new file mode 100644
index 0000000000000000000000000000000000000000..e20864f7e03daf1e03506c6e5dfda03958cd3a79
--- /dev/null
+++ b/bark_infinity/assets/prompts/talkradio.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:af1cc4cd5ac53029bef5d20f518f2910e8ddb8d1eaf7abaf101fcc586c081b00
+size 56628
diff --git a/bark_infinity/assets/prompts/timid_jane.npz b/bark_infinity/assets/prompts/timid_jane.npz
new file mode 100644
index 0000000000000000000000000000000000000000..ae8a1cf36ede5cf1e11a6e17b791186b7194b4bf
--- /dev/null
+++ b/bark_infinity/assets/prompts/timid_jane.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b94568bc84dad9374dc681b73d480fd2e124b12d78658036ad4546a802b87aa
+size 60308
diff --git a/bark_infinity/assets/prompts/weirdvibes.npz b/bark_infinity/assets/prompts/weirdvibes.npz
new file mode 100644
index 0000000000000000000000000000000000000000..21b8abad5d74deb08ea93f031c98ef835bb70d8d
--- /dev/null
+++ b/bark_infinity/assets/prompts/weirdvibes.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:14baa86721202db6edcfef3fae2128f0cc1b994c2655a4f3ac68e6632c769c39
+size 58972
diff --git a/bark_infinity/assets/prompts/weirdvibes2.npz b/bark_infinity/assets/prompts/weirdvibes2.npz
new file mode 100644
index 0000000000000000000000000000000000000000..08802f8b0ee829c5a8c73b547cbbad6a9d1b13d5
--- /dev/null
+++ b/bark_infinity/assets/prompts/weirdvibes2.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f2e6ba674a9032cc923833caa6e94cf6819d739e0432b5eed4d705d696d72356
+size 58012
diff --git a/bark_infinity/assets/split_the_text.wav b/bark_infinity/assets/split_the_text.wav
new file mode 100644
index 0000000000000000000000000000000000000000..8a948692be6ff9443e5c1ceabb268ef7a74ae859
Binary files /dev/null and b/bark_infinity/assets/split_the_text.wav differ
diff --git a/bark_infinity/bark_legacy/bark_perform.py b/bark_infinity/bark_legacy/bark_perform.py
new file mode 100644
index 0000000000000000000000000000000000000000..8de288ac20bcfd8f3deddfb22943b3e10e1c5387
--- /dev/null
+++ b/bark_infinity/bark_legacy/bark_perform.py
@@ -0,0 +1,312 @@
+import argparse
+import numpy as np
+from bark import SAMPLE_RATE, generate_audio, preload_models
+import os
+import datetime
+import soundfile as sf
+import re
+from collections import defaultdict, namedtuple
+
+FileData = namedtuple("FileData", ["filename", "name", "desc"])
+
+
+
+SUPPORTED_LANGS = [
+ ("English", "en"),
+ ("German", "de"),
+ ("Spanish", "es"),
+ ("French", "fr"),
+ ("Hindi", "hi"),
+ ("Italian", "it"),
+ ("Japanese", "ja"),
+ ("Korean", "ko"),
+ ("Polish", "pl"),
+ ("Portuguese", "pt"),
+ ("Russian", "ru"),
+ ("Turkish", "tr"),
+ ("Chinese", "zh"),
+]
+
+
+
+def read_npz_files(directory):
+ return [f for f in os.listdir(directory) if f.endswith(".npz")]
+
+def extract_name_and_desc(filepath):
+ with np.load(filepath) as data:
+ name = data.get('name', '')
+ desc = data.get('desc', '')
+ return name, desc
+
+def categorize_files(files, directory):
+ categorized_files = defaultdict(list)
+ lang_dict = {code: lang for lang, code in SUPPORTED_LANGS}
+
+ for file in files:
+ name, desc = extract_name_and_desc(os.path.join(directory, file))
+ match = re.match(r"([a-z]{2}|\w+)_", file)
+ if match:
+ prefix = match.group(1)
+ if prefix in lang_dict:
+ categorized_files[lang_dict[prefix]].append(FileData(file, name, desc))
+ else:
+ categorized_files[prefix.capitalize()].append(FileData(file, name, desc))
+ else:
+ categorized_files["Other"].append(FileData(file, name, desc))
+
+ return categorized_files
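+
+# Illustrative sketch (not executed): "en_fiery.npz" is filed under "English" via the two-letter
+# language prefix, while "cartoon_extreme.npz" falls through to the \w+_ branch and is filed
+# under "Cartoon"; files with no underscore prefix at all land in "Other".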
+
+# this is a mess but whatever
+def print_speakers_list(categorized_files):
+ print("Available history prompts:")
+ for category, files in categorized_files.items():
+        def sort_key(file_data):
+            # sort by the trailing _name(_N).npz suffix when present, then by filename
+            match = re.search(r"_\w+(_\d+)?\.npz$", file_data.filename)
+            return (match.group()[:-4] if match else "", file_data.filename)
+        sorted_files = sorted(files, key=sort_key)
+ print(f"\n {category}:")
+ for file_data in sorted_files:
+ name_display = f' "{file_data.name}"' if file_data.name else ''
+ desc_display = f'{file_data.desc}' if file_data.desc else ''
+ print(f" {file_data.filename[:-4]} {name_display} {desc_display}")
+
+CUR_PATH = os.path.dirname(os.path.abspath(__file__))
+history_prompt_dir = os.path.join(CUR_PATH, "bark", "assets", "prompts")
+
+npz_files = read_npz_files(history_prompt_dir)
+categorized_files = categorize_files(npz_files, history_prompt_dir)
+ALLOWED_PROMPTS = {file[:-4] for file in npz_files}
+
+
+
+def estimate_spoken_time(text, wpm=150, time_limit=14):
+ # Remove text within square brackets
+ text_without_brackets = re.sub(r'\[.*?\]', '', text)
+
+ words = text_without_brackets.split()
+ word_count = len(words)
+ time_in_seconds = (word_count / wpm) * 60
+
+ if time_in_seconds > time_limit:
+ return True, time_in_seconds
+ else:
+ return False, time_in_seconds
+
+
+def save_npz_file(filepath, x_semantic_continued, coarse_prompt, fine_prompt, output_dir=None):
+ np.savez(filepath, semantic_prompt=x_semantic_continued, coarse_prompt=coarse_prompt, fine_prompt=fine_prompt)
+ print(f"speaker file for this clip saved to {filepath}")
+
+def split_text(text, split_words=0, split_lines=0):
+ if split_words > 0:
+ words = text.split()
+ chunks = [' '.join(words[i:i + split_words]) for i in range(0, len(words), split_words)]
+ elif split_lines > 0:
+ lines = [line for line in text.split('\n') if line.strip()]
+ chunks = ['\n'.join(lines[i:i + split_lines]) for i in range(0, len(lines), split_lines)]
+ else:
+ chunks = [text]
+ return chunks
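+
+# Illustrative sketch (not executed): split_text("one two three four five", split_words=2)
+# returns ["one two", "three four", "five"]; with split_lines it groups non-empty lines instead.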
+
+def save_audio_to_file(filepath, audio_array, sample_rate=24000, format='WAV', subtype='PCM_16', output_dir=None):
+ sf.write(filepath, audio_array, sample_rate, format=format, subtype=subtype)
+ print(f"Saved audio to {filepath}")
+
+
+def gen_and_save_audio(text_prompt, history_prompt=None, text_temp=0.7, waveform_temp=0.7, filename="", output_dir="bark_samples", split_by_words=0, split_by_lines=0, stable_mode=False, confused_travolta_mode=False, iteration=1):
+ def generate_unique_filename(base_filename):
+ name, ext = os.path.splitext(base_filename)
+ unique_filename = base_filename
+ counter = 1
+ while os.path.exists(unique_filename):
+ unique_filename = f"{name}_{counter}{ext}"
+ counter += 1
+ return unique_filename
+ orig_history_prompt = history_prompt
+ saveit = True if history_prompt is None else False
+ if iteration == 1:
+ print(f"Full Prompt: {text_prompt}")
+ if args.history_prompt:
+ print(f" Using speaker: {history_prompt}")
+ else:
+ print(f" No speaker. Randomly generating a speaker.")
+
+ text_chunks = split_text(text_prompt, split_by_words, split_by_lines)
+
+ base = None
+ npzbase = None
+ audio_arr_chunks = []
+
+    # Should output each audio chunk to disk as it goes, so you at least have a partial output if a long process crashes.
+ for i, chunk in enumerate(text_chunks):
+ print(f"Processing chunk {i + 1}/{len(text_chunks)}: {chunk}")
+ longer_than_14_seconds, estimated_time = estimate_spoken_time(chunk)
+ print(f"Current text chunk ballpark estimate: {estimated_time:.2f} seconds.")
+ if longer_than_14_seconds:
+ print(f"Text Prompt could be too long, might want to try a shorter one or try splitting tighter.")
+
+ audio_array, x = generate_audio(chunk, history_prompt, text_temp=text_temp, waveform_temp=waveform_temp, base=base, confused_travolta_mode=confused_travolta_mode)
+ if saveit is True and npzbase is None:
+ npzbase = x
+ if stable_mode:
+ base = x if (base is None and history_prompt is None) else base
+ else:
+ base = x
+ history_prompt = None
+ audio_arr_chunks.append(audio_array)
+
+ concatenated_audio_arr = np.concatenate(audio_arr_chunks)
+
+ if not filename:
+ date_str = datetime.datetime.now().strftime("%Y-%m-%d-%H")
+ truncated_text = text_prompt.replace("WOMAN:", "").replace("MAN:", "")[:15].strip().replace(" ", "_")
+ filename = f"{truncated_text}-history_prompt-{orig_history_prompt}-text_temp-{text_temp}-waveform_temp-{waveform_temp}-{date_str}.wav"
+ filename = generate_unique_filename(filename)
+
+ # Create output directory if it doesn't exist
+ if output_dir:
+ os.makedirs(output_dir, exist_ok=True)
+ filepath = os.path.join(output_dir, filename)
+ else:
+ filepath = filename
+
+ i = 1
+ name, ext = os.path.splitext(filepath)
+ while os.path.exists(filepath):
+ filepath = f"{name}_{i}{ext}"
+ i += 1
+
+ if saveit is True:
+ save_npz_file(f'{filepath}.npz', npzbase[0], npzbase[1], npzbase[2], output_dir=output_dir)
+
+ save_audio_to_file(filepath, concatenated_audio_arr, SAMPLE_RATE, output_dir=output_dir)
+
+
+
+# If there's no text_prompt passed on the command line, process this list instead.
+# If you use an entire song, make sure you set --split_by_lines.
+text_prompts = []
+
+text_prompt = """
+    ♪ We're no strangers to love ♪
+    ♪ You know the rules and so do I (do I) ♪
+    ♪ A full commitment's what I'm thinking of ♪
+    ♪ You wouldn't get this from any other guy ♪
+"""
+text_prompts.append(text_prompt)
+
+text_prompt = """
+ In the beginning the Universe was created. This has made a lot of people very angry and been widely regarded as a bad move.
+"""
+text_prompts.append(text_prompt)
+
+text_prompt = """
+ A common mistake that people make when trying to design something completely foolproof is to underestimate the ingenuity of complete fools.
+"""
+text_prompts.append(text_prompt)
+
+
+def main(args):
+
+ if args.list_speakers:
+ print_speakers_list(categorized_files)
+ else:
+ if args.text_prompt:
+ text_prompts_to_process = [args.text_prompt]
+ elif args.prompt_file:
+ with open(args.prompt_file, "r", encoding="utf-8") as f:
+ if args.prompt_file_separator:
+ text_prompts_to_process = f.read().split(args.prompt_file_separator)
+ else:
+ text_prompts_to_process = [f.read()]
+
+ text_prompts_to_process = [prompt for prompt in text_prompts_to_process if prompt.strip()]
+
+ print(f"Processing prompts from file: {args.prompt_file}")
+ print(f"Number of prompts after splitting: {len(text_prompts_to_process)}")
+
+ else:
+ print("No text prompt provided. Using the prompts defined in this python file instead.")
+ text_prompts_to_process = text_prompts
+ if args.history_prompt:
+ history_prompt = args.history_prompt
+ else:
+ history_prompt = None
+ text_temp = args.text_temp if args.text_temp else 0.7
+ waveform_temp = args.waveform_temp if args.waveform_temp else 0.7
+ stable_mode = args.stable_mode if args.stable_mode else False
+ confused_travolta_mode = args.confused_travolta_mode if args.confused_travolta_mode else False
+ filename = args.filename if args.filename else ""
+ output_dir = args.output_dir if args.output_dir else "bark_samples"
+
+ print("Loading Bark models...")
+
+ if args.use_smaller_models:
+ print("Using smaller models.")
+ preload_models(use_smaller_models=True)
+ else:
+ preload_models()
+
+ print("Models loaded.")
+
+ for idx, prompt in enumerate(text_prompts_to_process, start=1):
+ print(f"Processing prompt {idx} of {len(text_prompts_to_process)}:")
+
+ split_by_words = args.split_by_words if args.split_by_words else 0
+ split_by_lines = args.split_by_lines if args.split_by_lines else 0
+
+ if args.iterations > 1:
+ for iteration in range(1, args.iterations + 1):
+ print(f"Iteration {iteration} of {args.iterations}.")
+ gen_and_save_audio(prompt, history_prompt, text_temp, waveform_temp, filename, output_dir, split_by_words, split_by_lines, stable_mode, confused_travolta_mode, iteration=iteration)
+ else:
+ gen_and_save_audio(prompt, history_prompt, text_temp, waveform_temp, filename, output_dir, split_by_words, split_by_lines, stable_mode, confused_travolta_mode)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="""
+ (This grew into a bit more than a BARK CLI wrapper.)
+
+ WELCOME TO BARK INFINITY
+
+ INFINITY VOICES
+ Discover cool new voices, save them, share them.
+ Every audio clip saves a speaker.npz file with voice.
+ To reuse a voice, move the generated speaker.npz file (named the same as the .wav file)
+ to the "prompts" directory inside "bark" where all the other .npz files are.
+
+ INFINITY LENGTH
+ Any length prompt and audio clips.
+    Sometimes the final result is seamless, sometimes it's stable. (But usually not both!)
+
+ CONFUSED TRAVOLTA MODE
+ Not super useful but very fun.
+
+ --use_smaller_models for faster generation even on low VRAM gpus.
+
+ install this first: pip install soundfile
+
+ Example: python bark_perform.py --text_prompt "It is a mistake to think you can solve any major problems just with potatoes... (and full page more of text)" --split_by_words 35
+
+    BARK INFINITY is possible because Bark is such an amazingly simple and powerful model that even I could poke around in it easily.
+
+    For music I recommend using --split_by_lines and making sure you use a multiline string as input.
+    You'll generally get better results if you manually split your text, which I neglected to provide an easy way to do (separate token?).
+
+ """, formatter_class=argparse.RawTextHelpFormatter)
+ parser.add_argument("--text_prompt", help="Text prompt. If not provided, a set of default prompts will be used defined in this file.")
+ parser.add_argument("--history_prompt", default=None, help="Optional. Choose a speaker from the list of languages: . Use --list_speakers to see all available options.")
+ parser.add_argument("--text_temp", type=float, help="Text temperature. Default is 0.7.")
+ parser.add_argument("--waveform_temp", type=float, help="Waveform temperature. Default is 0.7.")
+ parser.add_argument("--filename", help="Output filename. If not provided, a unique filename will be generated based on the text prompt and other parameters.")
+ parser.add_argument("--output_dir", help="Output directory. Default is 'bark_samples'.")
+ parser.add_argument("--list_speakers", action="store_true", help="List all preset speaker options instead of generating audio.")
+ parser.add_argument("--use_smaller_models", action="store_true", help="Use for GPUS with less than 10GB of memory, or for more speed.")
+ parser.add_argument("--iterations", type=int, default=1, help="Number of iterations. Default is 1.")
+ parser.add_argument("--split_by_words", type=int, default=0, help="Breaks text_prompt into <14 second audio clips every x words")
+ parser.add_argument("--split_by_lines", type=int, default=0, help="Breaks text_prompt into <14 second audio clips every x lines")
+ parser.add_argument("--stable_mode", action="store_true", help="Choppier and not as natural sounding, but much more stable for very long audio files.")
+ parser.add_argument("--confused_travolta_mode", default=False, action="store_true", help="Just for fun. Try it and you'll understand.")
+
+ parser.add_argument("--prompt_file", help="Optional. The path to a file containing the text prompt. Overrides the --text_prompt option if provided.")
+ parser.add_argument("--prompt_file_separator", help="Optional. The separator used to split the content of the prompt_file into multiple text prompts.")
+
+ args = parser.parse_args()
+ main(args)
diff --git a/bark_infinity/bark_legacy/bark_speak.py b/bark_infinity/bark_legacy/bark_speak.py
new file mode 100644
index 0000000000000000000000000000000000000000..11467b57ce16da99b55eef83382002ead4cdfccb
--- /dev/null
+++ b/bark_infinity/bark_legacy/bark_speak.py
@@ -0,0 +1,175 @@
+import argparse
+import numpy as np
+from bark import SAMPLE_RATE, generate_audio, preload_models
+import os
+import datetime
+import soundfile as sf
+import re
+
+SUPPORTED_LANGS = [
+ ("English", "en"),
+ ("German", "de"),
+ ("Spanish", "es"),
+ ("French", "fr"),
+ ("Hindi", "hi"),
+ ("Italian", "it"),
+ ("Japanese", "ja"),
+ ("Korean", "ko"),
+ ("Polish", "pl"),
+ ("Portuguese", "pt"),
+ ("Russian", "ru"),
+ ("Turkish", "tr"),
+ ("Chinese", "zh"),
+]
+
+
+ALLOWED_PROMPTS = {"announcer"}
+for _, lang in SUPPORTED_LANGS:
+ for n in range(10):
+ ALLOWED_PROMPTS.add(f"{lang}_speaker_{n}")
+ for n in range(10):
+ ALLOWED_PROMPTS.add(f"speaker_{n}")
+
+def estimate_spoken_time(text, wpm=150, time_limit=14):
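+    # Rough heuristic: strip bracketed sound-effect tags, then estimate duration from a
+    # words-per-minute rate. Returns (exceeds_time_limit, estimated_seconds).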
+ # Remove text within square brackets
+ text_without_brackets = re.sub(r'\[.*?\]', '', text)
+
+ words = text_without_brackets.split()
+ word_count = len(words)
+ time_in_seconds = (word_count / wpm) * 60
+
+ if time_in_seconds > time_limit:
+ return True, time_in_seconds
+ else:
+ return False, time_in_seconds
+
+def save_audio_to_file(filename, audio_array, sample_rate=24000, format='WAV', subtype='PCM_16', output_dir=None):
+
+ # Create output directory if it doesn't exist
+ if output_dir:
+ os.makedirs(output_dir, exist_ok=True)
+ filepath = os.path.join(output_dir, filename)
+ else:
+ filepath = filename
+
+ i = 1
+ name, ext = os.path.splitext(filepath)
+ while os.path.exists(filepath):
+ filepath = f"{name}_{i}{ext}"
+ i += 1
+
+ sf.write(filepath, audio_array, sample_rate, format=format, subtype=subtype)
+ print(f"Saved audio to {filepath}")
+
+
+def gen_and_save_audio(text_prompt, history_prompt=None, text_temp=0.7, waveform_temp=0.7, filename="", output_dir="bark_samples"):
+ def generate_unique_filename(base_filename):
+ name, ext = os.path.splitext(base_filename)
+ unique_filename = base_filename
+ counter = 1
+ while os.path.exists(unique_filename):
+ unique_filename = f"{name}_{counter}{ext}"
+ counter += 1
+ return unique_filename
+
+ longer_than_14_seconds, estimated_time = estimate_spoken_time(text_prompt)
+ print(f"Estimated time: {estimated_time:.2f} seconds.")
+ if longer_than_14_seconds:
+ print(f"Text Prompt could be too long, might want to try a shorter one if you get a bad result.")
+ print(f"Generating: {text_prompt}")
+    if history_prompt:
+        print(f"Using speaker: {history_prompt}")
+    else:
+        print("No speaker. Randomly generating a speaker.")
+
+ audio_array = generate_audio(text_prompt, history_prompt, text_temp=text_temp,
+ waveform_temp=waveform_temp)
+
+ if not filename:
+ date_str = datetime.datetime.now().strftime("%Y-%m-%d-%H")
+ truncated_text = text_prompt.replace("WOMAN:", "").replace("MAN:", "")[:15].strip().replace(" ", "_")
+ filename = f"{truncated_text}-history_prompt-{history_prompt}-text_temp-{text_temp}-waveform_temp-{waveform_temp}-{date_str}.wav"
+ filename = generate_unique_filename(filename)
+
+ save_audio_to_file(filename, audio_array, SAMPLE_RATE, output_dir=output_dir)
+
+
+
+
+def print_speakers_list():
+ print("Available history prompts:")
+ print("\nNon-specific speakers:")
+ print(" announcer")
+ print(" speaker_0 to speaker_9")
+ print("\nLanguage-specific speakers:")
+ for language, lang_code in SUPPORTED_LANGS:
+ speakers = ", ".join([f"{lang_code}_speaker_{n}" for n in range(10)])
+ print(f"\n {language}({lang_code}):\n{speakers}")
+
+
+
+# If there's no text_prompt passed on the command line, process this list instead.
+text_prompts = []
+
+text_prompt = """
+ In the beginning the Universe was created. This has made a lot of people very angry and been widely regarded as a bad move.
+"""
+text_prompts.append(text_prompt)
+
+text_prompt = """
+ A common mistake that people make when trying to design something completely foolproof is to underestimate the ingenuity of complete fools.
+"""
+text_prompts.append(text_prompt)
+
+
+def main(args):
+ if args.list_speakers:
+ print_speakers_list()
+ else:
+ if args.text_prompt:
+ text_prompts_to_process = [args.text_prompt]
+ else:
+ print("No text prompt provided. Using default prompts defined in this file.")
+ text_prompts_to_process = text_prompts
+ if args.history_prompt:
+ history_prompt = args.history_prompt
+ else:
+ history_prompt = None
+ text_temp = args.text_temp if args.text_temp else 0.7
+ waveform_temp = args.waveform_temp if args.waveform_temp else 0.7
+ filename = args.filename if args.filename else ""
+ output_dir = args.output_dir if args.output_dir else "bark_samples"
+
+ print("Loading Bark models...")
+
+ if args.use_smaller_models:
+ print("Using smaller models.")
+ preload_models(use_smaller_models=True)
+ else:
+ preload_models()
+
+ print("Models loaded.")
+
+ for prompt in text_prompts_to_process:
+ gen_and_save_audio(prompt, history_prompt, text_temp, waveform_temp, filename, output_dir)
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="""
+ Generate and save audio.
+ install this first: pip install soundfile
+ Example: python bark_speak.py --text_prompt "It is a mistake to think you can solve any major problems just with potatoes." --history_prompt en_speaker_3
+ """, formatter_class=argparse.RawTextHelpFormatter)
+ parser.add_argument("--text_prompt", help="Text prompt. If not provided, a set of default prompts will be used defined in this file.")
+ parser.add_argument("--history_prompt", help="Optional. Choose a speaker from the list of languages: " + ", ".join([lang[0] for lang in SUPPORTED_LANGS]) + ". Use --list_speakers to see all available options.")
+ parser.add_argument("--text_temp", type=float, help="Text temperature. Default is 0.7.")
+ parser.add_argument("--waveform_temp", type=float, help="Waveform temperature. Default is 0.7.")
+ parser.add_argument("--filename", help="Output filename. If not provided, a unique filename will be generated based on the text prompt and other parameters.")
+ parser.add_argument("--output_dir", help="Output directory. Default is 'bark_samples'.")
+ parser.add_argument("--list_speakers", action="store_true", help="List all preset speaker options instead of generating audio.")
+ parser.add_argument("--use_smaller_models", action="store_true", help="Use for GPUS with less than 10GB of memory, or for more speed.")
+
+ args = parser.parse_args()
+ main(args)
diff --git a/bark_infinity/clonevoice.py b/bark_infinity/clonevoice.py
new file mode 100644
index 0000000000000000000000000000000000000000..deb1afa81c632d10900e065ba6d989845025d883
--- /dev/null
+++ b/bark_infinity/clonevoice.py
@@ -0,0 +1,1049 @@
+from bark_infinity import generation
+from bark_infinity import api
+
+from bark_infinity.generation import SAMPLE_RATE, load_codec_model
+
+from encodec.utils import convert_audio
+import torchaudio
+import torch
+import os
+import gradio
+import numpy as np
+import shutil
+
+import math
+import datetime
+from pathlib import Path
+import re
+
+
+from pydub import AudioSegment
+
+
+from typing import List
+
+from math import ceil
+
+
+from bark_infinity.hubert.customtokenizer import CustomTokenizer
+from bark_infinity.hubert.hubert_manager import HuBERTManager
+from bark_infinity.hubert.pre_kmeans_hubert import CustomHubert
+
+
+def sanitize_filename(filename):
+ # replace invalid characters with underscores
+ return re.sub(r"[^a-zA-Z0-9_]", "_", filename)
+
+
+CONTEXT_WINDOW_SIZE = 1024
+
+SEMANTIC_RATE_HZ = 49.9
+SEMANTIC_VOCAB_SIZE = 10_000
+
+CODEBOOK_SIZE = 1024
+N_COARSE_CODEBOOKS = 2
+N_FINE_CODEBOOKS = 8
+COARSE_RATE_HZ = 75
+
+SAMPLE_RATE = 24_000
+
+TEXT_ENCODING_OFFSET = 10_048
+SEMANTIC_PAD_TOKEN = 10_000
+TEXT_PAD_TOKEN = 129_595
+SEMANTIC_INFER_TOKEN = 129_599
+
+from bark_infinity import text_processing
+from bark_infinity import config
+
+
+# test polish
+
+alt_model = {
+ "repo": "Hobis/bark-voice-cloning-polish-HuBERT-quantizer",
+ "model": "polish-HuBERT-quantizer_8_epoch.pth",
+ "tokenizer_name": "polish_tokenizer_large.pth",
+}
+
+"""
+def validate_prompt_ratio(history_prompt):
+ semantic_to_coarse_ratio = COARSE_RATE_HZ / SEMANTIC_RATE_HZ
+
+ semantic_prompt = history_prompt["semantic_prompt"]
+ coarse_prompt = history_prompt["coarse_prompt"]
+ fine_prompt = history_prompt["fine_prompt"]
+
+ current_semantic_len = len(semantic_prompt)
+ current_coarse_len = coarse_prompt.shape[1]
+ current_fine_len = fine_prompt.shape[1]
+
+ expected_coarse_len = int(current_semantic_len * semantic_to_coarse_ratio)
+ expected_fine_len = expected_coarse_len
+
+ if current_coarse_len != expected_coarse_len:
+ print(f"Coarse length mismatch! Expected {expected_coarse_len}, got {current_coarse_len}.")
+ return False
+
+ if current_fine_len != expected_fine_len:
+ print(f"Fine length mismatch! Expected {expected_fine_len}, got {current_fine_len}.")
+ return False
+
+ return True
+"""
+
+
+def write_clone_npz(filepath, full_generation, regen_fine=False, gen_raw_coarse=False, **kwargs):
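+    # Save a full_generation dict (semantic/coarse/fine token arrays) to an .npz speaker file,
+    # render a quick codec preview of it, and optionally write extra "blurry" variants with the
+    # fine (or raw coarse) tokens regenerated from the saved history.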
+ gen_raw_coarse = False
+
+ filepath = api.generate_unique_filepath(filepath)
+ # np.savez_compressed(filepath, semantic_prompt = full_generation["semantic_prompt"], coarse_prompt = full_generation["coarse_prompt"], fine_prompt = full_generation["fine_prompt"])
+ if "semantic_prompt" in full_generation:
+ np.savez(
+ filepath,
+ semantic_prompt=full_generation["semantic_prompt"],
+ coarse_prompt=full_generation["coarse_prompt"],
+ fine_prompt=full_generation["fine_prompt"],
+ )
+ quick_codec_render(filepath)
+ else:
+ print("No semantic prompt to save")
+
+ history_prompt = load_npz(filepath)
+ if regen_fine:
+        # maybe cut half or something so half a speaker, so we have some history -- would that do anything? or dupe it?
+
+ # fine_tokens = generation.generate_fine(full_generation["coarse_prompt"])
+
+ fine_tokens = generation.generate_fine(
+ history_prompt["coarse_prompt"], history_prompt=history_prompt
+ )
+ base = os.path.basename(filepath)
+ filename, extension = os.path.splitext(base)
+ suffix = "_blurryhistory_"
+ new_filename = filename + suffix
+        new_filepath = os.path.join(os.path.dirname(filepath), new_filename + extension)
+ new_filepath = api.generate_unique_filepath(new_filepath)
+ np.savez(
+ new_filepath,
+ semantic_prompt=history_prompt["semantic_prompt"],
+ coarse_prompt=history_prompt["coarse_prompt"],
+ fine_prompt=fine_tokens,
+ )
+ quick_codec_render(new_filepath)
+
+ fine_tokens = generation.generate_fine(history_prompt["coarse_prompt"], history_prompt=None)
+ base = os.path.basename(filepath)
+ filename, extension = os.path.splitext(base)
+ suffix = "_blurrynohitory_"
+ new_filename = filename + suffix
+        new_filepath = os.path.join(os.path.dirname(filepath), new_filename + extension)
+ new_filepath = api.generate_unique_filepath(new_filepath)
+ np.savez(
+ new_filepath,
+ semantic_prompt=history_prompt["semantic_prompt"],
+ coarse_prompt=history_prompt["coarse_prompt"],
+ fine_prompt=fine_tokens,
+ )
+ quick_codec_render(new_filepath)
+
+ if gen_raw_coarse:
+ show_history_prompt_size(history_prompt)
+ new_history = resize_history_prompt(history_prompt, tokens=128, from_front=False)
+ # print(api.history_prompt_detailed_report(full_generation))
+ # show_history_prompt_size(full_generation)
+
+ # maybe cut half or something so half a speaker?
+
+ coarse_tokens = generation.generate_coarse(
+ history_prompt["semantic_prompt"],
+ history_prompt=history_prompt,
+ use_kv_caching=True,
+ )
+ base = os.path.basename(filepath)
+ filename, extension = os.path.splitext(base)
+ suffix = "coarse_yes_his_"
+ new_filename = filename + suffix
+        new_filepath = os.path.join(os.path.dirname(filepath), new_filename + extension)
+ new_filepath = api.generate_unique_filepath(new_filepath)
+ np.savez(
+ new_filepath,
+ semantic_prompt=history_prompt["semantic_prompt"],
+ coarse_prompt=coarse_tokens,
+ fine_prompt=None,
+ )
+ quick_codec_render(new_filepath)
+
+ api.history_prompt_detailed_report(history_prompt)
+
+ # maybe cut half or something so half a speaker?
+ coarse_tokens = generation.generate_coarse(
+ history_prompt["semantic_prompt"], use_kv_caching=True
+ )
+ base = os.path.basename(filepath)
+ filename, extension = os.path.splitext(base)
+ suffix = "_course_no_his_"
+ new_filename = filename + suffix
+        new_filepath = os.path.join(os.path.dirname(filepath), new_filename + extension)
+ new_filepath = api.generate_unique_filepath(new_filepath)
+ np.savez(
+ new_filepath,
+ semantic_prompt=history_prompt["semantic_prompt"],
+ coarse_prompt=coarse_tokens,
+ fine_prompt=None,
+ )
+ quick_codec_render(new_filepath)
+
+
+# missing at least two good tokens
+soft_semantic = [2, 3, 4, 5, 10, 206]
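+# Semantic token ids treated as "soft" split points by the segmenter below.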
+# allowed_splits = [3,4,5,10]
+
+
+# somehow actually works great
+def segment_these_semantics_smartly_and_smoothly(
+ tokens,
+ soft_semantic,
+ split_threshold=4,
+ minimum_segment_size=64,
+ maximum_segment_size=768,
+ maximum_segment_size_split_threshold=1,
+ require_consecutive_split_tokens=True,
+ repetition_threshold=15,
+):
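+    # Split a stream of semantic tokens into segments, preferring to cut where several "soft"
+    # tokens (or a long run of repeated tokens) occur, while respecting the minimum and maximum
+    # segment sizes. Returns a list of token lists.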
+ segments = []
+ segment = []
+ split_counter = 0
+ max_split_counter = 0
+ repetition_counter = (
+ 1 # start at 1 as the first token is the beginning of a potential repetition
+ )
+ last_token = None
+ last_token_was_split = False
+
+ for token in tokens:
+ segment.append(token)
+
+ if (
+ token == last_token
+ ): # if this token is the same as the last one, increment the repetition counter
+ repetition_counter += 1
+ else: # otherwise, reset the repetition counter
+ repetition_counter = 1
+
+ if token in soft_semantic:
+ if not require_consecutive_split_tokens or (
+ require_consecutive_split_tokens and last_token_was_split
+ ):
+ split_counter += 1
+ else:
+ split_counter = 1
+ max_split_counter = 0
+ last_token_was_split = True
+ else:
+ max_split_counter += 1
+ last_token_was_split = False
+
+ if (split_counter == split_threshold or repetition_counter == repetition_threshold) and len(
+ segment
+ ) >= minimum_segment_size:
+ segments.append(segment)
+ segment = []
+ split_counter = 0
+ max_split_counter = 0
+ repetition_counter = 1 # reset the repetition counter after a segment split
+ elif len(segment) > maximum_segment_size:
+ if (
+ max_split_counter == maximum_segment_size_split_threshold
+ or maximum_segment_size_split_threshold == 0
+ ):
+ segments.append(segment[:-max_split_counter])
+ segment = segment[-max_split_counter:]
+ split_counter = 0
+ max_split_counter = 0
+
+ last_token = token # update last_token at the end of the loop
+
+ if segment: # don't forget to add the last segment
+ segments.append(segment)
+
+ return segments
+
+
+def quick_clone(file):
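+    # One-shot clone: build a full Bark history prompt from a single audio file using
+    # HuBERT-quantized semantic tokens, EnCodec fine tokens, and coarse tokens derived
+    # from the fine ones.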
+ # file_name = ".".join(file.replace("\\", "/").split("/")[-1].split(".")[:-1])
+ # out_file = f"data/bark_custom_speakers/{file_name}.npz"
+
+ semantic_prompt = wav_to_semantics(file)
+ fine_prompt = generate_fine_from_wav(file)
+ coarse_prompt = generate_course_history(fine_prompt)
+
+ full_generation = {
+ "semantic_prompt": semantic_prompt,
+ "coarse_prompt": coarse_prompt,
+ "fine_prompt": fine_prompt,
+ }
+
+ return full_generation
+
+
+def clone_voice(
+ audio_filepath,
+ input_audio_filename_secondary,
+ dest_filename,
+ speaker_as_clone_content=None,
+ progress=gradio.Progress(track_tqdm=True),
+ max_retries=2,
+ even_more_clones=False,
+ extra_blurry_clones=False,
+ audio_filepath_directory=None,
+ simple_clones_only=False,
+):
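+    # Full cloning pipeline: for each input wav, extract semantic tokens (HuBERT + quantizer),
+    # fine tokens (EnCodec), and coarse tokens, then write a set of candidate speaker .npz files
+    # sliced from different parts of the clip. If a secondary audio file is given (or
+    # even_more_clones is set), also generate second-generation clones by continuing the
+    # semantic stream with short chopped histories.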
+ old = generation.OFFLOAD_CPU
+ generation.OFFLOAD_CPU = False
+
+ dest_filename = sanitize_filename(dest_filename)
+ timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
+ dir_path = Path("cloned_voices") / f"{dest_filename}_{timestamp}"
+ dir_path.mkdir(parents=True, exist_ok=True)
+
+ base_clone_subdir = Path(dir_path) / f"gen_0_clones"
+ base_clone_subdir.mkdir(parents=True, exist_ok=True)
+
+ starting_base_output_path = base_clone_subdir
+
+ starting_base_output_path = starting_base_output_path / f"{dest_filename}"
+
+ audio_filepath_files = []
+
+ if audio_filepath_directory is not None and audio_filepath_directory.strip() != "":
+ audio_filepath_files = os.listdir(audio_filepath_directory)
+ audio_filepath_files = [file for file in audio_filepath_files if file.endswith(".wav")]
+
+ audio_filepath_files = [
+ os.path.join(audio_filepath_directory, file) for file in audio_filepath_files
+ ]
+
+ print(f"Found {len(audio_filepath_files)} audio files in {audio_filepath_directory}")
+
+ else:
+ audio_filepath_files = [audio_filepath]
+
+ for audio_num, audio_filepath in enumerate(audio_filepath_files):
+ if audio_filepath is None or not os.path.exists(audio_filepath):
+ print(f"The audio file {audio_filepath} does not exist. Please check the path.")
+ progress(0, f"The audio file {audio_filepath} does not exist. Please check the path.")
+ return
+ else:
+ print(f"Found the audio file {audio_filepath}.")
+
+ base_output_path = Path(f"{starting_base_output_path}_file{audio_num}.npz")
+
+ progress(0, desc="HuBERT Quantizer, Quantizing.")
+
+ default_prompt_width = 512
+
+ budget_prompt_width = 512
+
+ attempts = 0
+
+ orig_semantic_prompt = None
+ all_completed_clones = []
+
+ print(f"Cloning voice from {audio_filepath} to {dest_filename}")
+
+ if even_more_clones is True:
+ max_retries = 2
+ else:
+ max_retries = 1
+
+ while attempts < max_retries:
+ attempts += 1
+
+ # Step 1: Converting WAV to Semantics
+ progress(1, desc="Step 1 of 4: Converting WAV to Semantics")
+
+ print(f"attempt {attempts} of {max_retries}")
+ if attempts == 2:
+ semantic_prompt_tensor = wav_to_semantics(audio_filepath, alt_model)
+ else:
+ semantic_prompt_tensor = wav_to_semantics(audio_filepath)
+
+ orig_semantic_prompt = semantic_prompt_tensor
+ # semantic_prompt = semantic_prompt_tensor.numpy()
+ semantic_prompt = semantic_prompt_tensor
+
+ # Step 2: Generating Fine from WAV
+ progress(2, desc="Step 2 of 4: Generating Fine from WAV")
+ try:
+ fine_prompt = generate_fine_from_wav(audio_filepath)
+ except Exception as e:
+ print(f"Failed at step 2 with error: {e}")
+ continue
+
+ # Step 3: Generating Coarse History
+ progress(3, desc="Step 3 of 4: Generating Coarse History")
+ coarse_prompt = generate_course_history(fine_prompt)
+ # coarse_prompt = coarse_prompt.numpy()
+
+ # Building the history prompt
+ history_prompt = {
+ "semantic_prompt": semantic_prompt,
+ "coarse_prompt": coarse_prompt,
+ "fine_prompt": fine_prompt,
+ }
+
+ # print types of each
+ # print(f"semantic_prompt type: {type(semantic_prompt)}")
+ # print(f"coarse_prompt type: {type(coarse_prompt)}")
+ # print(f"fine_prompt type: {type(fine_prompt)}")
+
+ if not api.history_prompt_is_valid(history_prompt):
+ print("Primary prompt potentially problematic:")
+ print(api.history_prompt_detailed_report(history_prompt))
+
+ attempt_string = f"_{attempts}"
+ attempt_string = f""
+ if attempts == 2:
+ # attempt_string = f"{attempt_string}a"
+ attempt_string = f"_x"
+
+ output_path = base_output_path.with_stem(base_output_path.stem + attempt_string)
+
+ # full_output_path = output_path.with_stem(output_path.stem + "_FULLAUDIOCLIP")
+ # write_clone_npz(str(full_output_path), history_prompt)
+
+ # The back of audio is generally the best speaker by far, as the user specifically chose this audio clip and it likely has a natural ending.
+            # If you had to choose one, the front of the clip is a bit different in style and still decent, though since it's
+            # cut at an arbitrary point it has a high chance of being terrible.
+
+            progress(4, desc="\nSegmenting A Little More Smoothly Now...\n")
+            print("Segmenting A Little More Smoothly Now...")
+
+ full_output_path = output_path.with_stem(output_path.stem + "_FULL_LENGTH_AUDIO")
+ write_clone_npz(str(full_output_path), history_prompt)
+
+ full = load_npz(str(full_output_path))
+ # print(f"{show_history_prompt_size(full, token_samples=128)}")
+
+ # The back of clip generally the best speaker, as the user specifically chose this audio clip and it likely has a natural ending.
+
+ clip_full_semantic_length = len(semantic_prompt)
+
+ back_history_prompt = resize_history_prompt(
+ history_prompt, tokens=768, from_front=False
+ )
+ back_output_path = output_path.with_stem(output_path.stem + "__ENDCLIP")
+ write_clone_npz(
+ str(back_output_path), back_history_prompt, regen_fine=extra_blurry_clones
+ )
+ all_completed_clones.append(
+ (
+ back_history_prompt,
+ str(back_output_path),
+ clip_full_semantic_length - 768,
+ )
+ )
+
+ # thought this would need to be more sophisticated, maybe this is ok
+
+ split_semantic_segments = [semantic_prompt]
+
+ if not simple_clones_only:
+ split_semantic_segments = segment_these_semantics_smartly_and_smoothly(
+ semantic_prompt,
+ soft_semantic,
+ split_threshold=3,
+ minimum_segment_size=96,
+ maximum_segment_size=768,
+ maximum_segment_size_split_threshold=1,
+ require_consecutive_split_tokens=True,
+ repetition_threshold=9,
+ )
+ else:
+ print(f"Skipping smart segmentation, using single file instead.")
+
+ clone_start = 0
+
+ segment_number = 1
+
+ # while clone_end < clip_full_semantic_length + semantic_step_interval:
+ for idx, semantic_segment_smarter_seg in enumerate(split_semantic_segments):
+ semantic_segment_smarter_seg_len = len(semantic_segment_smarter_seg)
+ current_slice = clone_start + semantic_segment_smarter_seg_len
+ # segment_movement_so_far = current_slice
+
+ clone_start = current_slice
+ sliced_history_prompt = resize_history_prompt(
+ history_prompt, tokens=current_slice, from_front=True
+ )
+ sliced_history_prompt = resize_history_prompt(
+ sliced_history_prompt, tokens=budget_prompt_width, from_front=False
+ )
+ if api.history_prompt_is_valid(sliced_history_prompt):
+ # segment_output_path = output_path.with_stem(output_path.stem + f"_s_{current_slice}")
+ segment_output_path = output_path.with_stem(
+ output_path.stem + f"_{segment_number}"
+ )
+ else:
+ print(f"segment {segment_number} potentially problematic:")
+ # print(api.history_prompt_detailed_report(sliced_history_prompt))
+ sliced_history_prompt = resize_history_prompt(
+ sliced_history_prompt,
+ tokens=budget_prompt_width - 1,
+ from_front=False,
+ )
+ if api.history_prompt_is_valid(sliced_history_prompt):
+ # segment_output_path = output_path.with_stem(output_path.stem + f"_s_{current_slice}")
+ segment_output_path = output_path.with_stem(
+ output_path.stem + f"_{segment_number}"
+ )
+ else:
+ print(f"segment {segment_number} still potentially problematic:")
+ # print(api.history_prompt_detailed_report(sliced_history_prompt))
+ continue
+
+ write_clone_npz(
+ str(segment_output_path),
+ sliced_history_prompt,
+ regen_fine=extra_blurry_clones,
+ )
+ segment_number += 1
+ all_completed_clones.append(
+ (sliced_history_prompt, str(segment_output_path), current_slice)
+ )
+
+ if attempts == 1 and False:
+ original_audio_filepath_ext = Path(audio_filepath).suffix
+ copy_of_original_target_audio_file = (
+ dir_path / f"{dest_filename}_TARGET_ORIGINAL_audio.wav"
+ )
+ copy_of_original_target_audio_file = api.generate_unique_filepath(
+ str(copy_of_original_target_audio_file)
+ )
+ print(
+ f"Copying original clone audio sample from {audio_filepath} to {copy_of_original_target_audio_file}"
+ )
+ shutil.copyfile(audio_filepath, str(copy_of_original_target_audio_file))
+
+ progress(5, desc="Base Voice Clones Done")
+ print(f"Finished cloning voice from {audio_filepath} to {dest_filename}")
+
+ # TODO just an experiment, doesn't seem to help though
+ orig_semantic_prompt = orig_semantic_prompt.numpy()
+
+ import random
+
+ print(f"input_audio_filename_secondary: {input_audio_filename_secondary}")
+
+ if input_audio_filename_secondary is not None:
+ progress(5, desc="Generative Clones, Long Clip, Lots of randomness")
+
+ second_sample_prompt = None
+ if input_audio_filename_secondary is not None:
+ progress(
+ 5,
+ desc="Step 5 of 5: Converting Secondary Audio sample to Semantic Prompt",
+ )
+ second_sample_tensor = wav_to_semantics(input_audio_filename_secondary)
+ second_sample_prompt = second_sample_tensor.numpy()
+ if len(second_sample_prompt) > 850:
+ second_sample_prompt = second_sample_prompt[
+ :850
+ ] # Actually from front, makes sense
+
+ orig_semantic_prompt_len = len(orig_semantic_prompt)
+
+ generation.OFFLOAD_CPU = old
+
+ generation.preload_models()
+ generation.clean_models()
+
+ total_clones = len(all_completed_clones)
+ clone_num = 0
+ for clone, filepath, end_slice in all_completed_clones:
+ clone_num += 1
+ clone_history = load_npz(filepath) # lazy tensor to numpy...
+ progress(5, desc=f"Generating {clone_num} of {total_clones}")
+ if api.history_prompt_is_valid(clone_history):
+ end_of_prompt = end_slice + budget_prompt_width
+ if end_of_prompt > orig_semantic_prompt_len:
+ semantic_next_segment = orig_semantic_prompt # use beginning
+ else:
+ semantic_next_segment = orig_semantic_prompt[
+ -(orig_semantic_prompt_len - end_slice) :
+ ]
+
+ prompts = []
+ if second_sample_prompt is not None:
+ prompts.append(second_sample_prompt)
+
+ if even_more_clones:
+ prompts.append(semantic_next_segment)
+
+ for semantic_next_segment in prompts:
+ # print(f"Shape of semantic_next_segment: {semantic_next_segment.shape}")
+
+ if len(semantic_next_segment) > 800:
+ semantic_next_segment = semantic_next_segment[:800]
+
+ chop1 = random.randint(32, 128)
+ chop2 = random.randint(64, 192)
+ chop3 = random.randint(128, 256)
+
+ chop_sizes = [chop1, chop2, chop3]
+
+ chop = random.choice(chop_sizes)
+
+ if chop == 0:
+ chop_his = None
+ else:
+ chop_his = resize_history_prompt(
+ clone_history, tokens=chop, from_front=False
+ )
+ coarse_tokens = api.generate_coarse(
+ semantic_next_segment,
+ history_prompt=chop_his,
+ temp=0.7,
+ silent=False,
+ use_kv_caching=True,
+ )
+
+ fine_tokens = api.generate_fine(
+ coarse_tokens,
+ history_prompt=chop_his,
+ temp=0.5,
+ )
+
+ full_generation = {
+ "semantic_prompt": semantic_next_segment,
+ "coarse_prompt": coarse_tokens,
+ "fine_prompt": fine_tokens,
+ }
+
+ if api.history_prompt_is_valid(full_generation):
+ base = os.path.basename(filepath)
+ filename, extension = os.path.splitext(base)
+ suffix = f"g2_{chop}_"
+ new_filename = filename + suffix
+ new_filepath = os.path.join(
+ os.path.dirname(filepath), new_filename + extension
+ )
+ new_filepath = api.generate_unique_filepath(new_filepath)
+ write_clone_npz(new_filepath, full_generation)
+
+ # messy, really bark infinity should sample from different spaces in huge npz files, no reason to cut like this.
+ suffix = f"g2f_{chop}_"
+ full_generation = resize_history_prompt(
+ full_generation, tokens=budget_prompt_width, from_front=True
+ )
+ new_filename = filename + suffix
+ new_filepath = os.path.join(
+ os.path.dirname(filepath), new_filename + extension
+ )
+ new_filepath = api.generate_unique_filepath(new_filepath)
+ write_clone_npz(new_filepath, full_generation)
+
+ tiny_history_addition = resize_history_prompt(
+ full_generation, tokens=128, from_front=True
+ )
+ merged = merge_history_prompts(
+ chop_his, tiny_history_addition, right_size=128
+ )
+ suffix = f"g2t_{chop}_"
+ full_generation = resize_history_prompt(
+ merged, tokens=budget_prompt_width, from_front=False
+ )
+ new_filename = filename + suffix
+ new_filepath = os.path.join(
+ os.path.dirname(filepath), new_filename + extension
+ )
+ new_filepath = api.generate_unique_filepath(new_filepath)
+ write_clone_npz(new_filepath, full_generation)
+ else:
+ print(f"Full generation for {filepath} was invalid, skipping")
+ print(api.history_prompt_detailed_report(full_generation))
+ else:
+ print(f"Clone {filepath} was invalid, skipping")
+ print(api.history_prompt_detailed_report(clone_history))
+
+ print(f"Generation 0 clones completed. You'll find your clones at: {base_clone_subdir}")
+
+ # restore previous CPU offload state
+
+ generation.OFFLOAD_CPU = old
+ generation.clean_models()
+ generation.preload_models() # ?
+ return f"{base_clone_subdir}"
+
+
+def quick_codec_render(filepath):
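+    # Decode the saved fine tokens (and, if usable, the coarse tokens) with the codec model and
+    # write small preview .mp4 audio files next to the .npz so clones can be auditioned quickly.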
+ reload = load_npz(filepath) # lazy
+ if "fine_prompt" in reload:
+ fine_prompt = reload["fine_prompt"]
+ if fine_prompt is not None and fine_prompt.shape[0] >= 8 and fine_prompt.shape[1] >= 1:
+ audio_arr = generation.codec_decode(fine_prompt)
+
+ base = os.path.basename(filepath)
+ filename, extension = os.path.splitext(base)
+ new_filepath = os.path.join(os.path.dirname(filepath), filename + "_f.mp4")
+ new_filepath = api.generate_unique_filepath(new_filepath)
+ api.write_audiofile(new_filepath, audio_arr, output_format="mp4")
+
+ else:
+ print(f"Fine prompt was invalid, skipping")
+ print(show_history_prompt_size(reload))
+ if "coarse_prompt" in reload:
+ coarse_prompt = reload["coarse_prompt"]
+ if (
+ coarse_prompt is not None
+ and coarse_prompt.ndim == 2
+ and coarse_prompt.shape[0] >= 2
+ and coarse_prompt.shape[1] >= 1
+ ):
+ audio_arr = generation.codec_decode(coarse_prompt)
+ base = os.path.basename(filepath)
+ filename, extension = os.path.splitext(base)
+ new_filepath = os.path.join(os.path.dirname(filepath), filename + "_co.mp4")
+ new_filepath = api.generate_unique_filepath(new_filepath)
+ api.write_audiofile(new_filepath, audio_arr, output_format="mp4")
+ else:
+ print(f"Coarse prompt was invalid, skipping")
+ print(show_history_prompt_size(reload))
+
+
+"""
+
+def load_hubert():
+ HuBERTManager.make_sure_hubert_installed()
+ HuBERTManager.make_sure_tokenizer_installed()
+ if 'hubert' not in huberts:
+ hubert_path = './bark_infinity/hubert/hubert.pt'
+ print('Loading HuBERT')
+ huberts['hubert'] = CustomHubert(hubert_path)
+ if 'tokenizer' not in huberts:
+ tokenizer_path = './bark_infinity/hubert/tokenizer.pth'
+ print('Loading Custom Tokenizer')
+ tokenizer = CustomTokenizer()
+ tokenizer.load_state_dict(torch.load(tokenizer_path)) # Load the model
+ huberts['tokenizer'] = tokenizer
+"""
+
+huberts = {}
+
+bark_cloning_large_model = True #
+
+
+def load_hubert(alt_model=None, force_reload=True):
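+    # Download the HuBERT model and quantizer tokenizer if needed (optionally an alternate
+    # language-specific quantizer), then cache them in the module-level `huberts` dict.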
+ hubert_path = HuBERTManager.make_sure_hubert_installed()
+ model = (
+ ("quantifier_V1_hubert_base_ls960_23.pth", "tokenizer_large.pth")
+ if bark_cloning_large_model
+ else ("quantifier_hubert_base_ls960_14.pth", "tokenizer.pth")
+ )
+ tokenizer_path = None
+ if alt_model is not None:
+ model = (alt_model["model"], alt_model["tokenizer_name"])
+ tokenizer_path = HuBERTManager.make_sure_tokenizer_installed(
+ model=model[0], local_file=model[1], repo=alt_model["repo"]
+ )
+ else:
+ tokenizer_path = HuBERTManager.make_sure_tokenizer_installed(
+ model=model[0], local_file=model[1]
+ )
+
+ if "hubert" not in huberts:
+ print(f"Loading HuBERT models {model} from {hubert_path}")
+ # huberts["hubert"] = CustomHubert(hubert_path)
+ huberts["hubert"] = CustomHubert(hubert_path, device=torch.device("cpu"))
+ if "tokenizer" not in huberts or force_reload:
+ # print('Loading Custom Tokenizer')
+ # print(f'Loading tokenizer from {tokenizer_path}')
+ tokenizer = CustomTokenizer.load_from_checkpoint(
+ tokenizer_path, map_location=torch.device("cpu")
+ )
+ huberts["tokenizer"] = tokenizer
+
+
+def generate_course_history(fine_history):
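+    # The coarse prompt is just the first two codebook rows of the fine token history.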
+ return fine_history[:2, :]
+
+
+# TODO don't hardcode GPU
+"""
+def generate_fine_from_wav(file):
+ model = load_codec_model(use_gpu=True) # Don't worry about reimporting, it stores the loaded model in a dict
+ wav, sr = torchaudio.load(file)
+ wav = convert_audio(wav, sr, SAMPLE_RATE, model.channels)
+ wav = wav.unsqueeze(0).to('cuda')
+ with torch.no_grad():
+ encoded_frames = model.encode(wav)
+ codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze()
+
+ codes = codes.cpu().numpy()
+
+ return codes
+"""
+clone_use_gpu = False
+
+
+def generate_fine_from_wav(file):
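+    # Encode the audio with the codec model (on CPU here) and return the token matrix
+    # (one row per codebook) to use as a fine prompt.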
+ # model = load_codec_model(use_gpu=not args.bark_use_cpu) # Don't worry about reimporting, it stores the loaded model in a dict
+ model = load_codec_model(
+ use_gpu=False
+ ) # Don't worry about reimporting, it stores the loaded model in a dict
+ wav, sr = torchaudio.load(file)
+ wav = convert_audio(wav, sr, SAMPLE_RATE, model.channels)
+ wav = wav.unsqueeze(0)
+ # if not (args.bark_cpu_offload or args.bark_use_cpu):
+ if False:
+ wav = wav.to("cuda")
+ with torch.no_grad():
+ encoded_frames = model.encode(wav)
+ codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze()
+
+ codes = codes.cpu().numpy()
+
+ return codes
+
+
+def wav_to_semantics(file, alt_model=None) -> torch.Tensor:
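+    # Load the audio, downmix stereo to mono, run HuBERT to get features, and quantize them
+    # into Bark semantic tokens with the custom tokenizer.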
+ # Vocab size is 10,000.
+
+ if alt_model is None:
+ load_hubert()
+ else:
+ load_hubert(alt_model=alt_model, force_reload=True)
+
+ # check file extension and set
+
+ # format = None
+ # audio_extension = os.path.splitext(file)[1]
+ # format = audio_extension
+
+ # print(f"Loading {file} as {format}")
+ wav, sr = torchaudio.load(file)
+
+ # wav, sr = torchaudio.load(file, format=f"{format}")
+
+ # sr, wav = wavfile.read(file)
+ # wav = torch.tensor(wav, dtype=torch.float32)
+
+ if wav.shape[0] == 2: # Stereo to mono if needed
+ wav = wav.mean(0, keepdim=True)
+
+ # Extract semantics in HuBERT style
+ # print('Extracting and Tokenizing Semantics')
+ print("Clones Inbound...")
+ semantics = huberts["hubert"].forward(wav, input_sample_hz=sr)
+ # print('Tokenizing...')
+ tokens = huberts["tokenizer"].get_token(semantics)
+ return tokens
+
+
+import copy
+from collections import Counter
+
+
+import time
+from contextlib import contextmanager
+
+
+def load_npz(filename):
+ npz_data = np.load(filename, allow_pickle=True)
+
+ data_dict = {
+ "semantic_prompt": npz_data["semantic_prompt"],
+ "coarse_prompt": npz_data["coarse_prompt"],
+ "fine_prompt": npz_data["fine_prompt"],
+ }
+
+ npz_data.close()
+
+ return data_dict
+
+
+def resize_history_prompt(history_prompt, tokens=128, from_front=False):
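+    # Trim a history prompt to at most `tokens` semantic tokens, cutting the coarse and fine
+    # arrays at the matching ratio, taken from either the front or the back of the clip.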
+ semantic_to_coarse_ratio = COARSE_RATE_HZ / SEMANTIC_RATE_HZ
+
+ semantic_prompt = history_prompt["semantic_prompt"]
+ coarse_prompt = history_prompt["coarse_prompt"]
+ fine_prompt = history_prompt["fine_prompt"]
+
+ new_semantic_len = min(tokens, len(semantic_prompt))
+ new_coarse_len = min(int(new_semantic_len * semantic_to_coarse_ratio), coarse_prompt.shape[1])
+
+ new_fine_len = new_coarse_len
+
+ if from_front:
+ new_semantic_prompt = semantic_prompt[:new_semantic_len]
+ new_coarse_prompt = coarse_prompt[:, :new_coarse_len]
+ new_fine_prompt = fine_prompt[:, :new_fine_len]
+ else:
+ new_semantic_prompt = semantic_prompt[-new_semantic_len:]
+ new_coarse_prompt = coarse_prompt[:, -new_coarse_len:]
+ new_fine_prompt = fine_prompt[:, -new_fine_len:]
+
+ return {
+ "semantic_prompt": new_semantic_prompt,
+ "coarse_prompt": new_coarse_prompt,
+ "fine_prompt": new_fine_prompt,
+ }
+
+
+def show_history_prompt_size(
+ history_prompt, token_samples=3, semantic_back_n=128, text="history_prompt"
+):
+ semantic_prompt = history_prompt["semantic_prompt"]
+ coarse_prompt = history_prompt["coarse_prompt"]
+ fine_prompt = history_prompt["fine_prompt"]
+
+ # compute the ratio for coarse and fine back_n
+ ratio = 75 / 49.9
+ coarse_and_fine_back_n = int(semantic_back_n * ratio)
+
+ def show_array_front_back(arr, n, back_n):
+ if n > 0:
+ front = arr[:n].tolist()
+ back = arr[-n:].tolist()
+
+ mid = []
+ if len(arr) > back_n + token_samples:
+ mid = arr[-back_n - token_samples : -back_n + token_samples].tolist()
+
+ if mid:
+ return f"{front} ... <{back_n} from end> {mid} ... {back}"
+ else:
+ return f"{front} ... {back}"
+ else:
+ return ""
+
+ def most_common_tokens(arr, n=3):
+ flattened = arr.flatten()
+ counter = Counter(flattened)
+ return counter.most_common(n)
+
+ print(f"\n{text}")
+ print(f" {text} semantic_prompt: {semantic_prompt.shape}")
+ print(f" Tokens: {show_array_front_back(semantic_prompt, token_samples, semantic_back_n)}")
+ print(f" Most common tokens: {most_common_tokens(semantic_prompt)}")
+
+ print(f" {text} coarse_prompt: {coarse_prompt.shape}")
+ for i, row in enumerate(coarse_prompt):
+ print(
+ f" Row {i} Tokens: {show_array_front_back(row, token_samples, coarse_and_fine_back_n)}"
+ )
+ print(f" Most common tokens in row {i}: {most_common_tokens(row)}")
+
+ print(f" {text} fine_prompt: {fine_prompt.shape}")
+ # for i, row in enumerate(fine_prompt):
+ # print(f" Row {i} Tokens: {show_array_front_back(row, token_samples, coarse_and_fine_back_n)}")
+ # print(f" Most common tokens in row {i}: {most_common_tokens(row)}")
+
+
+def split_array_equally(array, num_parts):
+ split_indices = np.linspace(0, len(array), num_parts + 1, dtype=int)
+ return [
+ array[split_indices[i] : split_indices[i + 1]].astype(np.int32) for i in range(num_parts)
+ ]
+
+
+@contextmanager
+def measure_time(text=None, index=None):
+ start_time = time.time()
+ yield
+ elapsed_time = time.time() - start_time
+ if index is not None and text is not None:
+ text = f"{text} {index}"
+ elif text is None:
+ text = "Operation"
+
+ time_finished = (
+ f"{text} Finished at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))}"
+ )
+ print(f" -->{time_finished} in {elapsed_time} seconds")
+
+
+def compare_history_prompts(hp1, hp2, text="history_prompt"):
+ print(f"\nComparing {text}")
+ for key in hp1.keys():
+ if hp1[key].shape != hp2[key].shape:
+ print(f" {key} arrays have different shapes: {hp1[key].shape} vs {hp2[key].shape}.")
+ min_size = min(hp1[key].shape[0], hp2[key].shape[0])
+
+ if hp1[key].ndim == 1:
+ hp1_part = hp1[key][-min_size:]
+ hp2_part = hp2[key][-min_size:]
+ else:
+ min_size = min(hp1[key].shape[1], hp2[key].shape[1])
+ hp1_part = hp1[key][:, -min_size:]
+ hp2_part = hp2[key][:, -min_size:]
+
+ print(f" Comparing the last {min_size} elements of each.")
+ else:
+ hp1_part = hp1[key]
+ hp2_part = hp2[key]
+
+ if np.array_equal(hp1_part, hp2_part):
+ print(f" {key} arrays are exactly the same.")
+ elif np.allclose(hp1_part, hp2_part):
+ diff = np.linalg.norm(hp1_part - hp2_part)
+ print(f" {key} arrays are almost equal with a norm of difference: {diff}")
+ else:
+ diff = np.linalg.norm(hp1_part - hp2_part)
+ print(f" {key} arrays are not equal. Norm of difference: {diff}")
+
+
+def split_by_words(text, word_group_size):
+ words = text.split()
+ result = []
+ group = ""
+
+ for i, word in enumerate(words):
+ group += word + " "
+
+ if (i + 1) % word_group_size == 0:
+ result.append(group.strip())
+ group = ""
+
+ # Add the last group if it's not empty
+ if group.strip():
+ result.append(group.strip())
+
+ return result
+
+
+def concat_history_prompts(history_prompt1, history_prompt2):
+ new_semantic_prompt = np.hstack(
+ [history_prompt1["semantic_prompt"], history_prompt2["semantic_prompt"]]
+ ).astype(
+ np.int32
+ ) # not int64?
+ new_coarse_prompt = np.hstack(
+ [history_prompt1["coarse_prompt"], history_prompt2["coarse_prompt"]]
+ ).astype(np.int32)
+ new_fine_prompt = np.hstack(
+ [history_prompt1["fine_prompt"], history_prompt2["fine_prompt"]]
+ ).astype(np.int32)
+
+ concatenated_history_prompt = {
+ "semantic_prompt": new_semantic_prompt,
+ "coarse_prompt": new_coarse_prompt,
+ "fine_prompt": new_fine_prompt,
+ }
+
+ return concatenated_history_prompt
+
+
+def merge_history_prompts(left_history_prompt, right_history_prompt, right_size=128):
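+    # Keep only the last `right_size` tokens of the right prompt, append them to the left
+    # prompt, then trim the combined history back down to 341 semantic tokens.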
+ right_history_prompt = resize_history_prompt(
+ right_history_prompt, tokens=right_size, from_front=False
+ )
+ combined_history_prompts = concat_history_prompts(left_history_prompt, right_history_prompt)
+ combined_history_prompts = resize_history_prompt(
+ combined_history_prompts, tokens=341, from_front=False
+ )
+ return combined_history_prompts
diff --git a/bark_infinity/config.py b/bark_infinity/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..696d905c000d2a1c81ab81740355ce2fac494724
--- /dev/null
+++ b/bark_infinity/config.py
@@ -0,0 +1,573 @@
+import logging
+from io import StringIO
+from rich.console import Console
+from rich.logging import RichHandler
+import os
+
+FORMAT = "%(funcName)s %(message)s"
+
+logging.basicConfig(
+ level=logging.WARNING,
+ format=FORMAT,
+ datefmt="[%X]",
+ handlers=[RichHandler(show_level=False, show_time=False)],
+)
+logger = logging.getLogger("bark-infinity")
+
+
+console_file = Console(file=StringIO())
+console = Console()
+
+CHOICES = {
+ "split_options": ["word", "line", "sentence", "char", "string", "random", "regex"],
+ "log_levels": ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
+ "output_formats": ["wav", "mp3", "ogg", "flac", "mp4"],
+}
+
+
+VALID_HISTORY_PROMPT_DIRS = [
+ os.path.join("bark", "assets", "prompts"),
+ os.path.join("bark_infinity", "assets", "prompts"),
+ "custom_speakers",
+]
+
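+# Each group below becomes an argparse argument group; every entry is a
+# (name, {"value": default, "type": type, "help": text, "choices": optional list}) pair.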
+DEFAULTS = {
+ "input": [
+ (
+ "text_prompt",
+ {"value": None, "type": str, "help": "Text prompt to generate audio from."},
+ ),
+ ("list_speakers", {"value": None, "type": bool, "help": "List available speakers."}),
+ (
+ "dry_run",
+ {
+ "value": False,
+ "type": bool,
+ "help": "Don't generate audio, but show output like you would have. Useful for testing.",
+ },
+ ),
+ (
+ "text_splits_only",
+ {
+ "value": False,
+ "type": bool,
+ "help": "Just show how the text will be split into each segment.",
+ },
+ ),
+ (
+ "history_prompt",
+ {"value": None, "type": str, "help": "Text prompt to generate audio from."},
+ ),
+ (
+ "audio_file_as_history_prompt",
+ {
+ "value": None,
+ "type": str,
+ "help": "Use an audio file as the history prompt. Do a quick clone, then proceed normally.",
+ },
+ ),
+ (
+ "prompt_file",
+ {"value": None, "type": str, "help": "Text prompt to generate audio from."},
+ ),
+ (
+ "split_input_into_separate_prompts_by",
+ {
+ "value": None,
+ "type": str,
+ "help": "Split input into separate prompts, each with it's own wav file.",
+ "choices": CHOICES["split_options"],
+ },
+ ),
+ (
+ "split_input_into_separate_prompts_by_value",
+ {
+ "value": None,
+ "type": str,
+ "help": "The number of words, lines, sentences, rhymes, alliterations, or the value of the specific string to split your text-file prompts by. Much like in_groups_of_size is in prompts.",
+ },
+ ),
+ (
+ "bark_speaker_as_the_prompt",
+ {"value": None, "type": str, "help": "Bark Speaker As Prop."},
+ ),
+ ],
+ "output": [
+ (
+ "always_save_speaker",
+ {
+ "value": True,
+ "type": bool,
+ "help": "Save the speaker.npz files for every generated audio clip. Even history prompts, because the voice will be slightly different after the generation if you save it again.",
+ },
+ ),
+ (
+ "output_iterations",
+ {"value": 1, "type": int, "help": "Number of audio clips to generate per prompt."},
+ ),
+ (
+ "output_filename",
+ {
+ "value": None,
+ "type": str,
+ "help": "Output filename. If not provided, a unique filename will be generated based on the text prompt and other parameters.",
+ },
+ ),
+ ("output_dir", {"value": "bark_samples", "type": str, "help": "Output directory."}),
+ (
+ "hoarder_mode",
+ {
+ "value": False,
+ "type": bool,
+ "help": "Who wants to make a cool audio clip and not able to reproduce it in the future? Save it all! Creates a sub directory for each clip that is more than one segment long, because it's kind of a lot.",
+ },
+ ),
+ ("extra_stats", {"value": False, "type": bool, "help": "Extra stats in the filename."}),
+ (
+ "show_generation_times",
+ {
+ "value": False,
+ "type": bool,
+ "help": "Output how long each sample took to generate, good for benchmarking.",
+ },
+ ),
+ (
+ "output_format",
+ {
+ "value": "mp3",
+ "type": str,
+ "help": "(Output format. You can always re-render the uncompressed wav later if you save the speaker.npz files.)",
+ "choices": CHOICES["output_formats"],
+ },
+ ),
+ (
+ "output_format_ffmpeg_parameters",
+ {
+ "value": None,
+ "type": str,
+ "help": 'Custom ffmpeg parameters: Separate parameter name and value by QQQQQ. \
+ Any arguments supported by ffmpeg can be passed as a list. Note that no validation \
+ takes place on these parameters, and you may be limited by what your particular \
+                build of ffmpeg supports. (Why QQQQQ? Sick of punctuation-related bugs.) Example: "-volQQQQQ150QQQQQ-q:aQQQQQ0"',
+ },
+ ),
+ ],
+ "model": [
+ ("text_use_gpu", {"value": True, "type": bool, "help": "Load the text model on the GPU."}),
+ (
+ "text_use_small",
+ {"value": False, "type": bool, "help": "Use a smaller/faster text model."},
+ ),
+ (
+ "coarse_use_gpu",
+ {"value": True, "type": bool, "help": "Load the coarse model on the GPU."},
+ ),
+ (
+ "coarse_use_small",
+ {"value": False, "type": bool, "help": "Use a smaller/faster coarse model."},
+ ),
+ ("fine_use_gpu", {"value": True, "type": bool, "help": "Load the fine model on the GPU."}),
+ (
+ "fine_use_small",
+ {"value": False, "type": bool, "help": "Use a smaller/faster fine model."},
+ ),
+ (
+ "codec_use_gpu",
+ {"value": True, "type": bool, "help": "Load the codec model on the GPU."},
+ ),
+ (
+ "force_reload",
+ {"value": False, "type": bool, "help": "Force the models to be downloaded again."},
+ ),
+ (
+ "GLOBAL_ENABLE_MPS",
+ {"value": None, "type": bool, "help": "Apple M1 Hardware Acceleration."},
+ ),
+ ("USE_SMALL_MODELS", {"value": None, "type": bool, "help": "Set OS env for small models."}),
+ (
+ "SUNO_USE_DIRECTML",
+ {"value": False, "type": bool, "help": "Experimental AMD DirectML Bark support."},
+ ),
+ (
+ "OFFLOAD_CPU",
+ {
+ "value": None,
+ "type": bool,
+ "help": "Offload models when not in use, saves a ton of GPU memory and almost as fast.",
+ },
+ ),
+ ],
+ "bark_model_parameters": [
+ ("text_temp", {"value": 0.7, "type": float, "help": "Text temperature. "}),
+ ("waveform_temp", {"value": 0.5, "type": float, "help": "Waveform temperature."}),
+ ("confused_travolta_mode", {"value": False, "type": bool, "help": "Just for fun. Mostly."}),
+ ("silent", {"value": False, "type": bool, "help": "Disable progress bar."}),
+ (
+ "seed",
+ {
+ "value": None,
+ "type": int,
+ "help": "Random seed for a single clip of audio. This sets the seed one time before all three models, but if you have multiple clips, it sets the same seed for every segment. You probably want to use --single_starting_seed instead in most cases.",
+ },
+ ),
+ ],
+ # todo split by one of the options, count by the other. splitting by phrase, and counting by word, is probably pretty good.
+ "generating_long_clips": [
+ (
+ "stable_mode_interval",
+ {
+ "value": 1,
+ "type": int,
+ "help": "Optional. stable_mode_interval set to 1 means every 14s clip uses the original speaker .npz file, or the first 14s clip of a random voice. 0 means the previous file is continues. 3 means the speaker history is carried forward 3 times, and then reset back to the original. Not needed at all for short clips. ",
+ },
+ ),
+ (
+ "single_starting_seed",
+ {
+ "value": None,
+ "type": int,
+ "help": "Random seed that it just set once at the start. This is probably the seed you want.",
+ },
+ ),
+ (
+ "split_character_goal_length",
+ {
+ "value": 125,
+ "type": int,
+ "help": "Split your text_prompt into < 14s chunks of about many characters, general splitter.",
+ },
+ ),
+ (
+ "split_character_max_length",
+ {
+ "value": 175,
+ "type": int,
+ "help": "Split your text_prompt into < 14s, ceiling value.",
+ },
+ ),
+ (
+ "split_character_jitter",
+ {
+ "value": 0,
+ "type": int,
+ "help": "Add or subtract the split_character values by the jitter value every iteration. Useful for running a lot of samples to get some variety.",
+ },
+ ),
+ (
+ "add_silence_between_segments",
+ {
+ "value": 0.0,
+ "type": float,
+ "help": "Add a bit of silence between joined audio segments. Works good if you splitting your text on complete sentences or phrases, or if you are using the same prompt every segment (stable_mode_interval = 1). If you are using stable_mode_interval = 0 it might be worse.",
+ },
+ ),
+ (
+ "process_text_by_each",
+ {
+ "value": None,
+ "type": str,
+ "help": "Bark only generates 14s at a time, so the text_prompt needs to be split into chunks smaller than that.",
+ "choices": CHOICES["split_options"],
+ },
+ ),
+ (
+ "group_text_by_counting",
+ {
+ "value": None,
+ "type": str,
+ "help": "Bark only generates 14s at a time, so the text_prompt needs to be split into chunks smaller than that.",
+ "choices": CHOICES["split_options"],
+ },
+ ),
+ (
+ "in_groups_of_size",
+ {
+ "value": None,
+ "type": int,
+ "help": "Bark only generates 14s at a time, so the text_prompt needs to be split into chunks smaller than that.",
+ },
+ ),
+ (
+ "split_type_string",
+ {
+ "value": None,
+ "type": str,
+ "help": "Bark only generates 14s at a time, so the text_prompt needs to be split into chunks smaller than that.",
+ },
+ ),
+ (
+ "prompt_text_prefix",
+ {
+ "value": None,
+ "type": str,
+ "help": "Put this text string in front of every text prompt, after splitting.",
+ },
+ ),
+ (
+ "prompt_text_suffix",
+ {
+ "value": None,
+ "type": str,
+ "help": "Put this text string after every text prompt, after splitting.",
+ },
+ ),
+ (
+ "extra_confused_travolta_mode",
+ {
+ "value": None,
+ "type": int,
+ "help": "Like the name says... 1 for more, 2 for way more, the level of confusion now goes to infinity.",
+ },
+ ),
+ (
+ "separate_prompts",
+ {
+ "value": False,
+ "type": bool,
+ "help": "Split text, but into completely separate prompts. Great for generating a bunch of different samples from a single text file to explore the space of possibilities.",
+ },
+ ),
+ ],
+ "convenience": [
+ (
+ "use_smaller_models",
+ {
+ "value": False,
+ "type": bool,
+ "help": "Use all small models. Overrides --text_use_small, --coarse_use_small, --fine_use_small. You can probably use big models just fine by default in the latest version though!",
+ },
+ ),
+ ],
+ "advanced": [
+ (
+ "detailed_gpu_report",
+ {"value": False, "type": bool, "help": "Show detailed GPU details on startup."},
+ ),
+ (
+ "detailed_cuda_report",
+ {"value": False, "type": bool, "help": "Show detailed CUDA details on startup."},
+ ),
+ (
+ "detailed_hugging_face_cache_report",
+ {"value": False, "type": bool, "help": "Show detailed GPU details on startup."},
+ ),
+ (
+ "detailed_numpy_report",
+ {"value": False, "type": bool, "help": "Show details on Numpy and MKL config."},
+ ),
+ (
+ "run_numpy_benchmark",
+ {"value": False, "type": bool, "help": "Run CPU benchmark for Numpy and MKL."},
+ ),
+ (
+ "show_all_reports",
+ {"value": False, "type": bool, "help": "Show all reports on startup."},
+ ),
+ (
+ "semantic_temp",
+ {"value": 0.7, "type": float, "help": "Temperature for semantic function."},
+ ),
+ ("semantic_top_k", {"value": None, "type": int, "help": "Top K for semantic function."}),
+ ("semantic_top_p", {"value": None, "type": float, "help": "Top P for semantic function."}),
+ (
+ "semantic_min_eos_p",
+ {"value": 0.2, "type": float, "help": "Minimum EOS probability for semantic function."},
+ ),
+ (
+ "semantic_max_gen_duration_s",
+ {
+ "value": None,
+ "type": float,
+ "help": "Maximum generation duration for semantic function. ",
+ },
+ ),
+ (
+ "semantic_allow_early_stop",
+ {"value": True, "type": bool, "help": "The secret behind Confused Travolta Mode."},
+ ),
+ (
+ "semantic_use_kv_caching",
+ {
+ "value": True,
+ "type": bool,
+ "help": "Use key-value caching. Probably faster with no quality loss.",
+ },
+ ),
+ ("semantic_seed", {"value": None, "type": int, "help": "Lock semantic seed"}),
+ (
+ "semantic_history_oversize_limit",
+ {
+ "value": None,
+ "type": int,
+ "help": "Maximum size of semantic history, hardcoded to 256. Increasing seems terrible but decreasing it may be useful to lower the value and get variations on existing speakers, or try to fine-tune a bit.",
+ },
+ ),
+ ("coarse_temp", {"value": 0.7, "type": float, "help": "Temperature for fine function."}),
+ ("coarse_top_k", {"value": None, "type": int, "help": "Top K for coarse function. "}),
+ ("coarse_top_p", {"value": None, "type": float, "help": "Top P for coarse function. "}),
+ (
+ "coarse_max_coarse_history",
+ {"value": 630, "type": int, "help": "Maximum coarse history for coarse function."},
+ ),
+ (
+ "coarse_sliding_window_len",
+ {"value": 60, "type": int, "help": "Sliding window length for coarse function."},
+ ),
+ (
+ "coarse_kv_caching",
+ {
+ "value": True,
+ "type": bool,
+ "help": "Use key-value caching. Probably faster with no quality loss.",
+ },
+ ),
+ ("coarse_seed", {"value": None, "type": int, "help": "Lock coarse seed"}),
+ (
+ "x_coarse_history_alignment_hack",
+ {
+ "value": -2,
+ "type": int,
+ "help": "Can try up or down a few notches to see if your audio align better",
+ },
+ ),
+ ("fine_temp", {"value": 0.5, "type": float, "help": "Temperature for fine function."}),
+ ("fine_seed", {"value": None, "type": int, "help": "Lock fine seed"}),
+ (
+ "render_npz_samples",
+ {
+ "value": False,
+ "type": bool,
+ "help": "Give this a directory of .npz files and it generates sample audio clips from them.",
+ },
+ ),
+ (
+ "loglevel",
+ {
+ "value": "WARNING",
+ "type": str,
+ "help": "Logging level. Choices are DEBUG, INFO, WARNING, ERROR, CRITICAL.",
+ "choices": CHOICES["log_levels"],
+ },
+ ),
+ (
+ "absolute_semantic_history_only",
+ {
+ "value": False,
+ "type": bool,
+ "help": "Only use semantic history in generation. Generates voices that are based on original speaker, but different.",
+ },
+ ),
+ (
+ "absolute_semantic_history_only_every_x",
+ {
+ "value": None,
+ "type": int,
+ "help": "Only use semantic history in generation every X segments. Generates voices that are based on original speaker, but different.",
+ },
+ ),
+ ],
+}
+
+
+def _cast_bool_env_var(s):
+ return s.lower() in ("true", "1", "t")
+
+
+def get_default_values(group_name):
+ if group_name in DEFAULTS:
+ return {key: value["value"] for key, value in DEFAULTS[group_name]}
+ return {}
+
+
+def load_all_defaults(**kwargs):
+ for group_name in DEFAULTS:
+ default_values = get_default_values(group_name)
+ for key, value in default_values.items():
+ if key not in kwargs:
+ kwargs[key] = value
+ return kwargs
+
+
+import argparse
+from rich_argparse import RichHelpFormatter
+
+
+def create_argument_parser():
+ parser = argparse.ArgumentParser(
+ description="""
+ Bark is a text-to-speech tool that uses machine learning to synthesize speech from text and other audio sources
+ """,
+ formatter_class=RichHelpFormatter,
+ )
+
+ help_tags = {
+ "input": "Input settings",
+ "output": "Output settings",
+ "model": "Model settings",
+ "bark_model_parameters": "Bark model parameters",
+ "generating_long_clips": "Generating long clips",
+ "convenience": "Convenience options",
+ "cloning": "Voice cloning options",
+ "advanced": "Advanced options",
+ }
+
+ for group_name, arguments in DEFAULTS.items():
+ group = parser.add_argument_group(group_name, help_tags.get(group_name, ""))
+ add_arguments_to_group(group, arguments)
+
+ return parser
+
+
+class StringToBoolAction(argparse.Action):
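+    # Parse explicit "True"/"False" strings for boolean flags so that flags left unset stay
+    # None and can later be filled in from DEFAULTS.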
+ def __call__(self, parser, namespace, values, option_string=None):
+ if isinstance(values, str):
+ if values.lower() == "true":
+ setattr(namespace, self.dest, True)
+ elif values.lower() == "false":
+ setattr(namespace, self.dest, False)
+ else:
+ parser.error(f"{option_string} should be True or False")
+ else:
+ setattr(namespace, self.dest, values)
+
+
+def add_arguments_to_group(group, arguments, help_tag=""):
+ # print(arguments)
+ group.help = help_tag
+ for key, arg in arguments: # Changed this line
+ help_text = f"{arg['help']} Default: {arg['value']}"
+ if "choices" in arg:
+ help_text += f" Choices: {', '.join(map(str, arg['choices']))}"
+
+ if arg["type"] == bool:
+ group.add_argument(f"--{key}", action=StringToBoolAction, help=help_text)
+ else:
+ group.add_argument(
+ f"--{key}", type=arg["type"], help=help_text, choices=arg.get("choices")
+ )
+
+
+def update_group_args_with_defaults(args):
+ updated_args = {}
+ for group_name, arguments in DEFAULTS.items():
+ for key, value in arguments:
+ if getattr(args, key) is None:
+ updated_args[key] = value["value"]
+ # print(f" IS NONE Using {key} = {updated_args[key]}")
+ else:
+ updated_args[key] = getattr(args, key)
+
+ # print(f"Using {key} = {updated_args[key]}")
+ return updated_args
+
+
+def update_group_args_with_defaults_what(args):
+ updated_args = {}
+ for group_name in DEFAULTS:
+ default_values = get_default_values(group_name)
+ for key, value in default_values.items():
+            if key not in args:
+                updated_args[key] = value
+            else:
+                updated_args[key] = getattr(args, key)
+
+ return updated_args
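+
+
+# Example CLI flow (illustrative sketch; create_argument_parser and
+# update_group_args_with_defaults are defined above, the flag values are made up):
+#
+#     parser = create_argument_parser()
+#     args = parser.parse_args(["--fine_temp", "0.6", "--loglevel", "INFO"])
+#     kwargs = update_group_args_with_defaults(args)
+#     # kwargs now holds every key in DEFAULTS, with the CLI values overriding the defaults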
diff --git a/bark_infinity/config_dev.py b/bark_infinity/config_dev.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd9c210946a975b1b677c27cbb5e92f4ed09d550
--- /dev/null
+++ b/bark_infinity/config_dev.py
@@ -0,0 +1,563 @@
+import logging
+from io import StringIO
+from rich.console import Console
+from rich.logging import RichHandler
+import os
+
+FORMAT = "%(funcName)s %(message)s"
+
+logging.basicConfig(
+ level=logging.WARNING,
+ format=FORMAT,
+ datefmt="[%X]",
+ handlers=[RichHandler(show_level=False, show_time=False)],
+)
+logger = logging.getLogger("bark-infinity")
+
+
+console_file = Console(file=StringIO())
+console = Console()
+
+CHOICES = {
+ "split_options": ["word", "line", "sentence", "char", "string", "random", "regex"],
+ "log_levels": ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
+ "output_formats": ["wav", "mp3", "ogg", "flac", "mp4"],
+}
+
+
+VALID_HISTORY_PROMPT_DIRS = [
+ os.path.join("bark", "assets", "prompts"),
+ os.path.join("bark_infinity", "assets", "prompts"),
+ "custom_speakers",
+]
+
+DEFAULTS = {
+ "input": [
+ (
+ "text_prompt",
+ {"value": None, "type": str, "help": "Text prompt to generate audio from."},
+ ),
+ ("list_speakers", {"value": None, "type": bool, "help": "List available speakers."}),
+ (
+ "dry_run",
+ {
+ "value": False,
+ "type": bool,
+ "help": "Don't generate audio, but show output like you would have. Useful for testing.",
+ },
+ ),
+ (
+ "text_splits_only",
+ {
+ "value": False,
+ "type": bool,
+ "help": "Just show how the text will be split into each segment.",
+ },
+ ),
+ (
+ "history_prompt",
+ {"value": None, "type": str, "help": "Text prompt to generate audio from."},
+ ),
+ (
+ "prompt_file",
+ {"value": None, "type": str, "help": "Text prompt to generate audio from."},
+ ),
+ (
+ "split_input_into_separate_prompts_by",
+ {
+ "value": None,
+ "type": str,
+ "help": "Split input into separate prompts, each with it's own wav file.",
+ "choices": CHOICES["split_options"],
+ },
+ ),
+ (
+ "split_input_into_separate_prompts_by_value",
+ {
+ "value": None,
+ "type": str,
+ "help": "The number of words, lines, sentences, rhymes, alliterations, or the value of the specific string to split your text-file prompts by. Much like in_groups_of_size is in prompts.",
+ },
+ ),
+ (
+ "bark_speaker_as_the_prompt",
+ {"value": None, "type": str, "help": "Bark Speaker As Prop."},
+ ),
+ ],
+ "output": [
+ (
+ "always_save_speaker",
+ {
+ "value": True,
+ "type": bool,
+ "help": "Save the speaker.npz files for every generated audio clip. Even history prompts, because the voice will be slightly different after the generation if you save it again.",
+ },
+ ),
+ (
+ "output_iterations",
+ {"value": 1, "type": int, "help": "Number of audio clips to generate per prompt."},
+ ),
+ (
+ "output_filename",
+ {
+ "value": None,
+ "type": str,
+ "help": "Output filename. If not provided, a unique filename will be generated based on the text prompt and other parameters.",
+ },
+ ),
+ ("output_dir", {"value": "bark_samples/", "type": str, "help": "Output directory."}),
+ (
+ "hoarder_mode",
+ {
+ "value": False,
+ "type": bool,
+ "help": "Who wants to make a cool audio clip and not able to reproduce it in the future? Save it all! Creates a sub directory for each clip that is more than one segment long, because it's kind of a lot.",
+ },
+ ),
+ ("extra_stats", {"value": False, "type": bool, "help": "Extra stats in the filename."}),
+ (
+ "show_generation_times",
+ {
+ "value": False,
+ "type": bool,
+ "help": "Output how long each sample took to generate, good for benchmarking.",
+ },
+ ),
+ (
+ "output_format",
+ {
+ "value": "mp4",
+ "type": str,
+ "help": "(Output format. You can always re-render the uncompressed wav later if you save the speaker.npz files.)",
+ "choices": CHOICES["output_formats"],
+ },
+ ),
+ (
+ "output_format_ffmpeg_parameters",
+ {
+ "value": None,
+ "type": str,
+ "help": 'Custom ffmpeg parameters: Separate parameter name and value by QQQQQ. \
+ Any arguments supported by ffmpeg can be passed as a list. Note that no validation \
+ takes place on these parameters, and you may be limited by what your particular \
+                    build of ffmpeg supports. (Why QQQQQ? Sick of punctuation-related bugs.) Example: "-volQQQQQ150QQQQQ-q:aQQQQQ0"',
+ },
+ ),
+ ],
+ "model": [
+ ("text_use_gpu", {"value": True, "type": bool, "help": "Load the text model on the GPU."}),
+ (
+ "text_use_small",
+ {"value": False, "type": bool, "help": "Use a smaller/faster text model."},
+ ),
+ (
+ "coarse_use_gpu",
+ {"value": True, "type": bool, "help": "Load the coarse model on the GPU."},
+ ),
+ (
+ "coarse_use_small",
+ {"value": False, "type": bool, "help": "Use a smaller/faster coarse model."},
+ ),
+ ("fine_use_gpu", {"value": True, "type": bool, "help": "Load the fine model on the GPU."}),
+ (
+ "fine_use_small",
+ {"value": False, "type": bool, "help": "Use a smaller/faster fine model."},
+ ),
+ (
+ "codec_use_gpu",
+ {"value": True, "type": bool, "help": "Load the codec model on the GPU."},
+ ),
+ (
+ "force_reload",
+ {"value": False, "type": bool, "help": "Force the models to be downloaded again."},
+ ),
+ (
+ "GLOBAL_ENABLE_MPS",
+ {"value": None, "type": bool, "help": "Apple M1 Hardware Acceleration."},
+ ),
+ ("USE_SMALL_MODELS", {"value": None, "type": bool, "help": "Set OS env for small models."}),
+ (
+ "SUNO_USE_DIRECTML",
+ {"value": False, "type": bool, "help": "Experimental AMD DirectML Bark support."},
+ ),
+ (
+ "OFFLOAD_CPU",
+ {
+ "value": None,
+ "type": bool,
+ "help": "Offload models when not in use, saves a ton of GPU memory and almost as fast.",
+ },
+ ),
+ ],
+ "bark_model_parameters": [
+ ("text_temp", {"value": 0.7, "type": float, "help": "Text temperature. "}),
+ ("waveform_temp", {"value": 0.7, "type": float, "help": "Waveform temperature."}),
+ ("confused_travolta_mode", {"value": False, "type": bool, "help": "Just for fun. Mostly."}),
+ ("silent", {"value": False, "type": bool, "help": "Disable progress bar."}),
+ (
+ "seed",
+ {
+ "value": None,
+ "type": int,
+ "help": "Random seed for a single clip of audio. This sets the seed one time before all three models, but if you have multiple clips, it sets the same seed for every segment. You probably want to use --single_starting_seed instead in most cases.",
+ },
+ ),
+ ],
+ # todo split by one of the options, count by the other. splitting by phrase, and counting by word, is probably pretty good.
+ "generating_long_clips": [
+ (
+ "stable_mode_interval",
+ {
+ "value": 1,
+ "type": int,
+ "help": "Optional. stable_mode_interval set to 1 means every 14s clip uses the original speaker .npz file, or the first 14s clip of a random voice. 0 means the previous file is continues. 3 means the speaker history is carried forward 3 times, and then reset back to the original. Not needed at all for short clips. ",
+ },
+ ),
+ (
+ "single_starting_seed",
+ {
+ "value": None,
+ "type": int,
+ "help": "Random seed that it just set once at the start. This is probably the seed you want.",
+ },
+ ),
+ (
+ "split_character_goal_length",
+ {
+ "value": 125,
+ "type": int,
+ "help": "Split your text_prompt into < 14s chunks of about many characters, general splitter.",
+ },
+ ),
+ (
+ "split_character_max_length",
+ {
+ "value": 175,
+ "type": int,
+ "help": "Split your text_prompt into < 14s, ceiling value.",
+ },
+ ),
+ (
+ "split_character_jitter",
+ {
+ "value": 0,
+ "type": int,
+ "help": "Add or subtract the split_character values by the jitter value every iteration. Useful for running a lot of samples to get some variety.",
+ },
+ ),
+ (
+ "add_silence_between_segments",
+ {
+ "value": 0.0,
+ "type": float,
+ "help": "Add a bit of silence between joined audio segments. Works good if you splitting your text on complete sentences or phrases, or if you are using the same prompt every segment (stable_mode_interval = 1). If you are using stable_mode_interval = 0 it might be worse.",
+ },
+ ),
+ (
+ "process_text_by_each",
+ {
+ "value": None,
+ "type": str,
+ "help": "Bark only generates 14s at a time, so the text_prompt needs to be split into chunks smaller than that.",
+ "choices": CHOICES["split_options"],
+ },
+ ),
+ (
+ "group_text_by_counting",
+ {
+ "value": None,
+ "type": str,
+ "help": "Bark only generates 14s at a time, so the text_prompt needs to be split into chunks smaller than that.",
+ "choices": CHOICES["split_options"],
+ },
+ ),
+ (
+ "in_groups_of_size",
+ {
+ "value": None,
+ "type": int,
+ "help": "Bark only generates 14s at a time, so the text_prompt needs to be split into chunks smaller than that.",
+ },
+ ),
+ (
+ "split_type_string",
+ {
+ "value": None,
+ "type": str,
+ "help": "Bark only generates 14s at a time, so the text_prompt needs to be split into chunks smaller than that.",
+ },
+ ),
+ (
+ "prompt_text_prefix",
+ {
+ "value": None,
+ "type": str,
+ "help": "Put this text string in front of every text prompt, after splitting.",
+ },
+ ),
+ (
+ "prompt_text_suffix",
+ {
+ "value": None,
+ "type": str,
+ "help": "Put this text string after every text prompt, after splitting.",
+ },
+ ),
+ (
+ "extra_confused_travolta_mode",
+ {
+ "value": None,
+ "type": int,
+ "help": "Like the name says... 1 for more, 2 for way more, the level of confusion now goes to infinity.",
+ },
+ ),
+ (
+ "separate_prompts",
+ {
+ "value": False,
+ "type": bool,
+ "help": "Split text, but into completely separate prompts. Great for generating a bunch of different samples from a single text file to explore the space of possibilities.",
+ },
+ ),
+ ("semantic_history_only", {"value": False, "type": bool, "help": ""}),
+ ("absolute_semantic_history_only", {"value": False, "type": bool, "help": ""}),
+ ("absolute_semantic_history_only_every_x", {"value": None, "type": int, "help": ""}),
+ ("semantic_history_starting_weight", {"value": 1.0, "type": float, "help": ""}),
+ ("semantic_history_future_weight", {"value": 1.0, "type": float, "help": ""}),
+ ("semantic_prev_segment_weight", {"value": 0.5, "type": float, "help": ""}),
+ ("coarse_history_starting_weight", {"value": 1.0, "type": float, "help": ""}),
+ ("coarse_history_future_weight", {"value": 0.5, "type": float, "help": ""}),
+ ("coarse_prev_segment_weight", {"value": 0.5, "type": float, "help": ""}),
+ ("fine_history_starting_weight", {"value": 1.0, "type": float, "help": ""}),
+ ("fine_history_future_weight", {"value": 0.0, "type": float, "help": ""}),
+ ("fine_prev_segment_weight", {"value": 0.0, "type": float, "help": ""}),
+ (
+ "custom_audio_processing_function",
+ {
+ "value": None,
+ "type": int,
+ "help": "Specify a python function callback which determines when and how much of the speaker context to keep or remove or reset. (Not in this version.)",
+ },
+ ),
+ ],
+ "convenience": [
+ (
+ "use_smaller_models",
+ {
+ "value": False,
+ "type": bool,
+ "help": "Use all small models. Overrides --text_use_small, --coarse_use_small, --fine_use_small. You can probably use big models just fine by default in the latest version though!",
+ },
+ ),
+ ],
+ "cloning": [
+ (
+ "bark_cloning_large_model",
+ {"value": True, "type": bool, "help": "Use larger model for cloning."},
+ ),
+ ],
+ "advanced": [
+ (
+ "detailed_gpu_report",
+ {"value": False, "type": bool, "help": "Show detailed GPU details on startup."},
+ ),
+ (
+ "detailed_cuda_report",
+ {"value": False, "type": bool, "help": "Show detailed CUDA details on startup."},
+ ),
+ (
+ "detailed_hugging_face_cache_report",
+ {"value": False, "type": bool, "help": "Show detailed GPU details on startup."},
+ ),
+ (
+ "semantic_temp",
+ {"value": 0.7, "type": float, "help": "Temperature for semantic function."},
+ ),
+ ("semantic_top_k", {"value": None, "type": int, "help": "Top K for semantic function."}),
+ ("semantic_top_p", {"value": None, "type": float, "help": "Top P for semantic function."}),
+ (
+ "semantic_min_eos_p",
+ {"value": 0.2, "type": float, "help": "Minimum EOS probability for semantic function."},
+ ),
+ (
+ "semantic_max_gen_duration_s",
+ {
+ "value": None,
+ "type": float,
+ "help": "Maximum generation duration for semantic function. ",
+ },
+ ),
+ (
+ "semantic_allow_early_stop",
+ {"value": True, "type": bool, "help": "The secret behind Confused Travolta Mode."},
+ ),
+ (
+ "semantic_use_kv_caching",
+ {
+ "value": True,
+ "type": bool,
+ "help": "Use key-value caching. Probably faster with no quality loss.",
+ },
+ ),
+ ("semantic_seed", {"value": None, "type": int, "help": "Lock semantic seed"}),
+ (
+ "semantic_history_oversize_limit",
+ {
+ "value": None,
+ "type": int,
+ "help": "Maximum size of semantic history, hardcoded to 256. Increasing seems terrible but descreasing it may be useful to lower the value and get variations on existing speakers, or try to fine-tune a bit.",
+ },
+ ),
+ ("coarse_temp", {"value": 0.7, "type": float, "help": "Temperature for fine function."}),
+ ("coarse_top_k", {"value": None, "type": int, "help": "Top K for coarse function. "}),
+ ("coarse_top_p", {"value": None, "type": float, "help": "Top P for coarse function. "}),
+ (
+ "coarse_max_coarse_history",
+ {"value": 630, "type": int, "help": "Maximum coarse history for coarse function."},
+ ),
+ (
+ "coarse_sliding_window_len",
+ {"value": 60, "type": int, "help": "Sliding window length for coarse function."},
+ ),
+ (
+ "coarse_kv_caching",
+ {
+ "value": True,
+ "type": bool,
+ "help": "Use key-value caching. Probably faster with no quality loss.",
+ },
+ ),
+ ("coarse_seed", {"value": None, "type": int, "help": "Lock coarse seed"}),
+ (
+ "x_coarse_history_alignment_hack",
+ {
+ "value": -2,
+ "type": int,
+ "help": "Can try up or down a few notches to see if your audio align better",
+ },
+ ),
+ ("fine_temp", {"value": 0.5, "type": float, "help": "Temperature for fine function."}),
+ ("fine_seed", {"value": None, "type": int, "help": "Lock fine seed"}),
+ (
+ "render_npz_samples",
+ {
+ "value": False,
+ "type": bool,
+ "help": "Give this a directory of .npz files and it generaates sample audio clips from them.",
+ },
+ ),
+ (
+ "loglevel",
+ {
+ "value": "WARNING",
+ "type": str,
+ "help": "Logging level. Choices are DEBUG, INFO, WARNING, ERROR, CRITICAL.",
+ "choices": CHOICES["log_levels"],
+ },
+ ),
+ ],
+}
+
+
+def _cast_bool_env_var(s):
+ return s.lower() in ("true", "1", "t")
+
+
+def get_default_values(group_name):
+ if group_name in DEFAULTS:
+ return {key: value["value"] for key, value in DEFAULTS[group_name]}
+ return {}
+
+
+def load_all_defaults(**kwargs):
+ for group_name in DEFAULTS:
+ default_values = get_default_values(group_name)
+ for key, value in default_values.items():
+ if key not in kwargs:
+ kwargs[key] = value
+ return kwargs
+
+
+import argparse
+from rich_argparse import RichHelpFormatter
+
+
+def create_argument_parser():
+ parser = argparse.ArgumentParser(
+ description="""
+ Bark is a text-to-speech tool that uses machine learning to synthesize speech from text and other audio sources
+ """,
+ formatter_class=RichHelpFormatter,
+ )
+
+ help_tags = {
+ "input": "Input settings",
+ "output": "Output settings",
+ "model": "Model settings",
+ "bark_model_parameters": "Bark model parameters",
+ "generating_long_clips": "Generating long clips",
+ "convenience": "Convenience options",
+ "cloning": "Voice cloning options",
+ "advanced": "Advanced options",
+ }
+
+ for group_name, arguments in DEFAULTS.items():
+ group = parser.add_argument_group(group_name, help_tags.get(group_name, ""))
+ add_arguments_to_group(group, arguments)
+
+ return parser
+
+
+class StringToBoolAction(argparse.Action):
+ def __call__(self, parser, namespace, values, option_string=None):
+ if isinstance(values, str):
+ if values.lower() == "true":
+ setattr(namespace, self.dest, True)
+ elif values.lower() == "false":
+ setattr(namespace, self.dest, False)
+ else:
+ parser.error(f"{option_string} should be True or False")
+ else:
+ setattr(namespace, self.dest, values)
+
+
+def add_arguments_to_group(group, arguments, help_tag=""):
+ # print(arguments)
+ group.help = help_tag
+    for key, arg in arguments:
+ help_text = f"{arg['help']} Default: {arg['value']}"
+ if "choices" in arg:
+ help_text += f" Choices: {', '.join(map(str, arg['choices']))}"
+
+ if arg["type"] == bool:
+ group.add_argument(f"--{key}", action=StringToBoolAction, help=help_text)
+ else:
+ group.add_argument(
+ f"--{key}", type=arg["type"], help=help_text, choices=arg.get("choices")
+ )
+
+
+def update_group_args_with_defaults(args):
+ updated_args = {}
+ for group_name, arguments in DEFAULTS.items():
+ for key, value in arguments:
+ if getattr(args, key) is None:
+ updated_args[key] = value["value"]
+ # print(f" IS NONE Using {key} = {updated_args[key]}")
+ else:
+ updated_args[key] = getattr(args, key)
+
+ # print(f"Using {key} = {updated_args[key]}")
+ return updated_args
+
+
+def update_group_args_with_defaults_what(args):
+ updated_args = {}
+ for group_name in DEFAULTS:
+ default_values = get_default_values(group_name)
+ for key, value in default_values.items():
+            if key not in args:
+                updated_args[key] = value
+            else:
+                updated_args[key] = getattr(args, key)
+
+ return updated_args
diff --git a/bark_infinity/data/models/unclassified/hub/version.txt b/bark_infinity/data/models/unclassified/hub/version.txt
new file mode 100644
index 0000000000000000000000000000000000000000..56a6051ca2b02b04ef92d5150c9ef600403cb1de
--- /dev/null
+++ b/bark_infinity/data/models/unclassified/hub/version.txt
@@ -0,0 +1 @@
+1
\ No newline at end of file
diff --git a/bark_infinity/data_utils.py b/bark_infinity/data_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0c7854fdf27622a72076829d2ac2f7afab4695c
--- /dev/null
+++ b/bark_infinity/data_utils.py
@@ -0,0 +1,80 @@
+import requests
+import bs4
+import json
+import multiprocessing
+import subprocess
+import shutil
+import os
+from pathlib import Path
+from datetime import datetime
+from typing import Optional, Dict, List
+
+HEADERS = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:104.0) Gecko/20100101 Firefox/104.0"}
+BASE_URL = "https://www.101soundboards.com"
+
+
+def convert_mp3_to_wav(mp3_path: str, wav_path: str) -> None:
+ subprocess.run(["ffmpeg", "-i", mp3_path, wav_path])
+
+
+def find_sounds(url: str) -> List[Dict[str, str]]:
+ res = requests.get(url, headers=HEADERS)
+ res.raise_for_status()
+
+ soup = bs4.BeautifulSoup(res.text, "html.parser")
+ scripts = soup.find_all("script")
+
+ for script in scripts:
+ if "board_id" not in str(script):
+ continue
+
+ trimmed_script = str(script)[
+ str(script).find("board_data_inline") + 20 : str(script).find("}]};") + 3
+ ]
+ sound_list = json.loads(trimmed_script)
+ return [
+ {
+ "id": sound["id"],
+ "title": sound["sound_transcript"],
+ "url": sound["sound_file_url"],
+ "sound_file_pitch": sound["sound_file_pitch"],
+ }
+ for sound in sound_list["sounds"]
+ ]
+
+ raise ValueError("Could not find sounds at provided URL")
+
+
+def download_sound(url: str, filepath: str) -> None:
+ res = requests.get(BASE_URL + url, headers=HEADERS)
+ res.raise_for_status()
+
+ with open(filepath, "wb") as f:
+ f.write(res.content)
+
+
+def handle_sound(sound: Dict[str, str], output_directory: str) -> None:
+ sound_file_pitch = str(float(sound["sound_file_pitch"]) / 10)
+ original_path = os.path.join(output_directory, f'{sound["title"]}-{sound["id"]}')
+ download_sound(sound["url"], original_path)
+
+ try:
+ wav_path = f"{original_path}.wav"
+ convert_mp3_to_wav(original_path, wav_path)
+ os.remove(original_path)
+ except Exception as e:
+ print(f"Failed to convert file: {original_path}, error: {str(e)}")
+
+
+def fetch_and_convert_sounds(download_directory: str, soundboard_url: str) -> None:
+ if not shutil.which("ffmpeg"):
+ raise EnvironmentError("ffmpeg not found. Please install ffmpeg in your system.")
+
+ if os.path.exists(download_directory):
+ download_directory += f'_{datetime.now().strftime("%Y%m%d%H%M%S")}'
+
+ Path(download_directory).mkdir(exist_ok=True)
+ sounds = find_sounds(soundboard_url)
+
+ with multiprocessing.Pool() as pool:
+ pool.starmap(handle_sound, [(sound, download_directory) for sound in sounds])
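+
+
+# Example usage (illustrative sketch; the board URL below is hypothetical):
+#
+#     if __name__ == "__main__":
+#         fetch_and_convert_sounds(
+#             download_directory="downloaded_sounds",
+#             soundboard_url="https://www.101soundboards.com/boards/12345-example-board",
+#         )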
diff --git a/bark_infinity/debug.py b/bark_infinity/debug.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc371ea1d4e53abb0baf743fafcd85f223807fea
--- /dev/null
+++ b/bark_infinity/debug.py
@@ -0,0 +1,208 @@
+# benchmark.py
+
+import numpy as np
+from time import time
+import torch
+
+
+def numpy_benchmark():
+ np.random.seed(0) # for reproducibility
+
+ size = 4096
+ A, B = np.random.random((size, size)), np.random.random((size, size))
+ C, D = np.random.random((size * 1280,)), np.random.random(
+ (size * 1280,)
+ ) # increase vector size for benchmark
+ E = np.random.random((int(size / 2), int(size / 4)))
+ F = np.random.random((int(size / 2), int(size / 2)))
+ F = np.dot(F, F.T)
+ G = np.random.random((int(size / 2), int(size / 2)))
+ H = np.random.random((size, size))
+ I = np.random.random((int(size), int(size)))
+
+ print("\nNUMPY CONFIGURATION:")
+ print(np.show_config())
+
+ print("\nNUMPY BENCHMARK RESULTS:")
+
+ t0 = time()
+ # Matrix multiplication
+ N = 20
+ t = time()
+ for i in range(N):
+ np.dot(A, B)
+ delta = time() - t
+ print(f"1. Dotted two {size}x{size} matrices in {delta / N:.3f} s.")
+ del A, B
+
+ # Vector multiplication
+ N = 5000
+ t = time()
+ for i in range(N):
+ np.dot(C, D)
+ delta = time() - t
+ print(f"2. Dotted two vectors of length {size * 1280} in {1e3 * delta / N:.3f} ms.")
+ del C, D
+
+ # Singular Value Decomposition (SVD)
+ N = 3
+ t = time()
+ for i in range(N):
+ np.linalg.svd(E, full_matrices=False)
+ delta = time() - t
+ print(f"3. SVD of a {size // 2}x{size // 4} matrix in {delta / N:.3f} s.")
+ del E
+
+ # Cholesky Decomposition
+ N = 3
+ t = time()
+ for i in range(N):
+ np.linalg.cholesky(F)
+ delta = time() - t
+ print(f"4. Cholesky decomposition of a {size // 2}x{size // 2} matrix in {delta / N:.3f} s.")
+
+ # Eigendecomposition
+ t = time()
+ for i in range(N):
+ np.linalg.eig(G)
+ delta = time() - t
+ print(f"5. Eigendecomposition of a {size // 2}x{size // 2} matrix in {delta / N:.3f} s.")
+
+ # compute covariance matrix
+ N = 10
+ t = time()
+ for i in range(N):
+ np.dot(H.T, H)
+ delta = time() - t
+ print(f"6. Computing Covariance Matrix of a {size}x{size} matrix in {delta / N:.4f} s.")
+
+ # compute inverse matrix
+ N = 3
+ t = time()
+ for i in range(N):
+ np.linalg.inv(I)
+ delta = time() - t
+ print(f"7. Inverse Matrix of a {size}x{size} matrix in {delta / N:.4f} s.")
+
+ # Gradient calculation
+ N, D_in, H, D_out = 64, 1000, 100, 10
+ x = np.random.randn(N, D_in)
+ y = np.random.randn(N, D_out)
+ w1 = np.random.randn(D_in, H)
+ w2 = np.random.randn(H, D_out)
+ learning_rate = 1e-6
+
+ t = time()
+ for _ in range(10000):
+ h = x.dot(w1)
+ h_relu = np.maximum(h, 0)
+ y_pred = h_relu.dot(w2)
+ loss = np.square(y_pred - y).sum()
+ grad_y_pred = 2.0 * (y_pred - y)
+ grad_w2 = h_relu.T.dot(grad_y_pred)
+ grad_h_relu = grad_y_pred.dot(w2.T)
+ grad_h = grad_h_relu.copy()
+ grad_h[h < 0] = 0
+ grad_w1 = x.T.dot(grad_h)
+ w1 -= learning_rate * grad_w1
+ w2 -= learning_rate * grad_w2
+ delta = time() - t
+ print(f"8. Gradient calculation time: {delta:.3f} s.")
+
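+    # Mimics the argsort/cumsum pattern of nucleus (top-p) sampling, purely as a speed test of these NumPy ops.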
+ J = np.random.rand(size * 1280)
+ t = time()
+ N = 5
+ for _ in range(N):
+ sorted_indices = np.argsort(J)[::-1]
+ cumulative_probs = np.cumsum(sorted_indices)
+ sorted_indices_to_remove = cumulative_probs > np.random.rand()
+ sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].copy()
+ sorted_indices_to_remove[0] = False
+ J[sorted_indices[sorted_indices_to_remove]] = -np.inf
+ delta = time() - t
+ print(
+ f"9. np.argsort and np.cumsum on a vector of length {size*1280} in {1e3 * delta / N:.3f} ms."
+ )
+ del J
+
+ K, L = np.random.random((size, 1)), np.random.random((1, size))
+ t = time()
+ N = 200
+ for _ in range(N):
+ M = K * L
+ delta = time() - t
+ print(f"10. Broadcasting two vectors of length {size} in {1e3 * delta / N:.3f} ms.")
+ del K, L, M
+
+ N = np.random.random((size, size))
+ indices = np.random.randint(size, size=(size,))
+ t = time()
+ M = 200
+ for _ in range(M):
+ O = N[indices, :]
+ delta = time() - t
+ print(f"11. Indexing a {size}x{size} matrix in {1e3 * delta / M:.3f} ms.")
+ del N, O
+
+ P = np.random.random((size, size))
+ t = time()
+ M = 100
+ for _ in range(M):
+ s = np.sum(P)
+ delta = time() - t
+ print(f"12. Sum reduction of a {size}x{size} matrix in {1e3 * delta / M:.3f} ms.")
+ del P
+
+ Q = np.random.random((size, size))
+ R = torch.tensor(Q)
+
+ # Numpy to PyTorch
+ t = time()
+ N = 100
+ for _ in range(N):
+ R = torch.from_numpy(Q)
+ delta = time() - t
+ print(
+ f"13. Conversion of a Numpy {size}x{size} matrix to PyTorch tensor in {1e3 * delta / N:.3f} ms."
+ )
+
+ # PyTorch to Numpy
+ t = time()
+ for _ in range(N):
+ Q_new = R.numpy()
+ delta = time() - t
+ print(
+ f"14. Conversion of a PyTorch tensor {size}x{size} to Numpy array in {1e3 * delta / N:.3f} ms."
+ )
+ del Q, R
+
+ # Benchmark for conversion operations
+ Q = np.random.random((size, size)).astype(np.float32)
+ R = torch.tensor(Q)
+
+ # Numpy to PyTorch with forced copy via type conversion
+ t = time()
+ N = 100
+ for _ in range(N):
+ R = torch.tensor(Q, dtype=torch.float64)
+ delta = time() - t
+ print(
+ f"15. Conversion of a Numpy {size}x{size} matrix to PyTorch tensor with forced copy in {1e3 * delta / N:.3f} ms."
+ )
+
+ # PyTorch to Numpy with forced copy via operation that doesn't change data
+ t = time()
+ for _ in range(N):
+ Q_new = (R + 0).numpy()
+ delta = time() - t
+ print(
+ f"16. Conversion of a PyTorch tensor {size}x{size} to Numpy array with forced copy in {1e3 * delta / N:.3f} ms."
+ )
+ del Q, R
+
+ t1 = time()
+ print(f"\nTotal time: {t1 - t0:.3f}s \n\n")
+
+
+if __name__ == "__main__":
+ numpy_benchmark()
diff --git a/bark_infinity/generation.py b/bark_infinity/generation.py
new file mode 100644
index 0000000000000000000000000000000000000000..aecabfc9b7186d2dee969b9773ee60a5b9fefc6a
--- /dev/null
+++ b/bark_infinity/generation.py
@@ -0,0 +1,2130 @@
+import contextlib
+import gc
+import os
+import re
+
+import random
+from encodec import EncodecModel
+import funcy
+import numpy as np
+from scipy.special import softmax
+import torch
+
+import math
+from scipy.spatial import distance
+
+import torch.distributions as torch_distributions
+
+import torch.nn.functional as F
+import tqdm
+from transformers import BertTokenizer
+from huggingface_hub import hf_hub_download
+
+from .model import GPTConfig, GPT
+from .model_fine import FineGPT, FineGPTConfig
+
+import traceback
+import sys
+import time
+
+from rich.pretty import pprint
+
+from .config import logger, load_all_defaults
+
+from huggingface_hub import hf_hub_url
+from collections import Counter
+
+from devtools import debug
+from collections import defaultdict
+
+
+def _cast_bool_env_var(s):
+ return s.lower() in ("true", "1", "t")
+
+
+def get_SUNO_USE_DIRECTML():
+ if _cast_bool_env_var(os.environ.get("SUNO_USE_DIRECTML", "False")):
+ return True
+
+ kwargs = {}
+    defaults = load_all_defaults(**kwargs)
+ if defaults["SUNO_USE_DIRECTML"] is True:
+ return True
+ else:
+ return False
+
+
+SUNO_USE_DIRECTML = get_SUNO_USE_DIRECTML()
+
+dml = None
+if SUNO_USE_DIRECTML is True:
+ print(f" --->> Experimental AMD DirectML support enabled.")
+ import torch_directml
+
+ torch.cuda.is_available = lambda: False
+
+ dml = torch_directml.device()
+
+
+if (
+ torch.cuda.is_available()
+ and hasattr(torch.cuda, "amp")
+ and hasattr(torch.cuda.amp, "autocast")
+ and hasattr(torch.cuda, "is_bf16_supported")
+ and torch.cuda.is_bf16_supported()
+):
+ # print(f" --->> Experimental NVIDIA BF16 support enabled.")
+ autocast = funcy.partial(torch.cuda.amp.autocast, dtype=torch.bfloat16)
+else:
+
+ @contextlib.contextmanager
+ def autocast():
+ yield
+
+
+# hold models in global scope to lazy load
+global models
+models = {}
+
+global models_devices
+models_devices = {}
+
+
+CONTEXT_WINDOW_SIZE = 1024
+
+SEMANTIC_RATE_HZ = 49.9
+SEMANTIC_VOCAB_SIZE = 10_000
+
+CODEBOOK_SIZE = 1024
+N_COARSE_CODEBOOKS = 2
+N_FINE_CODEBOOKS = 8
+COARSE_RATE_HZ = 75
+
+SAMPLE_RATE = 24_000
+
+
+SUPPORTED_LANGS = [
+ ("English", "en"),
+ ("German", "de"),
+ ("Spanish", "es"),
+ ("French", "fr"),
+ ("Hindi", "hi"),
+ ("Italian", "it"),
+ ("Japanese", "ja"),
+ ("Korean", "ko"),
+ ("Polish", "pl"),
+ ("Portuguese", "pt"),
+ ("Russian", "ru"),
+ ("Turkish", "tr"),
+ ("Chinese", "zh"),
+]
+
+ALLOWED_PROMPTS = {"announcer"}
+for _, lang in SUPPORTED_LANGS:
+ for prefix in ("", f"v2{os.path.sep}"):
+ for n in range(10):
+ ALLOWED_PROMPTS.add(f"{prefix}{lang}_speaker_{n}")
+
+
+CUR_PATH = os.path.dirname(os.path.abspath(__file__))
+
+
+default_cache_dir = os.path.join(os.path.expanduser("~"), ".cache")
+CACHE_DIR = os.path.join(os.getenv("XDG_CACHE_HOME", default_cache_dir), "suno", "bark_v0")
+
+
+USE_SMALL_MODELS = _cast_bool_env_var(os.environ.get("SUNO_USE_SMALL_MODELS", "False"))
+GLOBAL_ENABLE_MPS = _cast_bool_env_var(os.environ.get("SUNO_ENABLE_MPS", "False"))
+OFFLOAD_CPU = _cast_bool_env_var(os.environ.get("SUNO_OFFLOAD_CPU", "False"))
+
+# Slower, possibly lower quality, but more memory efficient
+SUNO_HALF_PRECISION = _cast_bool_env_var(os.environ.get("SUNO_HALF_PRECISION", "False"))
+
+# Slower, possibly lower quality, but more memory efficient
+SUNO_HALF_BFLOAT16 = _cast_bool_env_var(os.environ.get("SUNO_HALF_BFLOAT16", "False"))
+
+SUNO_DISABLE_COMPILE = _cast_bool_env_var(os.environ.get("SUNO_DISABLE_COMPILE", "False"))
+
+if sys.platform == "win32":
+ SUNO_DISABLE_COMPILE = True
+
+
+if SUNO_USE_DIRECTML is True:
+ OFFLOAD_CPU = False
+
+# NOTE: OFFLOAD_CPU is force-disabled here, overriding the SUNO_OFFLOAD_CPU env var read above.
+OFFLOAD_CPU = False
+
+REMOTE_MODEL_PATHS = {
+ "text_small": {
+ "repo_id": "suno/bark",
+ "file_name": "text.pt",
+ },
+ "coarse_small": {
+ "repo_id": "suno/bark",
+ "file_name": "coarse.pt",
+ },
+ "fine_small": {
+ "repo_id": "suno/bark",
+ "file_name": "fine.pt",
+ },
+ "text": {
+ "repo_id": "suno/bark",
+ "file_name": "text_2.pt",
+ },
+ "coarse": {
+ "repo_id": "suno/bark",
+ "file_name": "coarse_2.pt",
+ },
+ "fine": {
+ "repo_id": "suno/bark",
+ "file_name": "fine_2.pt",
+ },
+}
+
+if not hasattr(torch.nn.functional, "scaled_dot_product_attention") and torch.cuda.is_available():
+ logger.warning(
+ "torch version does not support flash attention. You will get faster"
+ + " inference speed by upgrade torch to newest nightly version."
+ )
+
+
+def _grab_best_device(use_gpu=True):
+ if torch.cuda.device_count() > 0 and use_gpu:
+ device = "cuda"
+ elif torch.backends.mps.is_available() and use_gpu and GLOBAL_ENABLE_MPS:
+ device = "mps"
+ else:
+ device = "cpu"
+
+ return device
+
+
+def _get_ckpt_path(model_type, use_small=False):
+ key = model_type
+ if use_small or USE_SMALL_MODELS:
+ key += "_small"
+ return os.path.join(CACHE_DIR, REMOTE_MODEL_PATHS[key]["file_name"])
+
+
+def _download(from_hf_path, file_name):
+ os.makedirs(CACHE_DIR, exist_ok=True)
+ hf_hub_download(repo_id=from_hf_path, filename=file_name, local_dir=CACHE_DIR)
+
+
+class InferenceContext:
+ def __init__(self, benchmark=False):
+ # we can't expect inputs to be the same length, so disable benchmarking by default
+ self._chosen_cudnn_benchmark = benchmark
+ self._cudnn_benchmark = None
+
+ def __enter__(self):
+ self._cudnn_benchmark = torch.backends.cudnn.benchmark
+ torch.backends.cudnn.benchmark = self._chosen_cudnn_benchmark
+
+ def __exit__(self, exc_type, exc_value, exc_traceback):
+ torch.backends.cudnn.benchmark = self._cudnn_benchmark
+
+
+if torch.cuda.is_available():
+ torch.backends.cuda.matmul.allow_tf32 = True
+ torch.backends.cudnn.allow_tf32 = True
+
+
+@contextlib.contextmanager
+def _inference_mode():
+ if SUNO_USE_DIRECTML is True:
+ with InferenceContext(), torch.inference_mode(mode=False), torch.no_grad(), autocast():
+ yield
+ else:
+ with InferenceContext(), torch.inference_mode(), torch.no_grad(), autocast():
+ yield
+
+
+def _clear_cuda_cache():
+ if torch.cuda.is_available():
+ torch.cuda.empty_cache()
+ torch.cuda.synchronize()
+
+
+def clean_models(model_key=None):
+ global models
+ model_keys = [model_key] if model_key is not None else list(models.keys())
+ for k in model_keys:
+ if k in models:
+ del models[k]
+
+ _clear_cuda_cache()
+ gc.collect()
+
+
+def _load_codec_model(device):
+ model = EncodecModel.encodec_model_24khz()
+ model.set_target_bandwidth(6.0)
+ model.eval()
+
+ print_loading_info("codec", "EncodecModelPath", device)
+
+ if SUNO_USE_DIRECTML is True:
+ model.to(dml)
+ else:
+ model.to(device)
+
+ if callable(getattr(torch, "compile")) and not SUNO_DISABLE_COMPILE:
+ logger.info("torch.compile available, compiling codec model.")
+ model = torch.compile(model)
+ else:
+ logger.info(
+ "torch.compile *not* available, you will get better performance if you use pytorch >= 2.0."
+ )
+
+ _clear_cuda_cache()
+ return model
+
+
+def load_codec_model(use_gpu=True, force_reload=False):
+ global models
+ global models_devices
+ device = _grab_best_device(use_gpu=use_gpu)
+ if device == "mps":
+ # encodec doesn't support mps
+ device = "cpu"
+ model_key = "codec"
+ if OFFLOAD_CPU:
+ models_devices[model_key] = device
+ device = "cpu"
+ if model_key not in models or force_reload:
+ clean_models(model_key=model_key)
+
+ model = _load_codec_model(device)
+ models[model_key] = model
+
+ if SUNO_USE_DIRECTML is True:
+ models[model_key].to(dml)
+ else:
+ models[model_key].to(device)
+
+ return models[model_key]
+
+
+####
+# Generation Functionality
+####
+
+
+def _tokenize(tokenizer, text):
+ return tokenizer.encode(text, add_special_tokens=False)
+
+
+def _detokenize(tokenizer, enc_text):
+ return tokenizer.decode(enc_text)
+
+
+def _normalize_whitespace(text):
+ return re.sub(r"\s+", " ", text).strip()
+
+
+TEXT_ENCODING_OFFSET = 10_048
+SEMANTIC_PAD_TOKEN = 10_000
+TEXT_PAD_TOKEN = 129_595
+SEMANTIC_INFER_TOKEN = 129_599
+
+
+def _load_history_prompt(history_prompt_input):
+ if isinstance(history_prompt_input, str) and history_prompt_input.endswith(".npz"):
+ history_prompt = np.load(history_prompt_input)
+ elif isinstance(history_prompt_input, str):
+ # make sure this works on non-ubuntu
+ history_prompt_input = os.path.join(*history_prompt_input.split("/"))
+ if history_prompt_input not in ALLOWED_PROMPTS:
+ raise ValueError("history prompt not found")
+ history_prompt = np.load(
+ os.path.join(CUR_PATH, "assets", "prompts", f"{history_prompt_input}.npz")
+ )
+ elif isinstance(history_prompt_input, dict):
+ assert "semantic_prompt" in history_prompt_input
+ assert "coarse_prompt" in history_prompt_input
+ assert "fine_prompt" in history_prompt_input
+ history_prompt = history_prompt_input
+ else:
+ raise ValueError("history prompt format unrecognized")
+ return history_prompt
+
+
+def compute_log_probs(token_list, smoothing_factor, scaling_factor):
+ # Count the frequency of each token.
+ token_freq = Counter(token_list)
+
+ # Add a smoothing factor.
+ smoothed_token_freq = {token: freq + smoothing_factor for token, freq in token_freq.items()}
+
+ # Normalize to create a probability distribution.
+ total_tokens = len(token_list) + smoothing_factor * len(smoothed_token_freq)
+ token_probs = {token: freq / total_tokens for token, freq in smoothed_token_freq.items()}
+
+ # Transform into scaled log-probabilities.
+ log_probs = {token: scaling_factor * np.log(prob) for token, prob in token_probs.items()}
+
+ return log_probs
+
+
+def estimate_s_this_seems_wrong_so_many_math_crashes(prob):
+ epsilon = 1e-10
+ num = 0
+ den = 0
+ for i in range(
+ min(len(prob), 10000)
+    ):  # apparently any number is fine here but the paper was on natural language so maybe not for us?
+ # for i in range(768):
+ b = prob[i] / (prob[i + 1] + epsilon)
+ t = (i + 2) / (i + 1)
+ if b > 0 and t > 0:
+ num += math.log(b) * math.log(t)
+ den += math.log(t) ** 2
+ return num / den if den != 0 else 0
+
+
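+# Estimates the Zipf exponent "s" of the sorted token probabilities, as in the mirostat algorithm (reference implementation linked further below).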
+def estimate_s(prob):
+ epsilon = 1e-10
+ num = 0
+ den = 0
+ # for i in range(3000):
+ # in the paper they say 100 is as good as any higher number? But it's not slow so maybe leave it higher?
+ # also in the paper they don't have catch divide by 0s though...
+ # also the paper was on natural language so maybe not for us. Let's just max it out
+    for i in range(min(len(prob) - 1, 10000)):  # -1 so the prob[i + 1] access below stays in range
+ b = prob[i] / (prob[i + 1] + epsilon)
+ t = (i + 2) / (i + 1)
+ if b > 0 and t > 0:
+ num += math.log(b if b > 0 else 1) * math.log(t if t > 0 else 1)
+ # den += math.log(t)**2
+ den += math.log(t if t > 0 else 1) ** 2
+ # ok NOW this should never be zero and feels more right
+ return num / den
+ # return num / den if den != 0 else 0 # or should this be float("inf") ? doesn't seem right.
+
+
+def compute_k_original_paper(n, s, tau):
+ print(f"n: {n}, s: {s}, tau: {tau}")
+ eps = s - 1
+ k = ((eps * (2 ** (tau))) / (1 - n ** (-eps))) ** (1 / s)
+ k = round(k)
+ return k
+
+
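+# NOTE: unlike compute_k_original_paper above, this variant uses 2 * tau ** (1 / s) rather than 2 ** tau,
+# guards s <= 0, and clamps the result to max_k (presumably to sidestep the overflow/complex cases handled below).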
+def compute_k(n, s, tau, max_k):
+ try:
+ eps = s - 1
+ n_eps = n ** (-eps)
+ if s <= 0:
+ return 0
+ tau_s = tau ** (1 / s)
+ k = (eps * 2 * tau_s / (1 - n_eps)) ** (1 / s)
+ if isinstance(k, complex):
+ return 0
+ k = round(k)
+ if k > max_k:
+ return max_k
+ return k
+ except OverflowError:
+ # Return maximum possible k
+ return max_k
+
+
+def compute_k_orig(n, s, tau):
+ print(f"n: {n}, s: {s}, tau: {tau}")
+ eps = s - 1
+ k = ((eps * (2 ** (tau))) / (1 - n ** (-eps))) ** (1 / s)
+ k = round(k)
+ return k
+
+
+def compute_k_not_right(n, s, tau, max_k):
+ print(f"n: {n}, s: {s}, tau: {tau}")
+ try:
+ eps = s - 1
+ n_eps = n ** (-eps)
+ if s <= 0:
+ return max_k
+ tau_s = tau ** (1 / s)
+ k = (eps * 2 * tau_s / (1 - n_eps)) ** (1 / s)
+ k = round(k)
+ return k
+ except OverflowError:
+ # Return maximum possible k
+ return max_k
+
+
+def compute_k_log(n, s, tau):
+ print(f"n: {n}, s: {s}, tau: {tau}")
+ eps = s - 1
+ try:
+ log_k = (math.log(eps) + tau * math.log(2) - math.log(1 - n ** (-eps))) / s
+ k = round(math.exp(log_k))
+ except OverflowError:
+ k = float("inf")
+ return k
+
+
+# https://github.com/basusourya/mirostat/blob/master/mirostat.py
+
+
+# try adjusting target tau dynamically based on just length even? Could you shape the "energy" of the clip?
+def mirostat_sampling_v1(
+ logits=None,
+ tau=5.0,
+ learning_rate=1.0,
+ max_surprise=None,
+ vocab_size=SEMANTIC_VOCAB_SIZE,
+ indices_surprise_history=[],
+ running_tot_surprise=0.0,
+ generated=[],
+):
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+ prob_original = torch.softmax(sorted_logits, dim=-1).tolist()
+
+ s = estimate_s(prob_original)
+
+ max_k = len(sorted_logits) - 1
+
+ k = compute_k(vocab_size, s, max_surprise, max_k) + 1
+
+ print(f"\n\nK: {k} s: {s} tau: {max_surprise}")
+
+ sorted_logits = sorted_logits[0:k]
+ sorted_indices = sorted_indices[0:k]
+
+ prob_topk = torch.softmax(sorted_logits, dim=0)
+
+ prev_i = torch.multinomial(prob_topk, num_samples=1, replacement=True)
+ index_surprise = math.log2(1 / prob_original[prev_i])
+ print(f"index_surprise: {index_surprise}")
+ indices_surprise_history.append(index_surprise)
+
+ running_tot_surprise += index_surprise
+ prev = sorted_indices[prev_i]
+ generated += prev.tolist()
+
+ error_surprise = index_surprise - tau
+ max_surprise -= learning_rate * error_surprise
+
+ # full_probs = torch.zeros_like(logits) # 0? or -inf?
+ full_probs = torch.empty_like(logits).fill_(-float("inf"))
+ full_probs[sorted_indices] = prob_topk.to(full_probs.dtype)
+
+ return (
+ sorted_indices[prev_i],
+ max_surprise,
+ full_probs,
+ indices_surprise_history,
+ running_tot_surprise,
+ generated,
+ )
+
+
+def mirostat_sampling_meh(
+ logits=None,
+ tau=5.0,
+ learning_rate=1.0,
+ max_surprise=None,
+ vocab_size=SEMANTIC_VOCAB_SIZE,
+ indices_surprise_history=[],
+ running_tot_surprise=0.0,
+ generated=[],
+):
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+ prob_original = torch.softmax(sorted_logits, dim=-1).tolist()
+
+ s = estimate_s(prob_original)
+
+ max_k = len(sorted_logits) - 1
+
+ k = compute_k(vocab_size, s, max_surprise, max_k) + 1
+
+ print(f"\n\nK: {k} s: {s} tau: {max_surprise}")
+
+ sorted_logits = sorted_logits[0:k]
+ sorted_indices = sorted_indices[0:k]
+
+ prob_topk = torch.softmax(sorted_logits, dim=0)
+
+ prev_i = torch.multinomial(prob_topk, num_samples=1, replacement=True)
+
+ index_surprise = math.log2(1 / prob_original[sorted_indices[prev_i].item()])
+ print(f"index_surprise: {index_surprise}")
+ indices_surprise_history.append(index_surprise)
+
+ running_tot_surprise += index_surprise
+ prev = sorted_indices[prev_i]
+ generated += prev.tolist()
+ error_surprise = index_surprise - tau
+ max_surprise -= learning_rate * error_surprise
+
+ full_probs = torch.empty_like(logits).fill_(-float("inf"))
+ full_probs[sorted_indices] = prob_topk.to(full_probs.dtype)
+
+ item_next = sorted_indices[prev_i]
+
+ return (
+ item_next,
+ max_surprise,
+ full_probs,
+ indices_surprise_history,
+ running_tot_surprise,
+ generated,
+ )
+
+
+def mirostat_sampling_least(
+ logits=None,
+ tau=5.0,
+ learning_rate=1.0,
+ max_surprise=None,
+ vocab_size=SEMANTIC_VOCAB_SIZE,
+ indices_surprise_history=[],
+ running_tot_surprise=0.0,
+ generated=[],
+):
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+ prob_original = torch.softmax(sorted_logits, dim=-1).tolist()
+
+ s = estimate_s(prob_original)
+
+ max_k = len(sorted_logits) - 1
+
+ k = compute_k(vocab_size, s, max_surprise, max_k) + 1
+
+ print(f"\n\nK: {k} s: {s} tau: {max_surprise}")
+
+ sorted_logits = sorted_logits[0:k]
+ sorted_indices = sorted_indices[0:k]
+
+ prob_topk = torch.softmax(sorted_logits, dim=0)
+
+ prev_i = torch.argmin(prob_topk).unsqueeze(0)
+
+ index_surprise = math.log2(1 / prob_original[sorted_indices[prev_i].item()])
+ print(f"index_surprise: {index_surprise}")
+ indices_surprise_history.append(index_surprise)
+
+ running_tot_surprise += index_surprise
+ prev = sorted_indices[prev_i]
+ generated += prev.tolist()
+
+ error_surprise = index_surprise - tau
+ max_surprise -= learning_rate * error_surprise
+
+ full_probs = torch.empty_like(logits).fill_(-float("inf"))
+ full_probs[sorted_indices] = prob_topk.to(full_probs.dtype)
+
+ # Return least likely token and reverse generated logits
+ # return sorted_indices[prev_i], max_surprise, torch.flip(full_probs, dims=[0]), indices_surprise_history, running_tot_surprise, generated
+ return (
+ sorted_indices[prev_i],
+ max_surprise,
+ full_probs,
+ indices_surprise_history,
+ running_tot_surprise,
+ generated,
+ )
+
+
+def sine_wave_temperature(current_token, max_token):
+ return 3.0 + 2.1 * (math.sin(2 * math.pi * (current_token / 150)) / 2.1 + 0.2)
+
+
+def sine_wave_temperature(current_token, max_token, period=100, phase_shift=0):
+ return 0.5 + 2.0 * (math.sin(2 * math.pi * (current_token / period) + phase_shift) / 2 + 0.5)
+
+
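+# Only this last definition of sine_wave_temperature takes effect; it shadows the two variants above.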
+def sine_wave_temperature(current_token, token_period, start_phase, temp_min, temp_max):
+ phase = 2 * math.pi * ((current_token + start_phase) / token_period)
+ temp_range = temp_max - temp_min
+ return temp_min + temp_range * ((math.sin(phase) / 2) + 0.5)
+
+
+def mirostat_sampling(
+ logits=None,
+ tau=5.0,
+ learning_rate=1.0,
+ max_surprise=None,
+ vocab_size=SEMANTIC_VOCAB_SIZE,
+ indices_surprise_history=[],
+ running_tot_surprise=0,
+ generated=[],
+ temperature_fn=None,
+):
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+ prob_original = torch.softmax(sorted_logits, dim=-1).tolist()
+
+ s = estimate_s(prob_original)
+
+ max_k = len(sorted_logits) - 1
+
+ k = compute_k(vocab_size, s, max_surprise, max_k) + 1
+
+ sorted_logits = sorted_logits[0:k]
+ sorted_indices = sorted_indices[0:k]
+
+ # Current location in the segment
+ current_token = len(generated)
+ max_token = 768 # Maximum sample length
+
+ if temperature_fn is not None:
+ temp = temperature_fn(current_token, max_token)
+ sorted_logits = torch.clamp(sorted_logits, -10000, 10000)
+ # Apply to logits before softmax
+ prob_topk = torch.softmax(sorted_logits / temp, dim=0)
+ prob_topk = torch.clamp(prob_topk, 1e-9, 1 - 1e-9) # Ensures probabilities are valid
+ else:
+ prob_topk = torch.softmax(sorted_logits, dim=0)
+
+ prev_i = torch.multinomial(prob_topk, num_samples=1, replacement=True)
+
+ epsilon = 1e-10
+ index_surprise = math.log2(1 / (prob_original[sorted_indices[prev_i].item()] + epsilon))
+
+ indices_surprise_history.append(index_surprise)
+
+ running_tot_surprise += index_surprise
+ prev = sorted_indices[prev_i]
+ generated += prev.tolist()
+
+ error_surprise = index_surprise - tau
+ max_surprise -= learning_rate * error_surprise
+
+ full_probs = torch.empty_like(logits).fill_(-float("inf"))
+ full_probs[sorted_indices] = prob_topk.to(full_probs.dtype)
+
+ if current_token % 25 == 0 and False:
+ print(f"Temperature: {temp}")
+ print(f"index_surprise: {index_surprise}")
+ print(f"\n\nK: {k} s: {s} tau: {max_surprise}")
+
+ return (
+ sorted_indices[prev_i],
+ max_surprise,
+ full_probs,
+ indices_surprise_history,
+ running_tot_surprise,
+ generated,
+ )
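+
+
+# Rough call pattern, matching how generate_text_semantic invokes this further below
+# (the tau and learning_rate values here are just the defaults, for illustration):
+#
+#     max_surprise = 2 * tau
+#     item_next, max_surprise, probs, surprise_history, running_tot, generated = mirostat_sampling(
+#         logits=relevant_logits / temp,
+#         tau=5.0,
+#         learning_rate=1.0,
+#         max_surprise=max_surprise,
+#         indices_surprise_history=[],
+#         running_tot_surprise=0.0,
+#         generated=[],
+#     )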
+
+
+cdevice = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+def compute_negative_influence(negative_logits, n, window_size, negative_scale):
+ # negative_logits is list of tensors
+ # we could calculate a local "negative influence" based on the tokens in negative_logits near position n.
+
+ # calculate the negative influence as a weighted average of the logits in negative_logits around position n, where the weights decrease the farther you get from n
+
+ # This code takes a window of logits around position n in negative_logits, weights them by their distance from n, and averages them to compute the negative influence.
+
+ # Check if negative_logits is empty
+ if len(negative_logits) == 0:
+ return 0
+
+ # Ensure n is within range
+ n = min(max(n, 0), len(negative_logits) - 1)
+
+ # Adjust window_size if it's larger than negative_logits length
+ window_size = min(window_size, len(negative_logits))
+
+ # Get the start and end of the window
+ start = max(0, n - window_size)
+ end = min(len(negative_logits), n + window_size + 1)
+
+ # Move tensors to the specified device
+ negative_logits = [logit.to(cdevice) for logit in negative_logits]
+ n = torch.tensor(n).to(cdevice)
+ window_size = torch.tensor(window_size).to(cdevice)
+ negative_scale = torch.tensor(negative_scale).to(cdevice)
+
+ # Generate a Gaussian distribution for the weights and normalize them
+ weights = torch.exp(
+ -((torch.arange(start, end).to(cdevice) - n) ** 2) / (2.0 * window_size**2)
+ )
+ weights /= weights.sum()
+
+ weights = weights.view(-1, 1)
+ negative_influence = torch.stack(negative_logits[start:end]).mul(weights).sum(0)
+
+ # Adjust the influence by the negative_scale
+ negative_scale = min(
+ max(negative_scale.item(), 0), 1
+ ) # Ensure negative_scale is between 0 and 1
+ negative_influence *= negative_scale
+
+ # print(f"Negative influence: {negative_influence}")
+
+ return negative_influence
+
+
+def fast_compute_negative_influence(negative_logits, window_size, negative_scale):
+ if len(negative_logits) == 0:
+ return 0
+
+ window_size = min(window_size, len(negative_logits))
+
+ negative_logits = torch.stack(negative_logits).unsqueeze(0).permute(0, 2, 1)
+
+ # Gaussian distribution for weights and norma
+ weights = torch.exp(
+ -((torch.arange(-window_size, window_size + 1).to(cdevice)) ** 2) / (2.0 * window_size**2)
+ )
+ weights /= weights.sum()
+
+ # Reshape weights tensor for convolution
+ # weights = weights.repeat(negative_logits.shape[1], 1).unsqueeze(1)
+ weights = weights.repeat(1, negative_logits.shape[1], 1)
+
+ # Compute cumulative sum of weighted logits
+ cum_logits = (
+ torch.nn.functional.conv1d(negative_logits, weights.flip(dims=[2]), padding=window_size)
+ .squeeze(0)
+ .permute(1, 0)
+ )
+
+ negative_scale = min(max(negative_scale, 0), 1) # Ensure negative_scale is between 0 and 1
+ cum_logits *= negative_scale
+
+ # print(f"Cumulative negative influence: {cum_logits}")
+
+ return cum_logits
+
+
+def generate_text_semantic(
+ text,
+ history_prompt=None,
+ temp=0.7,
+ top_k=None,
+ top_p=None,
+ silent=False,
+ min_eos_p=0.2,
+ max_gen_duration_s=None,
+ allow_early_stop=True,
+ use_kv_caching=True,
+ semantic_use_mirostat_sampling=False,
+ # semantic_mirostat_tau = 31100.0,
+ semantic_mirostat_tau=5.0,
+ semantic_mirostat_learning_rate=1.0,
+ semantic_token_repeat_penalty=0.0,
+ semantic_inverted_p=None,
+ semantic_bottom_k=None,
+ return_logits=False,
+ negative_tokens=None,
+ negative_logits=None,
+ negative_text_prompt_logits_scale=None,
+ negative_text_prompt_logits_sliding_scale=None,
+ negative_text_prompt_logits_scale_window_size=164,
+ negative_text_prompt_divergence_scale=None,
+):
+ """Generate semantic tokens from text."""
+
+ all_logits = None
+ if return_logits:
+ all_logits = []
+
+ if temp == 0:
+ temp = 0.001
+ # debug(locals())
+ logger.debug(locals())
+
+ assert isinstance(text, str)
+ text = _normalize_whitespace(text)
+ # assert len(text.strip()) > 0
+
+ if history_prompt is not None:
+ history_prompt = _load_history_prompt(history_prompt)
+ semantic_history = history_prompt["semantic_prompt"]
+ assert (
+ isinstance(semantic_history, np.ndarray)
+ and len(semantic_history.shape) == 1
+ and len(semantic_history) > 0
+ and semantic_history.min() >= 0
+ and semantic_history.max() <= SEMANTIC_VOCAB_SIZE - 1
+ )
+ else:
+ semantic_history = None
+ # load models if not yet exist
+ global models
+ global models_devices
+ if "text" not in models:
+ if SUNO_USE_DIRECTML is True:
+ preload_models(load_one_model_type="text")
+ else:
+ preload_models()
+ model_container = models["text"]
+ model = model_container["model"]
+ tokenizer = model_container["tokenizer"]
+ encoded_text = np.array(_tokenize(tokenizer, text)) + TEXT_ENCODING_OFFSET
+ if OFFLOAD_CPU:
+ if GLOBAL_ENABLE_MPS:
+ device = _grab_best_device(use_gpu=False)
+ models_devices["text"] = device
+ model.to(models_devices["text"])
+ device = next(model.parameters()).device
+ if len(encoded_text) > 256:
+ p = round((len(encoded_text) - 256) / len(encoded_text) * 100, 1)
+ logger.warning(f"warning, text too long, lopping of last {p}%")
+ encoded_text = encoded_text[:256]
+ encoded_text = np.pad(
+ encoded_text,
+ (0, 256 - len(encoded_text)),
+ constant_values=TEXT_PAD_TOKEN,
+ mode="constant",
+ )
+ if semantic_history is not None:
+ semantic_history = semantic_history.astype(np.int64)
+ # print(f"Actual length of semantic input: {len(semantic_history)}")
+ # lop off if history is too long, pad if needed
+ semantic_history = semantic_history[-256:]
+ semantic_history = np.pad(
+ semantic_history,
+ (0, 256 - len(semantic_history)),
+ constant_values=SEMANTIC_PAD_TOKEN,
+ mode="constant",
+ )
+ else:
+ semantic_history = np.array([SEMANTIC_PAD_TOKEN] * 256)
+ x = torch.from_numpy(
+ np.hstack([encoded_text, semantic_history, np.array([SEMANTIC_INFER_TOKEN])]).astype(
+ np.int64
+ )
+ )[None]
+ assert x.shape[1] == 256 + 256 + 1
+ with _inference_mode():
+ if SUNO_USE_DIRECTML is True:
+ device = dml
+ x = x.to(device)
+ n_tot_steps = 768
+
+ # preallocate tensor
+ x_initial = x.shape[1]
+ x = torch.hstack([x, torch.empty([1, n_tot_steps], dtype=torch.int32, device=device)])
+
+ # custom tqdm updates since we don't know when eos will occur
+ pbar = tqdm.tqdm(disable=silent, total=n_tot_steps)
+ pbar_state = 0
+ tot_generated_duration_s = 0
+ kv_cache = None
+
+ # mirostat
+ prev = None
+ max_surprise = 2 * semantic_mirostat_tau
+ indices_surprise_history = []
+ running_tot_surprise = 0.0
+ miro_generated = [] # debug
+
+ token_counts = defaultdict(int)
+
+ cum_negative_influence = None
+
+ if negative_logits is not None and negative_text_prompt_logits_sliding_scale is not None:
+ cum_negative_influence = fast_compute_negative_influence(
+ negative_logits,
+ negative_text_prompt_logits_scale_window_size,
+ negative_text_prompt_logits_scale,
+ )
+ # print(f"Shape of cum_negative_influence: {cum_negative_influence.shape}")
+ # Shape of cum_negative_influence: torch.Size([1, 10001])
+
+ for n in range(n_tot_steps):
+ # if use_kv_caching and kv_cache is not None:
+ # x_input = x[:, [-1]]
+ # else:
+ # x_input = x
+
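+            # With kv caching active we feed only the newest token; otherwise we feed the whole prefix up to step n.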
+ x_input = (
+ x[:, [x_initial + n - 1]]
+ if use_kv_caching and kv_cache is not None
+ else x[:, : x_initial + n]
+ )
+ logits, kv_cache = model(
+ x_input, merge_context=True, use_cache=use_kv_caching, past_kv=kv_cache
+ )
+ relevant_logits = logits[0, 0, :SEMANTIC_VOCAB_SIZE]
+ if allow_early_stop:
+ relevant_logits = torch.hstack(
+ (relevant_logits, logits[0, 0, [SEMANTIC_PAD_TOKEN]]) # eos
+ )
+
+ # Detach and convert to numpy for faster calculations
+ original_device = relevant_logits.device
+ relevant_logits = relevant_logits.detach().cpu().type(torch.float32).numpy()
+
+            # Jon doing something silly here
+ if top_p is not None or semantic_inverted_p is not None:
+ if semantic_inverted_p is not None:
+ sorted_indices = np.argsort(relevant_logits)
+ cumulative_limit = semantic_inverted_p
+ elif top_p is not None:
+ sorted_indices = np.argsort(relevant_logits)[::-1]
+ cumulative_limit = top_p
+ sorted_logits = relevant_logits[sorted_indices]
+ cumulative_probs = np.cumsum(softmax(sorted_logits))
+ sorted_indices_to_remove = cumulative_probs > cumulative_limit
+ sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].copy()
+ sorted_indices_to_remove[0] = False
+ relevant_logits[sorted_indices[sorted_indices_to_remove]] = -np.inf
+
+ relevant_logits = torch.from_numpy(relevant_logits)
+ relevant_logits = relevant_logits.to(original_device)
+
+ if top_k is not None or semantic_bottom_k is not None:
+ if semantic_bottom_k is not None:
+ v, _ = torch.topk(
+ relevant_logits,
+                        min(semantic_bottom_k, relevant_logits.size(-1)),
+ largest=False,
+ )
+ relevant_logits[relevant_logits > v[-1]] = -float("Inf")
+ elif top_k is not None:
+ v, _ = torch.topk(relevant_logits, min(top_k, relevant_logits.size(-1)))
+ relevant_logits[relevant_logits < v[-1]] = -float("Inf")
+
+ if semantic_use_mirostat_sampling:
+ logits_for_miro = relevant_logits / temp
+ (
+ item_next,
+ max_surprise,
+ probs,
+ indices_surprise_history,
+ running_tot_surprise,
+ miro_generated,
+ ) = mirostat_sampling(
+ logits=logits_for_miro,
+ max_surprise=max_surprise,
+ tau=semantic_mirostat_tau,
+ learning_rate=semantic_mirostat_learning_rate,
+ vocab_size=SEMANTIC_VOCAB_SIZE,
+ indices_surprise_history=indices_surprise_history,
+ running_tot_surprise=running_tot_surprise,
+ generated=miro_generated,
+ temperature_fn=None,
+ )
+ # item_next = item_next.to(torch.int32)
+
+ else:
+ if semantic_token_repeat_penalty != 0.0 and semantic_token_repeat_penalty != 1.0:
+ for token, count in token_counts.items():
+ relevant_logits[token] += math.log(semantic_token_repeat_penalty) * count
+
+ if return_logits:
+ all_logits.append(relevant_logits)
+
+ if negative_logits is not None:
+ # debug(negative_logits)
+
+ # Compute the negative influence
+
+ neg_n = n - 1
+ if neg_n >= len(negative_logits):
+ neg_n = -1
+
+ if (
+ cum_negative_influence is not None
+ and negative_text_prompt_logits_sliding_scale is not None
+ and negative_text_prompt_logits_sliding_scale > 0
+ ):
+ negative_influence_torch = cum_negative_influence[neg_n]
+
+ negative_influence_torch = negative_influence_torch.squeeze()
+
+ relevant_logits -= negative_influence_torch
+
+ elif (
+ negative_text_prompt_divergence_scale is not None
+ and negative_text_prompt_divergence_scale > 0
+ ):
+ negative_probs = (
+ F.softmax(negative_logits[neg_n], dim=-1).cpu().detach().numpy()
+ )
+ positive_probs = F.softmax(relevant_logits, dim=-1).cpu().detach().numpy()
+ divergence = negative_text_prompt_divergence_scale * distance.jensenshannon(
+ negative_probs, positive_probs
+ )
+ relevant_logits -= (
+ torch.tensor(divergence).to(device) * negative_logits[neg_n]
+ )
+
+ elif (
+ negative_text_prompt_logits_scale is not None
+ and negative_text_prompt_logits_scale > 0
+ ):
+ relevant_logits -= (
+ negative_text_prompt_logits_scale * negative_logits[neg_n]
+ )
+
+ relevant_logits = torch.where(
+ torch.isfinite(relevant_logits),
+ relevant_logits,
+ torch.tensor(-1e10).to(device),
+ )
+
+ probs = F.softmax(relevant_logits / temp, dim=-1)
+ item_next = torch.multinomial(probs, num_samples=1).to(torch.int32)
+
+ if allow_early_stop and (
+ item_next == SEMANTIC_VOCAB_SIZE
+ or (min_eos_p is not None and probs[-1] >= min_eos_p)
+ ):
+ n -= 1 # backtrack 1
+ # eos found, so break
+ pbar.total = n
+ pbar.update(n - pbar_state)
+
+ break
+ # x = torch.cat((x, item_next[None]), dim=1)
+ if semantic_token_repeat_penalty != 0.0 and semantic_token_repeat_penalty != 1.0:
+ token_counts[int(item_next)] += 1
+
+ x[0][x_initial + n] = item_next
+ tot_generated_duration_s += 1 / SEMANTIC_RATE_HZ
+ if max_gen_duration_s is not None and tot_generated_duration_s > max_gen_duration_s:
+ pbar.total = n
+ pbar.update(n - pbar_state)
+ break
+ if n == n_tot_steps - 1:
+ pbar.total = n
+ pbar.update(n - pbar_state)
+ break
+ del logits, relevant_logits, probs, item_next
+ if n > pbar_state:
+ if n > pbar.total:
+ pbar.total = n
+ pbar.update(n - pbar_state)
+ pbar_state = n
+ pbar.total = n
+ pbar.refresh()
+
+ pbar.close()
+ # out = x.detach().cpu().numpy().squeeze()[256 + 256 + 1 :]
+ out = x.detach().cpu().numpy().squeeze()[x_initial : x_initial + n + 1]
+ if semantic_use_mirostat_sampling and False:
+ print(f"Target tau: {semantic_mirostat_tau}")
+ print("Total surprise value:", sum(indices_surprise_history))
+ print("Average surprise value:", sum(indices_surprise_history) / len(out))
+ print(f"Generated Miro: {miro_generated}")
+ print(f"out: {out}")
+ if OFFLOAD_CPU:
+ model.to("cpu")
+ assert all(0 <= out) and all(out < SEMANTIC_VOCAB_SIZE)
+ _clear_cuda_cache()
+
+ if SUNO_USE_DIRECTML is True:
+ clean_models()
+
+ if return_logits:
+ return out, all_logits
+ else:
+ return out
+
+
+def generate_text_semantic_branching_not_batching(
+ text,
+ history_prompt=None,
+ temp=0.7,
+ top_k=None,
+ top_p=None,
+ silent=False,
+ min_eos_p=0.2,
+ max_gen_duration_s=None,
+ allow_early_stop=True,
+ use_kv_caching=True,
+ num_sample_per_step=2,
+):
+ """Generate semantic tokens from text."""
+ assert isinstance(text, str)
+ text = _normalize_whitespace(text)
+ assert len(text.strip()) > 0
+ if history_prompt is not None:
+ history_prompt = _load_history_prompt(history_prompt)
+ semantic_history = history_prompt["semantic_prompt"]
+ assert (
+ isinstance(semantic_history, np.ndarray)
+ and len(semantic_history.shape) == 1
+ and len(semantic_history) > 0
+ and semantic_history.min() >= 0
+ and semantic_history.max() <= SEMANTIC_VOCAB_SIZE - 1
+ )
+ else:
+ semantic_history = None
+ # load models if not yet exist
+ global models
+ global models_devices
+ if "text" not in models:
+ if SUNO_USE_DIRECTML is True:
+ preload_models(load_one_model_type="text")
+ else:
+ preload_models()
+ model_container = models["text"]
+ model = model_container["model"]
+ tokenizer = model_container["tokenizer"]
+ encoded_text = np.array(_tokenize(tokenizer, text)) + TEXT_ENCODING_OFFSET
+ if OFFLOAD_CPU:
+ model.to(models_devices["text"])
+ device = next(model.parameters()).device
+ if len(encoded_text) > 256:
+ p = round((len(encoded_text) - 256) / len(encoded_text) * 100, 1)
+ logger.warning(f"warning, text too long, lopping of last {p}%")
+ encoded_text = encoded_text[:256]
+ encoded_text = np.pad(
+ encoded_text,
+ (0, 256 - len(encoded_text)),
+ constant_values=TEXT_PAD_TOKEN,
+ mode="constant",
+ )
+ if semantic_history is not None:
+ semantic_history = semantic_history.astype(np.int64)
+ # lop off if history is too long, pad if needed
+ semantic_history = semantic_history[-256:]
+ semantic_history = np.pad(
+ semantic_history,
+ (0, 256 - len(semantic_history)),
+ constant_values=SEMANTIC_PAD_TOKEN,
+ mode="constant",
+ )
+ else:
+ semantic_history = np.array([SEMANTIC_PAD_TOKEN] * 256)
+ # x = torch.from_numpy(
+ # np.hstack([
+ # encoded_text, semantic_history, np.array([SEMANTIC_INFER_TOKEN])
+ # ]).astype(np.int64)
+ # )[None]
+
+ x = torch.from_numpy(
+ np.hstack([encoded_text, semantic_history, np.array([SEMANTIC_INFER_TOKEN])]).astype(
+ np.int64
+ )
+ ).repeat(num_sample_per_step, 1)
+
+ assert x.shape[1] == 256 + 256 + 1
+ with _inference_mode():
+ x = x.to(device)
+ n_tot_steps = 768
+ # custom tqdm updates since we don't know when eos will occur
+ pbar = tqdm.tqdm(disable=silent, total=n_tot_steps)
+ pbar_state = 0
+ tot_generated_duration_s = 0
+ kv_cache = None
+ for n in range(n_tot_steps):
+ if use_kv_caching and kv_cache is not None:
+ x_input = x[:, [-1]]
+ else:
+ x_input = x
+ logits, kv_cache = model(
+ x_input, merge_context=True, use_cache=use_kv_caching, past_kv=kv_cache
+ )
+ relevant_logits = logits[0, 0, :SEMANTIC_VOCAB_SIZE]
+ if allow_early_stop:
+ relevant_logits = torch.hstack(
+ (relevant_logits, logits[0, 0, [SEMANTIC_PAD_TOKEN]]) # eos
+ )
+ if top_p is not None:
+ # faster to convert to numpy
+ original_device = relevant_logits.device
+ relevant_logits = relevant_logits.detach().cpu().type(torch.float32).numpy()
+ sorted_indices = np.argsort(relevant_logits)[::-1]
+ sorted_logits = relevant_logits[sorted_indices]
+ cumulative_probs = np.cumsum(softmax(sorted_logits))
+ sorted_indices_to_remove = cumulative_probs > top_p
+ sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].copy()
+ sorted_indices_to_remove[0] = False
+ relevant_logits[sorted_indices[sorted_indices_to_remove]] = -np.inf
+ relevant_logits = torch.from_numpy(relevant_logits)
+ relevant_logits = relevant_logits.to(original_device)
+ if top_k is not None:
+ v, _ = torch.topk(relevant_logits, min(top_k, relevant_logits.size(-1)))
+ relevant_logits[relevant_logits < v[-1]] = -float("Inf")
+ # probs = F.softmax(relevant_logits / temp, dim=-1)
+ # item_next = torch.multinomial(probs, num_samples=1).to(torch.int32)
+
+ probs = F.softmax(relevant_logits / temp, dim=-1)
+ item_next = torch.multinomial(probs, num_samples=num_sample_per_step).to(torch.int32)
+ # item_next holds one sample per branch, so any branch hitting eos stops all branches
+ if allow_early_stop and (
+ (item_next == SEMANTIC_VOCAB_SIZE).any()
+ or (min_eos_p is not None and probs[-1] >= min_eos_p)
+ ):
+ # eos found, so break
+ pbar.update(n - pbar_state)
+ break
+ # x = torch.cat((x, item_next[None]), dim=1)
+ # growing each row separately fails because the row length changes,
+ # so append one sampled token per branch to the whole tensor at once
+ x = torch.cat((x, item_next[:, None].to(x.dtype)), dim=1)
+ tot_generated_duration_s += 1 / SEMANTIC_RATE_HZ
+ if max_gen_duration_s is not None and tot_generated_duration_s > max_gen_duration_s:
+ pbar.update(n - pbar_state)
+ break
+ if n == n_tot_steps - 1:
+ pbar.update(n - pbar_state)
+ break
+ del logits, relevant_logits, probs, item_next
+ if n > pbar_state:
+ if n > pbar.total:
+ pbar.total = n
+ pbar.update(n - pbar_state)
+ pbar_state = n
+ pbar.total = n
+ pbar.refresh()
+ pbar.close()
+ out = x.detach().cpu().numpy()[:, 256 + 256 + 1 :]
+ if OFFLOAD_CPU:
+ model.to("cpu")
+ assert all(0 <= out) and all(out < SEMANTIC_VOCAB_SIZE)
+ _clear_cuda_cache()
+ return out
+
+
+def generate_coarse(
+ x_semantic,
+ history_prompt=None,
+ temp=0.7,
+ top_k=None,
+ top_p=None,
+ silent=False,
+ max_coarse_history=630, # min 60 (faster), max 630 (more context)
+ sliding_window_len=60,
+ use_kv_caching=True,
+ x_coarse_history_alignment_hack=-2,
+):
+ """Generate coarse audio codes from semantic tokens."""
+
+ logger.debug(locals())
+ assert (
+ isinstance(x_semantic, np.ndarray)
+ and len(x_semantic.shape) == 1
+ and len(x_semantic) > 0
+ and x_semantic.min() >= 0
+ and x_semantic.max() <= SEMANTIC_VOCAB_SIZE - 1
+ )
+ assert 60 <= max_coarse_history <= 630
+ assert max_coarse_history + sliding_window_len <= 1024 - 256
+ semantic_to_coarse_ratio = COARSE_RATE_HZ / SEMANTIC_RATE_HZ * N_COARSE_CODEBOOKS
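+ # ~75 Hz coarse frames * 2 codebooks vs ~49.9 Hz semantic tokens, so roughly
+ # 3 flattened coarse tokens are generated per semantic token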
+
+ max_semantic_history = int(np.floor(max_coarse_history / semantic_to_coarse_ratio))
+ if history_prompt is not None:
+ history_prompt = _load_history_prompt(history_prompt)
+ x_semantic_history = history_prompt["semantic_prompt"]
+ x_coarse_history = history_prompt["coarse_prompt"]
+
+ # print(f"Pre Trim sem coars: {x_semantic_history.shape} {x_coarse_history.shape}")
+ assert (
+ isinstance(x_semantic_history, np.ndarray)
+ and len(x_semantic_history.shape) == 1
+ and len(x_semantic_history) > 0
+ and x_semantic_history.min() >= 0
+ and x_semantic_history.max() <= SEMANTIC_VOCAB_SIZE - 1
+ and isinstance(x_coarse_history, np.ndarray)
+ and len(x_coarse_history.shape) == 2
+ and x_coarse_history.shape[0] == N_COARSE_CODEBOOKS
+ and x_coarse_history.shape[-1] >= 0
+ and x_coarse_history.min() >= 0
+ and x_coarse_history.max() <= CODEBOOK_SIZE - 1
+ and (
+ round(x_coarse_history.shape[-1] / len(x_semantic_history), 1)
+ == round(semantic_to_coarse_ratio / N_COARSE_CODEBOOKS, 1)
+ )
+ )
+
+ x_coarse_history = _flatten_codebooks(x_coarse_history) + SEMANTIC_VOCAB_SIZE
+ # trim histories correctly
+ n_semantic_hist_provided = np.min(
+ [
+ max_semantic_history,
+ len(x_semantic_history) - len(x_semantic_history) % 2,
+ int(np.floor(len(x_coarse_history) / semantic_to_coarse_ratio)),
+ ]
+ )
+ n_coarse_hist_provided = int(round(n_semantic_hist_provided * semantic_to_coarse_ratio))
+ x_semantic_history = x_semantic_history[-n_semantic_hist_provided:].astype(np.int32)
+ x_coarse_history = x_coarse_history[-n_coarse_hist_provided:].astype(np.int32)
+ # TODO: bit of a hack for time alignment (sounds better)
+ # x_coarse_history = x_coarse_history[:-2]
+ x_coarse_history = x_coarse_history[:x_coarse_history_alignment_hack]
+
+ else:
+ x_semantic_history = np.array([], dtype=np.int32)
+ x_coarse_history = np.array([], dtype=np.int32)
+
+ # print(f"actual lengths we're using, x_semantic_history: {len(x_semantic_history)} x_coarse_history: {len(x_coarse_history)}")
+
+ # load models if not yet exist
+ global models
+ global models_devices
+ if "coarse" not in models:
+ if SUNO_USE_DIRECTML is True:
+ preload_models(load_one_model_type="coarse")
+ else:
+ preload_models()
+ model = models["coarse"]
+ if OFFLOAD_CPU:
+ if GLOBAL_ENABLE_MPS:
+ device = _grab_best_device(use_gpu=False)
+ models_devices["coarse"] = device
+ model.to(models_devices["coarse"])
+
+ device = next(model.parameters()).device
+ # start loop
+ n_steps = int(
+ round(
+ np.floor(len(x_semantic) * semantic_to_coarse_ratio / N_COARSE_CODEBOOKS)
+ * N_COARSE_CODEBOOKS
+ )
+ )
+ assert n_steps > 0 and n_steps % N_COARSE_CODEBOOKS == 0
+
+ # reminder to try filling up some of the COARSE_INFER_TOKEN with history to get better short clips
+ x_semantic = np.hstack([x_semantic_history, x_semantic]).astype(np.int32)
+ x_coarse = x_coarse_history.astype(np.int32)
+ base_semantic_idx = len(x_semantic_history)
+ with _inference_mode():
+ if SUNO_USE_DIRECTML is True:
+ device = dml
+ x_semantic_in = torch.from_numpy(x_semantic)[None].to(device)
+ x_coarse_in = torch.from_numpy(x_coarse)[None].to(device)
+ n_window_steps = int(np.ceil(n_steps / sliding_window_len))
+ n_step = 0
+ for _ in tqdm.tqdm(range(n_window_steps), total=n_window_steps, disable=silent):
+ semantic_idx = base_semantic_idx + int(round(n_step / semantic_to_coarse_ratio))
+ # pad from right side
+ x_in = x_semantic_in[:, np.max([0, semantic_idx - max_semantic_history]) :]
+ x_in = x_in[:, :256]
+ x_in = F.pad(
+ x_in,
+ (0, 256 - x_in.shape[-1]),
+ "constant",
+ COARSE_SEMANTIC_PAD_TOKEN,
+ )
+
+ x_in = torch.hstack(
+ [
+ x_in,
+ torch.tensor([COARSE_INFER_TOKEN])[None].to(device),
+ x_coarse_in[:, -max_coarse_history:],
+ ]
+ )
+ kv_cache = None
+ for _ in range(sliding_window_len):
+ if n_step >= n_steps:
+ continue
+ is_major_step = n_step % N_COARSE_CODEBOOKS == 0
+
+ if use_kv_caching and kv_cache is not None:
+ x_input = x_in[:, [-1]]
+ else:
+ x_input = x_in
+
+ logits, kv_cache = model(x_input, use_cache=use_kv_caching, past_kv=kv_cache)
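+ # the coarse model interleaves its two codebooks: "major" steps
+ # (n_step % N_COARSE_CODEBOOKS == 0) predict codebook 0 and the following
+ # step predicts codebook 1, each in its own CODEBOOK_SIZE-wide slice of
+ # the output vocabulary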
+ logit_start_idx = SEMANTIC_VOCAB_SIZE + (1 - int(is_major_step)) * CODEBOOK_SIZE
+ logit_end_idx = SEMANTIC_VOCAB_SIZE + (2 - int(is_major_step)) * CODEBOOK_SIZE
+ relevant_logits = logits[0, 0, logit_start_idx:logit_end_idx]
+ if top_p is not None:
+ # faster to convert to numpy
+ logits_device = relevant_logits.device
+ logits_dtype = relevant_logits.type()
+ relevant_logits = relevant_logits.detach().cpu().type(torch.float32).numpy()
+ sorted_indices = np.argsort(relevant_logits)[::-1]
+ sorted_logits = relevant_logits[sorted_indices]
+ cumulative_probs = np.cumsum(softmax(sorted_logits))
+ sorted_indices_to_remove = cumulative_probs > top_p
+ sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].copy()
+ sorted_indices_to_remove[0] = False
+ relevant_logits[sorted_indices[sorted_indices_to_remove]] = -np.inf
+ relevant_logits = torch.from_numpy(relevant_logits)
+ relevant_logits = relevant_logits.to(logits_device).type(logits_dtype)
+ if top_k is not None:
+ v, _ = torch.topk(relevant_logits, min(top_k, relevant_logits.size(-1)))
+ relevant_logits[relevant_logits < v[-1]] = -float("Inf")
+ probs = F.softmax(relevant_logits / temp, dim=-1)
+ # multinomial bugged on mps: shuttle to cpu if necessary
+ inf_device = probs.device
+ if probs.device.type == "mps":
+ probs = probs.to("cpu")
+ item_next = torch.multinomial(probs, num_samples=1)
+ probs = probs.to(inf_device)
+ item_next = item_next.to(inf_device)
+ item_next += logit_start_idx
+ x_coarse_in = torch.cat((x_coarse_in, item_next[None]), dim=1)
+ x_in = torch.cat((x_in, item_next[None]), dim=1)
+ del logits, relevant_logits, probs, item_next
+ n_step += 1
+ del x_in
+ del x_semantic_in
+ if OFFLOAD_CPU:
+ model.to("cpu")
+ gen_coarse_arr = x_coarse_in.detach().cpu().numpy().squeeze()[len(x_coarse_history) :]
+ del x_coarse_in
+ assert len(gen_coarse_arr) == n_steps
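+ # undo the codebook flattening: reshape back to (N_COARSE_CODEBOOKS, T) and
+ # remove the per-codebook offsets so every entry is back in [0, CODEBOOK_SIZE)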
+ gen_coarse_audio_arr = gen_coarse_arr.reshape(-1, N_COARSE_CODEBOOKS).T - SEMANTIC_VOCAB_SIZE
+ for n in range(1, N_COARSE_CODEBOOKS):
+ gen_coarse_audio_arr[n, :] -= n * CODEBOOK_SIZE
+ _clear_cuda_cache()
+ if SUNO_USE_DIRECTML is True:
+ clean_models()
+ return gen_coarse_audio_arr
+
+
+def generate_coarse_amd_directml(
+ x_semantic,
+ history_prompt=None,
+ temp=0.7,
+ top_k=None,
+ top_p=None,
+ silent=False,
+ max_coarse_history=630, # min 60 (faster), max 630 (more context)
+ sliding_window_len=60,
+ use_kv_caching=True,
+ x_coarse_history_alignment_hack=-2,
+):
+ """Generate coarse audio codes from semantic tokens."""
+
+ logger.debug(locals())
+
+ assert (
+ isinstance(x_semantic, np.ndarray)
+ and len(x_semantic.shape) == 1
+ and len(x_semantic) > 0
+ and x_semantic.min() >= 0
+ and x_semantic.max() <= SEMANTIC_VOCAB_SIZE - 1
+ )
+ assert 60 <= max_coarse_history <= 630
+ assert max_coarse_history + sliding_window_len <= 1024 - 256
+ semantic_to_coarse_ratio = COARSE_RATE_HZ / SEMANTIC_RATE_HZ * N_COARSE_CODEBOOKS
+ max_semantic_history = int(np.floor(max_coarse_history / semantic_to_coarse_ratio))
+ if history_prompt is not None:
+ history_prompt = _load_history_prompt(history_prompt)
+ x_semantic_history = history_prompt["semantic_prompt"]
+ x_coarse_history = history_prompt["coarse_prompt"]
+ assert (
+ isinstance(x_semantic_history, np.ndarray)
+ and len(x_semantic_history.shape) == 1
+ and len(x_semantic_history) > 0
+ and x_semantic_history.min() >= 0
+ and x_semantic_history.max() <= SEMANTIC_VOCAB_SIZE - 1
+ and isinstance(x_coarse_history, np.ndarray)
+ and len(x_coarse_history.shape) == 2
+ and x_coarse_history.shape[0] == N_COARSE_CODEBOOKS
+ and x_coarse_history.shape[-1] >= 0
+ and x_coarse_history.min() >= 0
+ and x_coarse_history.max() <= CODEBOOK_SIZE - 1
+ and (
+ round(x_coarse_history.shape[-1] / len(x_semantic_history), 1)
+ == round(semantic_to_coarse_ratio / N_COARSE_CODEBOOKS, 1)
+ )
+ )
+ x_coarse_history = _flatten_codebooks(x_coarse_history) + SEMANTIC_VOCAB_SIZE
+ # trim histories correctly
+ n_semantic_hist_provided = np.min(
+ [
+ max_semantic_history,
+ len(x_semantic_history) - len(x_semantic_history) % 2,
+ int(np.floor(len(x_coarse_history) / semantic_to_coarse_ratio)),
+ ]
+ )
+ n_coarse_hist_provided = int(round(n_semantic_hist_provided * semantic_to_coarse_ratio))
+ x_semantic_history = x_semantic_history[-n_semantic_hist_provided:].astype(np.int32)
+ x_coarse_history = x_coarse_history[-n_coarse_hist_provided:].astype(np.int32)
+ # TODO: bit of a hack for time alignment (sounds better)
+ x_coarse_history = x_coarse_history[:-2]
+ else:
+ x_semantic_history = np.array([], dtype=np.int32)
+ x_coarse_history = np.array([], dtype=np.int32)
+ # load models if not yet exist
+ global models
+ global models_devices
+ if "coarse" not in models:
+ if SUNO_USE_DIRECTML is True:
+ preload_models(load_one_model_type="coarse")
+ else:
+ preload_models()
+ model = models["coarse"]
+ if OFFLOAD_CPU:
+ if GLOBAL_ENABLE_MPS:
+ device = _grab_best_device(use_gpu=False)
+ models_devices["coarse"] = device
+ model.to(models_devices["coarse"])
+ # device = next(model.parameters()).device
+
+ # start loop
+ n_steps = int(
+ round(
+ np.floor(len(x_semantic) * semantic_to_coarse_ratio / N_COARSE_CODEBOOKS)
+ * N_COARSE_CODEBOOKS
+ )
+ )
+ assert n_steps > 0 and n_steps % N_COARSE_CODEBOOKS == 0
+ x_semantic = np.hstack([x_semantic_history, x_semantic]).astype(np.int32)
+ x_coarse = x_coarse_history.astype(np.int32)
+ base_semantic_idx = len(x_semantic_history)
+ cumulative_time = 0
+ with _inference_mode():
+ try:
+ # x_semantic_in = torch.from_numpy(x_semantic)[None].to(dml)
+ x_semantic_in_np = x_semantic[None]
+ # x_coarse_in = torch.from_numpy(x_coarse)[None].to(dml)
+ x_coarse_in_np = x_coarse[None]
+ n_window_steps = int(np.ceil(n_steps / sliding_window_len))
+ n_step = 0
+ for _ in tqdm.tqdm(range(n_window_steps), total=n_window_steps, disable=silent):
+ semantic_idx = base_semantic_idx + int(round(n_step / semantic_to_coarse_ratio))
+ # pad from right side
+ x_in_np = x_semantic_in_np[:, np.max([0, semantic_idx - max_semantic_history]) :]
+ x_in_np = x_in_np[:, :256]
+ """
+ x_in_np = F.pad(
+ x_in_np,
+ (0, 256 - x_in_np.shape[-1]),
+ "constant",
+ COARSE_SEMANTIC_PAD_TOKEN,
+ )
+ """
+ np_pad_size = ((0, 0), (0, 256 - x_in_np.shape[-1]))
+ x_in_np = np.pad(
+ x_in_np,
+ np_pad_size,
+ constant_values=COARSE_SEMANTIC_PAD_TOKEN,
+ mode="constant",
+ )
+
+ """
+ x_in = torch.hstack(
+ [
+ x_in,
+ torch.tensor([COARSE_INFER_TOKEN])[None].to(dml),
+ x_coarse_in[:, -max_coarse_history:],
+ ]
+ )
+ """
+
+ coarse_infer_token_np = np.array([COARSE_INFER_TOKEN])[None]
+
+ x_in_np = np.hstack(
+ [
+ x_in_np,
+ coarse_infer_token_np,
+ x_coarse_in_np[:, -max_coarse_history:],
+ ]
+ )
+
+ kv_cache = None
+ for _ in range(sliding_window_len):
+ if n_step >= n_steps:
+ continue
+ is_major_step = n_step % N_COARSE_CODEBOOKS == 0
+
+ if use_kv_caching and kv_cache is not None:
+ x_input = x_in_np[:, [-1]]
+ else:
+ x_input = x_in_np
+
+ x_input_tensor = torch.from_numpy(x_input).to(dml)
+
+ logits, kv_cache = model(
+ x_input_tensor, use_cache=use_kv_caching, past_kv=kv_cache
+ )
+
+ logit_start_idx = SEMANTIC_VOCAB_SIZE + (1 - int(is_major_step)) * CODEBOOK_SIZE
+ logit_end_idx = SEMANTIC_VOCAB_SIZE + (2 - int(is_major_step)) * CODEBOOK_SIZE
+ relevant_logits = logits[0, 0, logit_start_idx:logit_end_idx]
+
+ if top_p is not None:
+ # faster to convert to numpy
+ # original_device = relevant_logits.device
+ relevant_logits = relevant_logits.detach().cpu().type(torch.float32).numpy()
+ sorted_indices = np.argsort(relevant_logits)[::-1]
+ sorted_logits = relevant_logits[sorted_indices]
+ cumulative_probs = np.cumsum(softmax(sorted_logits))
+ sorted_indices_to_remove = cumulative_probs > top_p
+ sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].copy()
+ sorted_indices_to_remove[0] = False
+ relevant_logits[sorted_indices[sorted_indices_to_remove]] = -np.inf
+ relevant_logits = torch.from_numpy(relevant_logits)
+ # relevant_logits = relevant_logits.to(original_device)
+ # stay as numpy, since we converted for directml anyway...
+ if top_k is not None:
+ v, _ = torch.topk(
+ relevant_logits.to(dml),
+ min(top_k, relevant_logits.to(dml).size(-1)),
+ )
+ relevant_logits[relevant_logits < v[-1]] = -float("Inf")
+
+ # probs = F.softmax(relevant_logits.to(dml) / temp, dim=-1)
+
+ start_time = time.time()
+
+ # item_next = torch.multinomial(probs, num_samples=1).to(torch.int32)
+
+ probs_np = (
+ F.softmax(relevant_logits.to(dml) / temp, dim=-1)
+ .cpu()
+ .type(torch.float32)
+ .numpy()
+ )
+
+ item_next_np = np.random.choice(
+ np.arange(probs_np.shape[-1]), size=1, p=probs_np.flatten()
+ )
+
+ # item_next = torch.from_numpy(item_next_np).to(torch.int32).to(dml)
+
+ # doing this in raw numpy is the same speed with AMD DirectML, but maybe faster if you set up MKL correctly?
+ # actually that wasn't quite right anyway...
+ end_time = time.time()
+ cumulative_time = cumulative_time + (end_time - start_time)
+
+ # amd_multinomial = torch_distributions.Categorical(probs)
+ # action = amd_multinomial.sample((1,))
+ # item_next = amd_multinomial.log_prob(action).to(torch.int32)
+
+ # multinomial bugged on mps: shuttle to cpu if necessary
+ # inf_device = probs.device
+ # if probs.device.type == "mps" or True:
+ # probs = probs.to("cpu")
+ # # print(f"Here in coarse: {probs.device}")
+ # item_next = torch.multinomial(probs, num_samples=1)
+ # probs = probs.to(inf_device)
+ # item_next = item_next.to(inf_device)
+
+ item_next_np += logit_start_idx
+
+ x_coarse_in_np = np.hstack((x_coarse_in_np, item_next_np[None]))
+
+ # x_coarse_in = torch.from_numpy(x_coarse_in_np).to(dml)
+ # x_in = torch.cat((x_in_np.to(dml), item_next_np[None]), dim=1)
+
+ x_in_np = np.hstack((x_in_np, item_next_np[None]))
+ del logits, relevant_logits, probs_np, item_next_np
+ n_step += 1
+ del x_in_np
+ del x_semantic_in_np
+ except RuntimeError as e:
+ print(f"RuntimeError: {e}")
+ # show all possible details and traceback, print to output
+ print(f"Traceback: {traceback.format_exc()}") # and print(sys.exc_info()[2])
+ print(f"Exception: {sys.exc_info()[2]}")
+
+ if OFFLOAD_CPU:
+ model.to("cpu")
+ gen_coarse_arr = x_coarse_in_np.squeeze()[len(x_coarse_history) :]
+ del x_coarse_in_np
+ assert len(gen_coarse_arr) == n_steps
+ gen_coarse_audio_arr = gen_coarse_arr.reshape(-1, N_COARSE_CODEBOOKS).T - SEMANTIC_VOCAB_SIZE
+ for n in range(1, N_COARSE_CODEBOOKS):
+ gen_coarse_audio_arr[n, :] -= n * CODEBOOK_SIZE
+ _clear_cuda_cache()
+ if SUNO_USE_DIRECTML is True:
+ clean_models()
+ return gen_coarse_audio_arr
+
+
+def generate_fine(
+ x_coarse_gen,
+ history_prompt=None,
+ temp=0.5,
+ silent=True,
+):
+ """Generate full audio codes from coarse audio codes."""
+ if temp == 0:
+ temp = 0.001
+
+ assert (
+ isinstance(x_coarse_gen, np.ndarray)
+ and len(x_coarse_gen.shape) == 2
+ and 1 <= x_coarse_gen.shape[0] <= N_FINE_CODEBOOKS - 1
+ and x_coarse_gen.shape[1] > 0
+ and x_coarse_gen.min() >= 0
+ and x_coarse_gen.max() <= CODEBOOK_SIZE - 1
+ )
+ if history_prompt is not None:
+ history_prompt = _load_history_prompt(history_prompt)
+ x_fine_history = history_prompt["fine_prompt"]
+ assert (
+ isinstance(x_fine_history, np.ndarray)
+ and len(x_fine_history.shape) == 2
+ and x_fine_history.shape[0] == N_FINE_CODEBOOKS
+ and x_fine_history.shape[1] >= 0
+ and x_fine_history.min() >= 0
+ and x_fine_history.max() <= CODEBOOK_SIZE - 1
+ )
+ else:
+ x_fine_history = None
+ n_coarse = x_coarse_gen.shape[0]
+ # load models if not yet exist
+ global models
+ global models_devices
+ if "fine" not in models:
+ if SUNO_USE_DIRECTML is True:
+ preload_models(load_one_model_type="fine")
+ else:
+ preload_models()
+ model = models["fine"]
+ if OFFLOAD_CPU:
+ if GLOBAL_ENABLE_MPS:
+ device = _grab_best_device(use_gpu=False)
+ models_devices["fine"] = device
+ model.to(models_devices["fine"])
+ device = next(model.parameters()).device
+ # make input arr
+ in_arr = np.vstack(
+ [
+ x_coarse_gen,
+ np.zeros((N_FINE_CODEBOOKS - n_coarse, x_coarse_gen.shape[1]))
+ + CODEBOOK_SIZE, # padding
+ ]
+ ).astype(np.int32)
+ # prepend history if available (max 512)
+ if x_fine_history is not None:
+ x_fine_history = x_fine_history.astype(np.int32)
+ in_arr = np.hstack(
+ [
+ x_fine_history[:, -512:].astype(np.int32),
+ in_arr,
+ ]
+ )
+ n_history = x_fine_history[:, -512:].shape[1]
+ else:
+ n_history = 0
+ n_remove_from_end = 0
+ # need to pad if too short (since non-causal model)
+ if in_arr.shape[1] < 1024:
+ n_remove_from_end = 1024 - in_arr.shape[1]
+ in_arr = np.hstack(
+ [
+ in_arr,
+ np.zeros((N_FINE_CODEBOOKS, n_remove_from_end), dtype=np.int32) + CODEBOOK_SIZE,
+ ]
+ )
+ # we can be lazy about fractional loop and just keep overwriting codebooks
+ n_loops = np.max([0, int(np.ceil((x_coarse_gen.shape[1] - (1024 - n_history)) / 512))]) + 1
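+ # the fine model is non-causal and works on fixed 1024-step windows; each loop
+ # advances by 512 steps and rewrites codebooks n_coarse..N_FINE_CODEBOOKS-1 in place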
+ with _inference_mode():
+ if SUNO_USE_DIRECTML is True:
+ device = dml
+ in_arr = torch.tensor(in_arr.T).to(device)
+ for n in tqdm.tqdm(range(n_loops), disable=silent):
+ start_idx = np.min([n * 512, in_arr.shape[0] - 1024])
+ start_fill_idx = np.min([n_history + n * 512, in_arr.shape[0] - 512])
+ rel_start_fill_idx = start_fill_idx - start_idx
+ in_buffer = in_arr[start_idx : start_idx + 1024, :][None]
+ for nn in range(n_coarse, N_FINE_CODEBOOKS):
+ logits = model(nn, in_buffer)
+ if temp is None:
+ relevant_logits = logits[0, rel_start_fill_idx:, :CODEBOOK_SIZE]
+ codebook_preds = torch.argmax(relevant_logits, -1)
+ else:
+ relevant_logits = logits[0, :, :CODEBOOK_SIZE] / temp
+ probs = F.softmax(relevant_logits, dim=-1)
+ codebook_preds = torch.multinomial(
+ probs[rel_start_fill_idx:1024], num_samples=1
+ ).reshape(-1)
+ codebook_preds = codebook_preds.to(torch.int32)
+ in_buffer[0, rel_start_fill_idx:, nn] = codebook_preds
+ del logits, codebook_preds
+ # transfer over info into model_in and convert to numpy
+ for nn in range(n_coarse, N_FINE_CODEBOOKS):
+ in_arr[
+ start_fill_idx : start_fill_idx + (1024 - rel_start_fill_idx), nn
+ ] = in_buffer[0, rel_start_fill_idx:, nn]
+ del in_buffer
+ gen_fine_arr = in_arr.detach().cpu().numpy().squeeze().T
+ del in_arr
+ if OFFLOAD_CPU:
+ model.to("cpu")
+ gen_fine_arr = gen_fine_arr[:, n_history:]
+ if n_remove_from_end > 0:
+ gen_fine_arr = gen_fine_arr[:, :-n_remove_from_end]
+ assert gen_fine_arr.shape[-1] == x_coarse_gen.shape[-1]
+ _clear_cuda_cache()
+ if SUNO_USE_DIRECTML is True:
+ clean_models()
+ return gen_fine_arr
+
+
+def _flatten_codebooks(arr, offset_size=CODEBOOK_SIZE):
+ assert len(arr.shape) == 2
+ arr = arr.copy()
+ if offset_size is not None:
+ for n in range(1, arr.shape[0]):
+ arr[n, :] += offset_size * n
+ flat_arr = arr.ravel("F")
+ return flat_arr
+
+
+COARSE_SEMANTIC_PAD_TOKEN = 12_048
+COARSE_INFER_TOKEN = 12_050
+
+
+def codec_decode(fine_tokens):
+ """Turn quantized audio codes into audio array using encodec."""
+ # load models if not yet exist
+ global models
+ global models_devices
+ if "codec" not in models:
+ if SUNO_USE_DIRECTML is True:
+ preload_models(load_one_model_type="codec")
+ else:
+ preload_models()
+ model = models["codec"]
+ if OFFLOAD_CPU:
+ if GLOBAL_ENABLE_MPS:
+ device = _grab_best_device(use_gpu=False)
+ models_devices["codec"] = device
+ model.to(models_devices["codec"])
+ device = next(model.parameters()).device
+ arr = torch.from_numpy(fine_tokens)[None]
+ if SUNO_USE_DIRECTML is True:
+ arr = arr.to(dml)
+ else:
+ arr = arr.to(device)
+ arr = arr.transpose(0, 1)
+ emb = model.quantizer.decode(arr)
+ out = model.decoder(emb)
+ audio_arr = out.detach().cpu().numpy().squeeze()
+ del arr, emb, out
+ if OFFLOAD_CPU:
+ model.to("cpu")
+ if SUNO_USE_DIRECTML is True:
+ clean_models()
+ return audio_arr
+
+
+## Added:
+
+
+# Just overriding this because somehow I keep loading the wrong models?
+def load_model(use_gpu=True, use_small=False, force_reload=False, model_type="text"):
+ logger.debug(locals())
+
+ _load_model_f = funcy.partial(_load_model, model_type=model_type, use_small=use_small)
+ if model_type not in ("text", "coarse", "fine"):
+ raise NotImplementedError()
+ global models
+ global models_devices
+ device = _grab_best_device(use_gpu=use_gpu)
+ model_key = f"{model_type}"
+ if OFFLOAD_CPU:
+ models_devices[model_key] = device
+ device = "cpu"
+ if model_key not in models or force_reload:
+ ckpt_path = _get_ckpt_path(model_type, use_small=use_small)
+ clean_models(model_key=model_key)
+ model = _load_model_f(ckpt_path, device)
+ models[model_key] = model
+ if model_type == "text":
+ if SUNO_USE_DIRECTML is True:
+ models[model_key]["model"].to(dml)
+ else:
+ models[model_key]["model"].to(device)
+ else:
+ if SUNO_USE_DIRECTML is True:
+ models[model_key].to(dml)
+ else:
+ models[model_key].to(device)
+ logger.debug(f"Loaded {model_key} onto {device}.")
+ return models[model_key]
+
+
+def print_loading_info(model_key, ckpt_path, device):
+ device_str = str(device)
+ if SUNO_USE_DIRECTML is True:
+ device_str = "directml (partial AMD GPU support)"
+ if GLOBAL_ENABLE_MPS:
+ device_str = "cpu/mps: Partial Apple Support"
+ if OFFLOAD_CPU:
+ device_str = "cpu/gpu: Offloading, cpu until needed, then gpu"
+
+ print(f"--Loading {model_key} model from {ckpt_path} to {device_str}")
+
+
+def _load_model(ckpt_path, device, use_small=False, model_type="text"):
+ if model_type == "text":
+ ConfigClass = GPTConfig
+ ModelClass = GPT
+ elif model_type == "coarse":
+ ConfigClass = GPTConfig
+ ModelClass = GPT
+ elif model_type == "fine":
+ ConfigClass = FineGPTConfig
+ ModelClass = FineGPT
+ else:
+ raise NotImplementedError()
+ model_key = f"{model_type}_small" if use_small or USE_SMALL_MODELS else model_type
+ model_info = REMOTE_MODEL_PATHS[model_key]
+ if not os.path.exists(ckpt_path):
+ logger.info(f"{model_type} model not found, downloading into `{CACHE_DIR}`.")
+
+ remote_filename = hf_hub_url(model_info["repo_id"], model_info["file_name"])
+ print(
+ f"Downloading {model_key} {model_info['repo_id']} remote model file {remote_filename} {model_info['file_name']} to {CACHE_DIR}"
+ ) # added
+ _download(model_info["repo_id"], model_info["file_name"])
+
+ print_loading_info(model_key, ckpt_path, device)
+
+ # If I try to load straight to DML, I get a strange error. So doing in two steps.
+ checkpoint = torch.load(ckpt_path, map_location=device)
+
+ # this is a hack
+ model_args = checkpoint["model_args"]
+ if "input_vocab_size" not in model_args:
+ model_args["input_vocab_size"] = model_args["vocab_size"]
+ model_args["output_vocab_size"] = model_args["vocab_size"]
+ del model_args["vocab_size"]
+ gptconf = ConfigClass(**checkpoint["model_args"])
+ model = ModelClass(gptconf)
+
+ if SUNO_HALF_PRECISION:
+ model = model.half()
+ elif SUNO_HALF_BFLOAT16:
+ model.bfloat16()
+
+ state_dict = checkpoint["model"]
+ # fixup checkpoint
+ unwanted_prefix = "_orig_mod."
+ for k, v in list(state_dict.items()):
+ if k.startswith(unwanted_prefix):
+ state_dict[k[len(unwanted_prefix) :]] = state_dict.pop(k)
+ extra_keys = set(state_dict.keys()) - set(model.state_dict().keys())
+ extra_keys = set([k for k in extra_keys if not k.endswith(".attn.bias")])
+ missing_keys = set(model.state_dict().keys()) - set(state_dict.keys())
+ missing_keys = set([k for k in missing_keys if not k.endswith(".attn.bias")])
+ if len(extra_keys) != 0:
+ raise ValueError(f"extra keys found: {extra_keys}")
+ if len(missing_keys) != 0:
+ raise ValueError(f"missing keys: {missing_keys}")
+ model.load_state_dict(state_dict, strict=False)
+ n_params = model.get_num_params()
+ val_loss = checkpoint["best_val_loss"].item()
+ logger.info(f"model loaded: {round(n_params/1e6,1)}M params, {round(val_loss,3)} loss")
+ model.eval()
+ if SUNO_USE_DIRECTML is True:
+ model.to(dml)
+ else:
+ model.to(device)
+ # del checkpoint, state_dict
+ del checkpoint, state_dict, model_args, val_loss
+ _clear_cuda_cache()
+ if model_type == "text":
+ tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
+
+ return {
+ "model": model,
+ "tokenizer": tokenizer,
+ }
+ return model
+
+
+def preload_models(
+ text_use_gpu=True,
+ text_use_small=False,
+ coarse_use_gpu=True,
+ coarse_use_small=False,
+ fine_use_gpu=True,
+ fine_use_small=False,
+ codec_use_gpu=True,
+ force_reload=False,
+ load_one_model_type=None,
+):
+ """Load all the necessary models for the pipeline."""
+
+ if SUNO_USE_DIRECTML is True:
+ text_use_gpu = False
+ coarse_use_gpu = False
+ fine_use_gpu = False
+
+ # What is going on here
+ logger.debug(
+ f"USE_SMALL_MODELS = {USE_SMALL_MODELS} GLOBAL_ENABLE_MPS = {GLOBAL_ENABLE_MPS}, OFFLOAD_CPU = {OFFLOAD_CPU}"
+ )
+ logger.debug(
+ f"text_use_gpu = {text_use_gpu}, text_use_small = {text_use_small}, coarse_use_gpu = {coarse_use_gpu}, coarse_use_small = {coarse_use_small}, fine_use_gpu = {fine_use_gpu}, fine_use_small = {fine_use_small}, codec_use_gpu = {codec_use_gpu}, force_reload = {force_reload}"
+ )
+
+ if USE_SMALL_MODELS:
+ text_use_small = True
+ coarse_use_small = True
+ fine_use_small = True
+
+ if _grab_best_device() == "cpu" and (
+ text_use_gpu or coarse_use_gpu or fine_use_gpu or codec_use_gpu
+ ):
+ warning_string = " -->No GPU being used. Careful, inference might be very slow!"
+
+ if SUNO_USE_DIRECTML is True:
+ warning_string = "-->GPU using DirectML (partial AMD GPU support)"
+ if GLOBAL_ENABLE_MPS:
+ warning_string = "-->cpu/mps: Partial Apple Support"
+
+ # logger.warning(warning_string)
+ print(f"{warning_string}")
+
+ if load_one_model_type is not None:
+ if load_one_model_type == "text":
+ _ = load_model(
+ model_type="text",
+ use_gpu=text_use_gpu,
+ use_small=text_use_small,
+ force_reload=force_reload,
+ )
+ elif load_one_model_type == "coarse":
+ _ = load_model(
+ model_type="coarse",
+ use_gpu=coarse_use_gpu,
+ use_small=coarse_use_small,
+ force_reload=force_reload,
+ )
+ elif load_one_model_type == "fine":
+ _ = load_model(
+ model_type="fine",
+ use_gpu=fine_use_gpu,
+ use_small=fine_use_small,
+ force_reload=force_reload,
+ )
+ elif load_one_model_type == "codec":
+ _ = load_codec_model(use_gpu=codec_use_gpu, force_reload=force_reload)
+ else:
+ _ = load_model(
+ model_type="text",
+ use_gpu=text_use_gpu,
+ use_small=text_use_small,
+ force_reload=force_reload,
+ )
+ _ = load_model(
+ model_type="coarse",
+ use_gpu=coarse_use_gpu,
+ use_small=coarse_use_small,
+ force_reload=force_reload,
+ )
+ _ = load_model(
+ model_type="fine",
+ use_gpu=fine_use_gpu,
+ use_small=fine_use_small,
+ force_reload=force_reload,
+ )
+ _ = load_codec_model(use_gpu=codec_use_gpu, force_reload=force_reload)
diff --git a/bark_infinity/generation_bb.py b/bark_infinity/generation_bb.py
new file mode 100644
index 0000000000000000000000000000000000000000..ceb472969e89496afdbac1194cce9a3c6e97752a
--- /dev/null
+++ b/bark_infinity/generation_bb.py
@@ -0,0 +1,2008 @@
+import contextlib
+import gc
+import os
+import re
+
+import random
+from encodec import EncodecModel
+import funcy
+import numpy as np
+from scipy.special import softmax
+import torch
+
+import math
+
+
+import torch.distributions as torch_distributions
+
+import torch.nn.functional as F
+import tqdm
+from transformers import BertTokenizer
+from huggingface_hub import hf_hub_download
+
+from .model import GPTConfig, GPT
+from .model_fine import FineGPT, FineGPTConfig
+
+import traceback
+import sys
+import time
+
+from rich.pretty import pprint
+
+from .config import logger, load_all_defaults
+
+from huggingface_hub import hf_hub_url
+from collections import Counter
+
+from devtools import debug
+from collections import defaultdict
+
+
+def _cast_bool_env_var(s):
+ return s.lower() in ("true", "1", "t")
+
+
+def get_SUNO_USE_DIRECTML():
+ if _cast_bool_env_var(os.environ.get("SUNO_USE_DIRECTML", "False")):
+ return True
+
+ kwargs = {}
+ defaults = load_all_defaults(*kwargs)
+ if defaults["SUNO_USE_DIRECTML"] is True:
+ return True
+ else:
+ return False
+
+
+SUNO_USE_DIRECTML = get_SUNO_USE_DIRECTML()
+
+dml = None
+if SUNO_USE_DIRECTML is True:
+ print(f" --->> Experimental AMD DirectML support enabled.")
+ import torch_directml
+
+ torch.cuda.is_available = lambda: False
+
+ dml = torch_directml.device()
+
+
+if (
+ torch.cuda.is_available()
+ and hasattr(torch.cuda, "amp")
+ and hasattr(torch.cuda.amp, "autocast")
+ and hasattr(torch.cuda, "is_bf16_supported")
+ and torch.cuda.is_bf16_supported()
+):
+ # print(f" --->> Experimental NVIDIA BF16 support enabled.")
+ autocast = funcy.partial(torch.cuda.amp.autocast, dtype=torch.bfloat16)
+else:
+
+ @contextlib.contextmanager
+ def autocast():
+ yield
+
+
+# hold models in global scope to lazy load
+global models
+models = {}
+
+global models_devices
+models_devices = {}
+
+
+CONTEXT_WINDOW_SIZE = 1024
+
+SEMANTIC_RATE_HZ = 49.9
+SEMANTIC_VOCAB_SIZE = 10_000
+
+CODEBOOK_SIZE = 1024
+N_COARSE_CODEBOOKS = 2
+N_FINE_CODEBOOKS = 8
+COARSE_RATE_HZ = 75
+
+SAMPLE_RATE = 24_000
+
+
+SUPPORTED_LANGS = [
+ ("English", "en"),
+ ("German", "de"),
+ ("Spanish", "es"),
+ ("French", "fr"),
+ ("Hindi", "hi"),
+ ("Italian", "it"),
+ ("Japanese", "ja"),
+ ("Korean", "ko"),
+ ("Polish", "pl"),
+ ("Portuguese", "pt"),
+ ("Russian", "ru"),
+ ("Turkish", "tr"),
+ ("Chinese", "zh"),
+]
+
+ALLOWED_PROMPTS = {"announcer"}
+for _, lang in SUPPORTED_LANGS:
+ for prefix in ("", f"v2{os.path.sep}"):
+ for n in range(10):
+ ALLOWED_PROMPTS.add(f"{prefix}{lang}_speaker_{n}")
+
+
+CUR_PATH = os.path.dirname(os.path.abspath(__file__))
+
+
+default_cache_dir = os.path.join(os.path.expanduser("~"), ".cache")
+CACHE_DIR = os.path.join(os.getenv("XDG_CACHE_HOME", default_cache_dir), "suno", "bark_v0")
+
+
+USE_SMALL_MODELS = _cast_bool_env_var(os.environ.get("SUNO_USE_SMALL_MODELS", "False"))
+GLOBAL_ENABLE_MPS = _cast_bool_env_var(os.environ.get("SUNO_ENABLE_MPS", "False"))
+OFFLOAD_CPU = _cast_bool_env_var(os.environ.get("SUNO_OFFLOAD_CPU", "False"))
+
+# Slower, possibly lower quality, but more memory efficient
+SUNO_HALF_PRECISION = _cast_bool_env_var(os.environ.get("SUNO_HALF_PRECISION", "False"))
+
+# Slower, possibly lower quality, but more memory efficient
+SUNO_HALF_BFLOAT16 = _cast_bool_env_var(os.environ.get("SUNO_HALF_BFLOAT16", "False"))
+
+SUNO_DISABLE_COMPILE = _cast_bool_env_var(os.environ.get("SUNO_DISABLE_COMPILE", "False"))
+
+if sys.platform == "win32":
+ SUNO_DISABLE_COMPILE = True
+
+
+if SUNO_USE_DIRECTML is True:
+ OFFLOAD_CPU = False
+
+OFFLOAD_CPU = False
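+# NOTE: the line above force-disables CPU offloading in this module, overriding
+# both the SUNO_OFFLOAD_CPU environment variable and the DirectML check above.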
+
+REMOTE_MODEL_PATHS = {
+ "text_small": {
+ "repo_id": "suno/bark",
+ "file_name": "text.pt",
+ },
+ "coarse_small": {
+ "repo_id": "suno/bark",
+ "file_name": "coarse.pt",
+ },
+ "fine_small": {
+ "repo_id": "suno/bark",
+ "file_name": "fine.pt",
+ },
+ "text": {
+ "repo_id": "suno/bark",
+ "file_name": "text_2.pt",
+ },
+ "coarse": {
+ "repo_id": "suno/bark",
+ "file_name": "coarse_2.pt",
+ },
+ "fine": {
+ "repo_id": "suno/bark",
+ "file_name": "fine_2.pt",
+ },
+}
+
+if not hasattr(torch.nn.functional, "scaled_dot_product_attention") and torch.cuda.is_available():
+ logger.warning(
+ "torch version does not support flash attention. You will get faster"
+ + " inference speed by upgrade torch to newest nightly version."
+ )
+
+
+def _grab_best_device(use_gpu=True):
+ if torch.cuda.device_count() > 0 and use_gpu:
+ device = "cuda"
+ elif torch.backends.mps.is_available() and use_gpu and GLOBAL_ENABLE_MPS:
+ device = "mps"
+ else:
+ device = "cpu"
+
+ return device
+
+
+def _get_ckpt_path(model_type, use_small=False):
+ key = model_type
+ if use_small or USE_SMALL_MODELS:
+ key += "_small"
+ return os.path.join(CACHE_DIR, REMOTE_MODEL_PATHS[key]["file_name"])
+
+
+def _download(from_hf_path, file_name):
+ os.makedirs(CACHE_DIR, exist_ok=True)
+ hf_hub_download(repo_id=from_hf_path, filename=file_name, local_dir=CACHE_DIR)
+
+
+class InferenceContext:
+ def __init__(self, benchmark=False):
+ # we can't expect inputs to be the same length, so disable benchmarking by default
+ self._chosen_cudnn_benchmark = benchmark
+ self._cudnn_benchmark = None
+
+ def __enter__(self):
+ self._cudnn_benchmark = torch.backends.cudnn.benchmark
+ torch.backends.cudnn.benchmark = self._chosen_cudnn_benchmark
+
+ def __exit__(self, exc_type, exc_value, exc_traceback):
+ torch.backends.cudnn.benchmark = self._cudnn_benchmark
+
+
+if torch.cuda.is_available():
+ torch.backends.cuda.matmul.allow_tf32 = True
+ torch.backends.cudnn.allow_tf32 = True
+
+
+@contextlib.contextmanager
+def _inference_mode():
+ if SUNO_USE_DIRECTML is True:
+ with InferenceContext(), torch.inference_mode(mode=False), torch.no_grad(), autocast():
+ yield
+ else:
+ with InferenceContext(), torch.inference_mode(), torch.no_grad(), autocast():
+ yield
+
+
+def _clear_cuda_cache():
+ if torch.cuda.is_available():
+ torch.cuda.empty_cache()
+ torch.cuda.synchronize()
+
+
+def clean_models(model_key=None):
+ global models
+ model_keys = [model_key] if model_key is not None else list(models.keys())
+ for k in model_keys:
+ if k in models:
+ del models[k]
+
+ _clear_cuda_cache()
+ gc.collect()
+
+
+def _load_codec_model(device):
+ model = EncodecModel.encodec_model_24khz()
+ model.set_target_bandwidth(6.0)
+ model.eval()
+
+ print_loading_info("codec", "EncodecModelPath", device)
+
+ if SUNO_USE_DIRECTML is True:
+ model.to(dml)
+ else:
+ model.to(device)
+
+ if callable(getattr(torch, "compile", None)) and not SUNO_DISABLE_COMPILE:  # default None avoids AttributeError on torch < 2.0
+ logger.info("torch.compile available, compiling codec model.")
+ model = torch.compile(model)
+ else:
+ logger.info(
+ "torch.compile *not* available, you will get better performance if you use pytorch >= 2.0."
+ )
+
+ _clear_cuda_cache()
+ return model
+
+
+def load_codec_model(use_gpu=True, force_reload=False):
+ global models
+ global models_devices
+ device = _grab_best_device(use_gpu=use_gpu)
+ if device == "mps":
+ # encodec doesn't support mps
+ device = "cpu"
+ model_key = "codec"
+ if OFFLOAD_CPU:
+ models_devices[model_key] = device
+ device = "cpu"
+ if model_key not in models or force_reload:
+ clean_models(model_key=model_key)
+
+ model = _load_codec_model(device)
+ models[model_key] = model
+
+ if SUNO_USE_DIRECTML is True:
+ models[model_key].to(dml)
+ else:
+ models[model_key].to(device)
+
+ return models[model_key]
+
+
+####
+# Generation Functionality
+####
+
+
+def _tokenize(tokenizer, text):
+ return tokenizer.encode(text, add_special_tokens=False)
+
+
+def _detokenize(tokenizer, enc_text):
+ return tokenizer.decode(enc_text)
+
+
+def _normalize_whitespace(text):
+ return re.sub(r"\s+", " ", text).strip()
+
+
+TEXT_ENCODING_OFFSET = 10_048
+SEMANTIC_PAD_TOKEN = 10_000
+TEXT_PAD_TOKEN = 129_595
+SEMANTIC_INFER_TOKEN = 129_599
+
+
+def _load_history_prompt(history_prompt_input):
+ if isinstance(history_prompt_input, str) and history_prompt_input.endswith(".npz"):
+ history_prompt = np.load(history_prompt_input)
+ elif isinstance(history_prompt_input, str):
+ # make sure this works on non-ubuntu
+ history_prompt_input = os.path.join(*history_prompt_input.split("/"))
+ if history_prompt_input not in ALLOWED_PROMPTS:
+ raise ValueError("history prompt not found")
+ history_prompt = np.load(
+ os.path.join(CUR_PATH, "assets", "prompts", f"{history_prompt_input}.npz")
+ )
+ elif isinstance(history_prompt_input, dict):
+ assert "semantic_prompt" in history_prompt_input
+ assert "coarse_prompt" in history_prompt_input
+ assert "fine_prompt" in history_prompt_input
+ history_prompt = history_prompt_input
+ else:
+ raise ValueError("history prompt format unrecognized")
+ return history_prompt
+
+
+def compute_log_probs(token_list, smoothing_factor, scaling_factor):
+ # Count the frequency of each token.
+ token_freq = Counter(token_list)
+
+ # Add a smoothing factor.
+ smoothed_token_freq = {token: freq + smoothing_factor for token, freq in token_freq.items()}
+
+ # Normalize to create a probability distribution.
+ total_tokens = len(token_list) + smoothing_factor * len(smoothed_token_freq)
+ token_probs = {token: freq / total_tokens for token, freq in smoothed_token_freq.items()}
+
+ # Transform into scaled log-probabilities.
+ log_probs = {token: scaling_factor * np.log(prob) for token, prob in token_probs.items()}
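+ # e.g. for token_list=[1, 1, 2], smoothing_factor=1.0, scaling_factor=1.0 this
+ # yields probabilities {1: 3/5, 2: 2/5} (smoothed counts 3 and 2 over 3 + 1*2 = 5)
+ # and returns their scaled natural logs.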
+
+ return log_probs
+
+
+def estimate_s_this_seems_wrong_so_many_math_crashes(prob):
+ epsilon = 1e-10
+ num = 0
+ den = 0
+ for i in range(
+ min(len(prob), 10000)
+ ): # apparently any number is fine here, but the paper was on natural language, so maybe not for us?
+ # for i in range(768):
+ b = prob[i] / (prob[i + 1] + epsilon)
+ t = (i + 2) / (i + 1)
+ if b > 0 and t > 0:
+ num += math.log(b) * math.log(t)
+ den += math.log(t) ** 2
+ return num / den if den != 0 else 0
+
+
+def estimate_s(prob):
+ epsilon = 1e-10
+ num = 0
+ den = 0
+ # for i in range(3000):
+ # in the paper they say 100 is as good as any higher number, but it's not slow so maybe leave it higher?
+ # also, the paper doesn't guard against divide-by-zero though...
+ # also, the paper was on natural language, so maybe not for us. Let's just max it out
+ for i in range(min(len(prob), 10000)):
+ b = prob[i] / (prob[i + 1] + epsilon)
+ t = (i + 2) / (i + 1)
+ if b > 0 and t > 0:
+ num += math.log(b if b > 0 else 1) * math.log(t if t > 0 else 1)
+ # den += math.log(t)**2
+ den += math.log(t if t > 0 else 1) ** 2
+ # ok NOW this should never be zero and feels more right
+ return num / den
+ # return num / den if den != 0 else 0 # or should this be float("inf") ? doesn't seem right.
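+ # estimate_s fits the Zipf exponent s of the sorted token probabilities by least
+ # squares on log(p_i / p_{i+1}) ~ s * log((i+2)/(i+1)), as in the mirostat paper.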
+
+
+def compute_k_original_paper(n, s, tau):
+ print(f"n: {n}, s: {s}, tau: {tau}")
+ eps = s - 1
+ k = ((eps * (2 ** (tau))) / (1 - n ** (-eps))) ** (1 / s)
+ k = round(k)
+ return k
+
+
+def compute_k(n, s, tau, max_k):
+ try:
+ eps = s - 1
+ n_eps = n ** (-eps)
+ if s <= 0:
+ return 0
+ tau_s = tau ** (1 / s)
+ k = (eps * 2 * tau_s / (1 - n_eps)) ** (1 / s)
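+ # NOTE: this deviates from the mirostat paper, which uses
+ # k = ((eps * 2**tau) / (1 - n**(-eps))) ** (1/s); here 2**tau is replaced by
+ # 2 * tau**(1/s) and the result is clamped to max_k. compute_k_orig below
+ # keeps the paper's formula.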
+ if isinstance(k, complex):
+ return 0
+ k = round(k)
+ if k > max_k:
+ return max_k
+ return k
+ except OverflowError:
+ # Return maximum possible k
+ return max_k
+
+
+def compute_k_orig(n, s, tau):
+ print(f"n: {n}, s: {s}, tau: {tau}")
+ eps = s - 1
+ k = ((eps * (2 ** (tau))) / (1 - n ** (-eps))) ** (1 / s)
+ k = round(k)
+ return k
+
+
+def compute_k_not_right(n, s, tau, max_k):
+ print(f"n: {n}, s: {s}, tau: {tau}")
+ try:
+ eps = s - 1
+ n_eps = n ** (-eps)
+ if s <= 0:
+ return max_k
+ tau_s = tau ** (1 / s)
+ k = (eps * 2 * tau_s / (1 - n_eps)) ** (1 / s)
+ k = round(k)
+ return k
+ except OverflowError:
+ # Return maximum possible k
+ return max_k
+
+
+def compute_k_log(n, s, tau):
+ print(f"n: {n}, s: {s}, tau: {tau}")
+ eps = s - 1
+ try:
+ log_k = (math.log(eps) + tau * math.log(2) - math.log(1 - n ** (-eps))) / s
+ k = round(math.exp(log_k))
+ except OverflowError:
+ k = float("inf")
+ return k
+
+
+# https://github.com/basusourya/mirostat/blob/master/mirostat.py
+
+
+# try adjusting target tau dynamically based on just length even? Could you shape the "energy" of the clip?
+def mirostat_sampling_v1(
+ logits=None,
+ tau=5.0,
+ learning_rate=1.0,
+ max_surprise=None,
+ vocab_size=SEMANTIC_VOCAB_SIZE,
+ indices_surprise_history=[],
+ running_tot_surprise=0,
+ generated=[],
+):
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+ prob_original = torch.softmax(sorted_logits, dim=-1).tolist()
+
+ s = estimate_s(prob_original)
+
+ max_k = len(sorted_logits) - 1
+
+ k = compute_k(vocab_size, s, max_surprise, max_k) + 1
+
+ print(f"\n\nK: {k} s: {s} tau: {max_surprise}")
+
+ sorted_logits = sorted_logits[0:k]
+ sorted_indices = sorted_indices[0:k]
+
+ prob_topk = torch.softmax(sorted_logits, dim=0)
+
+ prev_i = torch.multinomial(prob_topk, num_samples=1, replacement=True)
+ index_surprise = math.log2(1 / prob_original[prev_i])
+ print(f"index_surprise: {index_surprise}")
+ indices_surprise_history.append(index_surprise)
+
+ running_tot_surprise += index_surprise
+ prev = sorted_indices[prev_i]
+ generated += prev.tolist()
+
+ error_surprise = index_surprise - tau
+ max_surprise -= learning_rate * error_surprise
+
+ # full_probs = torch.zeros_like(logits) # 0? or -inf?
+ full_probs = torch.empty_like(logits).fill_(-float("inf"))
+ full_probs[sorted_indices] = prob_topk.to(full_probs.dtype)
+
+ return (
+ sorted_indices[prev_i],
+ max_surprise,
+ full_probs,
+ indices_surprise_history,
+ running_tot_surprise,
+ generated,
+ )
+
+
+def mirostat_sampling_meh(
+ logits=None,
+ tau=5.0,
+ learning_rate=1.0,
+ max_surprise=None,
+ vocab_size=SEMANTIC_VOCAB_SIZE,
+ indices_surprise_history=[],
+ running_tot_surprise=0,
+ generated=[],
+):
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+ prob_original = torch.softmax(sorted_logits, dim=-1).tolist()
+
+ s = estimate_s(prob_original)
+
+ max_k = len(sorted_logits) - 1
+
+ k = compute_k(vocab_size, s, max_surprise, max_k) + 1
+
+ print(f"\n\nK: {k} s: {s} tau: {max_surprise}")
+
+ sorted_logits = sorted_logits[0:k]
+ sorted_indices = sorted_indices[0:k]
+
+ prob_topk = torch.softmax(sorted_logits, dim=0)
+
+ prev_i = torch.multinomial(prob_topk, num_samples=1, replacement=True)
+
+ index_surprise = math.log2(1 / prob_original[sorted_indices[prev_i].item()])
+ print(f"index_surprise: {index_surprise}")
+ indices_surprise_history.append(index_surprise)
+
+ running_tot_surprise += index_surprise
+ prev = sorted_indices[prev_i]
+ generated += prev.tolist()
+ error_surprise = index_surprise - tau
+ max_surprise -= learning_rate * error_surprise
+
+ full_probs = torch.empty_like(logits).fill_(-float("inf"))
+ full_probs[sorted_indices] = prob_topk.to(full_probs.dtype)
+
+ item_next = sorted_indices[prev_i]
+
+ return (
+ item_next,
+ max_surprise,
+ full_probs,
+ indices_surprise_history,
+ running_tot_surprise,
+ generated,
+ )
+
+
+def mirostat_sampling_least(
+ logits=None,
+ tau=5.0,
+ learning_rate=1.0,
+ max_surprise=None,
+ vocab_size=SEMANTIC_VOCAB_SIZE,
+ indices_surprise_history=[],
+ running_tot_surprise=0,
+ generated=[],
+):
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+ prob_original = torch.softmax(sorted_logits, dim=-1).tolist()
+
+ s = estimate_s(prob_original)
+
+ max_k = len(sorted_logits) - 1
+
+ k = compute_k(vocab_size, s, max_surprise, max_k) + 1
+
+ print(f"\n\nK: {k} s: {s} tau: {max_surprise}")
+
+ sorted_logits = sorted_logits[0:k]
+ sorted_indices = sorted_indices[0:k]
+
+ prob_topk = torch.softmax(sorted_logits, dim=0)
+
+ prev_i = torch.argmin(prob_topk).unsqueeze(0)
+
+ index_surprise = math.log2(1 / prob_original[sorted_indices[prev_i].item()])
+ print(f"index_surprise: {index_surprise}")
+ indices_surprise_history.append(index_surprise)
+
+ running_tot_surprise += index_surprise
+ prev = sorted_indices[prev_i]
+ generated += prev.tolist()
+
+ error_surprise = index_surprise - tau
+ max_surprise -= learning_rate * error_surprise
+
+ full_probs = torch.empty_like(logits).fill_(-float("inf"))
+ full_probs[sorted_indices] = prob_topk.to(full_probs.dtype)
+
+ # Return least likely token and reverse generated logits
+ # return sorted_indices[prev_i], max_surprise, torch.flip(full_probs, dims=[0]), indices_surprise_history, running_tot_surprise, generated
+ return (
+ sorted_indices[prev_i],
+ max_surprise,
+ full_probs,
+ indices_surprise_history,
+ running_tot_surprise,
+ generated,
+ )
+
+
+def sine_wave_temperature(current_token, max_token):
+ return 3.0 + 2.1 * (math.sin(2 * math.pi * (current_token / 150)) / 2.1 + 0.2)
+
+
+def sine_wave_temperature(current_token, max_token, period=100, phase_shift=0):
+ return 0.5 + 2.0 * (math.sin(2 * math.pi * (current_token / period) + phase_shift) / 2 + 0.5)
+
+
+def sine_wave_temperature(current_token, token_period, start_phase, temp_min, temp_max):
+ phase = 2 * math.pi * ((current_token + start_phase) / token_period)
+ temp_range = temp_max - temp_min
+ return temp_min + temp_range * ((math.sin(phase) / 2) + 0.5)
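+# NOTE: sine_wave_temperature is redefined three times above; only this last
+# definition (token_period / start_phase / temp_min / temp_max) is in effect,
+# the earlier two are shadowed leftovers from experimentation.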
+
+
+def mirostat_sampling(
+ logits=None,
+ tau=5.0,
+ learning_rate=1.0,
+ max_surprise=None,
+ vocab_size=SEMANTIC_VOCAB_SIZE,
+ indices_surprise_history=[],
+ running_tot_surprise=0,
+ generated=[],
+ temperature_fn=None,
+):
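+ # NOTE: the list defaults above are shared across calls if the caller does not
+ # pass them explicitly; the callers in this module always pass fresh lists, so
+ # this is harmless here.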
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+ prob_original = torch.softmax(sorted_logits, dim=-1).tolist()
+
+ s = estimate_s(prob_original)
+
+ max_k = len(sorted_logits) - 1
+
+ k = compute_k(vocab_size, s, max_surprise, max_k) + 1
+
+ sorted_logits = sorted_logits[0:k]
+ sorted_indices = sorted_indices[0:k]
+
+ # Current location in the segment
+ current_token = len(generated)
+ max_token = 768 # Maximum sample length
+
+ if temperature_fn is not None:
+ temp = temperature_fn(current_token, max_token)
+ sorted_logits = torch.clamp(sorted_logits, -10000, 10000)
+ # Apply to logits before softmax
+ prob_topk = torch.softmax(sorted_logits / temp, dim=0)
+ prob_topk = torch.clamp(prob_topk, 1e-9, 1 - 1e-9) # Ensures probabilities are valid
+ else:
+ prob_topk = torch.softmax(sorted_logits, dim=0)
+
+ prev_i = torch.multinomial(prob_topk, num_samples=1, replacement=True)
+
+ epsilon = 1e-10
+ index_surprise = math.log2(1 / (prob_original[sorted_indices[prev_i].item()] + epsilon))
+
+ indices_surprise_history.append(index_surprise)
+
+ running_tot_surprise += index_surprise
+ prev = sorted_indices[prev_i]
+ generated += prev.tolist()
+
+ error_surprise = index_surprise - tau
+ max_surprise -= learning_rate * error_surprise
+
+ full_probs = torch.empty_like(logits).fill_(-float("inf"))
+ full_probs[sorted_indices] = prob_topk.to(full_probs.dtype)
+
+ if current_token % 25 == 0 and False:
+ print(f"Temperature: {temp}")
+ print(f"index_surprise: {index_surprise}")
+ print(f"\n\nK: {k} s: {s} tau: {max_surprise}")
+
+ return (
+ sorted_indices[prev_i],
+ max_surprise,
+ full_probs,
+ indices_surprise_history,
+ running_tot_surprise,
+ generated,
+ )
+
+
+def compute_negative_influence(negative_logits, n, window_size, negative_scale):
+ # Check if negative_logits is empty
+ if len(negative_logits) == 0:
+ return 0
+
+ # Ensure n is within range
+ n = min(max(n, 0), len(negative_logits) - 1)
+
+ # Adjust window_size if it's larger than negative_logits length
+ window_size = min(window_size, len(negative_logits))
+
+ # Get the start and end of the window
+ start = max(0, n - window_size)
+ end = min(len(negative_logits), n + window_size + 1)
+
+ # Generate a Gaussian distribution for the weights and normalize them
+ weights = np.exp(-((np.arange(start, end) - n) ** 2) / (2.0 * window_size**2))
+ weights /= weights.sum()
+
+ # Compute a weighted average of negative_logits within the window
+ negative_influence = np.average(negative_logits[start:end], weights=weights, axis=0)
+
+ # Adjust the influence by the negative_scale
+ negative_influence *= min(max(negative_scale, 0), 1) # Ensure negative_scale is between 0 and 1
+
+ return negative_influence
+
+
+def generate_text_semantic(
+ text,
+ history_prompt=None,
+ temp=0.7,
+ top_k=None,
+ top_p=None,
+ silent=False,
+ min_eos_p=0.2,
+ max_gen_duration_s=None,
+ allow_early_stop=True,
+ use_kv_caching=True,
+ use_mirostat_sampling=False,
+ # tau = 31100.0,
+ tau=5.0,
+ miro_learning_rate=1.0,
+ token_repeat_penalty=0.0,
+ inverted_p=None,
+ bottom_k=None,
+ return_logits=False,
+ negative_tokens=None,
+ negative_logits=None,
+ negative_text_prompt_logits_scale=None,
+ negative_text_prompt_logits_scale_window_size=64,
+ negative_text_prompt_divergence_scale=None,
+):
+ """Generate semantic tokens from text."""
+
+ if return_logits:
+ all_logits = []
+
+ if temp == 0:
+ temp = 0.001
+ # debug(locals())
+ logger.debug(locals())
+ assert isinstance(text, str)
+ text = _normalize_whitespace(text)
+ # assert len(text.strip()) > 0
+
+ if history_prompt is not None:
+ history_prompt = _load_history_prompt(history_prompt)
+ semantic_history = history_prompt["semantic_prompt"]
+ assert (
+ isinstance(semantic_history, np.ndarray)
+ and len(semantic_history.shape) == 1
+ and len(semantic_history) > 0
+ and semantic_history.min() >= 0
+ and semantic_history.max() <= SEMANTIC_VOCAB_SIZE - 1
+ )
+ else:
+ semantic_history = None
+ # load models if not yet exist
+ global models
+ global models_devices
+ if "text" not in models:
+ if SUNO_USE_DIRECTML is True:
+ preload_models(load_one_model_type="text")
+ else:
+ preload_models()
+ model_container = models["text"]
+ model = model_container["model"]
+ tokenizer = model_container["tokenizer"]
+ encoded_text = np.array(_tokenize(tokenizer, text)) + TEXT_ENCODING_OFFSET
+ if OFFLOAD_CPU:
+ if GLOBAL_ENABLE_MPS:
+ device = _grab_best_device(use_gpu=False)
+ models_devices["text"] = device
+ model.to(models_devices["text"])
+ device = next(model.parameters()).device
+ if len(encoded_text) > 256:
+ p = round((len(encoded_text) - 256) / len(encoded_text) * 100, 1)
+ logger.warning(f"warning, text too long, lopping of last {p}%")
+ encoded_text = encoded_text[:256]
+ encoded_text = np.pad(
+ encoded_text,
+ (0, 256 - len(encoded_text)),
+ constant_values=TEXT_PAD_TOKEN,
+ mode="constant",
+ )
+ if semantic_history is not None:
+ semantic_history = semantic_history.astype(np.int64)
+ # print(f"Actual length of semantic input: {len(semantic_history)}")
+ # lop off if history is too long, pad if needed
+ semantic_history = semantic_history[-256:]
+ semantic_history = np.pad(
+ semantic_history,
+ (0, 256 - len(semantic_history)),
+ constant_values=SEMANTIC_PAD_TOKEN,
+ mode="constant",
+ )
+ else:
+ semantic_history = np.array([SEMANTIC_PAD_TOKEN] * 256)
+ x = torch.from_numpy(
+ np.hstack([encoded_text, semantic_history, np.array([SEMANTIC_INFER_TOKEN])]).astype(
+ np.int64
+ )
+ )[None]
+ assert x.shape[1] == 256 + 256 + 1
+ with _inference_mode():
+ if SUNO_USE_DIRECTML is True:
+ device = dml
+ x = x.to(device)
+ n_tot_steps = 768
+
+ # preallocate tensor
+ x_initial = x.shape[1]
+ x = torch.hstack([x, torch.empty([1, n_tot_steps], dtype=torch.int32, device=device)])
+
+ # custom tqdm updates since we don't know when eos will occur
+ pbar = tqdm.tqdm(disable=silent, total=n_tot_steps)
+ pbar_state = 0
+ tot_generated_duration_s = 0
+ kv_cache = None
+
+ # mirostat
+ prev = None
+ max_surprise = 2 * tau
+ indices_surprise_history = []
+ running_tot_surprise = 0
+ miro_generated = [] # debug
+
+ token_counts = defaultdict(int)
+ for n in range(n_tot_steps):
+ # if use_kv_caching and kv_cache is not None:
+ # x_input = x[:, [-1]]
+ # else:
+ # x_input = x
+
+ x_input = (
+ x[:, [x_initial + n - 1]]
+ if use_kv_caching and kv_cache is not None
+ else x[:, : x_initial + n]
+ )
+ logits, kv_cache = model(
+ x_input, merge_context=True, use_cache=use_kv_caching, past_kv=kv_cache
+ )
+ relevant_logits = logits[0, 0, :SEMANTIC_VOCAB_SIZE]
+ if allow_early_stop:
+ relevant_logits = torch.hstack(
+ (relevant_logits, logits[0, 0, [SEMANTIC_PAD_TOKEN]]) # eos
+ )
+
+ # Detach and convert to numpy for faster calculations
+ original_device = relevant_logits.device
+ relevant_logits = relevant_logits.detach().cpu().type(torch.float32).numpy()
+
+ # Jon is trying some silly ideas here, but inverted_p seems genuinely useful
+ if top_p is not None or inverted_p is not None:
+ if inverted_p is not None:
+ sorted_indices = np.argsort(relevant_logits)
+ cumulative_limit = inverted_p
+ elif top_p is not None:
+ sorted_indices = np.argsort(relevant_logits)[::-1]
+ cumulative_limit = top_p
+ sorted_logits = relevant_logits[sorted_indices]
+ cumulative_probs = np.cumsum(softmax(sorted_logits))
+ sorted_indices_to_remove = cumulative_probs > cumulative_limit
+ sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].copy()
+ sorted_indices_to_remove[0] = False
+ relevant_logits[sorted_indices[sorted_indices_to_remove]] = -np.inf
+
+ relevant_logits = torch.from_numpy(relevant_logits)
+ relevant_logits = relevant_logits.to(original_device)
+
+ if top_k is not None or bottom_k is not None:
+ if bottom_k is not None:
+ v, _ = torch.topk(
+ relevant_logits, min(bottom_k, relevant_logits.size(-1)), largest=False
+ )
+ relevant_logits[relevant_logits > v[-1]] = -float("Inf")
+ elif top_k is not None:
+ v, _ = torch.topk(relevant_logits, min(top_k, relevant_logits.size(-1)))
+ relevant_logits[relevant_logits < v[-1]] = -float("Inf")
+
+ if use_mirostat_sampling:
+ logits_for_miro = relevant_logits / temp
+ (
+ item_next,
+ max_surprise,
+ probs,
+ indices_surprise_history,
+ running_tot_surprise,
+ miro_generated,
+ ) = mirostat_sampling(
+ logits=logits_for_miro,
+ max_surprise=max_surprise,
+ tau=tau,
+ learning_rate=miro_learning_rate,
+ vocab_size=SEMANTIC_VOCAB_SIZE,
+ indices_surprise_history=indices_surprise_history,
+ running_tot_surprise=running_tot_surprise,
+ generated=miro_generated,
+ temperature_fn=None,
+ )
+ # item_next = item_next.to(torch.int32)
+
+ else:
+ if token_repeat_penalty != 0.0 and token_repeat_penalty != 1.0:
+ for token, count in token_counts.items():
+ relevant_logits[token] += math.log(token_repeat_penalty) * count
+
+ if return_logits:
+ all_logits.append(relevant_logits)
+
+ probs = F.softmax(relevant_logits / temp, dim=-1)
+ item_next = torch.multinomial(probs, num_samples=1).to(torch.int32)
+
+ if allow_early_stop and (
+ item_next == SEMANTIC_VOCAB_SIZE
+ or (min_eos_p is not None and probs[-1] >= min_eos_p)
+ ):
+ n -= 1 # backtrack 1
+ # eos found, so break
+ pbar.total = n
+ pbar.update(n - pbar_state)
+
+ break
+ # x = torch.cat((x, item_next[None]), dim=1)
+ if token_repeat_penalty != 0.0 and token_repeat_penalty != 1.0:
+ token_counts[int(item_next)] += 1
+
+ x[0][x_initial + n] = item_next
+ tot_generated_duration_s += 1 / SEMANTIC_RATE_HZ
+ if max_gen_duration_s is not None and tot_generated_duration_s > max_gen_duration_s:
+ pbar.total = n
+ pbar.update(n - pbar_state)
+ break
+ if n == n_tot_steps - 1:
+ pbar.total = n
+ pbar.update(n - pbar_state)
+ break
+ del logits, relevant_logits, probs, item_next
+ if n > pbar_state:
+ if n > pbar.total:
+ pbar.total = n
+ pbar.update(n - pbar_state)
+ pbar_state = n
+ pbar.total = n
+ pbar.refresh()
+
+ pbar.close()
+ # out = x.detach().cpu().numpy().squeeze()[256 + 256 + 1 :]
+ out = x.detach().cpu().numpy().squeeze()[x_initial : x_initial + n + 1]
+ if use_mirostat_sampling and False:
+ print(f"Target tau: {tau}")
+ print("Total surprise value:", sum(indices_surprise_history))
+ print("Average surprise value:", sum(indices_surprise_history) / len(out))
+ print(f"Generated Miro: {miro_generated}")
+ print(f"out: {out}")
+ if OFFLOAD_CPU:
+ model.to("cpu")
+ assert all(0 <= out) and all(out < SEMANTIC_VOCAB_SIZE)
+ _clear_cuda_cache()
+
+ if SUNO_USE_DIRECTML is True:
+ clean_models()
+
+ if return_logits:
+ return out, all_logits
+ else:
+ return out
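+
+
+ # Hedged usage sketch for generate_text_semantic. The speaker preset name is an assumption
+ # (any built-in prompt or your own .npz path should work); temp/top_p are plausible values,
+ # not tuned recommendations.
+ def _generate_text_semantic_example():
+     return generate_text_semantic(
+         "Hello, this is a short test sentence.",
+         history_prompt="v2/en_speaker_6",  # assumed speaker preset; swap in your own .npz
+         temp=0.7,
+         top_p=0.95,
+         use_kv_caching=True,
+     )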
+
+
+def generate_text_semantic_branching_not_batching(
+ text,
+ history_prompt=None,
+ temp=0.7,
+ top_k=None,
+ top_p=None,
+ silent=False,
+ min_eos_p=0.2,
+ max_gen_duration_s=None,
+ allow_early_stop=True,
+ use_kv_caching=True,
+ num_sample_per_step=2,
+):
+ """Generate semantic tokens from text."""
+ assert isinstance(text, str)
+ text = _normalize_whitespace(text)
+ assert len(text.strip()) > 0
+ if history_prompt is not None:
+ history_prompt = _load_history_prompt(history_prompt)
+ semantic_history = history_prompt["semantic_prompt"]
+ assert (
+ isinstance(semantic_history, np.ndarray)
+ and len(semantic_history.shape) == 1
+ and len(semantic_history) > 0
+ and semantic_history.min() >= 0
+ and semantic_history.max() <= SEMANTIC_VOCAB_SIZE - 1
+ )
+ else:
+ semantic_history = None
+ # load models if not yet exist
+ global models
+ global models_devices
+ if "text" not in models:
+ if SUNO_USE_DIRECTML is True:
+ preload_models(load_one_model_type="text")
+ else:
+ preload_models()
+ model_container = models["text"]
+ model = model_container["model"]
+ tokenizer = model_container["tokenizer"]
+ encoded_text = np.array(_tokenize(tokenizer, text)) + TEXT_ENCODING_OFFSET
+ if OFFLOAD_CPU:
+ model.to(models_devices["text"])
+ device = next(model.parameters()).device
+ if len(encoded_text) > 256:
+ p = round((len(encoded_text) - 256) / len(encoded_text) * 100, 1)
+ logger.warning(f"warning, text too long, lopping of last {p}%")
+ encoded_text = encoded_text[:256]
+ encoded_text = np.pad(
+ encoded_text,
+ (0, 256 - len(encoded_text)),
+ constant_values=TEXT_PAD_TOKEN,
+ mode="constant",
+ )
+ if semantic_history is not None:
+ semantic_history = semantic_history.astype(np.int64)
+ # lop off if history is too long, pad if needed
+ semantic_history = semantic_history[-256:]
+ semantic_history = np.pad(
+ semantic_history,
+ (0, 256 - len(semantic_history)),
+ constant_values=SEMANTIC_PAD_TOKEN,
+ mode="constant",
+ )
+ else:
+ semantic_history = np.array([SEMANTIC_PAD_TOKEN] * 256)
+ # x = torch.from_numpy(
+ # np.hstack([
+ # encoded_text, semantic_history, np.array([SEMANTIC_INFER_TOKEN])
+ # ]).astype(np.int64)
+ # )[None]
+
+ x = torch.from_numpy(
+ np.hstack([encoded_text, semantic_history, np.array([SEMANTIC_INFER_TOKEN])]).astype(
+ np.int64
+ )
+ ).repeat(num_sample_per_step, 1)
+
+ assert x.shape[1] == 256 + 256 + 1
+ with _inference_mode():
+ x = x.to(device)
+ n_tot_steps = 768
+ # custom tqdm updates since we don't know when eos will occur
+ pbar = tqdm.tqdm(disable=silent, total=n_tot_steps)
+ pbar_state = 0
+ tot_generated_duration_s = 0
+ kv_cache = None
+ for n in range(n_tot_steps):
+ if use_kv_caching and kv_cache is not None:
+ x_input = x[:, [-1]]
+ else:
+ x_input = x
+ logits, kv_cache = model(
+ x_input, merge_context=True, use_cache=use_kv_caching, past_kv=kv_cache
+ )
+ relevant_logits = logits[0, 0, :SEMANTIC_VOCAB_SIZE]
+ if allow_early_stop:
+ relevant_logits = torch.hstack(
+ (relevant_logits, logits[0, 0, [SEMANTIC_PAD_TOKEN]]) # eos
+ )
+ if top_p is not None:
+ # faster to convert to numpy
+ original_device = relevant_logits.device
+ relevant_logits = relevant_logits.detach().cpu().type(torch.float32).numpy()
+ sorted_indices = np.argsort(relevant_logits)[::-1]
+ sorted_logits = relevant_logits[sorted_indices]
+ cumulative_probs = np.cumsum(softmax(sorted_logits))
+ sorted_indices_to_remove = cumulative_probs > top_p
+ sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].copy()
+ sorted_indices_to_remove[0] = False
+ relevant_logits[sorted_indices[sorted_indices_to_remove]] = -np.inf
+ relevant_logits = torch.from_numpy(relevant_logits)
+ relevant_logits = relevant_logits.to(original_device)
+ if top_k is not None:
+ v, _ = torch.topk(relevant_logits, min(top_k, relevant_logits.size(-1)))
+ relevant_logits[relevant_logits < v[-1]] = -float("Inf")
+ # probs = F.softmax(relevant_logits / temp, dim=-1)
+ # item_next = torch.multinomial(probs, num_samples=1).to(torch.int32)
+
+ probs = F.softmax(relevant_logits / temp, dim=-1)
+ item_next = torch.multinomial(probs, num_samples=num_sample_per_step).to(torch.int32)
+ if allow_early_stop and (
+ (item_next == SEMANTIC_VOCAB_SIZE).any()
+ or (min_eos_p is not None and probs[-1] >= min_eos_p)
+ ):
+ # eos found, so break
+ pbar.update(n - pbar_state)
+ break
+ # x = torch.cat((x, item_next[None]), dim=1)
+ # append one sampled token per branch in a single batched concat
+ x = torch.cat((x, item_next[:, None].to(x.dtype)), dim=1)
+ tot_generated_duration_s += 1 / SEMANTIC_RATE_HZ
+ if max_gen_duration_s is not None and tot_generated_duration_s > max_gen_duration_s:
+ pbar.update(n - pbar_state)
+ break
+ if n == n_tot_steps - 1:
+ pbar.update(n - pbar_state)
+ break
+ del logits, relevant_logits, probs, item_next
+ if n > pbar_state:
+ if n > pbar.total:
+ pbar.total = n
+ pbar.update(n - pbar_state)
+ pbar_state = n
+ pbar.total = n
+ pbar.refresh()
+ pbar.close()
+ out = x.detach().cpu().numpy().squeeze()[256 + 256 + 1 :]
+ if OFFLOAD_CPU:
+ model.to("cpu")
+ assert all(0 <= out) and all(out < SEMANTIC_VOCAB_SIZE)
+ _clear_cuda_cache()
+ return out
+
+
+def generate_coarse(
+ x_semantic,
+ history_prompt=None,
+ temp=0.7,
+ top_k=None,
+ top_p=None,
+ silent=False,
+ max_coarse_history=630, # min 60 (faster), max 630 (more context)
+ sliding_window_len=60,
+ use_kv_caching=True,
+ x_coarse_history_alignment_hack=-2,
+):
+ """Generate coarse audio codes from semantic tokens."""
+
+ logger.debug(locals())
+ assert (
+ isinstance(x_semantic, np.ndarray)
+ and len(x_semantic.shape) == 1
+ and len(x_semantic) > 0
+ and x_semantic.min() >= 0
+ and x_semantic.max() <= SEMANTIC_VOCAB_SIZE - 1
+ )
+ assert 60 <= max_coarse_history <= 630
+ assert max_coarse_history + sliding_window_len <= 1024 - 256
+ semantic_to_coarse_ratio = COARSE_RATE_HZ / SEMANTIC_RATE_HZ * N_COARSE_CODEBOOKS
+
+ max_semantic_history = int(np.floor(max_coarse_history / semantic_to_coarse_ratio))
+ if history_prompt is not None:
+ history_prompt = _load_history_prompt(history_prompt)
+ x_semantic_history = history_prompt["semantic_prompt"]
+ x_coarse_history = history_prompt["coarse_prompt"]
+
+ # print(f"Pre Trim sem coars: {x_semantic_history.shape} {x_coarse_history.shape}")
+ assert (
+ isinstance(x_semantic_history, np.ndarray)
+ and len(x_semantic_history.shape) == 1
+ and len(x_semantic_history) > 0
+ and x_semantic_history.min() >= 0
+ and x_semantic_history.max() <= SEMANTIC_VOCAB_SIZE - 1
+ and isinstance(x_coarse_history, np.ndarray)
+ and len(x_coarse_history.shape) == 2
+ and x_coarse_history.shape[0] == N_COARSE_CODEBOOKS
+ and x_coarse_history.shape[-1] >= 0
+ and x_coarse_history.min() >= 0
+ and x_coarse_history.max() <= CODEBOOK_SIZE - 1
+ and (
+ round(x_coarse_history.shape[-1] / len(x_semantic_history), 1)
+ == round(semantic_to_coarse_ratio / N_COARSE_CODEBOOKS, 1)
+ )
+ )
+
+ x_coarse_history = _flatten_codebooks(x_coarse_history) + SEMANTIC_VOCAB_SIZE
+ # trim histories correctly
+ n_semantic_hist_provided = np.min(
+ [
+ max_semantic_history,
+ len(x_semantic_history) - len(x_semantic_history) % 2,
+ int(np.floor(len(x_coarse_history) / semantic_to_coarse_ratio)),
+ ]
+ )
+ n_coarse_hist_provided = int(round(n_semantic_hist_provided * semantic_to_coarse_ratio))
+ x_semantic_history = x_semantic_history[-n_semantic_hist_provided:].astype(np.int32)
+ x_coarse_history = x_coarse_history[-n_coarse_hist_provided:].astype(np.int32)
+ # TODO: bit of a hack for time alignment (sounds better)
+ # x_coarse_history = x_coarse_history[:-2]
+ x_coarse_history = x_coarse_history[:x_coarse_history_alignment_hack]
+
+ else:
+ x_semantic_history = np.array([], dtype=np.int32)
+ x_coarse_history = np.array([], dtype=np.int32)
+
+ # print(f"actual lengths we're using, x_semantic_history: {len(x_semantic_history)} x_coarse_history: {len(x_coarse_history)}")
+
+ # load models if not yet exist
+ global models
+ global models_devices
+ if "coarse" not in models:
+ if SUNO_USE_DIRECTML is True:
+ preload_models(load_one_model_type="coarse")
+ else:
+ preload_models()
+ model = models["coarse"]
+ if OFFLOAD_CPU:
+ if GLOBAL_ENABLE_MPS:
+ device = _grab_best_device(use_gpu=False)
+ models_devices["coarse"] = device
+ model.to(models_devices["coarse"])
+
+ device = next(model.parameters()).device
+ # start loop
+ n_steps = int(
+ round(
+ np.floor(len(x_semantic) * semantic_to_coarse_ratio / N_COARSE_CODEBOOKS)
+ * N_COARSE_CODEBOOKS
+ )
+ )
+ assert n_steps > 0 and n_steps % N_COARSE_CODEBOOKS == 0
+
+ # reminder to try filling up some of the COARSE_INFER_TOKEN with history to get better short clips
+ x_semantic = np.hstack([x_semantic_history, x_semantic]).astype(np.int32)
+ x_coarse = x_coarse_history.astype(np.int32)
+ base_semantic_idx = len(x_semantic_history)
+ with _inference_mode():
+ if SUNO_USE_DIRECTML is True:
+ device = dml
+ x_semantic_in = torch.from_numpy(x_semantic)[None].to(device)
+ x_coarse_in = torch.from_numpy(x_coarse)[None].to(device)
+ n_window_steps = int(np.ceil(n_steps / sliding_window_len))
+ n_step = 0
+ for _ in tqdm.tqdm(range(n_window_steps), total=n_window_steps, disable=silent):
+ semantic_idx = base_semantic_idx + int(round(n_step / semantic_to_coarse_ratio))
+ # pad from right side
+ x_in = x_semantic_in[:, np.max([0, semantic_idx - max_semantic_history]) :]
+ x_in = x_in[:, :256]
+ x_in = F.pad(
+ x_in,
+ (0, 256 - x_in.shape[-1]),
+ "constant",
+ COARSE_SEMANTIC_PAD_TOKEN,
+ )
+
+ x_in = torch.hstack(
+ [
+ x_in,
+ torch.tensor([COARSE_INFER_TOKEN])[None].to(device),
+ x_coarse_in[:, -max_coarse_history:],
+ ]
+ )
+ kv_cache = None
+ for _ in range(sliding_window_len):
+ if n_step >= n_steps:
+ continue
+ is_major_step = n_step % N_COARSE_CODEBOOKS == 0
+
+ if use_kv_caching and kv_cache is not None:
+ x_input = x_in[:, [-1]]
+ else:
+ x_input = x_in
+
+ logits, kv_cache = model(x_input, use_cache=use_kv_caching, past_kv=kv_cache)
+ logit_start_idx = SEMANTIC_VOCAB_SIZE + (1 - int(is_major_step)) * CODEBOOK_SIZE
+ logit_end_idx = SEMANTIC_VOCAB_SIZE + (2 - int(is_major_step)) * CODEBOOK_SIZE
+ relevant_logits = logits[0, 0, logit_start_idx:logit_end_idx]
+ if top_p is not None:
+ # faster to convert to numpy
+ logits_device = relevant_logits.device
+ logits_dtype = relevant_logits.type()
+ relevant_logits = relevant_logits.detach().cpu().type(torch.float32).numpy()
+ sorted_indices = np.argsort(relevant_logits)[::-1]
+ sorted_logits = relevant_logits[sorted_indices]
+ cumulative_probs = np.cumsum(softmax(sorted_logits))
+ sorted_indices_to_remove = cumulative_probs > top_p
+ sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].copy()
+ sorted_indices_to_remove[0] = False
+ relevant_logits[sorted_indices[sorted_indices_to_remove]] = -np.inf
+ relevant_logits = torch.from_numpy(relevant_logits)
+ relevant_logits = relevant_logits.to(logits_device).type(logits_dtype)
+ if top_k is not None:
+ v, _ = torch.topk(relevant_logits, min(top_k, relevant_logits.size(-1)))
+ relevant_logits[relevant_logits < v[-1]] = -float("Inf")
+ probs = F.softmax(relevant_logits / temp, dim=-1)
+ # multinomial bugged on mps: shuttle to cpu if necessary
+ inf_device = probs.device
+ if probs.device.type == "mps":
+ probs = probs.to("cpu")
+ item_next = torch.multinomial(probs, num_samples=1)
+ probs = probs.to(inf_device)
+ item_next = item_next.to(inf_device)
+ item_next += logit_start_idx
+ x_coarse_in = torch.cat((x_coarse_in, item_next[None]), dim=1)
+ x_in = torch.cat((x_in, item_next[None]), dim=1)
+ del logits, relevant_logits, probs, item_next
+ n_step += 1
+ del x_in
+ del x_semantic_in
+ if OFFLOAD_CPU:
+ model.to("cpu")
+ gen_coarse_arr = x_coarse_in.detach().cpu().numpy().squeeze()[len(x_coarse_history) :]
+ del x_coarse_in
+ assert len(gen_coarse_arr) == n_steps
+ gen_coarse_audio_arr = gen_coarse_arr.reshape(-1, N_COARSE_CODEBOOKS).T - SEMANTIC_VOCAB_SIZE
+ for n in range(1, N_COARSE_CODEBOOKS):
+ gen_coarse_audio_arr[n, :] -= n * CODEBOOK_SIZE
+ _clear_cuda_cache()
+ if SUNO_USE_DIRECTML is True:
+ clean_models()
+ return gen_coarse_audio_arr
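+
+
+ # Sketch of the coarse logit-window arithmetic used above: steps alternate between the two
+ # coarse codebooks, so the valid slice of the output vocabulary shifts by CODEBOOK_SIZE on
+ # every other step. This mirrors the expressions inside generate_coarse, nothing more.
+ def _coarse_logit_window_sketch(n_step):
+     is_major_step = n_step % N_COARSE_CODEBOOKS == 0
+     logit_start_idx = SEMANTIC_VOCAB_SIZE + (1 - int(is_major_step)) * CODEBOOK_SIZE
+     logit_end_idx = SEMANTIC_VOCAB_SIZE + (2 - int(is_major_step)) * CODEBOOK_SIZE
+     return logit_start_idx, logit_end_idx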
+
+
+def generate_coarse_amd_directml(
+ x_semantic,
+ history_prompt=None,
+ temp=0.7,
+ top_k=None,
+ top_p=None,
+ silent=False,
+ max_coarse_history=630, # min 60 (faster), max 630 (more context)
+ sliding_window_len=60,
+ use_kv_caching=True,
+ x_coarse_history_alignment_hack=-2,
+):
+ """Generate coarse audio codes from semantic tokens."""
+
+ logger.debug(locals())
+
+ assert (
+ isinstance(x_semantic, np.ndarray)
+ and len(x_semantic.shape) == 1
+ and len(x_semantic) > 0
+ and x_semantic.min() >= 0
+ and x_semantic.max() <= SEMANTIC_VOCAB_SIZE - 1
+ )
+ assert 60 <= max_coarse_history <= 630
+ assert max_coarse_history + sliding_window_len <= 1024 - 256
+ semantic_to_coarse_ratio = COARSE_RATE_HZ / SEMANTIC_RATE_HZ * N_COARSE_CODEBOOKS
+ max_semantic_history = int(np.floor(max_coarse_history / semantic_to_coarse_ratio))
+ if history_prompt is not None:
+ history_prompt = _load_history_prompt(history_prompt)
+ x_semantic_history = history_prompt["semantic_prompt"]
+ x_coarse_history = history_prompt["coarse_prompt"]
+ assert (
+ isinstance(x_semantic_history, np.ndarray)
+ and len(x_semantic_history.shape) == 1
+ and len(x_semantic_history) > 0
+ and x_semantic_history.min() >= 0
+ and x_semantic_history.max() <= SEMANTIC_VOCAB_SIZE - 1
+ and isinstance(x_coarse_history, np.ndarray)
+ and len(x_coarse_history.shape) == 2
+ and x_coarse_history.shape[0] == N_COARSE_CODEBOOKS
+ and x_coarse_history.shape[-1] >= 0
+ and x_coarse_history.min() >= 0
+ and x_coarse_history.max() <= CODEBOOK_SIZE - 1
+ and (
+ round(x_coarse_history.shape[-1] / len(x_semantic_history), 1)
+ == round(semantic_to_coarse_ratio / N_COARSE_CODEBOOKS, 1)
+ )
+ )
+ x_coarse_history = _flatten_codebooks(x_coarse_history) + SEMANTIC_VOCAB_SIZE
+ # trim histories correctly
+ n_semantic_hist_provided = np.min(
+ [
+ max_semantic_history,
+ len(x_semantic_history) - len(x_semantic_history) % 2,
+ int(np.floor(len(x_coarse_history) / semantic_to_coarse_ratio)),
+ ]
+ )
+ n_coarse_hist_provided = int(round(n_semantic_hist_provided * semantic_to_coarse_ratio))
+ x_semantic_history = x_semantic_history[-n_semantic_hist_provided:].astype(np.int32)
+ x_coarse_history = x_coarse_history[-n_coarse_hist_provided:].astype(np.int32)
+ # TODO: bit of a hack for time alignment (sounds better)
+ x_coarse_history = x_coarse_history[:-2]
+ else:
+ x_semantic_history = np.array([], dtype=np.int32)
+ x_coarse_history = np.array([], dtype=np.int32)
+ # load models if not yet exist
+ global models
+ global models_devices
+ if "coarse" not in models:
+ if SUNO_USE_DIRECTML is True:
+ preload_models(load_one_model_type="coarse")
+ else:
+ preload_models()
+ model = models["coarse"]
+ if OFFLOAD_CPU:
+ if GLOBAL_ENABLE_MPS:
+ device = _grab_best_device(use_gpu=False)
+ models_devices["coarse"] = device
+ model.to(models_devices["coarse"])
+ # device = next(model.parameters()).device
+
+ # start loop
+ n_steps = int(
+ round(
+ np.floor(len(x_semantic) * semantic_to_coarse_ratio / N_COARSE_CODEBOOKS)
+ * N_COARSE_CODEBOOKS
+ )
+ )
+ assert n_steps > 0 and n_steps % N_COARSE_CODEBOOKS == 0
+ x_semantic = np.hstack([x_semantic_history, x_semantic]).astype(np.int32)
+ x_coarse = x_coarse_history.astype(np.int32)
+ base_semantic_idx = len(x_semantic_history)
+ cumulative_time = 0
+ with _inference_mode():
+ try:
+ # x_semantic_in = torch.from_numpy(x_semantic)[None].to(dml)
+ x_semantic_in_np = x_semantic[None]
+ # x_coarse_in = torch.from_numpy(x_coarse)[None].to(dml)
+ x_coarse_in_np = x_coarse[None]
+ n_window_steps = int(np.ceil(n_steps / sliding_window_len))
+ n_step = 0
+ for _ in tqdm.tqdm(range(n_window_steps), total=n_window_steps, disable=silent):
+ semantic_idx = base_semantic_idx + int(round(n_step / semantic_to_coarse_ratio))
+ # pad from right side
+ x_in_np = x_semantic_in_np[:, np.max([0, semantic_idx - max_semantic_history]) :]
+ x_in_np = x_in_np[:, :256]
+ """
+ x_in_np = F.pad(
+ x_in_np,
+ (0, 256 - x_in_np.shape[-1]),
+ "constant",
+ COARSE_SEMANTIC_PAD_TOKEN,
+ )
+ """
+ np_pad_size = ((0, 0), (0, 256 - x_in_np.shape[-1]))
+ x_in_np = np.pad(
+ x_in_np,
+ np_pad_size,
+ constant_values=COARSE_SEMANTIC_PAD_TOKEN,
+ mode="constant",
+ )
+
+ """
+ x_in = torch.hstack(
+ [
+ x_in,
+ torch.tensor([COARSE_INFER_TOKEN])[None].to(dml),
+ x_coarse_in[:, -max_coarse_history:],
+ ]
+ )
+ """
+
+ coarse_infer_token_np = np.array([COARSE_INFER_TOKEN])[None]
+
+ x_in_np = np.hstack(
+ [
+ x_in_np,
+ coarse_infer_token_np,
+ x_coarse_in_np[:, -max_coarse_history:],
+ ]
+ )
+
+ kv_cache = None
+ for _ in range(sliding_window_len):
+ if n_step >= n_steps:
+ continue
+ is_major_step = n_step % N_COARSE_CODEBOOKS == 0
+
+ if use_kv_caching and kv_cache is not None:
+ x_input = x_in_np[:, [-1]]
+ else:
+ x_input = x_in_np
+
+ x_input_tensor = torch.from_numpy(x_input).to(dml)
+
+ logits, kv_cache = model(
+ x_input_tensor, use_cache=use_kv_caching, past_kv=kv_cache
+ )
+
+ logit_start_idx = SEMANTIC_VOCAB_SIZE + (1 - int(is_major_step)) * CODEBOOK_SIZE
+ logit_end_idx = SEMANTIC_VOCAB_SIZE + (2 - int(is_major_step)) * CODEBOOK_SIZE
+ relevant_logits = logits[0, 0, logit_start_idx:logit_end_idx]
+
+ if top_p is not None:
+ # faster to convert to numpy
+ # original_device = relevant_logits.device
+ relevant_logits = relevant_logits.detach().cpu().type(torch.float32).numpy()
+ sorted_indices = np.argsort(relevant_logits)[::-1]
+ sorted_logits = relevant_logits[sorted_indices]
+ cumulative_probs = np.cumsum(softmax(sorted_logits))
+ sorted_indices_to_remove = cumulative_probs > top_p
+ sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].copy()
+ sorted_indices_to_remove[0] = False
+ relevant_logits[sorted_indices[sorted_indices_to_remove]] = -np.inf
+ relevant_logits = torch.from_numpy(relevant_logits)
+ # relevant_logits = relevant_logits.to(original_device)
+ # stay as numpy, since we converted for directml anyway...
+ if top_k is not None:
+ v, _ = torch.topk(
+ relevant_logits.to(dml),
+ min(top_k, relevant_logits.to(dml).size(-1)),
+ )
+ relevant_logits[relevant_logits < v[-1]] = -float("Inf")
+
+ # probs = F.softmax(relevant_logits.to(dml) / temp, dim=-1)
+
+ start_time = time.time()
+
+ # item_next = torch.multinomial(probs, num_samples=1).to(torch.int32)
+
+ probs_np = (
+ F.softmax(relevant_logits.to(dml) / temp, dim=-1)
+ .cpu()
+ .type(torch.float32)
+ .numpy()
+ )
+
+ item_next_np = np.random.choice(
+ np.arange(probs_np.shape[-1]), size=1, p=probs_np.flatten()
+ )
+
+ # item_next = torch.from_numpy(item_next_np).to(torch.int32).to(dml)
+
+ # doing this in raw numpy is the same speed with AMD DirectML, but maybe faster if you set up MKL correctly?
+ # actually that wasn't quite right anyway...
+ end_time = time.time()
+ cumulative_time = cumulative_time + (end_time - start_time)
+
+ # amd_multinomial = torch_distributions.Categorical(probs)
+ # action = amd_multinomial.sample((1,))
+ # item_next = amd_multinomial.log_prob(action).to(torch.int32)
+
+ # multinomial bugged on mps: shuttle to cpu if necessary
+ # inf_device = probs.device
+ # if probs.device.type == "mps" or True:
+ # probs = probs.to("cpu")
+ # # print(f"Here in coarse: {probs.device}")
+ # item_next = torch.multinomial(probs, num_samples=1)
+ # probs = probs.to(inf_device)
+ # item_next = item_next.to(inf_device)
+
+ item_next_np += logit_start_idx
+
+ x_coarse_in_np = np.hstack((x_coarse_in_np, item_next_np[None]))
+
+ # x_coarse_in = torch.from_numpy(x_coarse_in_np).to(dml)
+ # x_in = torch.cat((x_in_np.to(dml), item_next_np[None]), dim=1)
+
+ x_in_np = np.hstack((x_in_np, item_next_np[None]))
+ del logits, relevant_logits, probs_np, item_next_np
+ n_step += 1
+ del x_in_np
+ del x_semantic_in_np
+ except RuntimeError as e:
+ print(f"RuntimeError: {e}")
+ # show all possible details and the traceback, print to output
+ print(f"Traceback: {traceback.format_exc()}") # and print(sys.exc_info()[2])
+ print(f"Exception: {sys.exc_info()[2]}")
+
+ if OFFLOAD_CPU:
+ model.to("cpu")
+ gen_coarse_arr = x_coarse_in_np.squeeze()[len(x_coarse_history) :]
+ del x_coarse_in_np
+ assert len(gen_coarse_arr) == n_steps
+ gen_coarse_audio_arr = gen_coarse_arr.reshape(-1, N_COARSE_CODEBOOKS).T - SEMANTIC_VOCAB_SIZE
+ for n in range(1, N_COARSE_CODEBOOKS):
+ gen_coarse_audio_arr[n, :] -= n * CODEBOOK_SIZE
+ _clear_cuda_cache()
+ if SUNO_USE_DIRECTML is True:
+ clean_models()
+ return gen_coarse_audio_arr
+
+
+def generate_fine(
+ x_coarse_gen,
+ history_prompt=None,
+ temp=0.5,
+ silent=True,
+):
+ """Generate full audio codes from coarse audio codes."""
+ if temp == 0:
+ temp = 0.001
+
+ assert (
+ isinstance(x_coarse_gen, np.ndarray)
+ and len(x_coarse_gen.shape) == 2
+ and 1 <= x_coarse_gen.shape[0] <= N_FINE_CODEBOOKS - 1
+ and x_coarse_gen.shape[1] > 0
+ and x_coarse_gen.min() >= 0
+ and x_coarse_gen.max() <= CODEBOOK_SIZE - 1
+ )
+ if history_prompt is not None:
+ history_prompt = _load_history_prompt(history_prompt)
+ x_fine_history = history_prompt["fine_prompt"]
+ assert (
+ isinstance(x_fine_history, np.ndarray)
+ and len(x_fine_history.shape) == 2
+ and x_fine_history.shape[0] == N_FINE_CODEBOOKS
+ and x_fine_history.shape[1] >= 0
+ and x_fine_history.min() >= 0
+ and x_fine_history.max() <= CODEBOOK_SIZE - 1
+ )
+ else:
+ x_fine_history = None
+ n_coarse = x_coarse_gen.shape[0]
+ # load models if not yet exist
+ global models
+ global models_devices
+ if "fine" not in models:
+ if SUNO_USE_DIRECTML is True:
+ preload_models(load_one_model_type="fine")
+ else:
+ preload_models()
+ model = models["fine"]
+ if OFFLOAD_CPU:
+ if GLOBAL_ENABLE_MPS:
+ device = _grab_best_device(use_gpu=False)
+ models_devices["fine"] = device
+ model.to(models_devices["fine"])
+ device = next(model.parameters()).device
+ # make input arr
+ in_arr = np.vstack(
+ [
+ x_coarse_gen,
+ np.zeros((N_FINE_CODEBOOKS - n_coarse, x_coarse_gen.shape[1]))
+ + CODEBOOK_SIZE, # padding
+ ]
+ ).astype(np.int32)
+ # prepend history if available (max 512)
+ if x_fine_history is not None:
+ x_fine_history = x_fine_history.astype(np.int32)
+ in_arr = np.hstack(
+ [
+ x_fine_history[:, -512:].astype(np.int32),
+ in_arr,
+ ]
+ )
+ n_history = x_fine_history[:, -512:].shape[1]
+ else:
+ n_history = 0
+ n_remove_from_end = 0
+ # need to pad if too short (since non-causal model)
+ if in_arr.shape[1] < 1024:
+ n_remove_from_end = 1024 - in_arr.shape[1]
+ in_arr = np.hstack(
+ [
+ in_arr,
+ np.zeros((N_FINE_CODEBOOKS, n_remove_from_end), dtype=np.int32) + CODEBOOK_SIZE,
+ ]
+ )
+ # we can be lazy about fractional loop and just keep overwriting codebooks
+ n_loops = np.max([0, int(np.ceil((x_coarse_gen.shape[1] - (1024 - n_history)) / 512))]) + 1
+ with _inference_mode():
+ if SUNO_USE_DIRECTML is True:
+ device = dml
+ in_arr = torch.tensor(in_arr.T).to(device)
+ for n in tqdm.tqdm(range(n_loops), disable=silent):
+ start_idx = np.min([n * 512, in_arr.shape[0] - 1024])
+ start_fill_idx = np.min([n_history + n * 512, in_arr.shape[0] - 512])
+ rel_start_fill_idx = start_fill_idx - start_idx
+ in_buffer = in_arr[start_idx : start_idx + 1024, :][None]
+ for nn in range(n_coarse, N_FINE_CODEBOOKS):
+ logits = model(nn, in_buffer)
+ if temp is None:
+ relevant_logits = logits[0, rel_start_fill_idx:, :CODEBOOK_SIZE]
+ codebook_preds = torch.argmax(relevant_logits, -1)
+ else:
+ relevant_logits = logits[0, :, :CODEBOOK_SIZE] / temp
+ probs = F.softmax(relevant_logits, dim=-1)
+ codebook_preds = torch.multinomial(
+ probs[rel_start_fill_idx:1024], num_samples=1
+ ).reshape(-1)
+ codebook_preds = codebook_preds.to(torch.int32)
+ in_buffer[0, rel_start_fill_idx:, nn] = codebook_preds
+ del logits, codebook_preds
+ # transfer over info into model_in and convert to numpy
+ for nn in range(n_coarse, N_FINE_CODEBOOKS):
+ in_arr[
+ start_fill_idx : start_fill_idx + (1024 - rel_start_fill_idx), nn
+ ] = in_buffer[0, rel_start_fill_idx:, nn]
+ del in_buffer
+ gen_fine_arr = in_arr.detach().cpu().numpy().squeeze().T
+ del in_arr
+ if OFFLOAD_CPU:
+ model.to("cpu")
+ gen_fine_arr = gen_fine_arr[:, n_history:]
+ if n_remove_from_end > 0:
+ gen_fine_arr = gen_fine_arr[:, :-n_remove_from_end]
+ assert gen_fine_arr.shape[-1] == x_coarse_gen.shape[-1]
+ _clear_cuda_cache()
+ if SUNO_USE_DIRECTML is True:
+ clean_models()
+ return gen_fine_arr
+
+
+def _flatten_codebooks(arr, offset_size=CODEBOOK_SIZE):
+ assert len(arr.shape) == 2
+ arr = arr.copy()
+ if offset_size is not None:
+ for n in range(1, arr.shape[0]):
+ arr[n, :] += offset_size * n
+ flat_arr = arr.ravel("F")
+ return flat_arr
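+
+
+ # Tiny worked example for _flatten_codebooks: each codebook row gets its own offset, then
+ # rows are interleaved time-major (Fortran-order ravel). Values are toy numbers.
+ def _flatten_codebooks_example():
+     toy = np.array([[1, 2, 3],
+                     [4, 5, 6]])
+     # with offset_size=10 the second row becomes [14, 15, 16], and ravel("F") interleaves
+     # column by column, giving [1, 14, 2, 15, 3, 16]
+     return _flatten_codebooks(toy, offset_size=10)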
+
+
+COARSE_SEMANTIC_PAD_TOKEN = 12_048
+COARSE_INFER_TOKEN = 12_050
+
+
+def codec_decode(fine_tokens):
+ """Turn quantized audio codes into audio array using encodec."""
+ # load models if not yet exist
+ global models
+ global models_devices
+ if "codec" not in models:
+ if SUNO_USE_DIRECTML is True:
+ preload_models(load_one_model_type="codec")
+ else:
+ preload_models()
+ model = models["codec"]
+ if OFFLOAD_CPU:
+ if GLOBAL_ENABLE_MPS:
+ device = _grab_best_device(use_gpu=False)
+ models_devices["codec"] = device
+ model.to(models_devices["codec"])
+ device = next(model.parameters()).device
+ arr = torch.from_numpy(fine_tokens)[None]
+ if SUNO_USE_DIRECTML is True:
+ arr = arr.to(dml)
+ else:
+ arr = arr.to(device)
+ arr = arr.transpose(0, 1)
+ emb = model.quantizer.decode(arr)
+ out = model.decoder(emb)
+ audio_arr = out.detach().cpu().numpy().squeeze()
+ del arr, emb, out
+ if OFFLOAD_CPU:
+ model.to("cpu")
+ if SUNO_USE_DIRECTML is True:
+ clean_models()
+ return audio_arr
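+
+
+ # End-to-end sketch of the four stages above (semantic -> coarse -> fine -> waveform),
+ # assuming only the functions defined in this file. Parameter values are illustrative.
+ def _text_to_audio_sketch(text, history_prompt=None):
+     semantic_tokens = generate_text_semantic(text, history_prompt=history_prompt, temp=0.7)
+     coarse_tokens = generate_coarse(semantic_tokens, history_prompt=history_prompt, temp=0.7)
+     fine_tokens = generate_fine(coarse_tokens, history_prompt=history_prompt, temp=0.5)
+     audio_arr = codec_decode(fine_tokens)  # float waveform at SAMPLE_RATE (24 kHz)
+     return audio_arr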
+
+
+## Added:
+
+
+# Just overriding this because somehow I keep loading the wrong models?
+def load_model(use_gpu=True, use_small=False, force_reload=False, model_type="text"):
+ logger.debug(locals())
+
+ _load_model_f = funcy.partial(_load_model, model_type=model_type, use_small=use_small)
+ if model_type not in ("text", "coarse", "fine"):
+ raise NotImplementedError()
+ global models
+ global models_devices
+ device = _grab_best_device(use_gpu=use_gpu)
+ model_key = f"{model_type}"
+ if OFFLOAD_CPU:
+ models_devices[model_key] = device
+ device = "cpu"
+ if model_key not in models or force_reload:
+ ckpt_path = _get_ckpt_path(model_type, use_small=use_small)
+ clean_models(model_key=model_key)
+ model = _load_model_f(ckpt_path, device)
+ models[model_key] = model
+ if model_type == "text":
+ if SUNO_USE_DIRECTML is True:
+ models[model_key]["model"].to(dml)
+ else:
+ models[model_key]["model"].to(device)
+ else:
+ if SUNO_USE_DIRECTML is True:
+ models[model_key].to(dml)
+ else:
+ models[model_key].to(device)
+ logger.debug(f"Loaded {model_key} onto {device}.")
+ return models[model_key]
+
+
+def print_loading_info(model_key, ckpt_path, device):
+ device_str = str(device)
+ if SUNO_USE_DIRECTML is True:
+ device_str = "directml (partial AMD GPU support)"
+ if GLOBAL_ENABLE_MPS:
+ device_str = "cpu/mps: Partial Apple Support"
+ if OFFLOAD_CPU:
+ device_str = "cpu/gpu: Offloading, cpu until needed, then gpu"
+
+ print(f"--Loading {model_key} model from {ckpt_path} to {device_str}")
+
+
+def _load_model(ckpt_path, device, use_small=False, model_type="text"):
+ if model_type == "text":
+ ConfigClass = GPTConfig
+ ModelClass = GPT
+ elif model_type == "coarse":
+ ConfigClass = GPTConfig
+ ModelClass = GPT
+ elif model_type == "fine":
+ ConfigClass = FineGPTConfig
+ ModelClass = FineGPT
+ else:
+ raise NotImplementedError()
+ model_key = f"{model_type}_small" if use_small or USE_SMALL_MODELS else model_type
+ model_info = REMOTE_MODEL_PATHS[model_key]
+ if not os.path.exists(ckpt_path):
+ logger.info(f"{model_type} model not found, downloading into `{CACHE_DIR}`.")
+
+ remote_filename = hf_hub_url(model_info["repo_id"], model_info["file_name"])
+ print(
+ f"Downloading {model_key} {model_info['repo_id']} remote model file {remote_filename} {model_info['file_name']} to {CACHE_DIR}"
+ ) # added
+ _download(model_info["repo_id"], model_info["file_name"])
+
+ print_loading_info(model_key, ckpt_path, device)
+
+ # If I try to load straight to DML, I get a strange error, so we do it in two steps.
+ checkpoint = torch.load(ckpt_path, map_location=device)
+
+ # this is a hack
+ model_args = checkpoint["model_args"]
+ if "input_vocab_size" not in model_args:
+ model_args["input_vocab_size"] = model_args["vocab_size"]
+ model_args["output_vocab_size"] = model_args["vocab_size"]
+ del model_args["vocab_size"]
+ gptconf = ConfigClass(**checkpoint["model_args"])
+ model = ModelClass(gptconf)
+
+ if SUNO_HALF_PRECISION:
+ model = model.half()
+ elif SUNO_HALF_BFLOAT16:
+ model.bfloat16()
+
+ state_dict = checkpoint["model"]
+ # fixup checkpoint
+ unwanted_prefix = "_orig_mod."
+ for k, v in list(state_dict.items()):
+ if k.startswith(unwanted_prefix):
+ state_dict[k[len(unwanted_prefix) :]] = state_dict.pop(k)
+ extra_keys = set(state_dict.keys()) - set(model.state_dict().keys())
+ extra_keys = set([k for k in extra_keys if not k.endswith(".attn.bias")])
+ missing_keys = set(model.state_dict().keys()) - set(state_dict.keys())
+ missing_keys = set([k for k in missing_keys if not k.endswith(".attn.bias")])
+ if len(extra_keys) != 0:
+ raise ValueError(f"extra keys found: {extra_keys}")
+ if len(missing_keys) != 0:
+ raise ValueError(f"missing keys: {missing_keys}")
+ model.load_state_dict(state_dict, strict=False)
+ n_params = model.get_num_params()
+ val_loss = checkpoint["best_val_loss"].item()
+ logger.info(f"model loaded: {round(n_params/1e6,1)}M params, {round(val_loss,3)} loss")
+ model.eval()
+ if SUNO_USE_DIRECTML is True:
+ model.to(dml)
+ else:
+ model.to(device)
+ # del checkpoint, state_dict
+ del checkpoint, state_dict, model_args, val_loss
+ _clear_cuda_cache()
+ if model_type == "text":
+ tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
+
+ return {
+ "model": model,
+ "tokenizer": tokenizer,
+ }
+ return model
+
+
+def preload_models(
+ text_use_gpu=True,
+ text_use_small=False,
+ coarse_use_gpu=True,
+ coarse_use_small=False,
+ fine_use_gpu=True,
+ fine_use_small=False,
+ codec_use_gpu=True,
+ force_reload=False,
+ load_one_model_type=None,
+):
+ """Load all the necessary models for the pipeline."""
+
+ if SUNO_USE_DIRECTML is True:
+ text_use_gpu = False
+ coarse_use_gpu = False
+ fine_use_gpu = False
+
+ # What is going on here
+ logger.debug(
+ f"USE_SMALL_MODELS = {USE_SMALL_MODELS} GLOBAL_ENABLE_MPS = {GLOBAL_ENABLE_MPS}, OFFLOAD_CPU = {OFFLOAD_CPU}"
+ )
+ logger.debug(
+ f"text_use_gpu = {text_use_gpu}, text_use_small = {text_use_small}, coarse_use_gpu = {coarse_use_gpu}, coarse_use_small = {coarse_use_small}, fine_use_gpu = {fine_use_gpu}, fine_use_small = {fine_use_small}, codec_use_gpu = {codec_use_gpu}, force_reload = {force_reload}"
+ )
+
+ if USE_SMALL_MODELS:
+ text_use_small = True
+ coarse_use_small = True
+ fine_use_small = True
+
+ if _grab_best_device() == "cpu" and (
+ text_use_gpu or coarse_use_gpu or fine_use_gpu or codec_use_gpu
+ ):
+ warning_string = " -->No GPU being used. Careful, inference might be very slow!"
+
+ if SUNO_USE_DIRECTML is True:
+ warning_string = "-->GPU using DirectML (partial AMD GPU support)"
+ if GLOBAL_ENABLE_MPS:
+ warning_string = "-->cpu/mps: Partial Apple Support"
+
+ # logger.warning(warning_string)
+ print(f"{warning_string}")
+
+ if load_one_model_type is not None:
+ if load_one_model_type == "text":
+ _ = load_model(
+ model_type="text",
+ use_gpu=text_use_gpu,
+ use_small=text_use_small,
+ force_reload=force_reload,
+ )
+ elif load_one_model_type == "coarse":
+ _ = load_model(
+ model_type="coarse",
+ use_gpu=coarse_use_gpu,
+ use_small=coarse_use_small,
+ force_reload=force_reload,
+ )
+ elif load_one_model_type == "fine":
+ _ = load_model(
+ model_type="fine",
+ use_gpu=fine_use_gpu,
+ use_small=fine_use_small,
+ force_reload=force_reload,
+ )
+ elif load_one_model_type == "codec":
+ _ = load_codec_model(use_gpu=codec_use_gpu, force_reload=force_reload)
+ else:
+ _ = load_model(
+ model_type="text",
+ use_gpu=text_use_gpu,
+ use_small=text_use_small,
+ force_reload=force_reload,
+ )
+ _ = load_model(
+ model_type="coarse",
+ use_gpu=coarse_use_gpu,
+ use_small=coarse_use_small,
+ force_reload=force_reload,
+ )
+ _ = load_model(
+ model_type="fine",
+ use_gpu=fine_use_gpu,
+ use_small=fine_use_small,
+ force_reload=force_reload,
+ )
+ _ = load_codec_model(use_gpu=codec_use_gpu, force_reload=force_reload)
diff --git a/bark_infinity/generation_sampling_mess.py b/bark_infinity/generation_sampling_mess.py
new file mode 100644
index 0000000000000000000000000000000000000000..c88c3a42dfdcbbd61f275af7b8a13ee57980af43
--- /dev/null
+++ b/bark_infinity/generation_sampling_mess.py
@@ -0,0 +1,1262 @@
+import contextlib
+import gc
+import os
+import re
+
+import random
+from encodec import EncodecModel
+import funcy
+import numpy as np
+from scipy.special import softmax
+import torch
+import torch.nn.functional as F
+import tqdm
+from transformers import BertTokenizer
+from huggingface_hub import hf_hub_download
+
+from .model import GPTConfig, GPT
+from .model_fine import FineGPT, FineGPTConfig
+
+
+from rich.pretty import pprint
+
+from .config import logger
+
+from huggingface_hub import hf_hub_url
+from collections import Counter
+if (
+ torch.cuda.is_available() and
+ hasattr(torch.cuda, "amp") and
+ hasattr(torch.cuda.amp, "autocast") and
+ hasattr(torch.cuda, "is_bf16_supported") and
+ torch.cuda.is_bf16_supported()
+):
+ autocast = funcy.partial(torch.cuda.amp.autocast, dtype=torch.bfloat16)
+else:
+ @contextlib.contextmanager
+ def autocast():
+ yield
+
+
+# hold models in global scope to lazy load
+global models
+models = {}
+
+global models_devices
+models_devices = {}
+
+
+CONTEXT_WINDOW_SIZE = 1024
+
+SEMANTIC_RATE_HZ = 49.9
+SEMANTIC_VOCAB_SIZE = 10_000
+
+CODEBOOK_SIZE = 1024
+N_COARSE_CODEBOOKS = 2
+N_FINE_CODEBOOKS = 8
+COARSE_RATE_HZ = 75
+
+SAMPLE_RATE = 24_000
+
+
+SUPPORTED_LANGS = [
+ ("English", "en"),
+ ("German", "de"),
+ ("Spanish", "es"),
+ ("French", "fr"),
+ ("Hindi", "hi"),
+ ("Italian", "it"),
+ ("Japanese", "ja"),
+ ("Korean", "ko"),
+ ("Polish", "pl"),
+ ("Portuguese", "pt"),
+ ("Russian", "ru"),
+ ("Turkish", "tr"),
+ ("Chinese", "zh"),
+]
+
+ALLOWED_PROMPTS = {"announcer"}
+for _, lang in SUPPORTED_LANGS:
+ for prefix in ("", f"v2{os.path.sep}"):
+ for n in range(10):
+ ALLOWED_PROMPTS.add(f"{prefix}{lang}_speaker_{n}")
+
+
+
+
+CUR_PATH = os.path.dirname(os.path.abspath(__file__))
+
+
+default_cache_dir = os.path.join(os.path.expanduser("~"), ".cache")
+CACHE_DIR = os.path.join(os.getenv("XDG_CACHE_HOME", default_cache_dir), "suno", "bark_v0")
+
+
+USE_SMALL_MODELS = os.environ.get("SUNO_USE_SMALL_MODELS", False)
+GLOBAL_ENABLE_MPS = os.environ.get("SUNO_ENABLE_MPS", False)
+OFFLOAD_CPU = os.environ.get("SUNO_OFFLOAD_CPU", False)
+
+
+
+REMOTE_MODEL_PATHS = {
+ "text_small": {
+ "repo_id": "suno/bark",
+ "file_name": "text.pt",
+ },
+ "coarse_small": {
+ "repo_id": "suno/bark",
+ "file_name": "coarse.pt",
+ },
+ "fine_small": {
+ "repo_id": "suno/bark",
+ "file_name": "fine.pt",
+ },
+ "text": {
+ "repo_id": "suno/bark",
+ "file_name": "text_2.pt",
+ },
+ "coarse": {
+ "repo_id": "suno/bark",
+ "file_name": "coarse_2.pt",
+ },
+ "fine": {
+ "repo_id": "suno/bark",
+ "file_name": "fine_2.pt",
+ },
+}
+
+
+if not hasattr(torch.nn.functional, 'scaled_dot_product_attention') and torch.cuda.is_available():
+ logger.warning(
+ "torch version does not support flash attention. You will get faster" +
+ " inference speed by upgrade torch to newest nightly version."
+ )
+
+
+def _grab_best_device(use_gpu=True):
+ if torch.cuda.device_count() > 0 and use_gpu:
+ device = "cuda"
+ elif torch.backends.mps.is_available() and use_gpu and GLOBAL_ENABLE_MPS:
+ device = "mps"
+ else:
+ device = "cpu"
+ return device
+
+
+def _get_ckpt_path(model_type, use_small=False):
+ key = model_type
+ if use_small:
+ key += "_small"
+ return os.path.join(CACHE_DIR, REMOTE_MODEL_PATHS[key]["file_name"])
+
+
+def _download(from_hf_path, file_name):
+ os.makedirs(CACHE_DIR, exist_ok=True)
+ hf_hub_download(repo_id=from_hf_path, filename=file_name, local_dir=CACHE_DIR)
+
+
+class InferenceContext:
+ def __init__(self, benchmark=False):
+ # we can't expect inputs to be the same length, so disable benchmarking by default
+ self._chosen_cudnn_benchmark = benchmark
+ self._cudnn_benchmark = None
+
+ def __enter__(self):
+ self._cudnn_benchmark = torch.backends.cudnn.benchmark
+ torch.backends.cudnn.benchmark = self._chosen_cudnn_benchmark
+
+ def __exit__(self, exc_type, exc_value, exc_traceback):
+ torch.backends.cudnn.benchmark = self._cudnn_benchmark
+
+
+if torch.cuda.is_available():
+ torch.backends.cuda.matmul.allow_tf32 = True
+ torch.backends.cudnn.allow_tf32 = True
+
+
+@contextlib.contextmanager
+def _inference_mode():
+ with InferenceContext(), torch.inference_mode(), torch.no_grad(), autocast():
+ yield
+
+
+def _clear_cuda_cache():
+ if torch.cuda.is_available():
+ torch.cuda.empty_cache()
+ torch.cuda.synchronize()
+
+
+def clean_models(model_key=None):
+ global models
+ model_keys = [model_key] if model_key is not None else models.keys()
+ for k in model_keys:
+ if k in models:
+ del models[k]
+ _clear_cuda_cache()
+ gc.collect()
+
+
+# def _load_model(ckpt_path, device, use_small=False, model_type="text"):
+
+
+
+def _load_codec_model(device):
+ model = EncodecModel.encodec_model_24khz()
+ model.set_target_bandwidth(6.0)
+ model.eval()
+ model.to(device)
+ _clear_cuda_cache()
+ return model
+
+
+
+
+
+def load_codec_model(use_gpu=True, force_reload=False):
+ global models
+ global models_devices
+ device = _grab_best_device(use_gpu=use_gpu)
+ if device == "mps":
+ # encodec doesn't support mps
+ device = "cpu"
+ model_key = "codec"
+ if OFFLOAD_CPU:
+ models_devices[model_key] = device
+ device = "cpu"
+ if model_key not in models or force_reload:
+ clean_models(model_key=model_key)
+ model = _load_codec_model(device)
+ models[model_key] = model
+ models[model_key].to(device)
+ return models[model_key]
+
+"""
+def preload_models(
+ text_use_gpu=True,
+ text_use_small=False,
+ coarse_use_gpu=True,
+ coarse_use_small=False,
+ fine_use_gpu=True,
+ fine_use_small=False,
+ codec_use_gpu=True,
+ force_reload=False,
+):
+"""
+
+####
+# Generation Functionality
+####
+
+
+def _tokenize(tokenizer, text):
+ return tokenizer.encode(text, add_special_tokens=False)
+
+
+def _detokenize(tokenizer, enc_text):
+ return tokenizer.decode(enc_text)
+
+
+def _normalize_whitespace(text):
+ return re.sub(r"\s+", " ", text).strip()
+
+
+TEXT_ENCODING_OFFSET = 10_048
+SEMANTIC_PAD_TOKEN = 10_000
+TEXT_PAD_TOKEN = 129_595
+SEMANTIC_INFER_TOKEN = 129_599
+
+
+def _load_history_prompt(history_prompt_input):
+ if isinstance(history_prompt_input, str) and history_prompt_input.endswith(".npz"):
+ history_prompt = np.load(history_prompt_input)
+ elif isinstance(history_prompt_input, str):
+ # make sure this works on non-ubuntu
+ history_prompt_input = os.path.join(*history_prompt_input.split("/"))
+ if history_prompt_input not in ALLOWED_PROMPTS:
+ raise ValueError("history prompt not found")
+ history_prompt = np.load(
+ os.path.join(CUR_PATH, "assets", "prompts", f"{history_prompt_input}.npz")
+ )
+ elif isinstance(history_prompt_input, dict):
+ assert("semantic_prompt" in history_prompt_input)
+ assert("coarse_prompt" in history_prompt_input)
+ assert("fine_prompt" in history_prompt_input)
+ history_prompt = history_prompt_input
+ else:
+ raise ValueError("history prompt format unrecognized")
+ return history_prompt
+# removed semantic_history_oversize_limit because merging
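+
+
+ # Sketch of the dict form accepted by _load_history_prompt: the three arrays mirror what a
+ # speaker .npz contains. The shapes below are placeholders chosen so the usual coarse/semantic
+ # length ratio (~1.5 coarse frames per semantic token, per codebook) holds; not a real voice.
+ def _history_prompt_dict_sketch():
+     return {
+         "semantic_prompt": np.zeros(100, dtype=np.int64),
+         "coarse_prompt": np.zeros((N_COARSE_CODEBOOKS, 150), dtype=np.int64),
+         "fine_prompt": np.zeros((N_FINE_CODEBOOKS, 150), dtype=np.int64),
+     }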
+
+def compute_log_probs(token_list, smoothing_factor, scaling_factor):
+ # Count the frequency of each token.
+ token_freq = Counter(token_list)
+
+ # Add a smoothing factor.
+ smoothed_token_freq = {token: freq + smoothing_factor for token, freq in token_freq.items()}
+
+ # Normalize to create a probability distribution.
+ total_tokens = len(token_list) + smoothing_factor * len(smoothed_token_freq)
+ token_probs = {token: freq / total_tokens for token, freq in smoothed_token_freq.items()}
+
+ # Transform into scaled log-probabilities.
+ log_probs = {token: scaling_factor * np.log(prob) for token, prob in token_probs.items()}
+
+ return log_probs
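+
+
+ # Quick illustration of compute_log_probs on a toy token stream: add-k smoothing of the
+ # observed counts, normalization, then scaling of the log-probabilities.
+ def _compute_log_probs_example():
+     toy_tokens = [7, 7, 7, 42]
+     # token 7 maps to 0.5 * log((3 + 0.5) / (4 + 0.5 * 2)); token 42 to 0.5 * log((1 + 0.5) / 5)
+     return compute_log_probs(toy_tokens, smoothing_factor=0.5, scaling_factor=0.5)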
+
+
+
+
+def generate_text_semantic(
+ text,
+ history_prompt=None,
+ temp=0.7,
+ top_k=None,
+ top_p=None,
+ silent=False,
+ min_eos_p=0.2,
+ max_gen_duration_s=None,
+ allow_early_stop=True,
+ use_kv_caching=False,
+ history_prompt_magic=None,
+ history_prompt_magic_text=None, # removed just do patch
+
+):
+ """Generate semantic tokens from text."""
+
+
+ logger.debug(locals())
+ assert isinstance(text, str)
+ text = _normalize_whitespace(text)
+ assert len(text.strip()) > 0
+ if history_prompt is not None:
+ history_prompt = _load_history_prompt(history_prompt)
+ semantic_history = history_prompt["semantic_prompt"]
+ assert (
+ isinstance(semantic_history, np.ndarray)
+ and len(semantic_history.shape) == 1
+ and len(semantic_history) > 0
+ and semantic_history.min() >= 0
+ and semantic_history.max() <= SEMANTIC_VOCAB_SIZE - 1
+ )
+ else:
+ semantic_history = None
+
+ if history_prompt_magic is not None:
+ assert (
+ isinstance(history_prompt_magic, np.ndarray)
+ and len(history_prompt_magic.shape) == 1
+ and len(history_prompt_magic) > 0
+ and history_prompt_magic.min() >= 0
+ and history_prompt_magic.max() <= SEMANTIC_VOCAB_SIZE - 1
+ )
+ else:
+ history_prompt_magic = None
+ # load models if not yet exist
+ global models
+ global models_devices
+ if "text" not in models:
+ preload_models()
+ model_container = models["text"]
+ model = model_container["model"]
+ tokenizer = model_container["tokenizer"]
+ encoded_text = np.array(_tokenize(tokenizer, text)) + TEXT_ENCODING_OFFSET
+ if OFFLOAD_CPU:
+ model.to(models_devices["text"])
+ device = next(model.parameters()).device
+ if len(encoded_text) > 256:
+ p = round((len(encoded_text) - 256) / len(encoded_text) * 100, 1)
+ logger.warning(f"warning, text too long, lopping of last {p}%")
+ encoded_text = encoded_text[:256]
+ encoded_text = np.pad(
+ encoded_text,
+ (0, 256 - len(encoded_text)),
+ constant_values=TEXT_PAD_TOKEN,
+ mode="constant",
+ )
+ if semantic_history is not None:
+ semantic_history = semantic_history.astype(np.int64)
+ # lop off if history is too long, pad if needed
+ semantic_history = semantic_history[-256:]
+ semantic_history = np.pad(
+ semantic_history,
+ (0, 256 - len(semantic_history)),
+ constant_values=SEMANTIC_PAD_TOKEN,
+ mode="constant",
+ )
+ #print(f"Actual length of semantic history: {len(semantic_history)}")
+ else:
+ #print(f"No semantic history provided.")
+ semantic_history = np.array([SEMANTIC_PAD_TOKEN] * 256)
+
+
+
+ x = torch.from_numpy(
+ np.hstack([
+ encoded_text, semantic_history, np.array([SEMANTIC_INFER_TOKEN])
+ ]).astype(np.int64)
+ )[None]
+ assert x.shape[1] == 256 + 256 + 1
+ with _inference_mode():
+ x = x.to(device)
+ n_tot_steps = 768
+ # custom tqdm updates since we don't know when eos will occur
+ pbar = tqdm.tqdm(disable=silent, total=100)
+ pbar_state = 0
+ tot_generated_duration_s = 0
+ kv_cache = None
+ for n in range(n_tot_steps):
+ if use_kv_caching and kv_cache is not None:
+ x_input = x[:, [-1]]
+ else:
+ x_input = x
+ logits, kv_cache = model(
+ x_input, merge_context=True, use_cache=use_kv_caching, past_kv=kv_cache
+ )
+ relevant_logits = logits[0, 0, :SEMANTIC_VOCAB_SIZE]
+ if allow_early_stop:
+ relevant_logits = torch.hstack(
+ (relevant_logits, logits[0, 0, [SEMANTIC_PAD_TOKEN]]) # eos
+ )
+ if top_p is not None:
+ # faster to convert to numpy
+ logits_device = relevant_logits.device
+ logits_dtype = relevant_logits.type()
+ relevant_logits = relevant_logits.detach().cpu().type(torch.float32).numpy()
+ sorted_indices = np.argsort(relevant_logits)[::-1]
+ sorted_logits = relevant_logits[sorted_indices]
+ cumulative_probs = np.cumsum(softmax(sorted_logits))
+ sorted_indices_to_remove = cumulative_probs > top_p
+ sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].copy()
+ sorted_indices_to_remove[0] = False
+ relevant_logits[sorted_indices[sorted_indices_to_remove]] = -np.inf
+ relevant_logits = torch.from_numpy(relevant_logits)
+ relevant_logits = relevant_logits.to(logits_device).type(logits_dtype)
+ if top_k is not None:
+ v, _ = torch.topk(relevant_logits, min(top_k, relevant_logits.size(-1)))
+ relevant_logits[relevant_logits < v[-1]] = -float("Inf")
+ probs = F.softmax(relevant_logits / temp, dim=-1)
+ # multinomial bugged on mps: shuttle to cpu if necessary
+ inf_device = probs.device
+ if probs.device.type == "mps":
+ probs = probs.to("cpu")
+ item_next = torch.multinomial(probs, num_samples=1)
+ probs = probs.to(inf_device)
+ item_next = item_next.to(inf_device)
+ if allow_early_stop and (
+ item_next == SEMANTIC_VOCAB_SIZE
+ or (min_eos_p is not None and probs[-1] >= min_eos_p)
+ ):
+ # eos found, so break
+ pbar.update(100 - pbar_state)
+ break
+ x = torch.cat((x, item_next[None]), dim=1)
+ tot_generated_duration_s += 1 / SEMANTIC_RATE_HZ
+ if max_gen_duration_s is not None and tot_generated_duration_s > max_gen_duration_s:
+ pbar.update(100 - pbar_state)
+ break
+ if n == n_tot_steps - 1:
+ pbar.update(100 - pbar_state)
+ break
+ del logits, relevant_logits, probs, item_next
+ req_pbar_state = np.min([100, int(round(100 * n / n_tot_steps))])
+ if req_pbar_state > pbar_state:
+ pbar.update(req_pbar_state - pbar_state)
+ pbar_state = req_pbar_state
+ pbar.close()
+ out = x.detach().cpu().numpy().squeeze()[256 + 256 + 1 :]
+ if OFFLOAD_CPU:
+ model.to("cpu")
+ assert all(0 <= out) and all(out < SEMANTIC_VOCAB_SIZE)
+ _clear_cuda_cache()
+ return out
+
+
+
+#
+def generate_text_semantic_garbage_version(
+ text,
+ history_prompt=None,
+ temp=0.7,
+ top_k=None,
+ top_p=None,
+ silent=False,
+ min_eos_p=0.2,
+ max_gen_duration_s=None,
+ allow_early_stop=True,
+ use_kv_caching=False,
+ history_prompt_magic=None,
+ history_prompt_magic_text=None,
+ banned_tokens = None,
+ absolute_banned_tokens = None,
+ outside_banned_penalty = -100.0,
+ target_distribution = None,
+ target_k_smoothing_factor = 0.2,
+ target_scaling_factor = 0.5, # scale and weight are too correlated, better to find some other way to represent this
+
+ history_prompt_distribution = None,
+
+
+ history_prompt_k_smoothing_factor = 0.2,
+ history_prompt_scaling_factor = 0.5,
+
+
+ history_prompt_average_distribution = None,
+ history_prompt_average_k_smoothing_factor = 0.2,
+ history_prompt_average_scaling_factor = 0.5,
+
+ target_outside_default_penalty = -5.0, # default penalty for tokens outside target distribution
+ target_outside_outlier_penalty = -25.0, # rare or absent in speaker and target
+ history_prompt_unique_voice_penalty = -1.0, # if we think this is specific to the speaker, maybe this should actually be positive?
+
+ consider_common_threshold = 100 / 10001, # todo: no idea what a good value is here
+ history_prompt_unique_voice_threshold = 100 / 10001,
+
+):
+ """Generate semantic tokens from text."""
+
+
+
+ logger.debug(locals())
+ assert isinstance(text, str)
+ text = _normalize_whitespace(text)
+ #assert len(text.strip()) > 0
+ if history_prompt is not None:
+ history_prompt = _load_history_prompt(history_prompt)
+ semantic_history = history_prompt["semantic_prompt"]
+ assert (
+ isinstance(semantic_history, np.ndarray)
+ and len(semantic_history.shape) == 1
+ and len(semantic_history) > 0
+ and semantic_history.min() >= 0
+ and semantic_history.max() <= SEMANTIC_VOCAB_SIZE - 1
+ )
+
+ else:
+ semantic_history = None
+
+ if history_prompt_magic is not None:
+ assert (
+ isinstance(history_prompt_magic, np.ndarray)
+ and len(history_prompt_magic.shape) == 1
+ and len(history_prompt_magic) > 0
+ and history_prompt_magic.min() >= 0
+ and history_prompt_magic.max() <= SEMANTIC_VOCAB_SIZE - 1
+ )
+ else:
+ history_prompt_magic = None
+ # load models if not yet exist
+ global models
+ global models_devices
+ if "text" not in models:
+ preload_models()
+ model_container = models["text"]
+ model = model_container["model"]
+ tokenizer = model_container["tokenizer"]
+ encoded_text = np.array(_tokenize(tokenizer, text)) + TEXT_ENCODING_OFFSET
+ if OFFLOAD_CPU:
+ model.to(models_devices["text"])
+ device = next(model.parameters()).device
+ if len(encoded_text) > 256:
+ p = round((len(encoded_text) - 256) / len(encoded_text) * 100, 1)
+ logger.warning(f"warning, text too long, lopping of last {p}%")
+ encoded_text = encoded_text[:256]
+ encoded_text = np.pad(
+ encoded_text,
+ (0, 256 - len(encoded_text)),
+ constant_values=TEXT_PAD_TOKEN,
+ mode="constant",
+ )
+ if semantic_history is not None:
+ semantic_history = semantic_history.astype(np.int64)
+ # lop off if history is too long, pad if needed
+ semantic_history = semantic_history[-256:]
+
+ print(f"Semantic history Input Length pre 256 trim: {len(semantic_history)}")
+ semantic_history = np.pad(
+ semantic_history,
+ (0, 256 - len(semantic_history)),
+ constant_values=SEMANTIC_PAD_TOKEN,
+ mode="constant",
+ )
+
+ else:
+ print(f"No semantic history provided.")
+ semantic_history = np.array([SEMANTIC_PAD_TOKEN] * 256)
+
+
+
+ x = torch.from_numpy(
+ np.hstack([
+ encoded_text, semantic_history, np.array([SEMANTIC_INFER_TOKEN])
+ ]).astype(np.int64)
+ )[None]
+ assert x.shape[1] == 256 + 256 + 1
+
+
+ penalty_tensor = None
+ banned_tokens_tensor = None
+ # TODO: handle the non-history_prompt case, using either a single reference distribution, or a speaker reference plus a reference distribution
+
+
+
+
+
+
+ if target_distribution is not None and history_prompt is not None:
+ # TODO defaults chosen arbitrarily. try to find better values
+
+
+
+
+ history_prompt_distribution_log_probs = compute_log_probs(history_prompt_distribution, history_prompt_k_smoothing_factor, history_prompt_scaling_factor)
+ target_distribution_log_probs = compute_log_probs(target_distribution, target_k_smoothing_factor, target_scaling_factor)
+
+ if history_prompt_average_distribution is not None:
+
+ history_prompt_average_distribution_log_probs = compute_log_probs(history_prompt_average_distribution , history_prompt_average_k_smoothing_factor, history_prompt_average_scaling_factor )
+
+
+ history_prompt_uniqueness = {token: history_prompt_distribution_log_probs[token] - history_prompt_average_distribution_log_probs.get(token, 0) for token in history_prompt_distribution_log_probs.keys()}
+
+
+ penalty_tensor = torch.full((10001,), target_outside_default_penalty, device=device, dtype=torch.float32)
+
+ history_prompt_unique_voice_threshold_logn = np.log(history_prompt_unique_voice_threshold)
+
+ for token in range(10001):
+ history_prompt_prob = history_prompt_distribution_log_probs.get(token, None)
+ target_prob = target_distribution_log_probs.get(token, None)
+
+ if target_prob is not None:
+
+ penalty_tensor[token] = target_prob
+
+
+
+ # Okay, let's just back up and start removing things from this file; it doesn't seem like the quality is increasing.
+ # Let's get back to the simplest version that was still amazing.
+ """
+ if history_prompt_uniqueness[token] > history_prompt_unique_voice_threshold_logn:
+ # looks like a token unique to our speaker
+ penalty_tensor[token] = history_prompt_prob[token] + history_prompt_unique_voice_penalty
+ # maybe should also scale penalty by target frequency, but with scaling factor? gah too many options
+ else:
+ penalty_tensor[token] = target_prob
+
+
+ """
+
+ """
+ token_freq = Counter(target_distribution)
+
+ smoothed_token_freq = {token: freq + target_k_smoothing_factor for token, freq in token_freq.items()}
+
+ # Normalize
+ total_tokens = len(target_distribution) + target_k_smoothing_factor * len(smoothed_token_freq)
+ token_probs = {token: freq / total_tokens for token, freq in smoothed_token_freq.items()}
+
+
+ log_probs = {token: np.log(prob) for token, prob in token_probs.items()}
+ # are there some special bark tokens to exclude? seems to work fine without
+ #log_probs_tensor = torch.full((10001,), -np.inf, device=device, dtype=torch.float32)
+ log_probs_tensor = torch.full((10001,), target_outside_penalty, device=device, dtype=torch.float32)
+
+ for token, log_prob in log_probs.items():
+ log_probs_tensor[token] = target_scaling_factor * log_prob
+ """
+
+ with _inference_mode():
+ x = x.to(device)
+ n_tot_steps = 768
+ # custom tqdm updates since we don't know when eos will occur
+ pbar = tqdm.tqdm(disable=silent, total=100)
+ pbar_state = 0
+ tot_generated_duration_s = 0
+ kv_cache = None
+
+
+
+ for n in range(n_tot_steps):
+ if use_kv_caching and kv_cache is not None:
+ x_input = x[:, [-1]]
+ else:
+ x_input = x
+ logits, kv_cache = model(
+ x_input, merge_context=True, use_cache=use_kv_caching, past_kv=kv_cache
+ )
+ relevant_logits = logits[0, 0, :SEMANTIC_VOCAB_SIZE]
+ if allow_early_stop:
+ relevant_logits = torch.hstack(
+ (relevant_logits, logits[0, 0, [SEMANTIC_PAD_TOKEN]]) # eos
+ )
+ if top_p is not None:
+ # faster to convert to numpy
+ logits_device = relevant_logits.device
+ logits_dtype = relevant_logits.type()
+ relevant_logits = relevant_logits.detach().cpu().type(torch.float32).numpy()
+ sorted_indices = np.argsort(relevant_logits)[::-1]
+ sorted_logits = relevant_logits[sorted_indices]
+ cumulative_probs = np.cumsum(softmax(sorted_logits))
+ sorted_indices_to_remove = cumulative_probs > top_p
+ sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].copy()
+ sorted_indices_to_remove[0] = False
+ relevant_logits[sorted_indices[sorted_indices_to_remove]] = -np.inf
+ relevant_logits = torch.from_numpy(relevant_logits)
+ relevant_logits = relevant_logits.to(logits_device).type(logits_dtype)
+
+ if top_k is not None:
+ v, _ = torch.topk(relevant_logits, min(top_k, relevant_logits.size(-1)))
+ relevant_logits[relevant_logits < v[-1]] = -float("Inf")
+
+ # TODO: not yet banning the speaker's most unique tokens relative to the reference history class
+
+ if absolute_banned_tokens is not None:
+
+ banned_tokens_tensor = torch.tensor(absolute_banned_tokens, device=relevant_logits.device)
+ penalty_tensor = torch.full(banned_tokens_tensor.shape, -10000.0, device=relevant_logits.device, dtype=relevant_logits.dtype)
+ relevant_logits.index_add_(0, banned_tokens_tensor, penalty_tensor)
+
+ elif banned_tokens is not None:
+
+ banned_tokens_tensor = torch.tensor(banned_tokens, device=relevant_logits.device)
+ penalty_tensor = torch.full(banned_tokens_tensor.shape, outside_banned_penalty, device=relevant_logits.device, dtype=relevant_logits.dtype)
+ relevant_logits.index_add_(0, banned_tokens_tensor, penalty_tensor)
+
+
+ if penalty_tensor is not None and target_distribution is not None:
+ relevant_logits += penalty_tensor
+
+
+ probs = F.softmax(relevant_logits / temp, dim=-1)
+
+ # multinomial bugged on mps: shuttle to cpu if necessary
+ inf_device = probs.device
+ if probs.device.type == "mps":
+ probs = probs.to("cpu")
+ item_next = torch.multinomial(probs, num_samples=1)
+ probs = probs.to(inf_device)
+ item_next = item_next.to(inf_device)
+ if allow_early_stop and (
+ item_next == SEMANTIC_VOCAB_SIZE
+ or (min_eos_p is not None and probs[-1] >= min_eos_p)
+ ):
+ # eos found, so break
+ pbar.update(100 - pbar_state)
+ break
+ x = torch.cat((x, item_next[None]), dim=1)
+ tot_generated_duration_s += 1 / SEMANTIC_RATE_HZ
+ if max_gen_duration_s is not None and tot_generated_duration_s > max_gen_duration_s:
+ pbar.update(100 - pbar_state)
+ break
+ if n == n_tot_steps - 1:
+ pbar.update(100 - pbar_state)
+ break
+ del logits, relevant_logits, probs, item_next
+ req_pbar_state = np.min([100, int(round(100 * n / n_tot_steps))])
+ if req_pbar_state > pbar_state:
+ pbar.update(req_pbar_state - pbar_state)
+ pbar_state = req_pbar_state
+ pbar.close()
+ out = x.detach().cpu().numpy().squeeze()[256 + 256 + 1 :]
+ if OFFLOAD_CPU:
+ model.to("cpu")
+ assert all(0 <= out) and all(out < SEMANTIC_VOCAB_SIZE)
+ _clear_cuda_cache()
+ return out
+
+
+
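+# Illustrative sketch (added for clarity, never called): the nucleus (top-p) filtering used in the
+# sampling loops above, pulled out into a standalone helper. It assumes the same module-level
+# `np` and scipy `softmax` imports that those loops already rely on.
+def _top_p_filter_sketch(logits, top_p):
+ """Return a copy of `logits` with everything outside the top-p nucleus set to -inf."""
+ sorted_indices = np.argsort(logits)[::-1]
+ cumulative_probs = np.cumsum(softmax(logits[sorted_indices]))
+ to_remove = cumulative_probs > top_p
+ to_remove[1:] = to_remove[:-1].copy()
+ to_remove[0] = False # always keep the most likely token
+ filtered = logits.copy()
+ filtered[sorted_indices[to_remove]] = -np.inf
+ return filtered
+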
+
+def generate_coarse(
+ x_semantic,
+ history_prompt=None,
+ temp=0.7,
+ top_k=None,
+ top_p=None,
+ silent=False,
+ max_coarse_history=630, # min 60 (faster), max 630 (more context)
+ sliding_window_len=60,
+ use_kv_caching=False,
+ x_coarse_history_alignment_hack = -2,
+):
+ """Generate coarse audio codes from semantic tokens."""
+
+ logger.debug(locals())
+ assert (
+ isinstance(x_semantic, np.ndarray)
+ and len(x_semantic.shape) == 1
+ and len(x_semantic) > 0
+ and x_semantic.min() >= 0
+ and x_semantic.max() <= SEMANTIC_VOCAB_SIZE - 1
+ )
+ assert 60 <= max_coarse_history <= 630
+ assert max_coarse_history + sliding_window_len <= 1024 - 256
+ semantic_to_coarse_ratio = COARSE_RATE_HZ / SEMANTIC_RATE_HZ * N_COARSE_CODEBOOKS
+
+ max_semantic_history = int(np.floor(max_coarse_history / semantic_to_coarse_ratio))
+ if history_prompt is not None:
+ history_prompt = _load_history_prompt(history_prompt)
+ x_semantic_history = history_prompt["semantic_prompt"]
+ x_coarse_history = history_prompt["coarse_prompt"]
+
+ print(f"Pre Trim lengths of semantic and coarse history: {x_semantic_history.shape} {x_coarse_history.shape}")
+ assert (
+ isinstance(x_semantic_history, np.ndarray)
+ and len(x_semantic_history.shape) == 1
+ and len(x_semantic_history) > 0
+ and x_semantic_history.min() >= 0
+ and x_semantic_history.max() <= SEMANTIC_VOCAB_SIZE - 1
+ and isinstance(x_coarse_history, np.ndarray)
+ and len(x_coarse_history.shape) == 2
+ and x_coarse_history.shape[0] == N_COARSE_CODEBOOKS
+ and x_coarse_history.shape[-1] >= 0
+ and x_coarse_history.min() >= 0
+ and x_coarse_history.max() <= CODEBOOK_SIZE - 1
+ and (
+ round(x_coarse_history.shape[-1] / len(x_semantic_history), 1)
+ == round(semantic_to_coarse_ratio / N_COARSE_CODEBOOKS, 1)
+ )
+ )
+ x_coarse_history = _flatten_codebooks(x_coarse_history) + SEMANTIC_VOCAB_SIZE
+ # trim histories correctly
+ n_semantic_hist_provided = np.min(
+ [
+ max_semantic_history,
+ len(x_semantic_history) - len(x_semantic_history) % 2,
+ int(np.floor(len(x_coarse_history) / semantic_to_coarse_ratio)),
+ ]
+ )
+ n_coarse_hist_provided = int(round(n_semantic_hist_provided * semantic_to_coarse_ratio))
+ x_semantic_history = x_semantic_history[-n_semantic_hist_provided:].astype(np.int32)
+ x_coarse_history = x_coarse_history[-n_coarse_hist_provided:].astype(np.int32)
+ # TODO: bit of a hack for time alignment (sounds better)
+ #x_coarse_history = x_coarse_history[:-2]
+ x_coarse_history = x_coarse_history[:x_coarse_history_alignment_hack]
+
+ else:
+ x_semantic_history = np.array([], dtype=np.int32)
+ x_coarse_history = np.array([], dtype=np.int32)
+
+
+ #print(f"actual lengths we're using, x_semantic_history: {len(x_semantic_history)} x_coarse_history: {len(x_coarse_history)}")
+
+ # load models if not yet exist
+ global models
+ global models_devices
+ if "coarse" not in models:
+ preload_models()
+ model = models["coarse"]
+ if OFFLOAD_CPU:
+ model.to(models_devices["coarse"])
+ device = next(model.parameters()).device
+ # start loop
+ n_steps = int(
+ round(
+ np.floor(len(x_semantic) * semantic_to_coarse_ratio / N_COARSE_CODEBOOKS)
+ * N_COARSE_CODEBOOKS
+ )
+ )
+ assert n_steps > 0 and n_steps % N_COARSE_CODEBOOKS == 0
+
+ # reminder: try filling up some of the window before the COARSE_INFER_TOKEN with history to get better short clips
+ x_semantic = np.hstack([x_semantic_history, x_semantic]).astype(np.int32)
+ x_coarse = x_coarse_history.astype(np.int32)
+ base_semantic_idx = len(x_semantic_history)
+ with _inference_mode():
+ x_semantic_in = torch.from_numpy(x_semantic)[None].to(device)
+ x_coarse_in = torch.from_numpy(x_coarse)[None].to(device)
+ n_window_steps = int(np.ceil(n_steps / sliding_window_len))
+ n_step = 0
+ for _ in tqdm.tqdm(range(n_window_steps), total=n_window_steps, disable=silent):
+ semantic_idx = base_semantic_idx + int(round(n_step / semantic_to_coarse_ratio))
+ # pad from right side
+ x_in = x_semantic_in[:, np.max([0, semantic_idx - max_semantic_history]) :]
+ x_in = x_in[:, :256]
+ x_in = F.pad(
+ x_in,
+ (0, 256 - x_in.shape[-1]),
+ "constant",
+ COARSE_SEMANTIC_PAD_TOKEN,
+ )
+ x_in = torch.hstack(
+ [
+ x_in,
+ torch.tensor([COARSE_INFER_TOKEN])[None].to(device),
+ x_coarse_in[:, -max_coarse_history:],
+ ]
+ )
+ kv_cache = None
+ for _ in range(sliding_window_len):
+ if n_step >= n_steps:
+ continue
+ is_major_step = n_step % N_COARSE_CODEBOOKS == 0
+
+ if use_kv_caching and kv_cache is not None:
+ x_input = x_in[:, [-1]]
+ else:
+ x_input = x_in
+
+ logits, kv_cache = model(x_input, use_cache=use_kv_caching, past_kv=kv_cache)
+ logit_start_idx = (
+ SEMANTIC_VOCAB_SIZE + (1 - int(is_major_step)) * CODEBOOK_SIZE
+ )
+ logit_end_idx = (
+ SEMANTIC_VOCAB_SIZE + (2 - int(is_major_step)) * CODEBOOK_SIZE
+ )
+ relevant_logits = logits[0, 0, logit_start_idx:logit_end_idx]
+ if top_p is not None:
+ # faster to convert to numpy
+ logits_device = relevant_logits.device
+ logits_dtype = relevant_logits.type()
+ relevant_logits = relevant_logits.detach().cpu().type(torch.float32).numpy()
+ sorted_indices = np.argsort(relevant_logits)[::-1]
+ sorted_logits = relevant_logits[sorted_indices]
+ cumulative_probs = np.cumsum(softmax(sorted_logits))
+ sorted_indices_to_remove = cumulative_probs > top_p
+ sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].copy()
+ sorted_indices_to_remove[0] = False
+ relevant_logits[sorted_indices[sorted_indices_to_remove]] = -np.inf
+ relevant_logits = torch.from_numpy(relevant_logits)
+ relevant_logits = relevant_logits.to(logits_device).type(logits_dtype)
+ if top_k is not None:
+ v, _ = torch.topk(relevant_logits, min(top_k, relevant_logits.size(-1)))
+ relevant_logits[relevant_logits < v[-1]] = -float("Inf")
+ probs = F.softmax(relevant_logits / temp, dim=-1)
+ # multinomial bugged on mps: shuttle to cpu if necessary
+ inf_device = probs.device
+ if probs.device.type == "mps":
+ probs = probs.to("cpu")
+ item_next = torch.multinomial(probs, num_samples=1)
+ probs = probs.to(inf_device)
+ item_next = item_next.to(inf_device)
+ item_next += logit_start_idx
+ x_coarse_in = torch.cat((x_coarse_in, item_next[None]), dim=1)
+ x_in = torch.cat((x_in, item_next[None]), dim=1)
+ del logits, relevant_logits, probs, item_next
+ n_step += 1
+ del x_in
+ del x_semantic_in
+ if OFFLOAD_CPU:
+ model.to("cpu")
+ gen_coarse_arr = x_coarse_in.detach().cpu().numpy().squeeze()[len(x_coarse_history) :]
+ del x_coarse_in
+ assert len(gen_coarse_arr) == n_steps
+ gen_coarse_audio_arr = gen_coarse_arr.reshape(-1, N_COARSE_CODEBOOKS).T - SEMANTIC_VOCAB_SIZE
+ for n in range(1, N_COARSE_CODEBOOKS):
+ gen_coarse_audio_arr[n, :] -= n * CODEBOOK_SIZE
+ _clear_cuda_cache()
+ return gen_coarse_audio_arr
+
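+# Worked example for the ratio above (assuming the stock Bark constants SEMANTIC_RATE_HZ = 49.9,
+# COARSE_RATE_HZ = 75, N_COARSE_CODEBOOKS = 2): semantic_to_coarse_ratio is roughly 3.01, so 100
+# semantic tokens become int(round(floor(100 * 3.01 / 2) * 2)) = 300 coarse steps (150 frames per
+# codebook), and max_coarse_history=630 corresponds to about floor(630 / 3.01) = 209 semantic tokens of history.
+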
+
+def generate_fine(
+ x_coarse_gen,
+ history_prompt=None,
+ temp=0.5,
+ silent=True,
+):
+ """Generate full audio codes from coarse audio codes."""
+
+ logger.debug(locals())
+ assert (
+ isinstance(x_coarse_gen, np.ndarray)
+ and len(x_coarse_gen.shape) == 2
+ and 1 <= x_coarse_gen.shape[0] <= N_FINE_CODEBOOKS - 1
+ and x_coarse_gen.shape[1] > 0
+ and x_coarse_gen.min() >= 0
+ and x_coarse_gen.max() <= CODEBOOK_SIZE - 1
+ )
+ if history_prompt is not None:
+ history_prompt = _load_history_prompt(history_prompt)
+ x_fine_history = history_prompt["fine_prompt"]
+ assert (
+ isinstance(x_fine_history, np.ndarray)
+ and len(x_fine_history.shape) == 2
+ and x_fine_history.shape[0] == N_FINE_CODEBOOKS
+ and x_fine_history.shape[1] >= 0
+ and x_fine_history.min() >= 0
+ and x_fine_history.max() <= CODEBOOK_SIZE - 1
+ )
+ else:
+ x_fine_history = None
+ n_coarse = x_coarse_gen.shape[0]
+ # load models if not yet exist
+ global models
+ global models_devices
+ if "fine" not in models:
+ preload_models()
+ model = models["fine"]
+ if OFFLOAD_CPU:
+ model.to(models_devices["fine"])
+ device = next(model.parameters()).device
+ # make input arr
+ in_arr = np.vstack(
+ [
+ x_coarse_gen,
+ np.zeros((N_FINE_CODEBOOKS - n_coarse, x_coarse_gen.shape[1]))
+ + CODEBOOK_SIZE, # padding
+ ]
+ ).astype(np.int32)
+ # prepend history if available (max 512)
+ if x_fine_history is not None:
+ x_fine_history = x_fine_history.astype(np.int32)
+ in_arr = np.hstack(
+ [
+ x_fine_history[:, -512:].astype(np.int32),
+ in_arr,
+ ]
+ )
+ n_history = x_fine_history[:, -512:].shape[1]
+ else:
+ n_history = 0
+ n_remove_from_end = 0
+ # need to pad if too short (since non-causal model)
+ if in_arr.shape[1] < 1024:
+ n_remove_from_end = 1024 - in_arr.shape[1]
+ in_arr = np.hstack(
+ [
+ in_arr,
+ np.zeros((N_FINE_CODEBOOKS, n_remove_from_end), dtype=np.int32) + CODEBOOK_SIZE,
+ ]
+ )
+ # we can be lazy about fractional loop and just keep overwriting codebooks
+ n_loops = np.max([0, int(np.ceil((x_coarse_gen.shape[1] - (1024 - n_history)) / 512))]) + 1
+ with _inference_mode():
+ in_arr = torch.tensor(in_arr.T).to(device)
+ for n in tqdm.tqdm(range(n_loops), disable=silent):
+ start_idx = np.min([n * 512, in_arr.shape[0] - 1024])
+ start_fill_idx = np.min([n_history + n * 512, in_arr.shape[0] - 512])
+ rel_start_fill_idx = start_fill_idx - start_idx
+ in_buffer = in_arr[start_idx : start_idx + 1024, :][None]
+ for nn in range(n_coarse, N_FINE_CODEBOOKS):
+ logits = model(nn, in_buffer)
+ if temp is None:
+ relevant_logits = logits[0, rel_start_fill_idx:, :CODEBOOK_SIZE]
+ codebook_preds = torch.argmax(relevant_logits, -1)
+ else:
+ relevant_logits = logits[0, :, :CODEBOOK_SIZE] / temp
+ probs = F.softmax(relevant_logits, dim=-1)
+ # multinomial bugged on mps: shuttle to cpu if necessary
+ inf_device = probs.device
+ if probs.device.type == "mps":
+ probs = probs.to("cpu")
+ codebook_preds = torch.hstack(
+ [
+ torch.multinomial(probs[nnn], num_samples=1).to(inf_device)
+ for nnn in range(rel_start_fill_idx, 1024)
+ ]
+ )
+ in_buffer[0, rel_start_fill_idx:, nn] = codebook_preds
+ del logits, codebook_preds
+ # transfer over info into model_in and convert to numpy
+ for nn in range(n_coarse, N_FINE_CODEBOOKS):
+ in_arr[
+ start_fill_idx : start_fill_idx + (1024 - rel_start_fill_idx), nn
+ ] = in_buffer[0, rel_start_fill_idx:, nn]
+ del in_buffer
+ gen_fine_arr = in_arr.detach().cpu().numpy().squeeze().T
+ del in_arr
+ if OFFLOAD_CPU:
+ model.to("cpu")
+ gen_fine_arr = gen_fine_arr[:, n_history:]
+ if n_remove_from_end > 0:
+ gen_fine_arr = gen_fine_arr[:, :-n_remove_from_end]
+ assert gen_fine_arr.shape[-1] == x_coarse_gen.shape[-1]
+ _clear_cuda_cache()
+ return gen_fine_arr
+
+
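+# Worked example for the windowing above: with n_history = 512 fine frames of history and
+# x_coarse_gen.shape[1] = 1000 generated frames, n_loops = max(0, ceil((1000 - (1024 - 512)) / 512)) + 1 = 2,
+# so the 1024-frame buffer is filled twice, advancing 512 frames per pass and overwriting
+# codebooks n_coarse..N_FINE_CODEBOOKS-1 from rel_start_fill_idx onward each time.
+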
+
+def _flatten_codebooks(arr, offset_size=CODEBOOK_SIZE):
+ assert len(arr.shape) == 2
+ arr = arr.copy()
+ if offset_size is not None:
+ for n in range(1, arr.shape[0]):
+ arr[n, :] += offset_size * n
+ flat_arr = arr.ravel("F")
+ return flat_arr
+
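+# Example: with offset_size=CODEBOOK_SIZE (1024), a 2-codebook array [[1, 2, 3], [4, 5, 6]]
+# flattens to [1, 4 + 1024, 2, 5 + 1024, 3, 6 + 1024] -- column-major interleaving, which is
+# the flat layout generate_coarse expects for its coarse history.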
+
+COARSE_SEMANTIC_PAD_TOKEN = 12_048
+COARSE_INFER_TOKEN = 12_050
+
+
+
+
+def codec_decode(fine_tokens):
+ """Turn quantized audio codes into audio array using encodec."""
+ # load models if not yet exist
+ global models
+ global models_devices
+ if "codec" not in models:
+ preload_models()
+ model = models["codec"]
+ if OFFLOAD_CPU:
+ model.to(models_devices["codec"])
+ device = next(model.parameters()).device
+ arr = torch.from_numpy(fine_tokens)[None]
+ arr = arr.to(device)
+ arr = arr.transpose(0, 1)
+ emb = model.quantizer.decode(arr)
+ out = model.decoder(emb)
+ audio_arr = out.detach().cpu().numpy().squeeze()
+ del arr, emb, out
+ if OFFLOAD_CPU:
+ model.to("cpu")
+ return audio_arr
+
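+# End-to-end usage sketch (illustrative; `semantic_tokens` is assumed to come from the
+# text-to-semantic stage defined earlier in this module, and `voice_name` is a hypothetical
+# history prompt name or .npz path):
+# coarse_tokens = generate_coarse(semantic_tokens, history_prompt=voice_name, use_kv_caching=True)
+# fine_tokens = generate_fine(coarse_tokens, history_prompt=voice_name)
+# audio_array = codec_decode(fine_tokens) # mono float waveform at Bark's 24 kHz sample rate
+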
+
+## Added:
+
+# Just overriding this because somehow I keep loading the wrong models?
+def load_model(use_gpu=True, use_small=False, force_reload=False, model_type="text"):
+
+ logger.debug(locals())
+
+ _load_model_f = funcy.partial(_load_model, model_type=model_type, use_small=use_small)
+ if model_type not in ("text", "coarse", "fine"):
+ raise NotImplementedError()
+ global models
+ global models_devices
+ device = _grab_best_device(use_gpu=use_gpu)
+ model_key = f"{model_type}"
+ if OFFLOAD_CPU:
+ models_devices[model_key] = device
+ device = "cpu"
+ if model_key not in models or force_reload:
+ ckpt_path = _get_ckpt_path(model_type, use_small=use_small)
+ clean_models(model_key=model_key)
+ model = _load_model_f(ckpt_path, device)
+ models[model_key] = model
+ if model_type == "text":
+ models[model_key]["model"].to(device)
+ else:
+ models[model_key].to(device)
+ logger.debug(f"Loaded {model_key} onto {device}.")
+ return models[model_key]
+
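+# Usage note (sketch): for model_type="text" the returned value is a dict holding both "model"
+# and "tokenizer"; "coarse" and "fine" return the bare module. With OFFLOAD_CPU set, weights stay
+# on the CPU here and are only moved to models_devices[model_type] inside the generate_* calls.
+# text_pack = load_model(model_type="text", use_small=True) # hypothetical call
+# tokenizer = text_pack["tokenizer"]
+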
+
+def _load_model(ckpt_path, device, use_small=False, model_type="text"):
+ if model_type == "text":
+ ConfigClass = GPTConfig
+ ModelClass = GPT
+ elif model_type == "coarse":
+ ConfigClass = GPTConfig
+ ModelClass = GPT
+ elif model_type == "fine":
+ ConfigClass = FineGPTConfig
+ ModelClass = FineGPT
+ else:
+ raise NotImplementedError()
+ model_key = f"{model_type}_small" if use_small or USE_SMALL_MODELS else model_type
+ model_info = REMOTE_MODEL_PATHS[model_key]
+ if not os.path.exists(ckpt_path):
+ logger.info(f"{model_type} model not found, downloading into `{CACHE_DIR}`.")
+
+ ## added: skip the logger and just print, so you can rest easy always knowing which model is loaded
+ remote_filename = hf_hub_url(model_info["repo_id"], model_info["file_name"])
+ print(f"Downloading {model_key} {model_info['repo_id']} remote model file {remote_filename} {model_info['file_name']} to {CACHE_DIR}") # added
+ _download(model_info["repo_id"], model_info["file_name"])
+ ## added
+ print(f"Loading {model_key} model from {ckpt_path} to {device}") # added
+ checkpoint = torch.load(ckpt_path, map_location=device)
+
+ # this is a hack
+ model_args = checkpoint["model_args"]
+ if "input_vocab_size" not in model_args:
+ model_args["input_vocab_size"] = model_args["vocab_size"]
+ model_args["output_vocab_size"] = model_args["vocab_size"]
+ del model_args["vocab_size"]
+ gptconf = ConfigClass(**checkpoint["model_args"])
+ model = ModelClass(gptconf)
+ state_dict = checkpoint["model"]
+ # fixup checkpoint
+ unwanted_prefix = "_orig_mod."
+ for k, v in list(state_dict.items()):
+ if k.startswith(unwanted_prefix):
+ state_dict[k[len(unwanted_prefix) :]] = state_dict.pop(k)
+ extra_keys = set(state_dict.keys()) - set(model.state_dict().keys())
+ extra_keys = set([k for k in extra_keys if not k.endswith(".attn.bias")])
+ missing_keys = set(model.state_dict().keys()) - set(state_dict.keys())
+ missing_keys = set([k for k in missing_keys if not k.endswith(".attn.bias")])
+ if len(extra_keys) != 0:
+ raise ValueError(f"extra keys found: {extra_keys}")
+ if len(missing_keys) != 0:
+ raise ValueError(f"missing keys: {missing_keys}")
+ model.load_state_dict(state_dict, strict=False)
+ n_params = model.get_num_params()
+ val_loss = checkpoint["best_val_loss"].item()
+ logger.info(f"model loaded: {round(n_params/1e6,1)}M params, {round(val_loss,3)} loss")
+ model.eval()
+ model.to(device)
+ del checkpoint, state_dict
+ _clear_cuda_cache()
+ if model_type == "text":
+ tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
+ return {
+ "model": model,
+ "tokenizer": tokenizer,
+ }
+ return model
+
+
+def preload_models(
+ text_use_gpu=True,
+ text_use_small=False,
+ coarse_use_gpu=True,
+ coarse_use_small=False,
+ fine_use_gpu=True,
+ fine_use_small=False,
+ codec_use_gpu=True,
+ force_reload=False,
+):
+ """Load all the necessary models for the pipeline."""
+
+ # Log the global settings so it's clear what is going on here
+ logger.debug(f"USE_SMALL_MODELS = {USE_SMALL_MODELS} GLOBAL_ENABLE_MPS = {GLOBAL_ENABLE_MPS}, OFFLOAD_CPU = {OFFLOAD_CPU}")
+ logger.debug(f"text_use_gpu = {text_use_gpu}, text_use_small = {text_use_small}, coarse_use_gpu = {coarse_use_gpu}, coarse_use_small = {coarse_use_small}, fine_use_gpu = {fine_use_gpu}, fine_use_small = {fine_use_small}, codec_use_gpu = {codec_use_gpu}, force_reload = {force_reload}")
+
+ # Is this actually bugged in Bark main, not my fault? This is checked further down the stack, but the ckpt_path is not updated everywhere.
+ # So we should also set it here; otherwise, when not preloading, it tries to load a model which may not exist yet.
+
+ if USE_SMALL_MODELS:
+ text_use_small = True
+ coarse_use_small = True
+ fine_use_small = True
+
+ if _grab_best_device() == "cpu" and (
+ text_use_gpu or coarse_use_gpu or fine_use_gpu or codec_use_gpu
+ ):
+ logger.warning("No GPU being used. Careful, inference might be very slow!")
+ _ = load_model(
+ model_type="text", use_gpu=text_use_gpu, use_small=text_use_small, force_reload=force_reload
+ )
+ _ = load_model(
+ model_type="coarse",
+ use_gpu=coarse_use_gpu,
+ use_small=coarse_use_small,
+ force_reload=force_reload,
+ )
+ _ = load_model(
+ model_type="fine", use_gpu=fine_use_gpu, use_small=fine_use_small, force_reload=force_reload
+ )
+ _ = load_codec_model(use_gpu=codec_use_gpu, force_reload=force_reload)
+
+
+
diff --git a/bark_infinity/hubert/__init__.py b/bark_infinity/hubert/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/bark_infinity/hubert/__pycache__/__init__.cpython-310.pyc b/bark_infinity/hubert/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..56ee2654f802a22f17002aa699c4b33a075808f2
Binary files /dev/null and b/bark_infinity/hubert/__pycache__/__init__.cpython-310.pyc differ
diff --git a/bark_infinity/hubert/__pycache__/__init__.cpython-38.pyc b/bark_infinity/hubert/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ab8c982e485eaf6ea0b6095229e667506174405b
Binary files /dev/null and b/bark_infinity/hubert/__pycache__/__init__.cpython-38.pyc differ
diff --git a/bark_infinity/hubert/__pycache__/customtokenizer.cpython-310.pyc b/bark_infinity/hubert/__pycache__/customtokenizer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a4ea7c696fb0d34fcbdc203e4d930d6486672aed
Binary files /dev/null and b/bark_infinity/hubert/__pycache__/customtokenizer.cpython-310.pyc differ
diff --git a/bark_infinity/hubert/__pycache__/customtokenizer.cpython-38.pyc b/bark_infinity/hubert/__pycache__/customtokenizer.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6d21c4c8af33ebbe42edd0f996586ef59572342f
Binary files /dev/null and b/bark_infinity/hubert/__pycache__/customtokenizer.cpython-38.pyc differ
diff --git a/bark_infinity/hubert/__pycache__/hubert_manager.cpython-310.pyc b/bark_infinity/hubert/__pycache__/hubert_manager.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c5044d3f78286598d9db91135fc20cea0647c2c3
Binary files /dev/null and b/bark_infinity/hubert/__pycache__/hubert_manager.cpython-310.pyc differ
diff --git a/bark_infinity/hubert/__pycache__/pre_kmeans_hubert.cpython-310.pyc b/bark_infinity/hubert/__pycache__/pre_kmeans_hubert.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ff91733ef318708d374b58df882dc5ef75540be3
Binary files /dev/null and b/bark_infinity/hubert/__pycache__/pre_kmeans_hubert.cpython-310.pyc differ
diff --git a/bark_infinity/hubert/customtokenizer.py b/bark_infinity/hubert/customtokenizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a6fcf6d8753a42e223c2baa860fe12e1979bc0a
--- /dev/null
+++ b/bark_infinity/hubert/customtokenizer.py
@@ -0,0 +1,200 @@
+"""
+Custom tokenizer model.
+Author: https://www.github.com/gitmylo/
+License: MIT
+"""
+
+import json
+import os.path
+from zipfile import ZipFile
+
+import numpy
+import torch
+from torch import nn, optim
+from torch.serialization import MAP_LOCATION
+
+
+class CustomTokenizer(nn.Module):
+ def __init__(self, hidden_size=1024, input_size=768, output_size=10000, version=0):
+ super(CustomTokenizer, self).__init__()
+ next_size = input_size
+ if version == 0:
+ self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True)
+ next_size = hidden_size
+ if version == 1:
+ self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True)
+ self.intermediate = nn.Linear(hidden_size, 4096)
+ next_size = 4096
+
+ self.fc = nn.Linear(next_size, output_size)
+ self.softmax = nn.LogSoftmax(dim=1)
+ self.optimizer: optim.Optimizer = None
+ self.lossfunc = nn.CrossEntropyLoss()
+ self.input_size = input_size
+ self.hidden_size = hidden_size
+ self.output_size = output_size
+ self.version = version
+
+ def forward(self, x):
+ x, _ = self.lstm(x)
+ if self.version == 1:
+ x = self.intermediate(x)
+ x = self.fc(x)
+ x = self.softmax(x)
+ return x
+
+ @torch.no_grad()
+ def get_token(self, x):
+ """
+ Used to get the predicted token for each input frame.
+ :param x: An array with shape (N, input_size) where N is a whole number greater or equal to 1, and input_size is the input size used when creating the model.
+ :return: An array with shape (N,) where N is the same as N from the input. Every number in the array is a whole number in range 0...output_size - 1 where output_size is the output size used when creating the model.
+ """
+ return torch.argmax(self(x), dim=1)
+
+ def prepare_training(self):
+ self.optimizer = optim.Adam(self.parameters(), 0.001)
+
+ def train_step(self, x_train, y_train, log_loss=False):
+ # y_train = y_train[:-1]
+ # y_train = y_train[1:]
+
+ optimizer = self.optimizer
+ lossfunc = self.lossfunc
+ # Zero the gradients
+ self.zero_grad()
+
+ # Forward pass
+ y_pred = self(x_train)
+
+ y_train_len = len(y_train)
+ y_pred_len = y_pred.shape[0]
+
+ if y_train_len > y_pred_len:
+ diff = y_train_len - y_pred_len
+ y_train = y_train[diff:]
+ elif y_train_len < y_pred_len:
+ diff = y_pred_len - y_train_len
+ y_pred = y_pred[:-diff, :]
+
+ y_train_hot = torch.zeros(len(y_train), self.output_size)
+ y_train_hot[range(len(y_train)), y_train] = 1
+ y_train_hot = y_train_hot.to("cuda")
+
+ # Calculate the loss
+ loss = lossfunc(y_pred, y_train_hot)
+
+ # Print loss
+ if log_loss:
+ print("Loss", loss.item())
+
+ # Backward pass
+ loss.backward()
+
+ # Update the weights
+ optimizer.step()
+
+ def save(self, path):
+ info_path = ".".join(os.path.basename(path).split(".")[:-1]) + "/.info"
+ torch.save(self.state_dict(), path)
+ data_from_model = Data(self.input_size, self.hidden_size, self.output_size, self.version)
+ with ZipFile(path, "a") as model_zip:
+ model_zip.writestr(info_path, data_from_model.save())
+ model_zip.close()
+
+ @staticmethod
+ def load_from_checkpoint(path, map_location: MAP_LOCATION = torch.device("cpu")):
+ # print(f"Loading model from {path}...")
+ # old = True
+ old = False
+ with ZipFile(path) as model_zip:
+ filesMatch = [file for file in model_zip.namelist() if file.endswith("/.info")]
+ file = filesMatch[0] if filesMatch else None
+ if file:
+ old = False
+ data_from_model = Data.load(model_zip.read(file).decode("utf-8"))
+ model_zip.close()
+ if old:
+ model = CustomTokenizer()
+ else:
+ model = CustomTokenizer(
+ data_from_model.hidden_size,
+ data_from_model.input_size,
+ data_from_model.output_size,
+ data_from_model.version,
+ )
+ model.load_state_dict(torch.load(path, map_location=torch.device('cpu')))
+ if map_location:
+ model = model.to(map_location)
+ return model
+
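+# Usage sketch (hypothetical paths, pairing this tokenizer with CustomHubert from
+# bark_infinity.hubert.pre_kmeans_hubert to turn a reference wav into Bark semantic tokens):
+# hubert = CustomHubert(checkpoint_path="data/models/hubert/hubert.pt")
+# features = hubert.forward(wav, input_sample_hz=sr) # roughly (N, 768) float features
+# tokenizer = CustomTokenizer.load_from_checkpoint("data/models/hubert/tokenizer.pth")
+# semantic_tokens = tokenizer.get_token(features) # (N,) ints in 0..output_size-1
+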
+
+class Data:
+ input_size: int
+ hidden_size: int
+ output_size: int
+ version: int
+
+ def __init__(self, input_size=768, hidden_size=1024, output_size=10000, version=0):
+ self.input_size = input_size
+ self.hidden_size = hidden_size
+ self.output_size = output_size
+ self.version = version
+
+ @staticmethod
+ def load(string):
+ data = json.loads(string)
+ return Data(data["input_size"], data["hidden_size"], data["output_size"], data["version"])
+
+ def save(self):
+ data = {
+ "input_size": self.input_size,
+ "hidden_size": self.hidden_size,
+ "output_size": self.output_size,
+ "version": self.version,
+ }
+ return json.dumps(data)
+
+
+def auto_train(data_path, save_path="model.pth", load_model: str | None = None, save_epochs=1):
+ data_x, data_y = [], []
+
+ if load_model and os.path.isfile(load_model):
+ # print('Loading model from', load_model)
+ model_training = CustomTokenizer.load_from_checkpoint(load_model, "cuda")
+ else:
+ # print('Creating new model.')
+ model_training = CustomTokenizer(version=1).to(
+ "cuda"
+ ) # version=1 settings: LSTM followed by an intermediate 4096-dim linear layer
+ save_path = os.path.join(data_path, save_path)
+ base_save_path = ".".join(save_path.split(".")[:-1])
+
+ sem_string = "_semantic.npy"
+ feat_string = "_semantic_features.npy"
+
+ ready = os.path.join(data_path, "ready")
+ for input_file in os.listdir(ready):
+ full_path = os.path.join(ready, input_file)
+ if input_file.endswith(sem_string):
+ data_y.append(numpy.load(full_path))
+ elif input_file.endswith(feat_string):
+ data_x.append(numpy.load(full_path))
+ model_training.prepare_training()
+
+ epoch = 1
+
+ while True:
+ for i in range(save_epochs):
+ j = 0
+ for x, y in zip(data_x, data_y):
+ model_training.train_step(
+ torch.tensor(x).to("cuda"), torch.tensor(y).to("cuda"), j % 50 == 0
+ ) # Print loss every 50 steps
+ j += 1
+ save_p = save_path
+ save_p_2 = f"{base_save_path}_epoch_{epoch}.pth"
+ model_training.save(save_p)
+ model_training.save(save_p_2)
+ print(f"Epoch {epoch} completed")
+ epoch += 1
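+
+
+# Expected layout for auto_train, inferred from the loop above (file names are examples only):
+# data_path/ready/ holds paired clips such as 0_semantic_features.npy (HuBERT features, the x side)
+# and 0_semantic.npy (Bark semantic tokens, the y side); checkpoints land inside data_path as
+# model.pth plus a model_epoch_<n>.pth copy at the end of every save_epochs cycle.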
diff --git a/bark_infinity/hubert/hubert_manager.py b/bark_infinity/hubert/hubert_manager.py
new file mode 100644
index 0000000000000000000000000000000000000000..cba8e98514286d276c16d2f3e248c8588bfbffe1
--- /dev/null
+++ b/bark_infinity/hubert/hubert_manager.py
@@ -0,0 +1,35 @@
+import os.path
+import shutil
+import urllib.request
+
+import huggingface_hub
+
+
+class HuBERTManager:
+ @staticmethod
+ def make_sure_hubert_installed(download_url: str = 'https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt', file_name: str = 'hubert.pt'):
+ install_dir = os.path.join('data', 'models', 'hubert')
+ if not os.path.isdir(install_dir):
+ os.makedirs(install_dir, exist_ok=True)
+ install_file = os.path.join(install_dir, file_name)
+ if not os.path.isfile(install_file):
+ print(f'Downloading HuBERT model {download_url} to {install_file}')
+ urllib.request.urlretrieve(download_url, install_file)
+ print('Downloaded HuBERT')
+ return install_file
+
+
+ @staticmethod
+ #def make_sure_tokenizer_installed(model: str = 'quantifier_hubert_base_ls960_14.pth', repo: str = 'GitMylo/bark-voice-cloning', local_file: str = 'tokenizer.pth'):
+ def make_sure_tokenizer_installed(model: str = 'quantifier_V1_hubert_base_ls960_23.pth', repo: str = 'GitMylo/bark-voice-cloning', local_file: str = 'tokenizer.pth'):
+ # print(f"Downloading tokenizer {model} from {repo} to {local_file}")
+ install_dir = os.path.join('data', 'models', 'hubert')
+ if not os.path.isdir(install_dir):
+ os.makedirs(install_dir, exist_ok=True)
+ install_file = os.path.join(install_dir, local_file)
+ if not os.path.isfile(install_file):
+ print('Downloading HuBERT custom tokenizer')
+ huggingface_hub.hf_hub_download(repo, model, local_dir=install_dir, local_dir_use_symlinks=False)
+ shutil.move(os.path.join(install_dir, model), install_file)
+ print('Downloaded tokenizer')
+ return install_file
\ No newline at end of file
diff --git a/bark_infinity/hubert/pre_kmeans_hubert.py b/bark_infinity/hubert/pre_kmeans_hubert.py
new file mode 100644
index 0000000000000000000000000000000000000000..56c5783115c7b9e05187f369495e75c00d5e4454
--- /dev/null
+++ b/bark_infinity/hubert/pre_kmeans_hubert.py
@@ -0,0 +1,107 @@
+"""
+Modified HuBERT model without kmeans.
+Original author: https://github.com/lucidrains/
+Modified by: https://www.github.com/gitmylo/
+License: MIT
+"""
+
+# Modified code from https://github.com/lucidrains/audiolm-pytorch/blob/main/audiolm_pytorch/hubert_kmeans.py
+
+from pathlib import Path
+
+import torch
+from torch import nn
+from einops import pack, unpack
+
+import fairseq
+
+from torchaudio.functional import resample
+
+from audiolm_pytorch.utils import curtail_to_multiple
+
+import logging
+logging.root.setLevel(logging.ERROR)
+
+
+def exists(val):
+ return val is not None
+
+
+def default(val, d):
+ return val if exists(val) else d
+
+
+class CustomHubert(nn.Module):
+ """
+ checkpoint and kmeans can be downloaded at https://github.com/facebookresearch/fairseq/tree/main/examples/hubert
+ or you can train your own
+ """
+
+ def __init__(
+ self,
+ checkpoint_path,
+ target_sample_hz=16000,
+ seq_len_multiple_of=None,
+ output_layer=9,
+ device=None
+ ):
+ super().__init__()
+ self.target_sample_hz = target_sample_hz
+ self.seq_len_multiple_of = seq_len_multiple_of
+ self.output_layer = output_layer
+
+ if device is not None:
+ self.to(device)
+
+ model_path = Path(checkpoint_path)
+
+ assert model_path.exists(), f'path {checkpoint_path} does not exist'
+
+ checkpoint = torch.load(checkpoint_path)
+ load_model_input = {checkpoint_path: checkpoint}
+ print(f"checkpoint_path: {checkpoint_path}")
+ model, *_ = fairseq.checkpoint_utils.load_model_ensemble_and_task(load_model_input)
+
+ if device is not None:
+ model[0].to(device)
+
+ self.model = model[0]
+ self.model.eval()
+
+ @property
+ def groups(self):
+ return 1
+
+ @torch.no_grad()
+ def forward(
+ self,
+ wav_input,
+ flatten=True,
+ input_sample_hz=None
+ ):
+ device = wav_input.device
+
+ if exists(input_sample_hz):
+ wav_input = resample(wav_input, input_sample_hz, self.target_sample_hz)
+
+ if exists(self.seq_len_multiple_of):
+ wav_input = curtail_to_multiple(wav_input, self.seq_len_multiple_of)
+
+ embed = self.model(
+ wav_input,
+ features_only=True,
+ mask=False, # thanks to @maitycyrus for noticing that mask is defaulted to True in the fairseq code
+ output_layer=self.output_layer
+ )
+
+ embed, packed_shape = pack([embed['x']], '* d')
+
+ # codebook_indices = self.kmeans.predict(embed.cpu().detach().numpy())
+
+ codebook_indices = torch.from_numpy(embed.cpu().detach().numpy()).to(device) # .long()
+
+ if flatten:
+ return codebook_indices
+
+ codebook_indices, = unpack(codebook_indices, packed_shape, '*')
+ return codebook_indices
\ No newline at end of file
diff --git a/bark_infinity/model.py b/bark_infinity/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..dfe5be899baaf2dc0022baa940abed0637c372cd
--- /dev/null
+++ b/bark_infinity/model.py
@@ -0,0 +1,220 @@
+"""
+Much of this code is adapted from Andrej Karpathy's NanoGPT
+(https://github.com/karpathy/nanoGPT)
+"""
+import math
+from dataclasses import dataclass
+
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+
+class LayerNorm(nn.Module):
+ """ LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False """
+
+ def __init__(self, ndim, bias):
+ super().__init__()
+ self.weight = nn.Parameter(torch.ones(ndim))
+ self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None
+
+ def forward(self, input):
+ return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)
+
+class CausalSelfAttention(nn.Module):
+
+ def __init__(self, config):
+ super().__init__()
+ assert config.n_embd % config.n_head == 0
+ # key, query, value projections for all heads, but in a batch
+ self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
+ # output projection
+ self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
+ # regularization
+ self.attn_dropout = nn.Dropout(config.dropout)
+ self.resid_dropout = nn.Dropout(config.dropout)
+ self.n_head = config.n_head
+ self.n_embd = config.n_embd
+ self.dropout = config.dropout
+ # flash attention makes the GPU go brrrrr, but support is only in PyTorch >= 2.0 and still a bit scary
+ self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
+ if not self.flash:
+ # print("WARNING: using slow attention. Flash Attention atm needs PyTorch nightly and dropout=0.0")
+ # causal mask to ensure that attention is only applied to the left in the input sequence
+ self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
+ .view(1, 1, config.block_size, config.block_size))
+ # else:
+ #print(f"Using Flash Attention.")
+
+ def forward(self, x, past_kv=None, use_cache=False):
+ B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
+
+ # calculate query, key, values for all heads in batch and move head forward to be the batch dim
+ q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
+ k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+ q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+ v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+
+ if past_kv is not None:
+ past_key = past_kv[0]
+ past_value = past_kv[1]
+ k = torch.cat((past_key, k), dim=-2)
+ v = torch.cat((past_value, v), dim=-2)
+
+ FULL_T = k.shape[-2]
+
+ if use_cache is True:
+ present = (k, v)
+ else:
+ present = None
+
+ # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
+ if self.flash:
+ # efficient attention using Flash Attention CUDA kernels
+ if past_kv is not None:
+ # When `past_kv` is provided, we're doing incremental decoding and `q.shape[2] == 1`: q only contains
+ # the query for the last token. scaled_dot_product_attention interprets this as the first token in the
+ # sequence, so if is_causal=True it will mask out all attention from it. This is not what we want, so
+ # to work around this we set is_causal=False.
+ is_causal = False
+ else:
+ is_causal = True
+
+ y = torch.nn.functional.scaled_dot_product_attention(q, k, v, dropout_p=self.dropout, is_causal=is_causal)
+ else:
+ # manual implementation of attention
+ att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
+ att = att.masked_fill(self.bias[:,:,FULL_T-T:FULL_T,:FULL_T] == 0, float('-inf'))
+ att = F.softmax(att, dim=-1)
+ att = self.attn_dropout(att)
+ y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
+ y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
+
+ # output projection
+ y = self.resid_dropout(self.c_proj(y))
+ return (y, present)
+
+class MLP(nn.Module):
+
+ def __init__(self, config):
+ super().__init__()
+ self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
+ self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
+ self.dropout = nn.Dropout(config.dropout)
+ self.gelu = nn.GELU()
+
+ def forward(self, x):
+ x = self.c_fc(x)
+ x = self.gelu(x)
+ x = self.c_proj(x)
+ x = self.dropout(x)
+ return x
+
+class Block(nn.Module):
+
+ def __init__(self, config, layer_idx):
+ super().__init__()
+ self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
+ self.attn = CausalSelfAttention(config)
+ self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
+ self.mlp = MLP(config)
+ self.layer_idx = layer_idx
+
+ def forward(self, x, past_kv=None, use_cache=False):
+ attn_output, prev_kvs = self.attn(self.ln_1(x), past_kv=past_kv, use_cache=use_cache)
+ x = x + attn_output
+ x = x + self.mlp(self.ln_2(x))
+ return (x, prev_kvs)
+
+@dataclass
+class GPTConfig:
+ block_size: int = 1024
+ input_vocab_size: int = 10_048
+ output_vocab_size: int = 10_048
+ n_layer: int = 12
+ n_head: int = 12
+ n_embd: int = 768
+ dropout: float = 0.0
+ bias: bool = True # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
+
+class GPT(nn.Module):
+
+ def __init__(self, config):
+ super().__init__()
+ assert config.input_vocab_size is not None
+ assert config.output_vocab_size is not None
+ assert config.block_size is not None
+ self.config = config
+
+ self.transformer = nn.ModuleDict(dict(
+ wte = nn.Embedding(config.input_vocab_size, config.n_embd),
+ wpe = nn.Embedding(config.block_size, config.n_embd),
+ drop = nn.Dropout(config.dropout),
+ h = nn.ModuleList([Block(config, idx) for idx in range(config.n_layer)]),
+ ln_f = LayerNorm(config.n_embd, bias=config.bias),
+ ))
+ self.lm_head = nn.Linear(config.n_embd, config.output_vocab_size, bias=False)
+
+ def get_num_params(self, non_embedding=True):
+ """
+ Return the number of parameters in the model.
+ For non-embedding count (default), the position embeddings get subtracted.
+ The token embeddings would too, except due to the parameter sharing these
+ params are actually used as weights in the final layer, so we include them.
+ """
+ n_params = sum(p.numel() for p in self.parameters())
+ if non_embedding:
+ n_params -= self.transformer.wte.weight.numel()
+ n_params -= self.transformer.wpe.weight.numel()
+ return n_params
+
+ def forward(self, idx, merge_context=False, past_kv=None, position_ids=None, use_cache=False):
+ device = idx.device
+ b, t = idx.size()
+ if past_kv is not None:
+ assert t == 1
+ tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
+ else:
+ if merge_context:
+ assert(idx.shape[1] >= 256+256+1)
+ t = idx.shape[1] - 256
+ else:
+ assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
+
+ # forward the GPT model itself
+ if merge_context:
+ tok_emb = torch.cat([
+ self.transformer.wte(idx[:,:256]) + self.transformer.wte(idx[:,256:256+256]),
+ self.transformer.wte(idx[:,256+256:])
+ ], dim=1)
+ else:
+ tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
+
+ if past_kv is None:
+ past_length = 0
+ past_kv = tuple([None] * len(self.transformer.h))
+ else:
+ past_length = past_kv[0][0].size(-2)
+
+ if position_ids is None:
+ position_ids = torch.arange(past_length, t + past_length, dtype=torch.long, device=device)
+ position_ids = position_ids.unsqueeze(0) # shape (1, t)
+ assert position_ids.shape == (1, t)
+
+ pos_emb = self.transformer.wpe(position_ids) # position embeddings of shape (1, t, n_embd)
+
+ x = self.transformer.drop(tok_emb + pos_emb)
+
+ new_kv = () if use_cache else None
+
+ for i, (block, past_layer_kv) in enumerate(zip(self.transformer.h, past_kv)):
+ x, kv = block(x, past_kv=past_layer_kv, use_cache=use_cache)
+
+ if use_cache:
+ new_kv = new_kv + (kv,)
+
+ x = self.transformer.ln_f(x)
+
+ # inference-time mini-optimization: only forward the lm_head on the very last position
+ logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
+
+ return (logits, new_kv)
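+
+
+def _kv_cache_usage_sketch():
+ """Illustrative only (not called anywhere): prime the KV cache with a prompt, then decode one token at a time."""
+ config = GPTConfig(block_size=1024, n_layer=2, n_head=2, n_embd=64) # tiny config, values chosen arbitrarily
+ model = GPT(config).eval()
+ idx = torch.randint(0, config.input_vocab_size, (1, 16)) # a 16-token "prompt"
+ with torch.no_grad():
+ logits, kv = model(idx, use_cache=True) # logits: (1, 1, output_vocab_size)
+ next_token = logits[:, -1].argmax(-1, keepdim=True) # greedy pick, shape (1, 1)
+ logits, kv = model(next_token, past_kv=kv, use_cache=True) # single-token step reuses the cache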
diff --git a/bark_infinity/model_fine.py b/bark_infinity/model_fine.py
new file mode 100644
index 0000000000000000000000000000000000000000..50b8ddd612360fcfa2b15107d7b2bd2316a6092d
--- /dev/null
+++ b/bark_infinity/model_fine.py
@@ -0,0 +1,151 @@
+"""
+Much of this code is adapted from Andrej Karpathy's NanoGPT
+(https://github.com/karpathy/nanoGPT)
+"""
+from dataclasses import dataclass
+import math
+
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+
+from .model import GPT, GPTConfig, MLP
+
+
+class NonCausalSelfAttention(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ assert config.n_embd % config.n_head == 0
+ # key, query, value projections for all heads, but in a batch
+ self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
+ # output projection
+ self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
+ # regularization
+ self.attn_dropout = nn.Dropout(config.dropout)
+ self.resid_dropout = nn.Dropout(config.dropout)
+ self.n_head = config.n_head
+ self.n_embd = config.n_embd
+ self.dropout = config.dropout
+ # flash attention makes the GPU go brrrrr, but support is only in PyTorch >= 2.0
+ self.flash = (
+ # hasattr(torch.nn.functional, "scaled_dot_product_attention") and self.dropout == 0.0
+ hasattr(torch.nn.functional, "scaled_dot_product_attention")
+ )
+
+ def forward(self, x):
+ B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
+
+ # calculate query, key, values for all heads in batch and move head forward to be the batch dim
+ q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
+ k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+ q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+ v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
+
+ # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
+ if self.flash:
+ # efficient attention using Flash Attention CUDA kernels
+ y = torch.nn.functional.scaled_dot_product_attention(
+ q, k, v, attn_mask=None, dropout_p=self.dropout, is_causal=False
+ )
+ else:
+ # manual implementation of attention
+ att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
+ att = F.softmax(att, dim=-1)
+ att = self.attn_dropout(att)
+ y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
+ y = (
+ y.transpose(1, 2).contiguous().view(B, T, C)
+ ) # re-assemble all head outputs side by side
+
+ # output projection
+ y = self.resid_dropout(self.c_proj(y))
+ return y
+
+
+class FineBlock(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.ln_1 = nn.LayerNorm(config.n_embd)
+ self.attn = NonCausalSelfAttention(config)
+ self.ln_2 = nn.LayerNorm(config.n_embd)
+ self.mlp = MLP(config)
+
+ def forward(self, x):
+ x = x + self.attn(self.ln_1(x))
+ x = x + self.mlp(self.ln_2(x))
+ return x
+
+
+class FineGPT(GPT):
+ def __init__(self, config):
+ super().__init__(config)
+ del self.lm_head
+ self.config = config
+ self.n_codes_total = config.n_codes_total
+ self.transformer = nn.ModuleDict(
+ dict(
+ wtes=nn.ModuleList(
+ [
+ nn.Embedding(config.input_vocab_size, config.n_embd)
+ for _ in range(config.n_codes_total)
+ ]
+ ),
+ wpe=nn.Embedding(config.block_size, config.n_embd),
+ drop=nn.Dropout(config.dropout),
+ h=nn.ModuleList([FineBlock(config) for _ in range(config.n_layer)]),
+ ln_f=nn.LayerNorm(config.n_embd),
+ )
+ )
+ self.lm_heads = nn.ModuleList(
+ [
+ nn.Linear(config.n_embd, config.output_vocab_size, bias=False)
+ for _ in range(config.n_codes_given, self.n_codes_total)
+ ]
+ )
+ for i in range(self.n_codes_total - config.n_codes_given):
+ self.transformer.wtes[i + 1].weight = self.lm_heads[i].weight
+
+ def forward(self, pred_idx, idx):
+ device = idx.device
+ b, t, codes = idx.size()
+ assert (
+ t <= self.config.block_size
+ ), f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
+ assert pred_idx > 0, "cannot predict 0th codebook"
+ assert codes == self.n_codes_total, (b, t, codes)
+ pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) # shape (1, t)
+
+ # forward the GPT model itself
+ tok_embs = [
+ wte(idx[:, :, i]).unsqueeze(-1) for i, wte in enumerate(self.transformer.wtes)
+ ] # token embeddings of shape (b, t, n_embd)
+ tok_emb = torch.cat(tok_embs, dim=-1)
+ pos_emb = self.transformer.wpe(pos) # position embeddings of shape (1, t, n_embd)
+ x = tok_emb[:, :, :, : pred_idx + 1].sum(dim=-1)
+ x = self.transformer.drop(x + pos_emb)
+ for block in self.transformer.h:
+ x = block(x)
+ x = self.transformer.ln_f(x)
+ logits = self.lm_heads[pred_idx - self.config.n_codes_given](x)
+ return logits
+
+ def get_num_params(self, non_embedding=True):
+ """
+ Return the number of parameters in the model.
+ For non-embedding count (default), the position embeddings get subtracted.
+ The token embeddings would too, except due to the parameter sharing these
+ params are actually used as weights in the final layer, so we include them.
+ """
+ n_params = sum(p.numel() for p in self.parameters())
+ if non_embedding:
+ for wte in self.transformer.wtes:
+ n_params -= wte.weight.numel()
+ n_params -= self.transformer.wpe.weight.numel()
+ return n_params
+
+
+@dataclass
+class FineGPTConfig(GPTConfig):
+ n_codes_total: int = 8
+ n_codes_given: int = 1
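+
+
+# Call-shape note (sketch): generate_fine drives this model as logits = model(pred_idx, idx), where
+# idx has shape (b, t, n_codes_total) with all 8 codebooks present (missing ones padded upstream with
+# CODEBOOK_SIZE) and pred_idx lies in [n_codes_given, n_codes_total); the returned logits have shape
+# (b, t, output_vocab_size) for that single codebook.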
diff --git a/bark_infinity/text_processing.py b/bark_infinity/text_processing.py
new file mode 100644
index 0000000000000000000000000000000000000000..07ee3a046ad05a93c7891c27e075349fcddc484d
--- /dev/null
+++ b/bark_infinity/text_processing.py
@@ -0,0 +1,471 @@
+import os
+import re
+import random
+import textwrap
+import logging
+from collections import defaultdict
+from datetime import datetime
+from typing import Dict, List, Optional, Union
+
+from rich.pretty import pprint
+from rich.table import Table
+
+from .config import logger, console
+
+logger = logging.getLogger(__name__)
+
+
+def ordinal(n):
+ """Add ordinal suffix to a number"""
+ return str(n) + ("th" if 4<=n%100<=20 else {1:"st",2:"nd",3:"rd"}.get(n%10, "th"))
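+# Quick check of the rule above: ordinal(1) -> "1st", ordinal(3) -> "3rd", ordinal(11) -> "11th", ordinal(23) -> "23rd"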
+
+def time_of_day(hour):
+ """Define time of day based on hour"""
+ if 5 <= hour < 12:
+ return "in the morning"
+ elif 12 <= hour < 17:
+ return "in the afternoon"
+ elif 17 <= hour < 21:
+ return "in the evening"
+ else:
+ return "at night"
+
+def current_date_time_in_words():
+ now = datetime.now()
+ day_of_week = now.strftime('%A')
+ month = now.strftime('%B')
+ day = ordinal(now.day)
+ year = now.year
+ hour = now.hour
+ minute = now.minute
+
+ time_of_day_str = time_of_day(hour)
+
+ if minute == 0:
+ minute_str = ""
+ elif minute == 1:
+ minute_str = "1 minute past"
+ elif minute == 15:
+ minute_str = "quarter past"
+ elif minute == 30:
+ minute_str = "half past"
+ elif minute == 45:
+ minute_str = "quarter to "
+ hour += 1
+ elif minute < 30:
+ minute_str = str(minute) + " minutes past"
+ else:
+ minute_str = str(60 - minute) + " minutes to"
+ hour += 1
+
+ hour_str = str(hour if hour <= 12 else hour - 12)
+
+ if minute_str:
+ time_str = minute_str + " " + hour_str
+ else:
+ time_str = hour_str + " o'clock"
+
+
+ time_string = f"{day_of_week}, {month} {day}, {year}, {time_str} {time_of_day_str}."
+
+ # Prepare final output
+ return time_string
+
+
+# Let's keep compatibility for now in case people are used to this
+# Chunked generation originally from https://github.com/serp-ai/bark-with-voice-clone
+def split_general_purpose(text, split_character_goal_length=150, split_character_max_length=200):
+ # return nltk.sent_tokenize(text)
+
+ # from https://github.com/neonbjb/tortoise-tts
+ """Split text it into chunks of a desired length trying to keep sentences intact."""
+ # normalize text, remove redundant whitespace and convert non-ascii quotes to ascii
+ text = re.sub(r"\n\n+", "\n", text)
+ text = re.sub(r"\s+", " ", text)
+ text = re.sub(r"[โโ]", '"', text)
+
+ rv = []
+ in_quote = False
+ current = ""
+ split_pos = []
+ pos = -1
+ end_pos = len(text) - 1
+
+ def seek(delta):
+ nonlocal pos, in_quote, current
+ is_neg = delta < 0
+ for _ in range(abs(delta)):
+ if is_neg:
+ pos -= 1
+ current = current[:-1]
+ else:
+ pos += 1
+ current += text[pos]
+ if text[pos] == '"':
+ in_quote = not in_quote
+ return text[pos]
+
+ def peek(delta):
+ p = pos + delta
+ return text[p] if p < end_pos and p >= 0 else ""
+
+ def commit():
+ nonlocal rv, current, split_pos
+ rv.append(current)
+ current = ""
+ split_pos = []
+
+ while pos < end_pos:
+ c = seek(1)
+ # do we need to force a split?
+ if len(current) >= split_character_max_length:
+ if len(split_pos) > 0 and len(current) > (split_character_goal_length / 2):
+ # we have at least one sentence and we are over half the desired length, seek back to the last split
+ d = pos - split_pos[-1]
+ seek(-d)
+ else:
+ # should split on semicolon too
+ # no full sentences, seek back until we are not in the middle of a word and split there
+ while c not in ";!?.\n " and pos > 0 and len(current) > split_character_goal_length:
+ c = seek(-1)
+ commit()
+ # check for sentence boundaries
+ elif not in_quote and (c in ";!?\n" or (c == "." and peek(1) in "\n ")):
+ # seek forward if we have consecutive boundary markers but still within the max length
+ while (
+ pos < len(text) - 1 and len(current) < split_character_max_length and peek(1) in "!?."
+ ):
+ c = seek(1)
+ split_pos.append(pos)
+ if len(current) >= split_character_goal_length:
+ commit()
+ # treat end of quote as a boundary if its followed by a space or newline
+ elif in_quote and peek(1) == '"' and peek(2) in "\n ":
+ seek(2)
+ split_pos.append(pos)
+ rv.append(current)
+
+ # clean up, remove lines with only whitespace or punctuation
+ rv = [s.strip() for s in rv]
+ rv = [s for s in rv if len(s) > 0 and not re.match(r"^[\s\.,;:!?]*$", s)]
+
+ return rv
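+
+# Usage sketch: split_general_purpose(long_text, split_character_goal_length=150,
+# split_character_max_length=200) returns chunks of at most ~200 characters, preferring to break
+# at sentence boundaries (. ! ? ; newline) or at a closing quote, so each chunk can be sent to the
+# semantic model on its own.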
+
+def is_sentence_ending(s):
+ return s in {"!", "?", ".", ";"}
+
+def is_boundary_marker(s):
+ return s in {"!", "?", ".", "\n"}
+
+
+def split_general_purpose_hm(text, split_character_goal_length=110, split_character_max_length=160):
+ def clean_text(text):
+ text = re.sub(r"\n\n+", "\n", text)
+ text = re.sub(r"\s+", " ", text)
+ text = re.sub(r"[โโ]", '"', text)
+ return text
+
+ def _split_text(text):
+ sentences = []
+ sentence = ""
+ in_quote = False
+ for i, c in enumerate(text):
+ sentence += c
+ if c == '"':
+ in_quote = not in_quote
+ elif not in_quote and (is_sentence_ending(c) or c == "\n"):
+ if i < len(text) - 1 and text[i + 1] in '!?.':
+ continue
+ sentences.append(sentence.strip())
+ sentence = ""
+ if sentence.strip():
+ sentences.append(sentence.strip())
+ return sentences
+
+ def recombine_chunks(chunks):
+ combined_chunks = []
+ current_chunk = ""
+ for chunk in chunks:
+ if len(current_chunk) + len(chunk) + 1 <= split_character_max_length:
+ current_chunk += " " + chunk
+ else:
+ combined_chunks.append(current_chunk.strip())
+ current_chunk = chunk
+ if current_chunk.strip():
+ combined_chunks.append(current_chunk.strip())
+ return combined_chunks
+
+ cleaned_text = clean_text(text)
+ sentences = _split_text(cleaned_text)
+ wrapped_sentences = [textwrap.fill(s, width=split_character_goal_length) for s in sentences]
+ chunks = [chunk for s in wrapped_sentences for chunk in s.split('\n')]
+ combined_chunks = recombine_chunks(chunks)
+
+ return combined_chunks
+
+
+
+def split_text(text: str, split_type: Optional[str] = None, split_type_quantity = 1, split_type_string: Optional[str] = None, split_type_value_type: Optional[str] = None) -> List[str]:
+ if text == '':
+ return [text]
+
+ # the old syntax still works if you don't use this parameter, i.e.
+ # split_type line, split_type_value 4 splits into groups of 4 lines
+ if split_type_value_type == '':
+ split_type_value_type = split_type
+
+ """
+ if split_type == 'phrase':
+ # print(f"Loading spacy to split by phrase.")
+ nlp = spacy.load('en_core_web_sm')
+
+ chunks = split_by_phrase(text, nlp)
+ # print(chunks)
+ return chunks
+ """
+ if split_type == 'string' or split_type == 'regex':
+
+ if split_type_string is None:
+ logger.warning(
+ f"Splitting by {split_type} requires a string to split by. Returning original text.")
+ return [text]
+
+ split_type_to_function = {
+ 'word': split_by_words,
+ 'line': split_by_lines,
+ 'sentence': split_by_sentence,
+ 'string': split_by_string,
+ 'char' : split_by_char,
+ #'random': split_by_random,
+ # 'rhyme': split_by_rhymes,
+ # 'pos': split_by_part_of_speech,
+ 'regex': split_by_regex,
+ }
+
+
+
+ if split_type in split_type_to_function:
+ # split into groups of 1 by the desired type
+ # this is so terrible even I'm embarrassed; destroy all this code later, but I guess it does something useful atm
+ segmented_text = split_type_to_function[split_type](text, split_type = split_type, split_type_quantity=1, split_type_string=split_type_string, split_type_value_type=split_type_value_type)
+ final_segmented_text = []
+ current_segment = ''
+ split_type_quantity_found = 0
+
+ if split_type_value_type is None:
+ split_type_value_type = split_type
+
+ for seg in segmented_text: # for each line, for example, we can now split by 'words' or whatever, as a counter for when to break the group
+ current_segment += seg
+
+ #print(split_type_to_function[split_type](current_segment, split_type=split_type_value_type, split_type_quantity=1, split_type_string=split_type_string))
+ split_type_quantity_found = len(split_type_to_function[split_type_value_type](current_segment, split_type=split_type_value_type, split_type_quantity=1, split_type_string=split_type_string))
+ #print(f"I see {split_type_quantity_found} {split_type_value_type} in {current_segment}")
+ if split_type_quantity_found >= int(split_type_quantity):
+ final_segmented_text.append(current_segment)
+ split_type_quantity_found = 0
+ current_segment = ''
+
+ return final_segmented_text
+
+ logger.warning(
+ f"Splitting by {split_type} not a supported option. Returning original text.")
+ return [text]
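+
+# Usage sketch: split_text(text, split_type="line", split_type_quantity=4) yields groups of four
+# lines, while split_text(text, split_type="string", split_type_string="CHAPTER") splits on the
+# literal string and keeps the separator at the start of each chunk after the first.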
+
+def split_by_string(text: str, split_type: Optional[str] = None, split_type_quantity: Optional[int] = 1, split_type_string: Optional[str] = None, split_type_value_type: Optional[str] = None) -> List[str]:
+ if split_type_string is not None:
+ split_pattern = f"({split_type_string})"
+ split_list = re.split(split_pattern, text)
+ result = [split_list[0]]
+ for i in range(1, len(split_list), 2):
+ result.append(split_list[i] + split_list[i+1])
+ return result
+ else:
+ return text.split()
+
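+# split_by_regex cuts at each regex match, leaving the matched delimiter at the start of
+# the following chunk and stripping surrounding whitespace.
+# Illustrative example (hypothetical input):
+#   split_by_regex("a. b. c", split_type_string=r"\.")
+# returns ['a', '. b', '. c'].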
+def split_by_regex(text: str, split_type: Optional[str] = None, split_type_quantity: Optional[int] = 1, split_type_string: Optional[str] = None, split_type_value_type: Optional[str] = None) -> List[str]:
+ chunks = []
+ start = 0
+ if split_type_string is not None:
+ for match in re.finditer(split_type_string, text):
+ end = match.start()
+ chunks.append(text[start:end].strip())
+ start = end
+
+ chunks.append(text[start:].strip())
+ return chunks
+ else:
+ return text.split()
+
+def split_by_char(text: str, split_type: Optional[str] = None, split_type_quantity = 1, split_type_string: Optional[str] = None, split_type_value_type: Optional[str] = None) -> List[str]:
+ return list(text)
+
+def split_by_words(text: str, split_type: Optional[str] = None, split_type_quantity = 1, split_type_string: Optional[str] = None, split_type_value_type: Optional[str] = None) -> List[str]:
+
+ # text.split() already returns [] for empty or whitespace-only input, so no extra guard is needed
+ return [word + ' ' for word in text.split()]
+ #return [' '.join(words[i:i + split_type_quantity]) for i in range(0, len(words), split_type_quantity)]
+
+
+def split_by_lines(text: str, split_type: Optional[str] = None, split_type_quantity = 1, split_type_string: Optional[str] = None, split_type_value_type: Optional[str] = None) -> List[str]:
+ lines = [line + '\n' for line in text.split('\n') if line.strip()]
+ return lines
+ #return ['\n'.join(lines[i:i + split_type_quantity]) for i in range(0, len(lines), split_type_quantity)]
+
+def split_by_sentence(text: str, split_type: Optional[str] = None, split_type_quantity: Optional[int] = 1, split_type_string: Optional[str] = None, split_type_value_type: Optional[str] = None) -> List[str]:
+ # imported lazily so nltk is only required when splitting by sentence
+ # (nltk.sent_tokenize needs the 'punkt' tokenizer data to be available)
+ import nltk
+ text = text.replace("\n", " ").strip()
+ sentences = nltk.sent_tokenize(text)
+ return [sentence + ' ' for sentence in sentences]
+ #return [' '.join(sentences[i:i + split_type_quantity]) for i in range(0, len(sentences), split_type_quantity)]
+
+
+"""
+def split_by_sentences(text: str, n: int, language="en") -> List[str]:
+ seg = pysbd.Segmenter(language=language, clean=False)
+ sentences = seg.segment(text)
+ return [' '.join(sentences[i:i + n]) for i in range(0, len(sentences), n)]
+"""
+
+def load_text(file_path: str) -> Union[str, None]:
+ try:
+ with open(file_path, "r", encoding="utf-8") as f:
+ content = f.read()
+ logger.info(f"Successfully loaded the file: {file_path}")
+ return content
+ except FileNotFoundError:
+ logger.error(f"File not found: {file_path}")
+ except PermissionError:
+ logger.error(f"Permission denied to read the file: {file_path}")
+ except Exception as e:
+ logger.error(
+ f"An unexpected error occurred while reading the file: {file_path}. Error: {e}")
+ return None
+
+
+# Good for just exploring random voices
+"""
+def split_by_random(text: str, n: int) -> List[str]:
+ words = text.split()
+ chunks = []
+ min_len = max(1, n - 2)
+ max_len = n + 2
+ while words:
+ chunk_len = random.randint(min_len, max_len)
+ chunk = ' '.join(words[:chunk_len])
+ chunks.append(chunk)
+ words = words[chunk_len:]
+ return chunks
+"""
+# too many libraries, removing
+"""
+def split_by_phrase(text: str, nlp, min_duration=8, max_duration=18, words_per_second=2.3) -> list:
+
+ if text is None:
+ return ''
+ doc = nlp(text)
+ chunks = []
+ min_words = int(min_duration * words_per_second)
+ max_words = int(max_duration * words_per_second)
+
+ current_chunk = ""
+ current_word_count = 0
+
+ for sent in doc.sents:
+ word_count = len(sent.text.split())
+ if current_word_count + word_count < min_words:
+ current_chunk += " " + sent.text.strip()
+ current_word_count += word_count
+ elif current_word_count + word_count <= max_words:
+ current_chunk += " " + sent.text.strip()
+ chunks.append(current_chunk.strip())
+ current_chunk = ""
+ current_word_count = 0
+ else:
+ # Emergency cutoff
+ words = sent.text.split()
+ while words:
+ chunk_len = max_words - current_word_count
+ chunk = ' '.join(words[:chunk_len])
+ current_chunk += " " + chunk
+ chunks.append(current_chunk.strip())
+ current_chunk = ""
+ current_word_count = 0
+ words = words[chunk_len:]
+
+ if current_chunk:
+ chunks.append(current_chunk.strip())
+
+ return chunks
+"""
+
+"""
+def split_by_rhymes(text: str, n: int) -> List[str]:
+ words = text.split()
+ chunks = []
+ current_chunk = []
+ rhyming_word_count = 0
+ for word in words:
+ current_chunk.append(word)
+ if any(rhyme_word in words for rhyme_word in rhymes(word)):
+ rhyming_word_count += 1
+ if rhyming_word_count >= n:
+ chunks.append(' '.join(current_chunk))
+ current_chunk = []
+ rhyming_word_count = 0
+ if current_chunk:
+ chunks.append(' '.join(current_chunk))
+ return chunks
+"""
+
+# 'NN' for noun. 'VB' for verb. 'JJ' for adjective. 'RB' for adverb.
+# NN-VV Noun followed by a verb
+# JJR, JJS
+# UH = Interjection, Goodbye Goody Gosh Wow Jeepers Jee-sus Hubba Hey Kee-reist Oops amen huh howdy uh dammit whammo shucks heck anyways whodunnit honey golly man baby diddle hush sonuvabitch ...
+
+"""
+def split_by_part_of_speech(text: str, pos_pattern: str) -> List[str]:
+ tokens = word_tokenize(text)
+ tagged_tokens = pos_tag(tokens)
+ pos_pattern = pos_pattern.split('-')
+ original_pos_pattern = pos_pattern.copy()
+
+ chunks = []
+ current_chunk = []
+
+ for word, pos in tagged_tokens:
+ current_chunk.append(word)
+ if pos in pos_pattern:
+ pos_index = pos_pattern.index(pos)
+ if pos_index == 0:
+ pos_pattern.pop(0)
+ else:
+ current_chunk = current_chunk[:-1]
+ pos_pattern = original_pos_pattern.copy()
+ if not pos_pattern:
+ chunks.append(' '.join(current_chunk))
+ current_chunk = [word]
+ pos_pattern = original_pos_pattern.copy()
+
+ if current_chunk:
+ chunks.append(' '.join(current_chunk))
+
+ return chunks
+"""
+
+
+
diff --git a/bark_perform.py b/bark_perform.py
new file mode 100644
index 0000000000000000000000000000000000000000..649d9b53d7915b0f961cafc54deca85281ce7c7e
--- /dev/null
+++ b/bark_perform.py
@@ -0,0 +1,164 @@
+import argparse
+import numpy as np
+
+from rich import print
+
+from bark_infinity import config
+
+logger = config.logger
+
+from bark_infinity import generation
+from bark_infinity import api
+
+from bark_infinity import text_processing
+import time
+
+import random
+
+text_prompts_in_this_file = []
+
+
+import torch
+from torch.utils import collect_env
+
+
+try:
+ text_prompts_in_this_file.append(
+ f"It's {text_processing.current_date_time_in_words()} And if you're hearing this, Bark is working. But you didn't provide any text"
+ )
+except Exception as e:
+ print(f"An error occurred: {e}")
+
+text_prompt = """
+ In the beginning the Universe was created. This has made a lot of people very angry and been widely regarded as a bad move. However, Bark is working.
+"""
+text_prompts_in_this_file.append(text_prompt)
+
+text_prompt = """
+ A common mistake that people make when trying to design something completely foolproof is to underestimate the ingenuity of complete fools.
+"""
+text_prompts_in_this_file.append(text_prompt)
+
+
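+# Narrows a full argparse Namespace down to just the arguments that belong to one of the
+# option groups defined in config.DEFAULTS (e.g. only the model-loading options).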
+def get_group_args(group_name, updated_args):
+ # Convert the Namespace object to a dictionary
+ updated_args_dict = vars(updated_args)
+
+ group_args = {}
+ for key, value in updated_args_dict.items():
+ if key in dict(config.DEFAULTS[group_name]):
+ group_args[key] = value
+ return group_args
+
+
+def main(args):
+ if args.loglevel is not None:
+ logger.setLevel(args.loglevel)
+
+ if args.OFFLOAD_CPU is not None:
+ generation.OFFLOAD_CPU = args.OFFLOAD_CPU
+ # print(f"OFFLOAD_CPU is set to {generation.OFFLOAD_CPU}")
+ else:
+ if generation.get_SUNO_USE_DIRECTML() is not True:
+ generation.OFFLOAD_CPU = True # default on just in case
+ if args.USE_SMALL_MODELS is not None:
+ generation.USE_SMALL_MODELS = args.USE_SMALL_MODELS
+ # print(f"USE_SMALL_MODELS is set to {generation.USE_SMALL_MODELS}")
+ if args.GLOBAL_ENABLE_MPS is not None:
+ generation.GLOBAL_ENABLE_MPS = args.GLOBAL_ENABLE_MPS
+ # print(f"GLOBAL_ENABLE_MPS is set to {generation.GLOBAL_ENABLE_MPS}")
+
+ if not args.silent:
+ if args.detailed_gpu_report or args.show_all_reports:
+ print(api.startup_status_report(quick=False))
+ elif not args.text_prompt and not args.prompt_file: # probably a test run, default to show
+ print(api.startup_status_report(quick=True))
+ if args.detailed_hugging_face_cache_report or args.show_all_reports:
+ print(api.hugging_face_cache_report())
+ if args.detailed_cuda_report or args.show_all_reports:
+ print(api.cuda_status_report())
+ if args.detailed_numpy_report:
+ print(api.numpy_report())
+ if args.run_numpy_benchmark or args.show_all_reports:
+ from bark_infinity.debug import numpy_benchmark
+
+ numpy_benchmark()
+
+ if args.list_speakers:
+ api.list_speakers()
+ return
+
+ if args.render_npz_samples:
+ api.render_npz_samples()
+ return
+
+ if args.text_prompt:
+ text_prompts_to_process = [args.text_prompt]
+ elif args.prompt_file:
+ text_file = text_processing.load_text(args.prompt_file)
+ if text_file is None:
+ logger.error(f"Error loading file: {args.prompt_file}")
+ return
+ text_prompts_to_process = text_processing.split_text(
+ text_file, # already loaded above; no need to read the file a second time
+ args.split_input_into_separate_prompts_by,
+ args.split_input_into_separate_prompts_by_value,
+ )
+
+ print(f"\nProcessing file: {args.prompt_file}")
+ print(f" Looks like: {len(text_prompts_to_process)} prompt(s)")
+
+ else:
+ print("No --text_prompt or --prompt_file specified, using test prompt.")
+ text_prompts_to_process = random.sample(text_prompts_in_this_file, 2)
+
+ things = len(text_prompts_to_process) + args.output_iterations
+ if things > 10:
+ if args.dry_run is False:
+ print(
+ f"WARNING: You are about to process {things} prompts. Consider using '--dry-run' to test things first."
+ )
+
+ # pprint(args)
+ print("Loading Bark models...")
+ if not args.dry_run and generation.get_SUNO_USE_DIRECTML() is not True:
+ generation.preload_models(
+ args.text_use_gpu,
+ args.text_use_small,
+ args.coarse_use_gpu,
+ args.coarse_use_small,
+ args.fine_use_gpu,
+ args.fine_use_small,
+ args.codec_use_gpu,
+ args.force_reload,
+ )
+
+ print("Done.")
+
+ for idx, text_prompt in enumerate(text_prompts_to_process, start=1):
+ if len(text_prompts_to_process) > 1:
+ print(f"\nPrompt {idx}/{len(text_prompts_to_process)}:")
+
+ # print(f"Text prompt: {text_prompt}")
+ for iteration in range(1, args.output_iterations + 1):
+ if args.output_iterations > 1:
+ print(f"\nIteration {iteration} of {args.output_iterations}.")
+ if iteration == 1:
+ print("ss", text_prompt)
+
+ args.current_iteration = iteration
+ args.text_prompt = text_prompt
+ args_dict = vars(args)
+
+ api.generate_audio_long(**args_dict)
+
+
+if __name__ == "__main__":
+ parser = config.create_argument_parser()
+
+ args = parser.parse_args()
+
+ updated_args = config.update_group_args_with_defaults(args)
+
+ namespace_args = argparse.Namespace(**updated_args)
+ main(namespace_args)
diff --git a/bark_webui.bat b/bark_webui.bat
new file mode 100644
index 0000000000000000000000000000000000000000..1911357bb2230278aae7b98ea08b94a2efe16b81
--- /dev/null
+++ b/bark_webui.bat
@@ -0,0 +1,4 @@
+@echo off
+call %USERPROFILE%\mambaforge\Scripts\activate.bat bark-infinity-oneclick
+python %USERPROFILE%\bark\bark_webui.py
+pause
diff --git a/bark_webui.py b/bark_webui.py
new file mode 100644
index 0000000000000000000000000000000000000000..dea0a370a423ad081b8098a0df6b5b2c8a9021b2
--- /dev/null
+++ b/bark_webui.py
@@ -0,0 +1,3032 @@
+import datetime
+import os
+import random
+import glob
+import argparse
+import gradio as gr
+from gradio.components import Markdown as m
+import sys
+from collections import defaultdict
+from tqdm import tqdm
+
+os.environ["TERM"] = "dumb"
+import io
+from bark_infinity import config
+from bark_infinity import debug
+
+logger = config.logger
+logger.setLevel("INFO")
+
+from bark_infinity import generation
+from bark_infinity import api
+from startfile import startfile
+import requests
+
+import torch
+
+pytorch_version = float(".".join(torch.__version__.split(".")[:2]))
+print(f"Pytorch version: {pytorch_version}")
+
+ENABLE_DIRECTML_CLONE = os.environ.get("ENABLE_DIRECTML_CLONE", "0")
+
+current_tab = "generate"
+barkdebug = False
+
+if generation.get_SUNO_USE_DIRECTML() is not True:
+ generation.OFFLOAD_CPU = True
+
+base_theme = gr.themes.Base()
+default_theme = gr.themes.Default()
+monochrome_theme = gr.themes.Monochrome()
+soft_theme = gr.themes.Soft()
+glass_theme = gr.themes.Glass()
+
+
+def close_gradio(self):
+ print("Shutdown request received")
+ gr.close()
+
+
+gradio_hf_hub_themes = [
+ "gradio/glass",
+ "gradio/monochrome",
+ "gradio/seafoam",
+ "gradio/soft",
+ "freddyaboulton/dracula_revamped",
+ "gradio/dracula_test",
+ "abidlabs/dracula_test",
+ "abidlabs/pakistan",
+ "dawood/microsoft_windows",
+ "ysharma/steampunk",
+]
+
+
+def add_text(history, text):
+ history = history + [(text, None)]
+ return history, ""
+
+
+def add_file(history, file):
+ history = history + [((file.name,), None)]
+ return history
+
+
+def bot(history):
+ response = "**That's cool!**"
+ history[-1][1] = response
+ return history
+
+
+if generation.get_SUNO_USE_DIRECTML() is not True or ENABLE_DIRECTML_CLONE != "1":
+ from bark_infinity.clonevoice import clone_voice
+
+
+print(api.startup_status_report(True))
+
+import threading
+import time
+
+from webui import styles
+from webui import transformations
+from webui.ui_components import FormRow, FormColumn, FormGroup, ToolButton, FormHTML
+
+from webui import ui_loadsave
+
+style_csv = "webui/styles.csv"
+user_style_csv = "webui/user_styles.csv"
+
+transformation_csv = "webui/transformations.csv"
+user_transformation_csv = "webui/user_transformations.csv"
+
+prompt_styles = styles.StyleDatabase(style_csv, user_style_csv)
+
+prompt_transformations = transformations.TransformationDatabase(
+ transformation_csv, user_transformation_csv
+)
+
+# prompt_styles = styles.StyleDatabase("webui/styles.csv", "webui/user_styles.csv")
+# prompt_transformations = transformations.TransformationDatabase("webui/transformations.csv", "webui/user_transformations.csv")
+
+cancel_process = False
+
+last_audio_samples = []
+
+# not right but just to get it working
+global_outputs_to_show = 5
+
+
+loadsave = ui_loadsave.UiLoadsave("gradio_options.json")
+
+
+global save_log_lines
+save_log_lines = 100
+
+
+scroll_style = """
+
+"""
+
+bark_console_style = """
+.bark_console {
+font: 1.3rem Inconsolata, monospace;
+ white-space: pre;
+ padding: 5px;
+ border: 2px dashed orange;
+ border-radius: 3px;
+ max-height: 500px;
+ overflow-y: scroll;
+ font-size: 90%;
+ overflow-x: hidden;
+ }
+
+
+ #cloning {background: green !important;}
+
+
+
+ #styles_row button {
+display: flex;
+width: 2em;
+ align-self: end;
+ margin: 8px 13px 0px 0px;
+ }
+
+
+ #styles_row div .wrap .wrap-inner, #styles_row div.panel {
+padding: 0px;
+ margin: 0px;
+ min-height: 34px;
+
+ }
+
+ #styles_row div.form {
+ border: none;
+ position: absolute;
+ background: none;
+ }
+
+
+div#styles_row {
+ min-height: 100px;
+
+}
+
+ body #styles_row button {
+
+position: absolute;
+
+ }
+
+
+body div.tiny_column {
+
+ min-width: 0px !important;
+
+}
+
+body div#selected_npz_file {
+ padding: 0 !important;
+
+}
+
+body div#selected_npz_file > label > textarea {
+
+
+
+ background: re !important;
+}
+
+body div#selected_npz_file > div {
+ display: none;
+
+}
+
+body .bark_upload_audio, body .bark_upload_file, body .bark_output_audio {
+ height: 90px !important;
+}
+
+body .bark_output_audio {
+ height: 120px !important;
+}
+
+
+
+
+
+body div#speaker_preview_audio {
+ height: 90px !important;
+
+}
+
+
+body div#speaker_preview_audio svg {
+ position: relative;
+ top: -40px;
+
+
+}
+
+
+body div#specific_npz_file {
+ height: 126px !important;
+
+}
+
+body .bark_upload_audio#specific_npz_file{
+}
+
+
+.bark_upload_audio .svelte-19sk1im::before {
+ content: "Click to Crop Audio File";
+ position: absolute;
+ left: -145px;
+}
+#main_top_ui_tabs > .tab-nav > button {
+ font-size: 135%;
+
+}
+
+#main_top_ui_tabs > .tab-nav > button.selected {
+
+}
+
+body div#generate_options_row_id > div > span {
+ font-size: 22px !important;
+
+}
+
+body div#generate_options_row_id > div > span:hover {
+ box-shadow: 0 5px 15px rgba(0, 0, 0, 0.8);
+
+}
+
+
+"""
+import functools
+
+
+where_am_i = os.getcwd()
+
+
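+# Thread-based timeout decorator: the wrapped call runs in a worker thread and the wrapper
+# returns None if it hasn't finished within `seconds` (the worker is abandoned, not killed).
+# Illustrative usage (hypothetical function):
+#   @timeout(5)
+#   def slow_lookup(url): ...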
+def timeout(seconds):
+ def decorator(func):
+ @functools.wraps(func)
+ def wrapper(*args, **kwargs):
+ result = [None]
+ thread = threading.Thread(target=lambda: result.__setitem__(0, func(*args, **kwargs)))
+ thread.start()
+ thread.join(seconds)
+ if thread.is_alive():
+ return None
+ return result[0]
+
+ return wrapper
+
+ return decorator
+
+
+# I made a CLI app. This is my solution. I'm not proud of it.
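+# Turns a raw "--key value --key2 value2" string into a kwargs dict, coercing
+# true/false/int/float where possible. For example,
+#   parse_extra_args("--semantic_top_k 50 --hoarder_mode true")
+# returns {'semantic_top_k': 50, 'hoarder_mode': True}.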
+def parse_extra_args(extra_args_str):
+ extra_args = extra_args_str.split("--")
+ parsed_args = {}
+ for arg in extra_args:
+ if not arg.strip():
+ continue
+ parts = arg.strip().split(" ", 1)
+ if len(parts) == 1:
+ # a bare flag with no value would otherwise crash here; treat it as a boolean True
+ parsed_args[parts[0]] = True
+ continue
+ key, value = parts
+ if value.lower() == "true":
+ value = True
+ elif value.lower() == "false":
+ value = False
+ else:
+ try:
+ value = int(value)
+ except ValueError:
+ try:
+ value = float(value)
+ except ValueError:
+ pass # If it's not a number, keep it as a string
+ parsed_args[key] = value
+ return parsed_args
+
+
+def clone_voice_gradio(
+ audio_filepath,
+ input_audio_filename_secondary,
+ speaker_as_clone_content,
+ dest_filename,
+ extra_blurry_clones,
+ even_more_clones,
+ audio_filepath_directory,
+ simple_clones_only,
+):
+ if not generation.get_SUNO_USE_DIRECTML() or ENABLE_DIRECTML_CLONE != "0":
+ clone_dir = clone_voice(
+ audio_filepath,
+ input_audio_filename_secondary,
+ dest_filename,
+ speaker_as_clone_content,
+ progress=gr.Progress(track_tqdm=True),
+ max_retries=2,
+ even_more_clones=even_more_clones,
+ extra_blurry_clones=extra_blurry_clones,
+ audio_filepath_directory=audio_filepath_directory,
+ simple_clones_only=simple_clones_only,
+ )
+ return clone_dir
+ else:
+ print("Using DirectML for cloning not yet supported")
+ # if extra_blurry_clones is True:
+ # return clone_dir
+ # else:
+ # return False
+
+
+def add_text(history, text):
+ history = history + [(text, None)]
+ return history, ""
+
+
+def add_file(history, file):
+ # history = history + [((file.name,), None)]
+ history = history + [((file,), None)]
+ return history
+
+
+def bot(history):
+ response = "**That's cool!**"
+ history[-1][1] = response
+ return history
+
+
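+# Shared helper for the various speaker pickers: whichever widget produced a usable value
+# (a list selection, a plain string, or an uploaded file object with a .name) ends up in
+# kwargs["history_prompt"], as long as it is at least min_length characters long.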
+def validate_and_update(prompt, kwargs, min_length=6, barkdebug=False):
+ try:
+ if not prompt: # prompt is None, an empty string, or an empty list; nothing to select
+ if barkdebug:
+ print(f"Skipping {prompt}: Null or Empty")
+ return kwargs
+ if isinstance(prompt, list):
+ if prompt: # Checks if the list is not empty
+ selected = prompt[0] # Gets first item from list
+ if barkdebug:
+ print(f"Selected first item from list: {selected}")
+ elif isinstance(prompt, str):
+ selected = prompt.strip()
+ if barkdebug:
+ print(f"Selected string after stripping: {selected}")
+
+ elif hasattr(prompt, "name"):
+ selected = prompt.name
+ if (
+ len(selected) >= min_length
+ ): # Checks if string length is greater than or equal to min_length
+ kwargs["history_prompt"] = selected
+ if barkdebug:
+ print(f"Updated kwargs[history_prompt] to {selected}")
+ else:
+ if barkdebug:
+ print(f"Skipping {selected}: Length less than {min_length}")
+ except Exception as e:
+ if barkdebug:
+ print(f"Error in validate_and_update function: {str(e)} {prompt} {type(prompt)}")
+ return kwargs
+
+
+def generate_audio_long_gradio(
+ input,
+ audio_prompt_input,
+ bark_speaker_as_the_prompt,
+ npz_dropdown,
+ generated_voices,
+ cloned_voices,
+ bark_infinity_voices,
+ confused_travolta_mode,
+ allow_blank,
+ stable_mode_interval,
+ separate_prompts,
+ separate_prompts_flipper,
+ split_character_goal_length,
+ split_character_max_length,
+ process_text_by_each,
+ in_groups_of_size,
+ group_text_by_counting,
+ split_type_string,
+ prompt_text_prefix,
+ prompt_text_suffix,
+ seed,
+ text_splits_only,
+ output_iterations,
+ hoarder_mode,
+ text_temp,
+ waveform_temp,
+ semantic_min_eos_p,
+ output_dir,
+ output_filename,
+ output_format,
+ add_silence_between_segments,
+ semantic_top_k,
+ semantic_top_p,
+ coarse_top_k,
+ coarse_top_p,
+ specific_npz_file,
+ audio_file_as_history_prompt,
+ specific_npz_folder,
+ split_character_jitter,
+ semantic_token_repeat_penalty,
+ semantic_inverted_p,
+ semantic_bottom_k,
+ semantic_use_mirostat_sampling,
+ semantic_mirostat_tau,
+ semantic_mirostat_learning_rate,
+ negative_text_prompt,
+ specific_npz_file_negative_prompt,
+ negative_text_prompt_logits_scale,
+ negative_text_prompt_divergence_scale,
+ extra_args_str,
+ progress=gr.Progress(track_tqdm=True),
+):
+ print("\n")
+
+ global last_audio_samples
+ # todo allow blank
+ if (input is None or len(input) < 4) and not allow_blank:
+ print(
+ "\nLooks like you forgot to enter a text prompt. There is a checkbox to enable empty prompts, if you really want."
+ )
+ return
+ text_splits_only = True
+
+ trim_logs()
+ global cancel_process
+
+ progress(0, desc="Starting...")
+ waiting = 0
+ while api.gradio_try_to_cancel and not api.done_cancelling:
+ waiting += 1
+ print("Waiting up to 10s current generation to finish before starting another...")
+ progress(
+ waiting,
+ desc="Waiting up to 10s current generation to finish before starting another...",
+ )
+ if waiting > 10:
+ print(
+ "Everything might be okay, but something didn't quite cancel properly so restart if things are weird."
+ )
+ break
+ time.sleep(1)
+
+ if api.gradio_try_to_cancel and api.done_cancelling:
+ cleanup_after_cancel()
+ api.gradio_try_to_cancel = False
+ api.done_cancelling = False
+ cancel_process = False
+
+ if api.done_cancelling:
+ print("Shouldn't happen, just return for now.")
+ return
+
+ if barkdebug is True:
+ print(locals())
+
+ kwargs = {}
+ kwargs["text_prompt"] = input
+
+ kwargs["negative_text_prompt"] = negative_text_prompt
+
+ # Validate and update prompts
+ kwargs = validate_and_update(npz_dropdown, kwargs, barkdebug=barkdebug)
+ kwargs = validate_and_update(bark_infinity_voices, kwargs, barkdebug=barkdebug)
+ kwargs = validate_and_update(generated_voices, kwargs, barkdebug=barkdebug)
+ kwargs = validate_and_update(cloned_voices, kwargs, barkdebug=barkdebug)
+ kwargs = validate_and_update(specific_npz_file, kwargs, barkdebug=barkdebug)
+
+ # specific_npz_file_negative_prompt with specific_npz_file_negative_prompt.name
+
+ if specific_npz_file_negative_prompt != "" and specific_npz_file_negative_prompt is not None:
+ specific_npz_file_negative_prompt_name = specific_npz_file_negative_prompt.name
+ kwargs["specific_npz_file_negative_prompt"] = specific_npz_file_negative_prompt_name
+
+ if audio_file_as_history_prompt != "" and audio_file_as_history_prompt is not None:
+ # audio_file_as_history_prompt_name = audio_file_as_history_prompt.name
+ kwargs["audio_file_as_history_prompt"] = audio_file_as_history_prompt
+
+ if bark_speaker_as_the_prompt != "" and bark_speaker_as_the_prompt is not None:
+ # bark_speaker_as_the_prompt_name = bark_speaker_as_the_prompt.name
+ kwargs["bark_speaker_as_the_prompt"] = bark_speaker_as_the_prompt
+
+ if audio_prompt_input is not None and audio_prompt_input != "":
+ kwargs["audio_prompt"] = audio_prompt_input
+
+ if specific_npz_folder != "" and specific_npz_folder is not None:
+ kwargs["specific_npz_folder"] = specific_npz_folder
+
+ kwargs["split_character_goal_length"] = int(split_character_goal_length)
+ kwargs["split_character_max_length"] = int(split_character_max_length)
+
+ if split_character_jitter != "" and split_character_jitter is not None:
+ kwargs["split_character_jitter"] = float(split_character_jitter)
+
+ if process_text_by_each is not None and process_text_by_each != "":
+ kwargs["process_text_by_each"] = process_text_by_each
+
+ if in_groups_of_size is not None:
+ kwargs["in_groups_of_size"] = int(in_groups_of_size)
+
+ if group_text_by_counting is not None and group_text_by_counting != "":
+ kwargs["group_text_by_counting"] = group_text_by_counting
+
+ if split_type_string is not None and split_type_string != "":
+ kwargs["split_type_string"] = split_type_string
+
+ if prompt_text_prefix is not None and prompt_text_prefix != "":
+ kwargs["prompt_text_prefix"] = prompt_text_prefix
+
+ if prompt_text_suffix is not None and prompt_text_suffix != "":
+ kwargs["prompt_text_suffix"] = prompt_text_suffix
+
+ if seed != "" and seed is not None and seed > 0 or seed < 0:
+ # because i moved iterations to Gradio, we can't just pass the seed or
+ # it will be reset for iteration.
+ # for now, let's set it manually
+ # kwargs["single_starting_seed"] = int(seed)
+ custom_seed = int(seed)
+ api.set_seed(custom_seed) # will also let them renable with -1
+
+ if stable_mode_interval != "" and stable_mode_interval is not None:
+ if stable_mode_interval == "Continuous":
+ kwargs["stable_mode_interval"] = 0
+ elif stable_mode_interval == "Stable":
+ kwargs["stable_mode_interval"] = 1
+ elif stable_mode_interval == "Stable-2":
+ kwargs["stable_mode_interval"] = 2
+ elif stable_mode_interval == "Stable-3":
+ kwargs["stable_mode_interval"] = 3
+ elif stable_mode_interval == "Stable-4":
+ kwargs["stable_mode_interval"] = 4
+ elif stable_mode_interval == "Stable-5":
+ kwargs["stable_mode_interval"] = 5
+ else:
+ kwargs["stable_mode_interval"] = int(stable_mode_interval)
+
+ if text_splits_only != "" and text_splits_only is not None:
+ kwargs["text_splits_only"] = text_splits_only
+
+ if separate_prompts != "" and separate_prompts is not None:
+ kwargs["separate_prompts"] = separate_prompts
+
+ if separate_prompts_flipper != "" and separate_prompts_flipper is not None:
+ kwargs["separate_prompts_flipper"] = separate_prompts_flipper
+
+ if hoarder_mode != "" and hoarder_mode is not None:
+ kwargs["hoarder_mode"] = hoarder_mode
+
+ if confused_travolta_mode != "" and confused_travolta_mode is not None:
+ kwargs["confused_travolta_mode"] = confused_travolta_mode
+
+ # DirectML doesn't cover all of the sampling code yet, so disable these options there
+ if generation.get_SUNO_USE_DIRECTML() is True:
+ semantic_top_k = None
+ semantic_top_p = None
+ coarse_top_k = None
+ coarse_top_p = None
+
+ if semantic_top_k is not None and semantic_top_k != "" and semantic_top_k > 0:
+ kwargs["semantic_top_k"] = int(semantic_top_k)
+
+ if semantic_top_p is not None and semantic_top_p != "" and semantic_top_p > 0:
+ kwargs["semantic_top_p"] = float(semantic_top_p)
+
+ if coarse_top_k is not None and coarse_top_k != "" and coarse_top_k > 0:
+ kwargs["coarse_top_k"] = int(coarse_top_k)
+
+ if coarse_top_p is not None and coarse_top_p != "" and coarse_top_p > 0:
+ kwargs["coarse_top_p"] = float(coarse_top_p)
+
+ if (
+ negative_text_prompt_logits_scale is not None
+ and negative_text_prompt_logits_scale != ""
+ and negative_text_prompt_logits_scale > 0
+ ):
+ kwargs["negative_text_prompt_logits_scale"] = float(negative_text_prompt_logits_scale)
+
+ if (
+ negative_text_prompt_divergence_scale is not None
+ and negative_text_prompt_divergence_scale != ""
+ and negative_text_prompt_divergence_scale > 0
+ ):
+ kwargs["negative_text_prompt_divergence_scale"] = float(
+ negative_text_prompt_divergence_scale
+ )
+
+ if (
+ semantic_token_repeat_penalty is not None
+ and semantic_token_repeat_penalty != ""
+ and semantic_token_repeat_penalty > 0
+ ):
+ kwargs["semantic_token_repeat_penalty"] = float(semantic_token_repeat_penalty)
+
+ if semantic_inverted_p is not None and semantic_inverted_p != "" and semantic_inverted_p > 0:
+ kwargs["semantic_inverted_p"] = float(semantic_inverted_p)
+
+ if semantic_bottom_k is not None and semantic_bottom_k != "" and semantic_bottom_k > 0:
+ kwargs["semantic_bottom_k"] = int(semantic_bottom_k)
+
+ if semantic_use_mirostat_sampling is not None and semantic_use_mirostat_sampling != "":
+ kwargs["semantic_use_mirostat_sampling"] = semantic_use_mirostat_sampling
+
+ if semantic_mirostat_tau is not None and semantic_mirostat_tau != "":
+ kwargs["semantic_mirostat_tau"] = float(semantic_mirostat_tau)
+
+ if semantic_mirostat_learning_rate is not None and semantic_mirostat_learning_rate != "":
+ kwargs["semantic_mirostat_learning_rate"] = float(semantic_mirostat_learning_rate)
+
+ if output_dir is not None and output_dir != "":
+ kwargs["output_dir"] = output_dir
+
+ if output_filename is not None and output_filename != "":
+ kwargs["output_filename"] = output_filename
+
+ if output_format is not None and output_format != "":
+ kwargs["output_format"] = output_format
+
+ if text_temp is not None and text_temp != "":
+ kwargs["text_temp"] = float(text_temp)
+
+ if waveform_temp is not None and waveform_temp != "":
+ kwargs["waveform_temp"] = float(waveform_temp)
+
+ if semantic_min_eos_p is not None and semantic_min_eos_p != "":
+ kwargs["semantic_min_eos_p"] = float(semantic_min_eos_p)
+
+ if add_silence_between_segments is not None and add_silence_between_segments != "":
+ kwargs["add_silence_between_segments"] = float(add_silence_between_segments)
+
+ kwargs.update(parse_extra_args(extra_args_str))
+
+ using_these_params = kwargs.copy()
+ using_these_params["text_prompt"] = f"{input[:10]}... {len(input)} chars"
+ # print(f"Using these params: {using_these_params}")
+
+ if output_iterations is not None and output_iterations != "":
+ output_iterations = int(output_iterations)
+ else:
+ output_iterations = 1
+
+ if text_splits_only:
+ output_iterations = 1
+ (
+ full_generation_segments,
+ audio_arr_segments,
+ final_filename_will_be,
+ clone_created_filepaths,
+ ) = (
+ None,
+ None,
+ None,
+ [],
+ )
+
+ kwargs["output_iterations"] = output_iterations
+
+ npz_files = None
+ if specific_npz_folder is not None and specific_npz_folder != "":
+ print(f"Looking for npz files in {specific_npz_folder}")
+ npz_files = [f for f in os.listdir(specific_npz_folder) if f.endswith(".npz")]
+ npz_files.sort()
+ if len(npz_files) == 0:
+ print(f"Found no npz files in {specific_npz_folder}")
+ else:
+ total_iterations = kwargs["output_iterations"] * len(npz_files)
+
+ print(
+ f"Found {len(npz_files)} npz files in {specific_npz_folder} so will generate {total_iterations} total outputs"
+ )
+
+ # print(f"kwargs: {kwargs}")
+ if npz_files is not None and len(npz_files) > 0:
+ for i, npz_file in enumerate(npz_files):
+ print(f"Using npz file {i+1} of {len(npz_files)}: {npz_file}")
+ kwargs["history_prompt"] = os.path.join(specific_npz_folder, npz_file)
+
+ for iteration in range(1, output_iterations + 1):
+ text_prompt = kwargs.get("text_prompt")
+ if output_iterations > 1:
+ if iteration == 1:
+ print(" ", text_prompt)
+
+ kwargs["current_iteration"] = iteration
+ progress(
+ iteration,
+ desc=f"Iteration: {iteration}/{output_iterations}...",
+ total=output_iterations,
+ )
+
+ (
+ full_generation_segments,
+ audio_arr_segments,
+ final_filename_will_be,
+ clone_created_filepaths,
+ ) = api.generate_audio_long_from_gradio(**kwargs)
+ last_audio_samples.append(final_filename_will_be)
+
+ if cancel_process:
+ return final_filename_will_be
+ if kwargs.get("text_splits_only", False):
+ final_filename_will_be = "bark_infinity/assets/split_the_text.wav"
+ return final_filename_will_be
+ else:
+ for iteration in range(1, output_iterations + 1):
+ text_prompt = kwargs.get("text_prompt")
+ if output_iterations > 1:
+ if iteration == 1:
+ print(" ", text_prompt)
+
+ kwargs["current_iteration"] = iteration
+ progress(
+ iteration,
+ desc=f"Iteration: {iteration}/{output_iterations}...",
+ total=output_iterations,
+ )
+
+ (
+ full_generation_segments,
+ audio_arr_segments,
+ final_filename_will_be,
+ clone_created_filepaths,
+ ) = api.generate_audio_long_from_gradio(**kwargs)
+ last_audio_samples.append(final_filename_will_be)
+
+ if cancel_process:
+ return final_filename_will_be
+ if kwargs.get("text_splits_only", False):
+ final_filename_will_be = "bark_infinity/assets/split_the_text.wav"
+
+ return final_filename_will_be
+
+
+voice_directories = config.VALID_HISTORY_PROMPT_DIRS
+
+
+def generate_audio_long_gradio_clones(
+ input,
+ audio_prompt_input,
+ bark_speaker_as_the_prompt,
+ npz_dropdown,
+ generated_voices,
+ cloned_voices,
+ bark_infinity_voices,
+ confused_travolta_mode,
+ allow_blank,
+ stable_mode_interval,
+ separate_prompts,
+ separate_prompts_flipper,
+ split_character_goal_length,
+ split_character_max_length,
+ process_text_by_each,
+ in_groups_of_size,
+ group_text_by_counting,
+ split_type_string,
+ prompt_text_prefix,
+ prompt_text_suffix,
+ seed,
+ text_splits_only,
+ output_iterations,
+ hoarder_mode,
+ text_temp,
+ waveform_temp,
+ semantic_min_eos_p,
+ output_dir,
+ output_filename,
+ output_format,
+ add_silence_between_segments,
+ semantic_top_k,
+ semantic_top_p,
+ coarse_top_k,
+ coarse_top_p,
+ specific_npz_file,
+ audio_file_as_history_prompt,
+ specific_npz_folder,
+ split_character_jitter,
+ semantic_token_repeat_penalty,
+ semantic_inverted_p,
+ semantic_bottom_k,
+ semantic_use_mirostat_sampling,
+ semantic_mirostat_tau,
+ semantic_mirostat_learning_rate,
+ negative_text_prompt,
+ specific_npz_file_negative_prompt,
+ negative_text_prompt_logits_scale,
+ negative_text_prompt_divergence_scale,
+ extra_args_str,
+ progress=gr.Progress(track_tqdm=True),
+):
+ if input is None or input == "":
+ print("No input text provided to render samples.")
+ return None
+
+ hoarder_mode = True
+ output_dir = f"cloned_voices/{output_filename}_samples"
+
+ print(f"output_dir: {output_dir}")
+
+ return generate_audio_long_gradio(
+ input,
+ audio_prompt_input,
+ bark_speaker_as_the_prompt,
+ npz_dropdown,
+ generated_voices,
+ cloned_voices,
+ bark_infinity_voices,
+ confused_travolta_mode,
+ allow_blank,
+ stable_mode_interval,
+ separate_prompts,
+ separate_prompts_flipper,
+ split_character_goal_length,
+ split_character_max_length,
+ process_text_by_each,
+ in_groups_of_size,
+ group_text_by_counting,
+ split_type_string,
+ prompt_text_prefix,
+ prompt_text_suffix,
+ seed,
+ text_splits_only,
+ output_iterations,
+ hoarder_mode,
+ text_temp,
+ waveform_temp,
+ semantic_min_eos_p,
+ output_dir,
+ output_filename,
+ output_format,
+ add_silence_between_segments,
+ semantic_top_k,
+ semantic_top_p,
+ coarse_top_k,
+ coarse_top_p,
+ specific_npz_file,
+ audio_file_as_history_prompt,
+ specific_npz_folder,
+ split_character_jitter,
+ semantic_token_repeat_penalty,
+ semantic_inverted_p,
+ semantic_bottom_k,
+ semantic_use_mirostat_sampling,
+ semantic_mirostat_tau,
+ semantic_mirostat_learning_rate,
+ negative_text_prompt,
+ specific_npz_file_negative_prompt,
+ negative_text_prompt_logits_scale,
+ negative_text_prompt_divergence_scale,
+ extra_args_str,
+ progress=gr.Progress(track_tqdm=True),
+ )
+
+
+import os
+import pathlib
+
+
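+# Resolves whatever a speaker widget hands back (a path string or a file-like object with
+# a .name) into (basename, full path, optional audio preview update) for the UI.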
+def get_filename(x, debug=barkdebug):
+ if debug:
+ print(f"Type of x: {type(x)}")
+ print(f"Value of x: {x}")
+
+ if isinstance(x, str):
+ filename = x
+ elif hasattr(x, "name"):
+ filename = x.name
+ else:
+ return "", "", None
+
+ if debug:
+ print(f"Filename: {filename}")
+
+ audio_speaker_preview = None
+ audio_preview_segment = None
+ try:
+ if filename.endswith(".npz"):
+ base_dir = pathlib.Path(filename).parent
+ base_name = pathlib.Path(filename).stem
+
+ if debug:
+ print(f"Base dir: {base_dir}")
+ print(f"Base name: {base_name}")
+
+ """
+ audio_extensions = [".wav", ".mp3", ".ogg", ".flac", ".mp4"]
+ for extension in audio_extensions:
+ possible_audio_file = base_dir / f"{base_name}{extension}"
+ if debug:
+ print(f"audio 1: {audio_speaker_preview}")
+ if possible_audio_file.exists():
+ audio_speaker_preview = str(possible_audio_file)
+
+ break
+ possible_audio_file = base_dir / f"{base_name}"
+
+ if debug:
+ print(f"audio 1: {audio_speaker_preview}")
+ if possible_audio_file.exists():
+ audio_speaker_preview = str(possible_audio_file)
+ break
+ """
+
+ if audio_speaker_preview:
+ audio_preview_segment = gr.Audio.update(
+ audio_speaker_preview,
+ label="",
+ visible=True,
+ )
+ except Exception as e:
+ if debug:
+ print(f"An error occurred: {e}")
+ return os.path.basename(filename), filename, None
+
+ return os.path.basename(filename), filename, audio_preview_segment
+
+
+def create_npz_dropdown_single(directory, label, info="", allow_custom_value=False):
+ npz_files_by_subfolder = defaultdict(list)
+
+ for npz_file in glob.glob(os.path.join(directory, "**", "*.npz"), recursive=True):
+ subfolder = os.path.dirname(npz_file)
+ npz_files_by_subfolder[subfolder].append(npz_file)
+
+ sorted_npz_files = []
+ for subfolder in sorted(npz_files_by_subfolder.keys()):
+ sorted_npz_files.extend(sorted(npz_files_by_subfolder[subfolder]))
+
+ # npz_dropdown = gr.Dropdown(sorted_npz_files, label=label, info=info, allow_custom_value=allow_custom_value)
+ npz_dropdown = gr.Dropdown(
+ sorted_npz_files,
+ label=label,
+ info=info,
+ allow_custom_value=True,
+ multiselect=True,
+ max_choices=1,
+ )
+
+ return npz_dropdown
+
+
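+# Builds the speaker dropdown from one or more voice directories: .npz files are gathered
+# recursively (empty files are skipped), grouped by subfolder, and listed as paths relative
+# to base_path so a selection can be passed straight to the generation code.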
+def create_npz_dropdown(
+ directory_list, base_path=where_am_i, label="", info="", allow_custom_value=False
+):
+ npz_files_by_subfolder = defaultdict(list)
+
+ # Check if a single string is passed and convert to a single element list
+ if isinstance(directory_list, str):
+ directory_list = [directory_list]
+
+ for directory in directory_list:
+ full_path = os.path.join(base_path, directory) # Join with base directory
+ if os.path.exists(full_path):
+ for npz_file in glob.glob(os.path.join(full_path, "**", "*.npz"), recursive=True):
+ if os.path.getsize(npz_file) > 0: # Check if file is not empty
+ # Get the relative path from base_path
+ relative_path = os.path.relpath(npz_file, base_path)
+ subfolder = os.path.dirname(relative_path)
+ npz_files_by_subfolder[subfolder].append(relative_path)
+ else:
+ logger.debug(f"File {relative_path} is empty. Skipping.")
+ else:
+ logger.debug(f"Directory {full_path} does not exist. Skipping.")
+
+ sorted_npz_files = []
+ for subfolder in sorted(npz_files_by_subfolder.keys()):
+ sorted_npz_files.extend(sorted(npz_files_by_subfolder[subfolder]))
+
+ npz_dropdown = gr.Dropdown(
+ sorted_npz_files,
+ label=label,
+ info=info,
+ allow_custom_value=allow_custom_value,
+ multiselect=True,
+ max_choices=1,
+ )
+
+ return npz_dropdown
+
+
+outputs_dirs = ["bark_samples"]
+
+
+class Logger:
+ def __init__(self, filename):
+ self.terminal = sys.stdout
+ self.log = open(filename, "w", encoding="utf-8")
+
+ def write(self, message):
+ self.terminal.write(message)
+ self.log.write(message)
+
+ def flush(self):
+ self.terminal.flush()
+ self.log.flush()
+
+ def isatty(self):
+ return False
+
+
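+# Re-wrap stdout/stderr as UTF-8 with errors="replace" so non-ASCII output can't crash
+# logging on consoles with a non-UTF-8 encoding, then tee stdout into the log file that
+# the web UI's console panel reads back.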
+sys.stdout = io.TextIOWrapper(
+ sys.stdout.buffer,
+ encoding="utf-8",
+ errors="replace",
+ newline="",
+ line_buffering=True,
+)
+sys.stderr = io.TextIOWrapper(
+ sys.stderr.buffer,
+ encoding="utf-8",
+ errors="replace",
+ newline="",
+ line_buffering=True,
+)
+
+sys.stdout = Logger("gradio_terminal_ouput.log")
+
+
+def test(x):
+ return
+
+
+def read_logs():
+ sys.stdout.flush()
+ with open("gradio_terminal_ouput.log", "r", encoding="utf-8") as f:
+ return f.read()
+
+
+def update_option(option_list, key, value, extra_help=None):
+ # Make a copy of the list so we don't modify the original
+ option_list = option_list.copy()
+
+ # Look for the option we want to change
+ for i, (option_key, option_values) in enumerate(option_list):
+ if option_key == key:
+ # Make a copy of the dict so we don't modify the original
+ option_values = option_values.copy()
+
+ # Update the option
+ option_values["value"] = value
+ if extra_help:
+ option_values["help"] += " " + extra_help
+
+ # Create a new tuple and replace the old one in the list
+ option_list[i] = (option_key, option_values)
+ break
+
+ return option_list
+
+
+model_options = [
+ (
+ "text_use_gpu",
+ {"value": True, "type": bool, "help": "Load the text model on the GPU."},
+ ),
+ (
+ "text_use_small",
+ {"value": False, "type": bool, "help": "Use a smaller/faster text model."},
+ ),
+ (
+ "coarse_use_gpu",
+ {"value": True, "type": bool, "help": "Load the coarse model on the GPU."},
+ ),
+ (
+ "coarse_use_small",
+ {"value": False, "type": bool, "help": "Use a smaller/faster coarse model."},
+ ),
+ (
+ "fine_use_gpu",
+ {"value": True, "type": bool, "help": "Load the fine model on the GPU."},
+ ),
+ (
+ "fine_use_small",
+ {"value": False, "type": bool, "help": "Use a smaller/faster fine model."},
+ ),
+ (
+ "codec_use_gpu",
+ {"value": True, "type": bool, "help": "Load the codec model on the GPU."},
+ ),
+ (
+ "force_reload",
+ {
+ "value": True,
+ "type": bool,
+ "help": "Force the models to be moved to the new device or size.",
+ },
+ ),
+]
+
+if generation.SUNO_HALF_PRECISION:
+ model_options = update_option(
+ model_options, "coarse_use_small", True, "(Default ON because of SUNO_HALF_PRECISION)"
+ )
+
+
+def preload_models_gradio(
+ text_use_gpu,
+ text_use_small,
+ coarse_use_gpu,
+ coarse_use_small,
+ fine_use_gpu,
+ fine_use_small,
+ codec_use_gpu,
+ force_reload,
+):
+ print("Preloading models...")
+ generation.preload_models(
+ text_use_gpu=text_use_gpu,
+ text_use_small=text_use_small,
+ coarse_use_gpu=coarse_use_gpu,
+ coarse_use_small=coarse_use_small,
+ fine_use_gpu=fine_use_gpu,
+ fine_use_small=fine_use_small,
+ codec_use_gpu=codec_use_gpu,
+ force_reload=force_reload,
+ )
+
+
+def cleanup_after_cancel():
+ global cancel_process
+
+ # put all the models on the right device
+ generation.preload_models(
+ force_reload=True,
+ )
+ # print("Fixing models...")
+
+
+def try_to_cancel(
+ text_use_gpu,
+ text_use_small,
+ coarse_use_gpu,
+ coarse_use_small,
+ fine_use_gpu,
+ fine_use_small,
+ codec_use_gpu,
+ force_reload,
+):
+ global cancel_process
+ cancel_process = True
+ api.gradio_try_to_cancel = True
+ api.done_cancelling = False
+ print("Trying to cancel...")
+
+
+# terrible b
+def generate_speaker_variations(variation_path, variation_count):
+ if variation_count is not None and variation_count != "":
+ variation_count = int(variation_count)
+ print(f"Generating {variation_count} for speakers {variation_path}...")
+
+ # should still link this as a lighter option
+ # api.render_npz_samples(npz_directory=variation_path,gen_minor_variants=variation_count)
+
+ api.doctor_random_speaker_surgery(variation_path, variation_count)
+ return
+
+
+def soundboard_directory_download(
+ soundboard_url="https://www.101soundboards.com/boards/27047-bob-ross-soundboard",
+ soundboard_directory="downloaded_sounds",
+):
+ from bark_infinity import data_utils
+
+ data_utils.fetch_and_convert_sounds(soundboard_directory, soundboard_url)
+
+ return
+
+
+def generate_sample_audio(sample_gen_path):
+ print("Generating sample audio...")
+ api.render_npz_samples(npz_directory=sample_gen_path)
+ return
+
+
+def generate_sample_audio_coarse(sample_gen_path):
+ print("Generating sample audio...")
+ api.render_npz_samples(npz_directory=sample_gen_path, start_from="coarse_prompt")
+ return
+
+
+def generate_sample_audio_semantic(sample_gen_path):
+ print("Generating sample audio...")
+ api.render_npz_samples(npz_directory=sample_gen_path, start_from="semantic_prompt")
+ return
+
+
+def set_XDG_CACHE_HOME(XDG_CACHE_HOME_textbox):
+ if XDG_CACHE_HOME_textbox is not None and XDG_CACHE_HOME_textbox != "":
+ print(f"Setting XDG_CACHE_HOME to {XDG_CACHE_HOME_textbox}")
+ os.environ["XDG_CACHE_HOME"] = XDG_CACHE_HOME_textbox
+ # this doesn't stick unless I restart so I'll just set directly
+ default_cache_dir = os.path.join(os.path.expanduser("~"), ".cache")
+ generation.CACHE_DIR = os.path.join(
+ os.getenv("XDG_CACHE_HOME", default_cache_dir), "suno", "bark_v0"
+ )
+ print(f"Setting cache dir to {generation.CACHE_DIR}")
+
+
+def clean_models_button_click():
+ generation.clean_models()
+ return
+
+
+def sent_bark_envs(
+ env_config_group,
+ loglevel,
+ save_log_lines_number,
+ XDG_CACHE_HOME_textbox,
+ text_use_gpu,
+ text_use_small,
+ coarse_use_gpu,
+ coarse_use_small,
+ fine_use_gpu,
+ fine_use_small,
+ codec_use_gpu,
+ force_reload,
+):
+ set_XDG_CACHE_HOME(XDG_CACHE_HOME_textbox)
+
+ generation.OFFLOAD_CPU = "OFFLOAD_CPU" in env_config_group
+ generation.USE_SMALL_MODELS = "USE_SMALL_MODELS" in env_config_group
+ generation.GLOBAL_ENABLE_MPS = "GLOBAL_ENABLE_MPS" in env_config_group
+
+ print(
+ f"Setting these envs: OFFLOAD_CPU={generation.OFFLOAD_CPU}, USE_SMALL_MODELS={generation.USE_SMALL_MODELS}, GLOBAL_ENABLE_MPS={generation.GLOBAL_ENABLE_MPS}"
+ )
+
+ if loglevel is not None and loglevel != "":
+ print(f"Setting log level to {loglevel}")
+ logger.setLevel(loglevel)
+
+ global save_log_lines
+ save_log_lines = save_log_lines_number
+
+ preload_models_gradio(
+ text_use_gpu,
+ text_use_small,
+ coarse_use_gpu,
+ coarse_use_small,
+ fine_use_gpu,
+ fine_use_small,
+ codec_use_gpu,
+ force_reload,
+ )
+ return f"{generation.CACHE_DIR}"
+
+
+def set_gradio_options(save_log_lines_number):
+ global save_log_lines
+ save_log_lines = save_log_lines_number
+
+ generation.OFFLOAD_CPU = OFFLOAD_CPU
+ generation.USE_SMALL_MODELS = USE_SMALL_MODELS
+ generation.GLOBAL_ENABLE_MPS = GLOBAL_ENABLE_MPS
+
+
+def output_filesystem_button(directory):
+ # can't easily read the active tab's output dir here, so force the clone dir when on the clone tab
+ if current_tab == "clone":
+ directory = "cloned_voices"
+
+ directory = os.path.join(where_am_i, directory)
+
+ if not os.path.isdir(directory):
+ print(f"Error: The directory {directory} does not exist.")
+ return
+
+ startfile(directory)
+
+
+def generate_gradio_widgets(options):
+ widgets = []
+ for option_name, option_info in options:
+ if option_info["type"] == bool:
+ checkbox = gr.Checkbox(
+ label=option_name, value=option_info["value"], info=option_info["help"]
+ )
+ widgets.append(checkbox)
+ return widgets
+
+
+generated_widgets = generate_gradio_widgets(model_options)
+
+
+def format_defaults(defaults):
+ formatted_text = ""
+ for group_name, arguments in defaults.items():
+ formatted_text += f"{group_name}:\n"
+ for key, arg in arguments:
+ formatted_text += f" {key}:\n"
+ formatted_text += f" Type: {arg['type'].__name__}\n"
+ formatted_text += f" Default: {arg['value']}\n"
+ formatted_text += f" Help: {arg['help']}\n"
+ if "choices" in arg:
+ formatted_text += f" Choices: {', '.join(map(str, arg['choices']))}\n"
+ formatted_text += "\n"
+ return formatted_text
+
+
+formatted_defaults = format_defaults(config.DEFAULTS)
+
+
+def update_speaker(option):
+ if option == "File":
+ specific_npz_file.hide = False
+ return [gr.update(visible=False)]
+
+
+# To write a Unicode code point as a Python escape, replace the "U+" prefix with "\U" and zero-pad the hex value to 8 digits, e.g. U+1F3B2 becomes "\U0001f3b2".
+# Using constants for these since the variation selector isn't visible.
+# Important that they exactly match script.js for tooltip to work.
+random_symbol = "\U0001f3b2\ufe0f" # 🎲️
+reuse_symbol = "\u267b\ufe0f" # ♻️
+paste_symbol = "\u2199\ufe0f" # ↙️
+refresh_symbol = "\U0001f504" # 🔄
+save_style_symbol = "\U0001f4be" # 💾
+apply_style_symbol = "\U0001f4cb" # 📋
+clear_prompt_symbol = "\U0001f5d1\ufe0f" # 🗑️
+extra_networks_symbol = "\U0001F3B4" # 🎴
+switch_values_symbol = "\U000021C5" # ⇅
+restore_progress_symbol = "\U0001F300" # 🌀
+
+text_transformation_symbol = "\U00002728" # ✨
+apply_style_symbol = "\U0001F3A8" # 🎨
+
+
+def create_refresh_button(refresh_component, refresh_method, refreshed_args, elem_id):
+ def refresh():
+ refresh_method()
+ args = refreshed_args() if callable(refreshed_args) else refreshed_args
+
+ for k, v in args.items():
+ setattr(refresh_component, k, v)
+
+ return gr.update(**(args or {}))
+
+ refresh_button = ToolButton(value=refresh_symbol, elem_id=elem_id)
+ refresh_button.click(fn=refresh, inputs=[], outputs=[refresh_component])
+ return refresh_button
+
+
+def apply_styles(prompt, styles):
+ prompt = prompt_styles.apply_styles_to_prompt(prompt, styles)
+
+ return [gr.Textbox.update(value=prompt), gr.Dropdown.update(value=[])]
+
+
+def apply_transformations(prompt, styles):
+ prompt = prompt_transformations.apply_transformations_to_prompt(prompt, styles)
+
+ return [gr.Textbox.update(value=prompt), gr.Dropdown.update(value=[])]
+
+
+def trim_logs():
+ global save_log_lines
+ # print(f"Trimming logs to {save_log_lines} lines...")
+ save_log_lines = int(save_log_lines)
+
+ if save_log_lines < 0:
+ return
+
+ with open("gradio_terminal_ouput.log", "r", encoding="utf-8") as f:
+ lines = f.readlines()
+
+ if save_log_lines > 0 and len(lines) > save_log_lines:
+ lines = lines[-save_log_lines:]
+
+ with open("gradio_terminal_ouput.log", "w", encoding="utf-8") as f:
+ f.writelines(lines)
+
+
+def get_refresh_gpu_report():
+ full_gpu_report = api.gpu_status_report()
+ # full_gpu_report += api.gpu_memory_report()
+ return full_gpu_report
+
+
+with gr.Blocks(theme=default_theme, css=bark_console_style, title="Bark Infinity") as demo:
+ gr.Markdown(
+ """
+ # 🐶 Bark Infinity 👨‍🔬🧬 👯‍♀️ https://github.com/JonathanFly/bark
+ """
+ )
+
+ with gr.Tabs(elem_id="main_top_ui_tabs") as main_top_tabs_block:
+ with gr.Tab(
+ "๐งโ๐ค Generate Audio", elem_id="main_tabs_generate_audio"
+ ) as generate_audio_main_tab:
+ with gr.Row():
+ with gr.Column(variant="primary", scale=1):
+ with gr.Row():
+ with gr.Column(variant="panel", scale=1):
+ gr.Markdown("## ๐ง๐ Main Bark Input - What to Say")
+
+ with gr.Tab(
+ "Text Prompts", elem_id="text_prompts_tab"
+ ) as text_prompts_tab:
+ with gr.Row(elem_id=f"text_row"):
+ input = gr.TextArea(
+ placeholder="Text Prompt",
+ label="Main Text Prompt",
+ info="The main text goes here. It can be as long as you want. You will see how the text will be split into smaller chunks in the 'console' in bottom right. A whole book if you want.",
+ elem_id="main_text_prompt",
+ )
+
+ with gr.Column():
+ allow_blank = gr.Checkbox(
+ label="Allow Blank Text Prompts",
+ info="Typically you want Always Maximum Length as well.",
+ value=False,
+ )
+
+ confused_travolta_mode = gr.Checkbox(
+ label="Always Generate Maximum Length.",
+ info="(Formerly ๐บ๐บ Confused Mode) Speakers will keep talking even when they should be done. Try continuing music as well.",
+ value=False,
+ )
+
+ with gr.Row(elem_id=f"styles_row"):
+ with gr.Column(variant="panel", scale=0.5):
+ prompt_styles_dropdown = gr.Dropdown(
+ label=f"Insert A Text Snippet: {user_style_csv}",
+ info=f"([bracket] words are very hit or miss, and .npz dependent.)",
+ elem_id=f"styles",
+ choices=[k for k, v in prompt_styles.styles.items()],
+ value=[],
+ multiselect=True,
+ )
+ # create_refresh_button(prompt_styles_dropdown, prompt_styles.reload, lambda: {"choices": [k for k, v in prompt_styles.styles.items()]}, f"refresh_styles")
+ prompt_style_apply = ToolButton(
+ value=apply_style_symbol,
+ elem_id=f"style_apply",
+ )
+ # save_style = ToolButton(value=save_style_symbol, elem_id=f"style_create")
+ with gr.Column(variant="panel", scale=0.5):
+ prompt_transformations_dropdown = gr.Dropdown(
+ label=f"Modify The Text Prompt",
+ info=f"Also customized at: {user_transformation_csv}",
+ elem_id=f"transformations",
+ choices=[
+ k
+ for k, v in prompt_transformations.transformations.items()
+ ],
+ value=[],
+ multiselect=True,
+ )
+ # create_refresh_button(prompt_styles_dropdown, prompt_styles.reload, lambda: {"choices": [k for k, v in prompt_styles.styles.items()]}, f"refresh_styles")
+ prompt_transformations_apply = ToolButton(
+ value=text_transformation_symbol,
+ elem_id=f"transformation_apply",
+ )
+ # save_style = ToolButton(value=save_style_symbol, elem_id=f"style_create")
+ prompt_style_apply.click(
+ fn=apply_styles,
+ inputs=[input, prompt_styles_dropdown],
+ outputs=[input, prompt_styles_dropdown],
+ )
+
+ prompt_transformations_apply.click(
+ fn=apply_transformations,
+ inputs=[input, prompt_transformations_dropdown],
+ outputs=[input, prompt_transformations_dropdown],
+ )
+
+ with gr.Tab('Audio/Speaker "Prompts" (Experimental)'):
+ with gr.Row(elem_id=f"text_row"):
+ with gr.Column(variant="panel", scale=1):
+ gr.Markdown(
+ "Use an audio clip as the prompt, instead of text. Audio less than 14s if you want hope your speaker sounds the same. Longer audio to explore what happens."
+ )
+
+ audio_prompt_input = gr.Audio(
+ label="Audio Prompts",
+ info="Use most common audio formats",
+ source="upload",
+ type="filepath",
+ elem_classes="bark_upload_audio",
+ )
+
+ gr.Markdown(
+ "๐ฃ๏ธ Use a speaker .npz as the *prompt*, not the voice. So you can still pick a *different* different speaker.npz to actually speak. Invoking the elemental syllables of creation."
+ )
+ bark_speaker_as_the_prompt = gr.File(
+ label="Pick a Specific NPZ From Filesystem",
+ file_types=["npz"],
+ elem_classes="bark_upload_file",
+ )
+
+ with gr.Tab('"Negative Prompt" (Experimental)'):
+ with gr.Row(elem_id=f"text_row"):
+ with gr.Column(variant="panel", scale=1):
+ gr.Markdown(
+ """## Negative Prompts and Anti Speakers are Work in Progress, currently not operational**. """
+ )
+ gr.Markdown(
+ "(These settings will do something, but not what you or anyone wants.)"
+ )
+
+ gr.Markdown(
+ """ (Try Semantic Inverted-P under Experimental Sampling, that works and is pretty fun.)"""
+ )
+
+ negative_text_prompt = gr.TextArea(
+ placeholder="Negative Text Prompt",
+ label="Negative Main Text Prompt",
+ info='I\'m not sure a "negative" prompt even makes sense in this model. But it sounds fun.',
+ elem_id="negative_text_prompt",
+ )
+ negative_text_prompt_divergence_scale = gr.Slider(
+ label="Negative Text Prompt Divergence Scale",
+ info="0 means the negative prompt divergence no impact, while a value of 1 would allow full impact.",
+ minimum=0.0,
+ maximum=2.0,
+ value=0.0,
+ interactive=True,
+ )
+ negative_text_prompt_logits_scale = gr.Slider(
+ label="Negative Text Prompt Scale",
+ info="0 means the negative prompt has no influence, 1 means the negative prompt has maximum influence.",
+ minimum=0.0,
+ maximum=2.0,
+ value=0.0,
+ interactive=True,
+ )
+
+ gr.Markdown(
+ """Experimental doesn't really cover it -- what does 'working correctly' look like for negative text prompt in a text to speech model? Anyone have suggestions? I'm thinking something like: a negative prompt \"I'm screaming and I hate you!!!\" makes Bark more inclined to generate quieter, friendly speech."""
+ )
+
+ gr.Markdown(
+ """I've been noodling with the idea of allowing the text prompt (or the voice prompt) to change mid generation. So partway through the audio file being generated, Bark clones off the current state and rewrites a designed part of the model context. It would probably be a bit in the past so the audio wouldn't clip, for example, maybe just the audio segment between 2 and 4 seconds previously. I'm not sure this enables anything useful, but a similar concept is very powerful in Stable Diffusion prompts so it may be worth exploring. At the very least it should let you use multiple .npz files in a prompt, switching as needed, and generate audio clips that are at least sound connected, even if not very coherent."""
+ )
+
+ with gr.Column(scale=1, variant="panel"):
+ m("## ๐งโ๐ค Bark Speaker.npz - Who Says It")
+
+ with gr.Tab("Simple"):
+ gr.Markdown("## ๐ฑ๐๏ธ Create A New Voice With Bark")
+ m(
+ "***Create a new voice.*** It's largely random but your text prompt will influence the voice."
+ )
+ with gr.Row():
+ with gr.Column(scale=1, elem_classes="tiny_column"):
+ force_random_speaker = gr.Checkbox(
+ label="๐ฒ Random Voice", value=False
+ )
+ with gr.Column(scale=3, elem_classes="tiny_column"):
+ m(
+ "You'll default to a random speaker if you don't select one. Check \"Save Every NPZ\" if you're actively looking for a voice."
+ )
+
+ gr.Markdown("""## ๐งโ๐ค ***OR:*** Choose An Existing Voice""")
+
+ with gr.Row():
+ with gr.Column(scale=3, elem_classes="tiny_column"):
+ npz_dropdown = create_npz_dropdown(
+ voice_directories,
+ label="๐งโ๐ค Built In Voice",
+ info="Speakers provided by Suno-ai, in many languages. The v2 ones are good for a basic clear voice. Also some I accidentally left in the github repo. Better ones incoming.",
+ )
+ with gr.Column(scale=1, elem_classes="tiny_column"):
+ specific_npz_file = gr.File(
+ label="Any .NPZ File",
+ file_types=["npz"],
+ elem_classes="bark_upload_file",
+ show_label=True,
+ elem_id="specific_npz_file",
+ )
+
+ with gr.Tab("Advanced"):
+ with gr.Row():
+ with gr.Tab("๐ต๐ An Audio Sample"):
+ gr.Markdown("A Quick Voice Clone. Or A Song Continued.")
+ audio_file_as_history_prompt = gr.Audio(
+ label="Create a Speaker From An Audio File + Text Prompt",
+ info="",
+ source="upload",
+ type="filepath",
+ elem_classes="bark_upload_audio",
+ )
+
+ gr.Markdown(
+ "Bark will try and clone your audio clip, then the clone will be used as your speaker.npz and will speak the prompt. You will have two new voice .npzs after. MAIN.npz is just from the original audio. And others are saved after speaking the prompt. (Typically it improves the quality.) Try cloning music or sounds. Serious clones have a second tab."
+ )
+
+ bark_infinity_voices = gr.Textbox(visible=False)
+
+ with gr.Tab("๐ฅ๐ Your Creations"):
+ gr.Markdown(
+ """#### ๐ฅ Choose from your Cloned Voices Directory"""
+ )
+
+ generated_voices = gr.Textbox(visible=False)
+ cloned_voices = create_npz_dropdown(
+ "cloned_voices/",
+ label="Cloned Voices",
+ info="Clones you tried to make. This is just a temporary UI, we're gonna need a serious upgrade to select, organize, and rank numerous clones.",
+ )
+
+ gr.Markdown(
+ """#### Generate audio for every .npz voice in a directory. (For clone tests, also check "Save Every NPZ".)"""
+ )
+
+ specific_npz_folder = gr.Textbox(
+ label=f"๐ A directory containing .npz files. Each one will generate the prompt.",
+ info=f"(The full directory path or from {where_am_i}/",
+ value="",
+ placeholder=f"Directory name or path.",
+ )
+
+ with gr.Tab("Anti-Speaker (Experimental)"):
+ with gr.Row():
+ gr.Markdown(
+ "Anti Speaker. Use a voice the least like this one? Another concept I'm no sure even makes sense in this model. What is the opposite of a voice? I just did the simplest possible version for now."
+ )
+
+ specific_npz_file_negative_prompt = gr.File(
+ label="Any .NPZ File, Negative Speaker",
+ file_types=["npz"],
+ elem_classes="bark_upload_file",
+ show_label=True,
+ elem_id="specific_npz_file_negative_prompt",
+ )
+
+ gr.Markdown(
+ """Similar questions as the negative text prompt. If you have a nice clear voice with no background as the anti-speaker get a noisy voice with background sounds in your final output audio? That's logical, but probably annoying right? Ideally this is more about higher level features?"""
+ )
+
+ with gr.Row():
+ with gr.Column(scale=3, elem_classes="tiny_column"):
+ selected_npz_file = gr.Textbox(
+ label=f"",
+ info=f"๐งโ๐ค Selected Voice. (Will need more than one soon.)",
+ visible=True,
+ show_label=False,
+ elem_id=f"selected_npz_file",
+ interactive=False,
+ )
+
+ speaker_preview_audio = gr.Audio(
+ label="",
+ show_label=False,
+ type="filepath",
+ elem_classes="bark_output_audio",
+ elem_id="speaker_preview_audio",
+ interactive=False,
+ visible=False,
+ )
+
+ selected_npz_file_full = gr.Textbox(
+ label=f"",
+ info=f"",
+ visible=False,
+ show_label=False,
+ elem_id=f"selected_npz_file_full",
+ )
+ specific_npz_file.change(
+ get_filename,
+ inputs=[specific_npz_file],
+ outputs=[
+ selected_npz_file,
+ selected_npz_file_full,
+ speaker_preview_audio,
+ ],
+ )
+
+ audio_file_as_history_prompt.change(
+ get_filename,
+ inputs=[audio_file_as_history_prompt],
+ outputs=[
+ selected_npz_file,
+ selected_npz_file_full,
+ speaker_preview_audio,
+ ],
+ )
+
+ npz_dropdown.change(
+ get_filename,
+ inputs=[npz_dropdown],
+ outputs=[
+ selected_npz_file,
+ selected_npz_file_full,
+ speaker_preview_audio,
+ ],
+ )
+
+ # speaker_selection = gr.Textbox(label="Speakers Selected", lines=1, placeholder='', value='', info="")
+ """
+ with gr.Column(variant="panel",scale=0.25):
+ m("## ...")
+ #speaker_selection = gr.Textbox(label="Speakers Selected", lines=1, placeholder='', value='Random Speaker', info="")
+ """
+
+ with gr.Accordion(
+ "โถ Detailed Audio Options (Click to Toggle)",
+ open=True,
+ elem_classes="generate_options_row",
+ elem_id="generate_options_row_id",
+ ):
+ with gr.Row():
+ with gr.Column(variant="panel", scale=1):
+ m("## โ๏ธ Splitting Up Long Text")
+
+ with gr.Tab("Simple"):
+ m(
+ "Try to aim about 10s per audio clip. It's fine to leave these on defaults. "
+ )
+ split_character_goal_length = gr.Slider(
+ label="Try for this many characters in each",
+ value=165,
+ maximum=500,
+ step=1,
+ )
+ split_character_max_length = gr.Slider(
+ label="But never go higher than this many",
+ value=205,
+ maximum=500,
+ step=1,
+ )
+
+ with gr.Tab("Advanced"):
+ prompt_text_prefix = gr.Textbox(
+ label="Put this text **in front** of every text segment, after splitting.",
+ value="",
+ )
+ prompt_text_suffix = gr.Textbox(
+ label="Put this text **after** every text segment, after splitting.",
+ value="",
+ )
+ split_character_jitter = gr.Slider(
+ label="Randomize character splits by this much",
+ info="If you're generating a lot of iterations you might try randomizing the splits a bit with this.",
+ value=0,
+ maximum=500,
+ step=1,
+ )
+ m(
+ "Below is mostly placeholder. But these old functions still sort of work:"
+ )
+ m(
+ "For example for song lyrics, in the below 3 boxes pick: `line` then `4` then `line` this will split the text in groups of 4 lines each."
+ )
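+ # Illustrative sketch of the "line / 4 / line" example above (a hypothetical
+ # helper, not the actual splitter used internally):
+ #   lines = [l for l in text.splitlines() if l.strip()]
+ #   chunks = ["\n".join(lines[i : i + 4]) for i in range(0, len(lines), 4)]
+ # Each chunk would then be generated as its own audio clip.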
+ process_text_by_each = gr.Dropdown(
+ [
+ "word",
+ "line",
+ "sentence",
+ "char",
+ "string",
+ "random",
+ "regex",
+ ],
+ label="Process the text in chunks of:",
+ value=None,
+ # multiselect=True,
+ # max_choices=1,
+ )
+ group_text_by_counting = gr.Dropdown(
+ [
+ "word",
+ "line",
+ "sentence",
+ "char",
+ "string",
+ "random",
+ "regex",
+ ],
+ label="Group the text by counting:",
+ value=None,
+ # multiselect=True,
+ # max_choices=1,
+ )
+ in_groups_of_size = gr.Slider(
+ label="And start a new audio clip with you have this many:",
+ minimum=1,
+ maximum=50,
+ step=1,
+ value=None,
+ )
+
+ split_type_string = gr.Textbox(
+ label="(Optional String for string or regex.)",
+ value="",
+ )
+
+ text_splits_only = gr.Checkbox(
+ label="๐บ๏ธโ๏ธ No audio, just show me text splits.",
+ value=False,
+ )
+
+ with gr.Column(variant="panel", scale=1):
+ m("## ๐ Connecting Audio Segments")
+ with gr.Tab("Simple"):
+ m(
+ "#### Bark generates 14s audio clips by default.\n Each clip will be joined together to create longer audio."
+ )
+
+ stable_mode_interval = gr.Dropdown(
+ [
+ "Continuous",
+ "Stable",
+ "Stable-2",
+ "Stable-3",
+ "Stable-4",
+ "Stable-5",
+ ],
+ label="How to Join Clips",
+ info="",
+ value="Stable",
+ )
+
+ m(
+ """ - ***Stable*** for reliable long clips.
+ - For now, stick with ***Stable*** unless you are exploring.
+ - ***Continuous*** means each clip acts like the voice for the following clip.
+ - Very smooth, but voices will change quite a bit after even 20 or 30 seconds.
+ - (coming soon, stable and smooth.)"""
+ )
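+ # My reading of the joining modes above (an assumption, may not match the
+ # backend exactly):
+ #   Continuous: each generated clip becomes the history prompt for the next clip.
+ #   Stable:     every clip is conditioned on the original speaker .npz.
+ #   Stable-N:   presumably re-anchors to the original speaker every N clips.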
+
+ with gr.Tab("Advanced"):
+ add_silence_between_segments = gr.Slider(
+ label="Add Silence",
+ minimum=0.0,
+ maximum=5.0,
+ value=0.0,
+ interactive=True,
+ info="Try 0.25 if using 'Stable' mode to space it out a bit.",
+ )
+ m("### More Advanced Joining Coming...")
+
+ """
+ m("### Enlarge or clip histories. Not in this version yet.")
+ history_prompt_semantic_weight = gr.Slider(label="History Prompt Semantic Weight", minimum=0.0, maximum=2.0, value = 1.0, interactive = True)
+ history_prompt_coarse_weight = gr.Slider(label="History Prompt Coarse Weight", minimum=0.0, maximum=2.0, value = 1.0, interactive = True)
+ history_prompt_fine_weight = gr.Slider(label="History Prompt Fine Weight", minimum=0.0, maximum=2.0, value = 1.0, interactive = True)
+
+ prev_semantic_weight = gr.Slider(label="Prev Semantic Weight", minimum=0.0, maximum=2.0, value = 1.0, interactive = True)
+ prev_coarse_weight = gr.Slider(label="Prev Coarse Weight", minimum=0.0, maximum=2.0, value = 1.0, interactive = True)
+ prev_fine_weight = gr.Slider(label="Prev Fine Weight", minimum=0.0, maximum=2.0, value = 1.0, interactive = True)
+ """
+
+ with gr.Tab("Experimental"):
+ m(
+ """### Don't Connect Audio Segments \n
+ Split the text normally. But ***use a random speaker*** for each segment."""
+ )
+ m("Good for discovering speakers.")
+ separate_prompts = gr.Checkbox(
+ label="Separate Prompts",
+ value=False,
+ interactive=True,
+ visible=True,
+ )
+
+ m(
+ "When using ***Separate Prompts*** keep the newly created voice the same for the next segment. This gives you an accurate sample for each random voice."
+ )
+ separate_prompts_flipper = gr.Checkbox(
+ label="Separate Prompts, but do one generation",
+ value=False,
+ interactive=True,
+ visible=True,
+ )
+
+ with gr.Column(variant="panel", scale=1):
+ m("## ๐ฃ๏ธ Generation (Sampling)")
+
+ with gr.Tab("Simple"):
+ semantic_min_eos_p = gr.Slider(
+ label="Clip Length Chance",
+ minimum=0.0,
+ maximum=1.0,
+ value=0.2,
+ interactive=True,
+ info="Getting extra words? Try 0.10 or 0.05.",
+ )
+ m(
+ """#### ๐ก๏ธ Temperature: โฌ๏ธ = more diverse, โฌ๏ธ = more conservative"""
+ )
+
+ text_temp = gr.Slider(
+ label="text temperature ๐ก๏ธ: ",
+ info="'text' is about clip 'content'",
+ minimum=0.000,
+ maximum=2.0,
+ value=0.70,
+ interactive=True,
+ )
+ waveform_temp = gr.Slider(
+ label="wave temperature ๐ก๏ธ: ",
+ info="'wave' is about detailed sound",
+ minimum=0.000,
+ maximum=2.0,
+ value=0.50,
+ interactive=True,
+ )
+
+ with gr.Tab("Advanced"):
+ seed = gr.Number(
+ label="Seed",
+ info="Leave 0 for random. Set -1 to restore random. Using a seed slows generation time.",
+ value=0,
+ )
+ m(
+ """Sampling parameters which should have an impact. So far hard to say."""
+ )
+ semantic_top_k = gr.Slider(
+ label="semantic_top_k",
+ value=100,
+ minimum=0,
+ maximum=1000,
+ step=1,
+ )
+ semantic_top_p = gr.Slider(
+ label="semantic_top_p",
+ value=0.95,
+ minimum=0.0,
+ maximum=1.0,
+ )
+ coarse_top_k = gr.Slider(
+ label="coarse_top_k",
+ value=100,
+ minimum=0,
+ maximum=1000,
+ step=1,
+ )
+ coarse_top_p = gr.Slider(
+ label="coarse_top_p",
+ value=0.95,
+ minimum=0.0,
+ maximum=1.0,
+ )
+
+ with gr.Tab("Experimental"):
+ m(
+ """***Token Repetition Penalty*** tends to make speakers talk faster. If you set it just a little bit over 1.0, it may slow them down. """
+ )
+ semantic_token_repeat_penalty = gr.Slider(
+ label="Token Repetition Penalty",
+ info="Every time a token is generated, make the token this many times likely to appear again. So 0.5 is half as likely every time. 1.1 is 10% more likely. Set to 0 to disable.",
+ minimum=0.000,
+ maximum=2.0,
+ value=0.0,
+ interactive=True,
+ )
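+ # Sketch of the penalty described in the info text above (an assumption about
+ # the exact mechanism): each time a token is sampled, its future probability
+ # is multiplied by the penalty, e.g.
+ #   probs[token] *= semantic_token_repeat_penalty  # 0.5 halves it each time, 1.1 boosts it ~10%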
+ m(
+ """***Semantic Inverted-P*** has a narrow and fiddly range, but it makes very interesting speech patterns and samples within the useful range. It's very speaker dependent, could be as low as 0.25, as high as 0.80 or more."""
+ )
+ semantic_inverted_p = gr.Slider(
+ label="Semantic Inverted-P",
+ info="Inverted Sampling: With negative top-p, instead of selecting from the *top* tokens until we reach a cumulative probability of top_p, select from the *least* probable tokens, until a cumulative probability of inverted_p. Set to 0 to disable.",
+ value=0.0,
+ minimum=0.0,
+ maximum=1.0,
+ interactive=True,
+ )
+
+ semantic_bottom_k = gr.Slider(
+ label="Semantic Bottom K",
+ info="Set to 0 to disable.",
+ value=0,
+ minimum=0,
+ maximum=1000,
+ step=1,
+ )
+
+ m(
+ """Inverted-P overrides top_p, and bottom_k overrides top_k. But you can use inverted p and regular k together, or vice versa."""
+ )
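+ # Minimal sketch of the inverted-p idea described above (illustrative only,
+ # not the actual implementation):
+ #   order = sorted(range(len(probs)), key=lambda t: probs[t])  # least probable first
+ #   keep, total = [], 0.0
+ #   for tok in order:
+ #       keep.append(tok); total += probs[tok]
+ #       if total >= semantic_inverted_p:
+ #           break
+ #   # renormalize over `keep` and sample from that reduced set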
+ m(
+ """I'm not sure I left Mirostat in a working state. The effect of Mirostat, if it was ever working, is supposed to be fairly subtle despite the term 'surprise factor' it really just means perplexity and it is trying to have higher quality output, not 'shocking' or 'surprising'. These settings still change the output so they are doing *something*. With mirostat you can try temperatures above 1.0, it should bring the output back into normal range. Surprise should not be at 40 so it's not right, but lower values were getting a lot of silence. """
+ )
+ semantic_use_mirostat_sampling = gr.Checkbox(
+ label="Use Semantic Mirostat Sampling",
+ info="",
+ value=False,
+ )
+
+ semantic_mirostat_tau = gr.Slider(
+ label="Semantic Surprise Factor (Mirostat Tau)",
+ info="",
+ minimum=0.000,
+ maximum=100,
+ value=40.0,
+ step=0.1,
+ interactive=True,
+ )
+
+ semantic_mirostat_learning_rate = gr.Slider(
+ label="Semantic Mirostat Learning Rate",
+ info="",
+ minimum=0.000,
+ maximum=2.0,
+ value=0.75,
+ interactive=True,
+ )
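+ # For reference, the standard Mirostat loop these knobs are named after (the
+ # local implementation may differ): after sampling a token with probability p,
+ #   surprise = -log2(p)
+ #   mu -= learning_rate * (surprise - tau)
+ # and the next step only samples from tokens whose surprise is below mu.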
+
+ with gr.Column(variant="panel", scale=1):
+ m("## ๐Final Output")
+ with gr.Tab("Simple"):
+ hoarder_mode = gr.Checkbox(
+ label="๐๐Save Every NPZ",
+ info="Every time Bark generates audio, the voice becomes a little different by the end of the clip. You can tweak a voice this way if you save every version. Try speaking a large amount of text, the new version will speak faster.",
+ value=False,
+ )
+ output_dir = gr.Textbox(
+ label="Output directory", value="bark_samples"
+ )
+ clone_output_dir = gr.Textbox(
+ label="Output directory",
+ value="cloned_voices/",
+ visible=False,
+ )
+
+ output_iterations = gr.Slider(
+ label="Repeat This Many Times",
+ step=1,
+ value=1,
+ minimum=1,
+ maximum=100,
+ )
+ with gr.Tab("Advanced"):
+ output_filename = gr.Textbox(
+ label="Output filename",
+ value="",
+ info="Use prompt, speaker, and date if left blank.",
+ )
+
+ output_format = gr.Dropdown(
+ ["wav", "mp3", "ogg", "flac", "mp4"],
+ value="mp3",
+ label="Audio File Output Format",
+ info="(You can re-render wavs if you save .npzs)",
+ )
+
+ with gr.Row():
+ with gr.Column(scale=1):
+ generate_button = gr.Button("Generate Audio", variant="primary")
+
+ with gr.Column(scale=1):
+ cancel_button = gr.Button(
+ "Cancel (Hit once, it finishes current stage.)",
+ label="",
+ variant="stop",
+ )
+
+ with gr.Tab(
+ "๐จโ๐ฌ๐งฌ Clone A Voice",
+ elem_id="main_tabs_cloning",
+ ) as clone_main_tab:
+ # Model Developed by from https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer
+ with gr.Row():
+ gr.Markdown("## ๐จโ๐ฌ๐งฌ Clone a Voice")
+ gr.Markdown(
+ "### (Under the hood: [gitmylo's Hubert Model](https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer) )"
+ )
+
+ with gr.Row():
+ with gr.Column(scale=5, variant="panel"):
+ gr.Markdown("### All you need for voice clone 1๏ธโฃ2๏ธโฃ3๏ธโฃ ")
+ with gr.Column(scale=1):
+ gr.Markdown("### 1๏ธโฃ Select Audio Sample For Voice Clone:")
+ with gr.Column(scale=1):
+ input_audio_filename = gr.Audio(
+ label="Audio Sample For Voice Clone",
+ info="As short as 10 seconds or as long as five minutes. Noise reduction helps a lot.",
+ source="upload",
+ type="filepath",
+ elem_classes="bark_upload_audio",
+ )
+
+ initialname = "New_Voice_Clone"
+ gr.Markdown("### 2๏ธโฃ Name Your Voice Clone")
+ output_voice = gr.Textbox(
+ label="Voice Clone Name",
+ lines=1,
+ placeholder=initialname,
+ value=initialname,
+ info="You find the clones .npz in cloned_voices/clone_name/",
+ )
+
+ gr.Markdown(
+ "### 3๏ธโฃ (Optional) Write one or two good text prompts that capture the speaking style of the voice clone. You don't need to do this but it helps. You can use the main text input and splitting functions."
+ )
+
+ gr.Markdown(
+ "Words can hear in your head. Consider additional commas or nontraditional word spelling if the rhythm or pronunciation is especially unique."
+ )
+
+ clone_prompt_1 = gr.Textbox(
+ lines=1,
+ placeholder="Text clearly in the style of your clone.",
+ label="3๏ธโฃ One Clone Short Text Prompt",
+ info="Maybe a sentence or two. Feel free to experiment.",
+ visible=False,
+ )
+
+ clone_prompt_2 = gr.Textbox(
+ lines=2,
+ placeholder="Text clearly in the style of your clone.",
+ label="3๏ธโฃ One Clone Long Text Prompt",
+ info="At least 2 sentences, 3 or 4 is better, as long as it is still reasonable to say everything in less than 14 seconds.",
+ visible=False,
+ )
+
+ gr.Markdown(
+ "The text prompts will use the standard settings on the main tab, so if you want to tweak temperature or anything, go ahead. You can even use very long text tor multiple iterations. If your text prompts have having terrible results change them up totally."
+ )
+
+ with gr.Column(scale=5, variant="panel"):
+ with gr.Tab("Cloning Help"):
+ gr.Markdown("## The basic process:")
+
+ gr.Markdown(
+ """
+ 1. Create voice clones from your original audio sample.
+ 2. For each clone, use Bark to have the clone speak a text sample. (Choose something in the style of the clone.)
+ 3. Save the clone again after that sample. While this changes the voice, it can also improve it, so for a really good clone you typically need a lucky generation that improves the clone without changing it.
+ 4. *Text can matter a lot*. Try to find a few decent clones and set those aside. Those are the ones to try lots of different text with, to get a really good clone.
+ 5. It may be worth trying very different sampling parameters. In particular, try zeroing out all the top_k and top_p values if you aren't getting good results."""
+ )
+
+ gr.Markdown(
+ """Use audio as long or short audio you like, but for now stick to a few minutes at most, for memory reasons. It's typically better if your audio has a natural pause at the end, but not absolutely necessary. Update: Long clips work a lot better now."""
+ )
+
+ gr.Markdown(
+ """Presently, longer audio is not being used to train a model or referenced as a whole. Instead you will get a speaker created every every few seconds in that audio. Effectively this is what you would have gotten if had cut up a long clip pieces. (It is a little better, the clips overlap instead of simply split.) (*Update*: It's quite a bit better now. Try 3 to 6 minutes of clear voice samples.)"""
+ )
+
+ gr.Markdown(
+ """A natural pause at the end of a short clip is ideal. You will fine some clones named MAIN, these are the ones that use the end of the clip and are the most likely better quality.
+ \n *Noise Reduction* is extremely helpful. You want a a clear audio sample of a single person speaking. Though it's not completely clear cut. You may want to try both noise reduced and non noised audio. I have found some noisy voices that are noisy ins a very distinctive way (background chatter of a particular TV show for example) may actually help define the voice for the Bark.
+ \n (For creative use, use music or anything at all.)"""
+ )
+
+ gr.Markdown(
+ """If you get an error switching between cloning and generation, click the preload models button in the Model Options tab. There's something I missed cleaning up after switching."""
+ )
+
+ with gr.Tab("Extra/Future Options"):
+ gr.Markdown("""### ๐กโก๏ธ๐งช Some Extra, Mostly Future """)
+
+ gr.Markdown(
+ """I pulled the weirder stuff for now - everyone was confused on just using the UI. We'll get starter clones going for everyone first, maybe add complexity later if it can't be easily automated"""
+ )
+
+ gr.Markdown("Directory of wav files to use as inputs.")
+ audio_filepath_directory = gr.Textbox(
+ label="Voice Clone Directory",
+ lines=1,
+ placeholder="",
+ value="",
+ info=f"Relative to: {where_am_i} or absolute path.",
+ )
+
+ simple_clones_only = gr.Checkbox(
+ label="Just use the end of the audio clip (or clips) as the voice clone.",
+ info="You will get one clone per audio file with this option",
+ value=False,
+ )
+
+ gr.Markdown("""#### ๐ถ๐ซ๏ธ๐โ๐ฆบ Create Extra Blurry Clones.""")
+ extra_blurry_clones = gr.Checkbox(
+ label="๐ถ๐ซ๏ธ๐โ๐ฆบ Extra Blurry Clones. Not so useful for accuracy but often creates nice new voices.",
+ info="(This clone is only passed the coarse model, not the fine model.)",
+ value=False,
+ )
+
+ gr.Markdown("""#### Create Extra Foreign Clones ๐งฌ๐ฏโโ๏ธ๐ฏโโ๏ธ""")
+ even_more_clones = gr.Checkbox(
+ label="Extra Foreign Clones ๐งฌ๐ฏโโ๏ธ๐ฏโโ๏ธ",
+ info="Create about twice as many total clones by also using the Polish voice cloning model. Much worse for English voices but the clones aren't *identical* so one could be better. (They tend to have accents.)",
+ value=False,
+ )
+
+ gr.Markdown("""(The last two checkboxes stack.""")
+
+ speaker_as_clone_content = gr.File(
+ label="Throw a copy of a good clone into the mix.",
+ file_types=["npz"],
+ elem_classes="bark_upload_file",
+ )
+
+ gr.Markdown("""Secondary Audio Sample For Cloning:""")
+ gr.Markdown(
+ """Secondary audio file, generally between 7 and 13 seconds, but longer can be okay. Try to choose the most iconic clips. Using this field activated a bunch of randomization that takes a long time and generates a lot of clones. I thought it didn't work, but I have heard from some people it did *sometimes* make a really nice clone."""
+ )
+
+ input_audio_filename_secondary = gr.Audio(
+ label="Secondary Audio File",
+ info="Use most common audio formats",
+ source="upload",
+ type="filepath",
+ elem_classes="bark_upload_audio",
+ )
+
+ gr.Markdown(
+ """(Clone Blender. Throw in your favorites, hopes something better comes out.) (Not yet operational.)"""
+ )
+
+ # speaker_as_clone_content = gr.Slider(label="Space between audio clones segments in the files", info="If you've only got a short sample or you feel like you just just barely missing a good voice, you can try lower values. On the default each speak already overlaps a lot. For very long clips, very high numbers will just take a few samples.", step=1, value=164, maximum=10000, minimum=32, interactive=False)
+
+ gr.Markdown(
+ "The prompts a bit skinny by default to and get some diversity over a clip."
+ )
+
+ # even_more_clones = gr.Slider(label="Just give me more clones. ๐ฑ๐กโก๏ธ๐งช๐งฌ๐ฏโโ๏ธ๐ฏโโ๏ธ Yo'll get more clones, but they will not be very dgood. But sometimes you get lucky. Very slow, just going 1 to 2 will take a few times longer.", step=1, value=1, maximum=5, minimum=1)
+
+ gr.Markdown(
+ """Make sure you put text in the main text prompt for your samples. Take time to get text that is has the style and rhythm the voice you want to tclnoe, it will save after each sample, they often work well as clones."""
+ )
+
+ with gr.Row():
+ clone_voice_button = gr.Button(
+ "Begin Generating Voice Clones",
+ variant="primary",
+ elem_id="cloning",
+ )
+ dummy = gr.Text(label="Cloning Progress...")
+
+ with gr.Tab("๐๐ Settings", elem_id="main_tabs_settings") as settings_tab:
+ with gr.Row():
+ with gr.Column(scale=1, variant="panel"):
+ gr.Markdown(
+ """## ๐ถ Bark Model Options
+ ### Three Bark Models: ***text***, ***coarse***, and ***fine***.
+ Each model can run on GPU or CPU, each has a small version.\n
+ You can mix and GPU and CPU, small and large.\n
+ Recommend using large ***text*** even if it must be onCPU.\n
+ For speed, try just small ***coarse*** - it's the slowest model."""
+ )
+ model_checkboxes = generate_gradio_widgets(model_options)
+
+ env_config_vars = [
+ "OFFLOAD_CPU",
+ "USE_SMALL_MODELS",
+ "GLOBAL_ENABLE_MPS",
+ ]
+ env_config_values = ["OFFLOAD_CPU", "", ""]
+ gr.Markdown("### ๐ถ Bark Environment Variables")
+ env_config_group = gr.CheckboxGroup(
+ choices=env_config_vars,
+ value=env_config_values,
+ label="Set GLOBAL_ENABLE_MPS for Apple M1",
+ type="value",
+ interactive=True,
+ visible=True,
+ )
+
+ # model_button = gr.Button("Preload Models Now")
+ # model_button.click(preload_models_gradio, inputs=model_checkboxes)
+
+ with gr.Column(scale=3, variant="panel"):
+ gr.Markdown("## Bark Infinity Options")
+ with gr.Row():
+ with gr.Column(scale=4):
+ gr.Markdown(
+ """You can use all large models on a GPU with 6GB GPU and OFFLOAD_CPU, and it's almost as fast.
+ If you only have 4GB of GPU memory you have two options:
+ 1. text_use_gpu = False, and use the CPU for the text model. (Recommended.)
+ 2. use_small_models = True, and use the small text model."""
+ )
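+ # The same trade-offs can also be chosen at launch time via the command-line
+ # flags defined at the bottom of this file, e.g.:
+ #   python bark_webui.py --use_small_models
+ #   python bark_webui.py --no_offload_cpu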
+
+ def get_model_dir():
+ return generation.CACHE_DIR
+
+ def get_XDG_CACHE_HOME():
+ return os.getenv("XDG_CACHE_HOME")
+
+ XDG_CACHE_HOME_textbox = gr.Textbox(
+ label="Bark Model Download Directory",
+ value=get_XDG_CACHE_HOME(),
+ interactive=True,
+ )
+ model_dir_text = gr.Textbox(
+ label="(Final Path Will Be)",
+ value=get_model_dir(),
+ interactive=False,
+ )
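+ # XDG_CACHE_HOME can also be set before launch instead of through the UI
+ # (the path below is only an example):
+ #   XDG_CACHE_HOME=/path/to/cache python bark_webui.py
+ # or equivalently: python bark_webui.py --xdg_cache_home /path/to/cache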
+
+ with gr.Column(scale=2):
+ gr.Markdown(""" ## ๐จโ๐ป GPU and Model Info Dumps ๐ฉโ๐ป""")
+ gpu_report = gr.TextArea(
+ f"{get_refresh_gpu_report()}",
+ label="""(Don't worry about this, it's for fixing problems.)""",
+ max_lines=6,
+ )
+ refresh_gpu_report = gr.Button(
+ "Refresh GPU Status", elem_id="refresh_gpu_report"
+ )
+ refresh_hugging_face_cache_report = gr.Button(
+ "Hugging Face Model Cache Info Dump",
+ elem_id="refresh_hugging_face_cache_report",
+ )
+
+ run_numpy_benchmark = gr.Button(
+ "Run Numpy and MKL CPU Benchmark",
+ elem_id="run_numpy_benchmark",
+ )
+ refresh_gpu_report.click(
+ get_refresh_gpu_report,
+ inputs=None,
+ outputs=[gpu_report],
+ queue=None,
+ )
+ refresh_hugging_face_cache_report.click(
+ api.hugging_face_cache_report,
+ inputs=None,
+ outputs=[gpu_report],
+ queue=None,
+ )
+ run_numpy_benchmark.click(
+ debug.numpy_benchmark,
+ inputs=None,
+ outputs=[gpu_report],
+ queue=None,
+ )
+
+ with gr.Row():
+ with gr.Column(scale=1):
+ loglevel = gr.Dropdown(
+ ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
+ label="Bark Infinity Log Level",
+ value="WARNING",
+ )
+
+ with gr.Column(scale=1):
+ save_log_lines_number = gr.Number(
+ label="When you click Generate, clear all but this many lines from the console",
+ value=1000,
+ )
+
+ env_button = gr.Button(
+ "Apply Settings and Preload Models",
+ variant="secondary",
+ elem_classes="secondary_button",
+ elem_id="env_button_apply",
+ )
+
+ clean_models_button = gr.Button(
+ "Clean Models (Clear GPU Memory)",
+ variant="secondary",
+ elem_classes="secondary_button",
+ elem_id="env_button_apply",
+ )
+
+ env_input_list = (
+ [env_config_group]
+ + [loglevel, save_log_lines_number, XDG_CACHE_HOME_textbox]
+ + model_checkboxes
+ )
+
+ env_button.click(
+ sent_bark_envs, inputs=env_input_list, outputs=[model_dir_text]
+ )
+
+ clean_models_button.click(clean_models_button_click, inputs=[], outputs=[])
+
+ with gr.Row():
+ with gr.Column():
+ # gr.themes.builder()
+ # hg_gradio_theme = gr.Dropdown(gradio_hf_hub_themes)
+
+ gr.Markdown("## Alternative Color Themes, Click To Change")
+ theme_selector = gr.Radio(
+ ["Base", "Default", "Monochrome", "Soft", "Glass"],
+ value="Base",
+ label="Interface Theme",
+ )
+ with gr.Row():
+ theme_selector.change(
+ None,
+ theme_selector,
+ None,
+ _js=f"""
+ (theme) => {{
+ if (!document.querySelector('.theme-css')) {{
+ var theme_elem = document.createElement('style');
+ theme_elem.classList.add('theme-css');
+ document.head.appendChild(theme_elem);
+
+ var link_elem = document.createElement('link');
+ link_elem.classList.add('link-css');
+ link_elem.rel = 'stylesheet';
+ document.head.appendChild(link_elem);
+ }} else {{
+ var theme_elem = document.querySelector('.theme-css');
+ var link_elem = document.querySelector('.link-css');
+ }}
+ if (theme == "Base") {{
+ var theme_css = `{base_theme._get_theme_css()}`;
+ var link_css = `{base_theme._stylesheets[0]}`;
+ }} else if (theme == "Default") {{
+ var theme_css = `{default_theme._get_theme_css()}`;
+ var link_css = `{default_theme._stylesheets[0]}`;
+ }} else if (theme == "Monochrome") {{
+ var theme_css = `{monochrome_theme._get_theme_css()}`;
+ var link_css = `{monochrome_theme._stylesheets[0]}`;
+ }} else if (theme == "Soft") {{
+ var theme_css = `{soft_theme._get_theme_css()}`;
+ var link_css = `{soft_theme._stylesheets[0]}`;
+ }} else if (theme == "Glass") {{
+ var theme_css = `{glass_theme._get_theme_css()}`;
+ var link_css = `{glass_theme._stylesheets[0]}`;
+ }}
+ theme_elem.innerHTML = theme_css;
+ link_elem.href = link_css;
+ }}
+ """,
+ )
+
+ with gr.Tab("๐ ๏ธ๐จโ๐ฌ Advanced / Under Construction", elem_id="main_tabs_advanced"):
+ with gr.Row():
+ with gr.Column(scale=1, variant="panel"):
+ with gr.Tab("๐จ๐ปโโ๏ธ๐งฌSpeaker Surgery Center"):
+ with gr.Row():
+ with gr.Column(scale=0.1):
+ m("### ๐ Regenerate NPZ Files")
+ m(
+ "Quickly generate a sample audio clip for each speaker file in a directory. Have a bunch of NPZ and want to get quick idea what they sound like? This is for you."
+ )
+ sample_gen_path = gr.Textbox(
+ label="Sample Directory",
+ value="bark/assets/prompts/v2",
+ )
+
+ gr.Markdown("Recreate the exact audio file from the the NPZ files.")
+ sample_gen_button = gr.Button(
+ "Regenerate Original NPZ Audio Files",
+ info="This is the exact audio of the original samples",
+ variant="primary",
+ )
+ sample_gen_button.click(
+ generate_sample_audio, inputs=sample_gen_path
+ )
+
+ gr.Markdown(
+ "Generate Slight Variations. These will sound almost but not quite the same as original. Not particularly useful honestly."
+ )
+ sample_gen_button_2 = gr.Button(
+ "Generate Slight Variations.",
+ info="",
+ variant="primary",
+ )
+ sample_gen_button_2.click(
+ generate_sample_audio_coarse, inputs=sample_gen_path
+ )
+
+ gr.Markdown(
+ "Generate Wild Variations. These are wildly different from the original. They may not be the same gender. This is a decent way to find different but somewhat similar voices, but it's not the that useful either."
+ )
+ sample_gen_button_3 = gr.Button(
+ "Wildly Different Samples",
+ info="Wildly Different samples",
+ variant="primary",
+ )
+
+ sample_gen_button_3.click(
+ generate_sample_audio_semantic,
+ inputs=sample_gen_path,
+ )
+
+ gr.Markdown(
+ "The most useful range for this process by bar is the space middle between between Slight and Wild, but I need to build that into the UI."
+ )
+
+ with gr.Column(scale=2):
+ gr.Markdown("### ๐ฃ Speaker Surgery.")
+ gr.Markdown(
+ "(May 20: This is old stuff I don't use at all anymore. But it is hooked up to the UI and works, so I left it here for now.)"
+ )
+ gr.Markdown(
+ "Have a great voice but something isn't right? Wish you you could fix it? First, try making a wide variety of new clips with different prompts and re-saving it? But if that doesn't work, it might be time to call in the doctor."
+ ""
+ )
+ with gr.Tab("### Doctor RNG ๐ฉ๐ปโโ๏ธ๐ฒ"):
+ gr.Markdown(
+ """We've just opened the surgery center and our first hire is a bit questionable. We can't promise to *fix* your troubled .npz.
+ But we *can* close our eyes and slice and dice it up randomly. You'll end up with a lot of versions ofs your original file. Not the most efficient method of medical care, but you know what they say about . Don't worry we have more doctors on the way."""
+ )
+ variation_path = gr.Textbox(
+ label="Speaker NPZ Path",
+ value="bark_samples/myspeakerfile.npz",
+ )
+ variation_count = gr.Number(
+ label="How Many Variations", value=10
+ )
+ generate_speaker_variations_button = gr.Button(
+ "Generate Voice Variations", variant="primary"
+ )
+
+ generate_speaker_variations_button.click(
+ generate_speaker_variations,
+ inputs=[variation_path, variation_count],
+ )
+
+ with gr.Tab("### Doctor ๐ช๏ธ๐ฉ๐ปโโ๏ธ"):
+ gr.Markdown(
+ """### This is a non purely random way to do the the same kind of edits based some rules and heuristics instead. Not ported to UI yet."""
+ )
+
+ with gr.Tab("### Personality Separation Surgery"):
+ gr.Markdown(
+ """### Tries to split out a few different voices from a speaker file, if possible. Very simple but might be wrotht a shot."""
+ )
+
+ with gr.Tab("### Model Merging"):
+ gr.Markdown(
+ """### Placeholder. This is pretty fun, people want voice clones."""
+ )
+
+ with gr.Tab("### Sampling and Sets"):
+ gr.Markdown("""### Placeholder Placeholder.""")
+
+ with gr.Tab("Utilities"):
+ with gr.Row():
+ with gr.Column(scale=1, variant="panel"):
+ m("# Utilities")
+
+ m("# 101soundboards")
+
+ soundboard_url = gr.Textbox(
+ label="Soundboard URL",
+ value="https://www.101soundboards.com/boards/27047-bob-ross-soundboard",
+ )
+
+ soundboard_directory = gr.Textbox(
+ label="Soundboard Local Directory",
+ value="downloaded_sounds",
+ )
+
+ soundboard_directory_button = gr.Button(
+ "Download Sounds", variant="primary"
+ )
+
+ soundboard_directory_button.click(
+ soundboard_directory_download,
+ inputs=[soundboard_url, soundboard_directory],
+ )
+
+ with gr.Tab("More Options"):
+ with gr.Row():
+ with gr.Column(scale=1, variant="panel"):
+ m("# ๐๐ Advanced Options")
+ m(
+ "Some of these even work. Type them like you would on a command line."
+ )
+ m("```--semantic_top_k 50```")
+ m("```--semantic_min_eos_p 0.05```")
+
+ with gr.Column(scale=1, variant="panel"):
+ m(
+ "### ๐๐ Raw list of some advanced options that may or may not be implemented or working."
+ )
+ gr.HTML(
+ f"{formatted_defaults}",
+ elem_classes="bark_console",
+ info=". I cut a lot of these out because they were buggy or took too long to try and merge with regular Bark because I don't really understand the stuff I poke at very well.",
+ )
+ with gr.Column(scale=1, variant="panel"):
+ extra_args_input = gr.TextArea(
+ lines=15,
+ label="Extra Arguments",
+ elem_classes="bark_console",
+ )
+ with gr.Tab("Save/Load Defaults", elem_id="main_tabs_config"):
+ loadsave.create_ui()
+
+ with gr.Row():
+ with gr.Column(scale=1, variant="panel"):
+ directory_to_open = output_dir
+ output_dir_display = f"{where_am_i} / {directory_to_open.value}"
+ with gr.Row():
+ gr.Markdown(f"""Output Folder {output_dir_display}""")
+
+ with gr.Row():
+ with gr.Column(scale=1):
+ show_outputs_in_filesystem_button = gr.Button(
+ value=f'๐ Browse Output Folder: "{directory_to_open.value}"'
+ )
+ show_outputs_in_filesystem_button.click(
+ output_filesystem_button,
+ inputs=[directory_to_open],
+ queue=False,
+ )
+
+ with gr.Column(scale=1):
+ max_audio_outputs = 4
+
+ def variable_outputs_forward(k):
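+ # Shows up to k entries of last_audio_samples in list order (oldest first),
+ # padding the remaining slots with a hidden placeholder so the number of
+ # Audio outputs always matches max_audio_outputs.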
+ global last_audio_samples
+
+ k = int(k)
+
+ audio_list = []
+ for i in range(min(k, len(last_audio_samples))):
+ audio_list.append(
+ gr.Audio.update(
+ value=last_audio_samples[i],
+ label=f"{last_audio_samples[i]}",
+ visible=True,
+ )
+ )
+
+ for _ in range(k - len(audio_list)):
+ audio_list.append(
+ gr.Audio.update(
+ f"bark_infinity/assets/split_the_text.wav",
+ label="placeholder",
+ visible=False,
+ )
+ )
+
+ audio_list += [gr.Audio.update(visible=False)] * (max_audio_outputs - k)
+
+ return audio_list
+
+ def variable_outputs(k):
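+ # Same idea as variable_outputs_forward, but walks last_audio_samples
+ # backwards so the most recently generated samples are shown first; this is
+ # the version wired to the "Last Samples to Show" slider below.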
+ global last_audio_samples
+ k = int(k)
+
+ audio_list = []
+ for i in range(-1, -min(k, len(last_audio_samples)) - 1, -1):
+ index = (
+ len(last_audio_samples) + i
+ ) # Calculate the index in the original list
+ audio_list.append(
+ gr.Audio.update(
+ value=last_audio_samples[i],
+ label=f"#{index+1}, Value: {last_audio_samples[i]}",
+ visible=True,
+ )
+ )
+
+ for _ in range(k - len(audio_list)):
+ audio_list.append(
+ gr.Audio.update(
+ f"bark_infinity/assets/split_the_text.wav",
+ label="placeholder",
+ visible=False,
+ )
+ )
+
+ audio_list += [gr.Audio.update(visible=False)] * (max_audio_outputs - k)
+
+ return audio_list
+
+ num_audio_to_show = gr.Slider(
+ 1,
+ max_audio_outputs,
+ value=max_audio_outputs,
+ step=1,
+ label="Last Samples to Show:",
+ info="Click Browse button to use your OS browser instead.",
+ )
+
+ with gr.Row():
+ with gr.Column(scale=1):
+ m(
+ "#### (If you can't click on Audio Play button, move slider. Gradio bug.)"
+ )
+ audio_outputs = []
+ for i in range(max_audio_outputs):
+ t = gr.Audio(
+ value=f"bark_infinity/assets/split_the_text.wav",
+ label="placeholder",
+ visible=False,
+ )
+ audio_outputs.append(t)
+
+ num_audio_to_show.change(
+ variable_outputs,
+ num_audio_to_show,
+ audio_outputs,
+ queue=False,
+ )
+
+ with gr.Column(scale=1, variant="panel"):
+ audio_output = gr.Audio(
+ label="Last Audio Sample",
+ type="filepath",
+ elem_classes="bark_output_audio",
+ )
+
+ output = gr.HTML(elem_classes="bark_console", interactive=True)
+
+ def clear_logs():
+ with open("gradio_terminal_ouput.log", "w", encoding="utf-8") as f:
+ f.write("")
+
+ clear_button = gr.Button("Clear The Console")
+ clear_button.click(clear_logs)
+
+ def set_current_tab(tab):
+ global current_tab
+ # print(f"Setting current tab to {tab}")
+
+ current_tab = tab
+
+ if current_tab == "clone":
+ # print("Setting current tab to clone")
+ directory_to_open = clone_output_dir
+ return gr.Button.update(
+ value=f"๐ Browse Clone General Folder: {directory_to_open.value}"
+ )
+ elif current_tab == "generate":
+ # print("Setting current tab to generate")
+ directory_to_open = output_dir
+ return gr.Button.update(value=f"๐ Browse Output Folder: {directory_to_open.value}")
+ elif current_tab == "settings_tab":
+ # print("Setting current tab to settings_tab")
+ return get_XDG_CACHE_HOME()
+
+ # is this the only way to know what tab you are on?
+ clone_main_tab.select(
+ lambda: set_current_tab("clone"),
+ None,
+ show_outputs_in_filesystem_button,
+ queue=False,
+ )
+ generate_audio_main_tab.select(
+ lambda: set_current_tab("generate"),
+ None,
+ show_outputs_in_filesystem_button,
+ queue=False,
+ )
+ settings_tab.select(
+ lambda: set_current_tab("settings_tab"),
+ None,
+ XDG_CACHE_HOME_textbox,
+ queue=False,
+ )
+
+ loadsave.add_block(main_top_tabs_block, "bark_infinity")
+
+ generate_event = generate_button.click(
+ generate_audio_long_gradio,
+ inputs=[
+ input,
+ audio_prompt_input,
+ bark_speaker_as_the_prompt,
+ npz_dropdown,
+ generated_voices,
+ cloned_voices,
+ bark_infinity_voices,
+ confused_travolta_mode,
+ allow_blank,
+ stable_mode_interval,
+ separate_prompts,
+ separate_prompts_flipper,
+ split_character_goal_length,
+ split_character_max_length,
+ process_text_by_each,
+ in_groups_of_size,
+ group_text_by_counting,
+ split_type_string,
+ prompt_text_prefix,
+ prompt_text_suffix,
+ seed,
+ text_splits_only,
+ output_iterations,
+ hoarder_mode,
+ text_temp,
+ waveform_temp,
+ semantic_min_eos_p,
+ output_dir,
+ output_filename,
+ output_format,
+ add_silence_between_segments,
+ semantic_top_k,
+ semantic_top_p,
+ coarse_top_k,
+ coarse_top_p,
+ specific_npz_file,
+ audio_file_as_history_prompt,
+ specific_npz_folder,
+ split_character_jitter,
+ semantic_token_repeat_penalty,
+ semantic_inverted_p,
+ semantic_bottom_k,
+ semantic_use_mirostat_sampling,
+ semantic_mirostat_tau,
+ semantic_mirostat_learning_rate,
+ negative_text_prompt,
+ specific_npz_file_negative_prompt,
+ negative_text_prompt_logits_scale,
+ negative_text_prompt_divergence_scale,
+ extra_args_input,
+ ],
+ outputs=[audio_output],
+ )
+
+ clone_button_event = clone_voice_button.click(
+ clone_voice_gradio,
+ inputs=[
+ input_audio_filename,
+ input_audio_filename_secondary,
+ speaker_as_clone_content,
+ output_voice,
+ extra_blurry_clones,
+ even_more_clones,
+ audio_filepath_directory,
+ simple_clones_only,
+ ],
+ outputs=dummy,
+ )
+
+ clone_button_event_success = clone_button_event.success(
+ generate_audio_long_gradio_clones,
+ inputs=[
+ input,
+ audio_prompt_input,
+ bark_speaker_as_the_prompt,
+ npz_dropdown,
+ generated_voices,
+ cloned_voices,
+ bark_infinity_voices,
+ confused_travolta_mode,
+ allow_blank,
+ stable_mode_interval,
+ separate_prompts,
+ separate_prompts_flipper,
+ split_character_goal_length,
+ split_character_max_length,
+ process_text_by_each,
+ in_groups_of_size,
+ group_text_by_counting,
+ split_type_string,
+ prompt_text_prefix,
+ prompt_text_suffix,
+ seed,
+ text_splits_only,
+ output_iterations,
+ hoarder_mode,
+ text_temp,
+ waveform_temp,
+ semantic_min_eos_p,
+ output_dir,
+ output_voice,
+ output_format,
+ add_silence_between_segments,
+ semantic_top_k,
+ semantic_top_p,
+ coarse_top_k,
+ coarse_top_p,
+ specific_npz_file,
+ audio_file_as_history_prompt,
+ dummy,
+ split_character_jitter,
+ semantic_token_repeat_penalty,
+ semantic_inverted_p,
+ semantic_bottom_k,
+ semantic_use_mirostat_sampling,
+ semantic_mirostat_tau,
+ semantic_mirostat_learning_rate,
+ negative_text_prompt,
+ specific_npz_file_negative_prompt,
+ negative_text_prompt_logits_scale,
+ negative_text_prompt_divergence_scale,
+ extra_args_input,
+ ],
+ outputs=[audio_output],
+ )
+
+ cancel_button.click(
+ fn=try_to_cancel,
+ inputs=model_checkboxes,
+ outputs=None,
+ cancels=[generate_event, clone_button_event, clone_button_event_success],
+ queue=None,
+ )
+
+ loadsave.setup_ui()
+ loadsave.dump_defaults()
+ demo.ui_loadsave = loadsave
+
+ logs = gr.HTML()
+ demo.load(read_logs, None, output, every=2)
+ demo.load(variable_outputs, inputs=num_audio_to_show, outputs=audio_outputs, every=10)
+
+
+parser = argparse.ArgumentParser(description="Gradio app command line options.")
+parser.add_argument("--share", action="store_true", help="Enable share setting.")
+parser.add_argument("--user", type=str, help="User for authentication.")
+parser.add_argument("--password", type=str, help="Password for authentication.")
+parser.add_argument("--listen", action="store_true", help="Server name setting.")
+parser.add_argument("--server_port", type=int, default=7860, help="Port setting.")
+parser.add_argument(
+ "--no-autolaunch",
+ action="store_false",
+ default=False,
+ help="Disable automatic opening of the app in browser.",
+)
+parser.add_argument(
+ "--debug",
+ action="store_true",
+ default=False,
+ help="Enable detailed error messages and extra outputs.",
+)
+
+parser.add_argument(
+ "--barkdebug",
+ action="store_true",
+ default=False,
+ help="Misc Bark Debug.",
+)
+parser.add_argument("--incolab", action="store_true", default=False, help="Default for Colab.")
+
+
+parser.add_argument(
+ "--no_offload_cpu",
+ action="store_true",
+ default=False,
+ help="Do not offload models to the CPU when not in use.",
+)
+parser.add_argument(
+ "--use_small_models",
+ action="store_true",
+ default=False,
+ help="Set to use small models.",
+)
+parser.add_argument(
+ "--global_enable_mps",
+ action="store_true",
+ default=False,
+ help="Set for enabling MPS on Apple M1.",
+)
+parser.add_argument("--xdg_cache_home", type=str, help="Model directory.")
+
+
+args = parser.parse_args()
+
+
+auth = None
+
+
+share = args.share
+
+if args.barkdebug:
+ barkdebug = args.barkdebug
+
+if args.incolab:
+ generation.OFFLOAD_CPU = False
+ share = True
+
+if args.user and args.password:
+ auth = (args.user, args.password)
+
+if args.share and auth is None:
+ print("You may want to set a password, you are sharing this Gradio publicly.")
+
+if args.no_offload_cpu:
+ generation.OFFLOAD_CPU = False
+ print("CPU Offloading disabled.")
+
+if args.use_small_models:
+ generation.USE_SMALL_MODELS = True
+ print("Using small models.")
+
+if args.global_enable_mps:
+ generation.GLOBAL_ENABLE_MPS = True
+ print("MPS enabled.")
+
+if args.xdg_cache_home:
+ set_XDG_CACHE_HOME(args.xdg_cache_home)
+
+
+server_name = "0.0.0.0" if args.listen else "127.0.0.1"
+
+print(api.startup_status_report(True))
+
+print(f"\n\nYou should see Bark Infinity in your web browser now.")
+print(f"If not go the the website you see below as 'Running on local URL:'")
+print(f"python bark_webui.py --help for specific Gradio options.\n\n")
+# demo.queue(concurrency_count=2, max_size=2)
+demo.queue()
+
+autolaunch = not args.no_autolaunch
+
+demo.launch(
+ share=args.share,
+ auth=auth,
+ server_name=server_name,
+ server_port=args.server_port,
+ inbrowser=autolaunch,
+ debug=args.debug,
+)
+
diff --git a/barki-allpip.txt b/barki-allpip.txt
new file mode 100644
index 0000000000000000000000000000000000000000..797e8234ad1ac784aa444f89e133e218c1d1a771
--- /dev/null
+++ b/barki-allpip.txt
@@ -0,0 +1,59 @@
+setuptools
+transformers
+diffusers
+ffmpeg-downloader
+ffmpeg
+ffmpeg-python
+sox ; platform_system == 'Linux'
+sox ; platform_system == 'Darwin'
+soundfile==0.12.1 ; platform_system == 'Windows'
+librosa
+boto3
+funcy
+numpy
+scipy
+tokenizers
+tqdm
+ipython
+huggingface_hub>=0.15.1
+rich
+pathvalidate
+rich-argparse
+encodec
+chardet
+pydub
+requests
+audio2numpy
+faiss-cpu
+joblib
+universal-startfile
+gradio>=3.35.1
+pywin32 ; platform_system == 'Windows'
+hydra_colorlog
+julius
+spacy==3.5.2
+demucs
+flashy>=0.0.1
+av
+einops
+hydra-core>=1.1
+num2words
+sentencepiece
+python-dotenv
+vector_quantize_pytorch
+devtools
+jsonschema
+ffprobe
+pygments
+tensorboard
+pyyaml
+numba
+matplotlib
+accelerate
+nbformat
+fastjsonschema
+jupyter-client
+beartype
+ema-pytorch
+lion-pytorch
+local-attention
diff --git a/data/models/hubert/hubert.pt b/data/models/hubert/hubert.pt
new file mode 100644
index 0000000000000000000000000000000000000000..7c5ecb6409ac55a2a170080a560ef1f1eb4983ca
--- /dev/null
+++ b/data/models/hubert/hubert.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1703cf8d2cdc76f8c046f5f6a9bcd224e0e6caf4744cad1a1f4199c32cac8c8d
+size 1136468879
diff --git a/data/models/hubert/tokenizer_large.pth b/data/models/hubert/tokenizer_large.pth
new file mode 100644
index 0000000000000000000000000000000000000000..435414721294f8dc14d19ce98a5c68ed5fc582e5
--- /dev/null
+++ b/data/models/hubert/tokenizer_large.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d94c5dd646bcfe1a8bb470372f0004c189acf65d913831f3a6ed6414c9ba86f
+size 243656111
diff --git a/gradio_options.json b/gradio_options.json
new file mode 100644
index 0000000000000000000000000000000000000000..31244c2f4630a9cfafa1d4f386a8ae13a59243ee
--- /dev/null
+++ b/gradio_options.json
@@ -0,0 +1,232 @@
+{
+ "bark_infinity/Tabs@main_top_ui_tabs/selected": null,
+ "bark_infinity/Main Text Prompt/visible": true,
+ "bark_infinity/Main Text Prompt/value": "",
+ "bark_infinity/Allow Blank Text Prompts/visible": true,
+ "bark_infinity/Allow Blank Text Prompts/value": false,
+ "bark_infinity/Always Generate Maximum Length./visible": true,
+ "bark_infinity/Always Generate Maximum Length./value": false,
+ "bark_infinity/Insert A Text Snippet: webui/user_styles.csv/visible": true,
+ "bark_infinity/Insert A Text Snippet: webui/user_styles.csv/value": [],
+ "bark_infinity/\ud83c\udfa8/visible": true,
+ "bark_infinity/Modify The Text Prompt/visible": true,
+ "bark_infinity/Modify The Text Prompt/value": [],
+ "bark_infinity/\u2728/visible": true,
+ "bark_infinity/Negative Main Text Prompt/visible": true,
+ "bark_infinity/Negative Main Text Prompt/value": "",
+ "bark_infinity/Negative Text Prompt Divergence Scale/visible": true,
+ "bark_infinity/Negative Text Prompt Divergence Scale/value": 0.0,
+ "bark_infinity/Negative Text Prompt Divergence Scale/minimum": 0.0,
+ "bark_infinity/Negative Text Prompt Divergence Scale/maximum": 2.0,
+ "bark_infinity/Negative Text Prompt Divergence Scale/step": 0.01,
+ "bark_infinity/Negative Text Prompt Scale/visible": true,
+ "bark_infinity/Negative Text Prompt Scale/value": 0.0,
+ "bark_infinity/Negative Text Prompt Scale/minimum": 0.0,
+ "bark_infinity/Negative Text Prompt Scale/maximum": 2.0,
+ "bark_infinity/Negative Text Prompt Scale/step": 0.01,
+ "bark_infinity/\ud83c\udfb2 Random Voice/visible": true,
+ "bark_infinity/\ud83c\udfb2 Random Voice/value": false,
+ "bark_infinity/\ud83e\uddd1\u200d\ud83c\udfa4 Built In Voice/visible": true,
+ "bark_infinity/\ud83e\uddd1\u200d\ud83c\udfa4 Built In Voice/value": null,
+ "bark_infinity/Cloned Voices/visible": true,
+ "bark_infinity/Cloned Voices/value": null,
+ "bark_infinity/\ud83d\udcc1 A directory containing .npz files. Each one will generate the prompt./visible": true,
+ "bark_infinity/\ud83d\udcc1 A directory containing .npz files. Each one will generate the prompt./value": "",
+ "bark_infinity//visible": true,
+ "bark_infinity//value": "",
+ "bark_infinity/Try for this many characters in each/visible": true,
+ "bark_infinity/Try for this many characters in each/value": 165,
+ "bark_infinity/Try for this many characters in each/minimum": 0,
+ "bark_infinity/Try for this many characters in each/maximum": 500,
+ "bark_infinity/Try for this many characters in each/step": 1,
+ "bark_infinity/But never go higher than this many/visible": true,
+ "bark_infinity/But never go higher than this many/value": 205,
+ "bark_infinity/But never go higher than this many/minimum": 0,
+ "bark_infinity/But never go higher than this many/maximum": 500,
+ "bark_infinity/But never go higher than this many/step": 1,
+ "bark_infinity/Put this text **in front** of every text segment, after splitting./visible": true,
+ "bark_infinity/Put this text **in front** of every text segment, after splitting./value": "",
+ "bark_infinity/Put this text **after** every text segment, after splitting./visible": true,
+ "bark_infinity/Put this text **after** every text segment, after splitting./value": "",
+ "bark_infinity/Randomize character splits by this much/visible": true,
+ "bark_infinity/Randomize character splits by this much/value": 0,
+ "bark_infinity/Randomize character splits by this much/minimum": 0,
+ "bark_infinity/Randomize character splits by this much/maximum": 500,
+ "bark_infinity/Randomize character splits by this much/step": 1,
+ "bark_infinity/Process the text in chunks of:/visible": true,
+ "bark_infinity/Process the text in chunks of:/value": null,
+ "bark_infinity/Group the text by counting:/visible": true,
+ "bark_infinity/Group the text by counting:/value": null,
+ "bark_infinity/And start a new audio clip with you have this many:/visible": true,
+ "bark_infinity/And start a new audio clip with you have this many:/value": 1,
+ "bark_infinity/And start a new audio clip with you have this many:/minimum": 1,
+ "bark_infinity/And start a new audio clip with you have this many:/maximum": 50,
+ "bark_infinity/And start a new audio clip with you have this many:/step": 1,
+ "bark_infinity/(Optional String for string or regex.)/visible": true,
+ "bark_infinity/(Optional String for string or regex.)/value": "",
+ "bark_infinity/\ud83d\uddfa\ufe0f\u2702\ufe0f No audio, just show me text splits./visible": true,
+ "bark_infinity/\ud83d\uddfa\ufe0f\u2702\ufe0f No audio, just show me text splits./value": false,
+ "bark_infinity/How to Join Clips/visible": true,
+ "bark_infinity/How to Join Clips/value": "Stable",
+ "bark_infinity/Add Silence/visible": true,
+ "bark_infinity/Add Silence/value": 0.0,
+ "bark_infinity/Add Silence/minimum": 0.0,
+ "bark_infinity/Add Silence/maximum": 5.0,
+ "bark_infinity/Add Silence/step": 0.01,
+ "bark_infinity/Separate Prompts/visible": true,
+ "bark_infinity/Separate Prompts/value": false,
+ "bark_infinity/Separate Prompts, but do one generation/visible": true,
+ "bark_infinity/Separate Prompts, but do one generation/value": false,
+ "bark_infinity/Clip Length Chance/visible": true,
+ "bark_infinity/Clip Length Chance/value": 0.2,
+ "bark_infinity/Clip Length Chance/minimum": 0.0,
+ "bark_infinity/Clip Length Chance/maximum": 1.0,
+ "bark_infinity/Clip Length Chance/step": 0.01,
+ "bark_infinity/text temperature \ud83c\udf21\ufe0f: /visible": true,
+ "bark_infinity/text temperature \ud83c\udf21\ufe0f: /value": 0.7,
+ "bark_infinity/text temperature \ud83c\udf21\ufe0f: /minimum": 0.0,
+ "bark_infinity/text temperature \ud83c\udf21\ufe0f: /maximum": 2.0,
+ "bark_infinity/text temperature \ud83c\udf21\ufe0f: /step": 0.01,
+ "bark_infinity/wave temperature \ud83c\udf21\ufe0f: /visible": true,
+ "bark_infinity/wave temperature \ud83c\udf21\ufe0f: /value": 0.5,
+ "bark_infinity/wave temperature \ud83c\udf21\ufe0f: /minimum": 0.0,
+ "bark_infinity/wave temperature \ud83c\udf21\ufe0f: /maximum": 2.0,
+ "bark_infinity/wave temperature \ud83c\udf21\ufe0f: /step": 0.01,
+ "bark_infinity/Seed/visible": true,
+ "bark_infinity/Seed/value": 0.0,
+ "bark_infinity/semantic_top_k/visible": true,
+ "bark_infinity/semantic_top_k/value": 100,
+ "bark_infinity/semantic_top_k/minimum": 0,
+ "bark_infinity/semantic_top_k/maximum": 1000,
+ "bark_infinity/semantic_top_k/step": 1,
+ "bark_infinity/semantic_top_p/visible": true,
+ "bark_infinity/semantic_top_p/value": 0.95,
+ "bark_infinity/semantic_top_p/minimum": 0.0,
+ "bark_infinity/semantic_top_p/maximum": 1.0,
+ "bark_infinity/semantic_top_p/step": 0.01,
+ "bark_infinity/coarse_top_k/visible": true,
+ "bark_infinity/coarse_top_k/value": 100,
+ "bark_infinity/coarse_top_k/minimum": 0,
+ "bark_infinity/coarse_top_k/maximum": 1000,
+ "bark_infinity/coarse_top_k/step": 1,
+ "bark_infinity/coarse_top_p/visible": true,
+ "bark_infinity/coarse_top_p/value": 0.95,
+ "bark_infinity/coarse_top_p/minimum": 0.0,
+ "bark_infinity/coarse_top_p/maximum": 1.0,
+ "bark_infinity/coarse_top_p/step": 0.01,
+ "bark_infinity/Token Repetition Penalty/visible": true,
+ "bark_infinity/Token Repetition Penalty/value": 0.0,
+ "bark_infinity/Token Repetition Penalty/minimum": 0.0,
+ "bark_infinity/Token Repetition Penalty/maximum": 2.0,
+ "bark_infinity/Token Repetition Penalty/step": 0.01,
+ "bark_infinity/Semantic Inverted-P/visible": true,
+ "bark_infinity/Semantic Inverted-P/value": 0.0,
+ "bark_infinity/Semantic Inverted-P/minimum": 0.0,
+ "bark_infinity/Semantic Inverted-P/maximum": 1.0,
+ "bark_infinity/Semantic Inverted-P/step": 0.01,
+ "bark_infinity/Semantic Bottom K/visible": true,
+ "bark_infinity/Semantic Bottom K/value": 0,
+ "bark_infinity/Semantic Bottom K/minimum": 0,
+ "bark_infinity/Semantic Bottom K/maximum": 1000,
+ "bark_infinity/Semantic Bottom K/step": 1,
+ "bark_infinity/Use Semantic Mirostat Sampling/visible": true,
+ "bark_infinity/Use Semantic Mirostat Sampling/value": false,
+ "bark_infinity/Semantic Surprise Factor (Mirostat Tau)/visible": true,
+ "bark_infinity/Semantic Surprise Factor (Mirostat Tau)/value": 40.0,
+ "bark_infinity/Semantic Surprise Factor (Mirostat Tau)/minimum": 0.0,
+ "bark_infinity/Semantic Surprise Factor (Mirostat Tau)/maximum": 100,
+ "bark_infinity/Semantic Surprise Factor (Mirostat Tau)/step": 0.1,
+ "bark_infinity/Semantic Mirostat Learning Rate/visible": true,
+ "bark_infinity/Semantic Mirostat Learning Rate/value": 0.75,
+ "bark_infinity/Semantic Mirostat Learning Rate/minimum": 0.0,
+ "bark_infinity/Semantic Mirostat Learning Rate/maximum": 2.0,
+ "bark_infinity/Semantic Mirostat Learning Rate/step": 0.01,
+ "bark_infinity/\ud83d\udc8e\ud83d\udc8eSave Every NPZ/visible": true,
+ "bark_infinity/\ud83d\udc8e\ud83d\udc8eSave Every NPZ/value": false,
+ "bark_infinity/Output directory/visible": true,
+ "bark_infinity/Output directory/value": "bark_samples",
+ "bark_infinity/Repeat This Many Times/visible": true,
+ "bark_infinity/Repeat This Many Times/value": 1,
+ "bark_infinity/Repeat This Many Times/minimum": 1,
+ "bark_infinity/Repeat This Many Times/maximum": 100,
+ "bark_infinity/Repeat This Many Times/step": 1,
+ "bark_infinity/Output filename/visible": true,
+ "bark_infinity/Output filename/value": "",
+ "bark_infinity/Audio File Output Format/visible": true,
+ "bark_infinity/Audio File Output Format/value": "mp3",
+ "bark_infinity/Generate Audio/visible": true,
+ "bark_infinity/Voice Clone Name/visible": true,
+ "bark_infinity/Voice Clone Name/value": "New_Voice_Clone",
+ "bark_infinity/3\ufe0f\u20e3 One Clone Short Text Prompt/value": "",
+ "bark_infinity/3\ufe0f\u20e3 One Clone Long Text Prompt/value": "",
+ "bark_infinity/Voice Clone Directory/visible": true,
+ "bark_infinity/Voice Clone Directory/value": "",
+ "bark_infinity/Just use the end of the audio clip (or clips) as the voice clone./visible": true,
+ "bark_infinity/Just use the end of the audio clip (or clips) as the voice clone./value": false,
+ "bark_infinity/\ud83d\udc36\ud83c\udf2b\ufe0f\ud83d\udc15\u200d\ud83e\uddba Extra Blurry Clones. Not so useful for accuracy but often creates nice new voices./visible": true,
+ "bark_infinity/\ud83d\udc36\ud83c\udf2b\ufe0f\ud83d\udc15\u200d\ud83e\uddba Extra Blurry Clones. Not so useful for accuracy but often creates nice new voices./value": false,
+ "bark_infinity/Extra Foreign Clones \ud83e\uddec\ud83d\udc6f\u200d\u2642\ufe0f\ud83d\udc6f\u200d\u2640\ufe0f/visible": true,
+ "bark_infinity/Extra Foreign Clones \ud83e\uddec\ud83d\udc6f\u200d\u2642\ufe0f\ud83d\udc6f\u200d\u2640\ufe0f/value": false,
+ "bark_infinity/Begin Generating Voice Clones/visible": true,
+ "bark_infinity/Cloning Progress.../visible": true,
+ "bark_infinity/Cloning Progress.../value": "",
+ "bark_infinity/text_use_gpu/visible": true,
+ "bark_infinity/text_use_gpu/value": true,
+ "bark_infinity/text_use_small/visible": true,
+ "bark_infinity/text_use_small/value": false,
+ "bark_infinity/coarse_use_gpu/visible": true,
+ "bark_infinity/coarse_use_gpu/value": true,
+ "bark_infinity/coarse_use_small/visible": true,
+ "bark_infinity/coarse_use_small/value": false,
+ "bark_infinity/fine_use_gpu/visible": true,
+ "bark_infinity/fine_use_gpu/value": true,
+ "bark_infinity/fine_use_small/visible": true,
+ "bark_infinity/fine_use_small/value": false,
+ "bark_infinity/codec_use_gpu/visible": true,
+ "bark_infinity/codec_use_gpu/value": true,
+ "bark_infinity/force_reload/visible": true,
+ "bark_infinity/force_reload/value": true,
+ "bark_infinity/Bark Model Download Directory/visible": true,
+ "bark_infinity/Bark Model Download Directory/value": null,
+ "bark_infinity/(Final Path Will Be)/visible": true,
+ "bark_infinity/(Final Path Will Be)/value": "/home/vscode/.cache/suno/bark_v0",
+ "bark_infinity/(Don't worry about this, it's for fixing problems.)/visible": true,
+ "bark_infinity/(Don't worry about this, it's for fixing problems.)/value": "No CUDA device is detected.\n",
+ "bark_infinity/Refresh GPU Status/visible": true,
+ "bark_infinity/Hugging Face Model Cache Info Dump/visible": true,
+ "bark_infinity/Run Numpy and MKL CPU Benchmark/visible": true,
+ "bark_infinity/Bark Infinity Log Level/visible": true,
+ "bark_infinity/Bark Infinity Log Level/value": "WARNING",
+ "bark_infinity/When you click Generate, clear all but this many lines from the console/visible": true,
+ "bark_infinity/When you click Generate, clear all but this many lines from the console/value": 1000.0,
+ "bark_infinity/Apply Settings and Preload Models/visible": true,
+ "bark_infinity/Clean Models (Clear GPU Memory)/visible": true,
+ "bark_infinity/Interface Theme/visible": true,
+ "bark_infinity/Interface Theme/value": "Base",
+ "bark_infinity/Sample Directory/visible": true,
+ "bark_infinity/Sample Directory/value": "bark/assets/prompts/v2",
+ "bark_infinity/Regenerate Original NPZ Audio Files/visible": true,
+ "bark_infinity/Generate Slight Variations./visible": true,
+ "bark_infinity/Wildly Different Samples/visible": true,
+ "bark_infinity/Speaker NPZ Path/visible": true,
+ "bark_infinity/Speaker NPZ Path/value": "bark_samples/myspeakerfile.npz",
+ "bark_infinity/How Many Variations/visible": true,
+ "bark_infinity/How Many Variations/value": 10.0,
+ "bark_infinity/Generate Voice Variations/visible": true,
+ "bark_infinity/Soundboard URL/visible": true,
+ "bark_infinity/Soundboard URL/value": "https://www.101soundboards.com/boards/27047-bob-ross-soundboard",
+ "bark_infinity/Soundboard Local Directory/visible": true,
+ "bark_infinity/Soundboard Local Directory/value": "downloaded_sounds",
+ "bark_infinity/Download Sounds/visible": true,
+ "bark_infinity/Extra Arguments/visible": true,
+ "bark_infinity/Extra Arguments/value": "",
+ "bark_infinity/View changes/visible": true,
+ "bark_infinity/Apply/visible": true,
+ "bark_infinity/\ud83d\udcc1 Browse Output Folder: \"bark_samples\"/visible": true,
+ "bark_infinity/Last Samples to Show:/visible": true,
+ "bark_infinity/Last Samples to Show:/value": 4,
+ "bark_infinity/Last Samples to Show:/minimum": 1,
+ "bark_infinity/Last Samples to Show:/maximum": 4,
+ "bark_infinity/Last Samples to Show:/step": 1,
+ "bark_infinity/Clear The Console/visible": true
+}
\ No newline at end of file
diff --git a/gradio_terminal_ouput.log b/gradio_terminal_ouput.log
new file mode 100644
index 0000000000000000000000000000000000000000..46168aefaacbb07ef2799775d471c72836ef90dd
--- /dev/null
+++ b/gradio_terminal_ouput.log
@@ -0,0 +1,39 @@
+You may want to set a password, you are sharing this Gradio publicly.
+No CUDA device is detected.
+
+OFFLOAD_CPU: True (Default is True)
+USE_SMALL_MODELS: False (Default is False)
+GLOBAL_ENABLE_MPS (Apple): False (Default is False)
+GPU Memory: None GB
+SUNO_HALF_PRECISION: False (Default is False)
+SUNO_HALF_BFLOAT16: False (Default is False)
+SUNO_DISABLE_COMPILE: False (Default is False)
+SUNO_USE_DIRECTML (AMD): False (Default is False)
+Torch Num CPU Threads: 6
+Bark Model Location: /home/vscode/.cache/suno/bark_v0 (Env var 'XDG_CACHE_HOME' to override)
+HF_HOME: /workspaces/bark/bark_infinity/data/models/unclassified
+
+FFmpeg status, this should say version 6.0
+FFmpeg binaries directory: None
+FFmpeg Version: None
+FFmpeg Path: /home/vscode/.local/share/ffmpeg-downloader/ffmpeg/ffmpeg
+FFprobe Path: /home/vscode/.local/share/ffmpeg-downloader/ffmpeg/ffprobe
+FFplay Path: /home/vscode/.local/share/ffmpeg-downloader/ffmpeg/ffplay
+
+
+
+You should see Bark Infinity in your web browser now.
+If not go the the website you see below as 'Running on local URL:'
+python bark_webui.py --help for specific Gradio options.
+
+
+Running on local URL: http://127.0.0.1:7860
+Running on public URL: https://9e293702412e86b0ec.gradio.live
+
+This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)
+Found the audio file /tmp/gradio/c5177a17c3c4fccb64544b643fd829aef8cdcfa5/Penn Badgley Reads Classic Literature as Joe From You B7HXnSGnisE vocals-0-100.wav.
+Cloning voice from /tmp/gradio/c5177a17c3c4fccb64544b643fd829aef8cdcfa5/Penn Badgley Reads Classic Literature as Joe From You B7HXnSGnisE vocals-0-100.wav to New_Voice_Clone
+attempt 1 of 1
+Loading HuBERT models ('quantifier_V1_hubert_base_ls960_23.pth', 'tokenizer_large.pth') from data/models/hubert/hubert.pt
+checkpoint_path: data/models/hubert/hubert.pt
+Keyboard interruption in main thread... closing server.
diff --git a/model-card.md b/model-card.md
new file mode 100644
index 0000000000000000000000000000000000000000..5ead3a9b4c3591eb7957e603437a842cdd3629ac
--- /dev/null
+++ b/model-card.md
@@ -0,0 +1,40 @@
+# Model Card: Bark
+
+This is the official codebase for running the text-to-audio model from Suno.ai.
+
+The following is additional information about the models released here.
+
+## Model Details
+
+Bark is a series of three transformer models that turn text into audio.
+### Text to semantic tokens
+ - Input: text, tokenized with [BERT tokenizer from Hugging Face](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertTokenizer)
+ - Output: semantic tokens that encode the audio to be generated
+
+### Semantic to coarse tokens
+ - Input: semantic tokens
+ - Output: tokens from the first two codebooks of the [EnCodec Codec](https://github.com/facebookresearch/encodec) from Facebook
+
+### Coarse to fine tokens
+ - Input: the first two codebooks from EnCodec
+ - Output: 8 codebooks from EnCodec
+
+### Architecture
+| Model | Parameters | Attention | Output Vocab size |
+|:-------------------------:|:----------:|------------|:-----------------:|
+| Text to semantic tokens | 80 M | Causal | 10,000 |
+| Semantic to coarse tokens | 80 M | Causal | 2x 1,024 |
+| Coarse to fine tokens | 80 M | Non-causal | 6x 1,024 |
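+
+As a rough sketch of how these three stages chain together in code, the snippet below uses the `bark_infinity.api` and `bark_infinity.generation` helpers the way this repository's notebooks do; treat the exact names, keyword arguments, and temperatures as illustrative rather than a stable API.
+
+```python
+from bark_infinity import api, generation
+
+generation.preload_models()  # optional; models lazy-load on first use otherwise
+
+text = "Hello, this is a short Bark test."
+
+# Stage 1: text -> semantic tokens
+semantic_tokens = api.generate_text_semantic(text=text, history_prompt=None, temp=0.7)
+
+# Stage 2: semantic tokens -> coarse tokens (the first two EnCodec codebooks)
+coarse_tokens = generation.generate_coarse(x_semantic=semantic_tokens, history_prompt=None, temp=0.7)
+
+# Stage 3: coarse -> fine tokens (all 8 codebooks), then decode to a waveform
+fine_tokens = api.generate_fine(x_coarse_gen=coarse_tokens, history_prompt=None, temp=0.5)
+audio_arr = api.codec_decode(fine_tokens)  # numpy array at 24 kHz (generation.SAMPLE_RATE)
+```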
+
+
+### Release date
+April 2023
+
+## Broader Implications
+We anticipate that this model's text-to-audio capabilities can be used to improve accessibility tools in a variety of languages.
+Straightforward improvements will allow models to run faster than real time, rendering them useful for applications such as virtual assistants.
+
+While we hope that this release will enable users to express their creativity and build applications that are a force
+for good, we acknowledge that any text-to-audio model has the potential for dual use. While it is not straightforward
+to voice clone known people with Bark, it can still be used for nefarious purposes. To further reduce the chances of unintended use of Bark,
+we also release a simple classifier to detect Bark-generated audio with high accuracy (see notebooks section of the main repository).
diff --git a/notebooks/Bark-Infinity.ipynb b/notebooks/Bark-Infinity.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..1dba1f0001cd2d2ef82e0385daf2524e5ff3407d
--- /dev/null
+++ b/notebooks/Bark-Infinity.ipynb
@@ -0,0 +1,47 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "provenance": [],
+ "gpuType": "T4",
+ "authorship_tag": "ABX9TyPwSM/Piw4ecN7LXoGsFpPu",
+ "include_colab_link": true
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ },
+ "accelerator": "GPU"
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "view-in-github",
+ "colab_type": "text"
+ },
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!git clone https://github.com/JonathanFly/bark.git\n",
+ "%cd bark\n",
+ "!pip install -r old_setup_files/requirements-pip.txt\n",
+ "!pip install encodec rich-argparse librosa pydub devtools\n",
+ "!python bark_webui.py --share"
+ ],
+ "metadata": {
+ "id": "h2hmNnKDlWvM"
+ },
+ "execution_count": null,
+ "outputs": []
+ }
+ ]
+}
\ No newline at end of file
diff --git a/notebooks/Bark_Infinity_Long_Form_Audio_Colab.ipynb b/notebooks/Bark_Infinity_Long_Form_Audio_Colab.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..ab4691cbf63d1195861a2d7aa1cf9627dbb7eb9a
--- /dev/null
+++ b/notebooks/Bark_Infinity_Long_Form_Audio_Colab.ipynb
@@ -0,0 +1,492 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "view-in-github",
+ "colab_type": "text"
+ },
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#@title Connect and check GPU and runtime\n",
+ "from psutil import virtual_memory\n",
+ "gpu_info = !nvidia-smi\n",
+ "gpu_info = '\\n'.join(gpu_info)\n",
+ "ram_gb = virtual_memory().total / 1e9\n",
+ "if gpu_info.find('failed') >= 0:\n",
+ " print('Not connected to a GPU', end=\"\")\n",
+ "elif gpu_info.find('not found') >= 0:\n",
+ " print('Not connected to a GPU', end=\"\")\n",
+ "else:\n",
+ " print('GPU Connected', end=\"\")\n",
+ "print(', your runtime has {:.1f} gigabytes of available RAM\\n'.format(ram_gb))\n"
+ ],
+ "metadata": {
+ "cellView": "form",
+ "id": "ogUYjFfhcxTG"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "HJQ4TI0_Qowr"
+ },
+ "source": [
+ "## Setup Notebook, Install dependencies\n",
+ "Run both cells to install system and needed functions. \n",
+ "_If Colab for some reason crashes re-run cell 0.2 before contining._\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "r8wG_tIaOV0Q",
+ "cellView": "form"
+ },
+ "outputs": [],
+ "source": [
+ "#@title 0.1 - Install system\n",
+ "from IPython.display import clear_output\n",
+ "!git clone https://github.com/JonathanFly/bark.git\n",
+ "%cd bark\n",
+ "!pip install -r old_setup_files/requirements-pip.txt\n",
+ "!pip install encodec rich-argparse\n",
+ "!pip install librosa pydub devtools\n",
+ "\n",
+ "#clear_output()\n",
+ "#print('Cell completed.')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "jKTvqvVkOwXM",
+ "cellView": "form"
+ },
+ "outputs": [],
+ "source": [
+ " #@title 0.2 - Setup required functions and helpers\n",
+ "import os\n",
+ "import time\n",
+ "from bark_infinity import config\n",
+ "import numpy as np\n",
+ "\n",
+ "logger = config.logger\n",
+ "logger.setLevel(\"WARNING\")\n",
+ "\n",
+ "from bark_infinity import generation\n",
+ "from bark_infinity import api\n",
+ "\n",
+ "import rich\n",
+ "from rich import print\n",
+ "from rich import pretty\n",
+ "from rich.pretty import pprint\n",
+ "from rich import inspect\n",
+ "\n",
+ "import librosa\n",
+ "from pydub import AudioSegment\n",
+ "import ipywidgets as widgets\n",
+ "from IPython.display import display, Audio\n",
+ "from io import BytesIO\n",
+ "\n",
+ "# None of this code, just fiddlign with Colab stuff\n",
+ "# Just to save Colab with outputs and float32 wavs are GIGANTO\n",
+ "# actually this doesn't work, the iPython widget converts it back to float32? or I messed up\n",
+ "\n",
+ "def display_audio_int16_but(audio_arr_segments, file_name, sample_rate=generation.SAMPLE_RATE, width='200px'):\n",
+ " file_name_label = widgets.Label(value=f\"Playing: {file_name}\")\n",
+ " file_name_label.layout.width = width\n",
+ " audio_data_int16 = audio_arr_segments\n",
+ " if isinstance(audio_data_int16, list):\n",
+ " audio_data_int16 = np.concatenate(audio_data_int16)\n",
+ "\n",
+ " #audio_data_int16 = np.int16(audio_data_int16 * np.iinfo(np.int16).max)\n",
+ "\n",
+ "\n",
+ " audio_widget = Audio(audio_data_int16, rate=sample_rate)\n",
+ " display(file_name_label, audio_widget)\n",
+ "\n",
+ "\n",
+ "def on_button_click(button):\n",
+ " audio_data, sample_rate = librosa.load(button.wav_path, sr=None)\n",
+ " file_name = os.path.basename(button.wav_path)\n",
+ " display_audio_int16_but(audio_data,file_name, sample_rate)\n",
+ "\n",
+ "def display_wav_files(directory, matchType=\".wav\"):\n",
+ " subdirs, wav_files = [], []\n",
+ "\n",
+ " for item in os.listdir(directory):\n",
+ " item_path = os.path.join(directory, item)\n",
+ "\n",
+ " if os.path.isfile(item_path) and item_path.endswith(matchType):\n",
+ " wav_files.append(item_path)\n",
+ " elif os.path.isdir(item_path):\n",
+ " subdirs.append(item_path)\n",
+ "\n",
+ " wav_files.sort(key=lambda x: os.path.basename(x))\n",
+ "\n",
+ " for wav_file in wav_files:\n",
+ "\n",
+ " filename = os.path.basename(wav_file)\n",
+ " print(f\" {filename}\")\n",
+ " display( Audio(filename=wav_file, rate=generation.SAMPLE_RATE) )\n",
+ " #button = widgets.Button(description=f\"Play {filename}\")\n",
+ " #button.wav_path = wav_file\n",
+ " #button.on_click(on_button_click)\n",
+ " #display(button)\n",
+ "\n",
+ " for subdir in sorted(subdirs):\n",
+ " print(f\"<{subdir}>\")\n",
+ " display_wav_files(subdir, matchType)\n",
+ "\n",
+ "def display_mp4_files(directory):\n",
+ " return display_wav_files(directory, '.mp4')\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## 1.0 - Gradio App"
+ ],
+ "metadata": {
+ "id": "VbIE0Bv8jxtN"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#@markdown Run the WebUI with all features. \n",
+ "#@markdown When loaded click the second link to launch WebUI in another window.\n",
+ "!python bark_webui.py --share"
+ ],
+ "metadata": {
+ "cellView": "form",
+ "id": "BQfEqnxMpUk1"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "OTRtNy1xT1sI"
+ },
+ "source": [
+ "## 2.0 - Manual generation\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "LKLe_gYkQ59l"
+ },
+ "source": [
+ "### 2.1 - Choose Bark Models and set Text and Other Generation Options\n",
+ "\n",
+ "Required for 3.0 and 4.0"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Time to complete cell: ca. 3min\n",
+ "generation.OFFLOAD_CPU = False # On your home system set to True probably, but Colab GPU should have plenty of memory for all three models\n",
+ "generation.preload_models() # Optional, will lazy load if not preloaded. First time run in New Colab has to download models"
+ ],
+ "metadata": {
+ "id": "QLa2jPOUjSyd"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "nTzF9iamO1Tm"
+ },
+ "outputs": [],
+ "source": [
+ "text = \"\"\"\n",
+ "Hey, have you heard about this new text-to-audio model called \"Bark\"?\n",
+ "It's like rain on your wedding day. It's a free ride when you've already paid. It's the good advice that you just didn't take.\n",
+ "And who would've thought? It figures.\n",
+ "\n",
+ "Well, life has a funny way of sneaking up on you. When you think everything's okay and everything's going right.\n",
+ "And life has a funny way of helping you out. When you think everything's gone wrong.\n",
+ "And everything blows up in your face.\n",
+ "\n",
+ "It's a traffic jam when you're already late. A \"No smoking\" sign on your cigarette break.\n",
+ "It's like ten thousand spoons when all you need is a knife. It's meeting the man of my dreams.\n",
+ "And then meeting his beautiful wife.\n",
+ "\n",
+ "And isn't it ironic? Don't you think? A little too ironic.\n",
+ "And yeah, I really do think.\n",
+ "\"\"\"\n",
+ "\n",
+ "# For split set split_character_goal_length and split_character_max_length\n",
+ "kwargs = {}\n",
+ "\n",
+ "kwargs = config.load_all_defaults()\n",
+ "kwargs['text_prompt'] = text\n",
+ "kwargs['hoarder_mode'] = True\n",
+ "kwargs[\"output_dir\"] = 'bark_samples'\n",
+ "kwargs[\"history_prompt\"] = None\n",
+ "# kwargs[\"single_starting_seed\"] = None #\n",
+ "# If you set seed you might want manually call generation.set_seed(-1) after to disable deterministic generation settings\n",
+ "# I'm not cleaning up after this paramater at the moment and I'm not sure on other side effects\n",
+ "kwargs[\"stable_mode_interval\"] = 1 # 0 for continous, 2,3,4 for mixed\n",
+ "kwargs[\"split_character_goal_length\"] = 90\n",
+ "kwargs[\"split_character_max_length\"] = 130\n",
+ "# kwargs[\"output_iterations\"] = 1\n",
+ "kwargs[\"add_silence_between_segments\"] = .025 # See: https://github.com/suno-ai/bark/blob/main/notebooks/long_form_generation.ipynb but not great for songs or stable_mode_interval 0\n",
+ "kwargs[\"semantic_min_eos_p\"] = 0.05 # 0.20 is default, lower means more likely to stotp\n",
+ "\n",
+ "\n",
+ "# not sure on overall effect so far from these, but for example:\n",
+ "kwargs[\"semantic_top_k\"] = 50\n",
+ "kwargs[\"semantic_top_p\"] = 0.95"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### 3.0 First Attempt"
+ ],
+ "metadata": {
+ "id": "uuqkchecXnbm"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "#### 3.2 Before we run, let's double check out settings"
+ ],
+ "metadata": {
+ "id": "6ANm93mHZIa6"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "6SPmYLSoQBBp"
+ },
+ "outputs": [],
+ "source": [
+ "kwargs[\"dry_run\"] = True # Check how the text is being split, don't actually run the model.\n",
+ "full_generation_segments, audio_arr_segments, final_filename_will_be, _ = api.generate_audio_long(**kwargs)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "IEFQgcemX7Ih"
+ },
+ "outputs": [],
+ "source": [
+ "# that's the output we expect to see, we didn't generate audio yet\n",
+ "# these text segments look a little small small so let's try this instead\n",
+ "kwargs[\"split_character_goal_length\"] = 110\n",
+ "kwargs[\"split_character_max_length\"] = 175\n",
+ "\n",
+ "full_generation_segments, audio_arr_segments, final_filename_will_be, _ = api.generate_audio_long(**kwargs)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "#### 3.2 Run Bark"
+ ],
+ "metadata": {
+ "id": "iPwaLVKCZNaN"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "gnvv0zEZY7vP"
+ },
+ "outputs": [],
+ "source": [
+ "# These segement sizes look better so now so set dry_run to False to run for real\n",
+ "# Because we set hoarder_mode we can see the wav files for each segment in the Colab File Manager\n",
+ "\n",
+ "kwargs[\"dry_run\"] = False\n",
+ "full_generation_segments, audio_arr_segments, final_filename_will_be, _ = api.generate_audio_long(**kwargs)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "#### 3.3 Save and list files + playbutton"
+ ],
+ "metadata": {
+ "id": "pGeaoAZ6Y9yX"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "Qqyw-Uk1axiC"
+ },
+ "outputs": [],
+ "source": [
+ "print(f\"Final audiofile: {final_filename_will_be}\")\n",
+ "# (we see many wav because we set hoarder_mode, but one file will be the final product\n",
+ "# set hoarder_mode=False if you just want the file wav and aren't in explore mode\n",
+ "\n",
+ "# or play here\n",
+ "Audio(np.concatenate(audio_arr_segments), rate=generation.SAMPLE_RATE)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# because we set hoarder mode we also saved each segement as its own seperate sample with wav\n",
+ "\n",
+ "!find \"bark_samples/\" -name \"*.npz\"\n",
+ "\n",
+ "display_mp4_files(\"bark_samples/\")"
+ ],
+ "metadata": {
+ "id": "yqORA8ajXMrk"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### 4.0 Second Attempt. Can we do better?"
+ ],
+ "metadata": {
+ "id": "psbR-0mxW4Dn"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "#### 4.1 settings"
+ ],
+ "metadata": {
+ "id": "g8vPUGahb0Ar"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# we used stable_mode_interval = 1, so the history_prompt does not evolve between segments\n",
+ "# even still the voices that are saved for each segment are one-generation different than the original history prompt\n",
+ "# this means they are a *little* bit different, and we may prefer one of them over the original\n",
+ "# for example maybe segment 2 was a little more clear, or had a particular emotion, we could use that segment's version as the speaker\n",
+ "# in the particular run I'm doing now, that segment ended with a little bit an interesting accent. I'm curious if I can bring that out more.\n",
+ "\n",
+ "# (should probably rename the file to something sensible though)\n",
+ "\n",
+ "kwargs[\"history_prompt\"] = \"/content/bark/bark_samples/Hey_have_you_he-23-0714-0743-27-SPK-random/002_Its_the_good_ad-23-0714-0744-51-SPK-random.mp4.npz\""
+ ],
+ "metadata": {
+ "id": "WQvyiM2sW5xC"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "#### 4.2 generate"
+ ],
+ "metadata": {
+ "id": "t95ZGI6kcNBq"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "kwargs[\"text_prompt\"] = f\"I'm speaker number two. I'm the best speaker. Also I'm a free spirit. Let me evolve my voice with every step. Here's my version.\"\n",
+ "kwargs[\"text_prompt\"] += text\n",
+ "kwargs[\"stable_mode_interval\"] = 0\n",
+ "kwargs[\"output_dir\"] = \"speaker_2_test\"\n",
+ "kwargs[\"add_silence_between_segments\"] = 0.0 # No silence, fully merge clips\n",
+ "\n",
+ "kwargs[\"semantic_min_eos_p\"] = 0.20 # Back to default, let Bark umm and ahh a bit\n",
+ "full_generation_segments, audio_arr_segments, final_filename_will_be, _ = api.generate_audio_long(**kwargs)\n"
+ ],
+ "metadata": {
+ "id": "btp4V86BfAT1"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "#### 4.3 Save and list files + playbutton"
+ ],
+ "metadata": {
+ "id": "GrXY5zfscCwi"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "print(f\"Final audiofile: {final_filename_will_be}\")\n",
+ "Audio(np.concatenate(audio_arr_segments), rate=generation.SAMPLE_RATE)"
+ ],
+ "metadata": {
+ "id": "9fthW9oGw-5g"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# this clip probably got really weird after a full segments, fully feedbacking into itself. So kwargs[\"stable_mode_interval\"] = 3 might be a good compromise\n",
+ "\n",
+ "display_mp4_files(\"speaker_2_test\")"
+ ],
+ "metadata": {
+ "id": "Rlv4CQrfthY8"
+ },
+ "execution_count": null,
+ "outputs": []
+ }
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "collapsed_sections": [
+ "HJQ4TI0_Qowr",
+ "FHlxNCt3QwIr",
+ "uuqkchecXnbm"
+ ],
+ "provenance": [],
+ "machine_shape": "hm",
+ "gpuType": "T4",
+ "include_colab_link": true
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/notebooks/Bark_Testing.ipynb b/notebooks/Bark_Testing.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..d25d4428347e352d9ab6af7c79561733fae13b2b
--- /dev/null
+++ b/notebooks/Bark_Testing.ipynb
@@ -0,0 +1,8826 @@
+{
+ "cells": [
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "HJQ4TI0_Qowr"
+ },
+ "source": [
+ "## Setup Notebook, Install\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "r8wG_tIaOV0Q",
+ "outputId": "6ea4ba04-3578-41e7-f9b5-4a44ca654452"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "fatal: destination path 'bark' already exists and is not an empty directory.\n",
+ "/home/jon/mamba_projects/bark_postfixes/bark/bark\n",
+ "\u001b[31mERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements-pip.txt'\u001b[0m\u001b[31m\n",
+ "\u001b[0mRequirement already satisfied: encodec in /home/jon/mambaforge/envs/bark-infinity-oneclick/lib/python3.10/site-packages (0.1.1)\n",
+ "Requirement already satisfied: rich-argparse in /home/jon/mambaforge/envs/bark-infinity-oneclick/lib/python3.10/site-packages (1.1.0)\n",
+ "Requirement already satisfied: torch in /home/jon/mambaforge/envs/bark-infinity-oneclick/lib/python3.10/site-packages (from encodec) (2.0.0)\n",
+ "Requirement already satisfied: numpy in /home/jon/mambaforge/envs/bark-infinity-oneclick/lib/python3.10/site-packages (from encodec) (1.24.3)\n",
+ "Requirement already satisfied: einops in /home/jon/mambaforge/envs/bark-infinity-oneclick/lib/python3.10/site-packages (from encodec) (0.6.1)\n",
+ "Requirement already satisfied: torchaudio in /home/jon/mambaforge/envs/bark-infinity-oneclick/lib/python3.10/site-packages (from encodec) (2.0.0)\n",
+ "Requirement already satisfied: rich>=11.0.0 in /home/jon/mambaforge/envs/bark-infinity-oneclick/lib/python3.10/site-packages (from rich-argparse) (12.5.1)\n",
+ "Requirement already satisfied: pygments<3.0.0,>=2.6.0 in /home/jon/mambaforge/envs/bark-infinity-oneclick/lib/python3.10/site-packages (from rich>=11.0.0->rich-argparse) (2.11.2)\n",
+ "Requirement already satisfied: commonmark<0.10.0,>=0.9.0 in /home/jon/mambaforge/envs/bark-infinity-oneclick/lib/python3.10/site-packages (from rich>=11.0.0->rich-argparse) (0.9.1)\n",
+ "Requirement already satisfied: filelock in /home/jon/mambaforge/envs/bark-infinity-oneclick/lib/python3.10/site-packages (from torch->encodec) (3.9.0)\n",
+ "Requirement already satisfied: typing-extensions in /home/jon/mambaforge/envs/bark-infinity-oneclick/lib/python3.10/site-packages (from torch->encodec) (4.5.0)\n",
+ "Requirement already satisfied: sympy in /home/jon/mambaforge/envs/bark-infinity-oneclick/lib/python3.10/site-packages (from torch->encodec) (1.11.1)\n",
+ "Requirement already satisfied: networkx in /home/jon/mambaforge/envs/bark-infinity-oneclick/lib/python3.10/site-packages (from torch->encodec) (2.8.4)\n",
+ "Requirement already satisfied: jinja2 in /home/jon/mambaforge/envs/bark-infinity-oneclick/lib/python3.10/site-packages (from torch->encodec) (3.1.2)\n",
+ "Requirement already satisfied: MarkupSafe>=2.0 in /home/jon/mambaforge/envs/bark-infinity-oneclick/lib/python3.10/site-packages (from jinja2->torch->encodec) (2.1.1)\n",
+ "Requirement already satisfied: mpmath>=0.19 in /home/jon/mambaforge/envs/bark-infinity-oneclick/lib/python3.10/site-packages/mpmath-1.2.1-py3.10.egg (from sympy->torch->encodec) (1.2.1)\n",
+ "^C\n",
+ "\u001b[31mERROR: Operation cancelled by user\u001b[0m\u001b[31m\n",
+ "\u001b[0m"
+ ]
+ }
+ ],
+ "source": [
+ "!git clone https://github.com/JonathanFly/bark.git\n",
+ "%cd bark\n",
+ "!pip install -r requirements-pip.txt\n",
+ "!pip install encodec rich-argparse\n",
+ "!pip install librosa pydub"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "FHlxNCt3QwIr"
+ },
+ "source": [
+ "## Run Once Per Notebook Restart (if files still exist)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "jKTvqvVkOwXM"
+ },
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import time\n",
+ "from bark_infinity import config\n",
+ "import numpy as np\n",
+ "\n",
+ "logger = config.logger\n",
+ "logger.setLevel(\"WARNING\")\n",
+ "\n",
+ "from bark_infinity import generation\n",
+ "from bark_infinity import api\n",
+ "\n",
+ "import rich\n",
+ "from rich import print\n",
+ "from rich import pretty\n",
+ "from rich.pretty import pprint\n",
+ "from rich import inspect\n",
+ "\n",
+ "from pydub import AudioSegment\n",
+ "import ipywidgets as widgets\n",
+ "from IPython.display import display, Audio\n",
+ "from io import BytesIO\n",
+ "\n",
+ "# None of this code, just fiddlign with Colab stuff\n",
+ "# Just to save Colab with outputs and float32 wavs are GIGANTO\n",
+ "# actually this doesn't work, the iPython widget converts it back to float32? or I messed up\n",
+ "\n",
+ "def display_audio_int16_but(audio_arr_segments, file_name, sample_rate=generation.SAMPLE_RATE, width='200px'):\n",
+ " file_name_label = widgets.Label(value=f\"Playing: {file_name}\")\n",
+ " file_name_label.layout.width = width\n",
+ " audio_data_int16 = audio_arr_segments\n",
+ " if isinstance(audio_data_int16, list):\n",
+ " audio_data_int16 = np.concatenate(audio_data_int16) \n",
+ "\n",
+ " #audio_data_int16 = np.int16(audio_data_int16 * np.iinfo(np.int16).max)\n",
+ "\n",
+ "\n",
+ " audio_widget = Audio(audio_data_int16, rate=sample_rate)\n",
+ " display(file_name_label, audio_widget)\n",
+ " \n",
+ "\n",
+ "def on_button_click(button):\n",
+ " audio_data, sample_rate = librosa.load(button.wav_path, sr=None)\n",
+ " file_name = os.path.basename(button.wav_path)\n",
+ " display_audio_int16_but(audio_data,file_name, sample_rate)\n",
+ "\n",
+ "\n",
+ "def display_wav_files(directory):\n",
+ " subdirs, wav_files = [], []\n",
+ " \n",
+ " for item in os.listdir(directory):\n",
+ " item_path = os.path.join(directory, item)\n",
+ " \n",
+ " if os.path.isfile(item_path) and item_path.endswith('.wav'):\n",
+ " wav_files.append(item_path)\n",
+ " elif os.path.isdir(item_path):\n",
+ " subdirs.append(item_path)\n",
+ "\n",
+ " wav_files.sort(key=lambda x: os.path.basename(x))\n",
+ "\n",
+ " for wav_file in wav_files:\n",
+ "\n",
+ " filename = os.path.basename(wav_file)\n",
+ " print(f\" {filename}\")\n",
+ " button = widgets.Button(description=f\"Play {filename}\")\n",
+ " button.wav_path = wav_file \n",
+ " button.on_click(on_button_click)\n",
+ " display(button)\n",
+ "\n",
+ " for subdir in sorted(subdirs):\n",
+ " print(f\"<{subdir}>\")\n",
+ " display_wav_files(subdir)\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "\n",
+ "def call_with_non_none_params(func, **kwargs):\n",
+ " non_none_params = {key: value for key, value in kwargs.items() if value is not None}\n",
+ " return func(**non_none_params)\n",
+ "\n",
+ "global save_semantic_tokens\n",
+ "def generate_audio_barki(\n",
+ " text: str,\n",
+ " **kwargs,\n",
+ "):\n",
+ " \"\"\"Generate audio array from input text.\n",
+ "\n",
+ " Args:\n",
+ " text: text to be turned into audio\n",
+ " history_prompt: history choice for audio cloning\n",
+ " text_temp: generation temperature (1.0 more diverse, 0.0 more conservative)\n",
+ " waveform_temp: generation temperature (1.0 more diverse, 0.0 more conservative)\n",
+ " silent: disable progress bar\n",
+ " output_full: return full generation to be used as a history prompt\n",
+ "\n",
+ "\n",
+ " Returns:\n",
+ " numpy audio array at sample frequency 24khz\n",
+ " \"\"\"\n",
+ " #logger.debug(locals())\n",
+ " #print(\"before load all defaults\")\n",
+ " kwargs = config.load_all_defaults(**kwargs)\n",
+ "\n",
+ " #logger.debug(locals())\n",
+ " #print(\"after load all defaults\")\n",
+ " history_prompt = kwargs.get(\"history_prompt\", None)\n",
+ " text_temp = kwargs.get(\"text_temp\", None)\n",
+ " waveform_temp = kwargs.get(\"waveform_temp\", None)\n",
+ " silent = kwargs.get(\"silent\", None)\n",
+ " output_full = kwargs.get(\"output_full\", None)\n",
+ "\n",
+ " global gradio_try_to_cancel\n",
+ " global done_cancelling\n",
+ "\n",
+ " seed = kwargs.get(\"seed\",None)\n",
+ " if seed is not None:\n",
+ " generation.set_seed(seed)\n",
+ "\n",
+ " ## TODO seperate stage seeds\n",
+ "\n",
+ " ## Semantic Options\n",
+ " semantic_temp = text_temp\n",
+ " if kwargs.get(\"semantic_temp\", None):\n",
+ " semantic_temp = kwargs.get(\"semantic_temp\")\n",
+ "\n",
+ " semantic_seed = kwargs.get(\"semantic_seed\",None)\n",
+ " if semantic_seed is not None:\n",
+ " generation.set_seed(semantic_seed)\n",
+ "\n",
+ "\n",
+ " if api.gradio_try_to_cancel:\n",
+ " done_cancelling = True\n",
+ " return None, None\n",
+ " semantic_tokens = call_with_non_none_params(\n",
+ " api.generate_text_semantic,\n",
+ " text=text,\n",
+ " history_prompt=history_prompt,\n",
+ " temp=semantic_temp,\n",
+ " top_k=kwargs.get(\"semantic_top_k\", None),\n",
+ " top_p=kwargs.get(\"semantic_top_p\", None),\n",
+ " silent=silent,\n",
+ " min_eos_p = kwargs.get(\"semantic_min_eos_p\", None),\n",
+ " max_gen_duration_s = kwargs.get(\"semantic_max_gen_duration_s\", None),\n",
+ " allow_early_stop = kwargs.get(\"semantic_allow_early_stop\", True),\n",
+ " use_kv_caching=kwargs.get(\"semantic_use_kv_caching\", True),\n",
+ " )\n",
+ " \n",
+ " inspect(semantic_tokens)\n",
+ " if api.gradio_try_to_cancel:\n",
+ " done_cancelling = True\n",
+ " return None, None\n",
+ "\n",
+ " ## Coarse Options\n",
+ " coarse_temp = waveform_temp\n",
+ " if kwargs.get(\"coarse_temp\", None):\n",
+ " coarse_temp = kwargs.get(\"coarse_temp\")\n",
+ "\n",
+ " coarse_seed = kwargs.get(\"coarse_seed\",None)\n",
+ " if coarse_seed is not None:\n",
+ " generation.set_seed(coarse_seed)\n",
+ " \n",
+ " \n",
+ " if api.gradio_try_to_cancel:\n",
+ " done_cancelling = True\n",
+ " return None, None\n",
+ " \n",
+ " coarse_tokens = call_with_non_none_params(\n",
+ " generation.generate_coarse,\n",
+ " x_semantic=semantic_tokens,\n",
+ " history_prompt=history_prompt,\n",
+ " temp=coarse_temp,\n",
+ " top_k=kwargs.get(\"coarse_top_k\", None),\n",
+ " top_p=kwargs.get(\"coarse_top_p\", None),\n",
+ " silent=silent,\n",
+ " max_coarse_history=kwargs.get(\"coarse_max_coarse_history\", None),\n",
+ " sliding_window_len=kwargs.get(\"coarse_sliding_window_len\", None),\n",
+ " use_kv_caching=kwargs.get(\"coarse_kv_caching\", True),\n",
+ " )\n",
+ "\n",
+ " fine_temp = kwargs.get(\"fine_temp\", 0.5)\n",
+ "\n",
+ " fine_seed = kwargs.get(\"fine_seed\",None)\n",
+ " if fine_seed is not None:\n",
+ " generation.set_seed(fine_seed)\n",
+ "\n",
+ " if api.gradio_try_to_cancel:\n",
+ " done_cancelling = True\n",
+ " return None, None\n",
+ " fine_tokens = call_with_non_none_params(\n",
+ " api.generate_fine,\n",
+ " x_coarse_gen=coarse_tokens,\n",
+ " history_prompt=history_prompt,\n",
+ " temp=fine_temp,\n",
+ " silent=silent,\n",
+ " )\n",
+ "\n",
+ " # do we ever care about setting this seed? Probably not? You can always just decode it again\n",
+ "\n",
+ " if api.gradio_try_to_cancel:\n",
+ " done_cancelling = True\n",
+ " return None, None\n",
+ " audio_arr = api.codec_decode(fine_tokens)\n",
+ " full_generation = {\n",
+ " \"semantic_prompt\": semantic_tokens,\n",
+ " \"coarse_prompt\": coarse_tokens,\n",
+ " \"fine_prompt\": fine_tokens,\n",
+ " }\n",
+ "\n",
+ " if api.gradio_try_to_cancel:\n",
+ " done_cancelling = True\n",
+ " return None, None\n",
+ " \n",
+ " hoarder_mode = kwargs.get(\"hoarder_mode\", None)\n",
+ " total_segments = kwargs.get(\"total_segments\", 1)\n",
+ " if hoarder_mode and (total_segments > 1):\n",
+ " kwargs[\"text\"] = text\n",
+ " api.write_one_segment(audio_arr, full_generation, **kwargs)\n",
+ "\n",
+ " if output_full:\n",
+ " return full_generation, audio_arr\n",
+ " \n",
+ " return audio_arr\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "/home/jon/mamba_projects/bark_postfixes/bark\n"
+ ]
+ }
+ ],
+ "source": [
+ "!pwd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
() Segment Breakdown \n",
+ "โโโโโณโโโโโโโโณโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n",
+ "โ # โ Words โ Time Est โ Splitting long text aiming for 145 chars max 190 โ\n",
+ "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n",
+ "โ 1 โ 16 โ 6.40 s โ With me, brave the tumultuous seas, claim treasures untold, and send foes to their โ\n",
+ "โโโ 94 chars โ watery doom โ\n",
+ "โโโโโดโโโโโโโโดโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n",
+ "
\n"
+ ],
+ "text/plain": [
+ "\u001b[3m () Segment Breakdown \u001b[0m\n",
+ "โโโโโณโโโโโโโโณโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n",
+ "โ\u001b[1m \u001b[0m\u001b[1m#\u001b[0m\u001b[1m \u001b[0mโ\u001b[1m \u001b[0m\u001b[1mWords\u001b[0m\u001b[1m \u001b[0mโ\u001b[1m \u001b[0m\u001b[1mTime Est\u001b[0m\u001b[1m \u001b[0mโ\u001b[1m \u001b[0m\u001b[1mSplitting long text aiming for 145 chars max 190 \u001b[0m\u001b[1m \u001b[0mโ\n",
+ "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n",
+ "โ\u001b[35m \u001b[0m\u001b[35m1\u001b[0m\u001b[35m \u001b[0mโ\u001b[32m \u001b[0m\u001b[32m16 \u001b[0m\u001b[32m \u001b[0mโ\u001b[32m \u001b[0m\u001b[32m6.40 s \u001b[0m\u001b[32m \u001b[0mโ With me, brave the tumultuous seas, claim treasures untold, and send foes to their โ\n",
+ "โ\u001b[35m \u001b[0mโ\u001b[32m \u001b[0mโ\u001b[32m \u001b[0m\u001b[32m94 chars\u001b[0m\u001b[32m \u001b[0mโ watery doom โ\n",
+ "โโโโโดโโโโโโโโดโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "
write_seg_npz .npz saved to long/With_me_brave_t-SPK-hark_woman_3.wav_initial_prompt.npz api.py:316\n",
+ "
write_seg_wav .wav saved to custom_speakers/en_fiery_1.wav api.py:325\n",
+ "
\n"
+ ],
+ "text/plain": [
+ "write_seg_wav .wav saved to custom_speakers/en_fiery_1.wav \u001b]8;id=763391;file:///home/jon/mamba_projects/bark_postfixes/bark/bark_infinity/api.py\u001b\\\u001b[2mapi.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=14160;file:///home/jon/mamba_projects/bark_postfixes/bark/bark_infinity/api.py#325\u001b\\\u001b[2m325\u001b[0m\u001b]8;;\u001b\\\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "api.render_npz_samples(\"custom_speakers\", start_from=\"semantic_prompt\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "Audio(audio_arr_segments_barki, rate=generation.SAMPLE_RATE) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "generation.preload_models()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "text = \"\"\"\n",
+ "Hark! I, the phantom visage of Edward Teach, rise from the \n",
+ "abyss, forever bound to the briny depths. With me, brave the tumultuous seas, claim treasures untold,\n",
+ "and send foes to their watery doom.\n",
+ "\"\"\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "history_prompt = np.load(\"pirates/base/pirate.npz\")\n",
+ "from rich import inspect"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for key in history_prompt.keys():\n",
+ " length = len(history_prompt[key])\n",
+ " print(f\"key: {key}, length: {length}\")\n",
+ " inspect(history_prompt[key], title=f\"{key} ({length})\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "new_semantic = np.hstack([x_semantic_history, x_semantic]).astype(np.int32)\n",
+ "\n",
+ "semantic_prompt = history_prompt[\"semantic_prompt\"]\n",
+ "midpoint = len(semantic_prompt) // 2\n",
+ "new_semantic_first_half = semantic_prompt[:midpoint].astype(np.int32)\n",
+ "\n",
+ "\n",
+ "Instead I would like new_semantic to be half the size of x_semantic_history, just the last half of the space."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ " if length > 0:\n",
+ " for sub_key in history_prompt[key].keys():\n",
+ " print(f\" {sub_key}={history_prompt[key][sub_key]}\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "gen_minor_variants = 20\n",
+ "import random\n",
+ "\n",
+ "npz_file = \"pirate.npz\"\n",
+ "npz_directory = \"pirates/base\"\n",
+ "npz_filepath = \"pirates/base/pirate.npz\"\n",
+ "\n",
+ "semantic_prompt = history_prompt[\"semantic_prompt\"]\n",
+ "original_semantic_prompt = semantic_prompt.copy()\n",
+ "starting_point = 128\n",
+ "ending_point = len(semantic_prompt) - starting_point\n",
+ "\n",
+ "\n",
+ "\n",
+ "points = np.linspace(starting_point, ending_point, gen_minor_variants)\n",
+ " \n",
+ "for starting_point in points:\n",
+ " starting_point = int(starting_point)\n",
+ " print(starting_point)\n",
+ "\n",
+ " new_semantic_from_beginning = original_semantic_prompt[:starting_point].astype(np.int32)\n",
+ " new_semantic_from_ending = original_semantic_prompt[starting_point:].astype(np.int32)\n",
+ "\n",
+ " for semantic_prompt in [new_semantic_from_beginning, new_semantic_from_ending]:\n",
+ " \n",
+ " print(f\"len(semantic_prompt): {len(semantic_prompt)}\")\n",
+ " print(f\"starting_point: {starting_point}, ending_poinst: {ending_point}\") \n",
+ "\n",
+ " temp_coarse = random.uniform(0.5, 0.9)\n",
+ " top_k_coarse = None if random.random() < 1/3 else random.randint(50, 100)\n",
+ " top_p_coarse = None if random.random() < 1/3 else random.uniform(0.8, 0.95)\n",
+ "\n",
+ " max_coarse_history_options = [630, random.randint(500, 630), random.randint(60, 500)]\n",
+ " max_coarse_history = random.choice(max_coarse_history_options)\n",
+ "\n",
+ " coarse_tokens = generation.generate_coarse(semantic_prompt, temp=temp_coarse, top_k=top_k_coarse, top_p=top_p_coarse, max_coarse_history=max_coarse_history)\n",
+ "\n",
+ " temp_fine = random.uniform(0.3, 0.7)\n",
+ " fine_tokens = generation.generate_fine(coarse_tokens, temp=temp_fine)\n",
+ "\n",
+ " history_prompt_render_variant = {\"semantic_prompt\": semantic_prompt, \"coarse_prompt\": coarse_tokens, \"fine_prompt\": fine_tokens}\n",
+ "\n",
+ " try:\n",
+ " audio_arr = generation.codec_decode(fine_tokens)\n",
+ " base_output_filename = os.path.splitext(npz_file)[0] + f\"_var_{i}.wav\"\n",
+ " output_filepath = os.path.join(npz_directory, base_output_filename)\n",
+ " output_filepath = api.generate_unique_filepath(output_filepath)\n",
+ " print(f\" Rendering minor variant voice audio for {npz_filepath} to {output_filepath}\")\n",
+ " api.write_seg_wav(output_filepath, audio_arr)\n",
+ "\n",
+ " api.write_seg_npz(output_filepath, history_prompt_render_variant)\n",
+ " except:\n",
+ " print(f\" \")"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "yPr_MuqFXZ5r"
+ },
+ "source": [
+ "### Set Text and Other Generation Options"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "nTzF9iamO1Tm"
+ },
+ "outputs": [],
+ "source": [
+ "text = \"\"\"\n",
+ "Hey, have you heard about this new text-to-audio model called \"Bark\"? \n",
+ "It's like rain on your wedding day. It's a free ride when you've already paid. It's the good advice that you just didn't take.\n",
+ "And who would've thought? It figures.\n",
+ "\n",
+ "Well, life has a funny way of sneaking up on you. When you think everything's okay and everything's going right. \n",
+ "And life has a funny way of helping you out. When you think everything's gone wrong. \n",
+ "And everything blows up in your face.\n",
+ "\n",
+ "It's a traffic jam when you're already late. A \"No smoking\" sign on your cigarette break.\n",
+ "It's like ten thousand spoons when all you need is a knife. It's meeting the man of my dreams.\n",
+ "And then meeting his beautiful wife.\n",
+ "\n",
+ "And isn't it ironic? Don't you think? A little too ironic.\n",
+ "And yeah, I really do think.\n",
+ "\"\"\"\n",
+ "\n",
+ "# FOr split set split_character_goal_length and split_character_max_length\n",
+ "kwargs = {}\n",
+ "\n",
+ "kwargs = config.load_all_defaults()\n",
+ "kwargs['text_prompt'] = text\n",
+ "kwargs['hoarder_mode'] = True\n",
+ "kwargs[\"output_dir\"] = 'bark_samples'\n",
+ "kwargs[\"history_prompt\"] = None\n",
+ "# kwargs[\"single_starting_seed\"] = None # \n",
+ "# If you set seed you might want manually call generation.set_seed(-1) after to disable deterministic generation settings \n",
+ "# I'm not cleaning up after this paramater at the moment and I'm not sure on other side effects\n",
+ "kwargs[\"stable_mode_interval\"] = 1 # 0 for continous, 2,3,4 for mixed\n",
+ "kwargs[\"split_character_goal_length\"] = 90\n",
+ "kwargs[\"split_character_max_length\"] = 130\n",
+ "# kwargs[\"output_iterations\"] = 1\n",
+ "kwargs[\"add_silence_between_segments\"] = .025 # See: https://github.com/suno-ai/bark/blob/main/notebooks/long_form_generation.ipynb but not great for songs or stable_mode_interval 0\n",
+ "kwargs[\"semantic_min_eos_p\"] = 0.05 # 0.20 is default, lower means more likely to stotp\n",
+ "\n",
+ "\n",
+ "# not sure on overall effect so far from these, but for example:\n",
+ "kwargs[\"semantic_top_k\"] = 50\n",
+ "kwargs[\"semantic_top_p\"] = 0.95"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "uuqkchecXnbm"
+ },
+ "source": [
+ "### First Attempt"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "6ANm93mHZIa6"
+ },
+ "source": [
+ "#### Before we run, let's double check out settings"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 582
+ },
+ "id": "6SPmYLSoQBBp",
+ "outputId": "d896cba0-fc95-4343-ff59-57d822a71467"
+ },
+ "outputs": [],
+ "source": [
+ "kwargs[\"dry_run\"] = True # Check how the text is being split, don't actually run the model. \n",
+ "full_generation_segments, audio_arr_segments, final_filename_will_be = api.generate_audio_long(**kwargs)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 582
+ },
+ "id": "IEFQgcemX7Ih",
+ "outputId": "49e553d8-2b3d-43e4-9c8a-62cea105bb86"
+ },
+ "outputs": [],
+ "source": [
+ "# that's the output we expect to see, we didn't generate audio yet\n",
+ "# these text segments look a little small small so let's try this instead\n",
+ "kwargs[\"split_character_goal_length\"] = 110\n",
+ "kwargs[\"split_character_max_length\"] = 175\n",
+ "\n",
+ "full_generation_segments, audio_arr_segments, final_filename_will_be = api.generate_audio_long(**kwargs)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "iPwaLVKCZNaN"
+ },
+ "source": [
+ "#### Run Bark"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 908
+ },
+ "id": "gnvv0zEZY7vP",
+ "outputId": "5a501a72-a2d0-4a66-8ab9-ab876cf6a10f"
+ },
+ "outputs": [],
+ "source": [
+ "# These segement sizes look better so now so set dry_run to False to run for real\n",
+ "# Because we set hoarder_mode we can see the wav files for each segment in the Colab File Manager\n",
+ "\n",
+ "kwargs[\"dry_run\"] = False\n",
+ "full_generation_segments, audio_arr_segments, final_filename_will_be = api.generate_audio_long(**kwargs)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "Qqyw-Uk1axiC"
+ },
+ "outputs": [],
+ "source": [
+ "print(f\" final wav at {final_filename_will_be} \")\n",
+ "# (we see many wav because we set hoarder_mode, but one file will be the final product\n",
+ "# set hoarder_mode=False if you just want the file wav and aren't in explore mode\n",
+ "\n",
+ "# or play here \n",
+ "Audio(np.concatenate(audio_arr_segments), rate=generation.SAMPLE_RATE) \n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 497,
+ "referenced_widgets": [
+ "8ea29b7225d541f080777684c195f999",
+ "a98dc53d118d496981706e6b51165c62",
+ "6caacddbe9c342a7b385dee13264b934",
+ "2abd2834c2e64c0fb4ca048e3e022949",
+ "c85a04777707427fb4feaa1a490bbf59",
+ "917113d8bf08419eaae8225762200a3b",
+ "71bbd6f43cf144f7ab36bed80b4a68d6",
+ "5d10427092ce494092dd22dde8b329f6",
+ "a3f6f1b2f7e84dd5a49a5581c0769a62",
+ "6b6084bdd35e4d178057706993d60140",
+ "7f4e5dff04df4aaeb093c8f0c82df3b3",
+ "9019d02759574f9e959c85ec2730ca59",
+ "db9a72ac819040338d9af5d9814820a0",
+ "25d7021cabf5433298269e42666f9a85",
+ "c812f5dc984444ad9dba077f0e64ef1c",
+ "da58ffbea7604844aa6c54815887445f",
+ "4221c0af5d7f4eed884d4647db6a4d7e",
+ "16c5ddc2cd62477e80085a7f969ea4e0",
+ "6d939e0401824fa0a02a00e1358504ad",
+ "a32ffcd3be074dc7be0a6e2d65b0c4bb",
+ "fe89cce3a1eb4ef59210501f8e853cf2"
+ ]
+ },
+ "id": "yqORA8ajXMrk",
+ "outputId": "1c9960b5-da6c-458a-9af2-d8a814db0b99"
+ },
+ "outputs": [],
+ "source": [
+ "# because we set hoarder mode we also saved each segement as its own seperate sample with wav\n",
+ "\n",
+ "!find \"bark_samples/\" -name \"*.npz\"\n",
+ "\n",
+ "display_wav_files(\"bark_samples/\")"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "psbR-0mxW4Dn"
+ },
+ "source": [
+ "### Second Attempt. Can we do better?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "WQvyiM2sW5xC"
+ },
+ "outputs": [],
+ "source": [
+ "# we used stable_mode_interval = 1, so the history_prompt does not evolve between segments\n",
+ "# even still the voices that are saved for each segment are one-generation different than the original history prompt\n",
+ "# this means they are a *little* bit different, and we may prefer one of them over the original\n",
+ "# for example maybe segment 2 was a little more clear, or had a particular emotion, we could use that segment's version as the speaker\n",
+ "# in the particular run I'm doing now, that segment ended with a little bit an interesting accent. I'm curious if I can bring that out more.\n",
+ "\n",
+ "# (should probably rename the file to something sensible though)\n",
+ "\n",
+ "kwargs[\"history_prompt\"] = \"/content/bark/bark_samples/Hey_have_you_heard_a-SPK-random.wav/002_Its_the_good_advice_-SPK-random.wav.npz\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 1000
+ },
+ "id": "btp4V86BfAT1",
+ "outputId": "0dc09a5e-baca-450f-c7e9-63302a974bea"
+ },
+ "outputs": [],
+ "source": [
+ "kwargs[\"text_prompt\"] = f\"I'm speaker number two. I'm the best speaker. Also I'm a free spirit. Let me evolve my voice with every step. Here's my version.\"\n",
+ "kwargs[\"text_prompt\"] += text\n",
+ "kwargs[\"stable_mode_interval\"] = 0 \n",
+ "kwargs[\"output_dir\"] = \"speaker_2_test\"\n",
+ "kwargs[\"add_silence_between_segments\"] = 0.0 # No silence, fully merge clips\n",
+ "\n",
+ "kwargs[\"semantic_min_eos_p\"] = 0.20 # Back to default, let Bark umm and ahh a bit\n",
+ "full_generation_segments, audio_arr_segments, final_filename_will_be = api.generate_audio_long(**kwargs)\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "9fthW9oGw-5g"
+ },
+ "outputs": [],
+ "source": [
+ "print(f\" final wave at {final_filename_will_be}\")\n",
+ "Audio(np.concatenate(audio_arr_segments), rate=generation.SAMPLE_RATE) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 419,
+ "referenced_widgets": [
+ "84a7d2a44cff4af0a7ac51201a451c2c",
+ "6ea50794f0df4129a5ccc7c736a177c8",
+ "500a8ffd11ba44e781ba9bd30cf01798",
+ "34f31ddd3abf4912ad4b4d1b6db17c0a",
+ "0bcae29b25c24fb7b2304394c5e4b5ff",
+ "c4665df69acf4675ac6aa728ece70631",
+ "f11e33866bae4f97afd7d8be04f69d08",
+ "ba676a7a9b4540f08199903d9d83fea8",
+ "d95d5fec436040698a5b4236f3d5d81f",
+ "bfb0c1f37e4541aa938041ceb9bfacc2",
+ "8aec7773934a4651a88ba343c1a6f6a6",
+ "e1a97cc9969c482ebe5ce0c9d1967989",
+ "18e25b6d233f4fbeb9fc9ed8e8d0c609",
+ "20a060f910274c75a04af4493000a478",
+ "d1798f0507c444248c68286d27c26458",
+ "8fb28922d6a24e97b5ebe582cc981b07",
+ "ee01a0665af84f35a3a9e779e140ef67",
+ "b595442996e94d9b87b1fd657ebc512a",
+ "8fd2ab5f1b704ac2bbc88a95ff60614a",
+ "000ef644370a46adb4bc9918ce248dc2",
+ "a3550d6eb97a45809978825e4b8b36a0",
+ "82165a42741a4baf8ffc795687a249de",
+ "471b04ef64584089b8526b7a2805f94f",
+ "e61f0582de914f0ca216966185139808"
+ ]
+ },
+ "id": "Rlv4CQrfthY8",
+ "outputId": "a759d618-ab34-4e72-f28b-fad64205550a"
+ },
+ "outputs": [],
+ "source": [
+ "# this clip probably got really weird after a full segments, fully feedbacking into itself. So kwargs[\"stable_mode_interval\"] = 3 might be a good compromise\n",
+ "\n",
+ "display_wav_files(\"speaker_2_test\")"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "z64RIV6Kc2_K"
+ },
+ "source": [
+ "### Finding Our Voice"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "Q_2NNCiso3vV"
+ },
+ "outputs": [],
+ "source": [
+ "# That final clip is an improvement, the random voice we got isn't bad but it's not quite doing our beautiful prose justice\n",
+ "# we could use an existing history_prompt, but let's try to summon a perfect speaker from the model\n",
+ "# we do that by generating many speakers randomly\n",
+ "# we could use our first segment text, in my experience there is a better method\n",
+ "# try to image: what type of text would be the context in voice I want to hear is likely to appear?\n",
+ "# then let's generate 20 sample clips from that\n",
+ "\n",
+ "# TODO"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "text=\"\"\"\n",
+ "How many fucken morons can wave his arms and keep people in tempo? \n",
+ "I was there to push people beyond what's expected of them. \n",
+ "I believe that is an absolute necessity. \n",
+ "Otherwise we're depriving the world of the next Louis Armstrong, \n",
+ "or the next Charlie Parker. \n",
+ "Have I told you that story about how Charlie Parker became Charlie Parker?\n",
+ "Parker's a young kid, pretty good on the Sax, \n",
+ "gets up to play at a cutting session, \n",
+ "and well, he fucks it up. \n",
+ "And Jones nearly decapitates him for it, throws a cymbal at his head. \n",
+ "And Charlie's laughed off stage. Cries himself to sleep that night. \n",
+ "But the next morning, what does he do? He practices. \n",
+ "And he practices and he practices and he practices. \n",
+ "With one goal in mind - never to be laughed at again. \n",
+ "And a year later he goes back to the Reno, \n",
+ "and he steps up on that stage \n",
+ "and he plays the best motherfucken solo the world has ever heard. \n",
+ "So imagine if Jones had just said, \n",
+ "\"Well that's okay, Charlie, that was alright. Good job.โ \n",
+ "And Charlie thinks to himself, โWell, shit I did do a pretty good job.โ \n",
+ "End of story. That to me is an absolute tragedy. \n",
+ "But that's just what the world wants now. \n",
+ "No wonder Jazz is dying.\n",
+ "\n",
+ "[He takes a sip of his drink.]\n",
+ "I tell you man, every Starbucks โJazzโ album, just proves my point really \n",
+ "- there are no two words more harmful in the English language than โGood jobโ.\n",
+ "\n",
+ "The truth is Andrew Iโฆ never really had a Charlie Parker. \n",
+ "But I tried. I actually fucking tried. \n",
+ "And that's more than most people ever do, \n",
+ "and I will never apologise for how I tried.\n",
+ "\"\"\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import re\n",
+ "def apply_rule_to_prompt(regex, replacement, flags, text):\n",
+ "\n",
+ " \n",
+ " re_flags = 0\n",
+ " if 'MULTILINE' in flags:\n",
+ " re_flags |= re.MULTILINE\n",
+ " return re.sub(regex, replacement, text, flags=re_flags)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "regex = \"[\\.?,]\"\n",
+ "replacement = \" \"\n",
+ "flags = \"MULTILINE\"\n",
+ "\n",
+ "print(apply_rule_to_prompt(regex, replacement, flags, text))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "metadata": {
+ "id": "Q0v6zLxmcr5W"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'\\ndef split_by_sentences(text: str, n: int, language=\"en\") -> List[str]:\\n seg = pysbd.Segmenter(language=language, clean=False)\\n sentences = seg.segment(text)\\n return [\\' \\'.join(sentences[i:i + n]) for i in range(0, len(sentences), n)]\\n'"
+ ]
+ },
+ "execution_count": 67,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from typing import List\n",
+ "import re\n",
+ "import random \n",
+ "from typing import Dict, Optional, Union\n",
+ "import logging\n",
+ "\n",
+ "import rich\n",
+ "from rich import print\n",
+ "from rich import pretty\n",
+ "from rich.pretty import pprint\n",
+ "from rich import inspect\n",
+ "\n",
+ "\n",
+ "\n",
+ "logger = logging.getLogger(__name__)\n",
+ "\n",
+ "def split_text(text: str, split_type: Optional[str] = None, split_type_quantity = 1, split_type_string: Optional[str] = None, split_type_value_type: Optional[str] = None) -> List[str]:\n",
+ " if text == '':\n",
+ " return [text]\n",
+ "\n",
+ " # the old syntax still works if you don't use this parameter, ie\n",
+ " # split_type line, split_type_value 4, splits into groups of 4 lines\n",
+ " if split_type_value_type == '':\n",
+ " split_type_value_type = split_type\n",
+ "\n",
+ " \"\"\"\n",
+ " if split_type == 'phrase':\n",
+ " # print(f\"Loading spacy to split by phrase.\")\n",
+ " nlp = spacy.load('en_core_web_sm')\n",
+ "\n",
+ " chunks = split_by_phrase(text, nlp)\n",
+ " # print(chunks)\n",
+ " return chunks\n",
+ " \"\"\"\n",
+ " if split_type == 'string' or split_type == 'regex':\n",
+ "\n",
+ " if split_type_string is None:\n",
+ " logger.warning(\n",
+ " f\"Splitting by {split_type} requires a string to split by. Returning original text.\")\n",
+ " return [text]\n",
+ "\n",
+ " split_type_to_function = {\n",
+ " 'word': split_by_words,\n",
+ " 'line': split_by_lines,\n",
+ " 'sentence': split_by_sentence,\n",
+ " 'string': split_by_string,\n",
+ " 'char' : split_by_char,\n",
+ " #'random': split_by_random,\n",
+ " # 'rhyme': split_by_rhymes,\n",
+ " # 'pos': split_by_part_of_speech,\n",
+ " 'regex': split_by_regex,\n",
+ " }\n",
+ "\n",
+ "\n",
+ "\n",
+ " if split_type in split_type_to_function:\n",
+ " # split into groups of 1 by the desired type\n",
+ " # this is so terrible even I'm embarassed, destroy all this code later, but I guess it does something useful atm\n",
+ " segmented_text = split_type_to_function[split_type](text, split_type = split_type, split_type_quantity=1, split_type_string=split_type_string, split_type_value_type=split_type_value_type)\n",
+ " final_segmented_text = []\n",
+ " current_segment = ''\n",
+ " split_type_quantity_found = 0\n",
+ "\n",
+ " if split_type_value_type is None:\n",
+ " split_type_value_type = split_type\n",
+ " \n",
+ " for seg in segmented_text: # for each line, for example, we can now split by 'words' or whatever, as a counter for when to break the group\n",
+ " current_segment += seg\n",
+ "\n",
+ " #print(split_type_to_function[split_type](current_segment, split_type=split_type_value_type, split_type_quantity=1, split_type_string=split_type_string))\n",
+ " split_type_quantity_found = len(split_type_to_function[split_type_value_type](current_segment, split_type=split_type_value_type, split_type_quantity=1, split_type_string=split_type_string))\n",
+ " #print(f\"I see {split_type_quantity_found} {split_type_value_type} in {current_segment}\")\n",
+ " if split_type_quantity_found >= split_type_quantity:\n",
+ " final_segmented_text.append(current_segment)\n",
+ " split_type_quantity_found = 0\n",
+ " current_segment = ''\n",
+ " \n",
+ " return final_segmented_text\n",
+ "\n",
+ " logger.warning(\n",
+ " f\"Splitting by {split_type} not a supported option. Returning original text.\")\n",
+ " return [text]\n",
+ "\n",
+ "def split_by_string(text: str, split_type: Optional[str] = None, split_type_quantity: Optional[int] = 1, split_type_string: Optional[str] = None, split_type_value_type: Optional[str] = None) -> List[str]:\n",
+ " if split_type_string is not None:\n",
+ " split_pattern = f\"({split_type_string})\"\n",
+ " split_list = re.split(split_pattern, text)\n",
+ " result = [split_list[0]]\n",
+ " for i in range(1, len(split_list), 2):\n",
+ " result.append(split_list[i] + split_list[i+1])\n",
+ " return result\n",
+ " else:\n",
+ " return text.split()\n",
+ "\n",
+ "def split_by_regex(text: str, split_type: Optional[str] = None, split_type_quantity: Optional[int] = 1, split_type_string: Optional[str] = None, split_type_value_type: Optional[str] = None) -> List[str]:\n",
+ " chunks = []\n",
+ " start = 0\n",
+ " if split_type_string is not None:\n",
+ " for match in re.finditer(split_type_string, text):\n",
+ " end = match.start()\n",
+ " chunks.append(text[start:end].strip())\n",
+ " start = end\n",
+ "\n",
+ " chunks.append(text[start:].strip())\n",
+ " return chunks\n",
+ " else:\n",
+ " return text.split()\n",
+ "\n",
+ "def split_by_char(text: str, split_type: Optional[str] = None, split_type_quantity = 1, split_type_string: Optional[str] = None, split_type_value_type: Optional[str] = None) -> List[str]:\n",
+ " return list(text)\n",
+ "\n",
+ "def split_by_words(text: str, split_type: Optional[str] = None, split_type_quantity = 1, split_type_string: Optional[str] = None, split_type_value_type: Optional[str] = None) -> List[str]:\n",
+ " \n",
+ " return [word + ' ' for word in text.split() if text.strip()]\n",
+ " #return [' '.join(words[i:i + split_type_quantity]) for i in range(0, len(words), split_type_quantity)]\n",
+ "\n",
+ "\n",
+ "def split_by_lines(text: str, split_type: Optional[str] = None, split_type_quantity = 1, split_type_string: Optional[str] = None, split_type_value_type: Optional[str] = None) -> List[str]:\n",
+ " lines = [line + '\\n' for line in text.split('\\n') if line.strip()]\n",
+ " return lines\n",
+ " #return ['\\n'.join(lines[i:i + split_type_quantity]) for i in range(0, len(lines), split_type_quantity)]\n",
+ "\n",
+ "def split_by_sentence(text: str, split_type: Optional[str] = None, split_type_quantity = 1, split_type_string: Optional[str] = None, split_type_value_type: Optional[str] = None) -> List[str]:\n",
+ " import nltk \n",
+ " text = text.replace(\"\\n\", \" \").strip()\n",
+ " sentences = nltk.sent_tokenize(text)\n",
+ " return [sentence + ' ' for sentence in sentences]\n",
+ " #return [' '.join(sentences[i:i + split_type_quantity]) for i in range(0, len(sentences), split_type_quantity)]\n",
+ "\n",
+ "\n",
+ "\"\"\"\n",
+ "def split_by_sentences(text: str, n: int, language=\"en\") -> List[str]:\n",
+ " seg = pysbd.Segmenter(language=language, clean=False)\n",
+ " sentences = seg.segment(text)\n",
+ " return [' '.join(sentences[i:i + n]) for i in range(0, len(sentences), n)]\n",
+ "\"\"\"\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "text = \"\"\"\n",
+ "How many fucken morons can wave his arms and keep people in tempo? \n",
+ "I was there to push people beyond what's expected of them. \n",
+ "I believe that is an absolute necessity. \n",
+ "Otherwise we're depriving the world of the next Louis Armstrong, \n",
+ "or the next Charlie Parker. \n",
+ "Have I told you that story about how Charlie Parker became Charlie Parker?\n",
+ "Parker's a young kid, pretty good on the Sax, \n",
+ "gets up to play at a cutting session, \n",
+ "and well, he fucks it up. \n",
+ "And Jones nearly decapitates him for it, throws a cymbal at his head. \n",
+ "\"\"\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
[\n",
+ " 'How many fucken morons can wave his arms and keep people in tempo? ',\n",
+ " \"I was there to push people beyond what's expected of them. \",\n",
+ " 'I believe that is an absolute necessity. ',\n",
+ " \"Otherwise we're depriving the world of the next Louis Armstrong, or the next Charlie Parker. \",\n",
+ " 'Have I told you that story about how Charlie Parker became Charlie Parker? ',\n",
+ " \"Parker's a young kid, pretty good on the Sax, gets up to play at a cutting session, and well, he fucks it \n",
+ "up. \",\n",
+ " 'And Jones nearly decapitates him for it, throws a cymbal at his head. '\n",
+ "]\n",
+ "
\n"
+ ],
+ "text/plain": [
+ "\u001b[1m[\u001b[0m\n",
+ " \u001b[32m'How many fucken morons can wave his arms and keep people in tempo? '\u001b[0m,\n",
+ " \u001b[32m\"I was there to push people beyond what's expected of them. \"\u001b[0m,\n",
+ " \u001b[32m'I believe that is an absolute necessity. '\u001b[0m,\n",
+ " \u001b[32m\"Otherwise we're depriving the world of the next Louis Armstrong, or the next Charlie Parker. \"\u001b[0m,\n",
+ " \u001b[32m'Have I told you that story about how Charlie Parker became Charlie Parker? '\u001b[0m,\n",
+ " \u001b[32m\"Parker's a young kid, pretty good on the Sax, gets up to play at a cutting session, and well, he fucks it \u001b[0m\n",
+ "\u001b[32mup. \"\u001b[0m,\n",
+ " \u001b[32m'And Jones nearly decapitates him for it, throws a cymbal at his head. '\u001b[0m\n",
+ "\u001b[1m]\u001b[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "print(split_by_sentence(text))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(list(text))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(split_by_words(text))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 71,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['How many fucken morons can wave his arms and ke', 'ep pe', 'ople in te', 'mpo? \\nI was there to push pe', \"ople beyond what's e\", \"xpected of them. \\nI believe that is an absolute necessity. \\nOtherwise we're d\", \"epriving the world of the next Louis Armstrong, \\nor the next Charlie Parker. \\nHave I told you that story about how Charlie Parker became Charlie Parker?\\nParker's a young kid, pretty good on the Sax, \\ngets\", 'up to play at a cutting session, \\nand well, he fucks it', 'up. \\nAnd Jones nearly dec', 'apitates him for it, throws a cymbal at his head.']"
+ ]
+ },
+ "execution_count": 71,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "split_text(text, split_type=\"regex\", split_type_string=f\"[\\w]p\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
() Segment Breakdown \n",
+ "โโโโโณโโโโโโโโณโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n",
+ "โ # โ Words โ Time Est โ Splitting long text aiming for 145 chars max 190 โ\n",
+ "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n",
+ "โ 1 โ 16 โ 6.40 s โ With me, brave the tumultuous seas, claim treasures untold, and send foes to their โ\n",
+ "โโโ 94 chars โ watery doom โ\n",
+ "โโโโโดโโโโโโโโดโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n",
+ "
\n"
+ ],
+ "text/plain": [
+ "\u001b[3m () Segment Breakdown \u001b[0m\n",
+ "โโโโโณโโโโโโโโณโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n",
+ "โ\u001b[1m \u001b[0m\u001b[1m#\u001b[0m\u001b[1m \u001b[0mโ\u001b[1m \u001b[0m\u001b[1mWords\u001b[0m\u001b[1m \u001b[0mโ\u001b[1m \u001b[0m\u001b[1mTime Est\u001b[0m\u001b[1m \u001b[0mโ\u001b[1m \u001b[0m\u001b[1mSplitting long text aiming for 145 chars max 190 \u001b[0m\u001b[1m \u001b[0mโ\n",
+ "โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ\n",
+ "โ\u001b[35m \u001b[0m\u001b[35m1\u001b[0m\u001b[35m \u001b[0mโ\u001b[32m \u001b[0m\u001b[32m16 \u001b[0m\u001b[32m \u001b[0mโ\u001b[32m \u001b[0m\u001b[32m6.40 s \u001b[0m\u001b[32m \u001b[0mโ With me, brave the tumultuous seas, claim treasures untold, and send foes to their โ\n",
+ "โ\u001b[35m \u001b[0mโ\u001b[32m \u001b[0mโ\u001b[32m \u001b[0m\u001b[32m94 chars\u001b[0m\u001b[32m \u001b[0mโ watery doom โ\n",
+ "โโโโโดโโโโโโโโดโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "
write_seg_npz .npz saved to long/With_me_brave_t-SPK-hark_woman_3.wav_initial_prompt.npz api.py:316\n",
+ "
"
+ ]
+
+ for path, old_value, new_value in self.iter_changes(self.read_from_file(), values):
+ if old_value is None:
+ old_value = "None"
+
+ text.append(f"
{path}
{old_value}
{new_value}
")
+
+ if len(text) == 1:
+ text.append("<tr><td colspan=3>No changes</td></tr>")
+
+ text.append("</tbody></table>")
+ return "".join(text)
+
+ def ui_apply(self, *values):
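+ """applies the changed default values from the UI components and writes them to the settings file"""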
+ num_changed = 0
+
+ current_ui_settings = self.read_from_file()
+
+ for path, _, new_value in self.iter_changes(current_ui_settings.copy(), values):
+ num_changed += 1
+ current_ui_settings[path] = new_value
+
+ if num_changed == 0:
+ return "No changes."
+
+ self.write_to_file(current_ui_settings)
+
+ return f"Wrote {num_changed} changes."
+
+ def create_ui(self):
+ """creates ui elements for editing defaults UI, without adding any logic to them"""
+
+ gr.HTML(
+ f"This page allows you to change default values in UI elements on other tabs. "
+ f"Make your changes, press 'View changes' to review the changed default values, "
+ f"then press 'Apply' to write them to {self.filename}. "
+ f"New defaults will apply after you restart the UI. "
+ f"You can edit the gradio_options.json file, or delete it to reset defaults. "
+ )
+
+ with gr.Row():
+ self.ui_defaults_view = gr.Button(
+ value="View changes", elem_id="ui_defaults_view", variant="secondary"
+ )
+ self.ui_defaults_apply = gr.Button(
+ value="Apply", elem_id="ui_defaults_apply", variant="primary"
+ )
+
+ self.ui_defaults_review = gr.HTML("")
+
+ def setup_ui(self):
+ """adds logic to elements created with create_ui; all add_block class must be made before this"""
+
+ assert not self.finalized_ui
+ self.finalized_ui = True
+
+ self.ui_defaults_view.click(
+ fn=self.ui_view,
+ inputs=list(self.component_mapping.values()),
+ outputs=[self.ui_defaults_review],
+ )
+ self.ui_defaults_apply.click(
+ fn=self.ui_apply,
+ inputs=list(self.component_mapping.values()),
+ outputs=[self.ui_defaults_review],
+ )
+
+ # print(f"UI default component+path mapping: {self.component_mapping}")
diff --git a/webui/user_styles.csv b/webui/user_styles.csv
new file mode 100644
index 0000000000000000000000000000000000000000..91034bc5ba18ef5beed0a1bf437bb49c10631eb0
--- /dev/null
+++ b/webui/user_styles.csv
@@ -0,0 +1,2 @@
+name,prompt,negative_prompt
+"Universe Prompt","In the beginning the Universe was created. This has made a lot of people very angry and been widely regarded as a bad move. However, the user templates are working.","negative_prompt"
diff --git a/webui/user_transformations.csv b/webui/user_transformations.csv
new file mode 100644
index 0000000000000000000000000000000000000000..b46e785eba92d85081a771b65ec9823278309b44
--- /dev/null
+++ b/webui/user_transformations.csv
@@ -0,0 +1 @@
+name,regex,replacement,flags,long_description
\ No newline at end of file
diff --git a/webui/user_transformations_french.csv b/webui/user_transformations_french.csv
new file mode 100644
index 0000000000000000000000000000000000000000..8bad7a2c1885f3e8bdbb2e860350c59429691e7b
--- /dev/null
+++ b/webui/user_transformations_french.csv
@@ -0,0 +1,6 @@
+name,regex,replacement,flags,long_description
+Save Dialogues in Quotation Marks,« (.*?) »,{QUOTES\1QUOTES},MULTILINE,Temporarily replace dialogues in quotation marks with unique placeholders
+Save Dialogues with Dashes,— (.*?)(?=\n|$),{DASHES\1DASHES},MULTILINE,Temporarily replace dialogues with dashes with unique placeholders
+Remove Non-dialogue Lines,^(?!{DASHES|{QUOTES).*,,MULTILINE,Remove all lines that don't start with dialogue placeholders
+Restore Dialogues from Quotation Marks Placeholders,{QUOTES(.*?)QUOTES},« \1 »,MULTILINE,Restore dialogues from quotation marks placeholders
+Restore Dialogues from Dashes Placeholders,{DASHES(.*?)DASHES},\1,MULTILINE,Restore dialogues from dashes placeholders
\ No newline at end of file