In [1]:
import os

# Set CUDA_VISIBLE_DEVICES to "0" before the bark imports below.
# NOTE(review): presumably placed here so it takes effect before any CUDA
# initialization triggered by those imports — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


from IPython.display import Audio
import nltk # we'll use this to split into sentences
import numpy as np

from bark.generation import (
 generate_text_semantic,
 preload_models,
)
from bark.api import semantic_to_waveform
from bark import generate_audio, SAMPLE_RATE

In [29]:
# Call bark.generation.preload_models() once, before any generation cell runs.
# NOTE(review): presumably this loads/caches the model weights up front — confirm.
preload_models()

# Simple Long-Form Generation
We split longer text into sentences using `nltk` and generate the sentences one by one.

In [33]:
# Monologue used for the long-form examples. The text is hard-wrapped in the
# source purely for readability; the wrapping is removed below.
script = """
Hey, have you heard about this new text-to-audio model called "Bark"? 
Apparently, it's the most realistic and natural-sounding text-to-audio model 
out there right now. People are saying it sounds just like a real person speaking. 
I think it uses advanced machine learning algorithms to analyze and understand the 
nuances of human speech, and then replicates those nuances in its own speech output. 
It's pretty impressive, and I bet it could be used for things like audiobooks or podcasts. 
In fact, I heard that some publishers are already starting to use Bark to create audiobooks. 
It would be like having your own personal voiceover artist. I really think Bark is going to 
be a game-changer in the world of text-to-audio technology.
"""
# Collapse every run of whitespace (line breaks plus any trailing spaces left on
# the wrapped lines) into a single space. A plain replace("\n", " ") would turn
# "trailing space + newline" into a double space inside the prompt text.
script = " ".join(script.split())

In [34]:
# Split the script into sentences so each one can be generated separately.
# NLTK's sentence tokenizer needs the Punkt data, which is downloaded separately
# from the package; without it the call raises LookupError. Fetch the data on
# demand and retry so the notebook survives "Restart & Run All" on a fresh setup.
try:
    sentences = nltk.sent_tokenize(script)
except LookupError:
    nltk.download("punkt", quiet=True)      # resource name used by older NLTK releases
    nltk.download("punkt_tab", quiet=True)  # resource name used by newer NLTK releases
    sentences = nltk.sent_tokenize(script)

In [35]:
SPEAKER = "v2/en_speaker_6"
# quarter second of silence, appended after every generated sentence
silence = np.zeros(int(0.25 * SAMPLE_RATE))

# Alternating sequence: [sentence audio, silence, sentence audio, silence, ...]
pieces = []
for sentence in sentences:
    # Generate this sentence on its own, using SPEAKER as the history prompt.
    audio_array = generate_audio(sentence, history_prompt=SPEAKER)
    # Append a fresh copy of the silence buffer so list entries never share memory.
    pieces.extend([audio_array, silence.copy()])


100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:02<00:00, 43.03it/s]
100%|████████████████████████████████████████████████████████████████████████| 17/17 [00:06<00:00, 2.45it/s]
100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:04<00:00, 22.73it/s]
100%|████████████████████████████████████████████████████████████████████████| 33/33 [00:13<00:00, 2.52it/s]
100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 66.30it/s]
100%|████████████████████████████████████████████████████████████████████████| 11/11 [00:04<00:00, 2.46it/s]
100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:04<00:00, 20.99it/s]
100%|████████████████████████████████████████████████████████████████████████| 35/35 [00:14<00:00, 2.46it/s]
100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:03<00:00, 25.63it/s]
100%|█████████

In [None]:
# Join all sentence clips and silence gaps into one array and render an audio player for it.
Audio(np.concatenate(pieces), rate=SAMPLE_RATE)

# $ \\ $

# Advanced Long-Form Generation
Sometimes Bark will hallucinate a little extra audio at the end of the prompt.
We can solve this issue by lowering the threshold for Bark to stop generating text.
We use the `min_eos_p` kwarg in `generate_text_semantic`.

In [37]:
GEN_TEMP = 0.6
SPEAKER = "v2/en_speaker_6"
# quarter second of silence, appended after every generated sentence
silence = np.zeros(int(0.25 * SAMPLE_RATE))

# Alternating sequence: [sentence audio, silence, sentence audio, silence, ...]
pieces = []
for sentence in sentences:
    # Step 1: text -> semantic tokens.
    semantic_tokens = generate_text_semantic(
        sentence,
        history_prompt=SPEAKER,
        temp=GEN_TEMP,
        min_eos_p=0.05,  # this controls how likely the generation is to end
    )
    # Step 2: semantic tokens -> waveform, with the same speaker prompt.
    audio_array = semantic_to_waveform(semantic_tokens, history_prompt=SPEAKER)
    pieces.extend([audio_array, silence.copy()])



100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:02<00:00, 38.05it/s]
100%|████████████████████████████████████████████████████████████████████████| 18/18 [00:07<00:00, 2.46it/s]
100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:03<00:00, 32.28it/s]
100%|████████████████████████████████████████████████████████████████████████| 21/21 [00:08<00:00, 2.54it/s]
100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 55.78it/s]
100%|████████████████████████████████████████████████████████████████████████| 14/14 [00:05<00:00, 2.57it/s]
100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:06<00:00, 14.73it/s]
100%|████████████████████████████████████████████████████████████████████████| 35/35 [00:14<00:00, 2.47it/s]
100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:02<00:00, 40.29it/s]
100%|█████████

In [None]:
# Join the clips generated with min_eos_p (plus silence gaps) into one array and render an audio player.
Audio(np.concatenate(pieces), rate=SAMPLE_RATE)

# $ \\ $

# Make a Long-Form Dialog with Bark

### Step 1: Format a script and speaker lookup

In [14]:
# Maps each speaker name used in the dialog to the voice string that is passed
# to generate_audio as `history_prompt` for that speaker's turns.
speaker_lookup = {"Samantha": "v2/en_speaker_9", "John": "v2/en_speaker_2"}

# Script generated by ChatGPT. Each turn is one line of the form "<speaker>: <text>",
# with blank lines between turns.
dialog_text = """
Samantha: Hey, have you heard about this new text-to-audio model called "Bark"?

John: No, I haven't. What's so special about it?

Samantha: Well, apparently it's the most realistic and natural-sounding text-to-audio model out there right now. People are saying it sounds just like a real person speaking.

John: Wow, that sounds amazing. How does it work?

Samantha: I think it uses advanced machine learning algorithms to analyze and understand the nuances of human speech, and then replicates those nuances in its own speech output.

John: That's pretty impressive. Do you think it could be used for things like audiobooks or podcasts?

Samantha: Definitely! In fact, I heard that some publishers are already starting to use Bark to create audiobooks. And I bet it would be great for podcasts too.

John: I can imagine. It would be like having your own personal voiceover artist.

Samantha: Exactly! I think Bark is going to be a game-changer in the world of text-to-audio technology."""
# One list entry per speaker turn. Filter on the *stripped* line so that lines
# containing only spaces/tabs are dropped as well; otherwise they would survive
# as "" and break the "<speaker>: <text>" parsing downstream.
script = [turn.strip() for turn in dialog_text.splitlines() if turn.strip()]
script

['Samantha: Hey, have you heard about this new text-to-audio model called "Bark"?',
 "John: No, I haven't. What's so special about it?",
 "Samantha: Well, apparently it's the most realistic and natural-sounding text-to-audio model out there right now. People are saying it sounds just like a real person speaking.",
 'John: Wow, that sounds amazing. How does it work?',
 'Samantha: I think it uses advanced machine learning algorithms to analyze and understand the nuances of human speech, and then replicates those nuances in its own speech output.',
 "John: That's pretty impressive. Do you think it could be used for things like audiobooks or podcasts?",
 'Samantha: Definitely! In fact, I heard that some publishers are already starting to use Bark to create audiobooks. And I bet it would be great for podcasts too.',
 'John: I can imagine. It would be like having your own personal voiceover artist.',
 'Samantha: Exactly! I think Bark is going to be a game-changer in the world of text-to-audi

### Step 2: Generate the audio for every speaker turn

In [15]:
pieces = []
# half a second of silence, appended after every speaker turn
silence = np.zeros(int(0.5 * SAMPLE_RATE))
for line in script:
    # Split on the FIRST ": " only. The spoken text may itself contain ": "
    # (e.g. "John: Here's the thing: it works"), which would otherwise yield
    # more than two parts and raise ValueError on unpacking.
    speaker, text = line.split(": ", 1)
    # Generate the turn with the voice registered for this speaker.
    audio_array = generate_audio(text, history_prompt=speaker_lookup[speaker])
    pieces += [audio_array, silence.copy()]

100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:02<00:00, 34.03it/s]
100%|████████████████████████████████████████████████████████████████████████| 22/22 [00:08<00:00, 2.55it/s]
100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 71.58it/s]
100%|████████████████████████████████████████████████████████████████████████| 11/11 [00:04<00:00, 2.65it/s]
100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:04<00:00, 22.75it/s]
100%|████████████████████████████████████████████████████████████████████████| 33/33 [00:13<00:00, 2.53it/s]
100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 70.76it/s]
100%|████████████████████████████████████████████████████████████████████████| 11/11 [00:04<00:00, 2.63it/s]
100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:04<00:00, 20.46it/s]
100%|█████████

### Step 3: Concatenate all of the audio and play it

In [None]:
# Join every speaker turn and silence gap into one array and render an audio player for the full dialog.
Audio(np.concatenate(pieces), rate=SAMPLE_RATE)