il-hoon committed on
Commit 5beedd8
1 Parent(s): e9c04c2

Update demo_cli.py

Files changed (1)
  1. demo_cli.py +103 -136
demo_cli.py CHANGED
@@ -70,58 +70,46 @@ if __name__ == '__main__':
     else:
         print("Using CPU for inference.\n")
 
-    ## Remind the user to download pretrained models if needed
-    check_model_paths(encoder_path=args.enc_model_fpath,
-                      synthesizer_path=args.syn_model_fpath,
-                      vocoder_path=args.voc_model_fpath)
-
-    ## Load the models one by one.
-    print("Preparing the encoder, the synthesizer and the vocoder...")
-    encoder.load_model(args.enc_model_fpath)
-    synthesizer = Synthesizer(args.syn_model_fpath)
-    vocoder.load_model(args.voc_model_fpath)
-
-
-    ## Run a test
-    print("Testing your configuration with small inputs.")
-    # Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's
-    # sampling rate, which may differ.
-    # If you're unfamiliar with digital audio, know that it is encoded as an array of floats
-    # (or sometimes integers, but mostly floats in this projects) ranging from -1 to 1.
-    # The sampling rate is the number of values (samples) recorded per second, it is set to
-    # 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond
-    # to an audio of 1 second.
-    print("\tTesting the encoder...")
-    encoder.embed_utterance(np.zeros(encoder.sampling_rate))
-
-    # Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
-    # returns, but here we're going to make one ourselves just for the sake of showing that it's
-    # possible.
-    embed = np.random.rand(speaker_embedding_size)
-    # Embeddings are L2-normalized (this isn't important here, but if you want to make your own
-    # embeddings it will be).
-    embed /= np.linalg.norm(embed)
-    # The synthesizer can handle multiple inputs with batching. Let's create another embedding to
-    # illustrate that
-    embeds = [embed, np.zeros(speaker_embedding_size)]
-    texts = ["test 1", "test 2"]
-    print("\tTesting the synthesizer... (loading the model will output a lot of text)")
-    mels = synthesizer.synthesize_spectrograms(texts, embeds)
-
-    # The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We
-    # can concatenate the mel spectrograms to a single one.
-    mel = np.concatenate(mels, axis=1)
-    # The vocoder can take a callback function to display the generation. More on that later. For
-    # now we'll simply hide it like this:
-    no_action = lambda *args: None
-    print("\tTesting the vocoder...")
-    # For the sake of making this test short, we'll pass a short target length. The target length
-    # is the length of the wav segments that are processed in parallel. E.g. for audio sampled
-    # at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of
-    # 0.5 seconds which will all be generated together. The parameters here are absurdly short, and
-    # that has a detrimental effect on the quality of the audio. The default parameters are
-    # recommended in general.
-    vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action)
-
+    ## Run a test
+    # print("Testing your configuration with small inputs.")
+    # # Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's
+    # # sampling rate, which may differ.
+    # # If you're unfamiliar with digital audio, know that it is encoded as an array of floats
+    # # (or sometimes integers, but mostly floats in this projects) ranging from -1 to 1.
+    # # The sampling rate is the number of values (samples) recorded per second, it is set to
+    # # 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond
+    # # to an audio of 1 second.
+    # print(" Testing the encoder...")
+    # encoder.embed_utterance(np.zeros(encoder.sampling_rate))
+
+    # # Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
+    # # returns, but here we're going to make one ourselves just for the sake of showing that it's
+    # # possible.
+    # embed = np.random.rand(speaker_embedding_size)
+    # # Embeddings are L2-normalized (this isn't important here, but if you want to make your own
+    # # embeddings it will be).
+    # embed /= np.linalg.norm(embed)
+    # # The synthesizer can handle multiple inputs with batching. Let's create another embedding to
+    # # illustrate that
+    # embeds = [embed, np.zeros(speaker_embedding_size)]
+    # texts = ["test 1", "test 2"]
+    # print(" Testing the synthesizer... (loading the model will output a lot of text)")
+    # mels = synthesizer.synthesize_spectrograms(texts, embeds)
+
+    # # The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We
+    # # can concatenate the mel spectrograms to a single one.
+    # mel = np.concatenate(mels, axis=1)
+    # # The vocoder can take a callback function to display the generation. More on that later. For
+    # # now we'll simply hide it like this:
+    # no_action = lambda *args: None
+    # print(" Testing the vocoder...")
+    # # For the sake of making this test short, we'll pass a short target length. The target length
+    # # is the length of the wav segments that are processed in parallel. E.g. for audio sampled
+    # # at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of
+    # # 0.5 seconds which will all be generated together. The parameters here are absurdly short, and
+    # # that has a detrimental effect on the quality of the audio. The default parameters are
+    # # recommended in general.
+    # vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action)
+
     print("All test passed! You can now synthesize speech.\n\n")
 
@@ -132,94 +120,73 @@ if __name__ == '__main__':
           "an explanation of what is happening.\n")
 
     print("Interactive generation loop")
-    num_generated = 0
-    while True:
-        try:
-            # Get the reference audio filepath
-            message = "Reference voice: enter an audio filepath of a voice to be cloned (mp3, " \
-                      "wav, m4a, flac, ...):\n"
-            in_fpath = Path(input(message).replace("\"", "").replace("\'", ""))
-
-            if in_fpath.suffix.lower() == ".mp3" and args.no_mp3_support:
-                print("Can't Use mp3 files please try again:")
-                continue
-            ## Computing the embedding
-            # First, we load the wav using the function that the speaker encoder provides. This is
-            # important: there is preprocessing that must be applied.
-
-            # The following two methods are equivalent:
-            # - Directly load from the filepath:
-            preprocessed_wav = encoder.preprocess_wav(in_fpath)
-            # - If the wav is already loaded:
-            original_wav, sampling_rate = librosa.load(str(in_fpath))
-            preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
-            print("Loaded file succesfully")
-
-            # Then we derive the embedding. There are many functions and parameters that the
-            # speaker encoder interfaces. These are mostly for in-depth research. You will typically
-            # only use this function (with its default parameters):
-            embed = encoder.embed_utterance(preprocessed_wav)
-            print("Created the embedding")
-
-
-            ## Generating the spectrogram
-            text = input("Write a sentence (+-20 words) to be synthesized:\n")
-
-            # If seed is specified, reset torch seed and force synthesizer reload
-            if args.seed is not None:
-                torch.manual_seed(args.seed)
-                synthesizer = Synthesizer(args.syn_model_fpath)
-
-            # The synthesizer works in batch, so you need to put your data in a list or numpy array
-            texts = [text]
-            embeds = [embed]
-            # If you know what the attention layer alignments are, you can retrieve them here by
-            # passing return_alignments=True
-            specs = synthesizer.synthesize_spectrograms(texts, embeds)
-            spec = specs[0]
-            print("Created the mel spectrogram")
-
-
-            ## Generating the waveform
-            print("Synthesizing the waveform:")
-
-            # If seed is specified, reset torch seed and reload vocoder
-            if args.seed is not None:
-                torch.manual_seed(args.seed)
-                vocoder.load_model(args.voc_model_fpath)
-
-            # Synthesizing the waveform is fairly straightforward. Remember that the longer the
-            # spectrogram, the more time-efficient the vocoder.
-            generated_wav = vocoder.infer_waveform(spec)
-
-
-            ## Post-generation
-            # There's a bug with sounddevice that makes the audio cut one second earlier, so we
-            # pad it.
-            generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
-
-            # Trim excess silences to compensate for gaps in spectrograms (issue #53)
-            generated_wav = encoder.preprocess_wav(generated_wav)
-
-            # Play the audio (non-blocking)
-            if not args.no_sound:
-                try:
-                    sd.stop()
-                    sd.play(generated_wav, synthesizer.sample_rate)
-                except sd.PortAudioError as e:
-                    print("\nCaught exception: %s" % repr(e))
-                    print("Continuing without audio playback. Suppress this message with the \"--no_sound\" flag.\n")
-                except:
-                    raise
-
-            # Save it on the disk
-            filename = "demo_output_%02d.wav" % num_generated
-            print(generated_wav.dtype)
-            sf.write(filename, generated_wav.astype(np.float32), synthesizer.sample_rate)
-            num_generated += 1
-            print("\nSaved output as %s\n\n" % filename)
-
-
-        except Exception as e:
-            print("Caught exception: %s" % repr(e))
-            print("Restarting\n")
+    # while True:
+    # Get the reference audio filepath
+    message = "Reference voice: enter an audio filepath of a voice to be cloned (mp3, " "wav, m4a, flac, ...):\n"
+    in_fpath = args.audio_path
+
+    if in_fpath.suffix.lower() == ".mp3" and args.no_mp3_support:
+        print("Can't Use mp3 files please try again:")
+    ## Computing the embedding
+    # First, we load the wav using the function that the speaker encoder provides. This is
+    # important: there is preprocessing that must be applied.
+
+    # The following two methods are equivalent:
+    # - Directly load from the filepath:
+    preprocessed_wav = encoder.preprocess_wav(in_fpath)
+    # - If the wav is already loaded:
+    original_wav, sampling_rate = librosa.load(str(in_fpath))
+    preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
+    print("Loaded file succesfully")
+
+    # Then we derive the embedding. There are many functions and parameters that the
+    # speaker encoder interfaces. These are mostly for in-depth research. You will typically
+    # only use this function (with its default parameters):
+    embed = encoder.embed_utterance(preprocessed_wav)
+    print("Created the embedding")
+
+
+    ## Generating the spectrogram
+    text = args.text
+
+    # If seed is specified, reset torch seed and force synthesizer reload
+    if args.seed is not None:
+        torch.manual_seed(args.seed)
+        synthesizer = Synthesizer(args.syn_model_fpath)
+
+    # The synthesizer works in batch, so you need to put your data in a list or numpy array
+    texts = [text]
+    embeds = [embed]
+    # If you know what the attention layer alignments are, you can retrieve them here by
+    # passing return_alignments=True
+    specs = synthesizer.synthesize_spectrograms(texts, embeds)
+    spec = specs[0]
+    print("Created the mel spectrogram")
+
+
+    ## Generating the waveform
+    print("Synthesizing the waveform:")
+
+    # If seed is specified, reset torch seed and reload vocoder
+    if args.seed is not None:
+        torch.manual_seed(args.seed)
+        vocoder.load_model(args.voc_model_fpath)
+
+    # Synthesizing the waveform is fairly straightforward. Remember that the longer the
+    # spectrogram, the more time-efficient the vocoder.
+    generated_wav = vocoder.infer_waveform(spec)
+
+
+    ## Post-generation
+    # There's a bug with sounddevice that makes the audio cut one second earlier, so we
+    # pad it.
+    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
+
+    # Trim excess silences to compensate for gaps in spectrograms (issue #53)
+    generated_wav = encoder.preprocess_wav(generated_wav)
+
+    # Save it on the disk
+    filename = args.output_path
+    print(generated_wav.dtype)
+    sf.write(filename, generated_wav.astype(np.float32), synthesizer.sample_rate)
+    print("\nSaved output as %s\n\n" % filename)