Clip diffusion inputs
Browse files
api.py
CHANGED
@@ -181,6 +181,7 @@ class TextToSpeech:
|
|
181 |     samples = []
182 |     num_batches = num_autoregressive_samples // self.autoregressive_batch_size
183 |     stop_mel_token = self.autoregressive.stop_mel_token
184 |     self.autoregressive = self.autoregressive.cuda()
185 |     for b in tqdm(range(num_batches)):
186 |         codes = self.autoregressive.inference_speech(conds, text,
|
@@ -212,8 +213,20 @@ class TextToSpeech:
|
|
212 |     self.diffusion = self.diffusion.cuda()
213 |     self.vocoder = self.vocoder.cuda()
214 |     for b in range(best_results.shape[0]):
215 | -       [removed line — content lost in page extraction; replaced by added lines 216-229 of the new version]
216 | -       [removed line — content lost in page extraction]
217 |         wav = self.vocoder.inference(mel)
218 |         wav_candidates.append(wav.cpu())
219 |     self.diffusion = self.diffusion.cpu()
|
|
|
181 |     samples = []
182 |     num_batches = num_autoregressive_samples // self.autoregressive_batch_size
183 |     stop_mel_token = self.autoregressive.stop_mel_token
184 | +   calm_token = 83  # This is the token for coding silence, which is fixed in place with "fix_autoregressive_output"
185 |     self.autoregressive = self.autoregressive.cuda()
186 |     for b in tqdm(range(num_batches)):
187 |         codes = self.autoregressive.inference_speech(conds, text,

213 |     self.diffusion = self.diffusion.cuda()
214 |     self.vocoder = self.vocoder.cuda()
215 |     for b in range(best_results.shape[0]):
216 | +       codes = best_results[b].unsqueeze(0)
217 | +
218 | +       # Find the first occurrence of the "calm" token and trim the codes to that.
219 | +       ctokens = 0
220 | +       for k in range(codes.shape[-1]):
221 | +           if codes[0, k] == calm_token:
222 | +               ctokens += 1
223 | +           else:
224 | +               ctokens = 0
225 | +           if ctokens > 8:  # 8 tokens gives the diffusion model some "breathing room" to terminate speech.
226 | +               codes = codes[:, :k]
227 | +               break
228 | +
229 | +       mel = do_spectrogram_diffusion(self.diffusion, diffuser, codes, voice_samples, temperature=diffusion_temperature)
230 |         wav = self.vocoder.inference(mel)
231 |         wav_candidates.append(wav.cpu())
232 |     self.diffusion = self.diffusion.cpu()