Update sweep & eval_multiple with new voices
api.py
CHANGED
@@ -140,6 +140,13 @@ class TextToSpeech:
                                           average_conditioning_embeddings=True).cpu().eval()
         self.autoregressive.load_state_dict(torch.load('.models/autoregressive_diverse.pth'))
 
+        self.autoregressive_for_latents = UnifiedVoice(max_mel_tokens=604, max_text_tokens=402, max_conditioning_inputs=2, layers=30,
+                                                       model_dim=1024,
+                                                       heads=16, number_text_tokens=256, start_text_token=255, checkpointing=False,
+                                                       train_solo_embeddings=False,
+                                                       average_conditioning_embeddings=True).cpu().eval()
+        self.autoregressive_for_latents.load_state_dict(torch.load('.models/autoregressive_diverse.pth'))
+
         self.clip = VoiceCLIP(dim_text=512, dim_speech=512, dim_latent=512, num_text_tokens=256, text_enc_depth=12,
                               text_seq_len=350, text_heads=8,
                               num_speech_tokens=8192, speech_enc_depth=12, speech_heads=8, speech_seq_len=430,
@@ -221,11 +228,11 @@ class TextToSpeech:
         # The diffusion model actually wants the last hidden layer from the autoregressive model as conditioning
         # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
         # results, but will increase memory usage.
-        self.autoregressive = self.autoregressive.cuda()
-        best_latents = self.autoregressive(conds, text, torch.tensor([text.shape[-1]], device=conds.device), best_results,
+        self.autoregressive_for_latents = self.autoregressive_for_latents.cuda()
+        best_latents = self.autoregressive_for_latents(conds, text, torch.tensor([text.shape[-1]], device=conds.device), best_results,
                                            torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=conds.device),
                                            return_latent=True, clip_inputs=False)
-        self.autoregressive = self.autoregressive.cpu()
+        self.autoregressive_for_latents = self.autoregressive_for_latents.cpu()
 
         print("Performing vocoding..")
         wav_candidates = []
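Together, the two hunks follow a load-on-demand pattern: the extra autoregressive copy is built with .cpu().eval(), moved to the GPU only for the single forward pass that produces the latents, and moved back afterwards so it does not hold VRAM between generations. A minimal sketch of that pattern, assuming a generic torch module and tensor inputs (the helper name and signature are illustrative, not part of api.py):

import torch

def run_once_on_gpu(model, *inputs):
    # Borrow the GPU for one forward pass, then return the model to the CPU
    # so it does not occupy VRAM between calls.
    model = model.cuda()
    with torch.no_grad():
        out = model(*[x.cuda() if torch.is_tensor(x) else x for x in inputs])
    model = model.cpu()
    return out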
eval_multiple.py
CHANGED
@@ -6,32 +6,35 @@ from api import TextToSpeech
 from utils.audio import load_audio
 
 if __name__ == '__main__':
-    fname = 'Y:\\
+    fname = 'Y:\\clips\\books2\\subset512-oco.tsv'
+    stop_after = 128
+    outpath_base = 'D:\\tmp\\tortoise-tts-eval\\diverse'
     outpath_real = 'D:\\tmp\\tortoise-tts-eval\\real'
 
-    os.makedirs(outpath, exist_ok=True)
     os.makedirs(outpath_real, exist_ok=True)
     with open(fname, 'r', encoding='utf-8') as f:
         lines = [l.strip().split('\t') for l in f.readlines()]
 
-    recorder = open(os.path.join(outpath, 'transcript.tsv'), 'w', encoding='utf-8')
     tts = TextToSpeech()
-    for 
+    for k in range(4):
+        outpath = f'{outpath_base}_{k}'
+        os.makedirs(outpath, exist_ok=True)
+        recorder = open(os.path.join(outpath, 'transcript.tsv'), 'w', encoding='utf-8')
+        for e, line in enumerate(lines):
+            if e >= stop_after:
+                break
+            transcript = line[0]
+            path = os.path.join(os.path.dirname(fname), line[1])
+            cond_audio = load_audio(path, 22050)
+            torchaudio.save(os.path.join(outpath_real, os.path.basename(line[1])), cond_audio, 22050)
+            sample = tts.tts(transcript, [cond_audio, cond_audio], num_autoregressive_samples=128, k=1,
+                             repetition_penalty=2.0, length_penalty=2, temperature=.5, top_p=.5,
+                             diffusion_temperature=.7, cond_free_k=2, diffusion_iterations=70)
 
+            down = torchaudio.functional.resample(sample, 24000, 22050)
+            fout_path = os.path.join(outpath, os.path.basename(line[1]))
+            torchaudio.save(fout_path, down.squeeze(0), 22050)
 
+            recorder.write(f'{transcript}\t{fout_path}\n')
+            recorder.flush()
+        recorder.close()
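Each of the four passes writes its own transcript.tsv pairing the synthesized text with the path of the generated clip (via recorder.write(f'{transcript}\t{fout_path}\n')). A small sketch for reading one of those files back, assuming the directory layout configured above (diverse_0 is the first of the four output sets the loop produces):

import os

outpath = 'D:\\tmp\\tortoise-tts-eval\\diverse_0'
with open(os.path.join(outpath, 'transcript.tsv'), 'r', encoding='utf-8') as f:
    for line in f:
        # Each line is "<transcript>\t<path-to-generated-wav>".
        transcript, wav_path = line.rstrip('\n').split('\t')
        print(transcript, wav_path)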
read.py
CHANGED
@@ -30,24 +30,13 @@ if __name__ == '__main__':
     # These are voices drawn randomly from the training set. You are free to substitute your own voices in, but testing
     # has shown that the model does not generalize to new voices very well.
     preselected_cond_voices = {
-        'harris': ['voices/harris/1.wav', 'voices/harris/2.wav'],
-        'lescault': ['voices/lescault/1.wav', 'voices/lescault/2.wav'],
-        'otto': ['voices/otto/1.wav', 'voices/otto/2.wav'],
-        'obama': ['voices/obama/1.wav', 'voices/obama/2.wav'],
-        'carlin': ['voices/carlin/1.wav', 'voices/carlin/2.wav'],
-        # Female voices
-        'atkins': ['voices/atkins/1.wav', 'voices/atkins/2.wav'],
-        'grace': ['voices/grace/1.wav', 'voices/grace/2.wav'],
-        'kennard': ['voices/kennard/1.wav', 'voices/kennard/2.wav'],
-        'mol': ['voices/mol/1.wav', 'voices/mol/2.wav'],
-        'lj': ['voices/lj/1.wav', 'voices/lj/2.wav'],
+        'emma_stone': ['voices/emma_stone/1.wav','voices/emma_stone/2.wav','voices/emma_stone/3.wav'],
+        'tom_hanks': ['voices/tom_hanks/1.wav','voices/tom_hanks/2.wav','voices/tom_hanks/3.wav'],
     }
 
     parser = argparse.ArgumentParser()
     parser.add_argument('-textfile', type=str, help='A file containing the text to read.', default="data/riding_hood.txt")
-    parser.add_argument('-voice', type=str, help='Use a preset conditioning voice (defined above). Overrides cond_path.', default='
+    parser.add_argument('-voice', type=str, help='Use a preset conditioning voice (defined above). Overrides cond_path.', default='emma_stone')
     parser.add_argument('-num_samples', type=int, help='How many total outputs the autoregressive transformer should produce.', default=512)
     parser.add_argument('-batch_size', type=int, help='How many samples to process at once in the autoregressive model.', default=16)
     parser.add_argument('-output_path', type=str, help='Where to store outputs.', default='results/longform/')
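With the preset list trimmed to the two new voices and the default switched to emma_stone, a run against the other preset looks like this (a usage sketch; it assumes the voices/tom_hanks clips listed above exist, the script is launched from the repository root, and the remaining arguments are left at their defaults):

python read.py -voice tom_hanks -textfile data/riding_hood.txt -output_path results/longform/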
sweep.py
CHANGED
@@ -24,19 +24,24 @@ def permutations(args):
 
 
 if __name__ == '__main__':
-    fname = 'Y:\\
+    fname = 'Y:\\clips\\books2\\subset512-oco.tsv'
+    stop_after = 128
+    outpath_base = 'D:\\tmp\\tortoise-tts-eval\\sweep'
     outpath_real = 'D:\\tmp\\tortoise-tts-eval\\real'
 
     arg_ranges = {
-        'top_p': [.
-        'temperature': [.5,
+        'top_p': [.5, 1],
+        'temperature': [.5, 1],
+        'diffusion_temperature': [.6, 1],
+        'cond_free_k': [0, 1, 4],
+        'repetition_penalty': [1.0, 2.0]
    }
    cfgs = permutations(arg_ranges)
    shuffle(cfgs)

    for cfg in cfgs:
+        cfg_desc = '_'.join([f'{k}-{v}' for k,v in cfg.items()])
+        outpath = os.path.join(outpath_base, f'{cfg_desc}')
         os.makedirs(outpath, exist_ok=True)
         os.makedirs(outpath_real, exist_ok=True)
         with open(fname, 'r', encoding='utf-8') as f:
@@ -45,15 +50,14 @@ if __name__ == '__main__':
         recorder = open(os.path.join(outpath, 'transcript.tsv'), 'w', encoding='utf-8')
         tts = TextToSpeech()
         for e, line in enumerate(lines):
+            if e >= stop_after:
+                break
             transcript = line[0]
-            if len(transcript) > 120:
-                continue # We need to support this, but cannot yet.
             path = os.path.join(os.path.dirname(fname), line[1])
             cond_audio = load_audio(path, 22050)
             torchaudio.save(os.path.join(outpath_real, os.path.basename(line[1])), cond_audio, 22050)
-            sample = tts.tts(transcript, [cond_audio, cond_audio], num_autoregressive_samples=256,
-                             diffusion_temperature=.7, cond_free_k=2, **cfg)
+            sample = tts.tts(transcript, [cond_audio, cond_audio], num_autoregressive_samples=256,
+                             k=1, diffusion_iterations=70, length_penalty=1.0, **cfg)
             down = torchaudio.functional.resample(sample, 24000, 22050)
             fout_path = os.path.join(outpath, os.path.basename(line[1]))
             torchaudio.save(fout_path, down.squeeze(0), 22050)
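The sweep leans on the permutations() helper named in the hunk header; its body is not part of this diff, but from the way cfgs is built, shuffled, and splatted into tts.tts(**cfg), it presumably expands arg_ranges into the cartesian product of the value lists, one kwargs dict per configuration. A sketch of such a helper, offered as an assumption rather than the file's actual implementation:

import itertools

def permutations(args):
    # Expand {'top_p': [.5, 1], 'temperature': [.5, 1], ...} into every combination,
    # e.g. [{'top_p': .5, 'temperature': .5}, {'top_p': .5, 'temperature': 1}, ...]
    keys = list(args.keys())
    return [dict(zip(keys, values)) for values in itertools.product(*(args[k] for k in keys))]

With the ranges above (2 * 2 * 2 * 3 * 2 values) that comes to 48 configurations, each of which gets its own output directory named after its cfg_desc string.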