Upload config for debug
Browse files
output/config/debug/model.yaml
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
mode: train
|
2 |
+
num_emotion: 5
|
3 |
+
conformer:
|
4 |
+
encoder_dim: 256
|
5 |
+
decoder_dim: 256
|
6 |
+
num_encode_layers: 4
|
7 |
+
num_decode_layers: 6
|
8 |
+
num_attention_heads: 2
|
9 |
+
feed_forward_expansion_factor: 4
|
10 |
+
conv_expansion_factor: 2
|
11 |
+
feed_forward_dropout_p: 0.2
|
12 |
+
attention_dropout_p: 0.2
|
13 |
+
conv_dropout_p: 0.2
|
14 |
+
conv_kernel_size: 7
|
15 |
+
half_step_residual: true
|
16 |
+
|
17 |
+
reference_encoder:
|
18 |
+
encoder_dim: 128
|
19 |
+
dropout: 0.2
|
20 |
+
|
21 |
+
variance_predictor:
|
22 |
+
filter_size: 256
|
23 |
+
kernel_size: 3
|
24 |
+
dropout: 0.5
|
25 |
+
|
26 |
+
variance_embedding:
|
27 |
+
pitch_quantization: "linear"
|
28 |
+
energy_quantization: "linear"
|
29 |
+
n_bins: 256
|
30 |
+
|
31 |
+
max_seq_len: 1000
|
32 |
+
|
33 |
+
vocoder:
|
34 |
+
model: "HiFi-GAN"
|
35 |
+
speaker: "tth"
|
output/config/debug/preprocess.yaml
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
dataset: "vlsp2023emo"
|
2 |
+
|
3 |
+
path:
|
4 |
+
corpus_path: "./data/pretrained_tts_dataset/tuyendv.dict"
|
5 |
+
lexicon_path: "../datasets/ess-vlsp2023-lexicon/lexicon.dict" # "data/lexicon"
|
6 |
+
raw_path: "./data/pretrained_tts_dataset_raw"
|
7 |
+
preprocessed_path: "../datasets/ess-vlsp2023-emo-processed-phoneme-level" # "processed_vlsp_data_phoneme_level"
|
8 |
+
|
9 |
+
emotion2id:
|
10 |
+
neutral: 0
|
11 |
+
happy: 1
|
12 |
+
sad: 2
|
13 |
+
angry: 3
|
14 |
+
surprise: 4
|
15 |
+
|
16 |
+
id2emotion:
|
17 |
+
0: neutral
|
18 |
+
1: happy
|
19 |
+
2: sad
|
20 |
+
3: angry
|
21 |
+
4: surprise
|
22 |
+
|
23 |
+
smoothing_label: 0.1
|
24 |
+
|
25 |
+
preprocessing:
|
26 |
+
val_size: 512
|
27 |
+
text:
|
28 |
+
text_cleaners: []
|
29 |
+
language: "en"
|
30 |
+
audio:
|
31 |
+
sampling_rate: 22050
|
32 |
+
max_wav_value: 32768.0
|
33 |
+
stft:
|
34 |
+
filter_length: 1024
|
35 |
+
hop_length: 256
|
36 |
+
win_length: 1024
|
37 |
+
mel:
|
38 |
+
n_mel_channels: 80
|
39 |
+
mel_fmin: 0
|
40 |
+
mel_fmax: 8000
|
41 |
+
|
42 |
+
# phoneme_level
|
43 |
+
pitch:
|
44 |
+
feature: "phoneme_level"
|
45 |
+
normalization: True
|
46 |
+
energy:
|
47 |
+
feature: "phoneme_level"
|
48 |
+
normalization: True
|
output/config/debug/train.yaml
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
path:
|
2 |
+
ckpt_path: "../output/ckpt/vlsp2023emo"
|
3 |
+
log_path: "../output/log/vlsp2023emo"
|
4 |
+
result_path: "../output/result/vlsp2023emo"
|
5 |
+
hf:
|
6 |
+
use_hf: True
|
7 |
+
hf_repo: "hahunavth/abc"
|
8 |
+
restore_from_hf: True
|
9 |
+
push_to_hf: True
|
10 |
+
|
11 |
+
optimizer:
|
12 |
+
batch_size: 48 # 64
|
13 |
+
betas: [0.9, 0.98]
|
14 |
+
eps: 0.000000001
|
15 |
+
weight_decay: 0.0
|
16 |
+
grad_clip_thresh: 1.0
|
17 |
+
grad_acc_step: 1
|
18 |
+
warm_up_step: 2000
|
19 |
+
anneal_steps: [300000, 400000, 500000]
|
20 |
+
anneal_rate: 0.3
|
21 |
+
step:
|
22 |
+
total_step: 400000
|
23 |
+
log_step: 1000
|
24 |
+
synth_step: 5000
|
25 |
+
val_step: 1000
|
26 |
+
save_step: 2000
|