mirco committed on
Commit
6cf215c
1 Parent(s): a1d16fe

cleaned inference hyparam file

Files changed (2)
  1. hyperparams.yaml +22 -141
  2. hyperparams_train.yaml +184 -0
hyperparams.yaml CHANGED
@@ -1,114 +1,21 @@
- # Generated 2021-05-22 from:
- # /home/mila/s/subakany/speechbrain_new/recipes/WSJ0Mix/separation/yamls/sepformer-whamr-16k.yaml
- # yamllint disable
  # ################################
- # Model: SepFormer for source separation
+ # Model: Inference for source separation with SepFormer
  # https://arxiv.org/abs/2010.13154
- #
- # Dataset : WSJ0-2mix and WSJ0-3mix
- # ################################
+ # Generated from speechbrain/recipes/WSJ0Mix/separation/train/hparams/sepformer-whamr-16khz.yaml
+ # Dataset : Whamr-16kHz
+ # ###############################
- # Basic parameters
- # Seed needs to be set at top of yaml, before objects with parameters are made
- #
- seed: 1234
- __set_seed: !apply:torch.manual_seed [1234]
-
- # Data params
-
- # the data folder for the wham dataset
- # data_folder needs to follow the format: /yourpath/whamr.
- # make sure to use the name whamr at your top folder for the dataset!
- data_folder: /network/tmp1/subakany/whamr_16k
 
- # the path for wsj0/si_tr_s/ folder -- only needed if dynamic mixing is used
- # e.g. /yourpath/wsj0-processed/si_tr_s/
- # you need to convert the original wsj0 to 8k
- # you can do this conversion with the script ../meta/preprocess_dynamic_mixing.py
- wsj0_tr: /yourpath/wsj0-processed/si_tr_s/
 
- experiment_name: sepformer-whamr-randomreverb-16k
- output_folder: results/sepformer-whamr-randomreverb-16k/1234
- train_log: results/sepformer-whamr-randomreverb-16k/1234/train_log.txt
- save_folder: results/sepformer-whamr-randomreverb-16k/1234/save
-
- # the file names should start with whamr instead of whamorg
- train_data: results/sepformer-whamr-randomreverb-16k/1234/save/whamr_tr.csv
- valid_data: results/sepformer-whamr-randomreverb-16k/1234/save/whamr_cv.csv
- test_data: results/sepformer-whamr-randomreverb-16k/1234/save/whamr_tt.csv
- skip_prep: false
-
- # Experiment params
- auto_mix_prec: false # Set it to True for mixed precision
- test_only: true
- num_spks: 2 # set to 3 for wsj0-3mix
- progressbar: true
- save_audio: false # Save estimated sources on disk
+ # Parameters
  sample_rate: 16000
+ num_spks: 2
-
- # Training parameters
- N_epochs: 200
- batch_size: 1
- lr: 0.00015
- clip_grad_norm: 5
- loss_upper_lim: 999999 # this is the upper limit for an acceptable loss
- # if True, the training sequences are cut to a specified length
- limit_training_signal_len: true
- # this is the length of sequences if we choose to limit
- # the signal length of training sequences
- training_signal_len: 64000
-
- # Set it to True to dynamically create mixtures at training time
- dynamic_mixing: false
-
- # Parameters for data augmentation
-
- # rir_path variable points to the directory of the room impulse responses
- # e.g. /miniscratch/subakany/rir_wavs
- # If the path does not exist, it is created automatically.
- rir_path: /network/tmp1/subakany/rir_wavs_16k
-
- use_wavedrop: false
- use_speedperturb: true
- use_speedperturb_sameforeachsource: false
- use_rand_shift: false
- min_shift: -8000
- max_shift: 8000
-
- speedperturb: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
- perturb_prob: 1.0
- drop_freq_prob: 0.0
- drop_chunk_prob: 0.0
- sample_rate: 16000
- speeds: [95, 100, 105]
-
- wavedrop: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
- perturb_prob: 0.0
- drop_freq_prob: 1.0
- drop_chunk_prob: 1.0
- sample_rate: 16000
-
- # loss thresholding -- this thresholds the training loss
- threshold_byloss: true
- threshold: -30
-
- # Encoder parameters
- N_encoder_out: 256
- out_channels: 256
- kernel_size: 16
- kernel_stride: 8
-
- # Dataloader options
- dataloader_opts:
- batch_size: 1
- num_workers: 3
 
  # Specifying the network
- Encoder: &id003 !new:speechbrain.lobes.models.dual_path.Encoder
+ Encoder: !new:speechbrain.lobes.models.dual_path.Encoder
  kernel_size: 16
  out_channels: 256
 
-
- SBtfintra: &id001 !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
+ SBtfintra: !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
  num_layers: 8
  d_model: 256
  nhead: 8
@@ -117,7 +24,7 @@ SBtfintra: &id001 !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
  use_positional_encoding: true
  norm_before: true
 
- SBtfinter: &id002 !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
+ SBtfinter: !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
  num_layers: 8
  d_model: 256
  nhead: 8
@@ -126,59 +33,33 @@ SBtfinter: &id002 !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
  use_positional_encoding: true
  norm_before: true
 
- MaskNet: &id005 !new:speechbrain.lobes.models.dual_path.Dual_Path_Model
-
- num_spks: 2
+ MaskNet: !new:speechbrain.lobes.models.dual_path.Dual_Path_Model
+ num_spks: !ref <num_spks>
  in_channels: 256
  out_channels: 256
  num_layers: 2
  K: 250
- intra_model: *id001
- inter_model: *id002
+ intra_model: !ref <SBtfintra>
+ inter_model: !ref <SBtfinter>
  norm: ln
  linear_layer_after_inter_intra: false
  skip_around_intra: true
 
- Decoder: &id004 !new:speechbrain.lobes.models.dual_path.Decoder
+ Decoder: !new:speechbrain.lobes.models.dual_path.Decoder
  in_channels: 256
  out_channels: 1
  kernel_size: 16
  stride: 8
  bias: false
 
- optimizer: !name:torch.optim.Adam
- lr: 0.00015
- weight_decay: 0
-
- loss: !name:speechbrain.nnet.losses.get_si_snr_with_pitwrapper
-
- lr_scheduler: &id007 !new:speechbrain.nnet.schedulers.ReduceLROnPlateau
-
- factor: 0.5
- patience: 2
- dont_halve_until_epoch: 85
-
- epoch_counter: &id006 !new:speechbrain.utils.epoch_loop.EpochCounter
- limit: 200
-
  modules:
- encoder: *id003
- decoder: *id004
- masknet: *id005
+ encoder: !ref <Encoder>
+ decoder: !ref <Decoder>
+ masknet: !ref <MaskNet>
- checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
- checkpoints_dir: results/sepformer-whamr-randomreverb-16k/1234/save
- recoverables:
- encoder: *id003
- decoder: *id004
- masknet: *id005
- counter: *id006
- lr_scheduler: *id007
- train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
- save_file: results/sepformer-whamr-randomreverb-16k/1234/train_log.txt
-
 
  pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
- loadables:
- masknet: !ref <MaskNet>
- encoder: !ref <Encoder>
- decoder: !ref <Decoder>
+ loadables:
+ masknet: !ref <MaskNet>
+ encoder: !ref <Encoder>
+ decoder: !ref <Decoder>
+
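With the training-only keys gone, hyperparams.yaml now declares just the model graph (Encoder, SBtfintra/SBtfinter, MaskNet, Decoder), the modules dictionary, and the pretrainer that maps checkpoints onto it, which is the layout the SpeechBrain pretrained interfaces consume. A minimal inference sketch, assuming this file is served from a Hub repo id like "speechbrain/sepformer-whamr16k" (the repo id and mixture path below are placeholders; on SpeechBrain 1.0+ the import moves to speechbrain.inference.separation):

    import torchaudio
    from speechbrain.pretrained import SepformerSeparation

    # from_hparams fetches hyperparams.yaml plus the checkpoints listed
    # under pretrainer/loadables, then instantiates the modules above.
    model = SepformerSeparation.from_hparams(
        source="speechbrain/sepformer-whamr16k",  # placeholder repo id
        savedir="pretrained_models/sepformer-whamr16k",
    )

    # Returns a [batch, time, num_spks] tensor of estimated sources.
    est_sources = model.separate_file(path="mixture.wav")  # placeholder input
    for i in range(est_sources.shape[-1]):
        torchaudio.save(f"source{i + 1}.wav", est_sources[:, :, i].cpu(), 16000)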
hyperparams_train.yaml ADDED
@@ -0,0 +1,184 @@
+ # Generated 2021-05-22 from:
+ # /home/mila/s/subakany/speechbrain_new/recipes/WSJ0Mix/separation/yamls/sepformer-whamr-16k.yaml
+ # yamllint disable
+ # ################################
+ # Model: SepFormer for source separation
+ # https://arxiv.org/abs/2010.13154
+ #
+ # Dataset : WSJ0-2mix and WSJ0-3mix
+ # ################################
+ # Basic parameters
+ # Seed needs to be set at top of yaml, before objects with parameters are made
+ #
+ seed: 1234
+ __set_seed: !apply:torch.manual_seed [1234]
+
+ # Data params
+
+ # the data folder for the wham dataset
+ # data_folder needs to follow the format: /yourpath/whamr.
+ # make sure to use the name whamr at your top folder for the dataset!
+ data_folder: /network/tmp1/subakany/whamr_16k
+
+ # the path for wsj0/si_tr_s/ folder -- only needed if dynamic mixing is used
+ # e.g. /yourpath/wsj0-processed/si_tr_s/
+ # you need to convert the original wsj0 to 8k
+ # you can do this conversion with the script ../meta/preprocess_dynamic_mixing.py
+ wsj0_tr: /yourpath/wsj0-processed/si_tr_s/
+
+ experiment_name: sepformer-whamr-randomreverb-16k
+ output_folder: results/sepformer-whamr-randomreverb-16k/1234
+ train_log: results/sepformer-whamr-randomreverb-16k/1234/train_log.txt
+ save_folder: results/sepformer-whamr-randomreverb-16k/1234/save
+
+ # the file names should start with whamr instead of whamorg
+ train_data: results/sepformer-whamr-randomreverb-16k/1234/save/whamr_tr.csv
+ valid_data: results/sepformer-whamr-randomreverb-16k/1234/save/whamr_cv.csv
+ test_data: results/sepformer-whamr-randomreverb-16k/1234/save/whamr_tt.csv
+ skip_prep: false
+
+ # Experiment params
+ auto_mix_prec: false # Set it to True for mixed precision
+ test_only: true
+ num_spks: 2 # set to 3 for wsj0-3mix
+ progressbar: true
+ save_audio: false # Save estimated sources on disk
+ sample_rate: 16000
+
+ # Training parameters
+ N_epochs: 200
+ batch_size: 1
+ lr: 0.00015
+ clip_grad_norm: 5
+ loss_upper_lim: 999999 # this is the upper limit for an acceptable loss
+ # if True, the training sequences are cut to a specified length
+ limit_training_signal_len: true
+ # this is the length of sequences if we choose to limit
+ # the signal length of training sequences
+ training_signal_len: 64000
+
+ # Set it to True to dynamically create mixtures at training time
+ dynamic_mixing: false
+
+ # Parameters for data augmentation
+
+ # rir_path variable points to the directory of the room impulse responses
+ # e.g. /miniscratch/subakany/rir_wavs
+ # If the path does not exist, it is created automatically.
+ rir_path: /network/tmp1/subakany/rir_wavs_16k
+
+ use_wavedrop: false
+ use_speedperturb: true
+ use_speedperturb_sameforeachsource: false
+ use_rand_shift: false
+ min_shift: -8000
+ max_shift: 8000
+
+ speedperturb: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
+ perturb_prob: 1.0
+ drop_freq_prob: 0.0
+ drop_chunk_prob: 0.0
+ sample_rate: 16000
+ speeds: [95, 100, 105]
+
+ wavedrop: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
+ perturb_prob: 0.0
+ drop_freq_prob: 1.0
+ drop_chunk_prob: 1.0
+ sample_rate: 16000
+
+ # loss thresholding -- this thresholds the training loss
+ threshold_byloss: true
+ threshold: -30
+
+ # Encoder parameters
+ N_encoder_out: 256
+ out_channels: 256
+ kernel_size: 16
+ kernel_stride: 8
+
+ # Dataloader options
+ dataloader_opts:
+ batch_size: 1
+ num_workers: 3
+
+ # Specifying the network
+ Encoder: &id003 !new:speechbrain.lobes.models.dual_path.Encoder
+ kernel_size: 16
+ out_channels: 256
+
+
+ SBtfintra: &id001 !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
+ num_layers: 8
+ d_model: 256
+ nhead: 8
+ d_ffn: 1024
+ dropout: 0
+ use_positional_encoding: true
+ norm_before: true
+
+ SBtfinter: &id002 !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
+ num_layers: 8
+ d_model: 256
+ nhead: 8
+ d_ffn: 1024
+ dropout: 0
+ use_positional_encoding: true
+ norm_before: true
+
+ MaskNet: &id005 !new:speechbrain.lobes.models.dual_path.Dual_Path_Model
+
+ num_spks: 2
+ in_channels: 256
+ out_channels: 256
+ num_layers: 2
+ K: 250
+ intra_model: *id001
+ inter_model: *id002
+ norm: ln
+ linear_layer_after_inter_intra: false
+ skip_around_intra: true
+
+ Decoder: &id004 !new:speechbrain.lobes.models.dual_path.Decoder
+ in_channels: 256
+ out_channels: 1
+ kernel_size: 16
+ stride: 8
+ bias: false
+
+ optimizer: !name:torch.optim.Adam
+ lr: 0.00015
+ weight_decay: 0
+
+ loss: !name:speechbrain.nnet.losses.get_si_snr_with_pitwrapper
+
+ lr_scheduler: &id007 !new:speechbrain.nnet.schedulers.ReduceLROnPlateau
+
+ factor: 0.5
+ patience: 2
+ dont_halve_until_epoch: 85
+
+ epoch_counter: &id006 !new:speechbrain.utils.epoch_loop.EpochCounter
+ limit: 200
+
+ modules:
+ encoder: *id003
+ decoder: *id004
+ masknet: *id005
+ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
+ checkpoints_dir: results/sepformer-whamr-randomreverb-16k/1234/save
+ recoverables:
+ encoder: *id003
+ decoder: *id004
+ masknet: *id005
+ counter: *id006
+ lr_scheduler: *id007
+ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
+ save_file: results/sepformer-whamr-randomreverb-16k/1234/train_log.txt
+
+
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+ loadables:
+ masknet: !ref <MaskNet>
+ encoder: !ref <Encoder>
+ decoder: !ref <Decoder>
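Because hyperparams_train.yaml keeps the cluster-specific paths (data_folder, rir_path, the results/... folders), reusing it elsewhere means overriding those keys at load time. A sketch of loading it with the hyperpyyaml package that SpeechBrain recipes use; the override paths are placeholders:

    from hyperpyyaml import load_hyperpyyaml

    with open("hyperparams_train.yaml") as f:
        hparams = load_hyperpyyaml(
            f,
            overrides={
                "data_folder": "/yourpath/whamr",      # placeholder path
                "rir_path": "/yourpath/rir_wavs_16k",  # placeholder path
            },
        )

    # !new: entries come back as instantiated objects, !name: entries as callables.
    masknet = hparams["MaskNet"]        # a constructed Dual_Path_Model
    opt_class = hparams["optimizer"]    # callable that builds torch.optim.Adam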