abuzahid committed on
Commit
127d53c
1 Parent(s): 32c5534

Upload 542 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .cardboardlint.yml +5 -0
  2. .dockerignore +9 -0
  3. .gitignore +171 -0
  4. .pre-commit-config.yaml +27 -0
  5. .pylintrc +597 -0
  6. .readthedocs.yml +18 -0
  7. CITATION.cff +20 -0
  8. CODE_OF_CONDUCT.md +133 -0
  9. CODE_OWNERS.rst +75 -0
  10. CONTRIBUTING.md +136 -0
  11. Dockerfile +12 -0
  12. LICENSE.txt +373 -0
  13. MANIFEST.in +15 -0
  14. Makefile +72 -0
  15. README.md +343 -3
  16. TTS/.models.json +819 -0
  17. TTS/VERSION +1 -0
  18. TTS/__init__.py +6 -0
  19. TTS/api.py +628 -0
  20. TTS/bin/__init__.py +0 -0
  21. TTS/bin/collect_env_info.py +48 -0
  22. TTS/bin/compute_attention_masks.py +165 -0
  23. TTS/bin/compute_embeddings.py +172 -0
  24. TTS/bin/compute_statistics.py +96 -0
  25. TTS/bin/eval_encoder.py +88 -0
  26. TTS/bin/extract_tts_spectrograms.py +286 -0
  27. TTS/bin/find_unique_chars.py +45 -0
  28. TTS/bin/find_unique_phonemes.py +74 -0
  29. TTS/bin/remove_silence_using_vad.py +93 -0
  30. TTS/bin/resample.py +90 -0
  31. TTS/bin/synthesize.py +418 -0
  32. TTS/bin/train_encoder.py +319 -0
  33. TTS/bin/train_tts.py +71 -0
  34. TTS/bin/train_vocoder.py +77 -0
  35. TTS/bin/tune_wavegrad.py +103 -0
  36. TTS/config/__init__.py +132 -0
  37. TTS/config/shared_configs.py +268 -0
  38. TTS/encoder/README.md +18 -0
  39. TTS/encoder/__init__.py +0 -0
  40. TTS/encoder/configs/base_encoder_config.py +61 -0
  41. TTS/encoder/configs/emotion_encoder_config.py +12 -0
  42. TTS/encoder/configs/speaker_encoder_config.py +11 -0
  43. TTS/encoder/dataset.py +147 -0
  44. TTS/encoder/losses.py +226 -0
  45. TTS/encoder/models/base_encoder.py +161 -0
  46. TTS/encoder/models/lstm.py +99 -0
  47. TTS/encoder/models/resnet.py +198 -0
  48. TTS/encoder/requirements.txt +2 -0
  49. TTS/encoder/utils/__init__.py +0 -0
  50. TTS/encoder/utils/generic_utils.py +182 -0
.cardboardlint.yml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ linters:
2
+ - pylint:
3
+ # pylintrc: pylintrc
4
+ filefilter: ['- test_*.py', '+ *.py', '- *.npy']
5
+ # exclude:
.dockerignore ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ .git/
2
+ Dockerfile
3
+ build/
4
+ dist/
5
+ TTS.egg-info/
6
+ tests/outputs/*
7
+ tests/train_outputs/*
8
+ __pycache__/
9
+ *.pyc
.gitignore ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ WadaSNR/
2
+ .idea/
3
+ *.pyc
4
+ .DS_Store
5
+ ./__init__.py
6
+ # Byte-compiled / optimized / DLL files
7
+ __pycache__/
8
+ *.py[cod]
9
+ *$py.class
10
+
11
+ # C extensions
12
+ *.so
13
+
14
+ # Distribution / packaging
15
+ .Python
16
+ build/
17
+ develop-eggs/
18
+ dist/
19
+ downloads/
20
+ eggs/
21
+ .eggs/
22
+ lib/
23
+ lib64/
24
+ parts/
25
+ sdist/
26
+ var/
27
+ wheels/
28
+ *.egg-info/
29
+ .installed.cfg
30
+ *.egg
31
+ MANIFEST
32
+
33
+ # PyInstaller
34
+ # Usually these files are written by a python script from a template
35
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
36
+ *.manifest
37
+ *.spec
38
+
39
+ # Installer logs
40
+ pip-log.txt
41
+ pip-delete-this-directory.txt
42
+
43
+ # Unit test / coverage reports
44
+ htmlcov/
45
+ .tox/
46
+ .coverage
47
+ .coverage.*
48
+ .cache
49
+ nosetests.xml
50
+ coverage.xml
51
+ *.cover
52
+ .hypothesis/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ .static_storage/
61
+ .media/
62
+ local_settings.py
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ target/
76
+
77
+ # Jupyter Notebook
78
+ .ipynb_checkpoints
79
+
80
+ # pyenv
81
+ .python-version
82
+
83
+ # celery beat schedule file
84
+ celerybeat-schedule
85
+
86
+ # SageMath parsed files
87
+ *.sage.py
88
+
89
+ # Environments
90
+ .env
91
+ .venv
92
+ env/
93
+ venv/
94
+ ENV/
95
+ env.bak/
96
+ venv.bak/
97
+
98
+ # Spyder project settings
99
+ .spyderproject
100
+ .spyproject
101
+
102
+ # Rope project settings
103
+ .ropeproject
104
+
105
+ # mkdocs documentation
106
+ /site
107
+
108
+ # mypy
109
+ .mypy_cache/
110
+
111
+ # vim
112
+ *.swp
113
+ *.swm
114
+ *.swn
115
+ *.swo
116
+
117
+ # pytorch models
118
+ *.pth
119
+ *.pth.tar
120
+ !dummy_speakers.pth
121
+ result/
122
+
123
+ # setup.py
124
+ version.py
125
+
126
+ # jupyter dummy files
127
+ core
128
+
129
+ # ignore local datasets
130
+ recipes/WIP/*
131
+ recipes/ljspeech/LJSpeech-1.1/*
132
+ recipes/vctk/VCTK/*
133
+ recipes/**/*.npy
134
+ recipes/**/*.json
135
+ VCTK-Corpus-removed-silence/*
136
+
137
+ # ignore training logs
138
+ trainer_*_log.txt
139
+
140
+ # files used internally for dev, test etc.
141
+ tests/outputs/*
142
+ tests/train_outputs/*
143
+ TODO.txt
144
+ .vscode/*
145
+ data/*
146
+ notebooks/data/*
147
+ TTS/tts/utils/monotonic_align/core.c
148
+ .vscode-upload.json
149
+ temp_build/*
150
+ events.out*
151
+ old_configs/*
152
+ model_importers/*
153
+ model_profiling/*
154
+ docs/source/TODO/*
155
+ .noseids
156
+ .dccache
157
+ log.txt
158
+ umap.png
159
+ *.out
160
+ SocialMedia.txt
161
+ output.wav
162
+ tts_output.wav
163
+ deps.json
164
+ speakers.json
165
+ internal/*
166
+ *_pitch.npy
167
+ *_phoneme.npy
168
+ wandb
169
+ depot/*
170
+ coqui_recipes/*
171
+ local_scripts/*
.pre-commit-config.yaml ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ repos:
2
+ - repo: 'https://github.com/pre-commit/pre-commit-hooks'
3
+ rev: v2.3.0
4
+ hooks:
5
+ - id: check-yaml
6
+ - id: end-of-file-fixer
7
+ - id: trailing-whitespace
8
+ - repo: 'https://github.com/psf/black'
9
+ rev: 22.3.0
10
+ hooks:
11
+ - id: black
12
+ language_version: python3
13
+ - repo: https://github.com/pycqa/isort
14
+ rev: 5.8.0
15
+ hooks:
16
+ - id: isort
17
+ name: isort (python)
18
+ - id: isort
19
+ name: isort (cython)
20
+ types: [cython]
21
+ - id: isort
22
+ name: isort (pyi)
23
+ types: [pyi]
24
+ - repo: https://github.com/pycqa/pylint
25
+ rev: v2.8.2
26
+ hooks:
27
+ - id: pylint
.pylintrc ADDED
@@ -0,0 +1,597 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [MASTER]
2
+
3
+ # A comma-separated list of package or module names from where C extensions may
4
+ # be loaded. Extensions are loaded into the active Python interpreter and may
5
+ # run arbitrary code.
6
+ extension-pkg-whitelist=
7
+
8
+ # Add files or directories to the blacklist. They should be base names, not
9
+ # paths.
10
+ ignore=CVS
11
+
12
+ # Add files or directories matching the regex patterns to the blacklist. The
13
+ # regex matches against base names, not paths.
14
+ ignore-patterns=
15
+
16
+ # Python code to execute, usually for sys.path manipulation such as
17
+ # pygtk.require().
18
+ #init-hook=
19
+
20
+ # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
21
+ # number of processors available to use.
22
+ jobs=1
23
+
24
+ # Control the amount of potential inferred values when inferring a single
25
+ # object. This can help the performance when dealing with large functions or
26
+ # complex, nested conditions.
27
+ limit-inference-results=100
28
+
29
+ # List of plugins (as comma separated values of python modules names) to load,
30
+ # usually to register additional checkers.
31
+ load-plugins=
32
+
33
+ # Pickle collected data for later comparisons.
34
+ persistent=yes
35
+
36
+ # Specify a configuration file.
37
+ #rcfile=
38
+
39
+ # When enabled, pylint would attempt to guess common misconfiguration and emit
40
+ # user-friendly hints instead of false-positive error messages.
41
+ suggestion-mode=yes
42
+
43
+ # Allow loading of arbitrary C extensions. Extensions are imported into the
44
+ # active Python interpreter and may run arbitrary code.
45
+ unsafe-load-any-extension=no
46
+
47
+
48
+ [MESSAGES CONTROL]
49
+
50
+ # Only show warnings with the listed confidence levels. Leave empty to show
51
+ # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED.
52
+ confidence=
53
+
54
+ # Disable the message, report, category or checker with the given id(s). You
55
+ # can either give multiple identifiers separated by comma (,) or put this
56
+ # option multiple times (only on the command line, not in the configuration
57
+ # file where it should appear only once). You can also use "--disable=all" to
58
+ # disable everything first and then reenable specific checks. For example, if
59
+ # you want to run only the similarities checker, you can use "--disable=all
60
+ # --enable=similarities". If you want to run only the classes checker, but have
61
+ # no Warning level messages displayed, use "--disable=all --enable=classes
62
+ # --disable=W".
63
+ disable=missing-docstring,
64
+ too-many-public-methods,
65
+ too-many-lines,
66
+ bare-except,
67
+ ## for avoiding weird p3.6 CI linter error
68
+ ## TODO: see later if we can remove this
69
+ assigning-non-slot,
70
+ unsupported-assignment-operation,
71
+ ## end
72
+ line-too-long,
73
+ fixme,
74
+ wrong-import-order,
75
+ ungrouped-imports,
76
+ wrong-import-position,
77
+ import-error,
78
+ invalid-name,
79
+ too-many-instance-attributes,
80
+ arguments-differ,
81
+ arguments-renamed,
82
+ no-name-in-module,
83
+ no-member,
84
+ unsubscriptable-object,
85
+ print-statement,
86
+ parameter-unpacking,
87
+ unpacking-in-except,
88
+ old-raise-syntax,
89
+ backtick,
90
+ long-suffix,
91
+ old-ne-operator,
92
+ old-octal-literal,
93
+ import-star-module-level,
94
+ non-ascii-bytes-literal,
95
+ raw-checker-failed,
96
+ bad-inline-option,
97
+ locally-disabled,
98
+ file-ignored,
99
+ suppressed-message,
100
+ useless-suppression,
101
+ deprecated-pragma,
102
+ use-symbolic-message-instead,
103
+ useless-object-inheritance,
104
+ too-few-public-methods,
105
+ too-many-branches,
106
+ too-many-arguments,
107
+ too-many-locals,
108
+ too-many-statements,
109
+ apply-builtin,
110
+ basestring-builtin,
111
+ buffer-builtin,
112
+ cmp-builtin,
113
+ coerce-builtin,
114
+ execfile-builtin,
115
+ file-builtin,
116
+ long-builtin,
117
+ raw_input-builtin,
118
+ reduce-builtin,
119
+ standarderror-builtin,
120
+ unicode-builtin,
121
+ xrange-builtin,
122
+ coerce-method,
123
+ delslice-method,
124
+ getslice-method,
125
+ setslice-method,
126
+ no-absolute-import,
127
+ old-division,
128
+ dict-iter-method,
129
+ dict-view-method,
130
+ next-method-called,
131
+ metaclass-assignment,
132
+ indexing-exception,
133
+ raising-string,
134
+ reload-builtin,
135
+ oct-method,
136
+ hex-method,
137
+ nonzero-method,
138
+ cmp-method,
139
+ input-builtin,
140
+ round-builtin,
141
+ intern-builtin,
142
+ unichr-builtin,
143
+ map-builtin-not-iterating,
144
+ zip-builtin-not-iterating,
145
+ range-builtin-not-iterating,
146
+ filter-builtin-not-iterating,
147
+ using-cmp-argument,
148
+ eq-without-hash,
149
+ div-method,
150
+ idiv-method,
151
+ rdiv-method,
152
+ exception-message-attribute,
153
+ invalid-str-codec,
154
+ sys-max-int,
155
+ bad-python3-import,
156
+ deprecated-string-function,
157
+ deprecated-str-translate-call,
158
+ deprecated-itertools-function,
159
+ deprecated-types-field,
160
+ next-method-defined,
161
+ dict-items-not-iterating,
162
+ dict-keys-not-iterating,
163
+ dict-values-not-iterating,
164
+ deprecated-operator-function,
165
+ deprecated-urllib-function,
166
+ xreadlines-attribute,
167
+ deprecated-sys-function,
168
+ exception-escape,
169
+ comprehension-escape,
170
+ duplicate-code,
171
+ not-callable,
172
+ import-outside-toplevel
173
+
174
+ # Enable the message, report, category or checker with the given id(s). You can
175
+ # either give multiple identifiers separated by comma (,) or put this option
176
+ # multiple times (only on the command line, not in the configuration file where
177
+ # it should appear only once). See also the "--disable" option for examples.
178
+ enable=c-extension-no-member
179
+
180
+
181
+ [REPORTS]
182
+
183
+ # Python expression which should return a note less than 10 (10 is the highest
184
+ # note). You have access to the variables error, warning and statement, which
185
+ # respectively contain the number of errors / warnings messages and the total
186
+ # number of statements analyzed. This is used by the global evaluation report
187
+ # (RP0004).
188
+ evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
189
+
190
+ # Template used to display messages. This is a python new-style format string
191
+ # used to format the message information. See doc for all details.
192
+ #msg-template=
193
+
194
+ # Set the output format. Available formats are text, parseable, colorized, json
195
+ # and msvs (visual studio). You can also give a reporter class, e.g.
196
+ # mypackage.mymodule.MyReporterClass.
197
+ output-format=text
198
+
199
+ # Tells whether to display a full report or only the messages.
200
+ reports=no
201
+
202
+ # Activate the evaluation score.
203
+ score=yes
204
+
205
+
206
+ [REFACTORING]
207
+
208
+ # Maximum number of nested blocks for function / method body
209
+ max-nested-blocks=5
210
+
211
+ # Complete name of functions that never returns. When checking for
212
+ # inconsistent-return-statements if a never returning function is called then
213
+ # it will be considered as an explicit return statement and no message will be
214
+ # printed.
215
+ never-returning-functions=sys.exit
216
+
217
+
218
+ [LOGGING]
219
+
220
+ # Format style used to check logging format string. `old` means using %
221
+ # formatting, while `new` is for `{}` formatting.
222
+ logging-format-style=old
223
+
224
+ # Logging modules to check that the string format arguments are in logging
225
+ # function parameter format.
226
+ logging-modules=logging
227
+
228
+
229
+ [SPELLING]
230
+
231
+ # Limits count of emitted suggestions for spelling mistakes.
232
+ max-spelling-suggestions=4
233
+
234
+ # Spelling dictionary name. Available dictionaries: none. To make it work,
235
+ # install the python-enchant package.
236
+ spelling-dict=
237
+
238
+ # List of comma separated words that should not be checked.
239
+ spelling-ignore-words=
240
+
241
+ # A path to a file that contains private dictionary; one word per line.
242
+ spelling-private-dict-file=
243
+
244
+ # Tells whether to store unknown words to indicated private dictionary in
245
+ # --spelling-private-dict-file option instead of raising a message.
246
+ spelling-store-unknown-words=no
247
+
248
+
249
+ [MISCELLANEOUS]
250
+
251
+ # List of note tags to take in consideration, separated by a comma.
252
+ notes=FIXME,
253
+ XXX,
254
+ TODO
255
+
256
+
257
+ [TYPECHECK]
258
+
259
+ # List of decorators that produce context managers, such as
260
+ # contextlib.contextmanager. Add to this list to register other decorators that
261
+ # produce valid context managers.
262
+ contextmanager-decorators=contextlib.contextmanager
263
+
264
+ # List of members which are set dynamically and missed by pylint inference
265
+ # system, and so shouldn't trigger E1101 when accessed. Python regular
266
+ # expressions are accepted.
267
+ generated-members=numpy.*,torch.*
268
+
269
+ # Tells whether missing members accessed in mixin class should be ignored. A
270
+ # mixin class is detected if its name ends with "mixin" (case insensitive).
271
+ ignore-mixin-members=yes
272
+
273
+ # Tells whether to warn about missing members when the owner of the attribute
274
+ # is inferred to be None.
275
+ ignore-none=yes
276
+
277
+ # This flag controls whether pylint should warn about no-member and similar
278
+ # checks whenever an opaque object is returned when inferring. The inference
279
+ # can return multiple potential results while evaluating a Python object, but
280
+ # some branches might not be evaluated, which results in partial inference. In
281
+ # that case, it might be useful to still emit no-member and other checks for
282
+ # the rest of the inferred objects.
283
+ ignore-on-opaque-inference=yes
284
+
285
+ # List of class names for which member attributes should not be checked (useful
286
+ # for classes with dynamically set attributes). This supports the use of
287
+ # qualified names.
288
+ ignored-classes=optparse.Values,thread._local,_thread._local
289
+
290
+ # List of module names for which member attributes should not be checked
291
+ # (useful for modules/projects where namespaces are manipulated during runtime
292
+ # and thus existing member attributes cannot be deduced by static analysis. It
293
+ # supports qualified module names, as well as Unix pattern matching.
294
+ ignored-modules=
295
+
296
+ # Show a hint with possible names when a member name was not found. The aspect
297
+ # of finding the hint is based on edit distance.
298
+ missing-member-hint=yes
299
+
300
+ # The minimum edit distance a name should have in order to be considered a
301
+ # similar match for a missing member name.
302
+ missing-member-hint-distance=1
303
+
304
+ # The total number of similar names that should be taken in consideration when
305
+ # showing a hint for a missing member.
306
+ missing-member-max-choices=1
307
+
308
+
309
+ [VARIABLES]
310
+
311
+ # List of additional names supposed to be defined in builtins. Remember that
312
+ # you should avoid defining new builtins when possible.
313
+ additional-builtins=
314
+
315
+ # Tells whether unused global variables should be treated as a violation.
316
+ allow-global-unused-variables=yes
317
+
318
+ # List of strings which can identify a callback function by name. A callback
319
+ # name must start or end with one of those strings.
320
+ callbacks=cb_,
321
+ _cb
322
+
323
+ # A regular expression matching the name of dummy variables (i.e. expected to
324
+ # not be used).
325
+ dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
326
+
327
+ # Argument names that match this expression will be ignored. Default to name
328
+ # with leading underscore.
329
+ ignored-argument-names=_.*|^ignored_|^unused_
330
+
331
+ # Tells whether we should check for unused import in __init__ files.
332
+ init-import=no
333
+
334
+ # List of qualified module names which can have objects that can redefine
335
+ # builtins.
336
+ redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io
337
+
338
+
339
+ [FORMAT]
340
+
341
+ # Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
342
+ expected-line-ending-format=
343
+
344
+ # Regexp for a line that is allowed to be longer than the limit.
345
+ ignore-long-lines=^\s*(# )?<?https?://\S+>?$
346
+
347
+ # Number of spaces of indent required inside a hanging or continued line.
348
+ indent-after-paren=4
349
+
350
+ # String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
351
+ # tab).
352
+ indent-string='    '
353
+
354
+ # Maximum number of characters on a single line.
355
+ max-line-length=120
356
+
357
+ # Maximum number of lines in a module.
358
+ max-module-lines=1000
359
+
360
+ # List of optional constructs for which whitespace checking is disabled. `dict-
361
+ # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}.
362
+ # `trailing-comma` allows a space between comma and closing bracket: (a, ).
363
+ # `empty-line` allows space-only lines.
364
+ no-space-check=trailing-comma,
365
+ dict-separator
366
+
367
+ # Allow the body of a class to be on the same line as the declaration if body
368
+ # contains single statement.
369
+ single-line-class-stmt=no
370
+
371
+ # Allow the body of an if to be on the same line as the test if there is no
372
+ # else.
373
+ single-line-if-stmt=no
374
+
375
+
376
+ [SIMILARITIES]
377
+
378
+ # Ignore comments when computing similarities.
379
+ ignore-comments=yes
380
+
381
+ # Ignore docstrings when computing similarities.
382
+ ignore-docstrings=yes
383
+
384
+ # Ignore imports when computing similarities.
385
+ ignore-imports=no
386
+
387
+ # Minimum number of lines of a similarity.
388
+ min-similarity-lines=4
389
+
390
+
391
+ [BASIC]
392
+
393
+ # Naming style matching correct argument names.
394
+ argument-naming-style=snake_case
395
+
396
+ # Regular expression matching correct argument names. Overrides argument-
397
+ # naming-style.
398
+ argument-rgx=[a-z_][a-z0-9_]{0,30}$
399
+
400
+ # Naming style matching correct attribute names.
401
+ attr-naming-style=snake_case
402
+
403
+ # Regular expression matching correct attribute names. Overrides attr-naming-
404
+ # style.
405
+ #attr-rgx=
406
+
407
+ # Bad variable names which should always be refused, separated by a comma.
408
+ bad-names=
409
+
410
+ # Naming style matching correct class attribute names.
411
+ class-attribute-naming-style=any
412
+
413
+ # Regular expression matching correct class attribute names. Overrides class-
414
+ # attribute-naming-style.
415
+ #class-attribute-rgx=
416
+
417
+ # Naming style matching correct class names.
418
+ class-naming-style=PascalCase
419
+
420
+ # Regular expression matching correct class names. Overrides class-naming-
421
+ # style.
422
+ #class-rgx=
423
+
424
+ # Naming style matching correct constant names.
425
+ const-naming-style=UPPER_CASE
426
+
427
+ # Regular expression matching correct constant names. Overrides const-naming-
428
+ # style.
429
+ #const-rgx=
430
+
431
+ # Minimum line length for functions/classes that require docstrings, shorter
432
+ # ones are exempt.
433
+ docstring-min-length=-1
434
+
435
+ # Naming style matching correct function names.
436
+ function-naming-style=snake_case
437
+
438
+ # Regular expression matching correct function names. Overrides function-
439
+ # naming-style.
440
+ #function-rgx=
441
+
442
+ # Good variable names which should always be accepted, separated by a comma.
443
+ good-names=i,
444
+ j,
445
+ k,
446
+ x,
447
+ ex,
448
+ Run,
449
+ _
450
+
451
+ # Include a hint for the correct naming format with invalid-name.
452
+ include-naming-hint=no
453
+
454
+ # Naming style matching correct inline iteration names.
455
+ inlinevar-naming-style=any
456
+
457
+ # Regular expression matching correct inline iteration names. Overrides
458
+ # inlinevar-naming-style.
459
+ #inlinevar-rgx=
460
+
461
+ # Naming style matching correct method names.
462
+ method-naming-style=snake_case
463
+
464
+ # Regular expression matching correct method names. Overrides method-naming-
465
+ # style.
466
+ #method-rgx=
467
+
468
+ # Naming style matching correct module names.
469
+ module-naming-style=snake_case
470
+
471
+ # Regular expression matching correct module names. Overrides module-naming-
472
+ # style.
473
+ #module-rgx=
474
+
475
+ # Colon-delimited sets of names that determine each other's naming style when
476
+ # the name regexes allow several styles.
477
+ name-group=
478
+
479
+ # Regular expression which should only match function or class names that do
480
+ # not require a docstring.
481
+ no-docstring-rgx=^_
482
+
483
+ # List of decorators that produce properties, such as abc.abstractproperty. Add
484
+ # to this list to register other decorators that produce valid properties.
485
+ # These decorators are taken in consideration only for invalid-name.
486
+ property-classes=abc.abstractproperty
487
+
488
+ # Naming style matching correct variable names.
489
+ variable-naming-style=snake_case
490
+
491
+ # Regular expression matching correct variable names. Overrides variable-
492
+ # naming-style.
493
+ variable-rgx=[a-z_][a-z0-9_]{0,30}$
494
+
495
+
496
+ [STRING]
497
+
498
+ # This flag controls whether the implicit-str-concat-in-sequence should
499
+ # generate a warning on implicit string concatenation in sequences defined over
500
+ # several lines.
501
+ check-str-concat-over-line-jumps=no
502
+
503
+
504
+ [IMPORTS]
505
+
506
+ # Allow wildcard imports from modules that define __all__.
507
+ allow-wildcard-with-all=no
508
+
509
+ # Analyse import fallback blocks. This can be used to support both Python 2 and
510
+ # 3 compatible code, which means that the block might have code that exists
511
+ # only in one or another interpreter, leading to false positives when analysed.
512
+ analyse-fallback-blocks=no
513
+
514
+ # Deprecated modules which should not be used, separated by a comma.
515
+ deprecated-modules=optparse,tkinter.tix
516
+
517
+ # Create a graph of external dependencies in the given file (report RP0402 must
518
+ # not be disabled).
519
+ ext-import-graph=
520
+
521
+ # Create a graph of every (i.e. internal and external) dependencies in the
522
+ # given file (report RP0402 must not be disabled).
523
+ import-graph=
524
+
525
+ # Create a graph of internal dependencies in the given file (report RP0402 must
526
+ # not be disabled).
527
+ int-import-graph=
528
+
529
+ # Force import order to recognize a module as part of the standard
530
+ # compatibility libraries.
531
+ known-standard-library=
532
+
533
+ # Force import order to recognize a module as part of a third party library.
534
+ known-third-party=enchant
535
+
536
+
537
+ [CLASSES]
538
+
539
+ # List of method names used to declare (i.e. assign) instance attributes.
540
+ defining-attr-methods=__init__,
541
+ __new__,
542
+ setUp
543
+
544
+ # List of member names, which should be excluded from the protected access
545
+ # warning.
546
+ exclude-protected=_asdict,
547
+ _fields,
548
+ _replace,
549
+ _source,
550
+ _make
551
+
552
+ # List of valid names for the first argument in a class method.
553
+ valid-classmethod-first-arg=cls
554
+
555
+ # List of valid names for the first argument in a metaclass class method.
556
+ valid-metaclass-classmethod-first-arg=cls
557
+
558
+
559
+ [DESIGN]
560
+
561
+ # Maximum number of arguments for function / method.
562
+ max-args=5
563
+
564
+ # Maximum number of attributes for a class (see R0902).
565
+ max-attributes=7
566
+
567
+ # Maximum number of boolean expressions in an if statement.
568
+ max-bool-expr=5
569
+
570
+ # Maximum number of branches for function / method body.
571
+ max-branches=12
572
+
573
+ # Maximum number of locals for function / method body.
574
+ max-locals=15
575
+
576
+ # Maximum number of parents for a class (see R0901).
577
+ max-parents=15
578
+
579
+ # Maximum number of public methods for a class (see R0904).
580
+ max-public-methods=20
581
+
582
+ # Maximum number of return / yield for function / method body.
583
+ max-returns=6
584
+
585
+ # Maximum number of statements in function / method body.
586
+ max-statements=50
587
+
588
+ # Minimum number of public methods for a class (see R0903).
589
+ min-public-methods=2
590
+
591
+
592
+ [EXCEPTIONS]
593
+
594
+ # Exceptions that will emit a warning when being caught. Defaults to
595
+ # "BaseException, Exception".
596
+ overgeneral-exceptions=BaseException,
597
+ Exception
.readthedocs.yml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # .readthedocs.yml
2
+ # Read the Docs configuration file
3
+ # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4
+
5
+ # Required
6
+ version: 2
7
+
8
+ # Build documentation in the docs/ directory with Sphinx
9
+ sphinx:
10
+ builder: html
11
+ configuration: docs/source/conf.py
12
+
13
+ # Optionally set the version of Python and requirements required to build your docs
14
+ python:
15
+ version: 3.7
16
+ install:
17
+ - requirements: docs/requirements.txt
18
+ - requirements: requirements.txt
CITATION.cff ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ cff-version: 1.2.0
2
+ message: "If you want to cite 🐸💬, feel free to use this (but only if you loved it 😊)"
3
+ title: "Coqui TTS"
4
+ abstract: "A deep learning toolkit for Text-to-Speech, battle-tested in research and production"
5
+ date-released: 2021-01-01
6
+ authors:
7
+ - family-names: "Eren"
8
+ given-names: "Gölge"
9
+ - name: "The Coqui TTS Team"
10
+ version: 1.4
11
+ doi: 10.5281/zenodo.6334862
12
+ license: "MPL-2.0"
13
+ url: "https://www.coqui.ai"
14
+ repository-code: "https://github.com/coqui-ai/TTS"
15
+ keywords:
16
+ - machine learning
17
+ - deep learning
18
+ - artificial intelligence
19
+ - text to speech
20
+ - TTS
CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Contributor Covenant Code of Conduct
3
+
4
+ ## Our Pledge
5
+
6
+ We as members, contributors, and leaders pledge to make participation in our
7
+ community a harassment-free experience for everyone, regardless of age, body
8
+ size, visible or invisible disability, ethnicity, sex characteristics, gender
9
+ identity and expression, level of experience, education, socio-economic status,
10
+ nationality, personal appearance, race, caste, color, religion, or sexual identity
11
+ and orientation.
12
+
13
+ We pledge to act and interact in ways that contribute to an open, welcoming,
14
+ diverse, inclusive, and healthy community.
15
+
16
+ ## Our Standards
17
+
18
+ Examples of behavior that contributes to a positive environment for our
19
+ community include:
20
+
21
+ * Demonstrating empathy and kindness toward other people
22
+ * Being respectful of differing opinions, viewpoints, and experiences
23
+ * Giving and gracefully accepting constructive feedback
24
+ * Accepting responsibility and apologizing to those affected by our mistakes,
25
+ and learning from the experience
26
+ * Focusing on what is best not just for us as individuals, but for the
27
+ overall community
28
+
29
+ Examples of unacceptable behavior include:
30
+
31
+ * The use of sexualized language or imagery, and sexual attention or
32
+ advances of any kind
33
+ * Trolling, insulting or derogatory comments, and personal or political attacks
34
+ * Public or private harassment
35
+ * Publishing others' private information, such as a physical or email
36
+ address, without their explicit permission
37
+ * Other conduct which could reasonably be considered inappropriate in a
38
+ professional setting
39
+
40
+ ## Enforcement Responsibilities
41
+
42
+ Community leaders are responsible for clarifying and enforcing our standards of
43
+ acceptable behavior and will take appropriate and fair corrective action in
44
+ response to any behavior that they deem inappropriate, threatening, offensive,
45
+ or harmful.
46
+
47
+ Community leaders have the right and responsibility to remove, edit, or reject
48
+ comments, commits, code, wiki edits, issues, and other contributions that are
49
+ not aligned to this Code of Conduct, and will communicate reasons for moderation
50
+ decisions when appropriate.
51
+
52
+ ## Scope
53
+
54
+ This Code of Conduct applies within all community spaces, and also applies when
55
+ an individual is officially representing the community in public spaces.
56
+ Examples of representing our community include using an official e-mail address,
57
+ posting via an official social media account, or acting as an appointed
58
+ representative at an online or offline event.
59
+
60
+ ## Enforcement
61
+
62
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
63
+ reported to the community leaders responsible for enforcement at
64
65
+ All complaints will be reviewed and investigated promptly and fairly.
66
+
67
+ All community leaders are obligated to respect the privacy and security of the
68
+ reporter of any incident.
69
+
70
+ ## Enforcement Guidelines
71
+
72
+ Community leaders will follow these Community Impact Guidelines in determining
73
+ the consequences for any action they deem in violation of this Code of Conduct:
74
+
75
+ ### 1. Correction
76
+
77
+ **Community Impact**: Use of inappropriate language or other behavior deemed
78
+ unprofessional or unwelcome in the community.
79
+
80
+ **Consequence**: A private, written warning from community leaders, providing
81
+ clarity around the nature of the violation and an explanation of why the
82
+ behavior was inappropriate. A public apology may be requested.
83
+
84
+ ### 2. Warning
85
+
86
+ **Community Impact**: A violation through a single incident or series
87
+ of actions.
88
+
89
+ **Consequence**: A warning with consequences for continued behavior. No
90
+ interaction with the people involved, including unsolicited interaction with
91
+ those enforcing the Code of Conduct, for a specified period of time. This
92
+ includes avoiding interactions in community spaces as well as external channels
93
+ like social media. Violating these terms may lead to a temporary or
94
+ permanent ban.
95
+
96
+ ### 3. Temporary Ban
97
+
98
+ **Community Impact**: A serious violation of community standards, including
99
+ sustained inappropriate behavior.
100
+
101
+ **Consequence**: A temporary ban from any sort of interaction or public
102
+ communication with the community for a specified period of time. No public or
103
+ private interaction with the people involved, including unsolicited interaction
104
+ with those enforcing the Code of Conduct, is allowed during this period.
105
+ Violating these terms may lead to a permanent ban.
106
+
107
+ ### 4. Permanent Ban
108
+
109
+ **Community Impact**: Demonstrating a pattern of violation of community
110
+ standards, including sustained inappropriate behavior, harassment of an
111
+ individual, or aggression toward or disparagement of classes of individuals.
112
+
113
+ **Consequence**: A permanent ban from any sort of public interaction within
114
+ the community.
115
+
116
+ ## Attribution
117
+
118
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage],
119
+ version 2.0, available at
120
+ [https://www.contributor-covenant.org/version/2/0/code_of_conduct.html][v2.0].
121
+
122
+ Community Impact Guidelines were inspired by
123
+ [Mozilla's code of conduct enforcement ladder][Mozilla CoC].
124
+
125
+ For answers to common questions about this code of conduct, see the FAQ at
126
+ [https://www.contributor-covenant.org/faq][FAQ]. Translations are available
127
+ at [https://www.contributor-covenant.org/translations][translations].
128
+
129
+ [homepage]: https://www.contributor-covenant.org
130
+ [v2.0]: https://www.contributor-covenant.org/version/2/0/code_of_conduct.html
131
+ [Mozilla CoC]: https://github.com/mozilla/diversity
132
+ [FAQ]: https://www.contributor-covenant.org/faq
133
+ [translations]: https://www.contributor-covenant.org/translations
CODE_OWNERS.rst ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ TTS code owners / governance system
2
+ ==========================================
3
+
4
+ TTS is run under a governance system inspired by (and partially copied from) the `Mozilla module ownership system <https://www.mozilla.org/about/governance/policies/module-ownership/>`_. The project is roughly divided into modules, and each module has its owners, which are responsible for reviewing pull requests and deciding on technical direction for their modules. Module ownership authority is given to people who have worked extensively on areas of the project.
5
+
6
+ Module owners also have the authority of naming other module owners or appointing module peers, which are people with authority to review pull requests in that module. They can also sub-divide their module into sub-modules with their owners.
7
+
8
+ Module owners are not tyrants. They are chartered to make decisions with input from the community and in the best interest of the community. Module owners are not required to make code changes or additions solely because the community wants them to do so. (Like anyone else, the module owners may write code because they want to, because their employers want them to, because the community wants them to, or for some other reason.) Module owners do need to pay attention to patches submitted to that module. However “pay attention” does not mean agreeing to every patch. Some patches may not make sense for the TTS project; some may be poorly implemented. Module owners have the authority to decline a patch; this is a necessary part of the role. We ask the module owners to describe in the relevant issue their reasons for wanting changes to a patch, for declining it altogether, or for postponing review for some period. We don’t ask or expect them to rewrite patches to make them acceptable. Similarly, module owners may need to delay review of a promising patch due to an upcoming deadline. For example, a patch may be of interest, but not for the next milestone. In such a case it may make sense for the module owner to postpone review of a patch until after matters needed for a milestone have been finalized. Again, we expect this to be described in the relevant issue. And of course, it shouldn’t go on very often or for very long or escalation and review is likely.
9
+
10
+ The work of the various module owners and peers is overseen by the global owners, which are responsible for making final decisions in case there's conflict between owners as well as setting the direction for the project as a whole.
11
+
12
+ This file describes module owners who are active on the project and which parts of the code they have expertise on (and interest in). If you're making changes to the code and are wondering who's an appropriate person to talk to, this list will tell you who to ping.
13
+
14
+ There's overlap in the areas of expertise of each owner, and in particular when looking at which files are covered by each area, there is a lot of overlap. Don't worry about getting it exactly right when requesting review, any code owner will be happy to redirect the request to a more appropriate person.
15
+
16
+ Global owners
17
+ ----------------
18
+
19
+ These are people who have worked on the project extensively and are familiar with all or most parts of it. Their expertise and review guidance is trusted by other code owners to cover their own areas of expertise. In case of conflicting opinions from other owners, global owners will make a final decision.
20
+
21
+ - Eren Gölge (@erogol)
22
+ - Reuben Morais (@reuben)
23
+
24
+ Training, feeding
25
+ -----------------
26
+
27
+ - Eren Gölge (@erogol)
28
+
29
+ Model exporting
30
+ ---------------
31
+
32
+ - Eren Gölge (@erogol)
33
+
34
+ Multi-Speaker TTS
35
+ -----------------
36
+
37
+ - Eren Gölge (@erogol)
38
+ - Edresson Casanova (@edresson)
39
+
40
+ TTS
41
+ ---
42
+
43
+ - Eren Gölge (@erogol)
44
+
45
+ Vocoders
46
+ --------
47
+
48
+ - Eren Gölge (@erogol)
49
+
50
+ Speaker Encoder
51
+ ---------------
52
+
53
+ - Eren Gölge (@erogol)
54
+
55
+ Testing & CI
56
+ ------------
57
+
58
+ - Eren Gölge (@erogol)
59
+ - Reuben Morais (@reuben)
60
+
61
+ Python bindings
62
+ ---------------
63
+
64
+ - Eren Gölge (@erogol)
65
+ - Reuben Morais (@reuben)
66
+
67
+ Documentation
68
+ -------------
69
+
70
+ - Eren Gölge (@erogol)
71
+
72
+ Third party bindings
73
+ --------------------
74
+
75
+ Owned by the author.
CONTRIBUTING.md ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contribution guidelines
2
+
3
+ Welcome to the 🐸TTS!
4
+
5
+ This repository is governed by [the Contributor Covenant Code of Conduct](https://github.com/coqui-ai/TTS/blob/main/CODE_OF_CONDUCT.md).
6
+
7
+ ## Where to start.
8
+ We welcome everyone who likes to contribute to 🐸TTS.
9
+
10
+ You can contribute not only with code but with bug reports, comments, questions, answers, or just a simple tweet to spread the word.
11
+
12
+ If you would like to contribute code or squash a bug but don't know where to start, here are some pointers.
13
+
14
+ - [Development Road Map](https://github.com/coqui-ai/TTS/issues/378)
15
+
16
+ You can pick something out of our road map. We keep the progress of the project in this simple issue thread. It has new model proposals or developmental updates etc.
17
+
18
+ - [Github Issues Tracker](https://github.com/coqui-ai/TTS/issues)
19
+
20
+ This is a place to find feature requests and bugs.
21
+
22
+ Issues with the ```good first issue``` tag are a good place for beginners to start.
23
+
24
+ - ✨**PR**✨ [pages](https://github.com/coqui-ai/TTS/pulls) with the ```🚀new version``` tag.
25
+
26
+ We list all the target improvements for the next version. You can pick one of them and start contributing.
27
+
28
+ - Also feel free to suggest new features, ideas and models. We're always open for new things.
29
+
30
+ ## Call for sharing language models
31
+ If possible, please consider sharing your pre-trained models in any language (if the licences allow for you to do so). We will include them in our model catalogue for public use and give the proper attribution, whether it be your name, company, website or any other source specified.
32
+
33
+ Models can be shared in two ways:
34
+ 1. Share the model files with us and we serve them with the next 🐸 TTS release.
35
+ 2. Upload your models on GDrive and share the link.
36
+
37
+ Models are listed in the `.models.json` file, and any listed model is available through the TTS CLI or server endpoints.
38
+
39
+ Either way you choose, please make sure you send the models [here](https://github.com/coqui-ai/TTS/issues/380).
40
+
41
+ ## Sending a ✨**PR**✨
42
+
43
+ If you have a new feature, a model to implement, or a bug to squash, go ahead and send a ✨**PR**✨.
44
+ Please use the following steps to send a ✨**PR**✨.
45
+ Let us know if you encounter a problem along the way.
46
+
47
+ The following steps are tested on an Ubuntu system.
48
+
49
+ 1. Fork [🐸TTS](https://github.com/coqui-ai/TTS) by clicking the fork button at the top right corner of the project page.
50
+
51
+ 2. Clone 🐸TTS and add the main repo as a new remote named ```upstream```.
52
+
53
+ ```bash
54
+ $ git clone git@github.com:<your Github name>/TTS.git
55
+ $ cd TTS
56
+ $ git remote add upstream https://github.com/coqui-ai/TTS.git
57
+ ```
58
+
59
+ 3. Install 🐸TTS for development.
60
+
61
+ ```bash
62
+ $ make system-deps # intended to be used on Ubuntu (Debian). Let us know if you have a different OS.
63
+ $ make install
64
+ ```
65
+
66
+ 4. Create a new branch with an informative name for your goal.
67
+
68
+ ```bash
69
+ $ git checkout -b an_informative_name_for_my_branch
70
+ ```
71
+
72
+ 5. Implement your changes on your new branch.
73
+
74
+ 6. Explain your code using [Google Style](https://google.github.io/styleguide/pyguide.html#381-docstrings) docstrings.
75
+
76
+ 7. Add your tests to our test suite under ```tests``` folder. It is important to show that your code works, edge cases are considered, and inform others about the intended use.
77
+
78
+ 8. Run the tests to see how your updates work with the rest of the project. You can repeat this step multiple times as you implement your changes to make sure you are heading in the right direction.
79
+
80
+ ```bash
81
+ $ make test # stop at the first error
82
+ $ make test_all # run all the tests, report all the errors
83
+ ```
84
+
85
+ 9. Format your code. We use ```black``` for code and ```isort``` for ```import``` formatting.
86
+
87
+ ```bash
88
+ $ make style
89
+ ```
90
+
91
+ 10. Run the linter and correct the issues raised. We use ```pylint``` for linting. It helps to enforce a coding standard and offers simple refactoring suggestions.
92
+
93
+ ```bash
94
+ $ make lint
95
+ ```
96
+
97
+ 11. When things are good, add new files and commit your changes.
98
+
99
+ ```bash
100
+ $ git add my_file1.py my_file2.py ...
101
+ $ git commit
102
+ ```
103
+
104
+ It's a good practice to regularly sync your local copy of the project with the upstream code to keep up with the recent updates.
105
+
106
+ ```bash
107
+ $ git fetch upstream
108
+ $ git rebase upstream/master
109
+ # or for the development version
110
+ $ git rebase upstream/dev
111
+ ```
112
+
113
+ 12. Send a PR to ```dev``` branch.
114
+
115
+ Push your branch to your fork.
116
+
117
+ ```bash
118
+ $ git push -u origin an_informative_name_for_my_branch
119
+ ```
120
+
121
+ Then go to your fork's Github page and click on 'Pull request' to send your ✨**PR**✨.
122
+
123
+ Please set ✨**PR**✨'s target branch to ```dev``` as we use ```dev``` to work on the next version.
124
+
125
+ 13. Let's discuss until it is perfect. 💪
126
+
127
+ We might ask you for certain changes that would appear in the ✨**PR**✨'s page under [🐸TTS](https://github.com/coqui-ai/TTS/pulls).
128
+
129
+ 14. Once things look perfect, we merge it to the ```dev``` branch and make it ready for the next version.
130
+
131
+ Feel free to ping us at any step you need help using our communication channels.
132
+
133
+ If you are new to Github or open-source contribution, these are good resources.
134
+
135
+ - [Github Docs](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/proposing-changes-to-your-work-with-pull-requests)
136
+ - [First-Contribution](https://github.com/firstcontributions/first-contributions)
Dockerfile ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ARG BASE=nvidia/cuda:11.8.0-base-ubuntu22.04
2
+ FROM ${BASE}
3
+ RUN apt-get update && apt-get upgrade -y
4
+ RUN apt-get install -y --no-install-recommends gcc g++ make python3 python3-dev python3-pip python3-venv python3-wheel espeak-ng libsndfile1-dev && rm -rf /var/lib/apt/lists/*
5
+ RUN pip3 install llvmlite --ignore-installed
6
+
7
+ WORKDIR /root
8
+ COPY . /root
9
+ RUN pip3 install torch torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
10
+ RUN make install
11
+ ENTRYPOINT ["tts"]
12
+ CMD ["--help"]
LICENSE.txt ADDED
@@ -0,0 +1,373 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Mozilla Public License Version 2.0
2
+ ==================================
3
+
4
+ 1. Definitions
5
+ --------------
6
+
7
+ 1.1. "Contributor"
8
+ means each individual or legal entity that creates, contributes to
9
+ the creation of, or owns Covered Software.
10
+
11
+ 1.2. "Contributor Version"
12
+ means the combination of the Contributions of others (if any) used
13
+ by a Contributor and that particular Contributor's Contribution.
14
+
15
+ 1.3. "Contribution"
16
+ means Covered Software of a particular Contributor.
17
+
18
+ 1.4. "Covered Software"
19
+ means Source Code Form to which the initial Contributor has attached
20
+ the notice in Exhibit A, the Executable Form of such Source Code
21
+ Form, and Modifications of such Source Code Form, in each case
22
+ including portions thereof.
23
+
24
+ 1.5. "Incompatible With Secondary Licenses"
25
+ means
26
+
27
+ (a) that the initial Contributor has attached the notice described
28
+ in Exhibit B to the Covered Software; or
29
+
30
+ (b) that the Covered Software was made available under the terms of
31
+ version 1.1 or earlier of the License, but not also under the
32
+ terms of a Secondary License.
33
+
34
+ 1.6. "Executable Form"
35
+ means any form of the work other than Source Code Form.
36
+
37
+ 1.7. "Larger Work"
38
+ means a work that combines Covered Software with other material, in
39
+ a separate file or files, that is not Covered Software.
40
+
41
+ 1.8. "License"
42
+ means this document.
43
+
44
+ 1.9. "Licensable"
45
+ means having the right to grant, to the maximum extent possible,
46
+ whether at the time of the initial grant or subsequently, any and
47
+ all of the rights conveyed by this License.
48
+
49
+ 1.10. "Modifications"
50
+ means any of the following:
51
+
52
+ (a) any file in Source Code Form that results from an addition to,
53
+ deletion from, or modification of the contents of Covered
54
+ Software; or
55
+
56
+ (b) any new file in Source Code Form that contains any Covered
57
+ Software.
58
+
59
+ 1.11. "Patent Claims" of a Contributor
60
+ means any patent claim(s), including without limitation, method,
61
+ process, and apparatus claims, in any patent Licensable by such
62
+ Contributor that would be infringed, but for the grant of the
63
+ License, by the making, using, selling, offering for sale, having
64
+ made, import, or transfer of either its Contributions or its
65
+ Contributor Version.
66
+
67
+ 1.12. "Secondary License"
68
+ means either the GNU General Public License, Version 2.0, the GNU
69
+ Lesser General Public License, Version 2.1, the GNU Affero General
70
+ Public License, Version 3.0, or any later versions of those
71
+ licenses.
72
+
73
+ 1.13. "Source Code Form"
74
+ means the form of the work preferred for making modifications.
75
+
76
+ 1.14. "You" (or "Your")
77
+ means an individual or a legal entity exercising rights under this
78
+ License. For legal entities, "You" includes any entity that
79
+ controls, is controlled by, or is under common control with You. For
80
+ purposes of this definition, "control" means (a) the power, direct
81
+ or indirect, to cause the direction or management of such entity,
82
+ whether by contract or otherwise, or (b) ownership of more than
83
+ fifty percent (50%) of the outstanding shares or beneficial
84
+ ownership of such entity.
85
+
86
+ 2. License Grants and Conditions
87
+ --------------------------------
88
+
89
+ 2.1. Grants
90
+
91
+ Each Contributor hereby grants You a world-wide, royalty-free,
92
+ non-exclusive license:
93
+
94
+ (a) under intellectual property rights (other than patent or trademark)
95
+ Licensable by such Contributor to use, reproduce, make available,
96
+ modify, display, perform, distribute, and otherwise exploit its
97
+ Contributions, either on an unmodified basis, with Modifications, or
98
+ as part of a Larger Work; and
99
+
100
+ (b) under Patent Claims of such Contributor to make, use, sell, offer
101
+ for sale, have made, import, and otherwise transfer either its
102
+ Contributions or its Contributor Version.
103
+
104
+ 2.2. Effective Date
105
+
106
+ The licenses granted in Section 2.1 with respect to any Contribution
107
+ become effective for each Contribution on the date the Contributor first
108
+ distributes such Contribution.
109
+
110
+ 2.3. Limitations on Grant Scope
111
+
112
+ The licenses granted in this Section 2 are the only rights granted under
113
+ this License. No additional rights or licenses will be implied from the
114
+ distribution or licensing of Covered Software under this License.
115
+ Notwithstanding Section 2.1(b) above, no patent license is granted by a
116
+ Contributor:
117
+
118
+ (a) for any code that a Contributor has removed from Covered Software;
119
+ or
120
+
121
+ (b) for infringements caused by: (i) Your and any other third party's
122
+ modifications of Covered Software, or (ii) the combination of its
123
+ Contributions with other software (except as part of its Contributor
124
+ Version); or
125
+
126
+ (c) under Patent Claims infringed by Covered Software in the absence of
127
+ its Contributions.
128
+
129
+ This License does not grant any rights in the trademarks, service marks,
130
+ or logos of any Contributor (except as may be necessary to comply with
131
+ the notice requirements in Section 3.4).
132
+
133
+ 2.4. Subsequent Licenses
134
+
135
+ No Contributor makes additional grants as a result of Your choice to
136
+ distribute the Covered Software under a subsequent version of this
137
+ License (see Section 10.2) or under the terms of a Secondary License (if
138
+ permitted under the terms of Section 3.3).
139
+
140
+ 2.5. Representation
141
+
142
+ Each Contributor represents that the Contributor believes its
143
+ Contributions are its original creation(s) or it has sufficient rights
144
+ to grant the rights to its Contributions conveyed by this License.
145
+
146
+ 2.6. Fair Use
147
+
148
+ This License is not intended to limit any rights You have under
149
+ applicable copyright doctrines of fair use, fair dealing, or other
150
+ equivalents.
151
+
152
+ 2.7. Conditions
153
+
154
+ Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted
155
+ in Section 2.1.
156
+
157
+ 3. Responsibilities
158
+ -------------------
159
+
160
+ 3.1. Distribution of Source Form
161
+
162
+ All distribution of Covered Software in Source Code Form, including any
163
+ Modifications that You create or to which You contribute, must be under
164
+ the terms of this License. You must inform recipients that the Source
165
+ Code Form of the Covered Software is governed by the terms of this
166
+ License, and how they can obtain a copy of this License. You may not
167
+ attempt to alter or restrict the recipients' rights in the Source Code
168
+ Form.
169
+
170
+ 3.2. Distribution of Executable Form
171
+
172
+ If You distribute Covered Software in Executable Form then:
173
+
174
+ (a) such Covered Software must also be made available in Source Code
175
+ Form, as described in Section 3.1, and You must inform recipients of
176
+ the Executable Form how they can obtain a copy of such Source Code
177
+ Form by reasonable means in a timely manner, at a charge no more
178
+ than the cost of distribution to the recipient; and
179
+
180
+ (b) You may distribute such Executable Form under the terms of this
181
+ License, or sublicense it under different terms, provided that the
182
+ license for the Executable Form does not attempt to limit or alter
183
+ the recipients' rights in the Source Code Form under this License.
184
+
185
+ 3.3. Distribution of a Larger Work
186
+
187
+ You may create and distribute a Larger Work under terms of Your choice,
188
+ provided that You also comply with the requirements of this License for
189
+ the Covered Software. If the Larger Work is a combination of Covered
190
+ Software with a work governed by one or more Secondary Licenses, and the
191
+ Covered Software is not Incompatible With Secondary Licenses, this
192
+ License permits You to additionally distribute such Covered Software
193
+ under the terms of such Secondary License(s), so that the recipient of
194
+ the Larger Work may, at their option, further distribute the Covered
195
+ Software under the terms of either this License or such Secondary
196
+ License(s).
197
+
198
+ 3.4. Notices
199
+
200
+ You may not remove or alter the substance of any license notices
201
+ (including copyright notices, patent notices, disclaimers of warranty,
202
+ or limitations of liability) contained within the Source Code Form of
203
+ the Covered Software, except that You may alter any license notices to
204
+ the extent required to remedy known factual inaccuracies.
205
+
206
+ 3.5. Application of Additional Terms
207
+
208
+ You may choose to offer, and to charge a fee for, warranty, support,
209
+ indemnity or liability obligations to one or more recipients of Covered
210
+ Software. However, You may do so only on Your own behalf, and not on
211
+ behalf of any Contributor. You must make it absolutely clear that any
212
+ such warranty, support, indemnity, or liability obligation is offered by
213
+ You alone, and You hereby agree to indemnify every Contributor for any
214
+ liability incurred by such Contributor as a result of warranty, support,
215
+ indemnity or liability terms You offer. You may include additional
216
+ disclaimers of warranty and limitations of liability specific to any
217
+ jurisdiction.
218
+
219
+ 4. Inability to Comply Due to Statute or Regulation
220
+ ---------------------------------------------------
221
+
222
+ If it is impossible for You to comply with any of the terms of this
223
+ License with respect to some or all of the Covered Software due to
224
+ statute, judicial order, or regulation then You must: (a) comply with
225
+ the terms of this License to the maximum extent possible; and (b)
226
+ describe the limitations and the code they affect. Such description must
227
+ be placed in a text file included with all distributions of the Covered
228
+ Software under this License. Except to the extent prohibited by statute
229
+ or regulation, such description must be sufficiently detailed for a
230
+ recipient of ordinary skill to be able to understand it.
231
+
232
+ 5. Termination
233
+ --------------
234
+
235
+ 5.1. The rights granted under this License will terminate automatically
236
+ if You fail to comply with any of its terms. However, if You become
237
+ compliant, then the rights granted under this License from a particular
238
+ Contributor are reinstated (a) provisionally, unless and until such
239
+ Contributor explicitly and finally terminates Your grants, and (b) on an
240
+ ongoing basis, if such Contributor fails to notify You of the
241
+ non-compliance by some reasonable means prior to 60 days after You have
242
+ come back into compliance. Moreover, Your grants from a particular
243
+ Contributor are reinstated on an ongoing basis if such Contributor
244
+ notifies You of the non-compliance by some reasonable means, this is the
245
+ first time You have received notice of non-compliance with this License
246
+ from such Contributor, and You become compliant prior to 30 days after
247
+ Your receipt of the notice.
248
+
249
+ 5.2. If You initiate litigation against any entity by asserting a patent
250
+ infringement claim (excluding declaratory judgment actions,
251
+ counter-claims, and cross-claims) alleging that a Contributor Version
252
+ directly or indirectly infringes any patent, then the rights granted to
253
+ You by any and all Contributors for the Covered Software under Section
254
+ 2.1 of this License shall terminate.
255
+
256
+ 5.3. In the event of termination under Sections 5.1 or 5.2 above, all
257
+ end user license agreements (excluding distributors and resellers) which
258
+ have been validly granted by You or Your distributors under this License
259
+ prior to termination shall survive termination.
260
+
261
+ ************************************************************************
262
+ * *
263
+ * 6. Disclaimer of Warranty *
264
+ * ------------------------- *
265
+ * *
266
+ * Covered Software is provided under this License on an "as is" *
267
+ * basis, without warranty of any kind, either expressed, implied, or *
268
+ * statutory, including, without limitation, warranties that the *
269
+ * Covered Software is free of defects, merchantable, fit for a *
270
+ * particular purpose or non-infringing. The entire risk as to the *
271
+ * quality and performance of the Covered Software is with You. *
272
+ * Should any Covered Software prove defective in any respect, You *
273
+ * (not any Contributor) assume the cost of any necessary servicing, *
274
+ * repair, or correction. This disclaimer of warranty constitutes an *
275
+ * essential part of this License. No use of any Covered Software is *
276
+ * authorized under this License except under this disclaimer. *
277
+ * *
278
+ ************************************************************************
279
+
280
+ ************************************************************************
281
+ * *
282
+ * 7. Limitation of Liability *
283
+ * -------------------------- *
284
+ * *
285
+ * Under no circumstances and under no legal theory, whether tort *
286
+ * (including negligence), contract, or otherwise, shall any *
287
+ * Contributor, or anyone who distributes Covered Software as *
288
+ * permitted above, be liable to You for any direct, indirect, *
289
+ * special, incidental, or consequential damages of any character *
290
+ * including, without limitation, damages for lost profits, loss of *
291
+ * goodwill, work stoppage, computer failure or malfunction, or any *
292
+ * and all other commercial damages or losses, even if such party *
293
+ * shall have been informed of the possibility of such damages. This *
294
+ * limitation of liability shall not apply to liability for death or *
295
+ * personal injury resulting from such party's negligence to the *
296
+ * extent applicable law prohibits such limitation. Some *
297
+ * jurisdictions do not allow the exclusion or limitation of *
298
+ * incidental or consequential damages, so this exclusion and *
299
+ * limitation may not apply to You. *
300
+ * *
301
+ ************************************************************************
302
+
303
+ 8. Litigation
304
+ -------------
305
+
306
+ Any litigation relating to this License may be brought only in the
307
+ courts of a jurisdiction where the defendant maintains its principal
308
+ place of business and such litigation shall be governed by laws of that
309
+ jurisdiction, without reference to its conflict-of-law provisions.
310
+ Nothing in this Section shall prevent a party's ability to bring
311
+ cross-claims or counter-claims.
312
+
313
+ 9. Miscellaneous
314
+ ----------------
315
+
316
+ This License represents the complete agreement concerning the subject
317
+ matter hereof. If any provision of this License is held to be
318
+ unenforceable, such provision shall be reformed only to the extent
319
+ necessary to make it enforceable. Any law or regulation which provides
320
+ that the language of a contract shall be construed against the drafter
321
+ shall not be used to construe this License against a Contributor.
322
+
323
+ 10. Versions of the License
324
+ ---------------------------
325
+
326
+ 10.1. New Versions
327
+
328
+ Mozilla Foundation is the license steward. Except as provided in Section
329
+ 10.3, no one other than the license steward has the right to modify or
330
+ publish new versions of this License. Each version will be given a
331
+ distinguishing version number.
332
+
333
+ 10.2. Effect of New Versions
334
+
335
+ You may distribute the Covered Software under the terms of the version
336
+ of the License under which You originally received the Covered Software,
337
+ or under the terms of any subsequent version published by the license
338
+ steward.
339
+
340
+ 10.3. Modified Versions
341
+
342
+ If you create software not governed by this License, and you want to
343
+ create a new license for such software, you may create and use a
344
+ modified version of this License if you rename the license and remove
345
+ any references to the name of the license steward (except to note that
346
+ such modified license differs from this License).
347
+
348
+ 10.4. Distributing Source Code Form that is Incompatible With Secondary
349
+ Licenses
350
+
351
+ If You choose to distribute Source Code Form that is Incompatible With
352
+ Secondary Licenses under the terms of this version of the License, the
353
+ notice described in Exhibit B of this License must be attached.
354
+
355
+ Exhibit A - Source Code Form License Notice
356
+ -------------------------------------------
357
+
358
+ This Source Code Form is subject to the terms of the Mozilla Public
359
+ License, v. 2.0. If a copy of the MPL was not distributed with this
360
+ file, You can obtain one at http://mozilla.org/MPL/2.0/.
361
+
362
+ If it is not possible or desirable to put the notice in a particular
363
+ file, then You may include the notice in a location (such as a LICENSE
364
+ file in a relevant directory) where a recipient would be likely to look
365
+ for such a notice.
366
+
367
+ You may add additional accurate notices of copyright ownership.
368
+
369
+ Exhibit B - "Incompatible With Secondary Licenses" Notice
370
+ ---------------------------------------------------------
371
+
372
+ This Source Code Form is "Incompatible With Secondary Licenses", as
373
+ defined by the Mozilla Public License, v. 2.0.
MANIFEST.in ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ include README.md
2
+ include LICENSE.txt
3
+ include requirements.*.txt
4
+ include *.cff
5
+ include requirements.txt
6
+ include TTS/VERSION
7
+ recursive-include TTS *.json
8
+ recursive-include TTS *.html
9
+ recursive-include TTS *.png
10
+ recursive-include TTS *.md
11
+ recursive-include TTS *.py
12
+ recursive-include TTS *.pyx
13
+ recursive-include images *.png
14
+ recursive-exclude tests *
15
+ prune tests*
Makefile ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .DEFAULT_GOAL := help
2
+ .PHONY: test system-deps dev-deps deps style lint install help docs
3
+
4
+ help:
5
+ @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
6
+
7
+ target_dirs := tests TTS notebooks recipes
8
+
9
+ test_all: ## run tests and don't stop on an error.
10
+ nose2 --with-coverage --coverage TTS tests
11
+ ./run_bash_tests.sh
12
+
13
+ test: ## run tests.
14
+ nose2 -F -v -B --with-coverage --coverage TTS tests
15
+
16
+ test_vocoder: ## run vocoder tests.
17
+ nose2 -F -v -B --with-coverage --coverage TTS tests.vocoder_tests
18
+
19
+ test_tts: ## run tts tests.
20
+ nose2 -F -v -B --with-coverage --coverage TTS tests.tts_tests
21
+
22
+ test_aux: ## run aux tests.
23
+ nose2 -F -v -B --with-coverage --coverage TTS tests.aux_tests
24
+ ./run_bash_tests.sh
25
+
26
+ test_zoo: ## run zoo tests.
27
+ nose2 -F -v -B --with-coverage --coverage TTS tests.zoo_tests
28
+
29
+ inference_tests: ## run inference tests.
30
+ nose2 -F -v -B --with-coverage --coverage TTS tests.inference_tests
31
+
32
+ data_tests: ## run data tests.
33
+ nose2 -F -v -B --with-coverage --coverage TTS tests.data_tests
34
+
35
+ test_text: ## run text tests.
36
+ nose2 -F -v -B --with-coverage --coverage TTS tests.text_tests
37
+
38
+ test_failed: ## only run the tests that failed the last time.
39
+ nose2 -F -v -B --with-coverage --coverage TTS tests
40
+
41
+ style: ## update code style.
42
+ black ${target_dirs}
43
+ isort ${target_dirs}
44
+
45
+ lint: ## run pylint linter.
46
+ pylint ${target_dirs}
47
+ black ${target_dirs} --check
48
+ isort ${target_dirs} --check-only
49
+
50
+ system-deps: ## install linux system deps
51
+ sudo apt-get install -y libsndfile1-dev
52
+
53
+ dev-deps: ## install development deps
54
+ pip install -r requirements.dev.txt
55
+
56
+ doc-deps: ## install docs dependencies
57
+ pip install -r docs/requirements.txt
58
+
59
+ build-docs: ## build the docs
60
+ cd docs && make clean && make build
61
+
62
+ hub-deps: ## install deps for torch hub use
63
+ pip install -r requirements.hub.txt
64
+
65
+ deps: ## install 🐸 requirements.
66
+ pip install -r requirements.txt
67
+
68
+ install: ## install 🐸 TTS for development.
69
+ pip install -e .[all]
70
+
71
+ docs: ## build the docs
72
+ $(MAKE) -C docs clean && $(MAKE) -C docs html
README.md CHANGED
@@ -1,3 +1,343 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/coqui-log-green-TTS.png" height="56"/>
2
+
3
+ ----
4
+
5
+ ### 📣 Clone your voice with a single click on [🐸Coqui.ai](https://app.coqui.ai/auth/signin)
6
+
7
+ ----
8
+
9
+ 🐸TTS is a library for advanced Text-to-Speech generation. It's built on the latest research and was designed to achieve the best trade-off among ease of training, speed, and quality.
10
+ 🐸TTS comes with pretrained models and tools for measuring dataset quality, and is already used in **20+ languages** for products and research projects.
11
+
12
+ [![Discord](https://img.shields.io/discord/1037326658807533628?color=%239B59B6&label=chat%20on%20discord)](https://discord.gg/5eXr5seRrv)
13
+ [![License](<https://img.shields.io/badge/License-MPL%202.0-brightgreen.svg>)](https://opensource.org/licenses/MPL-2.0)
14
+ [![PyPI version](https://badge.fury.io/py/TTS.svg)](https://badge.fury.io/py/TTS)
15
+ [![Covenant](https://camo.githubusercontent.com/7d620efaa3eac1c5b060ece5d6aacfcc8b81a74a04d05cd0398689c01c4463bb/68747470733a2f2f696d672e736869656c64732e696f2f62616467652f436f6e7472696275746f72253230436f76656e616e742d76322e3025323061646f707465642d6666363962342e737667)](https://github.com/coqui-ai/TTS/blob/master/CODE_OF_CONDUCT.md)
16
+ [![Downloads](https://pepy.tech/badge/tts)](https://pepy.tech/project/tts)
17
+ [![DOI](https://zenodo.org/badge/265612440.svg)](https://zenodo.org/badge/latestdoi/265612440)
18
+
19
+ ![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/aux_tests.yml/badge.svg)
20
+ ![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/data_tests.yml/badge.svg)
21
+ ![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/docker.yaml/badge.svg)
22
+ ![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/inference_tests.yml/badge.svg)
23
+ ![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/style_check.yml/badge.svg)
24
+ ![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/text_tests.yml/badge.svg)
25
+ ![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/tts_tests.yml/badge.svg)
26
+ ![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/vocoder_tests.yml/badge.svg)
27
+ ![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/zoo_tests0.yml/badge.svg)
28
+ ![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/zoo_tests1.yml/badge.svg)
29
+ ![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/zoo_tests2.yml/badge.svg)
30
+ [![Docs](<https://readthedocs.org/projects/tts/badge/?version=latest&style=plastic>)](https://tts.readthedocs.io/en/latest/)
31
+
32
+ 📰 [**Subscribe to 🐸Coqui.ai Newsletter**](https://coqui.ai/?subscription=true)
33
+
34
+ 📢 [English Voice Samples](https://erogol.github.io/ddc-samples/) and [SoundCloud playlist](https://soundcloud.com/user-565970875/pocket-article-wavernn-and-tacotron2)
35
+
36
+ 📄 [Text-to-Speech paper collection](https://github.com/erogol/TTS-papers)
37
+
38
+ <img src="https://static.scarf.sh/a.png?x-pxid=cf317fe7-2188-4721-bc01-124bb5d5dbb2" />
39
+
40
+ ## 💬 Where to ask questions
41
+ Please use our dedicated channels for questions and discussion. Help is much more valuable if it's shared publicly so that more people can benefit from it.
42
+
43
+ | Type | Platforms |
44
+ | ------------------------------- | --------------------------------------- |
45
+ | 🚨 **Bug Reports** | [GitHub Issue Tracker] |
46
+ | 🎁 **Feature Requests & Ideas** | [GitHub Issue Tracker] |
47
+ | 👩‍💻 **Usage Questions** | [GitHub Discussions] |
48
+ | 🗯 **General Discussion** | [GitHub Discussions] or [Discord] |
49
+
50
+ [github issue tracker]: https://github.com/coqui-ai/tts/issues
51
+ [github discussions]: https://github.com/coqui-ai/TTS/discussions
52
+ [discord]: https://discord.gg/5eXr5seRrv
53
+ [Tutorials and Examples]: https://github.com/coqui-ai/TTS/wiki/TTS-Notebooks-and-Tutorials
54
+
55
+
56
+ ## 🔗 Links and Resources
57
+ | Type | Links |
58
+ | ------------------------------- | --------------------------------------- |
59
+ | 💼 **Documentation** | [ReadTheDocs](https://tts.readthedocs.io/en/latest/) |
60
+ | 💾 **Installation** | [TTS/README.md](https://github.com/coqui-ai/TTS/tree/dev#install-tts)|
61
+ | 👩‍💻 **Contributing** | [CONTRIBUTING.md](https://github.com/coqui-ai/TTS/blob/main/CONTRIBUTING.md)|
62
+ | 📌 **Road Map** | [Main Development Plans](https://github.com/coqui-ai/TTS/issues/378) |
63
+ | 🚀 **Released Models** | [TTS Releases](https://github.com/coqui-ai/TTS/releases) and [Experimental Models](https://github.com/coqui-ai/TTS/wiki/Experimental-Released-Models)|
64
+
65
+ ## 🥇 TTS Performance
66
+ <p align="center"><img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/TTS-performance.png" width="800" /></p>
67
+
68
+ Underlined "TTS*" and "Judy*" are 🐸TTS models
69
+ <!-- [Details...](https://github.com/coqui-ai/TTS/wiki/Mean-Opinion-Score-Results) -->
70
+
71
+ ## Features
72
+ - High-performance Deep Learning models for Text2Speech tasks.
73
+ - Text2Spec models (Tacotron, Tacotron2, Glow-TTS, SpeedySpeech).
74
+ - Speaker Encoder to compute speaker embeddings efficiently.
75
+ - Vocoder models (MelGAN, Multiband-MelGAN, GAN-TTS, ParallelWaveGAN, WaveGrad, WaveRNN)
76
+ - Fast and efficient model training.
77
+ - Detailed training logs on the terminal and Tensorboard.
78
+ - Support for Multi-speaker TTS.
79
+ - Efficient, flexible, lightweight but feature complete `Trainer API`.
80
+ - Released and ready-to-use models.
81
+ - Tools to curate Text2Speech datasets under ```dataset_analysis```.
82
+ - Utilities to use and test your models.
83
+ - Modular (but not too much) code base enabling easy implementation of new ideas.
84
+
85
+ ## Implemented Models
86
+ ### Spectrogram models
87
+ - Tacotron: [paper](https://arxiv.org/abs/1703.10135)
88
+ - Tacotron2: [paper](https://arxiv.org/abs/1712.05884)
89
+ - Glow-TTS: [paper](https://arxiv.org/abs/2005.11129)
90
+ - Speedy-Speech: [paper](https://arxiv.org/abs/2008.03802)
91
+ - Align-TTS: [paper](https://arxiv.org/abs/2003.01950)
92
+ - FastPitch: [paper](https://arxiv.org/pdf/2006.06873.pdf)
93
+ - FastSpeech: [paper](https://arxiv.org/abs/1905.09263)
94
+ - FastSpeech2: [paper](https://arxiv.org/abs/2006.04558)
95
+ - SC-GlowTTS: [paper](https://arxiv.org/abs/2104.05557)
96
+ - Capacitron: [paper](https://arxiv.org/abs/1906.03402)
97
+ - OverFlow: [paper](https://arxiv.org/abs/2211.06892)
98
+ - Neural HMM TTS: [paper](https://arxiv.org/abs/2108.13320)
99
+
100
+ ### End-to-End Models
101
+ - VITS: [paper](https://arxiv.org/pdf/2106.06103)
102
+ - YourTTS: [paper](https://arxiv.org/abs/2112.02418)
103
+
104
+ ### Attention Methods
105
+ - Guided Attention: [paper](https://arxiv.org/abs/1710.08969)
106
+ - Forward Backward Decoding: [paper](https://arxiv.org/abs/1907.09006)
107
+ - Graves Attention: [paper](https://arxiv.org/abs/1910.10288)
108
+ - Double Decoder Consistency: [blog](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/)
109
+ - Dynamic Convolutional Attention: [paper](https://arxiv.org/pdf/1910.10288.pdf)
110
+ - Alignment Network: [paper](https://arxiv.org/abs/2108.10447)
111
+
112
+ ### Speaker Encoder
113
+ - GE2E: [paper](https://arxiv.org/abs/1710.10467)
114
+ - Angular Loss: [paper](https://arxiv.org/pdf/2003.11982.pdf)
115
+
116
+ ### Vocoders
117
+ - MelGAN: [paper](https://arxiv.org/abs/1910.06711)
118
+ - MultiBandMelGAN: [paper](https://arxiv.org/abs/2005.05106)
119
+ - ParallelWaveGAN: [paper](https://arxiv.org/abs/1910.11480)
120
+ - GAN-TTS discriminators: [paper](https://arxiv.org/abs/1909.11646)
121
+ - WaveRNN: [origin](https://github.com/fatchord/WaveRNN/)
122
+ - WaveGrad: [paper](https://arxiv.org/abs/2009.00713)
123
+ - HiFiGAN: [paper](https://arxiv.org/abs/2010.05646)
124
+ - UnivNet: [paper](https://arxiv.org/abs/2106.07889)
125
+
126
+ You can also help us implement more models.
127
+
128
+ ## Install TTS
129
+ 🐸TTS is tested on Ubuntu 18.04 with **python >= 3.7, < 3.11**.
130
+
131
+ If you are only interested in [synthesizing speech](https://tts.readthedocs.io/en/latest/inference.html) with the released 🐸TTS models, installing from PyPI is the easiest option.
132
+
133
+ ```bash
134
+ pip install TTS
135
+ ```
136
+
137
+ If you plan to code or train models, clone 🐸TTS and install it locally.
138
+
139
+ ```bash
140
+ git clone https://github.com/coqui-ai/TTS
141
+ pip install -e .[all,dev,notebooks] # Select the relevant extras
142
+ ```
143
+
144
+ If you are on Ubuntu (Debian), you can also run the following commands for installation.
145
+
146
+ ```bash
147
+ $ make system-deps # intended to be used on Ubuntu (Debian). Let us know if you have a different OS.
148
+ $ make install
149
+ ```
150
+
151
+ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](https://stackoverflow.com/questions/66726331/how-can-i-run-mozilla-tts-coqui-tts-training-with-cuda-on-a-windows-system).
152
+
153
+
154
+ ## Docker Image
155
+ You can also try TTS without installing it by using the Docker image.
156
+ Simply run the following command and you will be able to run TTS without installing it.
157
+
158
+ ```bash
159
+ docker run --rm -it -p 5002:5002 --entrypoint /bin/bash ghcr.io/coqui-ai/tts-cpu
160
+ python3 TTS/server/server.py --list_models #To get the list of available models
161
+ python3 TTS/server/server.py --model_name tts_models/en/vctk/vits # To start a server
162
+ ```
163
+
164
+ You can then enjoy the TTS server [here](http://[::1]:5002/)
165
+ More details about the docker images (like GPU support) can be found [here](https://tts.readthedocs.io/en/latest/docker_images.html)
166
+
167
+
168
+ ## Synthesizing speech by 🐸TTS
169
+
170
+ ### 🐍 Python API
171
+
172
+ ```python
173
+ from TTS.api import TTS
174
+
175
+ # Running a multi-speaker and multi-lingual model
176
+
177
+ # List available 🐸TTS models and choose the first one
178
+ model_name = TTS.list_models()[0]
179
+ # Init TTS
180
+ tts = TTS(model_name)
181
+ # Run TTS
182
+ # ❗ Since this model is multi-speaker and multi-lingual, we must set the target speaker and the language
183
+ # Text to speech with a numpy output
184
+ wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0])
185
+ # Text to speech to a file
186
+ tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav")
187
+
188
+ # Running a single speaker model
189
+
190
+ # Init TTS with the target model name
191
+ tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False)
192
+ # Run TTS
193
+ tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path=OUTPUT_PATH)
194
+
195
+ # Example voice cloning with YourTTS in English, French and Portuguese:
196
+ tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True)
197
+ tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav")
198
+ tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr-fr", file_path="output.wav")
199
+ tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt-br", file_path="output.wav")
200
+
201
+
202
+ # Example voice conversion, converting the speaker of the `source_wav` to the speaker of the `target_wav`
203
+
204
+ tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False, gpu=True)
205
+ tts.voice_conversion_to_file(source_wav="my/source.wav", target_wav="my/target.wav", file_path="output.wav")
206
+
207
+ # Example voice cloning with a single-speaker TTS model combined with the voice conversion model. This way, you can
208
+ # clone voices by using any model in 🐸TTS.
209
+
210
+ tts = TTS("tts_models/de/thorsten/tacotron2-DDC")
211
+ tts.tts_with_vc_to_file(
212
+ "Wie sage ich auf Italienisch, dass ich dich liebe?",
213
+ speaker_wav="target/speaker.wav",
214
+ file_path="output.wav"
215
+ )
216
+
217
+ # Example text to speech using [🐸Coqui Studio](https://coqui.ai) models. You can use all of your available speakers in the studio.
218
+ # [🐸Coqui Studio](https://coqui.ai) API token is required. You can get it from the [account page](https://coqui.ai/account).
219
+ # You should set the `COQUI_STUDIO_TOKEN` environment variable to use the API token.
220
+
221
+ # If you have a valid API token set you will see the studio speakers as separate models in the list.
222
+ # The name format is coqui_studio/en/<studio_speaker_name>/coqui_studio
223
+ models = TTS().list_models()
224
+ # Init TTS with the target studio speaker
225
+ tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False, gpu=False)
226
+ # Run TTS
227
+ tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH)
228
+ # Run TTS with emotion and speed control
229
+ tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5)
230
+ ```
231
+
232
+ ### Command line `tts`
233
+ #### Single Speaker Models
234
+
235
+ - List provided models:
236
+
237
+ ```
238
+ $ tts --list_models
239
+ ```
240
+ - Get model info (for both tts_models and vocoder_models):
241
+ - Query by type/name:
242
+ The model_info_by_name option uses the model name exactly as it appears in the output of --list_models.
243
+ ```
244
+ $ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
245
+ ```
246
+ For example:
247
+
248
+ ```
249
+ $ tts --model_info_by_name tts_models/tr/common-voice/glow-tts
250
+ ```
251
+ ```
252
+ $ tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2
253
+ ```
254
+ - Query by type/idx:
255
+ The model_query_idx uses the corresponding idx from --list_models.
256
+ ```
257
+ $ tts --model_info_by_idx "<model_type>/<model_query_idx>"
258
+ ```
259
+ For example:
260
+
261
+ ```
262
+ $ tts --model_info_by_idx tts_models/3
263
+ ```
264
+
265
+ - Run TTS with default models:
266
+
267
+ ```
268
+ $ tts --text "Text for TTS" --out_path output/path/speech.wav
269
+ ```
270
+
271
+ - Run a TTS model with its default vocoder model:
272
+
273
+ ```
274
+ $ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
275
+ ```
276
+ For example:
277
+
278
+ ```
279
+ $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --out_path output/path/speech.wav
280
+ ```
281
+
282
+ - Run with specific TTS and vocoder models from the list:
283
+
284
+ ```
285
+ $ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --vocoder_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
286
+ ```
287
+
288
+ For example:
289
+
290
+ ```
291
+ $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --vocoder_name "vocoder_models/en/ljspeech/univnet" --out_path output/path/speech.wav
292
+ ```
293
+
294
+
295
+ - Run your own TTS model (Using Griffin-Lim Vocoder):
296
+
297
+ ```
298
+ $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
299
+ ```
300
+
301
+ - Run your own TTS and Vocoder models:
302
+ ```
303
+ $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
304
+ --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
305
+ ```
306
+
307
+ #### Multi-speaker Models
308
+
309
+ - List the available speakers and choose a <speaker_id> among them:
310
+
311
+ ```
312
+ $ tts --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
313
+ ```
314
+
315
+ - Run the multi-speaker TTS model with the target speaker ID:
316
+
317
+ ```
318
+ $ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
319
+ ```
320
+
321
+ - Run your own multi-speaker TTS model:
322
+
323
+ ```
324
+ $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
325
+ ```
326
+
327
+ ## Directory Structure
328
+ ```
329
+ |- notebooks/ (Jupyter Notebooks for model evaluation, parameter selection and data analysis.)
330
+ |- utils/ (common utilities.)
331
+ |- TTS
332
+ |- bin/ (folder for all the executables.)
333
+ |- train*.py (train your target model.)
334
+ |- ...
335
+ |- tts/ (text to speech models)
336
+ |- layers/ (model layer definitions)
337
+ |- models/ (model definitions)
338
+ |- utils/ (model specific utilities.)
339
+ |- speaker_encoder/ (Speaker Encoder models.)
340
+ |- (same)
341
+ |- vocoder/ (Vocoder models.)
342
+ |- (same)
343
+ ```
TTS/.models.json ADDED
@@ -0,0 +1,819 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tts_models": {
3
+ "multilingual":{
4
+ "multi-dataset":{
5
+ "your_tts":{
6
+ "description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418",
7
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--multilingual--multi-dataset--your_tts.zip",
8
+ "default_vocoder": null,
9
+ "commit": "e9a1953e",
10
+ "license": "CC BY-NC-ND 4.0",
11
+ "contact": "[email protected]"
12
+ }
13
+ }
14
+ },
15
+ "bg": {
16
+ "cv": {
17
+ "vits":{
18
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--bg--cv--vits.zip",
19
+ "default_vocoder": null,
20
+ "commit": null,
21
+ "author": "@NeonGeckoCom",
22
+ "license": "bsd-3-clause"
23
+ }
24
+ }
25
+ },
26
+ "cs": {
27
+ "cv": {
28
+ "vits":{
29
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--cs--cv--vits.zip",
30
+ "default_vocoder": null,
31
+ "commit": null,
32
+ "author": "@NeonGeckoCom",
33
+ "license": "bsd-3-clause"
34
+ }
35
+ }
36
+ },
37
+ "da": {
38
+ "cv": {
39
+ "vits":{
40
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--da--cv--vits.zip",
41
+ "default_vocoder": null,
42
+ "commit": null,
43
+ "author": "@NeonGeckoCom",
44
+ "license": "bsd-3-clause"
45
+ }
46
+ }
47
+ },
48
+ "et": {
49
+ "cv": {
50
+ "vits":{
51
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--et--cv--vits.zip",
52
+ "default_vocoder": null,
53
+ "commit": null,
54
+ "author": "@NeonGeckoCom",
55
+ "license": "bsd-3-clause"
56
+ }
57
+ }
58
+ },
59
+ "ga": {
60
+ "cv": {
61
+ "vits":{
62
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ga--cv--vits.zip",
63
+ "default_vocoder": null,
64
+ "commit": null,
65
+ "author": "@NeonGeckoCom",
66
+ "license": "bsd-3-clause"
67
+ }
68
+ }
69
+ },
70
+ "en": {
71
+ "ek1": {
72
+ "tacotron2": {
73
+ "description": "EK1 en-rp tacotron2 by NMStoker",
74
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ek1--tacotron2.zip",
75
+ "default_vocoder": "vocoder_models/en/ek1/wavegrad",
76
+ "commit": "c802255",
77
+ "license": "apache 2.0"
78
+ }
79
+ },
80
+ "ljspeech": {
81
+ "tacotron2-DDC": {
82
+ "description": "Tacotron2 with Double Decoder Consistency.",
83
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC.zip",
84
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
85
+ "commit": "bae2ad0f",
86
+ "author": "Eren Gölge @erogol",
87
+ "license": "apache 2.0",
88
+ "contact": "[email protected]"
89
+ },
90
+ "tacotron2-DDC_ph": {
91
+ "description": "Tacotron2 with Double Decoder Consistency with phonemes.",
92
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC_ph.zip",
93
+ "default_vocoder": "vocoder_models/en/ljspeech/univnet",
94
+ "commit": "3900448",
95
+ "author": "Eren Gölge @erogol",
96
+ "license": "apache 2.0",
97
+ "contact": "[email protected]"
98
+ },
99
+ "glow-tts": {
100
+ "description": "",
101
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--glow-tts.zip",
102
+ "stats_file": null,
103
+ "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
104
+ "commit": "",
105
+ "author": "Eren Gölge @erogol",
106
+ "license": "MPL",
107
+ "contact": "[email protected]"
108
+ },
109
+ "speedy-speech": {
110
+ "description": "Speedy Speech model trained on LJSpeech dataset using the Alignment Network for learning the durations.",
111
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--speedy-speech.zip",
112
+ "stats_file": null,
113
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
114
+ "commit": "4581e3d",
115
+ "author": "Eren Gölge @erogol",
116
+ "license": "apache 2.0",
117
+ "contact": "[email protected]"
118
+ },
119
+ "tacotron2-DCA": {
120
+ "description": "",
121
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DCA.zip",
122
+ "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
123
+ "commit": "",
124
+ "author": "Eren Gölge @erogol",
125
+ "license": "MPL",
126
+ "contact": "[email protected]"
127
+ },
128
+ "vits": {
129
+ "description": "VITS is an End2End TTS model trained on LJSpeech dataset with phonemes.",
130
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--vits.zip",
131
+ "default_vocoder": null,
132
+ "commit": "3900448",
133
+ "author": "Eren Gölge @erogol",
134
+ "license": "apache 2.0",
135
+ "contact": "[email protected]"
136
+ },
137
+ "vits--neon": {
138
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--en--ljspeech--vits.zip",
139
+ "default_vocoder": null,
140
+ "author": "@NeonGeckoCom",
141
+ "license": "bsd-3-clause",
142
+ "contact": null,
143
+ "commit": null
144
+ },
145
+ "fast_pitch": {
146
+ "description": "FastPitch model trained on LJSpeech using the Aligner Network",
147
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--fast_pitch.zip",
148
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
149
+ "commit": "b27b3ba",
150
+ "author": "Eren Gölge @erogol",
151
+ "license": "apache 2.0",
152
+ "contact": "[email protected]"
153
+ },
154
+ "overflow": {
155
+ "description": "Overflow model trained on LJSpeech",
156
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.0_models/tts_models--en--ljspeech--overflow.zip",
157
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
158
+ "commit": "3b1a28f",
159
+ "author": "Eren Gölge @erogol",
160
+ "license": "apache 2.0",
161
+ "contact": "[email protected]"
162
+ },
163
+ "neural_hmm": {
164
+ "description": "Neural HMM model trained on LJSpeech",
165
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.11.0_models/tts_models--en--ljspeech--neural_hmm.zip",
166
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
167
+ "commit": "3b1a28f",
168
+ "author": "Shivam Mehta @shivammehta25",
169
+ "license": "apache 2.0",
170
+ "contact": "d83ee8fe45e3c0d776d4a865aca21d7c2ac324c4"
171
+ }
172
+ },
173
+ "vctk": {
174
+ "vits": {
175
+ "description": "VITS End2End TTS model trained on VCTK dataset with 109 different speakers with EN accent.",
176
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--vits.zip",
177
+ "default_vocoder": null,
178
+ "commit": "3900448",
179
+ "author": "Eren @erogol",
180
+ "license": "apache 2.0",
181
+ "contact": "[email protected]"
182
+ },
183
+ "fast_pitch":{
184
+ "description": "FastPitch model trained on VCTK dataset.",
185
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--fast_pitch.zip",
186
+ "default_vocoder": null,
187
+ "commit": "bdab788d",
188
+ "author": "Eren @erogol",
189
+ "license": "CC BY-NC-ND 4.0",
190
+ "contact": "[email protected]"
191
+ }
192
+ },
193
+ "sam": {
194
+ "tacotron-DDC": {
195
+ "description": "Tacotron2 with Double Decoder Consistency trained with Accenture's Sam dataset.",
196
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--sam--tacotron-DDC.zip",
197
+ "default_vocoder": "vocoder_models/en/sam/hifigan_v2",
198
+ "commit": "bae2ad0f",
199
+ "author": "Eren Gölge @erogol",
200
+ "license": "apache 2.0",
201
+ "contact": "[email protected]"
202
+ }
203
+ },
204
+ "blizzard2013": {
205
+ "capacitron-t2-c50": {
206
+ "description": "Capacitron additions to Tacotron 2 with Capacity at 50 as in https://arxiv.org/pdf/1906.03402.pdf",
207
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--en--blizzard2013--capacitron-t2-c50.zip",
208
+ "commit": "d6284e7",
209
+ "default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
210
+ "author": "Adam Froghyar @a-froghyar",
211
+ "license": "apache 2.0",
212
+ "contact": "[email protected]"
213
+ },
214
+ "capacitron-t2-c150_v2": {
215
+ "description": "Capacitron additions to Tacotron 2 with Capacity at 150 as in https://arxiv.org/pdf/1906.03402.pdf",
216
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.1_models/tts_models--en--blizzard2013--capacitron-t2-c150_v2.zip",
217
+ "commit": "a67039d",
218
+ "default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
219
+ "author": "Adam Froghyar @a-froghyar",
220
+ "license": "apache 2.0",
221
+ "contact": "[email protected]"
222
+ }
223
+ }
224
+ },
225
+ "es": {
226
+ "mai": {
227
+ "tacotron2-DDC": {
228
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--es--mai--tacotron2-DDC.zip",
229
+ "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
230
+ "commit": "",
231
+ "author": "Eren Gölge @erogol",
232
+ "license": "MPL",
233
+ "contact": "[email protected]"
234
+ }
235
+ },
236
+ "css10":{
237
+ "vits":{
238
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--es--css10--vits.zip",
239
+ "default_vocoder": null,
240
+ "commit": null,
241
+ "author": "@NeonGeckoCom",
242
+ "license": "bsd-3-clause"
243
+ }
244
+ }
245
+ },
246
+ "fr": {
247
+ "mai": {
248
+ "tacotron2-DDC": {
249
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--fr--mai--tacotron2-DDC.zip",
250
+ "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
251
+ "commit": null,
252
+ "author": "Eren Gölge @erogol",
253
+ "license": "MPL",
254
+ "contact": "[email protected]"
255
+ }
256
+ },
257
+ "css10":{
258
+ "vits":{
259
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fr--css10--vits.zip",
260
+ "default_vocoder": null,
261
+ "commit": null,
262
+ "author": "@NeonGeckoCom",
263
+ "license": "bsd-3-clause"
264
+ }
265
+ }
266
+ },
267
+ "uk":{
268
+ "mai": {
269
+ "glow-tts": {
270
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--uk--mai--glow-tts.zip",
271
+ "author":"@robinhad",
272
+ "commit": "bdab788d",
273
+ "license": "MIT",
274
+ "contact": "",
275
+ "default_vocoder": "vocoder_models/uk/mai/multiband-melgan"
276
+ },
277
+ "vits":{
278
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--uk--mai--vits.zip",
279
+ "default_vocoder": null,
280
+ "commit": null,
281
+ "author": "@NeonGeckoCom",
282
+ "license": "bsd-3-clause"
283
+ }
284
+ }
285
+ },
286
+ "zh-CN": {
287
+ "baker": {
288
+ "tacotron2-DDC-GST": {
289
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip",
290
+ "commit": "unknown",
291
+ "author": "@kirianguiller",
292
+ "license": "apache 2.0",
293
+ "default_vocoder": null
294
+ }
295
+ }
296
+ },
297
+ "nl": {
298
+ "mai": {
299
+ "tacotron2-DDC": {
300
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--nl--mai--tacotron2-DDC.zip",
301
+ "author": "@r-dh",
302
+ "license": "apache 2.0",
303
+ "default_vocoder": "vocoder_models/nl/mai/parallel-wavegan",
304
+ "stats_file": null,
305
+ "commit": "540d811"
306
+ }
307
+ },
308
+ "css10":{
309
+ "vits":{
310
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--nl--css10--vits.zip",
311
+ "default_vocoder": null,
312
+ "commit": null,
313
+ "author": "@NeonGeckoCom",
314
+ "license": "bsd-3-clause"
315
+ }
316
+ }
317
+ },
318
+ "de": {
319
+ "thorsten": {
320
+ "tacotron2-DCA": {
321
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--de--thorsten--tacotron2-DCA.zip",
322
+ "default_vocoder": "vocoder_models/de/thorsten/fullband-melgan",
323
+ "author": "@thorstenMueller",
324
+ "license": "apache 2.0",
325
+ "commit": "unknown"
326
+ },
327
+ "vits": {
328
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--de--thorsten--vits.zip",
329
+ "default_vocoder": null,
330
+ "author": "@thorstenMueller",
331
+ "license": "apache 2.0",
332
+ "commit": "unknown"
333
+ },
334
+ "tacotron2-DDC": {
335
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--de--thorsten--tacotron2-DDC.zip",
336
+ "default_vocoder": "vocoder_models/de/thorsten/hifigan_v1",
337
+ "description": "Thorsten-Dec2021-22k-DDC",
338
+ "author": "@thorstenMueller",
339
+ "license": "apache 2.0",
340
+ "commit": "unknown"
341
+ }
342
+ },
343
+ "css10": {
344
+ "vits-neon":{
345
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--de--css10--vits.zip",
346
+ "default_vocoder": null,
347
+ "author": "@NeonGeckoCom",
348
+ "license": "bsd-3-clause",
349
+ "commit": null
350
+ }
351
+ }
352
+ },
353
+ "ja": {
354
+ "kokoro": {
355
+ "tacotron2-DDC": {
356
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--ja--kokoro--tacotron2-DDC.zip",
357
+ "default_vocoder": "vocoder_models/ja/kokoro/hifigan_v1",
358
+ "description": "Tacotron2 with Double Decoder Consistency trained with Kokoro Speech Dataset.",
359
+ "author": "@kaiidams",
360
+ "license": "apache 2.0",
361
+ "commit": "401fbd89"
362
+ }
363
+ }
364
+ },
365
+ "tr":{
366
+ "common-voice": {
367
+ "glow-tts":{
368
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--tr--common-voice--glow-tts.zip",
369
+ "default_vocoder": "vocoder_models/tr/common-voice/hifigan",
370
+ "license": "MIT",
371
+ "description": "Turkish GlowTTS model using an unknown speaker from the Common-Voice dataset.",
372
+ "author": "Fatih Akademi",
373
+ "commit": null
374
+ }
375
+ }
376
+ },
377
+ "it": {
378
+ "mai_female": {
379
+ "glow-tts":{
380
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--glow-tts.zip",
381
+ "default_vocoder": null,
382
+ "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
383
+ "author": "@nicolalandro",
384
+ "license": "apache 2.0",
385
+ "commit": null
386
+ },
387
+ "vits":{
388
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--vits.zip",
389
+ "default_vocoder": null,
390
+ "description": "VITS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
391
+ "author": "@nicolalandro",
392
+ "license": "apache 2.0",
393
+ "commit": null
394
+ }
395
+ },
396
+ "mai_male": {
397
+ "glow-tts":{
398
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--glow-tts.zip",
399
+ "default_vocoder": null,
400
+ "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
401
+ "author": "@nicolalandro",
402
+ "license": "apache 2.0",
403
+ "commit": null
404
+ },
405
+ "vits":{
406
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--vits.zip",
407
+ "default_vocoder": null,
408
+ "description": "VITS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
409
+ "author": "@nicolalandro",
410
+ "license": "apache 2.0",
411
+ "commit": null
412
+ }
413
+ }
414
+ },
415
+ "ewe": {
416
+ "openbible": {
417
+ "vits":{
418
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--ewe--openbible--vits.zip",
419
+ "default_vocoder": null,
420
+ "license": "CC-BY-SA 4.0",
421
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
422
+ "author": "@coqui_ai",
423
+ "commit": "1b22f03"
424
+ }
425
+ }
426
+ },
427
+ "hau": {
428
+ "openbible": {
429
+ "vits":{
430
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--hau--openbible--vits.zip",
431
+ "default_vocoder": null,
432
+ "license": "CC-BY-SA 4.0",
433
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
434
+ "author": "@coqui_ai",
435
+ "commit": "1b22f03"
436
+ }
437
+ }
438
+ },
439
+ "lin": {
440
+ "openbible": {
441
+ "vits":{
442
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--lin--openbible--vits.zip",
443
+ "default_vocoder": null,
444
+ "license": "CC-BY-SA 4.0",
445
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
446
+ "author": "@coqui_ai",
447
+ "commit": "1b22f03"
448
+ }
449
+ }
450
+ },
451
+ "tw_akuapem": {
452
+ "openbible": {
453
+ "vits":{
454
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_akuapem--openbible--vits.zip",
455
+ "default_vocoder": null,
456
+ "license": "CC-BY-SA 4.0",
457
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
458
+ "author": "@coqui_ai",
459
+ "commit": "1b22f03"
460
+ }
461
+ }
462
+ },
463
+ "tw_asante": {
464
+ "openbible": {
465
+ "vits":{
466
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_asante--openbible--vits.zip",
467
+ "default_vocoder": null,
468
+ "license": "CC-BY-SA 4.0",
469
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
470
+ "author": "@coqui_ai",
471
+ "commit": "1b22f03"
472
+ }
473
+ }
474
+ },
475
+ "yor": {
476
+ "openbible": {
477
+ "vits":{
478
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--yor--openbible--vits.zip",
479
+ "default_vocoder": null,
480
+ "license": "CC-BY-SA 4.0",
481
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
482
+ "author": "@coqui_ai",
483
+ "commit": "1b22f03"
484
+ }
485
+ }
486
+ },
487
+ "hu": {
488
+ "css10": {
489
+ "vits": {
490
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hu--css10--vits.zip",
491
+ "default_vocoder": null,
492
+ "commit": null,
493
+ "author": "@NeonGeckoCom",
494
+ "license": "bsd-3-clause"
495
+ }
496
+ }
497
+ },
498
+ "el": {
499
+ "cv": {
500
+ "vits": {
501
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--el--cv--vits.zip",
502
+ "default_vocoder": null,
503
+ "commit": null,
504
+ "author": "@NeonGeckoCom",
505
+ "license": "bsd-3-clause"
506
+ }
507
+ }
508
+ },
509
+ "fi": {
510
+ "css10": {
511
+ "vits":{
512
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fi--css10--vits.zip",
513
+ "default_vocoder": null,
514
+ "commit": null,
515
+ "author": "@NeonGeckoCom",
516
+ "license": "bsd-3-clause"
517
+ }
518
+ }
519
+ },
520
+ "hr": {
521
+ "cv": {
522
+ "vits":{
523
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hr--cv--vits.zip",
524
+ "default_vocoder": null,
525
+ "commit": null,
526
+ "author": "@NeonGeckoCom",
527
+ "license": "bsd-3-clause"
528
+ }
529
+ }
530
+ },
531
+ "lt": {
532
+ "cv": {
533
+ "vits":{
534
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lt--cv--vits.zip",
535
+ "default_vocoder": null,
536
+ "commit": null,
537
+ "author": "@NeonGeckoCom",
538
+ "license": "bsd-3-clause"
539
+ }
540
+ }
541
+ },
542
+ "lv": {
543
+ "cv": {
544
+ "vits":{
545
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lv--cv--vits.zip",
546
+ "default_vocoder": null,
547
+ "commit": null,
548
+ "author": "@NeonGeckoCom",
549
+ "license": "bsd-3-clause"
550
+ }
551
+ }
552
+ },
553
+ "mt": {
554
+ "cv": {
555
+ "vits":{
556
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--mt--cv--vits.zip",
557
+ "default_vocoder": null,
558
+ "commit": null,
559
+ "author": "@NeonGeckoCom",
560
+ "license": "bsd-3-clause"
561
+ }
562
+ }
563
+ },
564
+ "pl": {
565
+ "mai_female": {
566
+ "vits":{
567
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pl--mai_female--vits.zip",
568
+ "default_vocoder": null,
569
+ "commit": null,
570
+ "author": "@NeonGeckoCom",
571
+ "license": "bsd-3-clause"
572
+ }
573
+ }
574
+ },
575
+ "pt": {
576
+ "cv": {
577
+ "vits":{
578
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pt--cv--vits.zip",
579
+ "default_vocoder": null,
580
+ "commit": null,
581
+ "author": "@NeonGeckoCom",
582
+ "license": "bsd-3-clause"
583
+ }
584
+ }
585
+ },
586
+ "ro": {
587
+ "cv": {
588
+ "vits":{
589
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ro--cv--vits.zip",
590
+ "default_vocoder": null,
591
+ "commit": null,
592
+ "author": "@NeonGeckoCom",
593
+ "license": "bsd-3-clause"
594
+ }
595
+ }
596
+ },
597
+ "sk": {
598
+ "cv": {
599
+ "vits":{
600
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sk--cv--vits.zip",
601
+ "default_vocoder": null,
602
+ "commit": null,
603
+ "author": "@NeonGeckoCom",
604
+ "license": "bsd-3-clause"
605
+ }
606
+ }
607
+ },
608
+ "sl": {
609
+ "cv": {
610
+ "vits":{
611
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sl--cv--vits.zip",
612
+ "default_vocoder": null,
613
+ "commit": null,
614
+ "author": "@NeonGeckoCom",
615
+ "license": "bsd-3-clause"
616
+ }
617
+ }
618
+ },
619
+ "sv": {
620
+ "cv": {
621
+ "vits":{
622
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sv--cv--vits.zip",
623
+ "default_vocoder": null,
624
+ "commit": null,
625
+ "author": "@NeonGeckoCom",
626
+ "license": "bsd-3-clause"
627
+ }
628
+ }
629
+ },
630
+ "ca": {
631
+ "custom": {
632
+ "vits":{
633
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--ca--custom--vits.zip",
634
+ "default_vocoder": null,
635
+ "commit": null,
636
+ "description": " It is trained from zero with 101460 utterances consisting of 257 speakers, approx 138 hours of speech. We used three datasets;\nFestcat and Google Catalan TTS (both TTS datasets) and also a part of Common Voice 8. It is trained with TTS v0.8.0.\nhttps://github.com/coqui-ai/TTS/discussions/930#discussioncomment-4466345",
637
+ "author": "@gullabi",
638
+ "license": "CC-BY-4.0"
639
+ }
640
+ }
641
+ },
642
+ "fa":{
643
+ "custom":{
644
+ "glow-tts": {
645
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--fa--custom--glow-tts.zip",
646
+ "default_vocoder": null,
647
+ "commit": null,
648
+ "description": "persian-tts-female-glow_tts model for text to speech purposes. Single-speaker female voice Trained on persian-tts-dataset-famale. \nThis model has no compatible vocoder thus the output quality is not very good. \nDataset: https://www.kaggle.com/datasets/magnoliasis/persian-tts-dataset-famale.",
649
+ "author": "@karim23657",
650
+ "license": "CC-BY-4.0"
651
+ }
652
+ }
653
+ }
654
+ },
655
+ "vocoder_models": {
656
+ "universal": {
657
+ "libri-tts": {
658
+ "wavegrad": {
659
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--wavegrad.zip",
660
+ "commit": "ea976b0",
661
+ "author": "Eren Gölge @erogol",
662
+ "license": "MPL",
663
+ "contact": "[email protected]"
664
+ },
665
+ "fullband-melgan": {
666
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--fullband-melgan.zip",
667
+ "commit": "4132240",
668
+ "author": "Eren Gölge @erogol",
669
+ "license": "MPL",
670
+ "contact": "[email protected]"
671
+ }
672
+ }
673
+ },
674
+ "en": {
675
+ "ek1": {
676
+ "wavegrad": {
677
+ "description": "EK1 en-rp wavegrad by NMStoker",
678
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ek1--wavegrad.zip",
679
+ "commit": "c802255",
680
+ "license": "apache 2.0"
681
+ }
682
+ },
683
+ "ljspeech": {
684
+ "multiband-melgan": {
685
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--multiband-melgan.zip",
686
+ "commit": "ea976b0",
687
+ "author": "Eren Gölge @erogol",
688
+ "license": "MPL",
689
+ "contact": "[email protected]"
690
+ },
691
+ "hifigan_v2": {
692
+ "description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
693
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--hifigan_v2.zip",
694
+ "commit": "bae2ad0f",
695
+ "author": "@erogol",
696
+ "license": "apache 2.0",
697
+ "contact": "[email protected]"
698
+ },
699
+ "univnet": {
700
+ "description": "UnivNet model finetuned on TacotronDDC_ph spectrograms for better compatibility.",
701
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--univnet_v2.zip",
702
+ "commit": "4581e3d",
703
+ "author": "Eren @erogol",
704
+ "license": "apache 2.0",
705
+ "contact": "[email protected]"
706
+ }
707
+ },
708
+ "blizzard2013": {
709
+ "hifigan_v2": {
710
+ "description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
711
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/vocoder_models--en--blizzard2013--hifigan_v2.zip",
712
+ "commit": "d6284e7",
713
+ "author": "Adam Froghyar @a-froghyar",
714
+ "license": "apache 2.0",
715
+ "contact": "[email protected]"
716
+ }
717
+ },
718
+ "vctk": {
719
+ "hifigan_v2": {
720
+ "description": "Finetuned and intended to be used with tts_models/en/vctk/sc-glow-tts",
721
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--vctk--hifigan_v2.zip",
722
+ "commit": "2f07160",
723
+ "author": "Edresson Casanova",
724
+ "license": "apache 2.0",
725
+ "contact": ""
726
+ }
727
+ },
728
+ "sam": {
729
+ "hifigan_v2": {
730
+ "description": "Finetuned and intended to be used with tts_models/en/sam/tacotron_DDC",
731
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--sam--hifigan_v2.zip",
732
+ "commit": "2f07160",
733
+ "author": "Eren Gölge @erogol",
734
+ "license": "apache 2.0",
735
+ "contact": "[email protected]"
736
+ }
737
+ }
738
+ },
739
+ "nl": {
740
+ "mai": {
741
+ "parallel-wavegan": {
742
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--nl--mai--parallel-wavegan.zip",
743
+ "author": "@r-dh",
744
+ "license": "apache 2.0",
745
+ "commit": "unknown"
746
+ }
747
+ }
748
+ },
749
+ "de": {
750
+ "thorsten": {
751
+ "wavegrad": {
752
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--wavegrad.zip",
753
+ "author": "@thorstenMueller",
754
+ "license": "apache 2.0",
755
+ "commit": "unknown"
756
+ },
757
+ "fullband-melgan": {
758
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--fullband-melgan.zip",
759
+ "author": "@thorstenMueller",
760
+ "license": "apache 2.0",
761
+ "commit": "unknown"
762
+ },
763
+ "hifigan_v1": {
764
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/vocoder_models--de--thorsten--hifigan_v1.zip",
765
+ "description": "HifiGAN vocoder model for Thorsten Neutral Dec2021 22k Samplerate Tacotron2 DDC model",
766
+ "author": "@thorstenMueller",
767
+ "license": "apache 2.0",
768
+ "commit": "unknown"
769
+ }
770
+ }
771
+ },
772
+ "ja": {
773
+ "kokoro": {
774
+ "hifigan_v1": {
775
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--ja--kokoro--hifigan_v1.zip",
776
+ "description": "HifiGAN model trained for kokoro dataset by @kaiidams",
777
+ "author": "@kaiidams",
778
+ "license": "apache 2.0",
779
+ "commit": "3900448"
780
+ }
781
+ }
782
+ },
783
+ "uk": {
784
+ "mai": {
785
+ "multiband-melgan": {
786
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--uk--mai--multiband-melgan.zip",
787
+ "author":"@robinhad",
788
+ "commit": "bdab788d",
789
+ "license": "MIT",
790
+ "contact": ""
791
+ }
792
+ }
793
+ },
794
+ "tr":{
795
+ "common-voice": {
796
+ "hifigan":{
797
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--tr--common-voice--hifigan.zip",
798
+ "description": "HifiGAN model using an unknown speaker from the Common-Voice dataset.",
799
+ "author": "Fatih Akademi",
800
+ "license": "MIT",
801
+ "commit": null
802
+ }
803
+ }
804
+ }
805
+ },
806
+ "voice_conversion_models":{
807
+ "multilingual":{
808
+ "vctk":{
809
+ "freevc24":{
810
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.0_models/voice_conversion_models--multilingual--vctk--freevc24.zip",
811
+ "description": "FreeVC model trained on VCTK dataset from https://github.com/OlaWod/FreeVC",
812
+ "author": "Jing-Yi Li @OlaWod",
813
+ "license": "MIT",
814
+ "commit": null
815
+ }
816
+ }
817
+ }
818
+ }
819
+ }
TTS/VERSION ADDED
@@ -0,0 +1 @@
 
 
1
+ 0.13.0
TTS/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ import os
2
+
3
# The package version lives in the plain-text VERSION file that sits next to this module;
# it is exposed both as `version` and as the conventional `__version__`.
with open(os.path.join(os.path.dirname(__file__), "VERSION"), "r", encoding="utf-8") as f:
    __version__ = version = f.read().strip()
TTS/api.py ADDED
@@ -0,0 +1,628 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import http.client
2
+ import json
3
+ import os
4
+ import tempfile
5
+ import urllib.request
6
+ from pathlib import Path
7
+ from typing import Tuple
8
+
9
+ import numpy as np
10
+ from scipy.io import wavfile
11
+
12
+ from TTS.utils.audio.numpy_transforms import save_wav
13
+ from TTS.utils.manage import ModelManager
14
+ from TTS.utils.synthesizer import Synthesizer
15
+
16
+
17
class Speaker(object):
    """Attribute-style view over a dictionary.

    Every key of ``d`` becomes an attribute of the instance. Nested dictionaries are wrapped in
    ``Speaker`` recursively, and dictionaries found inside list/tuple values are wrapped as well
    (the sequence itself is stored as a list).

    Args:
        d (dict): Mapping whose items are copied onto the instance.
        is_voice (bool): Stored on the instance as ``is_voice``. Nested ``Speaker`` objects are
            always created with the default ``False``.
    """

    def __init__(self, d, is_voice=False):
        self.is_voice = is_voice
        for k, v in d.items():
            # The container check has to inspect the *value*: the key is the attribute name.
            # (Testing the key meant list values were stored untouched, with raw dicts inside.)
            if isinstance(v, (list, tuple)):
                setattr(self, k, [Speaker(x) if isinstance(x, dict) else x for x in v])
            else:
                setattr(self, k, Speaker(v) if isinstance(v, dict) else v)

    def __repr__(self):
        return str(self.__dict__)
30
+
31
+
32
class CS_API:
    """🐸Coqui Studio API Wrapper.

    🐸Coqui Studio is the most advanced voice generation platform. You can generate new voices by voice cloning, voice
    interpolation, or our unique prompt to voice technology. It also provides a set of built-in voices with different
    characteristics. You can use these voices to generate new audio files or use them in your applications.
    You can use all the built-in and your own 🐸Coqui Studio speakers with this API with an API token.
    You can signup to 🐸Coqui Studio from https://app.coqui.ai/auth/signup and get an API token from
    https://app.coqui.ai/account. We can either enter the token as an environment variable as
    `export COQUI_STUDIO_TOKEN=<token>` or pass it as `CS_API(api_token=<token>)`.
    Visit https://app.coqui.ai/api for more information.

    Example listing all available speakers:
        >>> from TTS.api import CS_API
        >>> tts = CS_API()
        >>> tts.speakers

    Example listing all emotions:
        >>> from TTS.api import CS_API
        >>> tts = CS_API()
        >>> tts.emotions

    Example with a built-in 🐸 speaker:
        >>> from TTS.api import CS_API
        >>> tts = CS_API()
        >>> wav, sr = tts.tts("Hello world", speaker_name="Claribel Dervla")
        >>> filepath = tts.tts_to_file(text="Hello world!", speaker_name=tts.speakers[0].name, file_path="output.wav")
    """

    def __init__(self, api_token=None):
        """Store the token (or read it from the environment) and fail early if none is available.

        Args:
            api_token (str, optional): API token. When None, `COQUI_STUDIO_TOKEN` is read from the environment.

        Raises:
            ValueError: If no token is given and the environment variable is unset or empty.
        """
        self.api_token = api_token
        self.api_prefix = "/api/v2"
        self.headers = None
        self._speakers = None
        self._check_token()

    @property
    def speakers(self):
        """All speakers (built-in and custom voices); fetched on first access and cached afterwards."""
        if self._speakers is None:
            self._speakers = self.list_all_speakers()
        return self._speakers

    @property
    def emotions(self):
        """Return a list of available emotions.

        TODO: Get this from the API endpoint.
        """
        return ["Neutral", "Happy", "Sad", "Angry", "Dull"]

    def _check_token(self):
        """Resolve the API token and build the request headers.

        Raises:
            ValueError: If no token was passed and `COQUI_STUDIO_TOKEN` is unset or empty.
        """
        if self.api_token is None:
            self.api_token = os.environ.get("COQUI_STUDIO_TOKEN")
        if not self.api_token:
            raise ValueError(
                "No API token found for 🐸Coqui Studio voices - https://coqui.ai.\n"
                "Visit 🔗https://app.coqui.ai/account to get one.\n"
                "Set it as an environment variable `export COQUI_STUDIO_TOKEN=<token>`\n"
                ""
            )
        # Only build the headers once a usable token is known.
        self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_token}"}

    def _request(self, method, path, body=None):
        """Send one authenticated HTTPS request to app.coqui.ai and return the raw response body.

        The connection is always closed, even when the request or the read fails.
        """
        self._check_token()
        conn = http.client.HTTPSConnection("app.coqui.ai")
        try:
            conn.request(method, path, body, self.headers)
            return conn.getresponse().read()
        finally:
            conn.close()

    def list_all_speakers(self):
        """Return both built-in Coqui Studio speakers and custom voices created by the user."""
        return self.list_speakers() + self.list_voices()

    def list_speakers(self):
        """List built-in Coqui Studio speakers."""
        data = self._request("GET", f"{self.api_prefix}/speakers")
        return [Speaker(s) for s in json.loads(data)["result"]]

    def list_voices(self):
        """List custom voices created by the user."""
        data = self._request("GET", f"{self.api_prefix}/voices")
        return [Speaker(s, True) for s in json.loads(data)["result"]]

    def list_speakers_as_tts_models(self):
        """List speakers in ModelManager format."""
        return [f"coqui_studio/en/{speaker.name}/coqui_studio" for speaker in self.speakers]

    def name_to_speaker(self, name):
        """Return the speaker whose `name` equals `name`.

        Raises:
            ValueError: If no speaker has that name.
        """
        for speaker in self.speakers:
            if speaker.name == name:
                return speaker
        raise ValueError(f"Speaker {name} not found.")

    def id_to_speaker(self, speaker_id):
        """Return the speaker whose `id` equals `speaker_id`.

        Raises:
            ValueError: If no speaker has that id.
        """
        for speaker in self.speakers:
            if speaker.id == speaker_id:
                return speaker
        raise ValueError(f"Speaker {speaker_id} not found.")

    @staticmethod
    def url_to_np(url):
        """Download a WAV file and decode it.

        The payload is decoded from memory, so no temporary file is left behind.

        Args:
            url (str): URL of the WAV file.

        Returns:
            Tuple[np.ndarray, int]: Samples and sample rate, as read by `scipy.io.wavfile.read`.
        """
        import io  # local import: keeps the block self-contained without touching file-level imports

        with urllib.request.urlopen(url) as response:
            rate, data = wavfile.read(io.BytesIO(response.read()))
        return data, rate

    @staticmethod
    def _create_payload(text, speaker, emotion, speed):
        """Build the JSON payload for a synthesis request.

        Custom voices (`speaker.is_voice` true) are addressed with `voice_id`, built-in speakers with `speaker_id`.
        """
        payload = {}
        if speaker.is_voice:
            payload["voice_id"] = speaker.id
        else:
            payload["speaker_id"] = speaker.id
        payload.update(
            {
                "emotion": emotion,
                "name": speaker.name,
                "text": text,
                "speed": speed,
            }
        )
        return payload

    def tts(
        self,
        text: str,
        speaker_name: str = None,
        speaker_id=None,
        emotion="Neutral",
        speed=1.0,
        language=None,  # pylint: disable=unused-argument
    ) -> Tuple[np.ndarray, int]:
        """Synthesize speech from text.

        Args:
            text (str): Text to synthesize.
            speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
                voices (user generated speakers) with `list_voices()`.
            speaker_id (str): Speaker ID. If None, the speaker name is used.
            emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull".
            speed (float): Speed of the speech. 1.0 is normal speed.
            language (str): Accepted for interface compatibility; it is not used by this method.

        Returns:
            Tuple[np.ndarray, int]: The waveform and its sample rate.

        Raises:
            ValueError: If neither `speaker_name` nor `speaker_id` is given, if the speaker is unknown, or if the
                API response carries no `audio_url`.
        """
        self._check_token()
        if speaker_name is None and speaker_id is None:
            raise ValueError(" [!] Please provide either a `speaker_name` or a `speaker_id`.")
        if speaker_id is None:
            speaker = self.name_to_speaker(speaker_name)
        else:
            speaker = self.id_to_speaker(speaker_id)
        payload = self._create_payload(text, speaker, emotion, speed)
        data = self._request("POST", f"{self.api_prefix}/samples", json.dumps(payload))
        # Keep the try body minimal: only a missing key means "the API reported an error".
        try:
            audio_url = json.loads(data)["audio_url"]
        except KeyError as e:
            raise ValueError(f" [!] 🐸 API returned error: {data}") from e
        wav, sr = self.url_to_np(audio_url)
        return wav, sr

    def tts_to_file(
        self,
        text: str,
        speaker_name: str = None,
        speaker_id=None,
        emotion="Neutral",
        speed=1.0,
        language=None,
        file_path: str = None,
    ) -> str:
        """Synthesize speech from text and save it to a file.

        Args:
            text (str): Text to synthesize.
            speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
                voices (user generated speakers) with `list_voices()`. May be omitted when `speaker_id` is given.
            speaker_id (str): Speaker ID. If None, the speaker name is used.
            emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull".
            speed (float): Speed of the speech. 1.0 is normal speed.
            language (str): Forwarded to `tts()`, which does not use it.
            file_path (str): Path to save the file. If None, a temporary file is created.

        Returns:
            str: Path of the written WAV file.
        """
        # Synthesize first so that a failed request does not leave an empty temporary file behind.
        wav, sr = self.tts(text, speaker_name, speaker_id, emotion, speed, language)
        if file_path is None:
            # mkstemp creates the file atomically; mktemp only returns a name and is race-prone.
            fd, file_path = tempfile.mkstemp(suffix=".wav")
            os.close(fd)
        wavfile.write(file_path, sr, wav)
        return file_path
223
+
224
+
225
+ class TTS:
226
+ """TODO: Add voice conversion and Capacitron support."""
227
+
228
def __init__(
    self,
    model_name: str = None,
    model_path: str = None,
    config_path: str = None,
    vocoder_path: str = None,
    vocoder_config_path: str = None,
    progress_bar: bool = True,
    gpu=False,
):
    """🐸TTS python interface for loading and using the released models.

    A model can be selected either by its catalogue name (`model_name`) or by explicit checkpoint/config
    paths (`model_path` and friends). When neither is given, nothing is loaded at construction time.

    Example with a multi-speaker model:
        >>> from TTS.api import TTS
        >>> tts = TTS(TTS.list_models()[0])
        >>> wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0])
        >>> tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav")

    Example with a single-speaker model:
        >>> tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False)
        >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav")

    Example loading a model from a path:
        >>> tts = TTS(model_path="/path/to/checkpoint_100000.pth", config_path="/path/to/config.json", progress_bar=False, gpu=False)
        >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav")

    Example voice cloning with YourTTS in English, French and Portuguese:
        >>> tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True)
        >>> tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="thisisit.wav")
        >>> tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr", file_path="thisisit.wav")
        >>> tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt", file_path="thisisit.wav")

    Args:
        model_name (str, optional): Model name to load. You can list models by ```tts.models```. Defaults to None.
        model_path (str, optional): Path to the model checkpoint. Defaults to None.
        config_path (str, optional): Path to the model config. Defaults to None.
        vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
        vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None.
        progress_bar (bool, optional): Whether to print a progress bar while downloading a model. Defaults to True.
        gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
    """
    # Start from an empty state; the loaders below fill in whatever applies.
    self.synthesizer = None
    self.voice_converter = None
    self.csapi = None
    self.model_name = None

    self.manager = ModelManager(
        models_file=self.get_models_file_path(),
        progress_bar=progress_bar,
        verbose=False,
    )

    if model_name:
        self.load_tts_model_by_name(model_name, gpu)

    if model_path:
        self.load_tts_model_by_path(
            model_path,
            config_path,
            vocoder_path=vocoder_path,
            vocoder_config=vocoder_config_path,
            gpu=gpu,
        )
283
+
284
+ @property
285
+ def models(self):
286
+ return self.manager.list_tts_models()
287
+
288
+ @property
289
+ def is_multi_speaker(self):
290
+ if hasattr(self.synthesizer.tts_model, "speaker_manager") and self.synthesizer.tts_model.speaker_manager:
291
+ return self.synthesizer.tts_model.speaker_manager.num_speakers > 1
292
+ return False
293
+
294
+ @property
295
+ def is_coqui_studio(self):
296
+ return "coqui_studio" in self.model_name
297
+
298
@property
def is_multi_lingual(self):
    """Tell whether the loaded TTS model exposes more than one language.

    Returns:
        bool: False when the model has no (or a falsy) ``language_manager``,
        otherwise whether ``language_manager.num_languages`` exceeds 1.
    """
    language_manager = getattr(self.synthesizer.tts_model, "language_manager", None)
    if not language_manager:
        return False
    return language_manager.num_languages > 1
303
+
304
@property
def speakers(self):
    """Return the speaker names of a multi-speaker model, or None otherwise."""
    if self.is_multi_speaker:
        return self.synthesizer.tts_model.speaker_manager.speaker_names
    return None
309
+
310
@property
def languages(self):
    """Return the language names of a multi-lingual model, or None otherwise."""
    if self.is_multi_lingual:
        return self.synthesizer.tts_model.language_manager.language_names
    return None
315
+
316
@staticmethod
def get_models_file_path():
    """Return the path of the `.models.json` file that sits next to this module."""
    module_dir = Path(__file__).parent
    return module_dir.joinpath(".models.json")
319
+
320
@staticmethod
def list_models():
    """List the local 🐸TTS models followed by the Coqui Studio speakers exposed as models.

    If creating the Studio API client or listing its speakers raises
    ``ValueError``, the error is printed and only the local models are returned.
    """
    try:
        studio_models = CS_API().list_speakers_as_tts_models()
    except ValueError as err:
        print(err)
        studio_models = []
    local_manager = ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False)
    return local_manager.list_tts_models() + studio_models
330
+
331
def download_model_by_name(self, model_name: str):
    """Download a model and, when its entry names one, its default vocoder.

    Args:
        model_name (str): Name passed to the model manager's ``download_model``.

    Returns:
        tuple: ``(model_path, config_path, vocoder_path, vocoder_config_path)``;
        the two vocoder entries are None when the model entry has no
        ``default_vocoder``.
    """
    model_path, config_path, model_item = self.manager.download_model(model_name)
    vocoder_name = model_item.get("default_vocoder")
    if vocoder_name is not None:
        vocoder_path, vocoder_config_path, _ = self.manager.download_model(model_item["default_vocoder"])
        return model_path, config_path, vocoder_path, vocoder_config_path
    return model_path, config_path, None, None
337
+
338
def load_vc_model_by_name(self, model_name: str, gpu: bool = False):
    """Download a voice conversion model by name and store it as ``self.voice_converter``.

    Args:
        model_name (str): Model name to load. You can list models by ```tts.models```.
        gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
    """
    # Only the checkpoint and config paths are needed; the vocoder slots are ignored.
    checkpoint, config, _, _ = self.download_model_by_name(model_name)
    self.voice_converter = Synthesizer(vc_checkpoint=checkpoint, vc_config=config, use_cuda=gpu)
347
+
348
def load_tts_model_by_name(self, model_name: str, gpu: bool = False):
    """Load one of the 🐸TTS models, or select a Coqui Studio model, by name.

    Any previously loaded synthesizer or Studio client is dropped first. Names
    containing ``"coqui_studio"`` only create a ``CS_API`` client; every other
    name is downloaded and wrapped in a ``Synthesizer``.

    Args:
        model_name (str): Model name to load. You can list models by ```tts.models```.
        gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.

    TODO: Add tests
    """
    self.synthesizer = None
    self.csapi = None
    self.model_name = model_name

    if "coqui_studio" in model_name:
        self.csapi = CS_API()
        return

    model_path, config_path, vocoder_path, vocoder_config_path = self.download_model_by_name(model_name)

    # Init the synthesizer; arguments left as None are fetched from the model.
    synthesizer_kwargs = {
        "tts_checkpoint": model_path,
        "tts_config_path": config_path,
        "tts_speakers_file": None,
        "tts_languages_file": None,
        "vocoder_checkpoint": vocoder_path,
        "vocoder_config": vocoder_config_path,
        "encoder_checkpoint": None,
        "encoder_config": None,
        "use_cuda": gpu,
    }
    self.synthesizer = Synthesizer(**synthesizer_kwargs)
379
+
380
def load_tts_model_by_path(
    self, model_path: str, config_path: str, vocoder_path: str = None, vocoder_config: str = None, gpu: bool = False
):
    """Build ``self.synthesizer`` from checkpoint and config files on disk.

    Args:
        model_path (str): Path to the model checkpoint.
        config_path (str): Path to the model config.
        vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
        vocoder_config (str, optional): Path to the vocoder config. Defaults to None.
        gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
    """
    synthesizer_kwargs = {
        "tts_checkpoint": model_path,
        "tts_config_path": config_path,
        "tts_speakers_file": None,
        "tts_languages_file": None,
        "vocoder_checkpoint": vocoder_path,
        "vocoder_config": vocoder_config,
        "encoder_checkpoint": None,
        "encoder_config": None,
        "use_cuda": gpu,
    }
    self.synthesizer = Synthesizer(**synthesizer_kwargs)
404
+
405
+ def _check_arguments(
406
+ self,
407
+ speaker: str = None,
408
+ language: str = None,
409
+ speaker_wav: str = None,
410
+ emotion: str = None,
411
+ speed: float = None,
412
+ ) -> None:
413
+ """Check if the arguments are valid for the model."""
414
+ if not self.is_coqui_studio:
415
+ # check for the coqui tts models
416
+ if self.is_multi_speaker and (speaker is None and speaker_wav is None):
417
+ raise ValueError("Model is multi-speaker but no `speaker` is provided.")
418
+ if self.is_multi_lingual and language is None:
419
+ raise ValueError("Model is multi-lingual but no `language` is provided.")
420
+ if not self.is_multi_speaker and speaker is not None:
421
+ raise ValueError("Model is not multi-speaker but `speaker` is provided.")
422
+ if not self.is_multi_lingual and language is not None:
423
+ raise ValueError("Model is not multi-lingual but `language` is provided.")
424
+ if not emotion is None and not speed is None:
425
+ raise ValueError("Emotion and speed can only be used with Coqui Studio models.")
426
+ else:
427
+ if emotion is None:
428
+ emotion = "Neutral"
429
+ if speed is None:
430
+ speed = 1.0
431
+ # check for the studio models
432
+ if speaker_wav is not None:
433
+ raise ValueError("Coqui Studio models do not support `speaker_wav` argument.")
434
+ if speaker is not None:
435
+ raise ValueError("Coqui Studio models do not support `speaker` argument.")
436
+ if language is not None and language != "en":
437
+ raise ValueError("Coqui Studio models currently support only `language=en` argument.")
438
+ if emotion not in ["Neutral", "Happy", "Sad", "Angry", "Dull"]:
439
+ raise ValueError(f"Emotion - `{emotion}` - must be one of `Neutral`, `Happy`, `Sad`, `Angry`, `Dull`.")
440
+
441
def tts_coqui_studio(
    self,
    text: str,
    speaker_name: str = None,
    language: str = None,
    emotion: str = "Neutral",
    speed: float = 1.0,
    file_path: str = None,
):
    """Convert text to speech using Coqui Studio models. Use `CS_API` class if you are only interested in the API.

    Args:
        text (str):
            Input text to synthesize.
        speaker_name (str, optional):
            Ignored: the speaker is always taken from the third `/`-separated
            field of `self.model_name`. Defaults to None.
        language (str, optional):
            Language code. Coqui Studio currently supports only English. Defaults to None.
        emotion (str, optional):
            Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Defaults to "Neutral".
        speed (float, optional):
            Speed of the speech. Defaults to 1.0.
        file_path (str, optional):
            Path to save the output file. When None the audio is synthesized in memory with
            `csapi.tts` instead of being written to disk. Defaults to None.

    Returns:
        The first element of whatever the selected `CS_API` call returns.
    """
    speaker_name = self.model_name.split("/")[2]
    # The branch used to be inverted: `tts_to_file` ran only when NO path was
    # given, and a supplied `file_path` was silently ignored.
    if file_path is not None:
        # NOTE(review): `[0]` is kept from the original; confirm against
        # `CS_API.tts_to_file`'s return value that indexing is intended here.
        return self.csapi.tts_to_file(
            text=text,
            speaker_name=speaker_name,
            language=language,
            speed=speed,
            emotion=emotion,
            file_path=file_path,
        )[0]
    return self.csapi.tts(text=text, speaker_name=speaker_name, language=language, speed=speed, emotion=emotion)[0]
477
+
478
def tts(
    self,
    text: str,
    speaker: str = None,
    language: str = None,
    speaker_wav: str = None,
    emotion: str = None,
    speed: float = None,
):
    """Synthesize speech for `text` and return the result of the active backend.

    Arguments are validated first. When a Coqui Studio client is set the request
    goes to `tts_coqui_studio`; otherwise it goes to the local synthesizer.

    Args:
        text (str):
            Input text to synthesize.
        speaker (str, optional):
            Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
            `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
        language (str, optional):
            Language code for multi-lingual models. You can check whether loaded model is multi-lingual
            `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
        speaker_wav (str, optional):
            Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
            Defaults to None.
        emotion (str, optional):
            Emotion to use for 🐸Coqui Studio models. If None, Studio models use "Neutral". Defaults to None.
        speed (float, optional):
            Speed factor to use for 🐸Coqui Studio models, between 0 and 2.0. If None, Studio models use 1.0.
            Defaults to None.
    """
    self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed)

    if self.csapi is None:
        # Local model: only text, speaker, language and speaker_wav are forwarded.
        return self.synthesizer.tts(
            text=text,
            speaker_name=speaker,
            language_name=language,
            speaker_wav=speaker_wav,
            reference_wav=None,
            style_wav=None,
            style_text=None,
            reference_speaker_name=None,
        )

    return self.tts_coqui_studio(text=text, speaker_name=speaker, language=language, emotion=emotion, speed=speed)
524
+
525
def tts_to_file(
    self,
    text: str,
    speaker: str = None,
    language: str = None,
    speaker_wav: str = None,
    emotion: str = "Neutral",
    speed: float = 1.0,
    file_path: str = "output.wav",
):
    """Synthesize speech for `text` and write it to `file_path`.

    Args:
        text (str):
            Input text to synthesize.
        speaker (str, optional):
            Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
            `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
        language (str, optional):
            Language code for multi-lingual models. You can check whether loaded model is multi-lingual
            `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
        speaker_wav (str, optional):
            Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
            Defaults to None.
        emotion (str, optional):
            Emotion to use for 🐸Coqui Studio models. Defaults to "Neutral".
        speed (float, optional):
            Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Defaults to 1.0.
        file_path (str, optional):
            Output file path. Defaults to "output.wav".

    Returns:
        `file_path` for local models; for Coqui Studio models, whatever
        `tts_coqui_studio` returns.
    """
    # `emotion` and `speed` are not validated here; they are only forwarded to Studio.
    self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav)

    if self.csapi is None:
        waveform = self.tts(text=text, speaker=speaker, language=language, speaker_wav=speaker_wav)
        self.synthesizer.save_wav(wav=waveform, path=file_path)
        return file_path

    return self.tts_coqui_studio(
        text=text, speaker_name=speaker, language=language, emotion=emotion, speed=speed, file_path=file_path
    )
565
+
566
def voice_conversion(
    self,
    sourve_wav: str,
    target_wav: str,
):
    """Voice conversion with FreeVC. Convert source wav to target speaker.

    Args:
        sourve_wav (str):
            Path to the source wav file. (The parameter name is misspelled; it is
            kept as-is so existing keyword callers keep working.)
        target_wav (str):
            Path to the target wav file.

    Returns:
        The result of `self.synthesizer.voice_conversion`.
    """
    return self.synthesizer.voice_conversion(source_wav=sourve_wav, target_wav=target_wav)
581
+
582
def tts_with_vc(self, text: str, language: str = None, speaker_wav: str = None):
    """Convert text to speech with voice conversion.

    It combines tts with voice conversion to fake voice cloning.

    - Convert text to speech with tts.
    - Convert the output wav to target speaker with voice conversion.

    Args:
        text (str):
            Input text to synthesize.
        language (str, optional):
            Language code for multi-lingual models. You can check whether loaded model is multi-lingual
            `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
        speaker_wav (str, optional):
            Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
            Defaults to None.

    Returns:
        The result of the voice converter's `voice_conversion` call.
    """
    import os  # local import: only needed to clean up the temporary file

    # Reserve a temp path and close the handle right away, so the file can be
    # re-opened by name on every platform.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        tmp_wav_path = fp.name
    try:
        # Lazy code... save it to a temp file to resample it while reading it for VC
        self.tts_to_file(text=text, speaker=None, language=language, file_path=tmp_wav_path)
        if self.voice_converter is None:
            self.load_vc_model_by_name("voice_conversion_models/multilingual/vctk/freevc24")
        return self.voice_converter.voice_conversion(source_wav=tmp_wav_path, target_wav=speaker_wav)
    finally:
        # `delete=False` means nobody else removes the file; without this every
        # call leaves a wav behind in the temp directory.
        if os.path.exists(tmp_wav_path):
            os.remove(tmp_wav_path)
607
+
608
def tts_with_vc_to_file(
    self, text: str, language: str = None, speaker_wav: str = None, file_path: str = "output.wav"
):
    """Run `tts_with_vc` and save the converted audio to `file_path`.

    The file is written at the voice converter's configured output sample rate.
    Check `tts_with_vc` for more details.

    Args:
        text (str):
            Input text to synthesize.
        language (str, optional):
            Language code for multi-lingual models. You can check whether loaded model is multi-lingual
            `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
        speaker_wav (str, optional):
            Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
            Defaults to None.
        file_path (str, optional):
            Output file path. Defaults to "output.wav".
    """
    converted = self.tts_with_vc(text=text, language=language, speaker_wav=speaker_wav)
    # Read the rate only after `tts_with_vc`, which may create the voice converter.
    output_rate = self.voice_converter.vc_config.audio.output_sample_rate
    save_wav(wav=converted, path=file_path, sample_rate=output_rate)
TTS/bin/__init__.py ADDED
File without changes
TTS/bin/collect_env_info.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Get detailed info about the working environment."""
2
+ import os
3
+ import platform
4
+ import sys
5
+
6
+ import numpy
7
+ import torch
8
+
9
+ sys.path += [os.path.abspath(".."), os.path.abspath(".")]
10
+ import json
11
+
12
+ import TTS
13
+
14
+
15
def system_info():
    """Collect OS, architecture, platform version, processor and Python version."""
    info = {}
    info["OS"] = platform.system()
    info["architecture"] = platform.architecture()
    info["version"] = platform.version()
    info["processor"] = platform.processor()
    info["python"] = platform.python_version()
    return info
23
+
24
+
25
def cuda_info():
    """Collect the visible CUDA device names, CUDA availability and CUDA version from torch."""
    device_names = [torch.cuda.get_device_name(index) for index in range(torch.cuda.device_count())]
    is_available = torch.cuda.is_available()
    return {"GPU": device_names, "available": is_available, "version": torch.version.cuda}
31
+
32
+
33
def package_info():
    """Collect the versions of numpy, PyTorch (plus its debug flag) and TTS."""
    versions = {}
    versions["numpy"] = numpy.__version__
    versions["PyTorch_version"] = torch.__version__
    versions["PyTorch_debug"] = torch.version.debug
    versions["TTS"] = TTS.__version__
    return versions
40
+
41
+
42
def main():
    """Print the system, CUDA and package details as indented, key-sorted JSON."""
    details = {}
    details["System"] = system_info()
    details["CUDA"] = cuda_info()
    details["Packages"] = package_info()
    report = json.dumps(details, indent=4, sort_keys=True)
    print(report)


if __name__ == "__main__":
    main()
TTS/bin/compute_attention_masks.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import importlib
3
+ import os
4
+ from argparse import RawTextHelpFormatter
5
+
6
+ import numpy as np
7
+ import torch
8
+ from torch.utils.data import DataLoader
9
+ from tqdm import tqdm
10
+
11
+ from TTS.config import load_config
12
+ from TTS.tts.datasets.TTSDataset import TTSDataset
13
+ from TTS.tts.models import setup_model
14
+ from TTS.tts.utils.text.characters import make_symbols, phonemes, symbols
15
+ from TTS.utils.audio import AudioProcessor
16
+ from TTS.utils.io import load_checkpoint
17
+
18
if __name__ == "__main__":

    def _str2bool(value):
        """Parse a command-line boolean such as "True"/"False", "1"/"0", "yes"/"no"."""
        if isinstance(value, bool):
            return value
        lowered = str(value).strip().lower()
        if lowered in ("1", "true", "t", "yes", "y", "on"):
            return True
        if lowered in ("", "0", "false", "f", "no", "n", "off"):
            return False
        raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}.")

    # pylint: disable=bad-option-value
    parser = argparse.ArgumentParser(
        description="""Extract attention masks from trained Tacotron/Tacotron2 models.
These masks can be used for different purposes including training a TTS model with a Duration Predictor.\n\n"""
        """Each attention mask is written next to the input wav file with the "_attn.npy" suffix.
(e.g. path/bla.wav (wav file) --> path/bla_attn.npy (attention mask))\n"""
        """
Example run:
    CUDA_VISIBLE_DEVICES="0" python TTS/bin/compute_attention_masks.py
        --model_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/checkpoint_200000.pth
        --config_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/config.json
        --dataset_metafile metadata.csv
        --data_path /root/LJSpeech-1.1/
        --batch_size 32
        --dataset ljspeech
        --use_cuda True
""",
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument("--model_path", type=str, required=True, help="Path to Tacotron/Tacotron2 model file ")
    parser.add_argument(
        "--config_path",
        type=str,
        required=True,
        help="Path to Tacotron/Tacotron2 config file.",
    )
    parser.add_argument(
        "--dataset",
        type=str,
        default="",
        required=True,
        help="Target dataset processor name from TTS.tts.dataset.preprocess.",
    )

    parser.add_argument(
        "--dataset_metafile",
        type=str,
        default="",
        required=True,
        help="Dataset metafile inclusing file paths with transcripts.",
    )
    parser.add_argument("--data_path", type=str, default="", help="Defines the data path. It overwrites config.json.")
    # `type=bool` would turn ANY non-empty string (including "False") into True,
    # so the flag is parsed with an explicit string-to-bool converter instead.
    parser.add_argument("--use_cuda", type=_str2bool, default=False, help="enable/disable cuda.")

    parser.add_argument(
        "--batch_size", default=16, type=int, help="Batch size for the model. Use batch_size=1 if you have no CUDA."
    )
    args = parser.parse_args()

    C = load_config(args.config_path)
    ap = AudioProcessor(**C.audio)

    # if the vocabulary was passed, replace the default
    if "characters" in C.keys():
        symbols, phonemes = make_symbols(**C.characters)

    # load the model
    # NOTE(review): `num_chars` is not used further down in this script.
    num_chars = len(phonemes) if C.use_phonemes else len(symbols)
    # TODO: handle multi-speaker
    model = setup_model(C)
    model, _ = load_checkpoint(model, args.model_path, args.use_cuda, True)

    # data loader
    preprocessor = importlib.import_module("TTS.tts.datasets.formatters")
    preprocessor = getattr(preprocessor, args.dataset)
    meta_data = preprocessor(args.data_path, args.dataset_metafile)
    dataset = TTSDataset(
        model.decoder.r,
        C.text_cleaner,
        compute_linear_spec=False,
        ap=ap,
        meta_data=meta_data,
        characters=C.characters if "characters" in C.keys() else None,
        add_blank=C["add_blank"] if "add_blank" in C.keys() else False,
        use_phonemes=C.use_phonemes,
        phoneme_cache_path=C.phoneme_cache_path,
        phoneme_language=C.phoneme_language,
        enable_eos_bos=C.enable_eos_bos_chars,
    )

    dataset.sort_and_filter_items(C.get("sort_by_audio_len", default=False))
    loader = DataLoader(
        dataset,
        batch_size=args.batch_size,
        num_workers=4,
        collate_fn=dataset.collate_fn,
        shuffle=False,
        drop_last=False,
    )

    # compute attentions
    file_paths = []
    with torch.no_grad():
        for data in tqdm(loader):
            # setup input data (only the batch fields this script uses)
            text_input = data[0]
            text_lengths = data[1]
            mel_input = data[4]
            mel_lengths = data[5]
            item_idxs = data[7]

            # dispatch data to GPU
            if args.use_cuda:
                text_input = text_input.cuda()
                text_lengths = text_lengths.cuda()
                mel_input = mel_input.cuda()
                mel_lengths = mel_lengths.cuda()

            model_outputs = model.forward(text_input, text_lengths, mel_input)

            alignments = model_outputs["alignments"].detach()
            for idx, alignment in enumerate(alignments):
                item_idx = item_idxs[idx]
                # interpolate if r > 1
                alignment = (
                    torch.nn.functional.interpolate(
                        alignment.transpose(0, 1).unsqueeze(0),
                        size=None,
                        scale_factor=model.decoder.r,
                        mode="nearest",
                        align_corners=None,
                        recompute_scale_factor=None,
                    )
                    .squeeze(0)
                    .transpose(0, 1)
                )
                # remove paddings
                alignment = alignment[: mel_lengths[idx], : text_lengths[idx]].cpu().numpy()
                # set file paths
                wav_file_name = os.path.basename(item_idx)
                align_file_name = os.path.splitext(wav_file_name)[0] + "_attn.npy"
                # Swap only the final path component. A plain `str.replace` would
                # also rewrite any directory whose name contains the wav file name.
                file_path = os.path.join(os.path.dirname(item_idx), align_file_name)
                # save output
                wav_file_abs_path = os.path.abspath(item_idx)
                file_abs_path = os.path.abspath(file_path)
                file_paths.append([wav_file_abs_path, file_abs_path])
                np.save(file_path, alignment)

    # output metafile
    metafile = os.path.join(args.data_path, "metadata_attn_mask.txt")

    with open(metafile, "w", encoding="utf-8") as f:
        for p in file_paths:
            f.write(f"{p[0]}|{p[1]}\n")
    print(f" >> Metafile created: {metafile}")
TTS/bin/compute_embeddings.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ from argparse import RawTextHelpFormatter
4
+
5
+ import torch
6
+ from tqdm import tqdm
7
+
8
+ from TTS.config import load_config
9
+ from TTS.config.shared_configs import BaseDatasetConfig
10
+ from TTS.tts.datasets import load_tts_samples
11
+ from TTS.tts.utils.managers import save_file
12
+ from TTS.tts.utils.speakers import SpeakerManager
13
+
14
+
15
def compute_embeddings(
    model_path,
    config_path,
    output_path,
    old_spakers_file=None,
    config_dataset_path=None,
    formatter_name=None,
    dataset_name=None,
    dataset_path=None,
    meta_file_train=None,
    meta_file_val=None,
    disable_cuda=False,
    no_eval=False,
):
    """Compute an embedding for every sample of a dataset and save them to one file.

    The dataset comes either from `config_dataset_path` or, when that is None,
    from a `BaseDatasetConfig` built out of `formatter_name`, `dataset_name`,
    `dataset_path` and the optional meta files. Embeddings already present in
    `old_spakers_file` are reused instead of being recomputed.

    Args:
        model_path: Encoder checkpoint handed to `SpeakerManager`.
        config_path: Encoder config handed to `SpeakerManager`.
        output_path: Output file, or an existing directory in which
            `speakers.pth` is written.
        old_spakers_file: Previous embedding file to reuse, if any.
        config_dataset_path: Dataset config file path, if any.
        formatter_name: Formatter name for the ad-hoc dataset config.
        dataset_name: Dataset name for the ad-hoc dataset config.
        dataset_path: Dataset path for the ad-hoc dataset config.
        meta_file_train: Train meta file for the ad-hoc dataset config.
        meta_file_val: Validation meta file for the ad-hoc dataset config.
        disable_cuda: When True, CUDA is not used even if it is available.
        no_eval: When True, samples are loaded with `eval_split=False`.
    """
    use_cuda = torch.cuda.is_available() and not disable_cuda

    if config_dataset_path is None:
        # Build a dataset config on the fly from the individual arguments.
        c_dataset = BaseDatasetConfig()
        c_dataset.formatter = formatter_name
        c_dataset.dataset_name = dataset_name
        c_dataset.path = dataset_path
        if meta_file_train is not None:
            c_dataset.meta_file_train = meta_file_train
        if meta_file_val is not None:
            c_dataset.meta_file_val = meta_file_val
        meta_data_train, meta_data_eval = load_tts_samples(c_dataset, eval_split=not no_eval)
    else:
        c_dataset = load_config(config_dataset_path)
        meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=not no_eval)

    samples = meta_data_train if meta_data_eval is None else meta_data_train + meta_data_eval

    encoder_manager = SpeakerManager(
        encoder_model_path=model_path,
        encoder_config_path=config_path,
        d_vectors_file_path=old_spakers_file,
        use_cuda=use_cuda,
    )

    class_name_key = encoder_manager.encoder_config.class_name_key

    # compute speaker embeddings
    speaker_mapping = {}
    for sample in tqdm(samples):
        class_name = sample[class_name_key]
        audio_file = sample["audio_file"]
        embedding_key = sample["audio_unique_name"]

        reuse_old = old_spakers_file is not None and embedding_key in encoder_manager.clip_ids
        if reuse_old:
            # get the embedding from the old file
            embedding = encoder_manager.get_embedding_by_clip(embedding_key)
        else:
            # extract the embedding
            embedding = encoder_manager.compute_embedding_from_clip(audio_file)

        speaker_mapping[embedding_key] = {"name": class_name, "embedding": embedding}

    if not speaker_mapping:
        # Nothing was computed, so nothing is written.
        return

    mapping_file_path = os.path.join(output_path, "speakers.pth") if os.path.isdir(output_path) else output_path

    parent_dir = os.path.dirname(mapping_file_path)
    if parent_dir != "":
        os.makedirs(parent_dir, exist_ok=True)

    save_file(speaker_mapping, mapping_file_path)
    print("Speaker embeddings saved at:", mapping_file_path)
90
+
91
+
92
if __name__ == "__main__":

    def _str2bool(value):
        """Parse a command-line boolean such as "True"/"False", "1"/"0", "yes"/"no"."""
        if isinstance(value, bool):
            return value
        lowered = str(value).strip().lower()
        if lowered in ("1", "true", "t", "yes", "y", "on"):
            return True
        if lowered in ("", "0", "false", "f", "no", "n", "off"):
            return False
        raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}.")

    parser = argparse.ArgumentParser(
        description="""Compute embedding vectors for each audio file in a dataset and store them keyed by `{dataset_name}#{file_path}` in a .pth file\n\n"""
        """
        Example runs:
        python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --config_dataset_path dataset_config.json

        python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --formatter_name coqui --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --meta_file_train /path/to/vctk/metafile_train.csv --meta_file_val /path/to/vctk/metafile_eval.csv
        """,
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument(
        "--model_path",
        type=str,
        help="Path to model checkpoint file. It defaults to the released speaker encoder.",
        default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
    )
    parser.add_argument(
        "--config_path",
        type=str,
        help="Path to model config file. It defaults to the released speaker encoder config.",
        default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
    )
    parser.add_argument(
        "--config_dataset_path",
        type=str,
        help="Path to dataset config file. You either need to provide this or `formatter_name`, `dataset_name` and `dataset_path` arguments.",
        default=None,
    )
    parser.add_argument("--output_path", type=str, help="Path for output `pth` or `json` file.", default="speakers.pth")
    parser.add_argument(
        "--old_file", type=str, help="Previous embedding file to only compute new audios.", default=None
    )
    # `type=bool` would turn ANY non-empty string (including "False") into True,
    # so both flags are parsed with an explicit string-to-bool converter instead.
    parser.add_argument("--disable_cuda", type=_str2bool, help="Flag to disable cuda.", default=False)
    parser.add_argument("--no_eval", type=_str2bool, help="Do not compute eval?. Default False", default=False)
    parser.add_argument(
        "--formatter_name",
        type=str,
        help="Name of the formatter to use. You either need to provide this or `config_dataset_path`",
        default=None,
    )
    parser.add_argument(
        "--dataset_name",
        type=str,
        help="Name of the dataset to use. You either need to provide this or `config_dataset_path`",
        default=None,
    )
    parser.add_argument(
        "--dataset_path",
        type=str,
        help="Path to the dataset. You either need to provide this or `config_dataset_path`",
        default=None,
    )
    parser.add_argument(
        "--meta_file_train",
        type=str,
        help="Path to the train meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
        default=None,
    )
    parser.add_argument(
        "--meta_file_val",
        type=str,
        help="Path to the evaluation meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
        default=None,
    )
    args = parser.parse_args()

    compute_embeddings(
        args.model_path,
        args.config_path,
        args.output_path,
        old_spakers_file=args.old_file,
        config_dataset_path=args.config_dataset_path,
        formatter_name=args.formatter_name,
        dataset_name=args.dataset_name,
        dataset_path=args.dataset_path,
        meta_file_train=args.meta_file_train,
        meta_file_val=args.meta_file_val,
        disable_cuda=args.disable_cuda,
        no_eval=args.no_eval,
    )
TTS/bin/compute_statistics.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse
5
+ import glob
6
+ import os
7
+
8
+ import numpy as np
9
+ from tqdm import tqdm
10
+
11
+ # from TTS.utils.io import load_config
12
+ from TTS.config import load_config
13
+ from TTS.tts.datasets import load_tts_samples
14
+ from TTS.utils.audio import AudioProcessor
15
+
16
+
17
def main():
    """Compute per-bin mean and std of mel and linear spectrograms and save them.

    Reads the TTS config named on the command line, computes spectrograms for
    every wav under `--data_path` (or, when that is not given, for the train
    samples of the config's datasets) and stores the statistics together with
    the audio config in `out_path` via `np.save`.

    Raises:
        RuntimeError: If no spectrogram frames were collected (empty dataset).
    """
    parser = argparse.ArgumentParser(description="Compute mean and variance of spectrogtram features.")
    parser.add_argument("config_path", type=str, help="TTS config file path to define audio processin parameters.")
    parser.add_argument("out_path", type=str, help="save path (directory and filename).")
    parser.add_argument(
        "--data_path",
        type=str,
        required=False,
        help="folder including the target set of wavs overriding dataset config.",
    )
    args, overrides = parser.parse_known_args()

    CONFIG = load_config(args.config_path)
    CONFIG.parse_known_args(overrides, relaxed_parser=True)

    # load config
    CONFIG.audio.signal_norm = False  # do not apply earlier normalization
    CONFIG.audio.stats_path = None  # discard pre-defined stats

    # load audio processor
    ap = AudioProcessor(**CONFIG.audio.to_dict())

    # load the meta data of target dataset
    if args.data_path:
        dataset_items = glob.glob(os.path.join(args.data_path, "**", "*.wav"), recursive=True)
    else:
        dataset_items = load_tts_samples(CONFIG.datasets)[0]  # take only train data
    print(f" > There are {len(dataset_items)} files.")

    mel_sum = 0
    mel_square_sum = 0
    linear_sum = 0
    linear_square_sum = 0
    N = 0
    for item in tqdm(dataset_items):
        # compute features; items are either plain paths (glob) or sample dicts
        wav = ap.load_wav(item if isinstance(item, str) else item["audio_file"])
        linear = ap.spectrogram(wav)
        mel = ap.melspectrogram(wav)

        # compute stats
        N += mel.shape[1]
        mel_sum += mel.sum(1)
        linear_sum += linear.sum(1)
        mel_square_sum += (mel**2).sum(axis=1)
        linear_square_sum += (linear**2).sum(axis=1)

    # Fail with a clear message instead of an opaque ZeroDivisionError.
    if N == 0:
        raise RuntimeError("No spectrogram frames were collected; the dataset is empty or contains no audio.")

    # E[x^2] - E[x]^2 can round to a tiny negative value for near-constant bins;
    # clamp at zero so the square root never yields NaN.
    mel_mean = mel_sum / N
    mel_scale = np.sqrt(np.maximum(mel_square_sum / N - mel_mean**2, 0.0))
    linear_mean = linear_sum / N
    linear_scale = np.sqrt(np.maximum(linear_square_sum / N - linear_mean**2, 0.0))

    output_file_path = args.out_path
    stats = {}
    stats["mel_mean"] = mel_mean
    stats["mel_std"] = mel_scale
    stats["linear_mean"] = linear_mean
    stats["linear_std"] = linear_scale

    print(f" > Avg mel spec mean: {mel_mean.mean()}")
    print(f" > Avg mel spec scale: {mel_scale.mean()}")
    print(f" > Avg linear spec mean: {linear_mean.mean()}")
    print(f" > Avg linear spec scale: {linear_scale.mean()}")

    # set default config values for mean-var scaling
    CONFIG.audio.stats_path = output_file_path
    CONFIG.audio.signal_norm = True
    # remove redundant values
    del CONFIG.audio.max_norm
    del CONFIG.audio.min_level_db
    del CONFIG.audio.symmetric_norm
    del CONFIG.audio.clip_norm
    stats["audio_config"] = CONFIG.audio.to_dict()
    np.save(output_file_path, stats, allow_pickle=True)
    print(f" > stats saved to {output_file_path}")


if __name__ == "__main__":
    main()
TTS/bin/eval_encoder.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ from argparse import RawTextHelpFormatter
3
+
4
+ import torch
5
+ from tqdm import tqdm
6
+
7
+ from TTS.config import load_config
8
+ from TTS.tts.datasets import load_tts_samples
9
+ from TTS.tts.utils.speakers import SpeakerManager
10
+
11
+
12
def compute_encoder_accuracy(dataset_items, encoder_manager):
    """Print the per-class and the class-averaged accuracy of an encoder classifier.

    For each sample an embedding is computed from its audio file, classified by
    ``encoder_manager.encoder_criterion.softmax.inference`` and the predicted
    class name is compared with the sample's ground-truth class name.

    Args:
        dataset_items: Iterable of sample dicts. Each one must hold
            ``"audio_file"`` and the key named by
            ``encoder_manager.encoder_config.class_name_key``.
        encoder_manager: Object providing ``encoder_config``,
            ``encoder_criterion``, ``use_cuda`` and
            ``compute_embedding_from_clip``.

    Raises:
        RuntimeError: If a sample has no class name, or if no prediction can be
            made because the criterion or the ``map_classid_to_classname``
            config entry is missing.
    """
    class_name_key = encoder_manager.encoder_config.class_name_key
    map_classid_to_classname = getattr(encoder_manager.encoder_config, "map_classid_to_classname", None)

    # class name -> list of 0/1 hits, one entry per evaluated sample
    class_acc_dict = {}

    # compute embeddings for all wav_files
    for item in tqdm(dataset_items):
        class_name = item[class_name_key]
        wav_file = item["audio_file"]

        # extract the embedding
        embedd = encoder_manager.compute_embedding_from_clip(wav_file)
        if encoder_manager.encoder_criterion is not None and map_classid_to_classname is not None:
            embedding = torch.FloatTensor(embedd).unsqueeze(0)
            if encoder_manager.use_cuda:
                embedding = embedding.cuda()

            class_id = encoder_manager.encoder_criterion.softmax.inference(embedding).item()
            # the mapping is keyed by the string form of the class id
            predicted_label = map_classid_to_classname[str(class_id)]
        else:
            predicted_label = None

        if class_name is None or predicted_label is None:
            raise RuntimeError("Error: class_name or/and predicted_label are None")

        class_acc_dict.setdefault(class_name, []).append(int(class_name == predicted_label))

    # An empty dataset used to end in a ZeroDivisionError on the final average.
    if not class_acc_dict:
        print("Average Accuracy: not available (no samples were evaluated)")
        return

    acc_avg = 0
    for key, values in class_acc_dict.items():
        acc = sum(values) / len(values)
        print("Class", key, "Accuracy:", acc)
        acc_avg += acc

    print("Average Accuracy:", acc_avg / len(class_acc_dict))
51
+
52
+
53
if __name__ == "__main__":

    def _str2bool(value):
        """Parse a boolean command-line value.

        ``type=bool`` would call ``bool()`` on the raw string, so "False" and
        "0" would both evaluate to True. The empty string stays False, which
        was the only way to switch a flag off before.
        """
        if isinstance(value, bool):
            return value
        lowered = value.lower()
        if lowered in ("yes", "true", "t", "y", "1"):
            return True
        if lowered in ("no", "false", "f", "n", "0", ""):
            return False
        raise argparse.ArgumentTypeError("Boolean value expected.")

    parser = argparse.ArgumentParser(
        description="""Compute the accuracy of the encoder.\n\n"""
        """
        Example runs:
        python TTS/bin/eval_encoder.py emotion_encoder_model.pth emotion_encoder_config.json dataset_config.json
        """,
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument("model_path", type=str, help="Path to model checkpoint file.")
    parser.add_argument(
        "config_path",
        type=str,
        help="Path to model config file.",
    )

    parser.add_argument(
        "config_dataset_path",
        type=str,
        help="Path to dataset config file.",
    )
    parser.add_argument("--use_cuda", type=_str2bool, help="flag to set cuda.", default=True)
    parser.add_argument("--eval", type=_str2bool, help="compute eval.", default=True)

    args = parser.parse_args()

    c_dataset = load_config(args.config_dataset_path)

    # evaluate on the train and the eval partitions together
    meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=args.eval)
    items = meta_data_train + meta_data_eval

    enc_manager = SpeakerManager(
        encoder_model_path=args.model_path, encoder_config_path=args.config_path, use_cuda=args.use_cuda
    )

    compute_encoder_accuracy(items, enc_manager)
TTS/bin/extract_tts_spectrograms.py ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Extract Mel spectrograms with teacher forcing."""
3
+
4
+ import argparse
5
+ import os
6
+
7
+ import numpy as np
8
+ import torch
9
+ from torch.utils.data import DataLoader
10
+ from tqdm import tqdm
11
+
12
+ from TTS.config import load_config
13
+ from TTS.tts.datasets import TTSDataset, load_tts_samples
14
+ from TTS.tts.models import setup_model
15
+ from TTS.tts.utils.speakers import SpeakerManager
16
+ from TTS.tts.utils.text.tokenizer import TTSTokenizer
17
+ from TTS.utils.audio import AudioProcessor
18
+ from TTS.utils.generic_utils import count_parameters
19
+
20
+ use_cuda = torch.cuda.is_available()
21
+
22
+
23
def setup_loader(ap, r, verbose=False):
    """Create a sequential (non-shuffled) DataLoader over the module-level samples.

    Reads the module-level config ``c``, the sample list ``meta_data`` and the
    ``speaker_manager``.

    Args:
        ap: Audio processor handed to the dataset.
        r: Value passed to the dataset as ``outputs_per_step``.
        verbose: Forwarded to the dataset.

    Returns:
        A ``DataLoader`` that keeps the dataset order and drops no batch.
    """
    tokenizer, _ = TTSTokenizer.init_from_config(c)

    # Only hand over the speaker tables that the config asks for.
    speaker_ids = speaker_manager.name_to_id if c.use_speaker_embedding else None
    d_vectors = speaker_manager.embeddings if c.use_d_vector_file else None

    dataset_kwargs = {
        "outputs_per_step": r,
        "compute_linear_spec": False,
        "samples": meta_data,
        "tokenizer": tokenizer,
        "ap": ap,
        "batch_group_size": 0,
        "min_text_len": c.min_text_len,
        "max_text_len": c.max_text_len,
        "min_audio_len": c.min_audio_len,
        "max_audio_len": c.max_audio_len,
        "phoneme_cache_path": c.phoneme_cache_path,
        "precompute_num_workers": 0,
        "use_noise_augment": False,
        "verbose": verbose,
        "speaker_id_mapping": speaker_ids,
        "d_vector_mapping": d_vectors,
    }
    dataset = TTSDataset(**dataset_kwargs)

    precompute_phonemes = c.use_phonemes and c.compute_input_seq_cache
    if precompute_phonemes:
        # precompute phonemes to have a better estimate of sequence lengths.
        dataset.compute_input_seq(c.num_loader_workers)
    dataset.preprocess_samples()

    return DataLoader(
        dataset,
        batch_size=c.batch_size,
        shuffle=False,
        collate_fn=dataset.collate_fn,
        drop_last=False,
        sampler=None,
        num_workers=c.num_loader_workers,
        pin_memory=False,
    )
60
+
61
+
62
def set_filename(wav_path, out_path):
    """Derive the output paths for one audio file and create their directories.

    Args:
        wav_path: Path of the source audio file.
        out_path: Root directory that receives the ``quant``, ``mel``,
            ``wav_gl`` and ``wav`` sub-directories.

    Returns:
        Tuple ``(file_name, wavq_path, mel_path, wav_gl_path, wav_path)``.
        ``wavq_path`` and ``mel_path`` carry no extension; the two wav paths
        end in ``.wav``.
    """
    wav_file = os.path.basename(wav_path)
    # Strip only the final extension. Cutting at the first dot mapped
    # "spk.001.wav" and "spk.002.wav" to the same name, so their outputs
    # overwrote each other.
    file_name = os.path.splitext(wav_file)[0]

    for sub_dir in ("quant", "mel", "wav_gl", "wav"):
        os.makedirs(os.path.join(out_path, sub_dir), exist_ok=True)

    wavq_path = os.path.join(out_path, "quant", file_name)
    mel_path = os.path.join(out_path, "mel", file_name)
    wav_gl_path = os.path.join(out_path, "wav_gl", file_name + ".wav")
    wav_path = os.path.join(out_path, "wav", file_name + ".wav")
    return file_name, wavq_path, mel_path, wav_gl_path, wav_path
74
+
75
+
76
def format_data(data):
    """Unpack a collated batch dict and move its tensors to the GPU if enabled.

    Args:
        data: Batch dict with the keys ``token_id``, ``token_id_lengths``,
            ``mel``, ``mel_lengths``, ``item_idxs``, ``d_vectors``,
            ``speaker_ids`` and ``attns``.

    Returns:
        Tuple ``(text_input, text_lengths, mel_input, mel_lengths, speaker_ids,
        d_vectors, avg_text_length, avg_spec_length, attn_mask, item_idx)``.
    """
    # setup input data
    text_input = data["token_id"]
    text_lengths = data["token_id_lengths"]
    mel_input = data["mel"]
    mel_lengths = data["mel_lengths"]
    item_idx = data["item_idxs"]
    d_vectors = data["d_vectors"]
    speaker_ids = data["speaker_ids"]
    attn_mask = data["attns"]

    # batch statistics are taken before anything is moved to the GPU
    avg_text_length = text_lengths.float().mean()
    avg_spec_length = mel_lengths.float().mean()

    # dispatch data to GPU; the module-level ``use_cuda`` flag decides
    if use_cuda:
        text_input, text_lengths, mel_input, mel_lengths = (
            tensor.cuda(non_blocking=True) for tensor in (text_input, text_lengths, mel_input, mel_lengths)
        )
        # the optional entries may be None and are then left untouched
        speaker_ids = None if speaker_ids is None else speaker_ids.cuda(non_blocking=True)
        d_vectors = None if d_vectors is None else d_vectors.cuda(non_blocking=True)
        attn_mask = None if attn_mask is None else attn_mask.cuda(non_blocking=True)

    return (
        text_input,
        text_lengths,
        mel_input,
        mel_lengths,
        speaker_ids,
        d_vectors,
        avg_text_length,
        avg_spec_length,
        attn_mask,
        item_idx,
    )
113
+
114
+
115
@torch.no_grad()
def inference(
    model_name,
    model,
    ap,
    text_input,
    text_lengths,
    mel_input,
    mel_lengths,
    speaker_ids=None,
    d_vectors=None,
):
    """Run a teacher-forced forward pass and return the output as a numpy array.

    Args:
        model_name: Lower-cased model name. Handled values are ``"glow_tts"``,
            ``"tacotron"`` and ``"tacotron2"``.
        model: The TTS model to run.
        ap: Audio processor; only used for ``"tacotron"`` to convert the linear
            output with ``out_linear_to_mel``.
        text_input: Batch of token ids.
        text_lengths: Lengths of the token sequences.
        mel_input: Batch of ground-truth spectrograms.
        mel_lengths: Lengths of the spectrograms.
        speaker_ids: Optional speaker ids.
        d_vectors: Optional speaker embeddings.

    Returns:
        The model output as a numpy array with the batch on the first axis.

    Raises:
        ValueError: If ``model_name`` is not one of the handled models. This
            case used to end in an UnboundLocalError on the return statement.
    """
    if model_name == "glow_tts":
        # Speaker ids take precedence over d_vectors; whichever is chosen is
        # passed under the "d_vectors" key.
        # NOTE(review): confirm this mapping against inference_with_MAS.
        speaker_c = None
        if speaker_ids is not None:
            speaker_c = speaker_ids
        elif d_vectors is not None:
            speaker_c = d_vectors
        outputs = model.inference_with_MAS(
            text_input,
            text_lengths,
            mel_input,
            mel_lengths,
            aux_input={"d_vectors": speaker_c, "speaker_ids": speaker_ids},
        )
        return outputs["model_outputs"].detach().cpu().numpy()

    if model_name in ("tacotron", "tacotron2"):
        aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors}
        outputs = model(text_input, text_lengths, mel_input, mel_lengths, aux_input)
        postnet_outputs = outputs["model_outputs"]

        if model_name == "tacotron2":
            return postnet_outputs.detach().cpu().numpy()

        # normalize tacotron output: convert each linear spectrogram to mel
        postnet_outputs = postnet_outputs.data.cpu().numpy()
        mel_specs = [torch.FloatTensor(ap.out_linear_to_mel(spec.T).T) for spec in postnet_outputs]
        return torch.stack(mel_specs).cpu().numpy()

    raise ValueError(
        f"Unsupported model '{model_name}': expected one of 'glow_tts', 'tacotron' or 'tacotron2'."
    )
159
+
160
+
161
def extract_spectrograms(
    data_loader, model, ap, output_path, quantized_wav=False, save_audio=False, debug=False, metada_name="metada.txt"
):
    """Run the model over every batch and store the predicted mel spectrograms.

    Each spectrogram is cut to its length from ``mel_lengths``, transposed and
    saved with ``np.save``. A metadata file with one
    ``<wav path>|<mel path>.npy`` line per sample is written at the end.

    Args:
        data_loader: Loader yielding collated batches.
        model: TTS model; it is switched to eval mode.
        ap: Audio processor used to load, quantize, invert and save audio.
        output_path: Root directory for all outputs.
        quantized_wav: Also save the quantized waveform of each sample.
        save_audio: Also save the loaded waveform of each sample.
        debug: Also save audio reconstructed from the predicted spectrogram.
        metada_name: File name of the metadata file inside ``output_path``.
    """
    model.eval()
    exported = []
    for batch in tqdm(data_loader, total=len(data_loader)):
        # format data; the two averages and the attention mask are not needed
        (
            text_input,
            text_lengths,
            mel_input,
            mel_lengths,
            speaker_ids,
            d_vectors,
            _,
            _,
            _,
            item_idx,
        ) = format_data(batch)

        model_output = inference(
            c.model.lower(),
            model,
            ap,
            text_input,
            text_lengths,
            mel_input,
            mel_lengths,
            speaker_ids,
            d_vectors,
        )

        batch_size = text_input.shape[0]
        for idx in range(batch_size):
            wav_file_path = item_idx[idx]
            wav = ap.load_wav(wav_file_path)
            _, wavq_path, mel_path, wav_gl_path, wav_path = set_filename(wav_file_path, output_path)

            # quantize and save wav
            if quantized_wav:
                np.save(wavq_path, ap.quantize(wav))

            # save TTS mel, trimmed to the real length of this sample
            mel_length = mel_lengths[idx]
            mel = model_output[idx][:mel_length, :].T
            np.save(mel_path, mel)

            exported.append([wav_file_path, mel_path])
            if save_audio:
                ap.save_wav(wav, wav_path)

            if debug:
                print("Audio for debug saved at:", wav_gl_path)
                wav = ap.inv_melspectrogram(mel)
                ap.save_wav(wav, wav_gl_path)

    metadata_file = os.path.join(output_path, metada_name)
    with open(metadata_file, "w", encoding="utf-8") as f:
        # np.save appends ".npy", so the stored path needs the suffix as well
        for wav_file, mel_file in exported:
            f.write(f"{wav_file}|{mel_file + '.npy'}\n")
221
+
222
+
223
def main(args):  # pylint: disable=redefined-outer-name
    """Load data and model, then extract spectrograms for every sample.

    Publishes ``meta_data`` and ``speaker_manager`` as module-level globals
    and reads the module-level config ``c``.

    Args:
        args: Parsed command-line arguments; ``eval``, ``checkpoint_path``,
            ``output_path``, ``quantized``, ``save_audio`` and ``debug`` are read.
    """
    # pylint: disable=global-variable-undefined
    global meta_data, speaker_manager

    # Audio processor
    ap = AudioProcessor(**c.audio)

    # load data instances and use eval and training partitions together
    meta_data_train, meta_data_eval = load_tts_samples(
        c.datasets, eval_split=args.eval, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
    )
    meta_data = meta_data_train + meta_data_eval

    # init speaker manager
    speaker_manager = None
    if c.use_speaker_embedding:
        speaker_manager = SpeakerManager(data_items=meta_data)
    elif c.use_d_vector_file:
        speaker_manager = SpeakerManager(d_vectors_file_path=c.d_vector_file)

    # setup model and restore its weights
    model = setup_model(c)
    model.load_checkpoint(c, args.checkpoint_path, eval=True)
    if use_cuda:
        model.cuda()

    num_params = count_parameters(model)
    print(f"\n > Model has {num_params} parameters", flush=True)

    # set r: glow_tts always uses 1, other models read it from their decoder
    if c.model.lower() == "glow_tts":
        r = 1
    else:
        r = model.decoder.r
    own_loader = setup_loader(ap, r, verbose=True)

    extract_spectrograms(
        own_loader,
        model,
        ap,
        args.output_path,
        quantized_wav=args.quantized,
        save_audio=args.save_audio,
        debug=args.debug,
        metada_name="metada.txt",
    )
271
+
272
+
273
if __name__ == "__main__":

    def _str2bool(value):
        """Parse a boolean command-line value.

        ``type=bool`` would call ``bool()`` on the raw string, so "False" and
        "0" would both evaluate to True. The empty string stays False, which
        was the only way to switch the flag off before.
        """
        if isinstance(value, bool):
            return value
        lowered = value.lower()
        if lowered in ("yes", "true", "t", "y", "1"):
            return True
        if lowered in ("no", "false", "f", "n", "0", ""):
            return False
        raise argparse.ArgumentTypeError("Boolean value expected.")

    parser = argparse.ArgumentParser()
    parser.add_argument("--config_path", type=str, help="Path to config file for training.", required=True)
    parser.add_argument("--checkpoint_path", type=str, help="Model file to be restored.", required=True)
    parser.add_argument("--output_path", type=str, help="Path to save mel specs", required=True)
    parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug")
    parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files")
    parser.add_argument("--quantized", action="store_true", help="Save quantized audio files")
    parser.add_argument("--eval", type=_str2bool, help="compute eval.", default=True)
    args = parser.parse_args()

    # ``c`` is read as a module-level global by the functions of this script
    c = load_config(args.config_path)
    c.audio.trim_silence = False
    main(args)
TTS/bin/find_unique_chars.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Find all the unique characters in a dataset"""
2
+ import argparse
3
+ from argparse import RawTextHelpFormatter
4
+
5
+ from TTS.config import load_config
6
+ from TTS.tts.datasets import load_tts_samples
7
+
8
+
9
def main():
    """Print the unique characters found in the texts of a dataset.

    Reads ``--config_path``, loads the train and eval samples of every
    configured dataset and reports the character set three ways: as is, only
    the lower-case characters, and everything forced to lower case.
    """
    # pylint: disable=bad-option-value
    parser = argparse.ArgumentParser(
        description="""Find all the unique characters or phonemes in a dataset.\n\n"""
        """
    Example runs:

    python TTS/bin/find_unique_chars.py --config_path config.json
    """,
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
    args = parser.parse_args()

    config = load_config(args.config_path)

    # load all datasets
    train_items, eval_items = load_tts_samples(
        config.datasets,
        eval_split=True,
        eval_split_max_size=config.eval_split_max_size,
        eval_split_size=config.eval_split_size,
    )
    samples = train_items + eval_items

    all_text = "".join(sample["text"] for sample in samples)
    chars = set(all_text)
    lower_chars = [ch for ch in chars if ch.islower()]
    chars_force_lower = {ch.lower() for ch in chars}

    print(f" > Number of unique characters: {len(chars)}")
    print(f" > Unique characters: {''.join(sorted(chars))}")
    print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")
    print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}")
42
+
43
+
44
+ if __name__ == "__main__":
45
+ main()
TTS/bin/find_unique_phonemes.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Find all the unique characters in a dataset"""
2
+ import argparse
3
+ import multiprocessing
4
+ from argparse import RawTextHelpFormatter
5
+
6
+ from tqdm.contrib.concurrent import process_map
7
+
8
+ from TTS.config import load_config
9
+ from TTS.tts.datasets import load_tts_samples
10
+ from TTS.tts.utils.text.phonemizers import Gruut
11
+
12
+
13
def compute_phonemes(item):
    """Return the set of symbols in the phonemized text of one sample.

    Uses the module-level ``phonemizer``; the ``|`` characters in its output
    are dropped before the symbols are collected.
    """
    phonemized = phonemizer.phonemize(item["text"])
    return set(phonemized.replace("|", ""))
17
+
18
+
19
def main():
    """Print the unique phonemes found in the texts of a dataset.

    Reads ``--config_path``, loads the train and eval samples, phonemizes
    every text in a process pool and reports the phoneme set three ways: as
    is, only the lower-case symbols, and everything forced to lower case.

    Raises:
        ValueError: If the config has no phoneme language, if a sample has no
            language, or if the samples use more than one language.
    """
    # pylint: disable=W0601
    global c, phonemizer
    # pylint: disable=bad-option-value
    parser = argparse.ArgumentParser(
        description="""Find all the unique characters or phonemes in a dataset.\n\n"""
        """
    Example runs:

    python TTS/bin/find_unique_phonemes.py --config_path config.json
    """,
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
    args = parser.parse_args()

    c = load_config(args.config_path)

    # load all datasets
    train_items, eval_items = load_tts_samples(
        c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
    )
    items = train_items + eval_items
    print("Num items:", len(items))

    language_list = [sample["language"] for sample in items]
    is_lang_def = all(language_list)

    if not (c.phoneme_language and is_lang_def):
        raise ValueError("Phoneme language must be defined in config.")

    # every sample has to share the language of the first one
    if language_list.count(language_list[0]) != len(language_list):
        raise ValueError(
            "Currently, just one phoneme language per config file is supported !! Please split the dataset config into different configs and run it individually for each language !!"
        )

    phonemizer = Gruut(language=language_list[0], keep_puncs=True)

    per_item_phonemes = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15)
    phones = set()
    for item_phonemes in per_item_phonemes:
        phones.update(item_phonemes)

    lower_phones = [ph for ph in phones if ph.islower()]
    phones_force_lower = {ph.lower() for ph in phones}

    print(f" > Number of unique phonemes: {len(phones)}")
    print(f" > Unique phonemes: {''.join(sorted(phones))}")
    print(f" > Unique lower phonemes: {''.join(sorted(lower_phones))}")
    print(f" > Unique all forced to lower phonemes: {''.join(sorted(phones_force_lower))}")
71
+
72
+
73
+ if __name__ == "__main__":
74
+ main()
TTS/bin/remove_silence_using_vad.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import glob
3
+ import os
4
+ import pathlib
5
+
6
+ from tqdm import tqdm
7
+
8
+ from TTS.utils.vad import get_vad_model_and_utils, remove_silence
9
+
10
+
11
def adjust_path_and_remove_silence(audio_path):
    """Write a silence-trimmed copy of ``audio_path`` below the output directory.

    The output path is the input path with the ``args.input_dir`` prefix
    replaced by ``args.output_dir``. Reads the module-level ``args`` and
    ``model_and_utils``.

    Args:
        audio_path: Path of the audio file to process.

    Returns:
        Tuple ``(output_path, is_speech)``. For a file that already exists and
        is skipped (no ``--force``), ``is_speech`` is True: the file was not
        analysed in this run, so it must not be reported as speech-free.
    """
    output_path = audio_path.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, ""))
    # ignore if the file exists
    if os.path.exists(output_path) and not args.force:
        # The caller unpacks two values; returning the bare path made every
        # rerun without --force fail on the first existing file.
        return output_path, True

    # create all directory structure
    pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    # remove the silence and save the audio
    output_path, is_speech = remove_silence(
        model_and_utils,
        audio_path,
        output_path,
        trim_just_beginning_and_end=args.trim_just_beginning_and_end,
        use_cuda=args.use_cuda,
    )

    return output_path, is_speech
29
+
30
+
31
def preprocess_audios():
    """Trim the silence of every file matching the glob below the input directory.

    Reads the module-level ``args``. The output paths of files for which no
    speech was reported are written to ``filtered_files.txt`` in the output
    directory.
    """
    files = sorted(glob.glob(os.path.join(args.input_dir, args.glob), recursive=True))
    print("> Number of files: ", len(files))
    if not args.force:
        print("> Ignoring files that already exist in the output idrectory.")

    print(
        "> Trimming just the beginning and the end with nonspeech parts."
        if args.trim_just_beginning_and_end
        else "> Trimming all nonspeech parts."
    )

    if not files:
        print("> No files Found !")
        return

    filtered_files = []
    for audio_file in tqdm(files):
        output_path, is_speech = adjust_path_and_remove_silence(audio_file)
        if not is_speech:
            filtered_files.append(output_path)

    # write files that do not have speech
    report_path = os.path.join(args.output_dir, "filtered_files.txt")
    with open(report_path, "w", encoding="utf-8") as report:
        for path in filtered_files:
            report.write(path + "\n")
58
+
59
+
60
if __name__ == "__main__":

    def _str2bool(value):
        """Parse a boolean command-line value.

        ``type=bool`` would call ``bool()`` on the raw string, so
        ``--use_cuda False`` enabled CUDA and
        ``--trim_just_beginning_and_end False`` could not switch trimming
        mode. The empty string stays False, which was the only way to pass
        False before.
        """
        if isinstance(value, bool):
            return value
        lowered = value.lower()
        if lowered in ("yes", "true", "t", "y", "1"):
            return True
        if lowered in ("no", "false", "f", "n", "0", ""):
            return False
        raise argparse.ArgumentTypeError("Boolean value expected.")

    parser = argparse.ArgumentParser(
        description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end True"
    )
    parser.add_argument("-i", "--input_dir", type=str, default="../VCTK-Corpus", help="Dataset root dir")
    parser.add_argument(
        "-o", "--output_dir", type=str, default="../VCTK-Corpus-removed-silence", help="Output Dataset dir"
    )
    parser.add_argument("-f", "--force", default=False, action="store_true", help="Force the replace of exists files")
    parser.add_argument(
        "-g",
        "--glob",
        type=str,
        default="**/*.wav",
        help="path in glob format for acess wavs from input_dir. ex: wav48/*/*.wav",
    )
    parser.add_argument(
        "-t",
        "--trim_just_beginning_and_end",
        type=_str2bool,
        default=True,
        help="If True this script will trim just the beginning and end nonspeech parts. If False all nonspeech parts will be trim. Default True",
    )
    parser.add_argument(
        "-c",
        "--use_cuda",
        type=_str2bool,
        default=False,
        help="If True use cuda",
    )
    # ``args`` and ``model_and_utils`` are read as globals by the functions above
    args = parser.parse_args()
    # load the model and utils
    model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda)
    preprocess_audios()
TTS/bin/resample.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import glob
3
+ import os
4
+ from argparse import RawTextHelpFormatter
5
+ from multiprocessing import Pool
6
+ from shutil import copytree
7
+
8
+ import librosa
9
+ import soundfile as sf
10
+ from tqdm import tqdm
11
+
12
+
13
def resample_file(func_args):
    """Resample one audio file and overwrite it in place.

    Args:
        func_args: Tuple ``(filename, output_sr)``; packed into a single
            argument so the function can be mapped over a worker pool.
    """
    path, target_sr = func_args
    audio, sample_rate = librosa.load(path, sr=target_sr)
    sf.write(path, audio, sample_rate)
17
+
18
+
19
def resample_files(input_dir, output_sr, output_dir=None, file_ext="wav", n_jobs=10):
    """Resample every matching audio file below a directory.

    Args:
        input_dir: Directory that is searched recursively.
        output_sr: Target sample rate.
        output_dir: If given, ``input_dir`` is first copied there and the copy
            is resampled; otherwise the files are overwritten in place.
        file_ext: Extension of the files to resample, without the dot.
        n_jobs: Number of worker processes handed to ``Pool``.
    """
    if output_dir:
        print("Recursively copying the input folder...")
        copytree(input_dir, output_dir)
        # from here on work on the copy only
        input_dir = output_dir

    print("Resampling the audio files...")
    pattern = os.path.join(input_dir, f"**/*.{file_ext}")
    found = glob.glob(pattern, recursive=True)
    print(f"Found {len(found)} files...")

    jobs = [(path, output_sr) for path in found]
    with Pool(processes=n_jobs) as pool:
        with tqdm(total=len(jobs)) as pbar:
            # results are ignored; the loop only drives the progress bar
            for _ in pool.imap_unordered(resample_file, jobs):
                pbar.update()

    print("Done !")
35
+
36
+
37
if __name__ == "__main__":
    # Help texts corrected: the workers are processes (multiprocessing.Pool),
    # not threads, and two typos are fixed.
    parser = argparse.ArgumentParser(
        description="""Resample a folder recursively with librosa
Can be used in place or create a copy of the folder as an output.\n\n
Example run:
    python TTS/bin/resample.py
        --input_dir /root/LJSpeech-1.1/
        --output_sr 22050
        --output_dir /root/resampled_LJSpeech-1.1/
        --file_ext wav
        --n_jobs 24
""",
        formatter_class=RawTextHelpFormatter,
    )

    parser.add_argument(
        "--input_dir",
        type=str,
        default=None,
        required=True,
        help="Path of the folder containing the audio files to resample",
    )

    parser.add_argument(
        "--output_sr",
        type=int,
        default=22050,
        required=False,
        help="Sample rate to which the audio files should be resampled",
    )

    parser.add_argument(
        "--output_dir",
        type=str,
        default=None,
        required=False,
        help="Path of the destination folder. If not defined, the operation is done in place",
    )

    parser.add_argument(
        "--file_ext",
        type=str,
        default="wav",
        required=False,
        help="Extension of the audio files to resample",
    )

    parser.add_argument(
        "--n_jobs", type=int, default=None, help="Number of worker processes to use, by default it uses all cores"
    )

    args = parser.parse_args()

    resample_files(args.input_dir, args.output_sr, args.output_dir, args.file_ext, args.n_jobs)
TTS/bin/synthesize.py ADDED
@@ -0,0 +1,418 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse
5
+ import sys
6
+ from argparse import RawTextHelpFormatter
7
+
8
+ # pylint: disable=redefined-outer-name, unused-argument
9
+ from pathlib import Path
10
+
11
+ from TTS.utils.manage import ModelManager
12
+ from TTS.utils.synthesizer import Synthesizer
13
+
14
+
15
def str2bool(v):
    """Convert a command-line value to a bool, for use as an argparse ``type``.

    Args:
        v: A bool, which is returned unchanged, or a string that is compared
            case-insensitively.

    Returns:
        True for "yes", "true", "t", "y", "1"; False for "no", "false", "f",
        "n", "0".

    Raises:
        argparse.ArgumentTypeError: For any other string.
    """
    if isinstance(v, bool):
        return v

    lookup = dict.fromkeys(("yes", "true", "t", "y", "1"), True)
    lookup.update(dict.fromkeys(("no", "false", "f", "n", "0"), False))

    outcome = lookup.get(v.lower())
    if outcome is None:
        raise argparse.ArgumentTypeError("Boolean value expected.")
    return outcome
23
+
24
+
25
+ def main():
26
+ description = """Synthesize speech on command line.
27
+
28
+ You can either use your trained model or choose a model from the provided list.
29
+
30
+ If you don't specify any models, then it uses LJSpeech based English model.
31
+
32
+ ## Example Runs
33
+
34
+ ### Single Speaker Models
35
+
36
+ - List provided models:
37
+
38
+ ```
39
+ $ tts --list_models
40
+ ```
41
+
42
+ - Query info for model info by idx:
43
+
44
+ ```
45
+ $ tts --model_info_by_idx "<model_type>/<model_query_idx>"
46
+ ```
47
+
48
+ - Query info for model info by full name:
49
+
50
+ ```
51
+ $ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
52
+ ```
53
+
54
+ - Run TTS with default models:
55
+
56
+ ```
57
+ $ tts --text "Text for TTS"
58
+ ```
59
+
60
+ - Run a TTS model with its default vocoder model:
61
+
62
+ ```
63
+ $ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>
64
+ ```
65
+
66
+ - Run with specific TTS and vocoder models from the list:
67
+
68
+ ```
69
+ $ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --vocoder_name "<model_type>/<language>/<dataset>/<model_name>" --output_path
70
+ ```
71
+
72
+ - Run your own TTS model (Using Griffin-Lim Vocoder):
73
+
74
+ ```
75
+ $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
76
+ ```
77
+
78
+ - Run your own TTS and Vocoder models:
79
+ ```
80
+ $ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth --out_path output/path/speech.wav
81
+ --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
82
+ ```
83
+
84
+ ### Multi-speaker Models
85
+
86
+ - List the available speakers and choose as <speaker_id> among them:
87
+
88
+ ```
89
+ $ tts --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
90
+ ```
91
+
92
+ - Run the multi-speaker TTS model with the target speaker ID:
93
+
94
+ ```
95
+ $ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
96
+ ```
97
+
98
+ - Run your own multi-speaker TTS model:
99
+
100
+ ```
101
+ $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
102
+ ```
103
+
104
+ ### Voice Conversion Models
105
+
106
+ ```
107
+ $ tts --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --source_wav <path/to/speaker/wav> --target_wav <path/to/reference/wav>
108
+ ```
109
+ """
110
+ # We remove Markdown code formatting programmatically here to allow us to copy-and-paste from main README to keep
111
+ # documentation in sync more easily.
112
+ parser = argparse.ArgumentParser(
113
+ description=description.replace(" ```\n", ""),
114
+ formatter_class=RawTextHelpFormatter,
115
+ )
116
+
117
+ parser.add_argument(
118
+ "--list_models",
119
+ type=str2bool,
120
+ nargs="?",
121
+ const=True,
122
+ default=False,
123
+ help="list available pre-trained TTS and vocoder models.",
124
+ )
125
+
126
+ parser.add_argument(
127
+ "--model_info_by_idx",
128
+ type=str,
129
+ default=None,
130
+ help="model info using query format: <model_type>/<model_query_idx>",
131
+ )
132
+
133
+ parser.add_argument(
134
+ "--model_info_by_name",
135
+ type=str,
136
+ default=None,
137
+ help="model info using query format: <model_type>/<language>/<dataset>/<model_name>",
138
+ )
139
+
140
+ parser.add_argument("--text", type=str, default=None, help="Text to generate speech.")
141
+
142
+ # Args for running pre-trained TTS models.
143
+ parser.add_argument(
144
+ "--model_name",
145
+ type=str,
146
+ default="tts_models/en/ljspeech/tacotron2-DDC",
147
+ help="Name of one of the pre-trained TTS models in format <language>/<dataset>/<model_name>",
148
+ )
149
+ parser.add_argument(
150
+ "--vocoder_name",
151
+ type=str,
152
+ default=None,
153
+ help="Name of one of the pre-trained vocoder models in format <language>/<dataset>/<model_name>",
154
+ )
155
+
156
+ # Args for running custom models
157
+ parser.add_argument("--config_path", default=None, type=str, help="Path to model config file.")
158
+ parser.add_argument(
159
+ "--model_path",
160
+ type=str,
161
+ default=None,
162
+ help="Path to model file.",
163
+ )
164
+ parser.add_argument(
165
+ "--out_path",
166
+ type=str,
167
+ default="tts_output.wav",
168
+ help="Output wav file path.",
169
+ )
170
+ parser.add_argument("--use_cuda", type=bool, help="Run model on CUDA.", default=False)
171
+ parser.add_argument(
172
+ "--vocoder_path",
173
+ type=str,
174
+ help="Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
175
+ default=None,
176
+ )
177
+ parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None)
178
+ parser.add_argument(
179
+ "--encoder_path",
180
+ type=str,
181
+ help="Path to speaker encoder model file.",
182
+ default=None,
183
+ )
184
+ parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None)
185
+
186
+ # args for multi-speaker synthesis
187
+ parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
188
+ parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None)
189
+ parser.add_argument(
190
+ "--speaker_idx",
191
+ type=str,
192
+ help="Target speaker ID for a multi-speaker TTS model.",
193
+ default=None,
194
+ )
195
+ parser.add_argument(
196
+ "--language_idx",
197
+ type=str,
198
+ help="Target language ID for a multi-lingual TTS model.",
199
+ default=None,
200
+ )
201
+ parser.add_argument(
202
+ "--speaker_wav",
203
+ nargs="+",
204
+ help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The d_vectors is computed as their average.",
205
+ default=None,
206
+ )
207
+ parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
208
+ parser.add_argument(
209
+ "--capacitron_style_wav", type=str, help="Wav path file for Capacitron prosody reference.", default=None
210
+ )
211
+ parser.add_argument("--capacitron_style_text", type=str, help="Transcription of the reference.", default=None)
212
+ parser.add_argument(
213
+ "--list_speaker_idxs",
214
+ help="List available speaker ids for the defined multi-speaker model.",
215
+ type=str2bool,
216
+ nargs="?",
217
+ const=True,
218
+ default=False,
219
+ )
220
+ parser.add_argument(
221
+ "--list_language_idxs",
222
+ help="List available language ids for the defined multi-lingual model.",
223
+ type=str2bool,
224
+ nargs="?",
225
+ const=True,
226
+ default=False,
227
+ )
228
+ # aux args
229
+ parser.add_argument(
230
+ "--save_spectogram",
231
+ type=bool,
232
+ help="If true save raw spectogram for further (vocoder) processing in out_path.",
233
+ default=False,
234
+ )
235
+ parser.add_argument(
236
+ "--reference_wav",
237
+ type=str,
238
+ help="Reference wav file to convert in the voice of the speaker_idx or speaker_wav",
239
+ default=None,
240
+ )
241
+ parser.add_argument(
242
+ "--reference_speaker_idx",
243
+ type=str,
244
+ help="speaker ID of the reference_wav speaker (If not provided the embedding will be computed using the Speaker Encoder).",
245
+ default=None,
246
+ )
247
+ parser.add_argument(
248
+ "--progress_bar",
249
+ type=str2bool,
250
+ help="If true shows a progress bar for the model download. Defaults to True",
251
+ default=True,
252
+ )
253
+
254
+ # voice conversion args
255
+ parser.add_argument(
256
+ "--source_wav",
257
+ type=str,
258
+ default=None,
259
+ help="Original audio file to convert in the voice of the target_wav",
260
+ )
261
+ parser.add_argument(
262
+ "--target_wav",
263
+ type=str,
264
+ default=None,
265
+ help="Target audio file to convert in the voice of the source_wav",
266
+ )
267
+
268
+ args = parser.parse_args()
269
+
270
+ # print the description if either text or list_models is not set
271
+ check_args = [
272
+ args.text,
273
+ args.list_models,
274
+ args.list_speaker_idxs,
275
+ args.list_language_idxs,
276
+ args.reference_wav,
277
+ args.model_info_by_idx,
278
+ args.model_info_by_name,
279
+ args.source_wav,
280
+ args.target_wav,
281
+ ]
282
+ if not any(check_args):
283
+ parser.parse_args(["-h"])
284
+
285
+ # load model manager
286
+ path = Path(__file__).parent / "../.models.json"
287
+ manager = ModelManager(path, progress_bar=args.progress_bar)
288
+
289
+ tts_path = None
290
+ tts_config_path = None
291
+ speakers_file_path = None
292
+ language_ids_file_path = None
293
+ vocoder_path = None
294
+ vocoder_config_path = None
295
+ encoder_path = None
296
+ encoder_config_path = None
297
+ vc_path = None
298
+ vc_config_path = None
299
+
300
+ # CASE1 #list : list pre-trained TTS models
301
+ if args.list_models:
302
+ manager.list_models()
303
+ sys.exit()
304
+
305
+ # CASE2 #info : model info for pre-trained TTS models
306
+ if args.model_info_by_idx:
307
+ model_query = args.model_info_by_idx
308
+ manager.model_info_by_idx(model_query)
309
+ sys.exit()
310
+
311
+ if args.model_info_by_name:
312
+ model_query_full_name = args.model_info_by_name
313
+ manager.model_info_by_full_name(model_query_full_name)
314
+ sys.exit()
315
+
316
+ # CASE3: load pre-trained model paths
317
+ if args.model_name is not None and not args.model_path:
318
+ model_path, config_path, model_item = manager.download_model(args.model_name)
319
+
320
+ # tts model
321
+ if model_item["model_type"] == "tts_models":
322
+ tts_path = model_path
323
+ tts_config_path = config_path
324
+ if "default_vocoder" in model_item:
325
+ args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name
326
+
327
+ # voice conversion model
328
+ if model_item["model_type"] == "voice_conversion_models":
329
+ vc_path = model_path
330
+ vc_config_path = config_path
331
+
332
+ # load vocoder
333
+ if args.vocoder_name is not None and not args.vocoder_path:
334
+ vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)
335
+
336
+ # CASE4: set custom model paths
337
+ if args.model_path is not None:
338
+ tts_path = args.model_path
339
+ tts_config_path = args.config_path
340
+ speakers_file_path = args.speakers_file_path
341
+ language_ids_file_path = args.language_ids_file_path
342
+
343
+ if args.vocoder_path is not None:
344
+ vocoder_path = args.vocoder_path
345
+ vocoder_config_path = args.vocoder_config_path
346
+
347
+ if args.encoder_path is not None:
348
+ encoder_path = args.encoder_path
349
+ encoder_config_path = args.encoder_config_path
350
+
351
+ # load models
352
+ synthesizer = Synthesizer(
353
+ tts_path,
354
+ tts_config_path,
355
+ speakers_file_path,
356
+ language_ids_file_path,
357
+ vocoder_path,
358
+ vocoder_config_path,
359
+ encoder_path,
360
+ encoder_config_path,
361
+ vc_path,
362
+ vc_config_path,
363
+ args.use_cuda,
364
+ )
365
+
366
+ # query speaker ids of a multi-speaker model.
367
+ if args.list_speaker_idxs:
368
+ print(
369
+ " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
370
+ )
371
+ print(synthesizer.tts_model.speaker_manager.name_to_id)
372
+ return
373
+
374
+ # query langauge ids of a multi-lingual model.
375
+ if args.list_language_idxs:
376
+ print(
377
+ " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
378
+ )
379
+ print(synthesizer.tts_model.language_manager.name_to_id)
380
+ return
381
+
382
+ # check the arguments against a multi-speaker model.
383
+ if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
384
+ print(
385
+ " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
386
+ "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
387
+ )
388
+ return
389
+
390
+ # RUN THE SYNTHESIS
391
+ if args.text:
392
+ print(" > Text: {}".format(args.text))
393
+
394
+ # kick it
395
+ if tts_path is not None:
396
+ wav = synthesizer.tts(
397
+ args.text,
398
+ args.speaker_idx,
399
+ args.language_idx,
400
+ args.speaker_wav,
401
+ reference_wav=args.reference_wav,
402
+ style_wav=args.capacitron_style_wav,
403
+ style_text=args.capacitron_style_text,
404
+ reference_speaker_name=args.reference_speaker_idx,
405
+ )
406
+ elif vc_path is not None:
407
+ wav = synthesizer.voice_conversion(
408
+ source_wav=args.source_wav,
409
+ target_wav=args.target_wav,
410
+ )
411
+
412
+ # save the results
413
+ print(" > Saving output to {}".format(args.out_path))
414
+ synthesizer.save_wav(wav, args.out_path)
415
+
416
+
417
+ if __name__ == "__main__":
418
+ main()
TTS/bin/train_encoder.py ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import os
5
+ import sys
6
+ import time
7
+ import traceback
8
+
9
+ import torch
10
+ from torch.utils.data import DataLoader
11
+ from trainer.torch import NoamLR
12
+ from trainer.trainer_utils import get_optimizer
13
+
14
+ from TTS.encoder.dataset import EncoderDataset
15
+ from TTS.encoder.utils.generic_utils import save_best_model, save_checkpoint, setup_encoder_model
16
+ from TTS.encoder.utils.training import init_training
17
+ from TTS.encoder.utils.visual import plot_embeddings
18
+ from TTS.tts.datasets import load_tts_samples
19
+ from TTS.utils.audio import AudioProcessor
20
+ from TTS.utils.generic_utils import count_parameters, remove_experiment_folder
21
+ from TTS.utils.io import copy_model_files
22
+ from TTS.utils.samplers import PerfectBatchSampler
23
+ from TTS.utils.training import check_update
24
+
25
+ torch.backends.cudnn.enabled = True
26
+ torch.backends.cudnn.benchmark = True
27
+ torch.manual_seed(54321)
28
+ use_cuda = torch.cuda.is_available()
29
+ num_gpus = torch.cuda.device_count()
30
+ print(" > Using CUDA: ", use_cuda)
31
+ print(" > Number of GPUs: ", num_gpus)
32
+
33
+
34
def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False):
    """Build the encoder ``DataLoader`` for either the train or the eval split.

    Reads the module-level config ``c`` and the module-level sample lists
    ``meta_data_train`` / ``meta_data_eval``. For the eval split it also reads
    the module-level ``train_classes``.

    Args:
        ap (AudioProcessor): audio processor handed to ``EncoderDataset``.
        is_val (bool): build the eval loader when True, the train loader otherwise.
        verbose (bool): forwarded to ``EncoderDataset``.

    Raises:
        RuntimeError: the split has fewer classes than requested per batch.

    Returns:
        tuple: ``(loader, classes, map from class id to class name)``.
    """
    # pick the split-specific settings in one place
    if is_val:
        num_utter_per_class = c.eval_num_utter_per_class
        num_classes_in_batch = c.eval_num_classes_in_batch
        samples = meta_data_eval
        augmentation = None  # no augmentation on the eval split
    else:
        num_utter_per_class = c.num_utter_per_class
        num_classes_in_batch = c.num_classes_in_batch
        samples = meta_data_train
        augmentation = c.audio_augmentation

    dataset = EncoderDataset(
        c,
        ap,
        samples,
        voice_len=c.voice_len,
        num_utter_per_class=num_utter_per_class,
        num_classes_in_batch=num_classes_in_batch,
        verbose=verbose,
        augmentation_config=augmentation,
        use_torch_spec=c.model_params.get("use_torch_spec", False),
    )
    classes = dataset.get_class_list()

    # every batch holds `num_classes_in_batch` classes with `num_utter_per_class` items each
    total_batch_size = num_classes_in_batch * num_utter_per_class
    sampler = PerfectBatchSampler(
        dataset.items,
        classes,
        batch_size=total_batch_size,
        num_classes_in_batch=num_classes_in_batch,
        num_gpus=1,
        shuffle=not is_val,
        drop_last=True,
    )

    if len(classes) < num_classes_in_batch:
        if is_val:
            raise RuntimeError(
                f"config.eval_num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Eval dataset) !"
            )
        raise RuntimeError(
            f"config.num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Train dataset) !"
        )

    # reuse the training classes so class ids stay aligned when the eval split
    # has a different number of classes
    if is_val:
        dataset.set_classes(train_classes)

    loader = DataLoader(
        dataset,
        num_workers=c.num_loader_workers,
        batch_sampler=sampler,
        collate_fn=dataset.collate_fn,
    )
    return loader, classes, dataset.get_map_classid_to_classname()
83
+
84
+
85
def evaluation(model, criterion, data_loader, global_step):
    """Run one pass over the eval loader and log the average loss and a UMAP plot.

    Reads the module-level ``c``, ``use_cuda`` and ``dashboard_logger``.

    Args:
        model: encoder model, called as ``model(inputs)``.
        criterion: loss, called as ``criterion(embeddings, labels)``.
        data_loader: iterable yielding ``(inputs, labels)`` batches.
        global_step (int): step under which the stats and figures are logged.

    Returns:
        float: loss averaged over the batches of ``data_loader``.
    """
    num_classes = c.eval_num_classes_in_batch
    num_utters = c.eval_num_utter_per_class

    eval_loss = 0
    for inputs, labels in data_loader:
        with torch.no_grad():
            # group the samples of each class in the batch: the perfect sampler
            # produces [3,2,1,3,2,1] and we need [3,3,2,2,1,1]
            labels = torch.transpose(labels.view(num_utters, num_classes), 0, 1).reshape(labels.shape)
            inputs = torch.transpose(inputs.view(num_utters, num_classes, -1), 0, 1).reshape(inputs.shape)

            # dispatch data to GPU
            if use_cuda:
                inputs = inputs.cuda(non_blocking=True)
                labels = labels.cuda(non_blocking=True)

            # forward pass model
            outputs = model(inputs)

            # loss computation
            loss = criterion(outputs.view(num_classes, outputs.shape[0] // num_classes, -1), labels)

            eval_loss += loss.item()

    eval_avg_loss = eval_loss / len(data_loader)
    # save stats
    dashboard_logger.eval_stats(global_step, {"loss": eval_avg_loss})
    # plot the last batch in the evaluation; it is an eval batch, so it must be
    # described with the eval class count rather than the training one
    figures = {
        "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), num_classes),
    }
    dashboard_logger.eval_figures(global_step, figures)
    return eval_avg_loss
124
+
125
+
126
def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, global_step):
    """Train the encoder for ``c.epochs`` epochs, evaluating after each epoch when enabled.

    Reads the module-level ``c``, ``use_cuda``, ``dashboard_logger`` and ``OUT_PATH``.

    Args:
        model: encoder model.
        optimizer: optimizer stepped once per batch.
        scheduler: LR scheduler, stepped once per batch when ``c.lr_decay`` is set.
        criterion: loss, called as ``criterion(embeddings, labels)``.
        data_loader: training loader yielding ``(inputs, labels)`` batches.
        eval_data_loader: eval loader, used only when ``c.run_eval`` is set.
        global_step (int): step counter to continue from.

    Returns:
        tuple: ``(best_loss, global_step)``.
    """
    model.train()
    best_loss = float("inf")
    avg_loader_time = 0
    end_time = time.time()

    for epoch in range(c.epochs):
        tot_loss = 0
        epoch_time = 0

        for batch in data_loader:
            start_time = time.time()

            # setup input data
            inputs, labels = batch
            # group the samples of each class in the batch: the perfect sampler
            # produces [3,2,1,3,2,1] and we need [3,3,2,2,1,1]
            # ToDo: cover this regrouping with a unit test
            grouped_labels = labels.view(c.num_utter_per_class, c.num_classes_in_batch)
            labels = torch.transpose(grouped_labels, 0, 1).reshape(labels.shape)
            grouped_inputs = inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1)
            inputs = torch.transpose(grouped_inputs, 0, 1).reshape(inputs.shape)

            loader_time = time.time() - end_time
            global_step += 1

            # setup lr
            if c.lr_decay:
                scheduler.step()
            optimizer.zero_grad()

            # dispatch data to GPU
            if use_cuda:
                inputs = inputs.cuda(non_blocking=True)
                labels = labels.cuda(non_blocking=True)

            # forward pass model
            outputs = model(inputs)

            # loss computation
            embeddings = outputs.view(c.num_classes_in_batch, outputs.shape[0] // c.num_classes_in_batch, -1)
            loss = criterion(embeddings, labels)
            loss.backward()
            grad_norm, _ = check_update(model, c.grad_clip)
            optimizer.step()

            step_time = time.time() - start_time
            epoch_time += step_time

            # accumulate the total epoch loss
            loss_value = loss.item()
            tot_loss += loss_value

            # running average of the loader time, weighted by the worker count
            worker_count = c.num_loader_workers if c.num_loader_workers > 0 else 1
            if avg_loader_time != 0:
                avg_loader_time = (
                    1 / worker_count * loader_time + (worker_count - 1) / worker_count * avg_loader_time
                )
            else:
                avg_loader_time = loader_time
            current_lr = optimizer.param_groups[0]["lr"]

            if global_step % c.steps_plot_stats == 0:
                # plot training stats
                dashboard_logger.train_epoch_stats(
                    global_step,
                    {
                        "loss": loss_value,
                        "lr": current_lr,
                        "grad_norm": grad_norm,
                        "step_time": step_time,
                        "avg_loader_time": avg_loader_time,
                    },
                )
                umap_figure = plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch)
                dashboard_logger.train_figures(global_step, {"UMAP Plot": umap_figure})

            if global_step % c.print_step == 0:
                print(
                    " | > Step:{} Loss:{:.5f} GradNorm:{:.5f} "
                    "StepTime:{:.2f} LoaderTime:{:.2f} AvGLoaderTime:{:.2f} LR:{:.6f}".format(
                        global_step, loss_value, grad_norm, step_time, loader_time, avg_loader_time, current_lr
                    ),
                    flush=True,
                )

            if global_step % c.save_step == 0:
                # save model
                save_checkpoint(model, optimizer, criterion, loss_value, OUT_PATH, global_step, epoch)

            end_time = time.time()

        print("")
        print(
            ">>> Epoch:{} AvgLoss: {:.5f} GradNorm:{:.5f} "
            "EpochTime:{:.2f} AvGLoaderTime:{:.2f} ".format(
                epoch, tot_loss / len(data_loader), grad_norm, epoch_time, avg_loader_time
            ),
            flush=True,
        )

        # evaluation
        if c.run_eval:
            model.eval()
            eval_loss = evaluation(model, criterion, eval_data_loader, global_step)
            print("\n\n")
            print("--> EVAL PERFORMANCE")
            print(
                " | > Epoch:{} AvgLoss: {:.5f} ".format(epoch, eval_loss),
                flush=True,
            )
            # save the best checkpoint
            best_loss = save_best_model(model, optimizer, criterion, eval_loss, best_loss, OUT_PATH, global_step, epoch)
            model.train()

    return best_loss, global_step
252
+
253
+
254
def main(args):  # pylint: disable=redefined-outer-name
    """Set up data, model, loss and scheduler from the config ``c``, then run training.

    Publishes ``meta_data_train``, ``meta_data_eval`` and ``train_classes`` as
    module-level names because ``setup_loader`` reads them from there.

    Args:
        args: parsed training arguments; ``restore_path`` is read and
            ``restore_step`` is written.
    """
    # pylint: disable=global-variable-undefined
    global meta_data_train
    global meta_data_eval
    global train_classes

    ap = AudioProcessor(**c.audio)
    model = setup_encoder_model(c)
    optimizer = get_optimizer(c.optimizer, c.optimizer_params, c.lr, model)

    # pylint: disable=redefined-outer-name
    meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=True)

    # the train loader must come first: the eval loader reuses `train_classes`
    train_data_loader, train_classes, map_classid_to_classname = setup_loader(ap, is_val=False, verbose=True)
    eval_data_loader = None
    if c.run_eval:
        eval_data_loader, _, _ = setup_loader(ap, is_val=True, verbose=True)

    criterion = model.get_criterion(c, len(train_classes))

    if c.loss == "softmaxproto" and c.model != "speaker_encoder":
        # keep the class mapping inside the config written to the output folder
        c.map_classid_to_classname = map_classid_to_classname
        copy_model_files(c, OUT_PATH)

    if not args.restore_path:
        args.restore_step = 0
    else:
        criterion, args.restore_step = model.load_checkpoint(
            c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion
        )
        print(" > Model restored from step %d" % args.restore_step, flush=True)

    scheduler = None
    if c.lr_decay:
        scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps, last_epoch=args.restore_step - 1)

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    if use_cuda:
        model = model.cuda()
        criterion.cuda()

    _, global_step = train(
        model, optimizer, scheduler, criterion, train_data_loader, eval_data_loader, args.restore_step
    )
303
+
304
+
305
if __name__ == "__main__":
    # these names are read as module-level globals by the functions above
    args, c, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = init_training()

    try:
        main(args)
    except KeyboardInterrupt:
        # interrupted by the user: drop the experiment folder and leave at once
        remove_experiment_folder(OUT_PATH)
        os._exit(0)  # pylint: disable=protected-access
    except Exception:  # pylint: disable=broad-except
        # any other failure: drop the experiment folder, show the error, exit non-zero
        remove_experiment_folder(OUT_PATH)
        traceback.print_exc()
        sys.exit(1)
TTS/bin/train_tts.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dataclasses import dataclass, field
3
+
4
+ from trainer import Trainer, TrainerArgs
5
+
6
+ from TTS.config import load_config, register_config
7
+ from TTS.tts.datasets import load_tts_samples
8
+ from TTS.tts.models import setup_model
9
+
10
+
11
@dataclass
class TrainTTSArgs(TrainerArgs):
    """``TrainerArgs`` extended with the path of the model config file."""

    config_path: str = field(
        default=None,
        metadata={"help": "Path to the config file."},
    )
14
+
15
+
16
def main():
    """Run `tts` model training directly by a `config.json` file."""
    # build the trainer args and let the command line override them;
    # whatever the parser does not know is kept as config overrides
    train_args = TrainTTSArgs()
    parser = train_args.init_argparse(arg_prefix="")
    args, config_overrides = parser.parse_known_args()
    train_args.parse_args(args)

    # an explicit config file wins over the config of a previous experiment
    if args.config_path:
        config_file = args.config_path
    elif args.continue_path:
        config_file = os.path.join(args.continue_path, "config.json")
    else:
        config_file = None

    if config_file is not None:
        config = load_config(config_file)
        if config_overrides:
            config.parse_known_args(config_overrides, relaxed_parser=True)
    else:
        # no file given: build the config from console args only
        from TTS.config.shared_configs import BaseTrainingConfig  # pylint: disable=import-outside-toplevel

        config_base = BaseTrainingConfig()
        config_base.parse_known_args(config_overrides)
        config = register_config(config_base.model)()

    # load training samples
    train_samples, eval_samples = load_tts_samples(
        config.datasets,
        eval_split=True,
        eval_split_max_size=config.eval_split_max_size,
        eval_split_size=config.eval_split_size,
    )

    # init the model from config
    model = setup_model(config, train_samples + eval_samples)

    # init the trainer and start training
    trainer = Trainer(
        train_args,
        model.config,
        config.output_path,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
        parse_command_line_args=False,
    )
    trainer.fit()
68
+
69
+
70
+ if __name__ == "__main__":
71
+ main()
TTS/bin/train_vocoder.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dataclasses import dataclass, field
3
+
4
+ from trainer import Trainer, TrainerArgs
5
+
6
+ from TTS.config import load_config, register_config
7
+ from TTS.utils.audio import AudioProcessor
8
+ from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
9
+ from TTS.vocoder.models import setup_model
10
+
11
+
12
@dataclass
class TrainVocoderArgs(TrainerArgs):
    """``TrainerArgs`` extended with the path of the model config file."""

    config_path: str = field(
        default=None,
        metadata={"help": "Path to the config file."},
    )
15
+
16
+
17
def main():
    """Run vocoder model training directly by a `config.json` file.

    The config comes from ``--config_path``, else from ``config.json`` inside
    ``--continue_path``, else it is built from the remaining console args.
    """
    # build the trainer args and let the command line override them;
    # whatever the parser does not know is kept as config overrides
    train_args = TrainVocoderArgs()
    parser = train_args.init_argparse(arg_prefix="")
    args, config_overrides = parser.parse_known_args()
    train_args.parse_args(args)

    # an explicit config file wins over the config of a previous experiment
    if args.config_path:
        config_file = args.config_path
    elif args.continue_path:
        config_file = os.path.join(args.continue_path, "config.json")
    else:
        config_file = None

    if config_file is not None:
        config = load_config(config_file)
        if config_overrides:
            config.parse_known_args(config_overrides, relaxed_parser=True)
    else:
        # no file given: build the config from console args only
        from TTS.config.shared_configs import BaseTrainingConfig  # pylint: disable=import-outside-toplevel

        config_base = BaseTrainingConfig()
        config_base.parse_known_args(config_overrides)
        config = register_config(config_base.model)()

    # load training samples; both loaders return the eval split first
    if "feature_path" in config and config.feature_path:
        # load pre-computed features
        print(f" > Loading features from: {config.feature_path}")
        eval_samples, train_samples = load_wav_feat_data(config.data_path, config.feature_path, config.eval_split_size)
    else:
        # load data raw wav files
        eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)

    # setup audio processor
    ap = AudioProcessor(**config.audio)

    # init the model from config
    model = setup_model(config)

    # init the trainer and start training
    trainer = Trainer(
        train_args,
        config,
        config.output_path,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
        training_assets={"audio_processor": ap},
        parse_command_line_args=False,
    )
    trainer.fit()
74
+
75
+
76
+ if __name__ == "__main__":
77
+ main()
TTS/bin/tune_wavegrad.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Search a good noise schedule for WaveGrad for a given number of inference iterations"""
2
+ import argparse
3
+ from itertools import product as cartesian_product
4
+
5
+ import numpy as np
6
+ import torch
7
+ from torch.utils.data import DataLoader
8
+ from tqdm import tqdm
9
+
10
+ from TTS.config import load_config
11
+ from TTS.utils.audio import AudioProcessor
12
+ from TTS.vocoder.datasets.preprocess import load_wav_data
13
+ from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset
14
+ from TTS.vocoder.models import setup_model
15
+
16
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # NOTE(review): --model_path is parsed but never read below, so the search runs on
    # the model exactly as `setup_model(config)` returns it - confirm whether a
    # checkpoint load is missing here.
    parser.add_argument("--model_path", type=str, help="Path to model checkpoint.")
    # the arguments below are used unconditionally, so fail early with a clear
    # argparse error instead of a TypeError further down
    parser.add_argument("--config_path", type=str, required=True, help="Path to model config file.")
    parser.add_argument("--data_path", type=str, required=True, help="Path to data directory.")
    parser.add_argument(
        "--output_path",
        type=str,
        required=True,
        help="path for output file including file name and extension.",
    )
    parser.add_argument(
        "--num_iter",
        type=int,
        required=True,
        help="Number of model inference iterations that you like to optimize noise schedule for.",
    )
    parser.add_argument("--use_cuda", action="store_true", help="enable CUDA.")
    parser.add_argument("--num_samples", type=int, default=1, help="Number of datasamples used for inference.")
    parser.add_argument(
        "--search_depth",
        type=int,
        default=3,
        help="Search granularity. Increasing this increases the run-time exponentially.",
    )

    # load config
    args = parser.parse_args()
    config = load_config(args.config_path)

    # setup audio processor
    ap = AudioProcessor(**config.audio)

    # load dataset
    _, train_data = load_wav_data(args.data_path, 0)
    train_data = train_data[: args.num_samples]
    dataset = WaveGradDataset(
        ap=ap,
        items=train_data,
        seq_len=-1,
        hop_len=ap.hop_length,
        pad_short=config.pad_short,
        conv_pad=config.conv_pad,
        is_training=True,
        return_segments=False,
        use_noise_augment=False,
        use_cache=False,
        verbose=True,
    )
    loader = DataLoader(
        dataset,
        batch_size=1,
        shuffle=False,
        collate_fn=dataset.collate_full_clips,
        drop_last=False,
        num_workers=config.num_loader_workers,
        pin_memory=False,
    )

    # setup the model
    model = setup_model(config)
    if args.use_cuda:
        model.cuda()

    # setup optimization parameters
    base_values = sorted(10 * np.random.uniform(size=args.search_depth))
    print(f" > base values: {base_values}")
    exponents = 10 ** np.linspace(-6, -1, num=args.num_iter)
    best_error = float("inf")
    best_schedule = None  # pylint: disable=C0103
    total_search_iter = len(base_values) ** args.num_iter

    # pure inference: no autograd bookkeeping is needed during the search
    with torch.no_grad():
        for base in tqdm(cartesian_product(base_values, repeat=args.num_iter), total=total_search_iter):
            beta = exponents * base
            model.compute_noise_level(beta)
            for mel, audio in loader:
                y_hat = model.inference(mel.cuda() if args.use_cuda else mel)

                if args.use_cuda:
                    y_hat = y_hat.cpu()
                y_hat = y_hat.numpy()

                mel_hat = []
                for i in range(y_hat.shape[0]):
                    m = ap.melspectrogram(y_hat[i, 0])[:, :-1]
                    mel_hat.append(torch.from_numpy(m))

                mel_hat = torch.stack(mel_hat)
                # summed squared error; taking the mean of this scalar would change nothing
                error = torch.sum((mel - mel_hat) ** 2).item()
                if error < best_error:
                    best_error = error
                    best_schedule = {"beta": beta}
                    print(f" > Found a better schedule. - MSE: {error}")
                    # persist every improvement so an interrupted search keeps its best result
                    np.save(args.output_path, best_schedule)
TTS/config/__init__.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import re
4
+ from typing import Dict
5
+
6
+ import fsspec
7
+ import yaml
8
+ from coqpit import Coqpit
9
+
10
+ from TTS.config.shared_configs import *
11
+ from TTS.utils.generic_utils import find_module
12
+
13
+
14
def read_json_with_comments(json_path):
    """Load a JSON file that may contain ``//`` comments (kept for backward compat).

    Backslash-newline line continuations are joined and ``//`` comments are
    removed before parsing. String literals are left untouched, so values that
    contain ``//`` (e.g. URLs) survive.

    Args:
        json_path (str): path handed to ``fsspec.open``.

    Raises:
        json.JSONDecodeError: the content is still not valid JSON after stripping.

    Returns:
        The parsed JSON content.
    """
    with fsspec.open(json_path, "r", encoding="utf-8") as f:
        input_str = f.read()
    # join lines that were continued with a trailing backslash
    input_str = re.sub(r"\\\n", "", input_str)
    # A string literal is matched first and put back as is; anything else that
    # starts with `//` is a comment and is dropped up to (not including) the end
    # of the line, which also covers a comment on a last line without newline.
    input_str = re.sub(
        r'("(?:[^"\\]|\\.)*")|//[^\n]*',
        lambda match: match.group(1) or "",
        input_str,
    )
    return json.loads(input_str)
24
+
25
+
26
def register_config(model_name: str) -> Coqpit:
    """Look up the config class that belongs to a model name.

    Every known config package is searched for ``<model_name>_config``; when
    more than one package has it, the one listed last is returned.

    Args:
        model_name (str): Model name.

    Raises:
        ModuleNotFoundError: No matching config for the model name.

    Returns:
        Coqpit: config class.
    """
    config_name = model_name + "_config"
    found = None
    for package in ("TTS.tts.configs", "TTS.vocoder.configs", "TTS.encoder.configs", "TTS.vc.configs"):
        try:
            found = find_module(package, config_name)
        except ModuleNotFoundError:
            # not in this package, keep looking
            continue
    if found is None:
        raise ModuleNotFoundError(f" [!] Config for {model_name} cannot be found.")
    return found
49
+
50
+
51
+ def _process_model_name(config_dict: Dict) -> str:
52
+ """Format the model name as expected. It is a band-aid for the old `vocoder` model names.
53
+
54
+ Args:
55
+ config_dict (Dict): A dictionary including the config fields.
56
+
57
+ Returns:
58
+ str: Formatted modelname.
59
+ """
60
+ model_name = config_dict["model"] if "model" in config_dict else config_dict["generator_model"]
61
+ model_name = model_name.replace("_generator", "").replace("_discriminator", "")
62
+ return model_name
63
+
64
+
65
def load_config(config_path: str) -> Coqpit:
    """Load a `json` or `yaml` file as a TTS config.

    The file is first read into a `dict`; its model name selects the Config
    class, which is then instantiated and filled from the `dict`.

    Args:
        config_path (str): path to the config file.

    Raises:
        TypeError: given config file has an unknown type.

    Returns:
        Coqpit: TTS config object.
    """
    ext = os.path.splitext(config_path)[1]
    if ext == ".json":
        try:
            with fsspec.open(config_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except json.decoder.JSONDecodeError:
            # backwards compat.: retry with the comment-tolerant reader
            data = read_json_with_comments(config_path)
    elif ext in (".yml", ".yaml"):
        with fsspec.open(config_path, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f)
    else:
        raise TypeError(f" [!] Unknown config file type {ext}")

    config_dict = {}
    config_dict.update(data)

    # the model name decides which Config class to instantiate
    config_class = register_config(_process_model_name(config_dict).lower())
    config = config_class()
    config.from_dict(config_dict)
    return config
98
+
99
+
100
def check_config_and_model_args(config, arg_name, value):
    """Tell whether an argument equals `value`, preferring `config.model_args` over `config`.

    The argument is looked up in `config.model_args` first (when the config has
    one and it contains the argument) and in `config` itself otherwise.

    Returns False when the argument exists in neither place. This patches up
    the compatibility between models with and without `model_args`.

    TODO: Remove this in the future with a unified approach.
    """
    if hasattr(config, "model_args") and arg_name in config.model_args:
        return config.model_args[arg_name] == value
    if not hasattr(config, arg_name):
        return False
    return config[arg_name] == value
115
+
116
+
117
def get_from_config_or_model_args(config, arg_name):
    """Return the argument from `config.model_args` when it is there, else from `config`."""
    if hasattr(config, "model_args") and arg_name in config.model_args:
        return config.model_args[arg_name]
    return config[arg_name]
123
+
124
+
125
def get_from_config_or_model_args_with_default(config, arg_name, def_val):
    """Return the argument from `config.model_args` or `config`, or `def_val` when it is in neither."""
    if hasattr(config, "model_args") and arg_name in config.model_args:
        return config.model_args[arg_name]
    return config[arg_name] if hasattr(config, arg_name) else def_val
TTS/config/shared_configs.py ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import asdict, dataclass
2
+ from typing import List
3
+
4
+ from coqpit import Coqpit, check_argument
5
+ from trainer import TrainerConfig
6
+
7
+
8
@dataclass
class BaseAudioConfig(Coqpit):
    """Base config to define audio processing parameters. It is used to initialize
    ```TTS.utils.audio.AudioProcessor.```

    Args:
        fft_size (int):
            Number of STFT frequency levels aka. size of the linear spectrogram frame. Defaults to 1024.

        win_length (int):
            Each frame of audio is windowed by window of length ```win_length``` and then padded with zeros to match
            ```fft_size```. Defaults to 1024.

        hop_length (int):
            Number of audio samples between adjacent STFT columns. Defaults to 256.

        frame_shift_ms (int):
            Set ```hop_length``` based on milliseconds and sampling rate. Defaults to None.

        frame_length_ms (int):
            Set ```win_length``` based on milliseconds and sampling rate. Defaults to None.

        stft_pad_mode (str):
            Padding method used in STFT. 'reflect' or 'center'. Defaults to 'reflect'.

        sample_rate (int):
            Audio sampling rate. Defaults to 22050.

        resample (bool):
            Enable / Disable resampling audio to ```sample_rate```. Defaults to ```False```.

        preemphasis (float):
            Preemphasis coefficient. Defaults to 0.0.

        ref_level_db (int):
            Reference dB level to rebase the audio signal and ignore the level below. 20dB is assumed the sound of air.
            Defaults to 20.

        do_sound_norm (bool):
            Enable / Disable sound normalization to reconcile the volume differences among samples. Defaults to False.

        log_func (str):
            Numpy log function used for amplitude to DB conversion. Defaults to 'np.log10'.

        do_trim_silence (bool):
            Enable / Disable trimming silences at the beginning and the end of the audio clip. Defaults to ```True```.

        do_amp_to_db_linear (bool, optional):
            enable/disable amplitude to dB conversion of linear spectrograms. Defaults to True.

        do_amp_to_db_mel (bool, optional):
            enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True.

        pitch_fmax (float, optional):
            Maximum frequency of the F0 frames. Defaults to ```640```.

        pitch_fmin (float, optional):
            Minimum frequency of the F0 frames. Defaults to ```1```.

        trim_db (int):
            Silence threshold used for silence trimming. Defaults to 45.

        do_rms_norm (bool, optional):
            enable/disable RMS volume normalization when loading an audio file. Defaults to False.

        db_level (float, optional):
            dB level used for rms normalization. The range is -99 to 0. Defaults to None.

        power (float):
            Exponent used for expanding spectrogram levels before running Griffin Lim. It helps to reduce the
            artifacts in the synthesized voice. Defaults to 1.5.

        griffin_lim_iters (int):
            Number of Griffin Lim iterations. Defaults to 60.

        num_mels (int):
            Number of mel-basis frames that defines the frame lengths of each mel-spectrogram frame. Defaults to 80.

        mel_fmin (float):
            Min frequency level used for the mel-basis filters. ~50 for male and ~95 for female voices.
            It needs to be adjusted for a dataset. Defaults to 0.

        mel_fmax (float):
            Max frequency level used for the mel-basis filters. It needs to be adjusted for a dataset.
            Defaults to None.

        spec_gain (int):
            Gain applied when converting amplitude to DB. Defaults to 20.

        signal_norm (bool):
            enable/disable signal normalization. Defaults to True.

        min_level_db (int):
            minimum db threshold for the computed melspectrograms. Defaults to -100.

        symmetric_norm (bool):
            enable/disable symmetric normalization. If set True normalization is performed in the range [-k, k] else
            [0, k], Defaults to True.

        max_norm (float):
            ```k``` defining the normalization range. Defaults to 4.0.

        clip_norm (bool):
            enable/disable clipping the out of range values in the normalized audio signal. Defaults to True.

        stats_path (str):
            Path to the computed stats file. Defaults to None.
    """

    # stft parameters
    fft_size: int = 1024
    win_length: int = 1024
    hop_length: int = 256
    frame_shift_ms: int = None
    frame_length_ms: int = None
    stft_pad_mode: str = "reflect"
    # audio processing parameters
    sample_rate: int = 22050
    resample: bool = False
    preemphasis: float = 0.0
    ref_level_db: int = 20
    do_sound_norm: bool = False
    log_func: str = "np.log10"
    # silence trimming
    do_trim_silence: bool = True
    trim_db: int = 45
    # rms volume normalization
    do_rms_norm: bool = False
    db_level: float = None
    # griffin-lim params
    power: float = 1.5
    griffin_lim_iters: int = 60
    # mel-spec params
    num_mels: int = 80
    mel_fmin: float = 0.0
    mel_fmax: float = None
    spec_gain: int = 20
    do_amp_to_db_linear: bool = True
    do_amp_to_db_mel: bool = True
    # f0 params
    pitch_fmax: float = 640.0
    pitch_fmin: float = 1.0
    # normalization params
    signal_norm: bool = True
    min_level_db: int = -100
    symmetric_norm: bool = True
    max_norm: float = 4.0
    clip_norm: bool = True
    stats_path: str = None

    def check_values(
        self,
    ):
        """Check config fields.

        Each field is validated by coqpit's ``check_argument`` against the bounds
        passed below; the checks run on a plain-dict snapshot of this config.
        """
        c = asdict(self)
        check_argument("num_mels", c, restricted=True, min_val=10, max_val=2056)
        check_argument("fft_size", c, restricted=True, min_val=128, max_val=4058)
        check_argument("sample_rate", c, restricted=True, min_val=512, max_val=100000)
        # frame_length_ms / frame_shift_ms name `win_length` / `hop_length` as their alternatives.
        check_argument(
            "frame_length_ms",
            c,
            restricted=True,
            min_val=10,
            max_val=1000,
            alternative="win_length",
        )
        check_argument("frame_shift_ms", c, restricted=True, min_val=1, max_val=1000, alternative="hop_length")
        check_argument("preemphasis", c, restricted=True, min_val=0, max_val=1)
        check_argument("min_level_db", c, restricted=True, min_val=-1000, max_val=10)
        check_argument("ref_level_db", c, restricted=True, min_val=0, max_val=1000)
        check_argument("power", c, restricted=True, min_val=1, max_val=5)
        check_argument("griffin_lim_iters", c, restricted=True, min_val=10, max_val=1000)

        # normalization parameters
        check_argument("signal_norm", c, restricted=True)
        check_argument("symmetric_norm", c, restricted=True)
        check_argument("max_norm", c, restricted=True, min_val=0.1, max_val=1000)
        check_argument("clip_norm", c, restricted=True)
        check_argument("mel_fmin", c, restricted=True, min_val=0.0, max_val=1000)
        # mel_fmax may legitimately stay None (its default).
        check_argument("mel_fmax", c, restricted=True, min_val=500.0, allow_none=True)
        check_argument("spec_gain", c, restricted=True, min_val=1, max_val=100)
        check_argument("do_trim_silence", c, restricted=True)
        check_argument("trim_db", c, restricted=True)
+
190
+
191
@dataclass
class BaseDatasetConfig(Coqpit):
    """Base config for TTS datasets.

    Args:
        formatter (str):
            Formatter name that defines used formatter in ```TTS.tts.datasets.formatter```. Defaults to `""`.

        dataset_name (str):
            Unique name for the dataset. Defaults to `""`.

        path (str):
            Root path to the dataset files. Defaults to `""`.

        meta_file_train (str):
            Name of the dataset meta file. Speakers to leave out of training are configured separately through
            `ignored_speakers`. Defaults to `""`.

        ignored_speakers (List):
            List of speakers IDs that are not used at the training. Default None.

        language (str):
            Language code of the dataset. If defined, it overrides `phoneme_language`. Defaults to `""`.

        phonemizer (str):
            Phonemizer used for that dataset's language. By default it uses `DEF_LANG_TO_PHONEMIZER`. Defaults to `""`.

        meta_file_val (str):
            Name of the dataset meta file that defines the instances used at validation. Defaults to `""`.

        meta_file_attn_mask (str):
            Path to the file that lists the attention mask files used with models that require attention masks to
            train the duration predictor. Defaults to `""`.
    """

    formatter: str = ""
    dataset_name: str = ""
    path: str = ""
    meta_file_train: str = ""
    ignored_speakers: List[str] = None
    language: str = ""
    phonemizer: str = ""
    meta_file_val: str = ""
    meta_file_attn_mask: str = ""

    def check_values(
        self,
    ):
        """Check config fields.

        ``formatter``, ``path`` and ``meta_file_train`` are checked with
        ``restricted=True``; the validation and attention-mask meta files with
        ``restricted=False``.
        """
        c = asdict(self)
        check_argument("formatter", c, restricted=True)
        check_argument("path", c, restricted=True)
        check_argument("meta_file_train", c, restricted=True)
        check_argument("meta_file_val", c, restricted=False)
        check_argument("meta_file_attn_mask", c, restricted=False)
+
247
+
248
@dataclass
class BaseTrainingConfig(TrainerConfig):
    """Base config to define the basic 🐸TTS training parameters that are shared
    among all the models. It is based on ```Trainer.TrainingConfig```.

    Args:
        model (str):
            Name of the model that is used in the training. Defaults to None.

        num_loader_workers (int):
            Number of workers for training time dataloader. Defaults to 0.

        num_eval_loader_workers (int):
            Number of workers for evaluation time dataloader. Defaults to 0.

        use_noise_augment (bool):
            Presumably toggles noise augmentation of the training audio; the consumer of this flag is not
            part of this file (TODO confirm). Defaults to False.
    """

    model: str = None
    # dataloading
    num_loader_workers: int = 0
    num_eval_loader_workers: int = 0
    use_noise_augment: bool = False
TTS/encoder/README.md ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### Speaker Encoder
2
+
3
+ This is an implementation of https://arxiv.org/abs/1710.10467. This model can be used for voice and speaker embedding.
4
+
5
+ With the code here you can generate d-vectors for both multi-speaker and single-speaker TTS datasets, then visualise and explore them along with the associated audio files in an interactive chart.
6
+
7
+ Below is an example showing embedding results of various speakers. You can generate the same plot with the provided notebook as demonstrated in [this video](https://youtu.be/KW3oO7JVa7Q).
8
+
9
+ ![](umap.png)
10
+
11
+ Download a pretrained model from [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page.
12
+
13
+ To run the code, you need to follow the same flow as in TTS.
14
+
15
+ - Define a 'config.json' for your needs. Note that the audio parameters should match those of your TTS model.
16
+ - Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360```
17
+ - Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files.
18
+ - Watch training on Tensorboard as in TTS
TTS/encoder/__init__.py ADDED
File without changes
TTS/encoder/configs/base_encoder_config.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import asdict, dataclass, field
2
+ from typing import Dict, List
3
+
4
+ from coqpit import MISSING
5
+
6
+ from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig
7
+
8
+
9
@dataclass
class BaseEncoderConfig(BaseTrainingConfig):
    """Defines parameters for a Generic Encoder model.

    Fields assigned ``MISSING`` carry no usable default here; presumably coqpit
    requires them to be supplied by the user config (TODO confirm against coqpit).
    """

    model: str = None
    audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
    datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
    # model params
    # NOTE: `input_dim` must equal `audio.num_mels`; this is enforced in `check_values`.
    model_params: Dict = field(
        default_factory=lambda: {
            "model_name": "lstm",
            "input_dim": 80,
            "proj_dim": 256,
            "lstm_dim": 768,
            "num_lstm_layers": 3,
            "use_lstm_with_projection": True,
        }
    )

    audio_augmentation: Dict = field(default_factory=lambda: {})

    # training params
    epochs: int = 10000
    loss: str = "angleproto"
    grad_clip: float = 3.0
    lr: float = 0.0001
    optimizer: str = "radam"
    optimizer_params: Dict = field(default_factory=lambda: {"betas": [0.9, 0.999], "weight_decay": 0})
    lr_decay: bool = False
    warmup_steps: int = 4000

    # logging params
    tb_model_param_stats: bool = False
    steps_plot_stats: int = 10
    save_step: int = 1000
    print_step: int = 20
    run_eval: bool = False

    # data loader
    num_classes_in_batch: int = MISSING
    num_utter_per_class: int = MISSING
    eval_num_classes_in_batch: int = None
    eval_num_utter_per_class: int = None

    num_loader_workers: int = MISSING
    voice_len: float = 1.6

    def check_values(self):
        """Run the parent checks, then verify the model input size matches the mel dimension.

        Raises:
            AssertionError: if ``model_params["input_dim"]`` differs from ``audio.num_mels``.
        """
        super().check_values()
        c = asdict(self)
        assert (
            c["model_params"]["input_dim"] == self.audio.num_mels
        ), " [!] model input dimendion must be equal to melspectrogram dimension."
TTS/encoder/configs/emotion_encoder_config.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import asdict, dataclass
2
+
3
+ from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig
4
+
5
+
6
@dataclass
class EmotionEncoderConfig(BaseEncoderConfig):
    """Defines parameters for Emotion Encoder model.

    Overrides ``model`` with ``"emotion_encoder"`` and sets ``class_name_key`` to
    ``"emotion_name"``, the key under which a dataset item stores its class label.
    """

    model: str = "emotion_encoder"
    # Optional mapping from class id to class name; None by default.
    map_classid_to_classname: dict = None
    class_name_key: str = "emotion_name"
TTS/encoder/configs/speaker_encoder_config.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import asdict, dataclass
2
+
3
+ from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig
4
+
5
+
6
@dataclass
class SpeakerEncoderConfig(BaseEncoderConfig):
    """Defines parameters for Speaker Encoder model.

    Overrides ``model`` with ``"speaker_encoder"`` and sets ``class_name_key`` to
    ``"speaker_name"``, the key under which a dataset item stores its class label.
    """

    model: str = "speaker_encoder"
    class_name_key: str = "speaker_name"
TTS/encoder/dataset.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+
3
+ import torch
4
+ from torch.utils.data import Dataset
5
+
6
+ from TTS.encoder.utils.generic_utils import AugmentWAV
7
+
8
+
9
class EncoderDataset(Dataset):
    """Dataset of fixed-length audio segments labelled by class (e.g. speaker or emotion).

    Items are plain dicts (``wav_file_path``, ``class_name``); audio loading, random
    cropping, optional augmentation and feature extraction all happen in ``collate_fn``.
    """

    def __init__(
        self,
        config,
        ap,
        meta_data,
        voice_len=1.6,
        num_classes_in_batch=64,
        num_utter_per_class=10,
        verbose=False,
        augmentation_config=None,
        use_torch_spec=None,
    ):
        """
        Args:
            config: encoder config; ``config.class_name_key`` names the item field that holds the class label.
            ap (TTS.tts.utils.AudioProcessor): audio processor object.
            meta_data (list): list of dataset instances (dicts with ``audio_file`` and the class-label field).
            voice_len (float): voice segment length in seconds.
            num_classes_in_batch (int): classes per batch; only reported in the verbose printout here.
            num_utter_per_class (int): minimum number of utterances a class needs in order to be kept.
            verbose (bool): print diagnostic information.
            augmentation_config (dict): optional augmentation settings; must contain ``"p"`` when given.
            use_torch_spec (bool): if truthy, ``collate_fn`` returns raw waveforms instead of mel-spectrograms.
        """
        super().__init__()
        self.config = config
        self.items = meta_data
        self.sample_rate = ap.sample_rate
        self.seq_len = int(voice_len * self.sample_rate)
        self.num_utter_per_class = num_utter_per_class
        self.ap = ap
        self.verbose = verbose
        self.use_torch_spec = use_torch_spec
        self.classes, self.items = self.__parse_items()

        self.classname_to_classid = {key: i for i, key in enumerate(self.classes)}

        # Data Augmentation
        self.augmentator = None
        self.gaussian_augmentation_config = None
        # Always defined so that `collate_fn` never depends on the branch below having run.
        self.data_augmentation_p = None
        if augmentation_config:
            self.data_augmentation_p = augmentation_config["p"]
            if self.data_augmentation_p and ("additive" in augmentation_config or "rir" in augmentation_config):
                self.augmentator = AugmentWAV(ap, augmentation_config)

            if "gaussian" in augmentation_config:
                self.gaussian_augmentation_config = augmentation_config["gaussian"]

        if self.verbose:
            print("\n > DataLoader initialization")
            print(f" | > Classes per Batch: {num_classes_in_batch}")
            print(f" | > Number of instances : {len(self.items)}")
            print(f" | > Sequence length: {self.seq_len}")
            print(f" | > Num Classes: {len(self.classes)}")
            print(f" | > Classes: {self.classes}")

    def load_wav(self, filename):
        """Load ``filename`` through the audio processor at the processor's sample rate."""
        audio = self.ap.load_wav(filename, sr=self.ap.sample_rate)
        return audio

    def __parse_items(self):
        """Filter the raw metadata and return ``(sorted_class_names, kept_items)``.

        A class is kept only if it has at least ``num_utter_per_class`` utterances;
        an utterance is kept only if its class is kept and its audio is strictly
        longer than ``seq_len`` samples.
        """
        class_key = self.config.class_name_key

        class_to_utters = {}
        for item in self.items:
            class_to_utters.setdefault(item[class_key], []).append(item["audio_file"])

        # keep only classes with at least self.num_utter_per_class samples
        class_to_utters = {k: v for (k, v) in class_to_utters.items() if len(v) >= self.num_utter_per_class}

        classes = sorted(class_to_utters)
        kept_classes = set(classes)  # O(1) membership test inside the loop below

        new_items = []
        for item in self.items:
            path_ = item["audio_file"]
            # Same label field as the grouping pass above, so both passes always agree.
            class_name = item[class_key]
            # ignore filtered classes
            if class_name not in kept_classes:
                continue
            # ignore small audios
            if self.load_wav(path_).shape[0] - self.seq_len <= 0:
                continue

            new_items.append({"wav_file_path": path_, "class_name": class_name})

        return classes, new_items

    def __len__(self):
        return len(self.items)

    def get_num_classes(self):
        """Return the number of classes kept after filtering."""
        return len(self.classes)

    def get_class_list(self):
        """Return the sorted list of class names."""
        return self.classes

    def set_classes(self, classes):
        """Replace the class list and rebuild the name -> id mapping from it."""
        self.classes = classes
        self.classname_to_classid = {key: i for i, key in enumerate(self.classes)}

    def get_map_classid_to_classname(self):
        """Return the inverse mapping, class id -> class name."""
        return {c_id: c_n for c_n, c_id in self.classname_to_classid.items()}

    def __getitem__(self, idx):
        return self.items[idx]

    def collate_fn(self, batch):
        """Turn a list of items into ``(feats, labels)`` tensors.

        Every waveform is randomly cropped to ``seq_len`` samples, optionally
        augmented, and converted to a mel-spectrogram unless ``use_torch_spec`` is set.
        """
        # get the batch class_ids
        labels = []
        feats = []
        for item in batch:
            utter_path = item["wav_file_path"]
            class_name = item["class_name"]

            # get classid
            class_id = self.classname_to_classid[class_name]
            # load wav file
            wav = self.load_wav(utter_path)
            offset = random.randint(0, wav.shape[0] - self.seq_len)
            wav = wav[offset : offset + self.seq_len]

            if self.augmentator is not None and self.data_augmentation_p:
                if random.random() < self.data_augmentation_p:
                    wav = self.augmentator.apply_one(wav)

            if not self.use_torch_spec:
                mel = self.ap.melspectrogram(wav)
                feats.append(torch.FloatTensor(mel))
            else:
                feats.append(torch.FloatTensor(wav))

            labels.append(class_id)

        feats = torch.stack(feats)
        labels = torch.LongTensor(labels)

        return feats, labels
TTS/encoder/losses.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ from torch import nn
4
+
5
+
6
+ # adapted from https://github.com/cvqluu/GE2E-Loss
7
class GE2ELoss(nn.Module):
    def __init__(self, init_w=10.0, init_b=-5.0, loss_method="softmax"):
        """
        Implementation of the Generalized End-to-End loss defined in https://arxiv.org/abs/1710.10467 [1]
        Accepts an input of size (N, M, D)
            where N is the number of speakers in the batch,
            M is the number of utterances per speaker,
            and D is the dimensionality of the embedding vector (e.g. d-vector)
        Args:
            - init_w (float): defines the initial value of w in Equation (5) of [1]
            - init_b (float): defines the initial value of b in Equation (5) of [1]
            - loss_method (str): per-embedding loss to use, either "softmax" or "contrast"
        """
        super().__init__()
        # pylint: disable=E1102
        self.w = nn.Parameter(torch.tensor(init_w))
        # pylint: disable=E1102
        self.b = nn.Parameter(torch.tensor(init_b))
        self.loss_method = loss_method

        print(" > Initialized Generalized End-to-End loss")

        assert self.loss_method in ["softmax", "contrast"]

        if self.loss_method == "softmax":
            self.embed_loss = self.embed_loss_softmax
        if self.loss_method == "contrast":
            self.embed_loss = self.embed_loss_contrast

    # pylint: disable=R0201
    def calc_new_centroids(self, dvecs, centroids, spkr, utt):
        """
        Calculates the new centroids excluding the reference utterance
        """
        # Centroid of speaker `spkr` recomputed without utterance `utt`.
        excl = torch.cat((dvecs[spkr, :utt], dvecs[spkr, utt + 1 :]))
        excl = torch.mean(excl, 0)
        # All other speakers keep their original centroid.
        return torch.stack([excl if i == spkr else centroid for i, centroid in enumerate(centroids)])

    def calc_cosine_sim(self, dvecs, centroids):
        """
        Make the cosine similarity matrix with dims (N,M,N)
        """
        cos_sim_matrix = []
        for spkr_idx, speaker in enumerate(dvecs):
            cs_row = []
            for utt_idx, utterance in enumerate(speaker):
                new_centroids = self.calc_new_centroids(dvecs, centroids, spkr_idx, utt_idx)
                # vector based cosine similarity for speed
                cs_row.append(
                    torch.clamp(
                        torch.mm(
                            utterance.unsqueeze(1).transpose(0, 1),
                            new_centroids.transpose(0, 1),
                        )
                        / (torch.norm(utterance) * torch.norm(new_centroids, dim=1)),
                        1e-6,
                    )
                )
            cs_row = torch.cat(cs_row, dim=0)
            cos_sim_matrix.append(cs_row)
        return torch.stack(cos_sim_matrix)

    # pylint: disable=R0201
    def embed_loss_softmax(self, dvecs, cos_sim_matrix):
        """
        Calculates the loss on each embedding $L(e_{ji})$ by taking softmax
        """
        N, M, _ = dvecs.shape
        L = []
        for j in range(N):
            L_row = []
            for i in range(M):
                L_row.append(-F.log_softmax(cos_sim_matrix[j, i], 0)[j])
            L_row = torch.stack(L_row)
            L.append(L_row)
        return torch.stack(L)

    # pylint: disable=R0201
    def embed_loss_contrast(self, dvecs, cos_sim_matrix):
        """
        Calculates the loss on each embedding $L(e_{ji})$ by contrast loss with closest centroid
        """
        N, M, _ = dvecs.shape
        L = []
        for j in range(N):
            L_row = []
            for i in range(M):
                centroids_sigmoids = torch.sigmoid(cos_sim_matrix[j, i])
                excl_centroids_sigmoids = torch.cat((centroids_sigmoids[:j], centroids_sigmoids[j + 1 :]))
                L_row.append(1.0 - torch.sigmoid(cos_sim_matrix[j, i, j]) + torch.max(excl_centroids_sigmoids))
            L_row = torch.stack(L_row)
            L.append(L_row)
        return torch.stack(L)

    def forward(self, x, _label=None):
        """
        Calculates the GE2E loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
        """

        assert x.size()[1] >= 2

        centroids = torch.mean(x, 1)
        cos_sim_matrix = self.calc_cosine_sim(x, centroids)
        # torch.clamp is out-of-place: its result has to be used, otherwise the
        # lower bound on the scale `w` is never enforced.
        w = torch.clamp(self.w, min=1e-6)
        cos_sim_matrix = w * cos_sim_matrix + self.b
        L = self.embed_loss(x, cos_sim_matrix)
        return L.mean()
119
+
120
+
121
+ # adapted from https://github.com/clovaai/voxceleb_trainer/blob/master/loss/angleproto.py
122
class AngleProtoLoss(nn.Module):
    """
    Implementation of the Angular Prototypical loss defined in https://arxiv.org/abs/2003.11982
        Accepts an input of size (N, M, D)
            where N is the number of speakers in the batch,
            M is the number of utterances per speaker,
            and D is the dimensionality of the embedding vector
        Args:
            - init_w (float): defines the initial value of w
            - init_b (float): defines the initial value of b
    """

    def __init__(self, init_w=10.0, init_b=-5.0):
        super().__init__()
        # pylint: disable=E1102
        self.w = nn.Parameter(torch.tensor(init_w))
        # pylint: disable=E1102
        self.b = nn.Parameter(torch.tensor(init_b))
        self.criterion = torch.nn.CrossEntropyLoss()

        print(" > Initialized Angular Prototypical loss")

    def forward(self, x, _label=None):
        """
        Calculates the AngleProto loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
        """

        assert x.size()[1] >= 2

        # First utterance of every speaker is the query; the mean of the rest is the prototype.
        out_anchor = torch.mean(x[:, 1:, :], 1)
        out_positive = x[:, 0, :]
        num_speakers = out_anchor.size()[0]

        # Entry [i, j] is the cosine similarity between query i and prototype j.
        cos_sim_matrix = F.cosine_similarity(
            out_positive.unsqueeze(-1).expand(-1, -1, num_speakers),
            out_anchor.unsqueeze(-1).expand(-1, -1, num_speakers).transpose(0, 2),
        )
        # torch.clamp is out-of-place: its result has to be used, otherwise the
        # lower bound on the scale `w` is never enforced.
        w = torch.clamp(self.w, min=1e-6)
        cos_sim_matrix = cos_sim_matrix * w + self.b
        # Query i must be classified as speaker i.
        label = torch.arange(num_speakers).to(cos_sim_matrix.device)
        L = self.criterion(cos_sim_matrix, label)
        return L
164
+
165
+
166
class SoftmaxLoss(nn.Module):
    """
    Softmax (cross-entropy) classification loss as defined in https://arxiv.org/abs/2003.11982

    A single linear layer maps each embedding to per-speaker logits, which are
    scored against the labels with cross-entropy.
        Args:
            - embedding_dim (int): speaker embedding dim
            - n_speakers (int): number of speakers
    """

    def __init__(self, embedding_dim, n_speakers):
        super().__init__()

        self.criterion = torch.nn.CrossEntropyLoss()
        self.fc = nn.Linear(embedding_dim, n_speakers)

        print("Initialised Softmax Loss")

    def forward(self, x, label=None):
        """Return the cross-entropy between the classified embeddings and ``label``."""
        # Collapse every leading dimension so that each row is one embedding.
        flat_embeddings = x.reshape(-1, x.size()[-1])
        flat_labels = label.reshape(-1)
        logits = self.fc(flat_embeddings)
        return self.criterion(logits, flat_labels)

    def inference(self, embedding):
        """Return the index of the most probable class for ``embedding``."""
        probabilities = torch.nn.functional.softmax(self.fc(embedding), dim=1).squeeze(0)
        return torch.argmax(probabilities)
197
+
198
+
199
class SoftmaxAngleProtoLoss(nn.Module):
    """
    Sum of the Softmax and Angular Prototypical losses, as defined in https://arxiv.org/abs/2009.14153
        Args:
            - embedding_dim (int): speaker embedding dim
            - n_speakers (int): number of speakers
            - init_w (float): defines the initial value of w
            - init_b (float): defines the initial value of b
    """

    def __init__(self, embedding_dim, n_speakers, init_w=10.0, init_b=-5.0):
        super().__init__()

        self.softmax = SoftmaxLoss(embedding_dim, n_speakers)
        self.angleproto = AngleProtoLoss(init_w, init_b)

        print("Initialised SoftmaxAnglePrototypical Loss")

    def forward(self, x, label=None):
        """
        Calculates the SoftmaxAnglePrototypical loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
        """
        # The prototypical term needs no labels; the softmax term classifies against `label`.
        prototypical_term = self.angleproto(x)
        softmax_term = self.softmax(x, label)
        return softmax_term + prototypical_term
TTS/encoder/models/base_encoder.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ import torchaudio
4
+ from coqpit import Coqpit
5
+ from torch import nn
6
+
7
+ from TTS.encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss
8
+ from TTS.utils.generic_utils import set_init_dict
9
+ from TTS.utils.io import load_fsspec
10
+
11
+
12
class PreEmphasis(nn.Module):
    """Pre-emphasis filter, ``y[t] = x[t] - coefficient * x[t - 1]``, applied with a 1-D convolution."""

    def __init__(self, coefficient=0.97):
        super().__init__()
        self.coefficient = coefficient
        # Two-tap kernel shaped (out_channels=1, in_channels=1, width=2), kept as a buffer.
        kernel = torch.FloatTensor([-self.coefficient, 1.0]).view(1, 1, -1)
        self.register_buffer("filter", kernel)

    def forward(self, x):
        """Filter a 2-D input; the output has the same shape as ``x``."""
        assert x.dim() == 2

        # Reflect-pad one sample on the left so the output keeps the input length.
        padded = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), "reflect")
        filtered = torch.nn.functional.conv1d(padded, self.filter)
        return filtered.squeeze(1)
23
+
24
+
25
class BaseEncoder(nn.Module):
    """Base `encoder` class. Every new `encoder` model must inherit this.

    It defines common `encoder` specific functions.

    Subclasses are expected to define ``forward`` and to set ``self.use_torch_spec``
    and ``self.audio_config``, which ``compute_embedding`` reads.
    """

    # pylint: disable=W0102
    def __init__(self):
        super(BaseEncoder, self).__init__()

    def get_torch_mel_spectrogram_class(self, audio_config):
        """Build a waveform -> mel-spectrogram front-end (pre-emphasis, then torchaudio MelSpectrogram)."""
        return torch.nn.Sequential(
            PreEmphasis(audio_config["preemphasis"]),
            # TorchSTFT(
            #     n_fft=audio_config["fft_size"],
            #     hop_length=audio_config["hop_length"],
            #     win_length=audio_config["win_length"],
            #     sample_rate=audio_config["sample_rate"],
            #     window="hamming_window",
            #     mel_fmin=0.0,
            #     mel_fmax=None,
            #     use_htk=True,
            #     do_amp_to_db=False,
            #     n_mels=audio_config["num_mels"],
            #     power=2.0,
            #     use_mel=True,
            #     mel_norm=None,
            # )
            torchaudio.transforms.MelSpectrogram(
                sample_rate=audio_config["sample_rate"],
                n_fft=audio_config["fft_size"],
                win_length=audio_config["win_length"],
                hop_length=audio_config["hop_length"],
                window_fn=torch.hamming_window,
                n_mels=audio_config["num_mels"],
            ),
        )

    @torch.no_grad()
    def inference(self, x, l2_norm=True):
        """Run ``forward`` without gradient tracking."""
        return self.forward(x, l2_norm)

    @torch.no_grad()
    def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True, l2_norm=True):
        """
        Generate embeddings for a batch of utterances
        x: 1xTxD

        ``num_eval`` windows of ``num_frames`` frames are taken at evenly spaced
        offsets along dimension 1 and embedded as one batch. With ``return_mean``
        the window embeddings are averaged into a single embedding.
        """
        # map to the waveform size
        if self.use_torch_spec:
            num_frames = num_frames * self.audio_config["hop_length"]

        max_len = x.shape[1]

        # Inputs shorter than one window are used whole.
        if max_len < num_frames:
            num_frames = max_len

        offsets = np.linspace(0, max_len - num_frames, num=num_eval)

        frames_batch = []
        for offset in offsets:
            offset = int(offset)
            end_offset = int(offset + num_frames)
            frames = x[:, offset:end_offset]
            frames_batch.append(frames)

        frames_batch = torch.cat(frames_batch, dim=0)
        embeddings = self.inference(frames_batch, l2_norm=l2_norm)

        if return_mean:
            embeddings = torch.mean(embeddings, dim=0, keepdim=True)
        return embeddings

    def get_criterion(self, c: Coqpit, num_classes=None):
        """Instantiate the loss selected by ``c.loss`` ("ge2e", "angleproto" or "softmaxproto").

        Raises:
            Exception: if ``c.loss`` is none of the supported names.
        """
        if c.loss == "ge2e":
            criterion = GE2ELoss(loss_method="softmax")
        elif c.loss == "angleproto":
            criterion = AngleProtoLoss()
        elif c.loss == "softmaxproto":
            criterion = SoftmaxAngleProtoLoss(c.model_params["proj_dim"], num_classes)
        else:
            raise Exception("The %s not is a loss supported" % c.loss)
        return criterion

    def load_checkpoint(
        self,
        config: Coqpit,
        checkpoint_path: str,
        eval: bool = False,
        use_cuda: bool = False,
        criterion=None,
        cache=False,
    ):
        """Restore model (and optionally criterion) weights from ``checkpoint_path``.

        Args:
            config (Coqpit): model config; used for partial restoring and to rebuild the criterion.
            checkpoint_path (str): path of the checkpoint, loaded on CPU through ``load_fsspec``.
            eval (bool): put the model in eval mode; loading errors are re-raised instead of
                falling back to a partial restore.
            use_cuda (bool): move the model (and the criterion, if any) to CUDA.
            criterion: optional loss module whose state is restored when the checkpoint has one.
            cache (bool): forwarded to ``load_fsspec``.

        Returns:
            ``criterion`` when ``eval`` is True, otherwise the tuple ``(criterion, step)``.
        """
        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
        try:
            self.load_state_dict(state["model"])
            print(" > Model fully restored. ")
        except (KeyError, RuntimeError) as error:
            # If eval raise the error
            if eval:
                raise error

            print(" > Partial model initialization.")
            model_dict = self.state_dict()
            # Pass the config that was handed to this method (the previous code
            # referenced an undefined name here and failed with NameError).
            model_dict = set_init_dict(model_dict, state["model"], config)
            self.load_state_dict(model_dict)
            del model_dict

        # load the criterion for restore_path
        if criterion is not None and "criterion" in state:
            try:
                criterion.load_state_dict(state["criterion"])
            except (KeyError, RuntimeError) as error:
                print(" > Criterion load ignored because of:", error)

        # instance and load the criterion for the encoder classifier in inference time
        if (
            eval
            and criterion is None
            and "criterion" in state
            and getattr(config, "map_classid_to_classname", None) is not None
        ):
            criterion = self.get_criterion(config, len(config.map_classid_to_classname))
            criterion.load_state_dict(state["criterion"])

        if use_cuda:
            self.cuda()
            if criterion is not None:
                criterion = criterion.cuda()

        if eval:
            self.eval()
            assert not self.training

        if not eval:
            return criterion, state["step"]
        return criterion
TTS/encoder/models/lstm.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+
4
+ from TTS.encoder.models.base_encoder import BaseEncoder
5
+
6
+
7
class LSTMWithProjection(nn.Module):
    """Single-layer batch-first LSTM followed by a bias-free linear projection."""

    def __init__(self, input_size, hidden_size, proj_size):
        super().__init__()
        # Keep the constructor arguments around as plain attributes.
        self.input_size, self.hidden_size, self.proj_size = input_size, hidden_size, proj_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, proj_size, bias=False)

    def forward(self, x):
        """Run the LSTM over `x` and project every time step to `proj_size` features."""
        self.lstm.flatten_parameters()
        sequence_out, _state = self.lstm(x)
        projected = self.linear(sequence_out)
        return projected
20
+
21
+
22
class LSTMWithoutProjection(nn.Module):
    """Multi-layer batch-first LSTM whose last layer's final hidden state is mapped through Linear + ReLU."""

    def __init__(self, input_dim, lstm_dim, proj_dim, num_lstm_layers):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=lstm_dim,
            num_layers=num_lstm_layers,
            batch_first=True,
        )
        self.linear = nn.Linear(lstm_dim, proj_dim, bias=True)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ReLU(Linear(h)) where `h` is the final hidden state of the top LSTM layer."""
        _outputs, (hidden, _cell) = self.lstm(x)
        top_layer_state = hidden[-1]
        projected = self.linear(top_layer_state)
        return self.relu(projected)
32
+
33
+
34
class LSTMSpeakerEncoder(BaseEncoder):
    """LSTM-based encoder that maps an utterance to a single embedding vector.

    Args:
        input_dim (int): Number of input features per frame (also the channel count of the instance norm).
        proj_dim (int): Size of the output embedding.
        lstm_dim (int): Hidden size of the LSTM layers.
        num_lstm_layers (int): Number of LSTM layers.
        use_lstm_with_projection (bool): If True, stack `LSTMWithProjection` layers; otherwise use one
            `LSTMWithoutProjection` module.
        use_torch_spec (bool): If True, build a spectrogram module from `audio_config` and apply it in `forward`.
        audio_config: Audio settings passed to `self.get_torch_mel_spectrogram_class`.
    """

    def __init__(
        self,
        input_dim,
        proj_dim=256,
        lstm_dim=768,
        num_lstm_layers=3,
        use_lstm_with_projection=True,
        use_torch_spec=False,
        audio_config=None,
    ):
        super().__init__()
        self.use_lstm_with_projection = use_lstm_with_projection
        self.use_torch_spec = use_torch_spec
        self.audio_config = audio_config
        self.proj_dim = proj_dim

        # choose the LSTM layer type
        if use_lstm_with_projection:
            layers = [LSTMWithProjection(input_dim, lstm_dim, proj_dim)]
            for _ in range(num_lstm_layers - 1):
                layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim))
            self.layers = nn.Sequential(*layers)
        else:
            self.layers = LSTMWithoutProjection(input_dim, lstm_dim, proj_dim, num_lstm_layers)

        self.instancenorm = nn.InstanceNorm1d(input_dim)

        if self.use_torch_spec:
            self.torch_spec = self.get_torch_mel_spectrogram_class(audio_config)
        else:
            self.torch_spec = None

        self._init_layers()

    def _init_layers(self):
        """Zero every bias and Xavier-normal-initialize every weight inside `self.layers`."""
        for name, param in self.layers.named_parameters():
            if "bias" in name:
                nn.init.constant_(param, 0.0)
            elif "weight" in name:
                nn.init.xavier_normal_(param)

    def forward(self, x, l2_norm=True):
        """Forward pass of the model.

        Args:
            x (Tensor): Raw waveform signal or spectrogram frames. If input is a waveform, `use_torch_spec` must be
                `True` to compute the spectrogram on-the-fly.
            l2_norm (bool): Whether to L2-normalize the outputs.

        Shapes:
            - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})`
        """
        # Feature extraction and normalization run without gradients and outside autocast.
        with torch.no_grad():
            with torch.cuda.amp.autocast(enabled=False):
                if self.use_torch_spec:
                    # Out-of-place squeeze: the caller's tensor must keep its shape.
                    x = self.torch_spec(x.squeeze(1))
                x = self.instancenorm(x).transpose(1, 2)
        d = self.layers(x)
        if self.use_lstm_with_projection:
            # The projection stack returns one vector per frame; keep the last frame.
            d = d[:, -1]
        if l2_norm:
            d = torch.nn.functional.normalize(d, p=2, dim=1)
        return d
TTS/encoder/models/resnet.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+
4
+ # from TTS.utils.audio.torch_transforms import TorchSTFT
5
+ from TTS.encoder.models.base_encoder import BaseEncoder
6
+
7
+
8
class SELayer(nn.Module):
    """Squeeze-and-excitation gate: rescales each channel by a learned sigmoid weight."""

    def __init__(self, channel, reduction=8):
        super().__init__()
        bottleneck = channel // reduction
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Linear(channel, bottleneck),
            nn.ReLU(inplace=True),
            nn.Linear(bottleneck, channel),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Scale the 4-D input `x` channel-wise by gates computed from its global average."""
        batch, channels, _height, _width = x.size()
        pooled = self.avg_pool(x).view(batch, channels)
        gates = self.fc(pooled).view(batch, channels, 1, 1)
        return x * gates
24
+
25
+
26
+ class SEBasicBlock(nn.Module):
27
+ expansion = 1
28
+
29
+ def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8):
30
+ super(SEBasicBlock, self).__init__()
31
+ self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
32
+ self.bn1 = nn.BatchNorm2d(planes)
33
+ self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False)
34
+ self.bn2 = nn.BatchNorm2d(planes)
35
+ self.relu = nn.ReLU(inplace=True)
36
+ self.se = SELayer(planes, reduction)
37
+ self.downsample = downsample
38
+ self.stride = stride
39
+
40
+ def forward(self, x):
41
+ residual = x
42
+
43
+ out = self.conv1(x)
44
+ out = self.relu(out)
45
+ out = self.bn1(out)
46
+
47
+ out = self.conv2(out)
48
+ out = self.bn2(out)
49
+ out = self.se(out)
50
+
51
+ if self.downsample is not None:
52
+ residual = self.downsample(x)
53
+
54
+ out += residual
55
+ out = self.relu(out)
56
+ return out
57
+
58
+
59
class ResNetSpeakerEncoder(BaseEncoder):
    """Implementation of the model H/ASP without batch normalization in speaker embedding. This model was proposed in: https://arxiv.org/abs/2009.14153
    Adapted from: https://github.com/clovaai/voxceleb_trainer

    Args:
        input_dim (int): Number of spectrogram features per frame.
        proj_dim (int): Size of the output embedding.
        layers (list): Number of `SEBasicBlock`s in each of the four stages.
        num_filters (list): Channel count of each of the four stages.
        encoder_type (str): Pooling type, either ``"SAP"`` or ``"ASP"``.
        log_input (bool): If True, take ``log(x + 1e-6)`` of the features before normalization.
        use_torch_spec (bool): If True, build a spectrogram module from `audio_config` and apply it in `forward`.
        audio_config: Audio settings passed to `self.get_torch_mel_spectrogram_class`.

    Raises:
        ValueError: If `encoder_type` is neither ``"SAP"`` nor ``"ASP"``.
    """

    # The list defaults are only read, never mutated.
    # pylint: disable=W0102
    def __init__(
        self,
        input_dim=64,
        proj_dim=512,
        layers=[3, 4, 6, 3],
        num_filters=[32, 64, 128, 256],
        encoder_type="ASP",
        log_input=False,
        use_torch_spec=False,
        audio_config=None,
    ):
        super(ResNetSpeakerEncoder, self).__init__()

        self.encoder_type = encoder_type
        self.input_dim = input_dim
        self.log_input = log_input
        self.use_torch_spec = use_torch_spec
        self.audio_config = audio_config
        self.proj_dim = proj_dim

        self.conv1 = nn.Conv2d(1, num_filters[0], kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU(inplace=True)
        self.bn1 = nn.BatchNorm2d(num_filters[0])

        # `create_layer` reads and updates `self.inplanes` as stages are stacked.
        self.inplanes = num_filters[0]
        self.layer1 = self.create_layer(SEBasicBlock, num_filters[0], layers[0])
        self.layer2 = self.create_layer(SEBasicBlock, num_filters[1], layers[1], stride=(2, 2))
        self.layer3 = self.create_layer(SEBasicBlock, num_filters[2], layers[2], stride=(2, 2))
        self.layer4 = self.create_layer(SEBasicBlock, num_filters[3], layers[3], stride=(2, 2))

        self.instancenorm = nn.InstanceNorm1d(input_dim)

        if self.use_torch_spec:
            self.torch_spec = self.get_torch_mel_spectrogram_class(audio_config)
        else:
            self.torch_spec = None

        # Three stride-2 stages shrink the feature axis by a factor of 8.
        outmap_size = int(self.input_dim / 8)

        self.attention = nn.Sequential(
            nn.Conv1d(num_filters[3] * outmap_size, 128, kernel_size=1),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Conv1d(128, num_filters[3] * outmap_size, kernel_size=1),
            nn.Softmax(dim=2),
        )

        if self.encoder_type == "SAP":
            out_dim = num_filters[3] * outmap_size
        elif self.encoder_type == "ASP":
            # ASP concatenates the weighted mean and the weighted standard deviation.
            out_dim = num_filters[3] * outmap_size * 2
        else:
            raise ValueError("Undefined encoder")

        self.fc = nn.Linear(out_dim, proj_dim)

        self._init_layers()

    def _init_layers(self):
        """Kaiming-initialize every Conv2d and reset every BatchNorm2d to weight 1 / bias 0."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def create_layer(self, block, planes, blocks, stride=1):
        """Build one stage of `blocks` residual blocks and update `self.inplanes` for the next stage."""
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            # The shortcut needs a projection whenever resolution or channel count changes.
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    # pylint: disable=R0201
    def new_parameter(self, *size):
        """Return a new Xavier-normal-initialized `nn.Parameter` of the given size."""
        out = nn.Parameter(torch.FloatTensor(*size))
        nn.init.xavier_normal_(out)
        return out

    def forward(self, x, l2_norm=False):
        """Forward pass of the model.

        Args:
            x (Tensor): Raw waveform signal or spectrogram frames. If input is a waveform, `use_torch_spec` must be
                `True` to compute the spectrogram on-the-fly.
            l2_norm (bool): Whether to L2-normalize the outputs.

        Shapes:
            - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})`
        """
        # Out-of-place squeeze: the caller's tensor must keep its shape.
        x = x.squeeze(1)
        # if use_torch_spec is set compute the spectrogram here, otherwise use the mel spec computed by the AP
        if self.use_torch_spec:
            x = self.torch_spec(x)

        if self.log_input:
            x = (x + 1e-6).log()
        x = self.instancenorm(x).unsqueeze(1)

        x = self.conv1(x)
        x = self.relu(x)
        x = self.bn1(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        # Merge the channel and feature axes; the last axis is kept as the pooling axis.
        x = x.reshape(x.size()[0], -1, x.size()[-1])

        w = self.attention(x)

        if self.encoder_type == "SAP":
            x = torch.sum(x * w, dim=2)
        elif self.encoder_type == "ASP":
            mu = torch.sum(x * w, dim=2)
            # Clamp before the square root to keep the variance estimate positive.
            sg = torch.sqrt((torch.sum((x**2) * w, dim=2) - mu**2).clamp(min=1e-5))
            x = torch.cat((mu, sg), 1)

        x = x.view(x.size()[0], -1)
        x = self.fc(x)

        if l2_norm:
            x = torch.nn.functional.normalize(x, p=2, dim=1)
        return x
+ return x
TTS/encoder/requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ umap-learn
2
+ numpy>=1.17.0
TTS/encoder/utils/__init__.py ADDED
File without changes
TTS/encoder/utils/generic_utils.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ import glob
3
+ import os
4
+ import random
5
+ import re
6
+
7
+ import numpy as np
8
+ from scipy import signal
9
+
10
+ from TTS.encoder.models.lstm import LSTMSpeakerEncoder
11
+ from TTS.encoder.models.resnet import ResNetSpeakerEncoder
12
+ from TTS.utils.io import save_fsspec
13
+
14
+
15
class AugmentWAV(object):
    """Waveform augmenter that mixes in additive noise and/or convolves with room impulse responses.

    Args:
        ap: Audio processor; must expose `sample_rate` and `load_wav(path, sr=...)`.
        augmentation_config (dict): May contain an ``"additive"`` entry (with ``"sounds_path"`` plus one dict per
            noise type) and/or a ``"rir"`` entry (with ``"rir_path"`` and ``"conv_mode"``).
    """

    def __init__(self, ap, augmentation_config):
        self.ap = ap
        self.use_additive_noise = False
        self.use_rir = False
        # Defaults so these attributes exist even when a section is missing or disabled.
        self.additive_noise_types = []
        self.noise_list = {}
        self.rir_files = []

        if "additive" in augmentation_config:
            self.additive_noise_config = augmentation_config["additive"]
            additive_path = self.additive_noise_config["sounds_path"]
            if additive_path:
                self.use_additive_noise = True
                # Every dict-valued entry of the additive config names a noise type.
                self.additive_noise_types = [
                    key for key, value in self.additive_noise_config.items() if isinstance(value, dict)
                ]

                additive_files = glob.glob(os.path.join(additive_path, "**/*.wav"), recursive=True)

                for wav_file in additive_files:
                    # First path component below `additive_path`. `relpath` gives the same answer
                    # whether or not `additive_path` ends with a path separator.
                    noise_dir = os.path.relpath(wav_file, additive_path).split(os.sep)[0]
                    # ignore not listed directories
                    if noise_dir not in self.additive_noise_types:
                        continue
                    self.noise_list.setdefault(noise_dir, []).append(wav_file)

                print(
                    f" | > Using Additive Noise Augmentation: with {len(additive_files)} audios instances from {self.additive_noise_types}"
                )

        if "rir" in augmentation_config:
            self.rir_config = augmentation_config["rir"]
            if self.rir_config["rir_path"]:
                self.rir_files = glob.glob(os.path.join(self.rir_config["rir_path"], "**/*.wav"), recursive=True)
                self.use_rir = True

                print(f" | > Using RIR Noise Augmentation: with {len(self.rir_files)} audios instances")

        self.create_augmentation_global_list()

    def create_augmentation_global_list(self):
        """Build `global_noise_list`: the additive noise types plus ``"RIR_AUG"`` when RIR is enabled."""
        if self.use_additive_noise:
            # Copy so that appending "RIR_AUG" does not leak into `additive_noise_types`.
            self.global_noise_list = list(self.additive_noise_types)
        else:
            self.global_noise_list = []
        if self.use_rir:
            self.global_noise_list.append("RIR_AUG")

    def additive_noise(self, noise_type, audio):
        """Return `audio` mixed with randomly chosen noise files of `noise_type`.

        The number of noises and the SNR of each are drawn from the ranges configured for `noise_type`.
        Noise files shorter than `audio` are skipped.
        """
        type_config = self.additive_noise_config[noise_type]
        clean_db = 10 * np.log10(np.mean(audio**2) + 1e-4)

        noise_list = random.sample(
            self.noise_list[noise_type],
            random.randint(type_config["min_num_noises"], type_config["max_num_noises"]),
        )

        audio_len = audio.shape[0]
        noises_wav = None
        for noise in noise_list:
            noiseaudio = self.ap.load_wav(noise, sr=self.ap.sample_rate)[:audio_len]

            if noiseaudio.shape[0] < audio_len:
                continue

            # The SNR range is [min_snr_in_db, max_snr_in_db]. Configs without `max_snr_in_db`
            # keep the previous upper bound (`max_num_noises`).
            noise_snr = random.uniform(
                type_config["min_snr_in_db"],
                type_config.get("max_snr_in_db", type_config["max_num_noises"]),
            )
            noise_db = 10 * np.log10(np.mean(noiseaudio**2) + 1e-4)
            noise_wav = np.sqrt(10 ** ((clean_db - noise_db - noise_snr) / 10)) * noiseaudio

            if noises_wav is None:
                noises_wav = noise_wav
            else:
                noises_wav += noise_wav

        # if every sampled file was shorter than the audio, draw a new sample
        # NOTE(review): this never terminates normally if no file of this type is long enough.
        if noises_wav is None:
            return self.additive_noise(noise_type, audio)

        return audio + noises_wav

    def reverberate(self, audio):
        """Convolve `audio` with a random energy-normalized impulse response, cut to the input length."""
        audio_len = audio.shape[0]

        rir_file = random.choice(self.rir_files)
        rir = self.ap.load_wav(rir_file, sr=self.ap.sample_rate)
        rir = rir / np.sqrt(np.sum(rir**2))
        return signal.convolve(audio, rir, mode=self.rir_config["conv_mode"])[:audio_len]

    def apply_one(self, audio):
        """Apply one augmentation picked uniformly at random from `global_noise_list`."""
        noise_type = random.choice(self.global_noise_list)
        if noise_type == "RIR_AUG":
            return self.reverberate(audio)

        return self.additive_noise(noise_type, audio)
119
+
120
+
121
def to_camel(text):
    """Convert a snake_case string to CamelCase.

    The text is first passed through `str.capitalize`, then every underscore that is not at the
    start of the string and is followed by an ASCII letter is removed and that letter upper-cased.
    """

    def _upper_letter(match):
        return match.group(1).upper()

    capitalized = text.capitalize()
    return re.sub(r"(?!^)_([a-zA-Z])", _upper_letter, capitalized)
124
+
125
+
126
def setup_encoder_model(config: "Coqpit"):
    """Instantiate the encoder model selected by ``config.model_params["model_name"]``.

    Args:
        config (Coqpit): Config exposing `model_params` (a mapping) and `audio`.

    Returns:
        An `LSTMSpeakerEncoder` for ``"lstm"`` or a `ResNetSpeakerEncoder` for ``"resnet"``
        (the name is matched case-insensitively).

    Raises:
        ValueError: If the model name is neither ``"lstm"`` nor ``"resnet"``.
    """
    params = config.model_params
    model_name = params["model_name"].lower()

    if model_name == "lstm":
        return LSTMSpeakerEncoder(
            params["input_dim"],
            params["proj_dim"],
            params["lstm_dim"],
            params["num_lstm_layers"],
            use_torch_spec=params.get("use_torch_spec", False),
            audio_config=config.audio,
        )
    if model_name == "resnet":
        return ResNetSpeakerEncoder(
            input_dim=params["input_dim"],
            proj_dim=params["proj_dim"],
            log_input=params.get("log_input", False),
            use_torch_spec=params.get("use_torch_spec", False),
            audio_config=config.audio,
        )
    # Fail with a clear message instead of falling through to an unbound local.
    raise ValueError(f"Unknown encoder model name: {params['model_name']!r} (expected 'lstm' or 'resnet')")
145
+
146
+
147
def save_checkpoint(model, optimizer, criterion, model_loss, out_path, current_step, epoch):
    """Write a training checkpoint named after `current_step` into `out_path`.

    The saved state holds the model, optimizer (``None`` when no optimizer is given) and criterion
    state dicts together with the step, epoch, loss and today's date.
    """
    checkpoint_path = os.path.join(out_path, "checkpoint_{}.pth".format(current_step))
    print(" | | > Checkpoint saving : {}".format(checkpoint_path))

    optimizer_state = None if optimizer is None else optimizer.state_dict()
    today = datetime.date.today().strftime("%B %d, %Y")
    state = {
        "model": model.state_dict(),
        "optimizer": optimizer_state,
        "criterion": criterion.state_dict(),
        "step": current_step,
        "epoch": epoch,
        "loss": model_loss,
        "date": today,
    }
    save_fsspec(state, checkpoint_path)
163
+
164
+
165
def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path, current_step, epoch):
    """Save the model as ``best_model.pth`` in `out_path` when `model_loss` improves on `best_loss`.

    Args:
        model: Model whose `state_dict()` is saved.
        optimizer: Optimizer whose `state_dict()` is saved; may be ``None``, as in `save_checkpoint`.
        criterion: Criterion whose `state_dict()` is saved.
        model_loss: Loss of the current model.
        best_loss: Best loss seen so far.
        out_path (str): Directory that receives ``best_model.pth``.
        current_step: Training step stored in the checkpoint.
        epoch: Epoch stored in the checkpoint.

    Returns:
        The updated best loss: `model_loss` if it was strictly lower, otherwise `best_loss` unchanged.
    """
    # Nothing to save unless the loss strictly improved.
    if not model_loss < best_loss:
        return best_loss

    state = {
        "model": model.state_dict(),
        # Tolerate a missing optimizer, consistent with `save_checkpoint`.
        "optimizer": optimizer.state_dict() if optimizer is not None else None,
        "criterion": criterion.state_dict(),
        "step": current_step,
        "epoch": epoch,
        "loss": model_loss,
        "date": datetime.date.today().strftime("%B %d, %Y"),
    }
    bestmodel_path = os.path.join(out_path, "best_model.pth")
    print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path))
    save_fsspec(state, bestmodel_path)
    return model_loss