diff --git a/.cardboardlint.yml b/.cardboardlint.yml
new file mode 100644
index 0000000000000000000000000000000000000000..4a115a37cddb065c76afebc905476e650f53d085
--- /dev/null
+++ b/.cardboardlint.yml
@@ -0,0 +1,5 @@
+linters:
+- pylint:
+ # pylintrc: pylintrc
+ filefilter: ['- test_*.py', '+ *.py', '- *.npy']
+ # exclude:
\ No newline at end of file
diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000000000000000000000000000000000000..8d8ad918c964012d81e3913af1a9ba76afa50140
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,9 @@
+.git/
+Dockerfile
+build/
+dist/
+TTS.egg-info/
+tests/outputs/*
+tests/train_outputs/*
+__pycache__/
+*.pyc
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..563040e8dac9d86ecde0ccfc43691ccff420e87d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,171 @@
+WadaSNR/
+.idea/
+*.pyc
+.DS_Store
+./__init__.py
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+.static_storage/
+.media/
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+
+# vim
+*.swp
+*.swm
+*.swn
+*.swo
+
+# pytorch models
+*.pth
+*.pth.tar
+!dummy_speakers.pth
+result/
+
+# setup.py
+version.py
+
+# jupyter dummy files
+core
+
+# ignore local datasets
+recipes/WIP/*
+recipes/ljspeech/LJSpeech-1.1/*
+recipes/vctk/VCTK/*
+recipes/**/*.npy
+recipes/**/*.json
+VCTK-Corpus-removed-silence/*
+
+# ignore training logs
+trainer_*_log.txt
+
+# files used internally for dev, test etc.
+tests/outputs/*
+tests/train_outputs/*
+TODO.txt
+.vscode/*
+data/*
+notebooks/data/*
+TTS/tts/utils/monotonic_align/core.c
+.vscode-upload.json
+temp_build/*
+events.out*
+old_configs/*
+model_importers/*
+model_profiling/*
+docs/source/TODO/*
+.noseids
+.dccache
+log.txt
+umap.png
+*.out
+SocialMedia.txt
+output.wav
+tts_output.wav
+deps.json
+speakers.json
+internal/*
+*_pitch.npy
+*_phoneme.npy
+wandb
+depot/*
+coqui_recipes/*
+local_scripts/*
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..911f2a838ea4c598f83a415da5fd58e83dfc3659
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,27 @@
+repos:
+ - repo: 'https://github.com/pre-commit/pre-commit-hooks'
+ rev: v2.3.0
+ hooks:
+ - id: check-yaml
+ - id: end-of-file-fixer
+ - id: trailing-whitespace
+ - repo: 'https://github.com/psf/black'
+ rev: 22.3.0
+ hooks:
+ - id: black
+ language_version: python3
+ - repo: https://github.com/pycqa/isort
+ rev: 5.8.0
+ hooks:
+ - id: isort
+ name: isort (python)
+ - id: isort
+ name: isort (cython)
+ types: [cython]
+ - id: isort
+ name: isort (pyi)
+ types: [pyi]
+ - repo: https://github.com/pycqa/pylint
+ rev: v2.8.2
+ hooks:
+ - id: pylint
diff --git a/.pylintrc b/.pylintrc
new file mode 100644
index 0000000000000000000000000000000000000000..d5f9c4909cb3fe0faeb41d4ec72764c1c69ec754
--- /dev/null
+++ b/.pylintrc
@@ -0,0 +1,597 @@
+[MASTER]
+
+# A comma-separated list of package or module names from where C extensions may
+# be loaded. Extensions are loaded into the active Python interpreter and may
+# run arbitrary code.
+extension-pkg-whitelist=
+
+# Add files or directories to the blacklist. They should be base names, not
+# paths.
+ignore=CVS
+
+# Add files or directories matching the regex patterns to the blacklist. The
+# regex matches against base names, not paths.
+ignore-patterns=
+
+# Python code to execute, usually for sys.path manipulation such as
+# pygtk.require().
+#init-hook=
+
+# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
+# number of processors available to use.
+jobs=1
+
+# Control the amount of potential inferred values when inferring a single
+# object. This can help the performance when dealing with large functions or
+# complex, nested conditions.
+limit-inference-results=100
+
+# List of plugins (as comma separated values of python modules names) to load,
+# usually to register additional checkers.
+load-plugins=
+
+# Pickle collected data for later comparisons.
+persistent=yes
+
+# Specify a configuration file.
+#rcfile=
+
+# When enabled, pylint would attempt to guess common misconfiguration and emit
+# user-friendly hints instead of false-positive error messages.
+suggestion-mode=yes
+
+# Allow loading of arbitrary C extensions. Extensions are imported into the
+# active Python interpreter and may run arbitrary code.
+unsafe-load-any-extension=no
+
+
+[MESSAGES CONTROL]
+
+# Only show warnings with the listed confidence levels. Leave empty to show
+# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED.
+confidence=
+
+# Disable the message, report, category or checker with the given id(s). You
+# can either give multiple identifiers separated by comma (,) or put this
+# option multiple times (only on the command line, not in the configuration
+# file where it should appear only once). You can also use "--disable=all" to
+# disable everything first and then reenable specific checks. For example, if
+# you want to run only the similarities checker, you can use "--disable=all
+# --enable=similarities". If you want to run only the classes checker, but have
+# no Warning level messages displayed, use "--disable=all --enable=classes
+# --disable=W".
+disable=missing-docstring,
+ too-many-public-methods,
+ too-many-lines,
+ bare-except,
+ ## for avoiding weird p3.6 CI linter error
+ ## TODO: see later if we can remove this
+ assigning-non-slot,
+ unsupported-assignment-operation,
+ ## end
+ line-too-long,
+ fixme,
+ wrong-import-order,
+ ungrouped-imports,
+ wrong-import-position,
+ import-error,
+ invalid-name,
+ too-many-instance-attributes,
+ arguments-differ,
+ arguments-renamed,
+ no-name-in-module,
+ no-member,
+ unsubscriptable-object,
+ print-statement,
+ parameter-unpacking,
+ unpacking-in-except,
+ old-raise-syntax,
+ backtick,
+ long-suffix,
+ old-ne-operator,
+ old-octal-literal,
+ import-star-module-level,
+ non-ascii-bytes-literal,
+ raw-checker-failed,
+ bad-inline-option,
+ locally-disabled,
+ file-ignored,
+ suppressed-message,
+ useless-suppression,
+ deprecated-pragma,
+ use-symbolic-message-instead,
+ useless-object-inheritance,
+ too-few-public-methods,
+ too-many-branches,
+ too-many-arguments,
+ too-many-locals,
+ too-many-statements,
+ apply-builtin,
+ basestring-builtin,
+ buffer-builtin,
+ cmp-builtin,
+ coerce-builtin,
+ execfile-builtin,
+ file-builtin,
+ long-builtin,
+ raw_input-builtin,
+ reduce-builtin,
+ standarderror-builtin,
+ unicode-builtin,
+ xrange-builtin,
+ coerce-method,
+ delslice-method,
+ getslice-method,
+ setslice-method,
+ no-absolute-import,
+ old-division,
+ dict-iter-method,
+ dict-view-method,
+ next-method-called,
+ metaclass-assignment,
+ indexing-exception,
+ raising-string,
+ reload-builtin,
+ oct-method,
+ hex-method,
+ nonzero-method,
+ cmp-method,
+ input-builtin,
+ round-builtin,
+ intern-builtin,
+ unichr-builtin,
+ map-builtin-not-iterating,
+ zip-builtin-not-iterating,
+ range-builtin-not-iterating,
+ filter-builtin-not-iterating,
+ using-cmp-argument,
+ eq-without-hash,
+ div-method,
+ idiv-method,
+ rdiv-method,
+ exception-message-attribute,
+ invalid-str-codec,
+ sys-max-int,
+ bad-python3-import,
+ deprecated-string-function,
+ deprecated-str-translate-call,
+ deprecated-itertools-function,
+ deprecated-types-field,
+ next-method-defined,
+ dict-items-not-iterating,
+ dict-keys-not-iterating,
+ dict-values-not-iterating,
+ deprecated-operator-function,
+ deprecated-urllib-function,
+ xreadlines-attribute,
+ deprecated-sys-function,
+ exception-escape,
+ comprehension-escape,
+ duplicate-code,
+ not-callable,
+ import-outside-toplevel
+
+# Enable the message, report, category or checker with the given id(s). You can
+# either give multiple identifiers separated by comma (,) or put this option
+# multiple times (only on the command line, not in the configuration file where
+# it should appear only once). See also the "--disable" option for examples.
+enable=c-extension-no-member
+
+
+[REPORTS]
+
+# Python expression which should return a score less than or equal to 10 (10 is
+# the highest score). You have access to the variables error, warning, refactor,
+# convention and statement, which respectively contain the number of messages in
+# each of those categories and the total number of statements analyzed. This is
+# used by the global evaluation report (RP0004).
+evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
+
+# Template used to display messages. This is a python new-style format string
+# used to format the message information. See doc for all details.
+#msg-template=
+
+# Set the output format. Available formats are text, parseable, colorized, json
+# and msvs (visual studio). You can also give a reporter class, e.g.
+# mypackage.mymodule.MyReporterClass.
+output-format=text
+
+# Tells whether to display a full report or only the messages.
+reports=no
+
+# Activate the evaluation score.
+score=yes
+
+
+[REFACTORING]
+
+# Maximum number of nested blocks for function / method body
+max-nested-blocks=5
+
+# Complete names of functions that never return. When checking for
+# inconsistent-return-statements if a never returning function is called then
+# it will be considered as an explicit return statement and no message will be
+# printed.
+never-returning-functions=sys.exit
+
+
+[LOGGING]
+
+# Format style used to check logging format string. `old` means using %
+# formatting, while `new` is for `{}` formatting.
+logging-format-style=old
+
+# Logging modules to check that the string format arguments are in logging
+# function parameter format.
+logging-modules=logging
+
+
+[SPELLING]
+
+# Limits count of emitted suggestions for spelling mistakes.
+max-spelling-suggestions=4
+
+# Spelling dictionary name. Available dictionaries: none. To make it work,
+# install the python-enchant package.
+spelling-dict=
+
+# List of comma separated words that should not be checked.
+spelling-ignore-words=
+
+# A path to a file that contains private dictionary; one word per line.
+spelling-private-dict-file=
+
+# Tells whether to store unknown words to indicated private dictionary in
+# --spelling-private-dict-file option instead of raising a message.
+spelling-store-unknown-words=no
+
+
+[MISCELLANEOUS]
+
+# List of note tags to take in consideration, separated by a comma.
+notes=FIXME,
+ XXX,
+ TODO
+
+
+[TYPECHECK]
+
+# List of decorators that produce context managers, such as
+# contextlib.contextmanager. Add to this list to register other decorators that
+# produce valid context managers.
+contextmanager-decorators=contextlib.contextmanager
+
+# List of members which are set dynamically and missed by pylint inference
+# system, and so shouldn't trigger E1101 when accessed. Python regular
+# expressions are accepted.
+generated-members=numpy.*,torch.*
+
+# Tells whether missing members accessed in mixin class should be ignored. A
+# mixin class is detected if its name ends with "mixin" (case insensitive).
+ignore-mixin-members=yes
+
+# Tells whether to warn about missing members when the owner of the attribute
+# is inferred to be None.
+ignore-none=yes
+
+# This flag controls whether pylint should warn about no-member and similar
+# checks whenever an opaque object is returned when inferring. The inference
+# can return multiple potential results while evaluating a Python object, but
+# some branches might not be evaluated, which results in partial inference. In
+# that case, it might be useful to still emit no-member and other checks for
+# the rest of the inferred objects.
+ignore-on-opaque-inference=yes
+
+# List of class names for which member attributes should not be checked (useful
+# for classes with dynamically set attributes). This supports the use of
+# qualified names.
+ignored-classes=optparse.Values,thread._local,_thread._local
+
+# List of module names for which member attributes should not be checked
+# (useful for modules/projects where namespaces are manipulated during runtime
+# and thus existing member attributes cannot be deduced by static analysis. It
+# supports qualified module names, as well as Unix pattern matching.
+ignored-modules=
+
+# Show a hint with possible names when a member name was not found. The aspect
+# of finding the hint is based on edit distance.
+missing-member-hint=yes
+
+# The minimum edit distance a name should have in order to be considered a
+# similar match for a missing member name.
+missing-member-hint-distance=1
+
+# The total number of similar names that should be taken in consideration when
+# showing a hint for a missing member.
+missing-member-max-choices=1
+
+
+[VARIABLES]
+
+# List of additional names supposed to be defined in builtins. Remember that
+# you should avoid defining new builtins when possible.
+additional-builtins=
+
+# Tells whether unused global variables should be treated as a violation.
+allow-global-unused-variables=yes
+
+# List of strings which can identify a callback function by name. A callback
+# name must start or end with one of those strings.
+callbacks=cb_,
+ _cb
+
+# A regular expression matching the name of dummy variables (i.e. expected to
+# not be used).
+dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
+
+# Argument names that match this expression will be ignored. Default to name
+# with leading underscore.
+ignored-argument-names=_.*|^ignored_|^unused_
+
+# Tells whether we should check for unused import in __init__ files.
+init-import=no
+
+# List of qualified module names which can have objects that can redefine
+# builtins.
+redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io
+
+
+[FORMAT]
+
+# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
+expected-line-ending-format=
+
+# Regexp for a line that is allowed to be longer than the limit.
+ignore-long-lines=^\s*(# )??$
+
+# Number of spaces of indent required inside a hanging or continued line.
+indent-after-paren=4
+
+# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
+# tab).
+indent-string=' '
+
+# Maximum number of characters on a single line.
+max-line-length=120
+
+# Maximum number of lines in a module.
+max-module-lines=1000
+
+# List of optional constructs for which whitespace checking is disabled. `dict-
+# separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}.
+# `trailing-comma` allows a space between comma and closing bracket: (a, ).
+# `empty-line` allows space-only lines.
+no-space-check=trailing-comma,
+ dict-separator
+
+# Allow the body of a class to be on the same line as the declaration if body
+# contains single statement.
+single-line-class-stmt=no
+
+# Allow the body of an if to be on the same line as the test if there is no
+# else.
+single-line-if-stmt=no
+
+
+[SIMILARITIES]
+
+# Ignore comments when computing similarities.
+ignore-comments=yes
+
+# Ignore docstrings when computing similarities.
+ignore-docstrings=yes
+
+# Ignore imports when computing similarities.
+ignore-imports=no
+
+# Minimum number of lines of a similarity.
+min-similarity-lines=4
+
+
+[BASIC]
+
+# Naming style matching correct argument names.
+argument-naming-style=snake_case
+
+# Regular expression matching correct argument names. Overrides argument-
+# naming-style.
+argument-rgx=[a-z_][a-z0-9_]{0,30}$
+
+# Naming style matching correct attribute names.
+attr-naming-style=snake_case
+
+# Regular expression matching correct attribute names. Overrides attr-naming-
+# style.
+#attr-rgx=
+
+# Bad variable names which should always be refused, separated by a comma.
+bad-names=
+
+# Naming style matching correct class attribute names.
+class-attribute-naming-style=any
+
+# Regular expression matching correct class attribute names. Overrides class-
+# attribute-naming-style.
+#class-attribute-rgx=
+
+# Naming style matching correct class names.
+class-naming-style=PascalCase
+
+# Regular expression matching correct class names. Overrides class-naming-
+# style.
+#class-rgx=
+
+# Naming style matching correct constant names.
+const-naming-style=UPPER_CASE
+
+# Regular expression matching correct constant names. Overrides const-naming-
+# style.
+#const-rgx=
+
+# Minimum line length for functions/classes that require docstrings, shorter
+# ones are exempt.
+docstring-min-length=-1
+
+# Naming style matching correct function names.
+function-naming-style=snake_case
+
+# Regular expression matching correct function names. Overrides function-
+# naming-style.
+#function-rgx=
+
+# Good variable names which should always be accepted, separated by a comma.
+good-names=i,
+ j,
+ k,
+ x,
+ ex,
+ Run,
+ _
+
+# Include a hint for the correct naming format with invalid-name.
+include-naming-hint=no
+
+# Naming style matching correct inline iteration names.
+inlinevar-naming-style=any
+
+# Regular expression matching correct inline iteration names. Overrides
+# inlinevar-naming-style.
+#inlinevar-rgx=
+
+# Naming style matching correct method names.
+method-naming-style=snake_case
+
+# Regular expression matching correct method names. Overrides method-naming-
+# style.
+#method-rgx=
+
+# Naming style matching correct module names.
+module-naming-style=snake_case
+
+# Regular expression matching correct module names. Overrides module-naming-
+# style.
+#module-rgx=
+
+# Colon-delimited sets of names that determine each other's naming style when
+# the name regexes allow several styles.
+name-group=
+
+# Regular expression which should only match function or class names that do
+# not require a docstring.
+no-docstring-rgx=^_
+
+# List of decorators that produce properties, such as abc.abstractproperty. Add
+# to this list to register other decorators that produce valid properties.
+# These decorators are taken in consideration only for invalid-name.
+property-classes=abc.abstractproperty
+
+# Naming style matching correct variable names.
+variable-naming-style=snake_case
+
+# Regular expression matching correct variable names. Overrides variable-
+# naming-style.
+variable-rgx=[a-z_][a-z0-9_]{0,30}$
+
+
+[STRING]
+
+# This flag controls whether the implicit-str-concat-in-sequence should
+# generate a warning on implicit string concatenation in sequences defined over
+# several lines.
+check-str-concat-over-line-jumps=no
+
+
+[IMPORTS]
+
+# Allow wildcard imports from modules that define __all__.
+allow-wildcard-with-all=no
+
+# Analyse import fallback blocks. This can be used to support both Python 2 and
+# 3 compatible code, which means that the block might have code that exists
+# only in one or another interpreter, leading to false positives when analysed.
+analyse-fallback-blocks=no
+
+# Deprecated modules which should not be used, separated by a comma.
+deprecated-modules=optparse,tkinter.tix
+
+# Create a graph of external dependencies in the given file (report RP0402 must
+# not be disabled).
+ext-import-graph=
+
+# Create a graph of every (i.e. internal and external) dependencies in the
+# given file (report RP0402 must not be disabled).
+import-graph=
+
+# Create a graph of internal dependencies in the given file (report RP0402 must
+# not be disabled).
+int-import-graph=
+
+# Force import order to recognize a module as part of the standard
+# compatibility libraries.
+known-standard-library=
+
+# Force import order to recognize a module as part of a third party library.
+known-third-party=enchant
+
+
+[CLASSES]
+
+# List of method names used to declare (i.e. assign) instance attributes.
+defining-attr-methods=__init__,
+ __new__,
+ setUp
+
+# List of member names, which should be excluded from the protected access
+# warning.
+exclude-protected=_asdict,
+ _fields,
+ _replace,
+ _source,
+ _make
+
+# List of valid names for the first argument in a class method.
+valid-classmethod-first-arg=cls
+
+# List of valid names for the first argument in a metaclass class method.
+valid-metaclass-classmethod-first-arg=cls
+
+
+[DESIGN]
+
+# Maximum number of arguments for function / method.
+max-args=5
+
+# Maximum number of attributes for a class (see R0902).
+max-attributes=7
+
+# Maximum number of boolean expressions in an if statement.
+max-bool-expr=5
+
+# Maximum number of branches for function / method body.
+max-branches=12
+
+# Maximum number of locals for function / method body.
+max-locals=15
+
+# Maximum number of parents for a class (see R0901).
+max-parents=15
+
+# Maximum number of public methods for a class (see R0904).
+max-public-methods=20
+
+# Maximum number of return / yield for function / method body.
+max-returns=6
+
+# Maximum number of statements in function / method body.
+max-statements=50
+
+# Minimum number of public methods for a class (see R0903).
+min-public-methods=2
+
+
+[EXCEPTIONS]
+
+# Exceptions that will emit a warning when being caught. Defaults to
+# "BaseException, Exception".
+overgeneral-exceptions=BaseException,
+ Exception
diff --git a/.readthedocs.yml b/.readthedocs.yml
new file mode 100644
index 0000000000000000000000000000000000000000..946d363cff24913f01fffef6b5a2e868f99ad14b
--- /dev/null
+++ b/.readthedocs.yml
@@ -0,0 +1,18 @@
+# .readthedocs.yml
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+# Required
+version: 2
+
+# Build documentation in the docs/ directory with Sphinx
+sphinx:
+ builder: html
+ configuration: docs/source/conf.py
+
+# Optionally set the version of Python and requirements required to build your docs
+python:
+ version: 3.7
+ install:
+ - requirements: docs/requirements.txt
+ - requirements: requirements.txt
\ No newline at end of file
diff --git a/CITATION.cff b/CITATION.cff
new file mode 100644
index 0000000000000000000000000000000000000000..6b0c8f19af1b37607c3994abe28b8d362cbcb564
--- /dev/null
+++ b/CITATION.cff
@@ -0,0 +1,20 @@
+cff-version: 1.2.0
+message: "If you want to cite 🐸💬, feel free to use this (but only if you loved it 😊)"
+title: "Coqui TTS"
+abstract: "A deep learning toolkit for Text-to-Speech, battle-tested in research and production"
+date-released: 2021-01-01
+authors:
+ - family-names: "Eren"
+ given-names: "Gölge"
+ - name: "The Coqui TTS Team"
+version: 1.4
+doi: 10.5281/zenodo.6334862
+license: "MPL-2.0"
+url: "https://www.coqui.ai"
+repository-code: "https://github.com/coqui-ai/TTS"
+keywords:
+ - machine learning
+ - deep learning
+ - artificial intelligence
+ - text to speech
+ - TTS
\ No newline at end of file
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
new file mode 100644
index 0000000000000000000000000000000000000000..b80639d63c29e902c547de347806651bcc9ad3b2
--- /dev/null
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,133 @@
+
+# Contributor Covenant Code of Conduct
+
+## Our Pledge
+
+We as members, contributors, and leaders pledge to make participation in our
+community a harassment-free experience for everyone, regardless of age, body
+size, visible or invisible disability, ethnicity, sex characteristics, gender
+identity and expression, level of experience, education, socio-economic status,
+nationality, personal appearance, race, caste, color, religion, or sexual identity
+and orientation.
+
+We pledge to act and interact in ways that contribute to an open, welcoming,
+diverse, inclusive, and healthy community.
+
+## Our Standards
+
+Examples of behavior that contributes to a positive environment for our
+community include:
+
+* Demonstrating empathy and kindness toward other people
+* Being respectful of differing opinions, viewpoints, and experiences
+* Giving and gracefully accepting constructive feedback
+* Accepting responsibility and apologizing to those affected by our mistakes,
+ and learning from the experience
+* Focusing on what is best not just for us as individuals, but for the
+ overall community
+
+Examples of unacceptable behavior include:
+
+* The use of sexualized language or imagery, and sexual attention or
+ advances of any kind
+* Trolling, insulting or derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or email
+ address, without their explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+ professional setting
+
+## Enforcement Responsibilities
+
+Community leaders are responsible for clarifying and enforcing our standards of
+acceptable behavior and will take appropriate and fair corrective action in
+response to any behavior that they deem inappropriate, threatening, offensive,
+or harmful.
+
+Community leaders have the right and responsibility to remove, edit, or reject
+comments, commits, code, wiki edits, issues, and other contributions that are
+not aligned to this Code of Conduct, and will communicate reasons for moderation
+decisions when appropriate.
+
+## Scope
+
+This Code of Conduct applies within all community spaces, and also applies when
+an individual is officially representing the community in public spaces.
+Examples of representing our community include using an official e-mail address,
+posting via an official social media account, or acting as an appointed
+representative at an online or offline event.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported to the community leaders responsible for enforcement at
+coc-report@coqui.ai.
+All complaints will be reviewed and investigated promptly and fairly.
+
+All community leaders are obligated to respect the privacy and security of the
+reporter of any incident.
+
+## Enforcement Guidelines
+
+Community leaders will follow these Community Impact Guidelines in determining
+the consequences for any action they deem in violation of this Code of Conduct:
+
+### 1. Correction
+
+**Community Impact**: Use of inappropriate language or other behavior deemed
+unprofessional or unwelcome in the community.
+
+**Consequence**: A private, written warning from community leaders, providing
+clarity around the nature of the violation and an explanation of why the
+behavior was inappropriate. A public apology may be requested.
+
+### 2. Warning
+
+**Community Impact**: A violation through a single incident or series
+of actions.
+
+**Consequence**: A warning with consequences for continued behavior. No
+interaction with the people involved, including unsolicited interaction with
+those enforcing the Code of Conduct, for a specified period of time. This
+includes avoiding interactions in community spaces as well as external channels
+like social media. Violating these terms may lead to a temporary or
+permanent ban.
+
+### 3. Temporary Ban
+
+**Community Impact**: A serious violation of community standards, including
+sustained inappropriate behavior.
+
+**Consequence**: A temporary ban from any sort of interaction or public
+communication with the community for a specified period of time. No public or
+private interaction with the people involved, including unsolicited interaction
+with those enforcing the Code of Conduct, is allowed during this period.
+Violating these terms may lead to a permanent ban.
+
+### 4. Permanent Ban
+
+**Community Impact**: Demonstrating a pattern of violation of community
+standards, including sustained inappropriate behavior, harassment of an
+individual, or aggression toward or disparagement of classes of individuals.
+
+**Consequence**: A permanent ban from any sort of public interaction within
+the community.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage],
+version 2.0, available at
+[https://www.contributor-covenant.org/version/2/0/code_of_conduct.html][v2.0].
+
+Community Impact Guidelines were inspired by
+[Mozilla's code of conduct enforcement ladder][Mozilla CoC].
+
+For answers to common questions about this code of conduct, see the FAQ at
+[https://www.contributor-covenant.org/faq][FAQ]. Translations are available
+at [https://www.contributor-covenant.org/translations][translations].
+
+[homepage]: https://www.contributor-covenant.org
+[v2.0]: https://www.contributor-covenant.org/version/2/0/code_of_conduct.html
+[Mozilla CoC]: https://github.com/mozilla/diversity
+[FAQ]: https://www.contributor-covenant.org/faq
+[translations]: https://www.contributor-covenant.org/translations
diff --git a/CODE_OWNERS.rst b/CODE_OWNERS.rst
new file mode 100644
index 0000000000000000000000000000000000000000..768b573911eae8aeb229de6f56039deb9a64ce27
--- /dev/null
+++ b/CODE_OWNERS.rst
@@ -0,0 +1,75 @@
+TTS code owners / governance system
+==========================================
+
+TTS is run under a governance system inspired by (and partially copied from) the `Mozilla module ownership system <https://www.mozilla.org/about/governance/policies/module-ownership/>`_. The project is roughly divided into modules, and each module has its owners, which are responsible for reviewing pull requests and deciding on technical direction for their modules. Module ownership authority is given to people who have worked extensively on areas of the project.
+
+Module owners also have the authority of naming other module owners or appointing module peers, which are people with authority to review pull requests in that module. They can also sub-divide their module into sub-modules with their owners.
+
+Module owners are not tyrants. They are chartered to make decisions with input from the community and in the best interest of the community. Module owners are not required to make code changes or additions solely because the community wants them to do so. (Like anyone else, the module owners may write code because they want to, because their employers want them to, because the community wants them to, or for some other reason.) Module owners do need to pay attention to patches submitted to that module. However “pay attention” does not mean agreeing to every patch. Some patches may not make sense for the TTS project; some may be poorly implemented. Module owners have the authority to decline a patch; this is a necessary part of the role. We ask the module owners to describe in the relevant issue their reasons for wanting changes to a patch, for declining it altogether, or for postponing review for some period. We don’t ask or expect them to rewrite patches to make them acceptable. Similarly, module owners may need to delay review of a promising patch due to an upcoming deadline. For example, a patch may be of interest, but not for the next milestone. In such a case it may make sense for the module owner to postpone review of a patch until after matters needed for a milestone have been finalized. Again, we expect this to be described in the relevant issue. And of course, it shouldn’t go on very often or for very long or escalation and review is likely.
+
+The work of the various module owners and peers is overseen by the global owners, who are responsible for making final decisions in case of conflict between owners as well as setting the direction for the project as a whole.
+
+This file describes module owners who are active on the project and which parts of the code they have expertise on (and interest in). If you're making changes to the code and are wondering who's an appropriate person to talk to, this list will tell you who to ping.
+
+There's overlap in the areas of expertise of each owner, and in particular when looking at which files are covered by each area, there is a lot of overlap. Don't worry about getting it exactly right when requesting review, any code owner will be happy to redirect the request to a more appropriate person.
+
+Global owners
+----------------
+
+These are people who have worked on the project extensively and are familiar with all or most parts of it. Their expertise and review guidance is trusted by other code owners to cover their own areas of expertise. In case of conflicting opinions from other owners, global owners will make a final decision.
+
+- Eren Gölge (@erogol)
+- Reuben Morais (@reuben)
+
+Training, feeding
+-----------------
+
+- Eren Gölge (@erogol)
+
+Model exporting
+---------------
+
+- Eren Gölge (@erogol)
+
+Multi-Speaker TTS
+-----------------
+
+- Eren Gölge (@erogol)
+- Edresson Casanova (@edresson)
+
+TTS
+---
+
+- Eren Gölge (@erogol)
+
+Vocoders
+--------
+
+- Eren Gölge (@erogol)
+
+Speaker Encoder
+---------------
+
+- Eren Gölge (@erogol)
+
+Testing & CI
+------------
+
+- Eren Gölge (@erogol)
+- Reuben Morais (@reuben)
+
+Python bindings
+---------------
+
+- Eren Gölge (@erogol)
+- Reuben Morais (@reuben)
+
+Documentation
+-------------
+
+- Eren Gölge (@erogol)
+
+Third party bindings
+--------------------
+
+Owned by the author.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000000000000000000000000000000000000..81a426e823c6ccc1f9987b9260a270c3e143500d
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,136 @@
+# Contribution guidelines
+
+Welcome to the 🐸TTS!
+
+This repository is governed by [the Contributor Covenant Code of Conduct](https://github.com/coqui-ai/TTS/blob/main/CODE_OF_CONDUCT.md).
+
+## Where to start.
+We welcome everyone who likes to contribute to 🐸TTS.
+
+You can contribute not only with code but with bug reports, comments, questions, answers, or just a simple tweet to spread the word.
+
+If you'd like to contribute code or squash a bug but don't know where to start, here are some pointers.
+
+- [Development Road Map](https://github.com/coqui-ai/TTS/issues/378)
+
+ You can pick something out of our road map. We keep track of the progress of the project in this simple issue thread. It has new model proposals, developmental updates, etc.
+
+- [Github Issues Tracker](https://github.com/coqui-ai/TTS/issues)
+
+ This is a place to find feature requests, bugs.
+
+ Issues with the ```good first issue``` tag are a good place for beginners to start.
+
+- ✨**PR**✨ [pages](https://github.com/coqui-ai/TTS/pulls) with the ```🚀new version``` tag.
+
+ We list all the target improvements for the next version. You can pick one of them and start contributing.
+
+- Also feel free to suggest new features, ideas and models. We're always open for new things.
+
+## Call for sharing language models
+If possible, please consider sharing your pre-trained models in any language (if the licences allow for you to do so). We will include them in our model catalogue for public use and give the proper attribution, whether it be your name, company, website or any other source specified.
+
+This model can be shared in two ways:
+1. Share the model files with us and we serve them with the next 🐸 TTS release.
+2. Upload your models on GDrive and share the link.
+
+Models are served under `.models.json` file and any model is available under TTS CLI or Server end points.
+
+Either way you choose, please make sure you send the models [here](https://github.com/coqui-ai/TTS/issues/380).
+
+## Sending a ✨**PR**✨
+
+If you have a new feature, a model to implement, or a bug to squash, go ahead and send a ✨**PR**✨.
+Please use the following steps to send a ✨**PR**✨.
+Let us know if you encounter a problem along the way.
+
+The following steps are tested on an Ubuntu system.
+
+1. Fork [🐸TTS](https://github.com/coqui-ai/TTS) by clicking the fork button at the top right corner of the project page.
+
+2. Clone 🐸TTS and add the main repo as a new remote named ```upstream```.
+
+ ```bash
+ $ git clone git@github.com:<your-github-username>/TTS.git
+ $ cd TTS
+ $ git remote add upstream https://github.com/coqui-ai/TTS.git
+ ```
+
+3. Install 🐸TTS for development.
+
+ ```bash
+ $ make system-deps # intended to be used on Ubuntu (Debian). Let us know if you have a different OS.
+ $ make install
+ ```
+
+4. Create a new branch with an informative name for your goal.
+
+ ```bash
+ $ git checkout -b an_informative_name_for_my_branch
+ ```
+
+5. Implement your changes on your new branch.
+
+6. Explain your code using [Google Style](https://google.github.io/styleguide/pyguide.html#381-docstrings) docstrings.
+
+7. Add your tests to our test suite under ```tests``` folder. It is important to show that your code works, edge cases are considered, and inform others about the intended use.
+
+8. Run the tests to see how your updates work with the rest of the project. You can repeat this step multiple times as you implement your changes to make sure you are on the right track.
+
+ ```bash
+ $ make test # stop at the first error
+ $ make test_all # run all the tests, report all the errors
+ ```
+
+9. Format your code. We use ```black``` for code and ```isort``` for ```import``` formatting.
+
+ ```bash
+ $ make style
+ ```
+
+10. Run the linter and correct the issues raised. We use ```pylint``` for linting. It helps to enforce a coding standard, offers simple refactoring suggestions.
+
+ ```bash
+ $ make lint
+ ```
+
+11. When things are good, add new files and commit your changes.
+
+ ```bash
+ $ git add my_file1.py my_file2.py ...
+ $ git commit
+ ```
+
+ It's a good practice to regularly sync your local copy of the project with the upstream code to keep up with the recent updates.
+
+ ```bash
+ $ git fetch upstream
+ $ git rebase upstream/master
+ # or for the development version
+ $ git rebase upstream/dev
+ ```
+
+12. Send a PR to ```dev``` branch.
+
+ Push your branch to your fork.
+
+ ```bash
+ $ git push -u origin an_informative_name_for_my_branch
+ ```
+
+ Then go to your fork's Github page and click on 'Pull request' to send your ✨**PR**✨.
+
+ Please set ✨**PR**✨'s target branch to ```dev``` as we use ```dev``` to work on the next version.
+
+13. Let's discuss until it is perfect. 💪
+
+ We might ask you for certain changes that would appear in the ✨**PR**✨'s page under [🐸TTS](https://github.com/coqui-ai/TTS/pulls).
+
+14. Once things look perfect, we merge it to the ```dev``` branch and make it ready for the next version.
+
+Feel free to ping us at any step you need help using our communication channels.
+
+If you are new to Github or open-source contribution, these are good resources.
+
+- [Github Docs](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/proposing-changes-to-your-work-with-pull-requests)
+- [First-Contribution](https://github.com/firstcontributions/first-contributions)
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..7182798e7b359883bdfd5f38ef4ff0ac2de96c71
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,12 @@
+ARG BASE=nvidia/cuda:11.8.0-base-ubuntu22.04
+FROM ${BASE}
+RUN apt-get update && apt-get upgrade -y
+RUN apt-get install -y --no-install-recommends gcc g++ make python3 python3-dev python3-pip python3-venv python3-wheel espeak-ng libsndfile1-dev && rm -rf /var/lib/apt/lists/*
+RUN pip3 install llvmlite --ignore-installed
+
+WORKDIR /root
+COPY . /root
+RUN pip3 install torch torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
+RUN make install
+ENTRYPOINT ["tts"]
+CMD ["--help"]
diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 0000000000000000000000000000000000000000..14e2f777f6c395e7e04ab4aa306bbcc4b0c1120e
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,373 @@
+Mozilla Public License Version 2.0
+==================================
+
+1. Definitions
+--------------
+
+1.1. "Contributor"
+ means each individual or legal entity that creates, contributes to
+ the creation of, or owns Covered Software.
+
+1.2. "Contributor Version"
+ means the combination of the Contributions of others (if any) used
+ by a Contributor and that particular Contributor's Contribution.
+
+1.3. "Contribution"
+ means Covered Software of a particular Contributor.
+
+1.4. "Covered Software"
+ means Source Code Form to which the initial Contributor has attached
+ the notice in Exhibit A, the Executable Form of such Source Code
+ Form, and Modifications of such Source Code Form, in each case
+ including portions thereof.
+
+1.5. "Incompatible With Secondary Licenses"
+ means
+
+ (a) that the initial Contributor has attached the notice described
+ in Exhibit B to the Covered Software; or
+
+ (b) that the Covered Software was made available under the terms of
+ version 1.1 or earlier of the License, but not also under the
+ terms of a Secondary License.
+
+1.6. "Executable Form"
+ means any form of the work other than Source Code Form.
+
+1.7. "Larger Work"
+ means a work that combines Covered Software with other material, in
+ a separate file or files, that is not Covered Software.
+
+1.8. "License"
+ means this document.
+
+1.9. "Licensable"
+ means having the right to grant, to the maximum extent possible,
+ whether at the time of the initial grant or subsequently, any and
+ all of the rights conveyed by this License.
+
+1.10. "Modifications"
+ means any of the following:
+
+ (a) any file in Source Code Form that results from an addition to,
+ deletion from, or modification of the contents of Covered
+ Software; or
+
+ (b) any new file in Source Code Form that contains any Covered
+ Software.
+
+1.11. "Patent Claims" of a Contributor
+ means any patent claim(s), including without limitation, method,
+ process, and apparatus claims, in any patent Licensable by such
+ Contributor that would be infringed, but for the grant of the
+ License, by the making, using, selling, offering for sale, having
+ made, import, or transfer of either its Contributions or its
+ Contributor Version.
+
+1.12. "Secondary License"
+ means either the GNU General Public License, Version 2.0, the GNU
+ Lesser General Public License, Version 2.1, the GNU Affero General
+ Public License, Version 3.0, or any later versions of those
+ licenses.
+
+1.13. "Source Code Form"
+ means the form of the work preferred for making modifications.
+
+1.14. "You" (or "Your")
+ means an individual or a legal entity exercising rights under this
+ License. For legal entities, "You" includes any entity that
+ controls, is controlled by, or is under common control with You. For
+ purposes of this definition, "control" means (a) the power, direct
+ or indirect, to cause the direction or management of such entity,
+ whether by contract or otherwise, or (b) ownership of more than
+ fifty percent (50%) of the outstanding shares or beneficial
+ ownership of such entity.
+
+2. License Grants and Conditions
+--------------------------------
+
+2.1. Grants
+
+Each Contributor hereby grants You a world-wide, royalty-free,
+non-exclusive license:
+
+(a) under intellectual property rights (other than patent or trademark)
+ Licensable by such Contributor to use, reproduce, make available,
+ modify, display, perform, distribute, and otherwise exploit its
+ Contributions, either on an unmodified basis, with Modifications, or
+ as part of a Larger Work; and
+
+(b) under Patent Claims of such Contributor to make, use, sell, offer
+ for sale, have made, import, and otherwise transfer either its
+ Contributions or its Contributor Version.
+
+2.2. Effective Date
+
+The licenses granted in Section 2.1 with respect to any Contribution
+become effective for each Contribution on the date the Contributor first
+distributes such Contribution.
+
+2.3. Limitations on Grant Scope
+
+The licenses granted in this Section 2 are the only rights granted under
+this License. No additional rights or licenses will be implied from the
+distribution or licensing of Covered Software under this License.
+Notwithstanding Section 2.1(b) above, no patent license is granted by a
+Contributor:
+
+(a) for any code that a Contributor has removed from Covered Software;
+ or
+
+(b) for infringements caused by: (i) Your and any other third party's
+ modifications of Covered Software, or (ii) the combination of its
+ Contributions with other software (except as part of its Contributor
+ Version); or
+
+(c) under Patent Claims infringed by Covered Software in the absence of
+ its Contributions.
+
+This License does not grant any rights in the trademarks, service marks,
+or logos of any Contributor (except as may be necessary to comply with
+the notice requirements in Section 3.4).
+
+2.4. Subsequent Licenses
+
+No Contributor makes additional grants as a result of Your choice to
+distribute the Covered Software under a subsequent version of this
+License (see Section 10.2) or under the terms of a Secondary License (if
+permitted under the terms of Section 3.3).
+
+2.5. Representation
+
+Each Contributor represents that the Contributor believes its
+Contributions are its original creation(s) or it has sufficient rights
+to grant the rights to its Contributions conveyed by this License.
+
+2.6. Fair Use
+
+This License is not intended to limit any rights You have under
+applicable copyright doctrines of fair use, fair dealing, or other
+equivalents.
+
+2.7. Conditions
+
+Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted
+in Section 2.1.
+
+3. Responsibilities
+-------------------
+
+3.1. Distribution of Source Form
+
+All distribution of Covered Software in Source Code Form, including any
+Modifications that You create or to which You contribute, must be under
+the terms of this License. You must inform recipients that the Source
+Code Form of the Covered Software is governed by the terms of this
+License, and how they can obtain a copy of this License. You may not
+attempt to alter or restrict the recipients' rights in the Source Code
+Form.
+
+3.2. Distribution of Executable Form
+
+If You distribute Covered Software in Executable Form then:
+
+(a) such Covered Software must also be made available in Source Code
+ Form, as described in Section 3.1, and You must inform recipients of
+ the Executable Form how they can obtain a copy of such Source Code
+ Form by reasonable means in a timely manner, at a charge no more
+ than the cost of distribution to the recipient; and
+
+(b) You may distribute such Executable Form under the terms of this
+ License, or sublicense it under different terms, provided that the
+ license for the Executable Form does not attempt to limit or alter
+ the recipients' rights in the Source Code Form under this License.
+
+3.3. Distribution of a Larger Work
+
+You may create and distribute a Larger Work under terms of Your choice,
+provided that You also comply with the requirements of this License for
+the Covered Software. If the Larger Work is a combination of Covered
+Software with a work governed by one or more Secondary Licenses, and the
+Covered Software is not Incompatible With Secondary Licenses, this
+License permits You to additionally distribute such Covered Software
+under the terms of such Secondary License(s), so that the recipient of
+the Larger Work may, at their option, further distribute the Covered
+Software under the terms of either this License or such Secondary
+License(s).
+
+3.4. Notices
+
+You may not remove or alter the substance of any license notices
+(including copyright notices, patent notices, disclaimers of warranty,
+or limitations of liability) contained within the Source Code Form of
+the Covered Software, except that You may alter any license notices to
+the extent required to remedy known factual inaccuracies.
+
+3.5. Application of Additional Terms
+
+You may choose to offer, and to charge a fee for, warranty, support,
+indemnity or liability obligations to one or more recipients of Covered
+Software. However, You may do so only on Your own behalf, and not on
+behalf of any Contributor. You must make it absolutely clear that any
+such warranty, support, indemnity, or liability obligation is offered by
+You alone, and You hereby agree to indemnify every Contributor for any
+liability incurred by such Contributor as a result of warranty, support,
+indemnity or liability terms You offer. You may include additional
+disclaimers of warranty and limitations of liability specific to any
+jurisdiction.
+
+4. Inability to Comply Due to Statute or Regulation
+---------------------------------------------------
+
+If it is impossible for You to comply with any of the terms of this
+License with respect to some or all of the Covered Software due to
+statute, judicial order, or regulation then You must: (a) comply with
+the terms of this License to the maximum extent possible; and (b)
+describe the limitations and the code they affect. Such description must
+be placed in a text file included with all distributions of the Covered
+Software under this License. Except to the extent prohibited by statute
+or regulation, such description must be sufficiently detailed for a
+recipient of ordinary skill to be able to understand it.
+
+5. Termination
+--------------
+
+5.1. The rights granted under this License will terminate automatically
+if You fail to comply with any of its terms. However, if You become
+compliant, then the rights granted under this License from a particular
+Contributor are reinstated (a) provisionally, unless and until such
+Contributor explicitly and finally terminates Your grants, and (b) on an
+ongoing basis, if such Contributor fails to notify You of the
+non-compliance by some reasonable means prior to 60 days after You have
+come back into compliance. Moreover, Your grants from a particular
+Contributor are reinstated on an ongoing basis if such Contributor
+notifies You of the non-compliance by some reasonable means, this is the
+first time You have received notice of non-compliance with this License
+from such Contributor, and You become compliant prior to 30 days after
+Your receipt of the notice.
+
+5.2. If You initiate litigation against any entity by asserting a patent
+infringement claim (excluding declaratory judgment actions,
+counter-claims, and cross-claims) alleging that a Contributor Version
+directly or indirectly infringes any patent, then the rights granted to
+You by any and all Contributors for the Covered Software under Section
+2.1 of this License shall terminate.
+
+5.3. In the event of termination under Sections 5.1 or 5.2 above, all
+end user license agreements (excluding distributors and resellers) which
+have been validly granted by You or Your distributors under this License
+prior to termination shall survive termination.
+
+************************************************************************
+* *
+* 6. Disclaimer of Warranty *
+* ------------------------- *
+* *
+* Covered Software is provided under this License on an "as is" *
+* basis, without warranty of any kind, either expressed, implied, or *
+* statutory, including, without limitation, warranties that the *
+* Covered Software is free of defects, merchantable, fit for a *
+* particular purpose or non-infringing. The entire risk as to the *
+* quality and performance of the Covered Software is with You. *
+* Should any Covered Software prove defective in any respect, You *
+* (not any Contributor) assume the cost of any necessary servicing, *
+* repair, or correction. This disclaimer of warranty constitutes an *
+* essential part of this License. No use of any Covered Software is *
+* authorized under this License except under this disclaimer. *
+* *
+************************************************************************
+
+************************************************************************
+* *
+* 7. Limitation of Liability *
+* -------------------------- *
+* *
+* Under no circumstances and under no legal theory, whether tort *
+* (including negligence), contract, or otherwise, shall any *
+* Contributor, or anyone who distributes Covered Software as *
+* permitted above, be liable to You for any direct, indirect, *
+* special, incidental, or consequential damages of any character *
+* including, without limitation, damages for lost profits, loss of *
+* goodwill, work stoppage, computer failure or malfunction, or any *
+* and all other commercial damages or losses, even if such party *
+* shall have been informed of the possibility of such damages. This *
+* limitation of liability shall not apply to liability for death or *
+* personal injury resulting from such party's negligence to the *
+* extent applicable law prohibits such limitation. Some *
+* jurisdictions do not allow the exclusion or limitation of *
+* incidental or consequential damages, so this exclusion and *
+* limitation may not apply to You. *
+* *
+************************************************************************
+
+8. Litigation
+-------------
+
+Any litigation relating to this License may be brought only in the
+courts of a jurisdiction where the defendant maintains its principal
+place of business and such litigation shall be governed by laws of that
+jurisdiction, without reference to its conflict-of-law provisions.
+Nothing in this Section shall prevent a party's ability to bring
+cross-claims or counter-claims.
+
+9. Miscellaneous
+----------------
+
+This License represents the complete agreement concerning the subject
+matter hereof. If any provision of this License is held to be
+unenforceable, such provision shall be reformed only to the extent
+necessary to make it enforceable. Any law or regulation which provides
+that the language of a contract shall be construed against the drafter
+shall not be used to construe this License against a Contributor.
+
+10. Versions of the License
+---------------------------
+
+10.1. New Versions
+
+Mozilla Foundation is the license steward. Except as provided in Section
+10.3, no one other than the license steward has the right to modify or
+publish new versions of this License. Each version will be given a
+distinguishing version number.
+
+10.2. Effect of New Versions
+
+You may distribute the Covered Software under the terms of the version
+of the License under which You originally received the Covered Software,
+or under the terms of any subsequent version published by the license
+steward.
+
+10.3. Modified Versions
+
+If you create software not governed by this License, and you want to
+create a new license for such software, you may create and use a
+modified version of this License if you rename the license and remove
+any references to the name of the license steward (except to note that
+such modified license differs from this License).
+
+10.4. Distributing Source Code Form that is Incompatible With Secondary
+Licenses
+
+If You choose to distribute Source Code Form that is Incompatible With
+Secondary Licenses under the terms of this version of the License, the
+notice described in Exhibit B of this License must be attached.
+
+Exhibit A - Source Code Form License Notice
+-------------------------------------------
+
+ This Source Code Form is subject to the terms of the Mozilla Public
+ License, v. 2.0. If a copy of the MPL was not distributed with this
+ file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+If it is not possible or desirable to put the notice in a particular
+file, then You may include the notice in a location (such as a LICENSE
+file in a relevant directory) where a recipient would be likely to look
+for such a notice.
+
+You may add additional accurate notices of copyright ownership.
+
+Exhibit B - "Incompatible With Secondary Licenses" Notice
+---------------------------------------------------------
+
+ This Source Code Form is "Incompatible With Secondary Licenses", as
+ defined by the Mozilla Public License, v. 2.0.
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000000000000000000000000000000000000..321d3999c185a326a9d300451a3e732e4225f2e6
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,15 @@
+include README.md
+include LICENSE.txt
+include requirements.*.txt
+include *.cff
+include requirements.txt
+include TTS/VERSION
+recursive-include TTS *.json
+recursive-include TTS *.html
+recursive-include TTS *.png
+recursive-include TTS *.md
+recursive-include TTS *.py
+recursive-include TTS *.pyx
+recursive-include images *.png
+recursive-exclude tests *
+prune tests*
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..7adea3a1fe8a61b12da0356cc1f57c2564b38570
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,72 @@
+.DEFAULT_GOAL := help
+.PHONY: test system-deps dev-deps deps style lint install help docs
+
+help:
+ @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
+
+target_dirs := tests TTS notebooks recipes
+
+test_all: ## run tests and don't stop on an error.
+ nose2 --with-coverage --coverage TTS tests
+ ./run_bash_tests.sh
+
+test: ## run tests.
+ nose2 -F -v -B --with-coverage --coverage TTS tests
+
+test_vocoder: ## run vocoder tests.
+ nose2 -F -v -B --with-coverage --coverage TTS tests.vocoder_tests
+
+test_tts: ## run tts tests.
+ nose2 -F -v -B --with-coverage --coverage TTS tests.tts_tests
+
+test_aux: ## run aux tests.
+ nose2 -F -v -B --with-coverage --coverage TTS tests.aux_tests
+ ./run_bash_tests.sh
+
+test_zoo: ## run zoo tests.
+ nose2 -F -v -B --with-coverage --coverage TTS tests.zoo_tests
+
+inference_tests: ## run inference tests.
+ nose2 -F -v -B --with-coverage --coverage TTS tests.inference_tests
+
+data_tests: ## run data tests.
+ nose2 -F -v -B --with-coverage --coverage TTS tests.data_tests
+
+test_text: ## run text tests.
+ nose2 -F -v -B --with-coverage --coverage TTS tests.text_tests
+
+test_failed: ## only run tests failed the last time.
+ nose2 -F -v -B --with-coverage --coverage TTS tests
+
+style: ## update code style.
+ black ${target_dirs}
+ isort ${target_dirs}
+
+lint: ## run pylint linter.
+ pylint ${target_dirs}
+ black ${target_dirs} --check
+ isort ${target_dirs} --check-only
+
+system-deps: ## install linux system deps
+ sudo apt-get install -y libsndfile1-dev
+
+dev-deps: ## install development deps
+ pip install -r requirements.dev.txt
+
+doc-deps: ## install docs dependencies
+ pip install -r docs/requirements.txt
+
+build-docs: ## build the docs
+ cd docs && make clean && make build
+
+hub-deps: ## install deps for torch hub use
+ pip install -r requirements.hub.txt
+
+deps: ## install 🐸 requirements.
+ pip install -r requirements.txt
+
+install: ## install 🐸 TTS for development.
+ pip install -e .[all]
+
+docs: ## build the docs
+ $(MAKE) -C docs clean && $(MAKE) -C docs html
diff --git a/README.md b/README.md
index 154df8298fab5ecf322016157858e08cd1bccbe1..baa529124e5995ca95bfdc8b30483c4ff6d1ab0c 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,343 @@
----
-license: apache-2.0
----
+
+
+----
+
+### 📣 Clone your voice with a single click on [🐸Coqui.ai](https://app.coqui.ai/auth/signin)
+
+----
+
+🐸TTS is a library for advanced Text-to-Speech generation. It's built on the latest research and was designed to achieve the best trade-off among ease-of-training, speed and quality.
+🐸TTS comes with pretrained models and tools for measuring dataset quality, and is already used in **20+ languages** for products and research projects.
+
+[![Discord](https://img.shields.io/discord/1037326658807533628?color=%239B59B6&label=chat%20on%20discord)](https://discord.gg/5eXr5seRrv)
+[![License](https://img.shields.io/badge/License-MPL%202.0-brightgreen.svg)](https://opensource.org/licenses/MPL-2.0)
+[![PyPI version](https://badge.fury.io/py/TTS.svg)](https://badge.fury.io/py/TTS)
+[![Covenant](https://camo.githubusercontent.com/7d620efaa3eac1c5b060ece5d6aacfcc8b81a74a04d05cd0398689c01c4463bb/68747470733a2f2f696d672e736869656c64732e696f2f62616467652f436f6e7472696275746f72253230436f76656e616e742d76322e3025323061646f707465642d6666363962342e737667)](https://github.com/coqui-ai/TTS/blob/master/CODE_OF_CONDUCT.md)
+[![Downloads](https://pepy.tech/badge/tts)](https://pepy.tech/project/tts)
+[![DOI](https://zenodo.org/badge/265612440.svg)](https://zenodo.org/badge/latestdoi/265612440)
+
+![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/aux_tests.yml/badge.svg)
+![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/data_tests.yml/badge.svg)
+![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/docker.yaml/badge.svg)
+![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/inference_tests.yml/badge.svg)
+![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/style_check.yml/badge.svg)
+![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/text_tests.yml/badge.svg)
+![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/tts_tests.yml/badge.svg)
+![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/vocoder_tests.yml/badge.svg)
+![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/zoo_tests0.yml/badge.svg)
+![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/zoo_tests1.yml/badge.svg)
+![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/zoo_tests2.yml/badge.svg)
+[![Docs](https://readthedocs.org/projects/tts/badge/?version=latest&style=plastic)](https://tts.readthedocs.io/en/latest/)
+
+📰 [**Subscribe to 🐸Coqui.ai Newsletter**](https://coqui.ai/?subscription=true)
+
+📢 [English Voice Samples](https://erogol.github.io/ddc-samples/) and [SoundCloud playlist](https://soundcloud.com/user-565970875/pocket-article-wavernn-and-tacotron2)
+
+📄 [Text-to-Speech paper collection](https://github.com/erogol/TTS-papers)
+
+
+
+## 💬 Where to ask questions
+Please use our dedicated channels for questions and discussion. Help is much more valuable if it's shared publicly so that more people can benefit from it.
+
+| Type | Platforms |
+| ------------------------------- | --------------------------------------- |
+| 🚨 **Bug Reports** | [GitHub Issue Tracker] |
+| 🎁 **Feature Requests & Ideas** | [GitHub Issue Tracker] |
+| 👩💻 **Usage Questions** | [GitHub Discussions] |
+| 🗯 **General Discussion** | [GitHub Discussions] or [Discord] |
+
+[github issue tracker]: https://github.com/coqui-ai/tts/issues
+[github discussions]: https://github.com/coqui-ai/TTS/discussions
+[discord]: https://discord.gg/5eXr5seRrv
+[Tutorials and Examples]: https://github.com/coqui-ai/TTS/wiki/TTS-Notebooks-and-Tutorials
+
+
+## 🔗 Links and Resources
+| Type | Links |
+| ------------------------------- | --------------------------------------- |
+| 💼 **Documentation** | [ReadTheDocs](https://tts.readthedocs.io/en/latest/) |
+| 💾 **Installation** | [TTS/README.md](https://github.com/coqui-ai/TTS/tree/dev#install-tts)|
+| 👩💻 **Contributing** | [CONTRIBUTING.md](https://github.com/coqui-ai/TTS/blob/main/CONTRIBUTING.md)|
+| 📌 **Road Map** | [Main Development Plans](https://github.com/coqui-ai/TTS/issues/378) |
+| 🚀 **Released Models** | [TTS Releases](https://github.com/coqui-ai/TTS/releases) and [Experimental Models](https://github.com/coqui-ai/TTS/wiki/Experimental-Released-Models)|
+
+## 🥇 TTS Performance
+
+
+Underlined "TTS*" and "Judy*" are 🐸TTS models
+
+
+## Features
+- High-performance Deep Learning models for Text2Speech tasks.
+ - Text2Spec models (Tacotron, Tacotron2, Glow-TTS, SpeedySpeech).
+ - Speaker Encoder to compute speaker embeddings efficiently.
+ - Vocoder models (MelGAN, Multiband-MelGAN, GAN-TTS, ParallelWaveGAN, WaveGrad, WaveRNN)
+- Fast and efficient model training.
+- Detailed training logs on the terminal and Tensorboard.
+- Support for Multi-speaker TTS.
+- Efficient, flexible, lightweight but feature complete `Trainer API`.
+- Released and ready-to-use models.
+- Tools to curate Text2Speech datasets under ```dataset_analysis```.
+- Utilities to use and test your models.
+- Modular (but not too much) code base enabling easy implementation of new ideas.
+
+## Implemented Models
+### Spectrogram models
+- Tacotron: [paper](https://arxiv.org/abs/1703.10135)
+- Tacotron2: [paper](https://arxiv.org/abs/1712.05884)
+- Glow-TTS: [paper](https://arxiv.org/abs/2005.11129)
+- Speedy-Speech: [paper](https://arxiv.org/abs/2008.03802)
+- Align-TTS: [paper](https://arxiv.org/abs/2003.01950)
+- FastPitch: [paper](https://arxiv.org/pdf/2006.06873.pdf)
+- FastSpeech: [paper](https://arxiv.org/abs/1905.09263)
+- FastSpeech2: [paper](https://arxiv.org/abs/2006.04558)
+- SC-GlowTTS: [paper](https://arxiv.org/abs/2104.05557)
+- Capacitron: [paper](https://arxiv.org/abs/1906.03402)
+- OverFlow: [paper](https://arxiv.org/abs/2211.06892)
+- Neural HMM TTS: [paper](https://arxiv.org/abs/2108.13320)
+
+### End-to-End Models
+- VITS: [paper](https://arxiv.org/pdf/2106.06103)
+- YourTTS: [paper](https://arxiv.org/abs/2112.02418)
+
+### Attention Methods
+- Guided Attention: [paper](https://arxiv.org/abs/1710.08969)
+- Forward Backward Decoding: [paper](https://arxiv.org/abs/1907.09006)
+- Graves Attention: [paper](https://arxiv.org/abs/1910.10288)
+- Double Decoder Consistency: [blog](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/)
+- Dynamic Convolutional Attention: [paper](https://arxiv.org/pdf/1910.10288.pdf)
+- Alignment Network: [paper](https://arxiv.org/abs/2108.10447)
+
+### Speaker Encoder
+- GE2E: [paper](https://arxiv.org/abs/1710.10467)
+- Angular Loss: [paper](https://arxiv.org/pdf/2003.11982.pdf)
+
+### Vocoders
+- MelGAN: [paper](https://arxiv.org/abs/1910.06711)
+- MultiBandMelGAN: [paper](https://arxiv.org/abs/2005.05106)
+- ParallelWaveGAN: [paper](https://arxiv.org/abs/1910.11480)
+- GAN-TTS discriminators: [paper](https://arxiv.org/abs/1909.11646)
+- WaveRNN: [origin](https://github.com/fatchord/WaveRNN/)
+- WaveGrad: [paper](https://arxiv.org/abs/2009.00713)
+- HiFiGAN: [paper](https://arxiv.org/abs/2010.05646)
+- UnivNet: [paper](https://arxiv.org/abs/2106.07889)
+
+You can also help us implement more models.
+
+## Install TTS
+🐸TTS is tested on Ubuntu 18.04 with **python >= 3.7, < 3.11**.
+
+If you are only interested in [synthesizing speech](https://tts.readthedocs.io/en/latest/inference.html) with the released 🐸TTS models, installing from PyPI is the easiest option.
+
+```bash
+pip install TTS
+```
+
+If you plan to code or train models, clone 🐸TTS and install it locally.
+
+```bash
+git clone https://github.com/coqui-ai/TTS
+pip install -e .[all,dev,notebooks] # Select the relevant extras
+```
+
+If you are on Ubuntu (Debian), you can also run the following commands for installation.
+
+```bash
+$ make system-deps # intended to be used on Ubuntu (Debian). Let us know if you have a different OS.
+$ make install
+```
+
+If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](https://stackoverflow.com/questions/66726331/how-can-i-run-mozilla-tts-coqui-tts-training-with-cuda-on-a-windows-system).
+
+
+## Docker Image
+You can also try TTS with the Docker image.
+Simply run the following commands and you will be able to run TTS without installing it.
+
+```bash
+docker run --rm -it -p 5002:5002 --entrypoint /bin/bash ghcr.io/coqui-ai/tts-cpu
+python3 TTS/server/server.py --list_models #To get the list of available models
+python3 TTS/server/server.py --model_name tts_models/en/vctk/vits # To start a server
+```
+
+You can then enjoy the TTS server [here](http://[::1]:5002/)
+More details about the docker images (like GPU support) can be found [here](https://tts.readthedocs.io/en/latest/docker_images.html)
+
+
+## Synthesizing speech by 🐸TTS
+
+### 🐍 Python API
+
+```python
+from TTS.api import TTS
+
+# Running a multi-speaker and multi-lingual model
+
+# List available 🐸TTS models and choose the first one
+model_name = TTS.list_models()[0]
+# Init TTS
+tts = TTS(model_name)
+# Run TTS
+# ❗ Since this model is multi-speaker and multi-lingual, we must set the target speaker and the language
+# Text to speech with a numpy output
+wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0])
+# Text to speech to a file
+tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav")
+
+# Running a single speaker model
+
+# Init TTS with the target model name
+tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False)
+# Run TTS
+tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path=OUTPUT_PATH)
+
+# Example voice cloning with YourTTS in English, French and Portuguese:
+tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True)
+tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav")
+tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr-fr", file_path="output.wav")
+tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt-br", file_path="output.wav")
+
+
+# Example voice conversion converting speaker of the `source_wav` to the speaker of the `target_wav`
+
+tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False, gpu=True)
+tts.voice_conversion_to_file(source_wav="my/source.wav", target_wav="my/target.wav", file_path="output.wav")
+
+# Example voice cloning by a single speaker TTS model combining with the voice conversion model. This way, you can
+# clone voices by using any model in 🐸TTS.
+
+tts = TTS("tts_models/de/thorsten/tacotron2-DDC")
+tts.tts_with_vc_to_file(
+ "Wie sage ich auf Italienisch, dass ich dich liebe?",
+ speaker_wav="target/speaker.wav",
+ file_path="output.wav"
+)
+
+# Example text to speech using [🐸Coqui Studio](https://coqui.ai) models. You can use all of your available speakers in the studio.
+# [🐸Coqui Studio](https://coqui.ai) API token is required. You can get it from the [account page](https://coqui.ai/account).
+# You should set the `COQUI_STUDIO_TOKEN` environment variable to use the API token.
+
+# If you have a valid API token set you will see the studio speakers as separate models in the list.
+# The name format is coqui_studio/en/<studio_speaker_name>/coqui_studio
+models = TTS().list_models()
+# Init TTS with the target studio speaker
+tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False, gpu=False)
+# Run TTS
+tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH)
+# Run TTS with emotion and speed control
+tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5)
+```
+
+### Command line `tts`
+#### Single Speaker Models
+
+- List provided models:
+
+ ```
+ $ tts --list_models
+ ```
+- Get model info (for both tts_models and vocoder_models):
+ - Query by type/name:
+ The model_info_by_name uses the name as it appears in the --list_models output.
+ ```
+ $ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
+ ```
+ For example:
+
+ ```
+ $ tts --model_info_by_name tts_models/tr/common-voice/glow-tts
+ ```
+ ```
+ $ tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2
+ ```
+ - Query by type/idx:
+ The model_query_idx uses the corresponding idx from --list_models.
+ ```
+ $ tts --model_info_by_idx "<model_type>/<model_query_idx>"
+ ```
+ For example:
+
+ ```
+ $ tts --model_info_by_idx tts_models/3
+ ```
+
+- Run TTS with default models:
+
+ ```
+ $ tts --text "Text for TTS" --out_path output/path/speech.wav
+ ```
+
+- Run a TTS model with its default vocoder model:
+
+ ```
+ $ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
+ ```
+ For example:
+
+ ```
+ $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --out_path output/path/speech.wav
+ ```
+
+- Run with specific TTS and vocoder models from the list:
+
+ ```
+ $ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --vocoder_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
+ ```
+
+ For example:
+
+ ```
+ $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --vocoder_name "vocoder_models/en/ljspeech/univnet" --out_path output/path/speech.wav
+ ```
+
+
+- Run your own TTS model (Using Griffin-Lim Vocoder):
+
+ ```
+ $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
+ ```
+
+- Run your own TTS and Vocoder models:
+ ```
+ $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
+ --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
+ ```
+
+#### Multi-speaker Models
+
+- List the available speakers and choose among them:
+
+ ```
+ $ tts --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
+ ```
+
+- Run the multi-speaker TTS model with the target speaker ID:
+
+ ```
+ $ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
+ ```
+
+- Run your own multi-speaker TTS model:
+
+ ```
+ $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
+ ```
+
+## Directory Structure
+```
+|- notebooks/ (Jupyter Notebooks for model evaluation, parameter selection and data analysis.)
+|- utils/ (common utilities.)
+|- TTS
+ |- bin/ (folder for all the executables.)
+ |- train*.py (train your target model.)
+ |- ...
+ |- tts/ (text to speech models)
+ |- layers/ (model layer definitions)
+ |- models/ (model definitions)
+ |- utils/ (model specific utilities.)
+ |- speaker_encoder/ (Speaker Encoder models.)
+ |- (same)
+ |- vocoder/ (Vocoder models.)
+ |- (same)
+```
diff --git a/TTS/.models.json b/TTS/.models.json
new file mode 100644
index 0000000000000000000000000000000000000000..02b95cf2aa8980358d900c39fb2c38d3b8607093
--- /dev/null
+++ b/TTS/.models.json
@@ -0,0 +1,819 @@
+{
+ "tts_models": {
+ "multilingual":{
+ "multi-dataset":{
+ "your_tts":{
+ "description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--multilingual--multi-dataset--your_tts.zip",
+ "default_vocoder": null,
+ "commit": "e9a1953e",
+ "license": "CC BY-NC-ND 4.0",
+ "contact": "egolge@coqui.ai"
+ }
+ }
+ },
+ "bg": {
+ "cv": {
+ "vits":{
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--bg--cv--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "cs": {
+ "cv": {
+ "vits":{
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--cs--cv--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "da": {
+ "cv": {
+ "vits":{
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--da--cv--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "et": {
+ "cv": {
+ "vits":{
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--et--cv--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "ga": {
+ "cv": {
+ "vits":{
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ga--cv--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "en": {
+ "ek1": {
+ "tacotron2": {
+ "description": "EK1 en-rp tacotron2 by NMStoker",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ek1--tacotron2.zip",
+ "default_vocoder": "vocoder_models/en/ek1/wavegrad",
+ "commit": "c802255",
+ "license": "apache 2.0"
+ }
+ },
+ "ljspeech": {
+ "tacotron2-DDC": {
+ "description": "Tacotron2 with Double Decoder Consistency.",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC.zip",
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
+ "commit": "bae2ad0f",
+ "author": "Eren Gölge @erogol",
+ "license": "apache 2.0",
+ "contact": "egolge@coqui.com"
+ },
+ "tacotron2-DDC_ph": {
+ "description": "Tacotron2 with Double Decoder Consistency with phonemes.",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC_ph.zip",
+ "default_vocoder": "vocoder_models/en/ljspeech/univnet",
+ "commit": "3900448",
+ "author": "Eren Gölge @erogol",
+ "license": "apache 2.0",
+ "contact": "egolge@coqui.com"
+ },
+ "glow-tts": {
+ "description": "",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--glow-tts.zip",
+ "stats_file": null,
+ "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
+ "commit": "",
+ "author": "Eren Gölge @erogol",
+ "license": "MPL",
+ "contact": "egolge@coqui.com"
+ },
+ "speedy-speech": {
+ "description": "Speedy Speech model trained on LJSpeech dataset using the Alignment Network for learning the durations.",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--speedy-speech.zip",
+ "stats_file": null,
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
+ "commit": "4581e3d",
+ "author": "Eren Gölge @erogol",
+ "license": "apache 2.0",
+ "contact": "egolge@coqui.com"
+ },
+ "tacotron2-DCA": {
+ "description": "",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DCA.zip",
+ "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
+ "commit": "",
+ "author": "Eren Gölge @erogol",
+ "license": "MPL",
+ "contact": "egolge@coqui.com"
+ },
+ "vits": {
+ "description": "VITS is an End2End TTS model trained on LJSpeech dataset with phonemes.",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--vits.zip",
+ "default_vocoder": null,
+ "commit": "3900448",
+ "author": "Eren Gölge @erogol",
+ "license": "apache 2.0",
+ "contact": "egolge@coqui.com"
+ },
+ "vits--neon": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--en--ljspeech--vits.zip",
+ "default_vocoder": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause",
+ "contact": null,
+ "commit": null
+ },
+ "fast_pitch": {
+ "description": "FastPitch model trained on LJSpeech using the Aligner Network",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--fast_pitch.zip",
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
+ "commit": "b27b3ba",
+ "author": "Eren Gölge @erogol",
+ "license": "apache 2.0",
+ "contact": "egolge@coqui.com"
+ },
+ "overflow": {
+ "description": "Overflow model trained on LJSpeech",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.0_models/tts_models--en--ljspeech--overflow.zip",
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
+ "commit": "3b1a28f",
+ "author": "Eren Gölge @erogol",
+ "license": "apache 2.0",
+ "contact": "egolge@coqui.ai"
+ },
+ "neural_hmm": {
+ "description": "Neural HMM model trained on LJSpeech",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.11.0_models/tts_models--en--ljspeech--neural_hmm.zip",
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
+ "commit": "3b1a28f",
+ "author": "Shivam Mehta @shivammehta25",
+ "license": "apache 2.0",
+ "contact": "d83ee8fe45e3c0d776d4a865aca21d7c2ac324c4"
+ }
+ },
+ "vctk": {
+ "vits": {
+ "description": "VITS End2End TTS model trained on VCTK dataset with 109 different speakers with EN accent.",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--vits.zip",
+ "default_vocoder": null,
+ "commit": "3900448",
+ "author": "Eren @erogol",
+ "license": "apache 2.0",
+ "contact": "egolge@coqui.ai"
+ },
+ "fast_pitch":{
+ "description": "FastPitch model trained on VCTK dataset.",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--fast_pitch.zip",
+ "default_vocoder": null,
+ "commit": "bdab788d",
+ "author": "Eren @erogol",
+ "license": "CC BY-NC-ND 4.0",
+ "contact": "egolge@coqui.ai"
+ }
+ },
+ "sam": {
+ "tacotron-DDC": {
+ "description": "Tacotron2 with Double Decoder Consistency trained with Accenture's Sam dataset.",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--sam--tacotron-DDC.zip",
+ "default_vocoder": "vocoder_models/en/sam/hifigan_v2",
+ "commit": "bae2ad0f",
+ "author": "Eren Gölge @erogol",
+ "license": "apache 2.0",
+ "contact": "egolge@coqui.com"
+ }
+ },
+ "blizzard2013": {
+ "capacitron-t2-c50": {
+ "description": "Capacitron additions to Tacotron 2 with Capacity at 50 as in https://arxiv.org/pdf/1906.03402.pdf",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--en--blizzard2013--capacitron-t2-c50.zip",
+ "commit": "d6284e7",
+ "default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
+ "author": "Adam Froghyar @a-froghyar",
+ "license": "apache 2.0",
+ "contact": "adamfroghyar@gmail.com"
+ },
+ "capacitron-t2-c150_v2": {
+ "description": "Capacitron additions to Tacotron 2 with Capacity at 150 as in https://arxiv.org/pdf/1906.03402.pdf",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.1_models/tts_models--en--blizzard2013--capacitron-t2-c150_v2.zip",
+ "commit": "a67039d",
+ "default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
+ "author": "Adam Froghyar @a-froghyar",
+ "license": "apache 2.0",
+ "contact": "adamfroghyar@gmail.com"
+ }
+ }
+ },
+ "es": {
+ "mai": {
+ "tacotron2-DDC": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--es--mai--tacotron2-DDC.zip",
+ "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
+ "commit": "",
+ "author": "Eren Gölge @erogol",
+ "license": "MPL",
+ "contact": "egolge@coqui.com"
+ }
+ },
+ "css10":{
+ "vits":{
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--es--css10--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "fr": {
+ "mai": {
+ "tacotron2-DDC": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--fr--mai--tacotron2-DDC.zip",
+ "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
+ "commit": null,
+ "author": "Eren Gölge @erogol",
+ "license": "MPL",
+ "contact": "egolge@coqui.com"
+ }
+ },
+ "css10":{
+ "vits":{
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fr--css10--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "uk":{
+ "mai": {
+ "glow-tts": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--uk--mai--glow-tts.zip",
+ "author":"@robinhad",
+ "commit": "bdab788d",
+ "license": "MIT",
+ "contact": "",
+ "default_vocoder": "vocoder_models/uk/mai/multiband-melgan"
+ },
+ "vits":{
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--uk--mai--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "zh-CN": {
+ "baker": {
+ "tacotron2-DDC-GST": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip",
+ "commit": "unknown",
+ "author": "@kirianguiller",
+ "license": "apache 2.0",
+ "default_vocoder": null
+ }
+ }
+ },
+ "nl": {
+ "mai": {
+ "tacotron2-DDC": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--nl--mai--tacotron2-DDC.zip",
+ "author": "@r-dh",
+ "license": "apache 2.0",
+ "default_vocoder": "vocoder_models/nl/mai/parallel-wavegan",
+ "stats_file": null,
+ "commit": "540d811"
+ }
+ },
+ "css10":{
+ "vits":{
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--nl--css10--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "de": {
+ "thorsten": {
+ "tacotron2-DCA": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--de--thorsten--tacotron2-DCA.zip",
+ "default_vocoder": "vocoder_models/de/thorsten/fullband-melgan",
+ "author": "@thorstenMueller",
+ "license": "apache 2.0",
+ "commit": "unknown"
+ },
+ "vits": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--de--thorsten--vits.zip",
+ "default_vocoder": null,
+ "author": "@thorstenMueller",
+ "license": "apache 2.0",
+ "commit": "unknown"
+ },
+ "tacotron2-DDC": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--de--thorsten--tacotron2-DDC.zip",
+ "default_vocoder": "vocoder_models/de/thorsten/hifigan_v1",
+ "description": "Thorsten-Dec2021-22k-DDC",
+ "author": "@thorstenMueller",
+ "license": "apache 2.0",
+ "commit": "unknown"
+ }
+ },
+ "css10": {
+ "vits-neon":{
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--de--css10--vits.zip",
+ "default_vocoder": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause",
+ "commit": null
+ }
+ }
+ },
+ "ja": {
+ "kokoro": {
+ "tacotron2-DDC": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--ja--kokoro--tacotron2-DDC.zip",
+ "default_vocoder": "vocoder_models/ja/kokoro/hifigan_v1",
+ "description": "Tacotron2 with Double Decoder Consistency trained with Kokoro Speech Dataset.",
+ "author": "@kaiidams",
+ "license": "apache 2.0",
+ "commit": "401fbd89"
+ }
+ }
+ },
+ "tr":{
+ "common-voice": {
+ "glow-tts":{
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--tr--common-voice--glow-tts.zip",
+ "default_vocoder": "vocoder_models/tr/common-voice/hifigan",
+ "license": "MIT",
+ "description": "Turkish GlowTTS model using an unknown speaker from the Common-Voice dataset.",
+ "author": "Fatih Akademi",
+ "commit": null
+ }
+ }
+ },
+ "it": {
+ "mai_female": {
+ "glow-tts":{
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--glow-tts.zip",
+ "default_vocoder": null,
+ "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
+ "author": "@nicolalandro",
+ "license": "apache 2.0",
+ "commit": null
+ },
+ "vits":{
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--vits.zip",
+ "default_vocoder": null,
+ "description": "VITS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
+ "author": "@nicolalandro",
+ "license": "apache 2.0",
+ "commit": null
+ }
+ },
+ "mai_male": {
+ "glow-tts":{
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--glow-tts.zip",
+ "default_vocoder": null,
+ "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
+ "author": "@nicolalandro",
+ "license": "apache 2.0",
+ "commit": null
+ },
+ "vits":{
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--vits.zip",
+ "default_vocoder": null,
+ "description": "VITS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
+ "author": "@nicolalandro",
+ "license": "apache 2.0",
+ "commit": null
+ }
+ }
+ },
+ "ewe": {
+ "openbible": {
+ "vits":{
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--ewe--openbible--vits.zip",
+ "default_vocoder": null,
+ "license": "CC-BY-SA 4.0",
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
+ "author": "@coqui_ai",
+ "commit": "1b22f03"
+ }
+ }
+ },
+ "hau": {
+ "openbible": {
+ "vits":{
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--hau--openbible--vits.zip",
+ "default_vocoder": null,
+ "license": "CC-BY-SA 4.0",
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
+ "author": "@coqui_ai",
+ "commit": "1b22f03"
+ }
+ }
+ },
+ "lin": {
+ "openbible": {
+ "vits":{
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--lin--openbible--vits.zip",
+ "default_vocoder": null,
+ "license": "CC-BY-SA 4.0",
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
+ "author": "@coqui_ai",
+ "commit": "1b22f03"
+ }
+ }
+ },
+ "tw_akuapem": {
+ "openbible": {
+ "vits":{
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_akuapem--openbible--vits.zip",
+ "default_vocoder": null,
+ "license": "CC-BY-SA 4.0",
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
+ "author": "@coqui_ai",
+ "commit": "1b22f03"
+ }
+ }
+ },
+ "tw_asante": {
+ "openbible": {
+ "vits":{
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_asante--openbible--vits.zip",
+ "default_vocoder": null,
+ "license": "CC-BY-SA 4.0",
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
+ "author": "@coqui_ai",
+ "commit": "1b22f03"
+ }
+ }
+ },
+ "yor": {
+ "openbible": {
+ "vits":{
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--yor--openbible--vits.zip",
+ "default_vocoder": null,
+ "license": "CC-BY-SA 4.0",
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
+ "author": "@coqui_ai",
+ "commit": "1b22f03"
+ }
+ }
+ },
+ "hu": {
+ "css10": {
+ "vits": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hu--css10--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "el": {
+ "cv": {
+ "vits": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--el--cv--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "fi": {
+ "css10": {
+ "vits":{
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fi--css10--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "hr": {
+ "cv": {
+ "vits":{
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hr--cv--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "lt": {
+ "cv": {
+ "vits":{
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lt--cv--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "lv": {
+ "cv": {
+ "vits":{
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lv--cv--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "mt": {
+ "cv": {
+ "vits":{
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--mt--cv--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "pl": {
+ "mai_female": {
+ "vits":{
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pl--mai_female--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "pt": {
+ "cv": {
+ "vits":{
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pt--cv--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "ro": {
+ "cv": {
+ "vits":{
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ro--cv--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "sk": {
+ "cv": {
+ "vits":{
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sk--cv--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "sl": {
+ "cv": {
+ "vits":{
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sl--cv--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "sv": {
+ "cv": {
+ "vits":{
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sv--cv--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "author": "@NeonGeckoCom",
+ "license": "bsd-3-clause"
+ }
+ }
+ },
+ "ca": {
+ "custom": {
+ "vits":{
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--ca--custom--vits.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "description": " It is trained from zero with 101460 utterances consisting of 257 speakers, approx 138 hours of speech. We used three datasets;\nFestcat and Google Catalan TTS (both TTS datasets) and also a part of Common Voice 8. It is trained with TTS v0.8.0.\nhttps://github.com/coqui-ai/TTS/discussions/930#discussioncomment-4466345",
+ "author": "@gullabi",
+ "license": "CC-BY-4.0"
+ }
+ }
+ },
+ "fa":{
+ "custom":{
+ "glow-tts": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--fa--custom--glow-tts.zip",
+ "default_vocoder": null,
+ "commit": null,
+ "description": "persian-tts-female-glow_tts model for text to speech purposes. Single-speaker female voice Trained on persian-tts-dataset-famale. \nThis model has no compatible vocoder thus the output quality is not very good. \nDataset: https://www.kaggle.com/datasets/magnoliasis/persian-tts-dataset-famale.",
+ "author": "@karim23657",
+ "license": "CC-BY-4.0"
+ }
+ }
+ }
+ },
+ "vocoder_models": {
+ "universal": {
+ "libri-tts": {
+ "wavegrad": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--wavegrad.zip",
+ "commit": "ea976b0",
+ "author": "Eren Gölge @erogol",
+ "license": "MPL",
+ "contact": "egolge@coqui.com"
+ },
+ "fullband-melgan": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--fullband-melgan.zip",
+ "commit": "4132240",
+ "author": "Eren Gölge @erogol",
+ "license": "MPL",
+ "contact": "egolge@coqui.com"
+ }
+ }
+ },
+ "en": {
+ "ek1": {
+ "wavegrad": {
+ "description": "EK1 en-rp wavegrad by NMStoker",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ek1--wavegrad.zip",
+ "commit": "c802255",
+ "license": "apache 2.0"
+ }
+ },
+ "ljspeech": {
+ "multiband-melgan": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--multiband-melgan.zip",
+ "commit": "ea976b0",
+ "author": "Eren Gölge @erogol",
+ "license": "MPL",
+ "contact": "egolge@coqui.com"
+ },
+ "hifigan_v2": {
+ "description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--hifigan_v2.zip",
+ "commit": "bae2ad0f",
+ "author": "@erogol",
+ "license": "apache 2.0",
+ "contact": "egolge@coqui.ai"
+ },
+ "univnet": {
+ "description": "UnivNet model finetuned on TacotronDDC_ph spectrograms for better compatibility.",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--univnet_v2.zip",
+ "commit": "4581e3d",
+ "author": "Eren @erogol",
+ "license": "apache 2.0",
+ "contact": "egolge@coqui.ai"
+ }
+ },
+ "blizzard2013": {
+ "hifigan_v2": {
+ "description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/vocoder_models--en--blizzard2013--hifigan_v2.zip",
+ "commit": "d6284e7",
+ "author": "Adam Froghyar @a-froghyar",
+ "license": "apache 2.0",
+ "contact": "adamfroghyar@gmail.com"
+ }
+ },
+ "vctk": {
+ "hifigan_v2": {
+ "description": "Finetuned and intended to be used with tts_models/en/vctk/sc-glow-tts",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--vctk--hifigan_v2.zip",
+ "commit": "2f07160",
+ "author": "Edresson Casanova",
+ "license": "apache 2.0",
+ "contact": ""
+ }
+ },
+ "sam": {
+ "hifigan_v2": {
+ "description": "Finetuned and intended to be used with tts_models/en/sam/tacotron_DDC",
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--sam--hifigan_v2.zip",
+ "commit": "2f07160",
+ "author": "Eren Gölge @erogol",
+ "license": "apache 2.0",
+ "contact": "egolge@coqui.ai"
+ }
+ }
+ },
+ "nl": {
+ "mai": {
+ "parallel-wavegan": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--nl--mai--parallel-wavegan.zip",
+ "author": "@r-dh",
+ "license": "apache 2.0",
+ "commit": "unknown"
+ }
+ }
+ },
+ "de": {
+ "thorsten": {
+ "wavegrad": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--wavegrad.zip",
+ "author": "@thorstenMueller",
+ "license": "apache 2.0",
+ "commit": "unknown"
+ },
+ "fullband-melgan": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--fullband-melgan.zip",
+ "author": "@thorstenMueller",
+ "license": "apache 2.0",
+ "commit": "unknown"
+ },
+ "hifigan_v1": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/vocoder_models--de--thorsten--hifigan_v1.zip",
+ "description": "HifiGAN vocoder model for Thorsten Neutral Dec2021 22k Samplerate Tacotron2 DDC model",
+ "author": "@thorstenMueller",
+ "license": "apache 2.0",
+ "commit": "unknown"
+ }
+ }
+ },
+ "ja": {
+ "kokoro": {
+ "hifigan_v1": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--ja--kokoro--hifigan_v1.zip",
+ "description": "HifiGAN model trained for kokoro dataset by @kaiidams",
+ "author": "@kaiidams",
+ "license": "apache 2.0",
+ "commit": "3900448"
+ }
+ }
+ },
+ "uk": {
+ "mai": {
+ "multiband-melgan": {
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--uk--mai--multiband-melgan.zip",
+ "author":"@robinhad",
+ "commit": "bdab788d",
+ "license": "MIT",
+ "contact": ""
+ }
+ }
+ },
+ "tr":{
+ "common-voice": {
+ "hifigan":{
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--tr--common-voice--hifigan.zip",
+ "description": "HifiGAN model using an unknown speaker from the Common-Voice dataset.",
+ "author": "Fatih Akademi",
+ "license": "MIT",
+ "commit": null
+ }
+ }
+ }
+ },
+ "voice_conversion_models":{
+ "multilingual":{
+ "vctk":{
+ "freevc24":{
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.0_models/voice_conversion_models--multilingual--vctk--freevc24.zip",
+ "description": "FreeVC model trained on VCTK dataset from https://github.com/OlaWod/FreeVC",
+ "author": "Jing-Yi Li @OlaWod",
+ "license": "MIT",
+ "commit": null
+ }
+ }
+ }
+ }
+}
diff --git a/TTS/VERSION b/TTS/VERSION
new file mode 100644
index 0000000000000000000000000000000000000000..54d1a4f2a4a7f6afc19897c88a7b73c17ccc54fb
--- /dev/null
+++ b/TTS/VERSION
@@ -0,0 +1 @@
+0.13.0
diff --git a/TTS/__init__.py b/TTS/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..eaf05db1b950d82bfd7e20857e09a0fef45b430a
--- /dev/null
+++ b/TTS/__init__.py
@@ -0,0 +1,6 @@
+import os
+
# The package version is kept in the plain-text VERSION file that sits next to this module.
with open(
    os.path.join(os.path.dirname(__file__), "VERSION"),
    "r",
    encoding="utf-8",
) as f:
    # Strip surrounding whitespace so a trailing newline in the file does not end up in the version.
    version = f.read().strip()

__version__ = version
diff --git a/TTS/api.py b/TTS/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..7376cfa44f1d734c05c98fefddc462424bb6c4ab
--- /dev/null
+++ b/TTS/api.py
@@ -0,0 +1,628 @@
+import http.client
+import json
+import os
+import tempfile
+import urllib.request
+from pathlib import Path
+from typing import Tuple
+
+import numpy as np
+from scipy.io import wavfile
+
+from TTS.utils.audio.numpy_transforms import save_wav
+from TTS.utils.manage import ModelManager
+from TTS.utils.synthesizer import Synthesizer
+
+
class Speaker(object):
    """Expose the entries of a (possibly nested) dict as object attributes.

    Args:
        d (dict): Mapping whose keys become attribute names. Dict values are wrapped in `Speaker`
            recursively. List/tuple values are copied into a list in which every dict item is wrapped
            in `Speaker` and every other item is kept as is.
        is_voice (bool): Stored on the instance as `is_voice`. Nested `Speaker` objects are always
            created with the default. Defaults to False.
    """

    def __init__(self, d, is_voice=False):
        self.is_voice = is_voice
        for k, v in d.items():
            # Dispatch on the value: keys are attribute names and are never sequences, so testing the
            # key (as the code used to) left dicts nested inside lists unconverted.
            if isinstance(v, (list, tuple)):
                setattr(self, k, [Speaker(x) if isinstance(x, dict) else x for x in v])
            else:
                setattr(self, k, Speaker(v) if isinstance(v, dict) else v)

    def __repr__(self):
        return str(self.__dict__)
+
+
class CS_API:
    """🐸Coqui Studio API Wrapper.

    🐸Coqui Studio is the most advanced voice generation platform. You can generate new voices by voice cloning, voice
    interpolation, or our unique prompt to voice technology. It also provides a set of built-in voices with different
    characteristics. You can use these voices to generate new audio files or use them in your applications.
    You can use all the built-in and your own 🐸Coqui Studio speakers with this API with an API token.
    You can signup to 🐸Coqui Studio from https://app.coqui.ai/auth/signup and get an API token from
    https://app.coqui.ai/account. We can either enter the token as an environment variable as
    `export COQUI_STUDIO_TOKEN=` or pass it as `CS_API(api_token=)`.
    Visit https://app.coqui.ai/api for more information.

    Example listing all available speakers:
        >>> from TTS.api import CS_API
        >>> tts = CS_API()
        >>> tts.speakers

    Example listing all emotions:
        >>> from TTS.api import CS_API
        >>> tts = CS_API()
        >>> tts.emotions

    Example with a built-in 🐸 speaker:
        >>> from TTS.api import CS_API
        >>> tts = CS_API()
        >>> wav, sr = tts.tts("Hello world", speaker_name="Claribel Dervla")
        >>> filepath = tts.tts_to_file(text="Hello world!", speaker_name=tts.speakers[0].name, file_path="output.wav")
    """

    def __init__(self, api_token=None):
        """Store the API token and make sure one is available.

        Args:
            api_token (str, optional): Coqui Studio API token. When None, the `COQUI_STUDIO_TOKEN`
                environment variable is used instead. Defaults to None.

        Raises:
            ValueError: If no token is given and the environment variable is unset or empty.
        """
        self.api_token = api_token
        self.api_prefix = "/api/v2"
        self.headers = None
        self._speakers = None
        self._check_token()

    @property
    def speakers(self):
        """All speakers and voices, fetched on first access and cached afterwards."""
        if self._speakers is None:
            self._speakers = self.list_all_speakers()
        return self._speakers

    @property
    def emotions(self):
        """Return a list of available emotions.

        TODO: Get this from the API endpoint.
        """
        return ["Neutral", "Happy", "Sad", "Angry", "Dull"]

    def _check_token(self):
        """Resolve the API token, rebuild the request headers and fail if no token is available.

        Raises:
            ValueError: If neither `api_token` nor `COQUI_STUDIO_TOKEN` provides a token.
        """
        if self.api_token is None:
            self.api_token = os.environ.get("COQUI_STUDIO_TOKEN")
        self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_token}"}
        if not self.api_token:
            raise ValueError(
                "No API token found for 🐸Coqui Studio voices - https://coqui.ai.\n"
                "Visit 🔗https://app.coqui.ai/account to get one.\n"
                "Set it as an environment variable `export COQUI_STUDIO_TOKEN=`\n"
                ""
            )

    def _request(self, method, path, body=None):
        """Send one request to `app.coqui.ai` under `self.api_prefix` and return the raw response body.

        Args:
            method (str): HTTP method, e.g. "GET" or "POST".
            path (str): Endpoint path appended to `self.api_prefix`, e.g. "/speakers".
            body (str, optional): Request body. Defaults to None.

        Returns:
            bytes: The unparsed response body.
        """
        conn = http.client.HTTPSConnection("app.coqui.ai")
        try:
            conn.request(method, f"{self.api_prefix}{path}", body, self.headers)
            return conn.getresponse().read()
        finally:
            # Always release the socket, even when the request or the read fails.
            conn.close()

    def list_all_speakers(self):
        """Return both built-in Coqui Studio speakers and custom voices created by the user."""
        return self.list_speakers() + self.list_voices()

    def list_speakers(self):
        """List built-in Coqui Studio speakers."""
        self._check_token()
        data = self._request("GET", "/speakers")
        return [Speaker(s) for s in json.loads(data)["result"]]

    def list_voices(self):
        """List custom voices created by the user."""
        self._check_token()
        data = self._request("GET", "/voices")
        return [Speaker(s, True) for s in json.loads(data)["result"]]

    def list_speakers_as_tts_models(self):
        """List speakers in ModelManager format."""
        return [f"coqui_studio/en/{speaker.name}/coqui_studio" for speaker in self.speakers]

    def name_to_speaker(self, name):
        """Return the first speaker whose `name` equals `name`.

        Raises:
            ValueError: If no speaker has that name.
        """
        for speaker in self.speakers:
            if speaker.name == name:
                return speaker
        raise ValueError(f"Speaker {name} not found.")

    def id_to_speaker(self, speaker_id):
        """Return the first speaker whose `id` equals `speaker_id`.

        Raises:
            ValueError: If no speaker has that id.
        """
        for speaker in self.speakers:
            if speaker.id == speaker_id:
                return speaker
        raise ValueError(f"Speaker {speaker_id} not found.")

    @staticmethod
    def url_to_np(url):
        """Download the wav file at `url` and return it as `(data, rate)`.

        NOTE(review): the file fetched by `urlretrieve` is not removed here.
        """
        tmp_file, _ = urllib.request.urlretrieve(url)
        rate, data = wavfile.read(tmp_file)
        return data, rate

    @staticmethod
    def _create_payload(text, speaker, emotion, speed):
        """Build the request payload for the samples endpoint.

        The speaker id is sent as `voice_id` for user-created voices (`speaker.is_voice`) and as
        `speaker_id` otherwise.
        """
        payload = {}
        if speaker.is_voice:
            payload["voice_id"] = speaker.id
        else:
            payload["speaker_id"] = speaker.id
        payload.update(
            {
                "emotion": emotion,
                "name": speaker.name,
                "text": text,
                "speed": speed,
            }
        )
        return payload

    def tts(
        self,
        text: str,
        speaker_name: str = None,
        speaker_id=None,
        emotion="Neutral",
        speed=1.0,
        language=None,  # pylint: disable=unused-argument
    ) -> Tuple[np.ndarray, int]:
        """Synthesize speech from text.

        Args:
            text (str): Text to synthesize.
            speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
                voices (user generated speakers) with `list_voices()`.
            speaker_id (str): Speaker ID. If None, the speaker name is used.
            emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull".
            speed (float): Speed of the speech. 1.0 is normal speed.
            language (str): Language of the text. Currently unused.

        Returns:
            Tuple[np.ndarray, int]: The waveform and its sample rate.

        Raises:
            ValueError: If neither `speaker_name` nor `speaker_id` is given, if the speaker is unknown, or
                if the API response carries no `audio_url`.
        """
        self._check_token()
        if speaker_name is None and speaker_id is None:
            raise ValueError(" [!] Please provide either a `speaker_name` or a `speaker_id`.")
        if speaker_id is None:
            speaker = self.name_to_speaker(speaker_name)
        else:
            speaker = self.id_to_speaker(speaker_id)
        payload = self._create_payload(text, speaker, emotion, speed)
        data = self._request("POST", "/samples", json.dumps(payload))
        try:
            wav, sr = self.url_to_np(json.loads(data)["audio_url"])
        except KeyError as e:
            raise ValueError(f" [!] 🐸 API returned error: {data}") from e
        return wav, sr

    def tts_to_file(
        self,
        text: str,
        speaker_name: str,
        speaker_id=None,
        emotion="Neutral",
        speed=1.0,
        language=None,
        file_path: str = None,
    ) -> str:
        """Synthesize speech from text and save it to a file.

        Args:
            text (str): Text to synthesize.
            speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
                voices (user generated speakers) with `list_voices()`.
            speaker_id (str): Speaker ID. If None, the speaker name is used.
            emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull".
            speed (float): Speed of the speech. 1.0 is normal speed.
            language (str): Language of the text. Currently unused.
            file_path (str): Path to save the file. If None, a temporary file is created.

        Returns:
            str: Path of the written wav file.
        """
        # Synthesize first so that a failed request does not leave an empty temporary file behind.
        wav, sr = self.tts(text, speaker_name, speaker_id, emotion, speed, language)
        if file_path is None:
            # mkstemp creates the file atomically; the deprecated mktemp only returned a name that
            # another process could claim before it was written.
            fd, file_path = tempfile.mkstemp(suffix=".wav")
            os.close(fd)
        wavfile.write(file_path, sr, wav)
        return file_path
+
+
class TTS:
    """TODO: Add voice conversion and Capacitron support."""

    def __init__(
        self,
        model_name: str = None,
        model_path: str = None,
        config_path: str = None,
        vocoder_path: str = None,
        vocoder_config_path: str = None,
        progress_bar: bool = True,
        gpu=False,
    ):
        """🐸TTS python interface that allows to load and use the released models.

        Example with a multi-speaker model:
            >>> from TTS.api import TTS
            >>> tts = TTS(TTS.list_models()[0])
            >>> wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0])
            >>> tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav")

        Example with a single-speaker model:
            >>> tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False)
            >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav")

        Example loading a model from a path:
            >>> tts = TTS(model_path="/path/to/checkpoint_100000.pth", config_path="/path/to/config.json", progress_bar=False, gpu=False)
            >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav")

        Example voice cloning with YourTTS in English, French and Portuguese:
            >>> tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True)
            >>> tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="thisisit.wav")
            >>> tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr", file_path="thisisit.wav")
            >>> tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt", file_path="thisisit.wav")

        Args:
            model_name (str, optional): Model name to load. You can list models by ```tts.models```. Defaults to None.
            model_path (str, optional): Path to the model checkpoint. Defaults to None.
            config_path (str, optional): Path to the model config. Defaults to None.
            vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
            vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None.
            progress_bar (bool, optional): Whether to print a progress bar while downloading a model. Defaults to True.
            gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
        """
        self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar, verbose=False)

        self.synthesizer = None
        self.voice_converter = None
        self.csapi = None
        self.model_name = None

        if model_name:
            self.load_tts_model_by_name(model_name, gpu)

        if model_path:
            self.load_tts_model_by_path(
                model_path, config_path, vocoder_path=vocoder_path, vocoder_config=vocoder_config_path, gpu=gpu
            )

    @property
    def models(self):
        """Model names as listed by the model manager."""
        return self.manager.list_tts_models()

    @property
    def is_multi_speaker(self):
        """True when the loaded model has a speaker manager with more than one speaker."""
        if hasattr(self.synthesizer.tts_model, "speaker_manager") and self.synthesizer.tts_model.speaker_manager:
            return self.synthesizer.tts_model.speaker_manager.num_speakers > 1
        return False

    @property
    def is_coqui_studio(self):
        """True when the model was loaded by a name that contains "coqui_studio"."""
        # `model_name` stays None when the model is loaded from a path; a substring test on None
        # would raise TypeError and break every `tts()` call for such models.
        if self.model_name is None:
            return False
        return "coqui_studio" in self.model_name

    @property
    def is_multi_lingual(self):
        """True when the loaded model has a language manager with more than one language."""
        if hasattr(self.synthesizer.tts_model, "language_manager") and self.synthesizer.tts_model.language_manager:
            return self.synthesizer.tts_model.language_manager.num_languages > 1
        return False

    @property
    def speakers(self):
        """Speaker names of a multi-speaker model, or None for other models."""
        if not self.is_multi_speaker:
            return None
        return self.synthesizer.tts_model.speaker_manager.speaker_names

    @property
    def languages(self):
        """Language names of a multi-lingual model, or None for other models."""
        if not self.is_multi_lingual:
            return None
        return self.synthesizer.tts_model.language_manager.language_names

    @staticmethod
    def get_models_file_path():
        """Return the path of the `.models.json` file located next to this module."""
        return Path(__file__).parent / ".models.json"

    @staticmethod
    def list_models():
        """List the released models followed by the Coqui Studio speakers.

        When no Coqui Studio token is available the error is printed and only the released models
        are returned.
        """
        try:
            csapi = CS_API()
            models = csapi.list_speakers_as_tts_models()
        except ValueError as e:
            print(e)
            models = []
        manager = ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False)
        return manager.list_tts_models() + models

    def download_model_by_name(self, model_name: str):
        """Download a model and, when it declares one, its default vocoder.

        Returns:
            tuple: `(model_path, config_path, vocoder_path, vocoder_config_path)`; the vocoder entries
            are None when the model has no `default_vocoder`.
        """
        model_path, config_path, model_item = self.manager.download_model(model_name)
        if model_item.get("default_vocoder") is None:
            return model_path, config_path, None, None
        vocoder_path, vocoder_config_path, _ = self.manager.download_model(model_item["default_vocoder"])
        return model_path, config_path, vocoder_path, vocoder_config_path

    def load_vc_model_by_name(self, model_name: str, gpu: bool = False):
        """Load one of the voice conversion models by name.

        Args:
            model_name (str): Model name to load. You can list models by ```tts.models```.
            gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
        """
        model_path, config_path, _, _ = self.download_model_by_name(model_name)
        self.voice_converter = Synthesizer(vc_checkpoint=model_path, vc_config=config_path, use_cuda=gpu)

    def load_tts_model_by_name(self, model_name: str, gpu: bool = False):
        """Load one of 🐸TTS models by name.

        Args:
            model_name (str): Model name to load. You can list models by ```tts.models```.
            gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.

        TODO: Add tests
        """
        self.synthesizer = None
        self.csapi = None
        self.model_name = model_name

        if "coqui_studio" in model_name:
            self.csapi = CS_API()
        else:
            model_path, config_path, vocoder_path, vocoder_config_path = self.download_model_by_name(model_name)

            # init synthesizer
            # None values are fetched from the model
            self.synthesizer = Synthesizer(
                tts_checkpoint=model_path,
                tts_config_path=config_path,
                tts_speakers_file=None,
                tts_languages_file=None,
                vocoder_checkpoint=vocoder_path,
                vocoder_config=vocoder_config_path,
                encoder_checkpoint=None,
                encoder_config=None,
                use_cuda=gpu,
            )

    def load_tts_model_by_path(
        self, model_path: str, config_path: str, vocoder_path: str = None, vocoder_config: str = None, gpu: bool = False
    ):
        """Load a model from a path.

        Args:
            model_path (str): Path to the model checkpoint.
            config_path (str): Path to the model config.
            vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
            vocoder_config (str, optional): Path to the vocoder config. Defaults to None.
            gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
        """

        self.synthesizer = Synthesizer(
            tts_checkpoint=model_path,
            tts_config_path=config_path,
            tts_speakers_file=None,
            tts_languages_file=None,
            vocoder_checkpoint=vocoder_path,
            vocoder_config=vocoder_config,
            encoder_checkpoint=None,
            encoder_config=None,
            use_cuda=gpu,
        )

    def _check_arguments(
        self,
        speaker: str = None,
        language: str = None,
        speaker_wav: str = None,
        emotion: str = None,
        speed: float = None,
    ) -> None:
        """Check if the arguments are valid for the model.

        Raises:
            ValueError: If an argument does not fit the loaded model.
        """
        if not self.is_coqui_studio:
            # check for the coqui tts models
            if self.is_multi_speaker and (speaker is None and speaker_wav is None):
                raise ValueError("Model is multi-speaker but no `speaker` is provided.")
            if self.is_multi_lingual and language is None:
                raise ValueError("Model is multi-lingual but no `language` is provided.")
            if not self.is_multi_speaker and speaker is not None:
                raise ValueError("Model is not multi-speaker but `speaker` is provided.")
            if not self.is_multi_lingual and language is not None:
                raise ValueError("Model is not multi-lingual but `language` is provided.")
            # Either argument alone is already unsupported here; requiring both (`and`) let a single
            # stray `emotion` or `speed` be ignored silently.
            if emotion is not None or speed is not None:
                raise ValueError("Emotion and speed can only be used with Coqui Studio models.")
        else:
            # None means "use the default", so validate it as "Neutral".
            if emotion is None:
                emotion = "Neutral"
            # check for the studio models
            if speaker_wav is not None:
                raise ValueError("Coqui Studio models do not support `speaker_wav` argument.")
            if speaker is not None:
                raise ValueError("Coqui Studio models do not support `speaker` argument.")
            if language is not None and language != "en":
                raise ValueError("Coqui Studio models currently support only `language=en` argument.")
            if emotion not in ["Neutral", "Happy", "Sad", "Angry", "Dull"]:
                raise ValueError(f"Emotion - `{emotion}` - must be one of `Neutral`, `Happy`, `Sad`, `Angry`, `Dull`.")

    def tts_coqui_studio(
        self,
        text: str,
        speaker_name: str = None,
        language: str = None,
        emotion: str = "Neutral",
        speed: float = 1.0,
        file_path: str = None,
    ):
        """Convert text to speech using Coqui Studio models. Use `CS_API` class if you are only interested in the API.

        Args:
            text (str):
                Input text to synthesize.
            speaker_name (str, optional):
                Ignored; the speaker is always taken from the third "/"-separated field of the loaded
                model name. Defaults to None.
            language (str, optional):
                Language code. Coqui Studio currently supports only English. Defaults to None.
            emotion (str, optional):
                Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". None is treated as
                "Neutral". Defaults to "Neutral".
            speed (float, optional):
                Speed of the speech. None is treated as 1.0. Defaults to 1.0.
            file_path (str, optional):
                Path to save the output file. When None it returns the `np.ndarray` of waveform. Defaults to None.

        Returns:
            The waveform when `file_path` is None, otherwise the path of the written file.
        """
        speaker_name = self.model_name.split("/")[2]
        # `tts()` forwards None for arguments the caller left out; send the documented defaults
        # to the API instead of null values.
        if emotion is None:
            emotion = "Neutral"
        if speed is None:
            speed = 1.0
        if file_path is not None:
            return self.csapi.tts_to_file(
                text=text,
                speaker_name=speaker_name,
                language=language,
                speed=speed,
                emotion=emotion,
                file_path=file_path,
            )
        return self.csapi.tts(text=text, speaker_name=speaker_name, language=language, speed=speed, emotion=emotion)[0]

    def tts(
        self,
        text: str,
        speaker: str = None,
        language: str = None,
        speaker_wav: str = None,
        emotion: str = None,
        speed: float = None,
    ):
        """Convert text to speech.

        Args:
            text (str):
                Input text to synthesize.
            speaker (str, optional):
                Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
                `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
            language (str, optional):
                Language code for multi-lingual models. You can check whether loaded model is multi-lingual
                `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
            speaker_wav (str, optional):
                Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
                Defaults to None.
            emotion (str, optional):
                Emotion to use for 🐸Coqui Studio models. If None, Studio models use "Neutral". Defaults to None.
            speed (float, optional):
                Speed factor to use for 🐸Coqui Studio models, between 0 and 2.0. If None, Studio models use 1.0.
                Defaults to None.
        """
        self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed)
        if self.csapi is not None:
            return self.tts_coqui_studio(
                text=text, speaker_name=speaker, language=language, emotion=emotion, speed=speed
            )

        wav = self.synthesizer.tts(
            text=text,
            speaker_name=speaker,
            language_name=language,
            speaker_wav=speaker_wav,
            reference_wav=None,
            style_wav=None,
            style_text=None,
            reference_speaker_name=None,
        )
        return wav

    def tts_to_file(
        self,
        text: str,
        speaker: str = None,
        language: str = None,
        speaker_wav: str = None,
        emotion: str = "Neutral",
        speed: float = 1.0,
        file_path: str = "output.wav",
    ):
        """Convert text to speech and save it to a file.

        Args:
            text (str):
                Input text to synthesize.
            speaker (str, optional):
                Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
                `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
            language (str, optional):
                Language code for multi-lingual models. You can check whether loaded model is multi-lingual
                `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
            speaker_wav (str, optional):
                Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
                Defaults to None.
            emotion (str, optional):
                Emotion to use for 🐸Coqui Studio models. Ignored by other models. Defaults to "Neutral".
            speed (float, optional):
                Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Ignored by other models.
                Defaults to 1.0.
            file_path (str, optional):
                Output file path. Defaults to "output.wav".

        Returns:
            str: Path of the written file.
        """
        self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav)

        if self.csapi is not None:
            return self.tts_coqui_studio(
                text=text, speaker_name=speaker, language=language, emotion=emotion, speed=speed, file_path=file_path
            )
        wav = self.tts(text=text, speaker=speaker, language=language, speaker_wav=speaker_wav)
        self.synthesizer.save_wav(wav=wav, path=file_path)
        return file_path

    def voice_conversion(
        self,
        sourve_wav: str,
        target_wav: str,
    ):
        """Voice conversion with FreeVC. Convert source wav to target speaker.

        Args:
            sourve_wav (str):
                Path to the source wav file. The misspelled name is kept so that existing keyword
                callers keep working.
            target_wav (str):
                Path to the target wav file.
        """
        # Prefer the dedicated voice conversion model when one is loaded; otherwise fall back to the
        # TTS synthesizer as before.
        converter = self.voice_converter if self.voice_converter is not None else self.synthesizer
        wav = converter.voice_conversion(source_wav=sourve_wav, target_wav=target_wav)
        return wav

    def tts_with_vc(self, text: str, language: str = None, speaker_wav: str = None):
        """Convert text to speech with voice conversion.

        It combines tts with voice conversion to fake voice cloning.

        - Convert text to speech with tts.
        - Convert the output wav to target speaker with voice conversion.

        Args:
            text (str):
                Input text to synthesize.
            language (str, optional):
                Language code for multi-lingual models. You can check whether loaded model is multi-lingual
                `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
            speaker_wav (str, optional):
                Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
                Defaults to None.
        """
        # Reserve a temp file name and close the handle right away, so the file can be reopened for
        # writing on every platform.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
            tmp_path = fp.name
        try:
            # Lazy code... save it to a temp file to resample it while reading it for VC
            self.tts_to_file(text=text, speaker=None, language=language, file_path=tmp_path)
            if self.voice_converter is None:
                self.load_vc_model_by_name("voice_conversion_models/multilingual/vctk/freevc24")
            wav = self.voice_converter.voice_conversion(source_wav=tmp_path, target_wav=speaker_wav)
        finally:
            # `delete=False` leaves the clean-up to us; without it each call leaks one wav file.
            os.remove(tmp_path)
        return wav

    def tts_with_vc_to_file(
        self, text: str, language: str = None, speaker_wav: str = None, file_path: str = "output.wav"
    ):
        """Convert text to speech with voice conversion and save to file.

        Check `tts_with_vc` for more details.

        Args:
            text (str):
                Input text to synthesize.
            language (str, optional):
                Language code for multi-lingual models. You can check whether loaded model is multi-lingual
                `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
            speaker_wav (str, optional):
                Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
                Defaults to None.
            file_path (str, optional):
                Output file path. Defaults to "output.wav".
        """
        wav = self.tts_with_vc(text=text, language=language, speaker_wav=speaker_wav)
        save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)
diff --git a/TTS/bin/__init__.py b/TTS/bin/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/TTS/bin/collect_env_info.py b/TTS/bin/collect_env_info.py
new file mode 100644
index 0000000000000000000000000000000000000000..662fcd02ece0fad387b6bfc4bad9316c7e2a0bad
--- /dev/null
+++ b/TTS/bin/collect_env_info.py
@@ -0,0 +1,48 @@
+"""Get detailed info about the working environment."""
+import os
+import platform
+import sys
+
+import numpy
+import torch
+
+sys.path += [os.path.abspath(".."), os.path.abspath(".")]
+import json
+
+import TTS
+
+
+def system_info():
+    """Collect basic facts about the host platform and the running Python interpreter.
+
+    Returns:
+        dict: Values reported by the `platform` module under the keys "OS", "architecture",
+            "version", "processor" and "python".
+    """
+    info = {}
+    info["OS"] = platform.system()
+    info["architecture"] = platform.architecture()
+    info["version"] = platform.version()
+    info["processor"] = platform.processor()
+    info["python"] = platform.python_version()
+    return info
+
+
+def cuda_info():
+    """Describe the CUDA setup as seen by PyTorch.
+
+    Returns:
+        dict: "GPU" holds the device name of every CUDA device PyTorch counts, "available" is
+            `torch.cuda.is_available()` and "version" is `torch.version.cuda`.
+    """
+    device_names = []
+    for device_index in range(torch.cuda.device_count()):
+        device_names.append(torch.cuda.get_device_name(device_index))
+    is_available = torch.cuda.is_available()
+    return {"GPU": device_names, "available": is_available, "version": torch.version.cuda}
+
+
+def package_info():
+    """Report the versions of the main packages this project relies on.
+
+    Returns:
+        dict: The numpy, PyTorch and TTS version strings plus PyTorch's `torch.version.debug` value.
+    """
+    versions = {}
+    versions["numpy"] = numpy.__version__
+    versions["PyTorch_version"] = torch.__version__
+    versions["PyTorch_debug"] = torch.version.debug
+    versions["TTS"] = TTS.__version__
+    return versions
+
+
+def main():
+    """Print the system, CUDA and package details as indented JSON with sorted keys."""
+    report = {}
+    report["System"] = system_info()
+    report["CUDA"] = cuda_info()
+    report["Packages"] = package_info()
+    print(json.dumps(report, indent=4, sort_keys=True))
+
+
+if __name__ == "__main__":
+ main()
diff --git a/TTS/bin/compute_attention_masks.py b/TTS/bin/compute_attention_masks.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ab520be7d9f41ecf4f124446400b5e1b597ae8b
--- /dev/null
+++ b/TTS/bin/compute_attention_masks.py
@@ -0,0 +1,165 @@
+import argparse
+import importlib
+import os
+from argparse import RawTextHelpFormatter
+
+import numpy as np
+import torch
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+
+from TTS.config import load_config
+from TTS.tts.datasets.TTSDataset import TTSDataset
+from TTS.tts.models import setup_model
+from TTS.tts.utils.text.characters import make_symbols, phonemes, symbols
+from TTS.utils.audio import AudioProcessor
+from TTS.utils.io import load_checkpoint
+
+if __name__ == "__main__":
+
+    def _str_to_bool(value):
+        """Parse a command line boolean such as "True"/"False".
+
+        `type=bool` must not be used with argparse: `bool("False")` is True, so every non-empty
+        value would enable the flag.
+        """
+        normalized = str(value).strip().lower()
+        if normalized in ("true", "t", "yes", "y", "1"):
+            return True
+        if normalized in ("false", "f", "no", "n", "0", ""):
+            return False
+        raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}.")
+
+    # pylint: disable=bad-option-value
+    parser = argparse.ArgumentParser(
+        description="""Extract attention masks from trained Tacotron/Tacotron2 models.
+These masks can be used for different purposes including training a TTS model with a Duration Predictor.\n\n"""
+        """Each attention mask is written next to the input wav file with the "_attn.npy" suffix.
+(e.g. path/bla.wav (wav file) --> path/bla_attn.npy (attention mask))\n"""
+        """
+Example run:
+    CUDA_VISIBLE_DEVICES="0" python TTS/bin/compute_attention_masks.py
+        --model_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/checkpoint_200000.pth
+        --config_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/config.json
+        --dataset_metafile metadata.csv
+        --data_path /root/LJSpeech-1.1/
+        --batch_size 32
+        --dataset ljspeech
+        --use_cuda True
+""",
+        formatter_class=RawTextHelpFormatter,
+    )
+    parser.add_argument("--model_path", type=str, required=True, help="Path to Tacotron/Tacotron2 model file ")
+    parser.add_argument(
+        "--config_path",
+        type=str,
+        required=True,
+        help="Path to Tacotron/Tacotron2 config file.",
+    )
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        default="",
+        required=True,
+        help="Target dataset processor name from TTS.tts.dataset.preprocess.",
+    )
+
+    parser.add_argument(
+        "--dataset_metafile",
+        type=str,
+        default="",
+        required=True,
+        help="Dataset metafile inclusing file paths with transcripts.",
+    )
+    parser.add_argument("--data_path", type=str, default="", help="Defines the data path. It overwrites config.json.")
+    # parsed with `_str_to_bool` so that `--use_cuda False` really disables CUDA
+    parser.add_argument("--use_cuda", type=_str_to_bool, default=False, help="enable/disable cuda.")
+
+    parser.add_argument(
+        "--batch_size", default=16, type=int, help="Batch size for the model. Use batch_size=1 if you have no CUDA."
+    )
+    args = parser.parse_args()
+
+    C = load_config(args.config_path)
+    ap = AudioProcessor(**C.audio)
+
+    # if the vocabulary was passed, replace the default
+    if "characters" in C.keys():
+        symbols, phonemes = make_symbols(**C.characters)
+
+    # load the model
+    num_chars = len(phonemes) if C.use_phonemes else len(symbols)
+    # TODO: handle multi-speaker
+    model = setup_model(C)
+    model, _ = load_checkpoint(model, args.model_path, args.use_cuda, True)
+
+    # data loader: the formatter named by `--dataset` is looked up in TTS.tts.datasets.formatters
+    preprocessor = importlib.import_module("TTS.tts.datasets.formatters")
+    preprocessor = getattr(preprocessor, args.dataset)
+    meta_data = preprocessor(args.data_path, args.dataset_metafile)
+    dataset = TTSDataset(
+        model.decoder.r,
+        C.text_cleaner,
+        compute_linear_spec=False,
+        ap=ap,
+        meta_data=meta_data,
+        characters=C.characters if "characters" in C.keys() else None,
+        add_blank=C["add_blank"] if "add_blank" in C.keys() else False,
+        use_phonemes=C.use_phonemes,
+        phoneme_cache_path=C.phoneme_cache_path,
+        phoneme_language=C.phoneme_language,
+        enable_eos_bos=C.enable_eos_bos_chars,
+    )
+
+    dataset.sort_and_filter_items(C.get("sort_by_audio_len", default=False))
+    loader = DataLoader(
+        dataset,
+        batch_size=args.batch_size,
+        num_workers=4,
+        collate_fn=dataset.collate_fn,
+        shuffle=False,
+        drop_last=False,
+    )
+
+    # compute attentions
+    file_paths = []
+    with torch.no_grad():
+        for data in tqdm(loader):
+            # setup input data (only the batch fields used below are unpacked)
+            text_input = data[0]
+            text_lengths = data[1]
+            mel_input = data[4]
+            mel_lengths = data[5]
+            item_idxs = data[7]
+
+            # dispatch data to GPU
+            if args.use_cuda:
+                text_input = text_input.cuda()
+                text_lengths = text_lengths.cuda()
+                mel_input = mel_input.cuda()
+                mel_lengths = mel_lengths.cuda()
+
+            model_outputs = model.forward(text_input, text_lengths, mel_input)
+
+            alignments = model_outputs["alignments"].detach()
+            for idx, alignment in enumerate(alignments):
+                item_idx = item_idxs[idx]
+                # interpolate if r > 1
+                alignment = (
+                    torch.nn.functional.interpolate(
+                        alignment.transpose(0, 1).unsqueeze(0),
+                        size=None,
+                        scale_factor=model.decoder.r,
+                        mode="nearest",
+                        align_corners=None,
+                        recompute_scale_factor=None,
+                    )
+                    .squeeze(0)
+                    .transpose(0, 1)
+                )
+                # remove paddings
+                alignment = alignment[: mel_lengths[idx], : text_lengths[idx]].cpu().numpy()
+                # set file paths: swap only the file name; a plain str.replace on the whole path
+                # would also rewrite a directory component that happens to match the file name
+                wav_file_name = os.path.basename(item_idx)
+                align_file_name = os.path.splitext(wav_file_name)[0] + "_attn.npy"
+                file_path = os.path.join(os.path.dirname(item_idx), align_file_name)
+                # save output
+                wav_file_abs_path = os.path.abspath(item_idx)
+                file_abs_path = os.path.abspath(file_path)
+                file_paths.append([wav_file_abs_path, file_abs_path])
+                np.save(file_path, alignment)
+
+    # output metafile: one "<wav path>|<attention mask path>" line per processed item
+    metafile = os.path.join(args.data_path, "metadata_attn_mask.txt")
+
+    with open(metafile, "w", encoding="utf-8") as f:
+        for p in file_paths:
+            f.write(f"{p[0]}|{p[1]}\n")
+    print(f" >> Metafile created: {metafile}")
diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e0932cc767fa600ff4af34e2df3917e31e2eb59
--- /dev/null
+++ b/TTS/bin/compute_embeddings.py
@@ -0,0 +1,172 @@
+import argparse
+import os
+from argparse import RawTextHelpFormatter
+
+import torch
+from tqdm import tqdm
+
+from TTS.config import load_config
+from TTS.config.shared_configs import BaseDatasetConfig
+from TTS.tts.datasets import load_tts_samples
+from TTS.tts.utils.managers import save_file
+from TTS.tts.utils.speakers import SpeakerManager
+
+
+def compute_embeddings(
+    model_path,
+    config_path,
+    output_path,
+    old_spakers_file=None,
+    config_dataset_path=None,
+    formatter_name=None,
+    dataset_name=None,
+    dataset_path=None,
+    meta_file_train=None,
+    meta_file_val=None,
+    disable_cuda=False,
+    no_eval=False,
+):
+    """Compute an embedding for every sample of a dataset and save all of them in one mapping file.
+
+    Args:
+        model_path: Encoder checkpoint, handed to `SpeakerManager` as `encoder_model_path`.
+        config_path: Encoder config, handed to `SpeakerManager` as `encoder_config_path`.
+        output_path: Target file. When it is an existing directory, "speakers.pth" inside it is used.
+        old_spakers_file: Optional previous embedding file. Samples whose key is already listed in
+            the manager's `clip_ids` reuse the stored embedding instead of being recomputed.
+        config_dataset_path: Dataset config file. When given, its `datasets` entry defines the samples
+            and the remaining dataset arguments are ignored.
+        formatter_name: Formatter of the dataset (used without `config_dataset_path`).
+        dataset_name: Name of the dataset (used without `config_dataset_path`).
+        dataset_path: Path of the dataset (used without `config_dataset_path`).
+        meta_file_train: Optional train meta file (used without `config_dataset_path`).
+        meta_file_val: Optional evaluation meta file (used without `config_dataset_path`).
+        disable_cuda: Do not use CUDA even when it is available.
+        no_eval: Load the samples without an evaluation split.
+    """
+    use_cuda = torch.cuda.is_available() and not disable_cuda
+    eval_split = not no_eval
+
+    if config_dataset_path is None:
+        # no dataset config file: describe the dataset through the individual arguments
+        c_dataset = BaseDatasetConfig()
+        c_dataset.formatter = formatter_name
+        c_dataset.dataset_name = dataset_name
+        c_dataset.path = dataset_path
+        if meta_file_train is not None:
+            c_dataset.meta_file_train = meta_file_train
+        if meta_file_val is not None:
+            c_dataset.meta_file_val = meta_file_val
+        meta_data_train, meta_data_eval = load_tts_samples(c_dataset, eval_split=eval_split)
+    else:
+        c_dataset = load_config(config_dataset_path)
+        meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=eval_split)
+
+    # the eval partition is None when it was not requested
+    samples = meta_data_train if meta_data_eval is None else meta_data_train + meta_data_eval
+
+    encoder_manager = SpeakerManager(
+        encoder_model_path=model_path,
+        encoder_config_path=config_path,
+        d_vectors_file_path=old_spakers_file,
+        use_cuda=use_cuda,
+    )
+
+    class_name_key = encoder_manager.encoder_config.class_name_key
+
+    # compute (or reuse) one embedding per sample, keyed by the sample's unique audio name
+    speaker_mapping = {}
+    for sample in tqdm(samples):
+        class_name = sample[class_name_key]
+        audio_file = sample["audio_file"]
+        embedding_key = sample["audio_unique_name"]
+
+        reuse_old = old_spakers_file is not None and embedding_key in encoder_manager.clip_ids
+        if reuse_old:
+            embedd = encoder_manager.get_embedding_by_clip(embedding_key)
+        else:
+            embedd = encoder_manager.compute_embedding_from_clip(audio_file)
+
+        speaker_mapping[embedding_key] = {"name": class_name, "embedding": embedd}
+
+    # nothing is written when no sample produced an entry
+    if not speaker_mapping:
+        return
+
+    mapping_file_path = output_path
+    if os.path.isdir(output_path):
+        mapping_file_path = os.path.join(output_path, "speakers.pth")
+
+    parent_dir = os.path.dirname(mapping_file_path)
+    if parent_dir != "":
+        os.makedirs(parent_dir, exist_ok=True)
+
+    save_file(speaker_mapping, mapping_file_path)
+    print("Speaker embeddings saved at:", mapping_file_path)
+
+
+if __name__ == "__main__":
+
+    def _str_to_bool(value):
+        """Parse a command line boolean such as "True"/"False".
+
+        `type=bool` must not be used with argparse: `bool("False")` is True, so
+        `--disable_cuda False` would disable CUDA and `--no_eval False` would skip the eval split.
+        """
+        normalized = str(value).strip().lower()
+        if normalized in ("true", "t", "yes", "y", "1"):
+            return True
+        if normalized in ("false", "f", "no", "n", "0", ""):
+            return False
+        raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}.")
+
+    parser = argparse.ArgumentParser(
+        description="""Compute embedding vectors for each audio file in a dataset and store them keyed by `{dataset_name}#{file_path}` in a .pth file\n\n"""
+        """
+        Example runs:
+        python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --config_dataset_path dataset_config.json
+
+        python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --formatter_name coqui --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --meta_file_train /path/to/vctk/metafile_train.csv --meta_file_val /path/to/vctk/metafile_eval.csv
+        """,
+        formatter_class=RawTextHelpFormatter,
+    )
+    parser.add_argument(
+        "--model_path",
+        type=str,
+        help="Path to model checkpoint file. It defaults to the released speaker encoder.",
+        default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
+    )
+    parser.add_argument(
+        "--config_path",
+        type=str,
+        help="Path to model config file. It defaults to the released speaker encoder config.",
+        default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
+    )
+    parser.add_argument(
+        "--config_dataset_path",
+        type=str,
+        help="Path to dataset config file. You either need to provide this or `formatter_name`, `dataset_name` and `dataset_path` arguments.",
+        default=None,
+    )
+    parser.add_argument("--output_path", type=str, help="Path for output `pth` or `json` file.", default="speakers.pth")
+    parser.add_argument(
+        "--old_file", type=str, help="Previous embedding file to only compute new audios.", default=None
+    )
+    # both flags keep taking an explicit value (e.g. `--disable_cuda True`) but now honour "False"
+    parser.add_argument("--disable_cuda", type=_str_to_bool, help="Flag to disable cuda.", default=False)
+    parser.add_argument("--no_eval", type=_str_to_bool, help="Do not compute eval?. Default False", default=False)
+    parser.add_argument(
+        "--formatter_name",
+        type=str,
+        help="Name of the formatter to use. You either need to provide this or `config_dataset_path`",
+        default=None,
+    )
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        help="Name of the dataset to use. You either need to provide this or `config_dataset_path`",
+        default=None,
+    )
+    parser.add_argument(
+        "--dataset_path",
+        type=str,
+        help="Path to the dataset. You either need to provide this or `config_dataset_path`",
+        default=None,
+    )
+    parser.add_argument(
+        "--meta_file_train",
+        type=str,
+        help="Path to the train meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
+        default=None,
+    )
+    parser.add_argument(
+        "--meta_file_val",
+        type=str,
+        help="Path to the evaluation meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
+        default=None,
+    )
+    args = parser.parse_args()
+
+    compute_embeddings(
+        args.model_path,
+        args.config_path,
+        args.output_path,
+        old_spakers_file=args.old_file,
+        config_dataset_path=args.config_dataset_path,
+        formatter_name=args.formatter_name,
+        dataset_name=args.dataset_name,
+        dataset_path=args.dataset_path,
+        meta_file_train=args.meta_file_train,
+        meta_file_val=args.meta_file_val,
+        disable_cuda=args.disable_cuda,
+        no_eval=args.no_eval,
+    )
diff --git a/TTS/bin/compute_statistics.py b/TTS/bin/compute_statistics.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ab7ea7a3b10ec3cc23d8a744c7bdc79de52dbf2
--- /dev/null
+++ b/TTS/bin/compute_statistics.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import argparse
+import glob
+import os
+
+import numpy as np
+from tqdm import tqdm
+
+# from TTS.utils.io import load_config
+from TTS.config import load_config
+from TTS.tts.datasets import load_tts_samples
+from TTS.utils.audio import AudioProcessor
+
+
+def main():
+    """Run preprocessing process: compute per-bin mean and std of mel and linear spectrograms.
+
+    The statistics are accumulated over all frames of the selected audio files and saved with
+    `np.save` to `out_path`, together with the audio config that was used to compute them.
+
+    Raises:
+        ValueError: If no audio file is found, since the statistics would be undefined.
+    """
+    parser = argparse.ArgumentParser(description="Compute mean and variance of spectrogtram features.")
+    parser.add_argument("config_path", type=str, help="TTS config file path to define audio processin parameters.")
+    parser.add_argument("out_path", type=str, help="save path (directory and filename).")
+    parser.add_argument(
+        "--data_path",
+        type=str,
+        required=False,
+        help="folder including the target set of wavs overriding dataset config.",
+    )
+    # arguments unknown to this parser are forwarded to the config as overrides
+    args, overrides = parser.parse_known_args()
+
+    CONFIG = load_config(args.config_path)
+    CONFIG.parse_known_args(overrides, relaxed_parser=True)
+
+    # load config
+    CONFIG.audio.signal_norm = False  # do not apply earlier normalization
+    CONFIG.audio.stats_path = None  # discard pre-defined stats
+
+    # load audio processor
+    ap = AudioProcessor(**CONFIG.audio.to_dict())
+
+    # load the meta data of target dataset
+    if args.data_path:
+        dataset_items = glob.glob(os.path.join(args.data_path, "**", "*.wav"), recursive=True)
+    else:
+        dataset_items = load_tts_samples(CONFIG.datasets)[0]  # take only train data
+    print(f" > There are {len(dataset_items)} files.")
+    if len(dataset_items) == 0:
+        # fail with a clear message instead of a ZeroDivisionError further below
+        raise ValueError(" [!] No audio files found, the statistics cannot be computed.")
+
+    mel_sum = 0
+    mel_square_sum = 0
+    linear_sum = 0
+    linear_square_sum = 0
+    N = 0
+    for item in tqdm(dataset_items):
+        # compute features; items are plain paths (glob) or sample dicts (dataset config)
+        wav = ap.load_wav(item if isinstance(item, str) else item["audio_file"])
+        linear = ap.spectrogram(wav)
+        mel = ap.melspectrogram(wav)
+
+        # compute stats: running sums over the frame axis, N counts the frames
+        N += mel.shape[1]
+        mel_sum += mel.sum(1)
+        linear_sum += linear.sum(1)
+        mel_square_sum += (mel**2).sum(axis=1)
+        linear_square_sum += (linear**2).sum(axis=1)
+
+    # std = sqrt(E[x^2] - E[x]^2); rounding can push the variance slightly below zero,
+    # so clamp it to avoid NaN entries in the saved statistics
+    mel_mean = mel_sum / N
+    mel_scale = np.sqrt(np.maximum(mel_square_sum / N - mel_mean**2, 0.0))
+    linear_mean = linear_sum / N
+    linear_scale = np.sqrt(np.maximum(linear_square_sum / N - linear_mean**2, 0.0))
+
+    output_file_path = args.out_path
+    stats = {}
+    stats["mel_mean"] = mel_mean
+    stats["mel_std"] = mel_scale
+    stats["linear_mean"] = linear_mean
+    stats["linear_std"] = linear_scale
+
+    print(f" > Avg mel spec mean: {mel_mean.mean()}")
+    print(f" > Avg mel spec scale: {mel_scale.mean()}")
+    print(f" > Avg linear spec mean: {linear_mean.mean()}")
+    print(f" > Avg linear spec scale: {linear_scale.mean()}")
+
+    # set default config values for mean-var scaling
+    CONFIG.audio.stats_path = output_file_path
+    CONFIG.audio.signal_norm = True
+    # remove redundant values
+    del CONFIG.audio.max_norm
+    del CONFIG.audio.min_level_db
+    del CONFIG.audio.symmetric_norm
+    del CONFIG.audio.clip_norm
+    stats["audio_config"] = CONFIG.audio.to_dict()
+    np.save(output_file_path, stats, allow_pickle=True)
+    print(f" > stats saved to {output_file_path}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/TTS/bin/eval_encoder.py b/TTS/bin/eval_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..60fed1393215cd5e2e349795b585ae12f2e227fa
--- /dev/null
+++ b/TTS/bin/eval_encoder.py
@@ -0,0 +1,88 @@
+import argparse
+from argparse import RawTextHelpFormatter
+
+import torch
+from tqdm import tqdm
+
+from TTS.config import load_config
+from TTS.tts.datasets import load_tts_samples
+from TTS.tts.utils.speakers import SpeakerManager
+
+
+def compute_encoder_accuracy(dataset_items, encoder_manager):
+    """Print the per-class and the class-averaged accuracy of the encoder's classifier.
+
+    Args:
+        dataset_items: Samples to evaluate. Each one provides "audio_file" and the class label
+            stored under the encoder config's `class_name_key`.
+        encoder_manager: Manager used to compute the embeddings. A label is only predicted when it
+            has an `encoder_criterion` and its config defines `map_classid_to_classname`.
+
+    Raises:
+        RuntimeError: If the true class name or the predicted label of a sample is None.
+    """
+    encoder_config = encoder_manager.encoder_config
+    class_name_key = encoder_config.class_name_key
+    map_classid_to_classname = getattr(encoder_config, "map_classid_to_classname", None)
+
+    # class name -> list of 0/1 hits, one entry per evaluated sample
+    class_acc_dict = {}
+
+    for item in tqdm(dataset_items):
+        class_name = item[class_name_key]
+        wav_file = item["audio_file"]
+
+        # extract the embedding
+        embedd = encoder_manager.compute_embedding_from_clip(wav_file)
+
+        predicted_label = None
+        if encoder_manager.encoder_criterion is not None and map_classid_to_classname is not None:
+            embedding = torch.FloatTensor(embedd).unsqueeze(0)
+            if encoder_manager.use_cuda:
+                embedding = embedding.cuda()
+
+            class_id = encoder_manager.encoder_criterion.softmax.inference(embedding).item()
+            predicted_label = map_classid_to_classname[str(class_id)]
+
+        if class_name is None or predicted_label is None:
+            raise RuntimeError("Error: class_name or/and predicted_label are None")
+
+        class_acc_dict.setdefault(class_name, []).append(int(class_name == predicted_label))
+
+    # unweighted mean of the per-class accuracies
+    acc_avg = 0
+    for key, values in class_acc_dict.items():
+        acc = sum(values) / len(values)
+        print("Class", key, "Accuracy:", acc)
+        acc_avg += acc
+
+    print("Average Accuracy:", acc_avg / len(class_acc_dict))
+
+
+if __name__ == "__main__":
+
+    def _str_to_bool(value):
+        """Parse a command line boolean such as "True"/"False".
+
+        `type=bool` must not be used with argparse: `bool("False")` is True, so neither
+        `--use_cuda False` nor `--eval False` could ever switch the option off.
+        """
+        normalized = str(value).strip().lower()
+        if normalized in ("true", "t", "yes", "y", "1"):
+            return True
+        if normalized in ("false", "f", "no", "n", "0", ""):
+            return False
+        raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}.")
+
+    parser = argparse.ArgumentParser(
+        description="""Compute the accuracy of the encoder.\n\n"""
+        """
+        Example runs:
+        python TTS/bin/eval_encoder.py emotion_encoder_model.pth emotion_encoder_config.json  dataset_config.json
+        """,
+        formatter_class=RawTextHelpFormatter,
+    )
+    parser.add_argument("model_path", type=str, help="Path to model checkpoint file.")
+    parser.add_argument(
+        "config_path",
+        type=str,
+        help="Path to model config file.",
+    )
+
+    parser.add_argument(
+        "config_dataset_path",
+        type=str,
+        help="Path to dataset config file.",
+    )
+    parser.add_argument("--use_cuda", type=_str_to_bool, help="flag to set cuda.", default=True)
+    parser.add_argument("--eval", type=_str_to_bool, help="compute eval.", default=True)
+
+    args = parser.parse_args()
+
+    c_dataset = load_config(args.config_dataset_path)
+
+    meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=args.eval)
+    # now that `--eval False` is honoured the eval partition can be missing, so only
+    # concatenate when there is one
+    items = meta_data_train if meta_data_eval is None else meta_data_train + meta_data_eval
+
+    enc_manager = SpeakerManager(
+        encoder_model_path=args.model_path, encoder_config_path=args.config_path, use_cuda=args.use_cuda
+    )
+
+    compute_encoder_accuracy(items, enc_manager)
diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py
new file mode 100644
index 0000000000000000000000000000000000000000..9eadee070ed15283e4460331cbb48927392a2b12
--- /dev/null
+++ b/TTS/bin/extract_tts_spectrograms.py
@@ -0,0 +1,286 @@
+#!/usr/bin/env python3
+"""Extract Mel spectrograms with teacher forcing."""
+
+import argparse
+import os
+
+import numpy as np
+import torch
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+
+from TTS.config import load_config
+from TTS.tts.datasets import TTSDataset, load_tts_samples
+from TTS.tts.models import setup_model
+from TTS.tts.utils.speakers import SpeakerManager
+from TTS.tts.utils.text.tokenizer import TTSTokenizer
+from TTS.utils.audio import AudioProcessor
+from TTS.utils.generic_utils import count_parameters
+
+use_cuda = torch.cuda.is_available()
+
+
+def setup_loader(ap, r, verbose=False):
+    """Build the data loader that iterates over all samples in their stored order.
+
+    Reads the module-level globals `c` (config), `meta_data` (samples) and `speaker_manager`.
+
+    Args:
+        ap: Audio processor handed to the dataset.
+        r: Value passed to the dataset as `outputs_per_step`.
+        verbose: Forwarded to the dataset.
+
+    Returns:
+        DataLoader: Unshuffled loader over the dataset that keeps the last partial batch.
+    """
+    tokenizer, _ = TTSTokenizer.init_from_config(c)
+
+    dataset_kwargs = {
+        "outputs_per_step": r,
+        "compute_linear_spec": False,
+        "samples": meta_data,
+        "tokenizer": tokenizer,
+        "ap": ap,
+        "batch_group_size": 0,
+        "min_text_len": c.min_text_len,
+        "max_text_len": c.max_text_len,
+        "min_audio_len": c.min_audio_len,
+        "max_audio_len": c.max_audio_len,
+        "phoneme_cache_path": c.phoneme_cache_path,
+        "precompute_num_workers": 0,
+        "use_noise_augment": False,
+        "verbose": verbose,
+        # the speaker manager is only consulted for the speaker feature enabled in the config
+        "speaker_id_mapping": speaker_manager.name_to_id if c.use_speaker_embedding else None,
+        "d_vector_mapping": speaker_manager.embeddings if c.use_d_vector_file else None,
+    }
+    dataset = TTSDataset(**dataset_kwargs)
+
+    if c.use_phonemes and c.compute_input_seq_cache:
+        # precompute phonemes to have a better estimate of sequence lengths.
+        dataset.compute_input_seq(c.num_loader_workers)
+    dataset.preprocess_samples()
+
+    return DataLoader(
+        dataset,
+        batch_size=c.batch_size,
+        shuffle=False,
+        collate_fn=dataset.collate_fn,
+        drop_last=False,
+        sampler=None,
+        num_workers=c.num_loader_workers,
+        pin_memory=False,
+    )
+
+
+def set_filename(wav_path, out_path):
+    """Derive the output paths for one wav file and make sure their folders exist.
+
+    Creates the "quant", "mel", "wav_gl" and "wav" folders below `out_path` when missing.
+
+    Args:
+        wav_path (str): Path of the source wav file.
+        out_path (str): Root output folder.
+
+    Returns:
+        tuple: `(file_name, wavq_path, mel_path, wav_gl_path, wav_path)`. `wavq_path` and
+            `mel_path` carry no file extension, the two wav paths end with ".wav".
+    """
+    wav_file = os.path.basename(wav_path)
+    # strip only the final extension: cutting at the first dot would map "a.1.wav" and
+    # "a.2.wav" to the same name and make their outputs overwrite each other
+    file_name = os.path.splitext(wav_file)[0]
+    for folder in ("quant", "mel", "wav_gl", "wav"):
+        os.makedirs(os.path.join(out_path, folder), exist_ok=True)
+    wavq_path = os.path.join(out_path, "quant", file_name)
+    mel_path = os.path.join(out_path, "mel", file_name)
+    wav_gl_path = os.path.join(out_path, "wav_gl", file_name + ".wav")
+    wav_path = os.path.join(out_path, "wav", file_name + ".wav")
+    return file_name, wavq_path, mel_path, wav_gl_path, wav_path
+
+
+def format_data(data):
+    """Unpack a batch dict and move its tensors to the GPU when CUDA is in use.
+
+    Args:
+        data (dict): Batch holding the keys "token_id", "token_id_lengths", "mel", "mel_lengths",
+            "item_idxs", "d_vectors", "speaker_ids" and "attns".
+
+    Returns:
+        tuple: `(text_input, text_lengths, mel_input, mel_lengths, speaker_ids, d_vectors,
+            avg_text_length, avg_spec_length, attn_mask, item_idx)`.
+    """
+    # setup input data
+    text_input, text_lengths = data["token_id"], data["token_id_lengths"]
+    mel_input, mel_lengths = data["mel"], data["mel_lengths"]
+    item_idx = data["item_idxs"]
+    d_vectors = data["d_vectors"]
+    speaker_ids = data["speaker_ids"]
+    attn_mask = data["attns"]
+    # the averages are taken before the tensors are dispatched to the GPU
+    avg_text_length = torch.mean(text_lengths.float())
+    avg_spec_length = torch.mean(mel_lengths.float())
+
+    # dispatch data to GPU (`use_cuda` is a module-level flag)
+    if use_cuda:
+        text_input, text_lengths, mel_input, mel_lengths = (
+            tensor.cuda(non_blocking=True) for tensor in (text_input, text_lengths, mel_input, mel_lengths)
+        )
+        # the remaining entries are optional and may be None
+        if speaker_ids is not None:
+            speaker_ids = speaker_ids.cuda(non_blocking=True)
+        if d_vectors is not None:
+            d_vectors = d_vectors.cuda(non_blocking=True)
+        if attn_mask is not None:
+            attn_mask = attn_mask.cuda(non_blocking=True)
+
+    return (
+        text_input,
+        text_lengths,
+        mel_input,
+        mel_lengths,
+        speaker_ids,
+        d_vectors,
+        avg_text_length,
+        avg_spec_length,
+        attn_mask,
+        item_idx,
+    )
+
+
+@torch.no_grad()
+def inference(
+    model_name,
+    model,
+    ap,
+    text_input,
+    text_lengths,
+    mel_input,
+    mel_lengths,
+    speaker_ids=None,
+    d_vectors=None,
+):
+    """Run the model on the given text/mel batch and return its spectrogram output as a numpy array.
+
+    Args:
+        model_name (str): One of "glow_tts", "tacotron" or "tacotron2".
+        model: Model to run.
+        ap: Audio processor; its `out_linear_to_mel` converts the "tacotron" outputs.
+        text_input: Batch of input token ids.
+        text_lengths: Lengths of the input sequences.
+        mel_input: Batch of target mel spectrograms.
+        mel_lengths: Lengths of the target spectrograms.
+        speaker_ids: Optional speaker ids.
+        d_vectors: Optional speaker embeddings.
+
+    Returns:
+        np.ndarray: The model output moved to the CPU.
+
+    Raises:
+        ValueError: If `model_name` is not supported. Previously such names ran into an
+            UnboundLocalError at the final return.
+    """
+    if model_name not in ("glow_tts", "tacotron", "tacotron2"):
+        raise ValueError(f" [!] Spectrogram extraction is not supported for model `{model_name}`.")
+
+    if model_name == "glow_tts":
+        speaker_c = None
+        if speaker_ids is not None:
+            speaker_c = speaker_ids
+        elif d_vectors is not None:
+            speaker_c = d_vectors
+        outputs = model.inference_with_MAS(
+            text_input,
+            text_lengths,
+            mel_input,
+            mel_lengths,
+            aux_input={"d_vectors": speaker_c, "speaker_ids": speaker_ids},
+        )
+        return outputs["model_outputs"].detach().cpu().numpy()
+
+    # "tacotron" and "tacotron2" share the forward call and differ only in the post-processing
+    aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors}
+    outputs = model(text_input, text_lengths, mel_input, mel_lengths, aux_input)
+    postnet_outputs = outputs["model_outputs"]
+
+    if model_name == "tacotron2":
+        return postnet_outputs.detach().cpu().numpy()
+
+    # normalize tacotron output: convert every item of the batch with `ap.out_linear_to_mel`
+    postnet_outputs = postnet_outputs.data.cpu().numpy()
+    mel_specs = [
+        torch.FloatTensor(ap.out_linear_to_mel(postnet_output.T).T) for postnet_output in postnet_outputs
+    ]
+    return torch.stack(mel_specs).cpu().numpy()
+
+
+def extract_spectrograms(
+    data_loader, model, ap, output_path, quantized_wav=False, save_audio=False, debug=False, metada_name="metada.txt"
+):
+    """Run the model over every batch, save the predicted mel spectrograms and write a metadata file.
+
+    Args:
+        data_loader: Loader yielding batches accepted by `format_data`.
+        model: Model to run; it is switched to eval mode first.
+        ap: Audio processor used to load, quantize, invert and save audio.
+        output_path (str): Root folder for all outputs.
+        quantized_wav (bool): Also save the quantized waveform of every item.
+        save_audio (bool): Also save the loaded waveform of every item.
+        debug (bool): Also save audio obtained with `ap.inv_melspectrogram` from each saved mel.
+        metada_name (str): Name of the metadata file created inside `output_path`.
+    """
+    model.eval()
+    export_metadata = []
+    model_name = c.model.lower()
+    for batch in tqdm(data_loader, total=len(data_loader)):
+        # format data; the two average lengths and the attention mask are not needed here
+        text_input, text_lengths, mel_input, mel_lengths, speaker_ids, d_vectors, _, _, _, item_idx = format_data(
+            batch
+        )
+
+        model_output = inference(
+            model_name,
+            model,
+            ap,
+            text_input,
+            text_lengths,
+            mel_input,
+            mel_lengths,
+            speaker_ids,
+            d_vectors,
+        )
+
+        for idx in range(text_input.shape[0]):
+            wav_file_path = item_idx[idx]
+            wav = ap.load_wav(wav_file_path)
+            _, wavq_path, mel_path, wav_gl_path, wav_path = set_filename(wav_file_path, output_path)
+
+            # quantize and save wav
+            if quantized_wav:
+                np.save(wavq_path, ap.quantize(wav))
+
+            # save TTS mel, cut to the item's length and transposed
+            mel = model_output[idx][: mel_lengths[idx], :].T
+            np.save(mel_path, mel)
+
+            export_metadata.append([wav_file_path, mel_path])
+            if save_audio:
+                ap.save_wav(wav, wav_path)
+
+            if debug:
+                print("Audio for debug saved at:", wav_gl_path)
+                wav = ap.inv_melspectrogram(mel)
+                ap.save_wav(wav, wav_gl_path)
+
+    # one "<wav path>|<mel path>.npy" line per item (np.save appends the ".npy" extension)
+    with open(os.path.join(output_path, metada_name), "w", encoding="utf-8") as f:
+        for entry in export_metadata:
+            f.write(f"{entry[0]}|{entry[1]+'.npy'}\n")
+
+
+def _str_to_bool(value):
+    """Parse a command line boolean such as "True"/"False".
+
+    `type=bool` must not be used with argparse: `bool("False")` is True, so `--eval False`
+    could never disable the evaluation split.
+    """
+    normalized = str(value).strip().lower()
+    if normalized in ("true", "t", "yes", "y", "1"):
+        return True
+    if normalized in ("false", "f", "no", "n", "0", ""):
+        return False
+    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}.")
+
+
+def main(args):  # pylint: disable=redefined-outer-name
+    """Load the data and the model described by the global config `c` and extract the spectrograms.
+
+    Sets the module-level globals `meta_data` and `speaker_manager`, which `setup_loader` reads.
+
+    Args:
+        args: Parsed command line arguments providing `eval`, `checkpoint_path`, `output_path`,
+            `quantized`, `save_audio` and `debug`.
+    """
+    # pylint: disable=global-variable-undefined
+    global meta_data, speaker_manager
+
+    # Audio processor
+    ap = AudioProcessor(**c.audio)
+
+    # load data instances
+    meta_data_train, meta_data_eval = load_tts_samples(
+        c.datasets, eval_split=args.eval, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
+    )
+
+    # use eval and training partitions; the eval partition can be missing when it was not requested
+    meta_data = meta_data_train if meta_data_eval is None else meta_data_train + meta_data_eval
+
+    # init speaker manager
+    if c.use_speaker_embedding:
+        speaker_manager = SpeakerManager(data_items=meta_data)
+    elif c.use_d_vector_file:
+        speaker_manager = SpeakerManager(d_vectors_file_path=c.d_vector_file)
+    else:
+        speaker_manager = None
+
+    # setup model
+    model = setup_model(c)
+
+    # restore model
+    model.load_checkpoint(c, args.checkpoint_path, eval=True)
+
+    if use_cuda:
+        model.cuda()
+
+    num_params = count_parameters(model)
+    print("\n > Model has {} parameters".format(num_params), flush=True)
+    # set r
+    r = 1 if c.model.lower() == "glow_tts" else model.decoder.r
+    own_loader = setup_loader(ap, r, verbose=True)
+
+    extract_spectrograms(
+        own_loader,
+        model,
+        ap,
+        args.output_path,
+        quantized_wav=args.quantized,
+        save_audio=args.save_audio,
+        debug=args.debug,
+        metada_name="metada.txt",
+    )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--config_path", type=str, help="Path to config file for training.", required=True)
+    parser.add_argument("--checkpoint_path", type=str, help="Model file to be restored.", required=True)
+    parser.add_argument("--output_path", type=str, help="Path to save mel specs", required=True)
+    parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug")
+    parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files")
+    parser.add_argument("--quantized", action="store_true", help="Save quantized audio files")
+    parser.add_argument("--eval", type=_str_to_bool, help="compute eval.", default=True)
+    args = parser.parse_args()
+
+    c = load_config(args.config_path)
+    # keep the audio untrimmed when it is loaded for extraction
+    c.audio.trim_silence = False
+    main(args)
diff --git a/TTS/bin/find_unique_chars.py b/TTS/bin/find_unique_chars.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea16974839df6cf9942ef24a5535597940fde5b2
--- /dev/null
+++ b/TTS/bin/find_unique_chars.py
@@ -0,0 +1,45 @@
+"""Find all the unique characters in a dataset"""
+import argparse
+from argparse import RawTextHelpFormatter
+
+from TTS.config import load_config
+from TTS.tts.datasets import load_tts_samples
+
+
+def main():
+    """Print the unique characters found in the texts of all datasets of a config.
+
+    Reports the number of unique characters, the characters themselves, the subset that is
+    lower case and the set obtained after lower-casing every character.
+    """
+    # pylint: disable=bad-option-value
+    parser = argparse.ArgumentParser(
+        description="""Find all the unique characters or phonemes in a dataset.\n\n"""
+        """
+    Example runs:
+
+    python TTS/bin/find_unique_chars.py --config_path config.json
+    """,
+        formatter_class=RawTextHelpFormatter,
+    )
+    parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
+    args = parser.parse_args()
+
+    c = load_config(args.config_path)
+
+    # load all datasets
+    train_items, eval_items = load_tts_samples(
+        c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
+    )
+
+    items = train_items + eval_items
+
+    # every character of every transcript, de-duplicated
+    chars = set("".join(item["text"] for item in items))
+    lower_chars = [char for char in chars if char.islower()]
+    chars_force_lower = {char.lower() for char in chars}
+
+    print(f" > Number of unique characters: {len(chars)}")
+    print(f" > Unique characters: {''.join(sorted(chars))}")
+    print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")
+    print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/TTS/bin/find_unique_phonemes.py b/TTS/bin/find_unique_phonemes.py
new file mode 100644
index 0000000000000000000000000000000000000000..4bd7a78eef2c4850bca9369def55d68336cd53aa
--- /dev/null
+++ b/TTS/bin/find_unique_phonemes.py
@@ -0,0 +1,74 @@
+"""Find all the unique characters in a dataset"""
+import argparse
+import multiprocessing
+from argparse import RawTextHelpFormatter
+
+from tqdm.contrib.concurrent import process_map
+
+from TTS.config import load_config
+from TTS.tts.datasets import load_tts_samples
+from TTS.tts.utils.text.phonemizers import Gruut
+
+
+def compute_phonemes(item):
+    """Return the set of symbols in the phonemized text of `item`, with "|" removed.
+
+    Uses the module-level `phonemizer` that `main` sets up.
+    """
+    phonemized = phonemizer.phonemize(item["text"])
+    return set(phonemized.replace("|", ""))
+
+
+def main():
+    """Print the unique phonemes found in the texts of all datasets of a config.
+
+    All samples must define the same language; that language is used to build the phonemizer.
+
+    Raises:
+        ValueError: If the config has no phoneme language, a sample has no language, or the
+            samples use more than one language.
+    """
+    # pylint: disable=W0601
+    global c, phonemizer
+    # pylint: disable=bad-option-value
+    parser = argparse.ArgumentParser(
+        description="""Find all the unique characters or phonemes in a dataset.\n\n"""
+        """
+    Example runs:
+
+    python TTS/bin/find_unique_phonemes.py --config_path config.json
+    """,
+        formatter_class=RawTextHelpFormatter,
+    )
+    parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
+    args = parser.parse_args()
+
+    c = load_config(args.config_path)
+
+    # load all datasets
+    train_items, eval_items = load_tts_samples(
+        c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
+    )
+    items = train_items + eval_items
+    print("Num items:", len(items))
+
+    language_list = [item["language"] for item in items]
+    every_item_has_language = all(language_list)
+
+    if not (c.phoneme_language and every_item_has_language):
+        raise ValueError("Phoneme language must be defined in config.")
+
+    # all samples have to share the language of the first one
+    first_language = language_list[0]
+    if any(language != first_language for language in language_list):
+        raise ValueError(
+            "Currently, just one phoneme language per config file is supported !! Please split the dataset config into different configs and run it individually for each language !!"
+        )
+
+    # NOTE(review): the worker processes read this global in `compute_phonemes`; that presumably
+    # relies on a fork-style process start — confirm on platforms that spawn workers.
+    phonemizer = Gruut(language=first_language, keep_puncs=True)
+
+    phonemes = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15)
+    # merge the per-item sets into one
+    phones = set().union(*phonemes)
+    lower_phones = [phone for phone in phones if phone.islower()]
+    phones_force_lower = {phone.lower() for phone in phones}
+
+    print(f" > Number of unique phonemes: {len(phones)}")
+    print(f" > Unique phonemes: {''.join(sorted(phones))}")
+    print(f" > Unique lower phonemes: {''.join(sorted(lower_phones))}")
+    print(f" > Unique all forced to lower phonemes: {''.join(sorted(phones_force_lower))}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/TTS/bin/remove_silence_using_vad.py b/TTS/bin/remove_silence_using_vad.py
new file mode 100644
index 0000000000000000000000000000000000000000..352628bbc163338668edfe33ea328825d5b9a015
--- /dev/null
+++ b/TTS/bin/remove_silence_using_vad.py
@@ -0,0 +1,93 @@
+import argparse
+import glob
+import os
+import pathlib
+
+from tqdm import tqdm
+
+from TTS.utils.vad import get_vad_model_and_utils, remove_silence
+
+
+def adjust_path_and_remove_silence(audio_path):
+ output_path = audio_path.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, ""))
+ # ignore if the file exists
+ if os.path.exists(output_path) and not args.force:
+ return output_path
+
+ # create all directory structure
+ pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
+ # remove the silence and save the audio
+ output_path, is_speech = remove_silence(
+ model_and_utils,
+ audio_path,
+ output_path,
+ trim_just_beginning_and_end=args.trim_just_beginning_and_end,
+ use_cuda=args.use_cuda,
+ )
+
+ return output_path, is_speech
+
+
+def preprocess_audios():
+ files = sorted(glob.glob(os.path.join(args.input_dir, args.glob), recursive=True))
+ print("> Number of files: ", len(files))
+ if not args.force:
+ print("> Ignoring files that already exist in the output idrectory.")
+
+ if args.trim_just_beginning_and_end:
+ print("> Trimming just the beginning and the end with nonspeech parts.")
+ else:
+ print("> Trimming all nonspeech parts.")
+
+ filtered_files = []
+ if files:
+ # create threads
+ # num_threads = multiprocessing.cpu_count()
+ # process_map(adjust_path_and_remove_silence, files, max_workers=num_threads, chunksize=15)
+ for f in tqdm(files):
+ output_path, is_speech = adjust_path_and_remove_silence(f)
+ if not is_speech:
+ filtered_files.append(output_path)
+
+ # write files that do not have speech
+ with open(os.path.join(args.output_dir, "filtered_files.txt"), "w", encoding="utf-8") as f:
+ for file in filtered_files:
+ f.write(file + "\n")
+ else:
+ print("> No files Found !")
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+ description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end True"
+ )
+ parser.add_argument("-i", "--input_dir", type=str, default="../VCTK-Corpus", help="Dataset root dir")
+ parser.add_argument(
+ "-o", "--output_dir", type=str, default="../VCTK-Corpus-removed-silence", help="Output Dataset dir"
+ )
+ parser.add_argument("-f", "--force", default=False, action="store_true", help="Force the replace of exists files")
+ parser.add_argument(
+ "-g",
+ "--glob",
+ type=str,
+ default="**/*.wav",
+ help="path in glob format for acess wavs from input_dir. ex: wav48/*/*.wav",
+ )
+ parser.add_argument(
+ "-t",
+ "--trim_just_beginning_and_end",
+ type=bool,
+ default=True,
+ help="If True this script will trim just the beginning and end nonspeech parts. If False all nonspeech parts will be trim. Default True",
+ )
+ parser.add_argument(
+ "-c",
+ "--use_cuda",
+ type=bool,
+ default=False,
+ help="If True use cuda",
+ )
+ args = parser.parse_args()
+ # load the model and utils
+ model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda)
+ preprocess_audios()
diff --git a/TTS/bin/resample.py b/TTS/bin/resample.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3f28485d1fb235ab0d521ee30318c64b48fbd5a
--- /dev/null
+++ b/TTS/bin/resample.py
@@ -0,0 +1,90 @@
+import argparse
+import glob
+import os
+from argparse import RawTextHelpFormatter
+from multiprocessing import Pool
+from shutil import copytree
+
+import librosa
+import soundfile as sf
+from tqdm import tqdm
+
+
+def resample_file(func_args):
+ filename, output_sr = func_args
+ y, sr = librosa.load(filename, sr=output_sr)
+ sf.write(filename, y, sr)
+
+
+def resample_files(input_dir, output_sr, output_dir=None, file_ext="wav", n_jobs=10):
+ if output_dir:
+ print("Recursively copying the input folder...")
+ copytree(input_dir, output_dir)
+ input_dir = output_dir
+
+ print("Resampling the audio files...")
+ audio_files = glob.glob(os.path.join(input_dir, f"**/*.{file_ext}"), recursive=True)
+ print(f"Found {len(audio_files)} files...")
+ audio_files = list(zip(audio_files, len(audio_files) * [output_sr]))
+ with Pool(processes=n_jobs) as p:
+ with tqdm(total=len(audio_files)) as pbar:
+ for _, _ in enumerate(p.imap_unordered(resample_file, audio_files)):
+ pbar.update()
+
+ print("Done !")
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+ description="""Resample a folder recusively with librosa
+ Can be used in place or create a copy of the folder as an output.\n\n
+ Example run:
+ python TTS/bin/resample.py
+ --input_dir /root/LJSpeech-1.1/
+ --output_sr 22050
+ --output_dir /root/resampled_LJSpeech-1.1/
+ --file_ext wav
+ --n_jobs 24
+ """,
+ formatter_class=RawTextHelpFormatter,
+ )
+
+ parser.add_argument(
+ "--input_dir",
+ type=str,
+ default=None,
+ required=True,
+ help="Path of the folder containing the audio files to resample",
+ )
+
+ parser.add_argument(
+ "--output_sr",
+ type=int,
+ default=22050,
+ required=False,
+ help="Samlple rate to which the audio files should be resampled",
+ )
+
+ parser.add_argument(
+ "--output_dir",
+ type=str,
+ default=None,
+ required=False,
+ help="Path of the destination folder. If not defined, the operation is done in place",
+ )
+
+ parser.add_argument(
+ "--file_ext",
+ type=str,
+ default="wav",
+ required=False,
+ help="Extension of the audio files to resample",
+ )
+
+ parser.add_argument(
+ "--n_jobs", type=int, default=None, help="Number of threads to use, by default it uses all cores"
+ )
+
+ args = parser.parse_args()
+
+ resample_files(args.input_dir, args.output_sr, args.output_dir, args.file_ext, args.n_jobs)
diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py
new file mode 100644
index 0000000000000000000000000000000000000000..2877ea2bdecafc3a462e653e86ade15373503744
--- /dev/null
+++ b/TTS/bin/synthesize.py
@@ -0,0 +1,418 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import argparse
+import sys
+from argparse import RawTextHelpFormatter
+
+# pylint: disable=redefined-outer-name, unused-argument
+from pathlib import Path
+
+from TTS.utils.manage import ModelManager
+from TTS.utils.synthesizer import Synthesizer
+
+
+def str2bool(v):
+ if isinstance(v, bool):
+ return v
+ if v.lower() in ("yes", "true", "t", "y", "1"):
+ return True
+ if v.lower() in ("no", "false", "f", "n", "0"):
+ return False
+ raise argparse.ArgumentTypeError("Boolean value expected.")
+
+
+def main():
+ description = """Synthesize speech on command line.
+
+You can either use your trained model or choose a model from the provided list.
+
+If you don't specify any models, then it uses LJSpeech based English model.
+
+## Example Runs
+
+### Single Speaker Models
+
+- List provided models:
+
+ ```
+ $ tts --list_models
+ ```
+
+- Query info for model info by idx:
+
+ ```
+ $ tts --model_info_by_idx "/"
+ ```
+
+- Query info for model info by full name:
+
+ ```
+ $ tts --model_info_by_name "///"
+ ```
+
+- Run TTS with default models:
+
+ ```
+ $ tts --text "Text for TTS"
+ ```
+
+- Run a TTS model with its default vocoder model:
+
+ ```
+ $ tts --text "Text for TTS" --model_name "///
+ ```
+
+- Run with specific TTS and vocoder models from the list:
+
+ ```
+ $ tts --text "Text for TTS" --model_name "///" --vocoder_name "///" --output_path
+ ```
+
+- Run your own TTS model (Using Griffin-Lim Vocoder):
+
+ ```
+ $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
+ ```
+
+- Run your own TTS and Vocoder models:
+ ```
+ $ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth --out_path output/path/speech.wav
+ --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
+ ```
+
+### Multi-speaker Models
+
+- List the available speakers and choose as among them:
+
+ ```
+ $ tts --model_name "//" --list_speaker_idxs
+ ```
+
+- Run the multi-speaker TTS model with the target speaker ID:
+
+ ```
+ $ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "//" --speaker_idx
+ ```
+
+- Run your own multi-speaker TTS model:
+
+ ```
+ $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth --speakers_file_path path/to/speaker.json --speaker_idx
+ ```
+
+### Voice Conversion Models
+
+ ```
+ $ tts --out_path output/path/speech.wav --model_name "//" --source_wav --target_wav
+ ```
+ """
+ # We remove Markdown code formatting programmatically here to allow us to copy-and-paste from main README to keep
+ # documentation in sync more easily.
+ parser = argparse.ArgumentParser(
+ description=description.replace(" ```\n", ""),
+ formatter_class=RawTextHelpFormatter,
+ )
+
+ parser.add_argument(
+ "--list_models",
+ type=str2bool,
+ nargs="?",
+ const=True,
+ default=False,
+ help="list available pre-trained TTS and vocoder models.",
+ )
+
+ parser.add_argument(
+ "--model_info_by_idx",
+ type=str,
+ default=None,
+ help="model info using query format: /",
+ )
+
+ parser.add_argument(
+ "--model_info_by_name",
+ type=str,
+ default=None,
+ help="model info using query format: ///",
+ )
+
+ parser.add_argument("--text", type=str, default=None, help="Text to generate speech.")
+
+ # Args for running pre-trained TTS models.
+ parser.add_argument(
+ "--model_name",
+ type=str,
+ default="tts_models/en/ljspeech/tacotron2-DDC",
+ help="Name of one of the pre-trained TTS models in format //",
+ )
+ parser.add_argument(
+ "--vocoder_name",
+ type=str,
+ default=None,
+ help="Name of one of the pre-trained vocoder models in format //",
+ )
+
+ # Args for running custom models
+ parser.add_argument("--config_path", default=None, type=str, help="Path to model config file.")
+ parser.add_argument(
+ "--model_path",
+ type=str,
+ default=None,
+ help="Path to model file.",
+ )
+ parser.add_argument(
+ "--out_path",
+ type=str,
+ default="tts_output.wav",
+ help="Output wav file path.",
+ )
+ parser.add_argument("--use_cuda", type=bool, help="Run model on CUDA.", default=False)
+ parser.add_argument(
+ "--vocoder_path",
+ type=str,
+ help="Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
+ default=None,
+ )
+ parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None)
+ parser.add_argument(
+ "--encoder_path",
+ type=str,
+ help="Path to speaker encoder model file.",
+ default=None,
+ )
+ parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None)
+
+ # args for multi-speaker synthesis
+ parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
+ parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None)
+ parser.add_argument(
+ "--speaker_idx",
+ type=str,
+ help="Target speaker ID for a multi-speaker TTS model.",
+ default=None,
+ )
+ parser.add_argument(
+ "--language_idx",
+ type=str,
+ help="Target language ID for a multi-lingual TTS model.",
+ default=None,
+ )
+ parser.add_argument(
+ "--speaker_wav",
+ nargs="+",
+ help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The d_vectors is computed as their average.",
+ default=None,
+ )
+ parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
+ parser.add_argument(
+ "--capacitron_style_wav", type=str, help="Wav path file for Capacitron prosody reference.", default=None
+ )
+ parser.add_argument("--capacitron_style_text", type=str, help="Transcription of the reference.", default=None)
+ parser.add_argument(
+ "--list_speaker_idxs",
+ help="List available speaker ids for the defined multi-speaker model.",
+ type=str2bool,
+ nargs="?",
+ const=True,
+ default=False,
+ )
+ parser.add_argument(
+ "--list_language_idxs",
+ help="List available language ids for the defined multi-lingual model.",
+ type=str2bool,
+ nargs="?",
+ const=True,
+ default=False,
+ )
+ # aux args
+ parser.add_argument(
+ "--save_spectogram",
+ type=bool,
+ help="If true save raw spectogram for further (vocoder) processing in out_path.",
+ default=False,
+ )
+ parser.add_argument(
+ "--reference_wav",
+ type=str,
+ help="Reference wav file to convert in the voice of the speaker_idx or speaker_wav",
+ default=None,
+ )
+ parser.add_argument(
+ "--reference_speaker_idx",
+ type=str,
+ help="speaker ID of the reference_wav speaker (If not provided the embedding will be computed using the Speaker Encoder).",
+ default=None,
+ )
+ parser.add_argument(
+ "--progress_bar",
+ type=str2bool,
+ help="If true shows a progress bar for the model download. Defaults to True",
+ default=True,
+ )
+
+ # voice conversion args
+ parser.add_argument(
+ "--source_wav",
+ type=str,
+ default=None,
+ help="Original audio file to convert in the voice of the target_wav",
+ )
+ parser.add_argument(
+ "--target_wav",
+ type=str,
+ default=None,
+ help="Target audio file to convert in the voice of the source_wav",
+ )
+
+ args = parser.parse_args()
+
+ # print the description if either text or list_models is not set
+ check_args = [
+ args.text,
+ args.list_models,
+ args.list_speaker_idxs,
+ args.list_language_idxs,
+ args.reference_wav,
+ args.model_info_by_idx,
+ args.model_info_by_name,
+ args.source_wav,
+ args.target_wav,
+ ]
+ if not any(check_args):
+ parser.parse_args(["-h"])
+
+ # load model manager
+ path = Path(__file__).parent / "../.models.json"
+ manager = ModelManager(path, progress_bar=args.progress_bar)
+
+ tts_path = None
+ tts_config_path = None
+ speakers_file_path = None
+ language_ids_file_path = None
+ vocoder_path = None
+ vocoder_config_path = None
+ encoder_path = None
+ encoder_config_path = None
+ vc_path = None
+ vc_config_path = None
+
+ # CASE1 #list : list pre-trained TTS models
+ if args.list_models:
+ manager.list_models()
+ sys.exit()
+
+ # CASE2 #info : model info for pre-trained TTS models
+ if args.model_info_by_idx:
+ model_query = args.model_info_by_idx
+ manager.model_info_by_idx(model_query)
+ sys.exit()
+
+ if args.model_info_by_name:
+ model_query_full_name = args.model_info_by_name
+ manager.model_info_by_full_name(model_query_full_name)
+ sys.exit()
+
+ # CASE3: load pre-trained model paths
+ if args.model_name is not None and not args.model_path:
+ model_path, config_path, model_item = manager.download_model(args.model_name)
+
+ # tts model
+ if model_item["model_type"] == "tts_models":
+ tts_path = model_path
+ tts_config_path = config_path
+ if "default_vocoder" in model_item:
+ args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name
+
+ # voice conversion model
+ if model_item["model_type"] == "voice_conversion_models":
+ vc_path = model_path
+ vc_config_path = config_path
+
+ # load vocoder
+ if args.vocoder_name is not None and not args.vocoder_path:
+ vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)
+
+ # CASE4: set custom model paths
+ if args.model_path is not None:
+ tts_path = args.model_path
+ tts_config_path = args.config_path
+ speakers_file_path = args.speakers_file_path
+ language_ids_file_path = args.language_ids_file_path
+
+ if args.vocoder_path is not None:
+ vocoder_path = args.vocoder_path
+ vocoder_config_path = args.vocoder_config_path
+
+ if args.encoder_path is not None:
+ encoder_path = args.encoder_path
+ encoder_config_path = args.encoder_config_path
+
+ # load models
+ synthesizer = Synthesizer(
+ tts_path,
+ tts_config_path,
+ speakers_file_path,
+ language_ids_file_path,
+ vocoder_path,
+ vocoder_config_path,
+ encoder_path,
+ encoder_config_path,
+ vc_path,
+ vc_config_path,
+ args.use_cuda,
+ )
+
+ # query speaker ids of a multi-speaker model.
+ if args.list_speaker_idxs:
+ print(
+ " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
+ )
+ print(synthesizer.tts_model.speaker_manager.name_to_id)
+ return
+
+ # query langauge ids of a multi-lingual model.
+ if args.list_language_idxs:
+ print(
+ " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
+ )
+ print(synthesizer.tts_model.language_manager.name_to_id)
+ return
+
+ # check the arguments against a multi-speaker model.
+ if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
+ print(
+ " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
+ "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
+ )
+ return
+
+ # RUN THE SYNTHESIS
+ if args.text:
+ print(" > Text: {}".format(args.text))
+
+ # kick it
+ if tts_path is not None:
+ wav = synthesizer.tts(
+ args.text,
+ args.speaker_idx,
+ args.language_idx,
+ args.speaker_wav,
+ reference_wav=args.reference_wav,
+ style_wav=args.capacitron_style_wav,
+ style_text=args.capacitron_style_text,
+ reference_speaker_name=args.reference_speaker_idx,
+ )
+ elif vc_path is not None:
+ wav = synthesizer.voice_conversion(
+ source_wav=args.source_wav,
+ target_wav=args.target_wav,
+ )
+
+ # save the results
+ print(" > Saving output to {}".format(args.out_path))
+ synthesizer.save_wav(wav, args.out_path)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..f2e7779c0c109a3ec78f1972ebf1147ec436048a
--- /dev/null
+++ b/TTS/bin/train_encoder.py
@@ -0,0 +1,319 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import os
+import sys
+import time
+import traceback
+
+import torch
+from torch.utils.data import DataLoader
+from trainer.torch import NoamLR
+from trainer.trainer_utils import get_optimizer
+
+from TTS.encoder.dataset import EncoderDataset
+from TTS.encoder.utils.generic_utils import save_best_model, save_checkpoint, setup_encoder_model
+from TTS.encoder.utils.training import init_training
+from TTS.encoder.utils.visual import plot_embeddings
+from TTS.tts.datasets import load_tts_samples
+from TTS.utils.audio import AudioProcessor
+from TTS.utils.generic_utils import count_parameters, remove_experiment_folder
+from TTS.utils.io import copy_model_files
+from TTS.utils.samplers import PerfectBatchSampler
+from TTS.utils.training import check_update
+
+torch.backends.cudnn.enabled = True
+torch.backends.cudnn.benchmark = True
+torch.manual_seed(54321)
+use_cuda = torch.cuda.is_available()
+num_gpus = torch.cuda.device_count()
+print(" > Using CUDA: ", use_cuda)
+print(" > Number of GPUs: ", num_gpus)
+
+
+def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False):
+    """Build the train (or, with ``is_val``, eval) DataLoader; returns ``(loader, classes, class-id-to-name map)``."""
+    num_utter_per_class = c.num_utter_per_class if not is_val else c.eval_num_utter_per_class
+    num_classes_in_batch = c.num_classes_in_batch if not is_val else c.eval_num_classes_in_batch
+
+    dataset = EncoderDataset(
+        c,
+        ap,
+        meta_data_eval if is_val else meta_data_train,
+        voice_len=c.voice_len,
+        num_utter_per_class=num_utter_per_class,
+        num_classes_in_batch=num_classes_in_batch,
+        verbose=verbose,
+        augmentation_config=c.audio_augmentation if not is_val else None,
+        use_torch_spec=c.model_params.get("use_torch_spec", False),
+    )
+    # get classes list
+    classes = dataset.get_class_list()
+
+    sampler = PerfectBatchSampler(
+        dataset.items,
+        classes,
+        batch_size=num_classes_in_batch * num_utter_per_class,  # total batch size
+        num_classes_in_batch=num_classes_in_batch,
+        num_gpus=1,
+        shuffle=not is_val,
+        drop_last=True,
+    )
+
+    if len(classes) < num_classes_in_batch:
+        if is_val:
+            raise RuntimeError(
+                f"config.eval_num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Eval dataset) !"
+            )
+        raise RuntimeError(
+            f"config.num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Train dataset) !"
+        )
+
+    # set the classes to avoid getting a wrong class_id when the number of training and eval classes are not equal
+    if is_val:
+        dataset.set_classes(train_classes)
+
+    loader = DataLoader(
+        dataset,
+        num_workers=c.num_loader_workers,
+        batch_sampler=sampler,
+        collate_fn=dataset.collate_fn,
+    )
+    return loader, classes, dataset.get_map_classid_to_classname()
+
+
+def evaluation(model, criterion, data_loader, global_step):
+ eval_loss = 0
+ for _, data in enumerate(data_loader):
+ with torch.no_grad():
+ # setup input data
+ inputs, labels = data
+
+ # agroup samples of each class in the batch. perfect sampler produces [3,2,1,3,2,1] we need [3,3,2,2,1,1]
+ labels = torch.transpose(
+ labels.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch), 0, 1
+ ).reshape(labels.shape)
+ inputs = torch.transpose(
+ inputs.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch, -1), 0, 1
+ ).reshape(inputs.shape)
+
+ # dispatch data to GPU
+ if use_cuda:
+ inputs = inputs.cuda(non_blocking=True)
+ labels = labels.cuda(non_blocking=True)
+
+ # forward pass model
+ outputs = model(inputs)
+
+ # loss computation
+ loss = criterion(
+ outputs.view(c.eval_num_classes_in_batch, outputs.shape[0] // c.eval_num_classes_in_batch, -1), labels
+ )
+
+ eval_loss += loss.item()
+
+ eval_avg_loss = eval_loss / len(data_loader)
+ # save stats
+ dashboard_logger.eval_stats(global_step, {"loss": eval_avg_loss})
+ # plot the last batch in the evaluation
+ figures = {
+ "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch),
+ }
+ dashboard_logger.eval_figures(global_step, figures)
+ return eval_avg_loss
+
+
+def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, global_step):
+    """Train for ``c.epochs`` epochs with logging, checkpoints and optional per-epoch eval; return ``(best_loss, global_step)``."""
+    model.train()
+    best_loss = float("inf")
+    avg_loader_time = 0
+    end_time = time.time()
+    for epoch in range(c.epochs):
+        tot_loss = 0
+        epoch_time = 0
+        for _, data in enumerate(data_loader):
+            start_time = time.time()
+            # setup input data
+            inputs, labels = data
+            # regroup samples of each class in the batch. perfect sampler produces [3,2,1,3,2,1] we need [3,3,2,2,1,1]
+            labels = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(
+                labels.shape
+            )
+            inputs = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(
+                inputs.shape
+            )
+            # ToDo: move it to a unit test
+            # labels_converted = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(labels.shape)
+            # inputs_converted = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(inputs.shape)
+            # idx = 0
+            # for j in range(0, c.num_classes_in_batch, 1):
+            # for i in range(j, len(labels), c.num_classes_in_batch):
+            # if not torch.all(labels[i].eq(labels_converted[idx])) or not torch.all(inputs[i].eq(inputs_converted[idx])):
+            # print("Invalid")
+            # print(labels)
+            # exit()
+            # idx += 1
+            # labels = labels_converted
+            # inputs = inputs_converted
+
+            # time elapsed since the previous step finished, i.e. the time spent waiting for this batch
+            loader_time = time.time() - end_time
+            global_step += 1
+
+            # setup lr
+            if c.lr_decay:
+                scheduler.step()
+            optimizer.zero_grad()
+
+            # dispatch data to GPU
+            if use_cuda:
+                inputs = inputs.cuda(non_blocking=True)
+                labels = labels.cuda(non_blocking=True)
+
+            # forward pass model
+            outputs = model(inputs)
+
+            # loss computation
+            loss = criterion(
+                outputs.view(c.num_classes_in_batch, outputs.shape[0] // c.num_classes_in_batch, -1), labels
+            )
+            loss.backward()
+            grad_norm, _ = check_update(model, c.grad_clip)
+            optimizer.step()
+
+            step_time = time.time() - start_time
+            epoch_time += step_time
+
+            # accumulate the total epoch loss
+            tot_loss += loss.item()
+
+            # Averaged Loader Time
+            num_loader_workers = c.num_loader_workers if c.num_loader_workers > 0 else 1
+            avg_loader_time = (
+                1 / num_loader_workers * loader_time + (num_loader_workers - 1) / num_loader_workers * avg_loader_time
+                if avg_loader_time != 0
+                else loader_time
+            )
+            current_lr = optimizer.param_groups[0]["lr"]
+
+            if global_step % c.steps_plot_stats == 0:
+                # Plot Training Epoch Stats
+                train_stats = {
+                    "loss": loss.item(),
+                    "lr": current_lr,
+                    "grad_norm": grad_norm,
+                    "step_time": step_time,
+                    "avg_loader_time": avg_loader_time,
+                }
+                dashboard_logger.train_epoch_stats(global_step, train_stats)
+                figures = {
+                    "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch),
+                }
+                dashboard_logger.train_figures(global_step, figures)
+
+            if global_step % c.print_step == 0:
+                print(
+                    " | > Step:{} Loss:{:.5f} GradNorm:{:.5f} "
+                    "StepTime:{:.2f} LoaderTime:{:.2f} AvGLoaderTime:{:.2f} LR:{:.6f}".format(
+                        global_step, loss.item(), grad_norm, step_time, loader_time, avg_loader_time, current_lr
+                    ),
+                    flush=True,
+                )
+
+            if global_step % c.save_step == 0:
+                # save model
+                save_checkpoint(model, optimizer, criterion, loss.item(), OUT_PATH, global_step, epoch)
+
+            end_time = time.time()
+
+        print("")
+        print(
+            ">>> Epoch:{} AvgLoss: {:.5f} GradNorm:{:.5f} "
+            "EpochTime:{:.2f} AvGLoaderTime:{:.2f} ".format(
+                epoch, tot_loss / len(data_loader), grad_norm, epoch_time, avg_loader_time
+            ),
+            flush=True,
+        )
+        # evaluation
+        if c.run_eval:
+            model.eval()
+            eval_loss = evaluation(model, criterion, eval_data_loader, global_step)
+            print("\n\n")
+            print("--> EVAL PERFORMANCE")
+            print(
+                " | > Epoch:{} AvgLoss: {:.5f} ".format(epoch, eval_loss),
+                flush=True,
+            )
+            # save the best checkpoint
+            best_loss = save_best_model(model, optimizer, criterion, eval_loss, best_loss, OUT_PATH, global_step, epoch)
+            model.train()
+
+    return best_loss, global_step
+
+
+def main(args):  # pylint: disable=redefined-outer-name
+    """Set up model, optimizer, data loaders, criterion and optional LR scheduler, then run ``train``."""
+    # pylint: disable=global-variable-undefined
+    global meta_data_train
+    global meta_data_eval
+    global train_classes
+
+    ap = AudioProcessor(**c.audio)
+    model = setup_encoder_model(c)
+    optimizer = get_optimizer(c.optimizer, c.optimizer_params, c.lr, model)
+
+    # pylint: disable=redefined-outer-name
+    meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=True)
+
+    # the train loader is built first: setup_loader reads the global train_classes when building the eval loader
+    train_data_loader, train_classes, map_classid_to_classname = setup_loader(ap, is_val=False, verbose=True)
+    if c.run_eval:
+        eval_data_loader, _, _ = setup_loader(ap, is_val=True, verbose=True)
+    else:
+        eval_data_loader = None
+
+    num_classes = len(train_classes)
+    criterion = model.get_criterion(c, num_classes)
+
+    if c.loss == "softmaxproto" and c.model != "speaker_encoder":
+        c.map_classid_to_classname = map_classid_to_classname
+        copy_model_files(c, OUT_PATH)
+
+    if args.restore_path:
+        criterion, args.restore_step = model.load_checkpoint(
+            c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion
+        )
+        print(" > Model restored from step %d" % args.restore_step, flush=True)
+    else:
+        args.restore_step = 0
+
+    if c.lr_decay:
+        scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps, last_epoch=args.restore_step - 1)
+    else:
+        scheduler = None
+    num_params = count_parameters(model)
+    print("\n > Model has {} parameters".format(num_params), flush=True)
+
+    if use_cuda:
+        model = model.cuda()
+        criterion.cuda()
+
+    global_step = args.restore_step
+    _, global_step = train(model, optimizer, scheduler, criterion, train_data_loader, eval_data_loader, global_step)
+
+
+if __name__ == "__main__":
+    args, c, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = init_training()
+    # On interruption or failure OUT_PATH is handed to remove_experiment_folder before the process exits.
+    try:
+        main(args)
+    except KeyboardInterrupt:
+        remove_experiment_folder(OUT_PATH)
+        try:
+            sys.exit(0)
+        except SystemExit:
+            os._exit(0)  # pylint: disable=protected-access
+    except Exception:  # pylint: disable=broad-except
+        remove_experiment_folder(OUT_PATH)
+        traceback.print_exc()
+        sys.exit(1)
diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py
new file mode 100644
index 0000000000000000000000000000000000000000..bdb4f6f69122a4a9aa4e07695f1816ce9727f323
--- /dev/null
+++ b/TTS/bin/train_tts.py
@@ -0,0 +1,71 @@
+import os
+from dataclasses import dataclass, field
+
+from trainer import Trainer, TrainerArgs
+
+from TTS.config import load_config, register_config
+from TTS.tts.datasets import load_tts_samples
+from TTS.tts.models import setup_model
+
+
+@dataclass
+class TrainTTSArgs(TrainerArgs):  # TrainerArgs extended with the path of the config file to load
+    config_path: str = field(default=None, metadata={"help": "Path to the config file."})
+
+
+def main():
+ """Run `tts` model training directly by a `config.json` file."""
+ # init trainer args
+ train_args = TrainTTSArgs()
+ parser = train_args.init_argparse(arg_prefix="")
+
+ # override trainer args from comman-line args
+ args, config_overrides = parser.parse_known_args()
+ train_args.parse_args(args)
+
+ # load config.json and register
+ if args.config_path or args.continue_path:
+ if args.config_path:
+ # init from a file
+ config = load_config(args.config_path)
+ if len(config_overrides) > 0:
+ config.parse_known_args(config_overrides, relaxed_parser=True)
+ elif args.continue_path:
+ # continue from a prev experiment
+ config = load_config(os.path.join(args.continue_path, "config.json"))
+ if len(config_overrides) > 0:
+ config.parse_known_args(config_overrides, relaxed_parser=True)
+ else:
+ # init from console args
+ from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel
+
+ config_base = BaseTrainingConfig()
+ config_base.parse_known_args(config_overrides)
+ config = register_config(config_base.model)()
+
+ # load training samples
+ train_samples, eval_samples = load_tts_samples(
+ config.datasets,
+ eval_split=True,
+ eval_split_max_size=config.eval_split_max_size,
+ eval_split_size=config.eval_split_size,
+ )
+
+ # init the model from config
+ model = setup_model(config, train_samples + eval_samples)
+
+ # init the trainer and 🚀
+ trainer = Trainer(
+ train_args,
+ model.config,
+ config.output_path,
+ model=model,
+ train_samples=train_samples,
+ eval_samples=eval_samples,
+ parse_command_line_args=False,
+ )
+ trainer.fit()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/TTS/bin/train_vocoder.py b/TTS/bin/train_vocoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..32ecd7bdc3652b3683be846bdd9518e937aee904
--- /dev/null
+++ b/TTS/bin/train_vocoder.py
@@ -0,0 +1,77 @@
+import os
+from dataclasses import dataclass, field
+
+from trainer import Trainer, TrainerArgs
+
+from TTS.config import load_config, register_config
+from TTS.utils.audio import AudioProcessor
+from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
+from TTS.vocoder.models import setup_model
+
+
+@dataclass
+class TrainVocoderArgs(TrainerArgs):  # TrainerArgs extended with the path of the config file to load
+    config_path: str = field(default=None, metadata={"help": "Path to the config file."})
+
+
+def main():
+ """Run `tts` model training directly by a `config.json` file."""
+ # init trainer args
+ train_args = TrainVocoderArgs()
+ parser = train_args.init_argparse(arg_prefix="")
+
+ # override trainer args from comman-line args
+ args, config_overrides = parser.parse_known_args()
+ train_args.parse_args(args)
+
+ # load config.json and register
+ if args.config_path or args.continue_path:
+ if args.config_path:
+ # init from a file
+ config = load_config(args.config_path)
+ if len(config_overrides) > 0:
+ config.parse_known_args(config_overrides, relaxed_parser=True)
+ elif args.continue_path:
+ # continue from a prev experiment
+ config = load_config(os.path.join(args.continue_path, "config.json"))
+ if len(config_overrides) > 0:
+ config.parse_known_args(config_overrides, relaxed_parser=True)
+ else:
+ # init from console args
+ from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel
+
+ config_base = BaseTrainingConfig()
+ config_base.parse_known_args(config_overrides)
+ config = register_config(config_base.model)()
+
+ # load training samples
+ if "feature_path" in config and config.feature_path:
+ # load pre-computed features
+ print(f" > Loading features from: {config.feature_path}")
+ eval_samples, train_samples = load_wav_feat_data(config.data_path, config.feature_path, config.eval_split_size)
+ else:
+ # load data raw wav files
+ eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
+
+ # setup audio processor
+ ap = AudioProcessor(**config.audio)
+
+ # init the model from config
+ model = setup_model(config)
+
+ # init the trainer and 🚀
+ trainer = Trainer(
+ train_args,
+ config,
+ config.output_path,
+ model=model,
+ train_samples=train_samples,
+ eval_samples=eval_samples,
+ training_assets={"audio_processor": ap},
+ parse_command_line_args=False,
+ )
+ trainer.fit()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/TTS/bin/tune_wavegrad.py b/TTS/bin/tune_wavegrad.py
new file mode 100644
index 0000000000000000000000000000000000000000..09582cea7c7962b098efcde5754a02573d18264a
--- /dev/null
+++ b/TTS/bin/tune_wavegrad.py
@@ -0,0 +1,103 @@
+"""Search a good noise schedule for WaveGrad for a given number of inference iterations"""
+import argparse
+from itertools import product as cartesian_product
+
+import numpy as np
+import torch
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+
+from TTS.config import load_config
+from TTS.utils.audio import AudioProcessor
+from TTS.vocoder.datasets.preprocess import load_wav_data
+from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset
+from TTS.vocoder.models import setup_model
+
+# Script body: try every combination of candidate scale factors for the noise schedule, run the
+# model on a few samples and remember the schedule with the lowest spectrogram error.
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ # NOTE(review): `--model_path` is parsed but `args.model_path` is never read below; the model is
+ # built from the config only. Confirm whether a checkpoint load is missing.
+ parser.add_argument("--model_path", type=str, help="Path to model checkpoint.")
+ parser.add_argument("--config_path", type=str, help="Path to model config file.")
+ parser.add_argument("--data_path", type=str, help="Path to data directory.")
+ parser.add_argument("--output_path", type=str, help="path for output file including file name and extension.")
+ parser.add_argument(
+ "--num_iter",
+ type=int,
+ help="Number of model inference iterations that you like to optimize noise schedule for.",
+ )
+ parser.add_argument("--use_cuda", action="store_true", help="enable CUDA.")
+ parser.add_argument("--num_samples", type=int, default=1, help="Number of datasamples used for inference.")
+ parser.add_argument(
+ "--search_depth",
+ type=int,
+ default=3,
+ help="Search granularity. Increasing this increases the run-time exponentially.",
+ )
+
+ # load config
+ args = parser.parse_args()
+ config = load_config(args.config_path)
+
+ # setup audio processor
+ ap = AudioProcessor(**config.audio)
+
+ # load dataset
+ # only the second return value is used, truncated to the first `num_samples` items
+ _, train_data = load_wav_data(args.data_path, 0)
+ train_data = train_data[: args.num_samples]
+ dataset = WaveGradDataset(
+ ap=ap,
+ items=train_data,
+ seq_len=-1,
+ hop_len=ap.hop_length,
+ pad_short=config.pad_short,
+ conv_pad=config.conv_pad,
+ is_training=True,
+ return_segments=False,
+ use_noise_augment=False,
+ use_cache=False,
+ verbose=True,
+ )
+ # one clip per batch, in dataset order
+ loader = DataLoader(
+ dataset,
+ batch_size=1,
+ shuffle=False,
+ collate_fn=dataset.collate_full_clips,
+ drop_last=False,
+ num_workers=config.num_loader_workers,
+ pin_memory=False,
+ )
+
+ # setup the model
+ model = setup_model(config)
+ if args.use_cuda:
+ model.cuda()
+
+ # setup optimization parameters
+ # `search_depth` candidate scale factors drawn uniformly from [0, 10); no seed is set here, so
+ # the candidates differ between runs
+ base_values = sorted(10 * np.random.uniform(size=args.search_depth))
+ print(f" > base values: {base_values}")
+ # log-spaced values from 1e-6 to 1e-1, one per inference iteration
+ exponents = 10 ** np.linspace(-6, -1, num=args.num_iter)
+ best_error = float("inf")
+ best_schedule = None # pylint: disable=C0103
+ # number of combinations visited by the cartesian product: search_depth ** num_iter
+ total_search_iter = len(base_values) ** args.num_iter
+ for base in tqdm(cartesian_product(base_values, repeat=args.num_iter), total=total_search_iter):
+ # element-wise product: one scale factor per inference iteration
+ beta = exponents * base
+ model.compute_noise_level(beta)
+ for data in loader:
+ mel, audio = data
+ y_hat = model.inference(mel.cuda() if args.use_cuda else mel)
+
+ # bring the prediction back to a CPU numpy array for the audio processor
+ if args.use_cuda:
+ y_hat = y_hat.cpu()
+ y_hat = y_hat.numpy()
+
+ # recompute the mel-spectrogram of every generated clip; the last frame is dropped,
+ # presumably to match the length of the target `mel` (TODO confirm)
+ mel_hat = []
+ for i in range(y_hat.shape[0]):
+ m = ap.melspectrogram(y_hat[i, 0])[:, :-1]
+ mel_hat.append(torch.from_numpy(m))
+
+ mel_hat = torch.stack(mel_hat)
+ # sum of squared differences; the trailing `.mean()` acts on an already scalar tensor
+ mse = torch.sum((mel - mel_hat) ** 2).mean()
+ if mse.item() < best_error:
+ best_error = mse.item()
+ best_schedule = {"beta": beta}
+ print(f" > Found a better schedule. - MSE: {mse.item()}")
+ # the schedule is a dict, which `np.save` stores as a pickled object array
+ np.save(args.output_path, best_schedule)
diff --git a/TTS/config/__init__.py b/TTS/config/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..49ac8f4ade11c14ec339c5bcf5a682aa560b942d
--- /dev/null
+++ b/TTS/config/__init__.py
@@ -0,0 +1,132 @@
+import json
+import os
+import re
+from typing import Dict
+
+import fsspec
+import yaml
+from coqpit import Coqpit
+
+from TTS.config.shared_configs import *
+from TTS.utils.generic_utils import find_module
+
+
+def read_json_with_comments(json_path):
+    """Parse a legacy JSON config file that may contain ``//`` comments.
+
+    Kept for backward compatibility with old config files.
+
+    Args:
+        json_path (str): path handed to ``fsspec.open``.
+
+    Returns:
+        The decoded JSON content.
+    """
+    with fsspec.open(json_path, "r", encoding="utf-8") as json_file:
+        raw_text = json_file.read()
+    # join backslash-continued lines first, then cut everything from `//` up to the end of the line
+    joined_text = re.sub(r"\\\n", "", raw_text)
+    uncommented_text = re.sub(r"//.*\n", "\n", joined_text)
+    return json.loads(uncommented_text)
+
+
+def register_config(model_name: str) -> Coqpit:
+    """Look up the config class that belongs to a model name.
+
+    The module ``<model_name>_config`` is searched in every known config package. All packages
+    are tried, so when several of them provide the module the last match is returned.
+
+    Args:
+        model_name (str): Model name.
+
+    Raises:
+        ModuleNotFoundError: No matching config for the model name.
+
+    Returns:
+        Coqpit: config class.
+    """
+    module_name = model_name + "_config"
+    found_class = None
+    for package in ("TTS.tts.configs", "TTS.vocoder.configs", "TTS.encoder.configs", "TTS.vc.configs"):
+        try:
+            found_class = find_module(package, module_name)
+        except ModuleNotFoundError:
+            # this package has no config for the model, move on to the next one
+            continue
+    if found_class is None:
+        raise ModuleNotFoundError(f" [!] Config for {model_name} cannot be found.")
+    return found_class
+
+
+def _process_model_name(config_dict: Dict) -> str:
+    """Return the model name of a config dict with the old vocoder suffixes removed.
+
+    The name is read from ``"model"`` when that key exists and from ``"generator_model"``
+    otherwise; ``"_generator"`` and ``"_discriminator"`` are then stripped from it.
+
+    Args:
+        config_dict (Dict): A dictionary including the config fields.
+
+    Returns:
+        str: Formatted model name.
+    """
+    if "model" in config_dict:
+        name = config_dict["model"]
+    else:
+        name = config_dict["generator_model"]
+    for legacy_suffix in ("_generator", "_discriminator"):
+        name = name.replace(legacy_suffix, "")
+    return name
+
+
+def load_config(config_path: str) -> Coqpit:
+    """Load a `json` or `yaml` file as a TTS config object.
+
+    The file is first read into a `dict`; its model name selects the matching Config class,
+    which is then instantiated and filled from the dict.
+
+    Args:
+        config_path (str): path to the config file.
+
+    Raises:
+        TypeError: given config file has an unknown type.
+
+    Returns:
+        Coqpit: TTS config object.
+    """
+    extension = os.path.splitext(config_path)[1]
+    if extension == ".json":
+        try:
+            with fsspec.open(config_path, "r", encoding="utf-8") as config_file:
+                loaded = json.load(config_file)
+        except json.decoder.JSONDecodeError:
+            # backwards compat.: retry with the parser that tolerates `//` comments
+            loaded = read_json_with_comments(config_path)
+    elif extension in (".yml", ".yaml"):
+        with fsspec.open(config_path, "r", encoding="utf-8") as config_file:
+            loaded = yaml.safe_load(config_file)
+    else:
+        raise TypeError(f" [!] Unknown config file type {extension}")
+
+    config_dict = {}
+    config_dict.update(loaded)
+    model_name = _process_model_name(config_dict)
+    config = register_config(model_name.lower())()
+    config.from_dict(config_dict)
+    return config
+
+
+def check_config_and_model_args(config, arg_name, value):
+    """Tell whether an argument equals `value`, looking in `config.model_args` first and then in `config`.
+
+    Returns False when the argument exists in neither place. This patches up the compatibility
+    between models with and without `model_args`.
+
+    TODO: Remove this in the future with a unified approach.
+    """
+    if hasattr(config, "model_args") and arg_name in config.model_args:
+        return config.model_args[arg_name] == value
+    return config[arg_name] == value if hasattr(config, arg_name) else False
+
+
+def get_from_config_or_model_args(config, arg_name):
+    """Return `arg_name` from `config.model_args` when it is defined there, otherwise from `config` itself."""
+    if hasattr(config, "model_args") and arg_name in config.model_args:
+        return config.model_args[arg_name]
+    return config[arg_name]
+
+
+def get_from_config_or_model_args_with_default(config, arg_name, def_val):
+    """Return `arg_name` from `config.model_args` if defined there, else from `config`, else `def_val`."""
+    if hasattr(config, "model_args") and arg_name in config.model_args:
+        return config.model_args[arg_name]
+    return config[arg_name] if hasattr(config, arg_name) else def_val
diff --git a/TTS/config/shared_configs.py b/TTS/config/shared_configs.py
new file mode 100644
index 0000000000000000000000000000000000000000..7fae77d61361eff8c8fa521a0f4a90dc46f63c75
--- /dev/null
+++ b/TTS/config/shared_configs.py
@@ -0,0 +1,268 @@
+from dataclasses import asdict, dataclass
+from typing import List
+
+from coqpit import Coqpit, check_argument
+from trainer import TrainerConfig
+
+
+@dataclass
+class BaseAudioConfig(Coqpit):
+ """Base config to define audio processing parameters. It is used to initialize
+ ```TTS.utils.audio.AudioProcessor.```
+
+ Args:
+ fft_size (int):
+ Number of STFT frequency levels, a.k.a. the size of the linear spectrogram frame. Defaults to 1024.
+
+ win_length (int):
+ Each frame of audio is windowed by window of length ```win_length``` and then padded with zeros to match
+ ```fft_size```. Defaults to 1024.
+
+ hop_length (int):
+ Number of audio samples between adjacent STFT columns. Defaults to 256.
+
+ frame_shift_ms (int):
+ Set ```hop_length``` based on milliseconds and sampling rate. Defaults to None.
+
+ frame_length_ms (int):
+ Set ```win_length``` based on milliseconds and sampling rate. Defaults to None.
+
+ stft_pad_mode (str):
+ Padding method used in STFT. 'reflect' or 'center'. Defaults to 'reflect'.
+
+ sample_rate (int):
+ Audio sampling rate. Defaults to 22050.
+
+ resample (bool):
+ Enable / Disable resampling audio to ```sample_rate```. Defaults to ```False```.
+
+ preemphasis (float):
+ Preemphasis coefficient. Defaults to 0.0.
+
+ ref_level_db (int):
+ Reference Db level to rebase the audio signal and ignore the level below. 20Db is assumed the sound of air.
+ Defaults to 20.
+
+ do_sound_norm (bool):
+ Enable / Disable sound normalization to reconcile the volume differences among samples. Defaults to False.
+
+ log_func (str):
+ Numpy log function used for amplitude to DB conversion. Defaults to 'np.log10'.
+
+ do_trim_silence (bool):
+ Enable / Disable trimming silences at the beginning and the end of the audio clip. Defaults to ```True```.
+
+ do_amp_to_db_linear (bool, optional):
+ enable/disable amplitude to dB conversion of linear spectrograms. Defaults to True.
+
+ do_amp_to_db_mel (bool, optional):
+ enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True.
+
+ pitch_fmax (float, optional):
+ Maximum frequency of the F0 frames. Defaults to ```640```.
+
+ pitch_fmin (float, optional):
+ Minimum frequency of the F0 frames. Defaults to ```1```.
+
+ trim_db (int):
+ Silence threshold used for silence trimming. Defaults to 45.
+
+ do_rms_norm (bool, optional):
+ enable/disable RMS volume normalization when loading an audio file. Defaults to False.
+
+ db_level (float, optional):
+ dB level used for rms normalization. The range is -99 to 0. Defaults to None.
+
+ power (float):
+ Exponent used for expanding spectrogram levels before running Griffin-Lim. It helps to reduce the
+ artifacts in the synthesized voice. Defaults to 1.5.
+
+ griffin_lim_iters (int):
+ Number of Griffin-Lim iterations. Defaults to 60.
+
+ num_mels (int):
+ Size of the mel dimension of each mel-spectrogram frame. Defaults to 80.
+
+ mel_fmin (float): Min frequency level used for the mel-basis filters. ~50 for male and ~95 for female voices.
+ It needs to be adjusted for a dataset. Defaults to 0.
+
+ mel_fmax (float):
+ Max frequency level used for the mel-basis filters. It needs to be adjusted for a dataset.
+ Defaults to None.
+
+ spec_gain (int):
+ Gain applied when converting amplitude to DB. Defaults to 20.
+
+ signal_norm (bool):
+ enable/disable signal normalization. Defaults to True.
+
+ min_level_db (int):
+ minimum db threshold for the computed melspectrograms. Defaults to -100.
+
+ symmetric_norm (bool):
+ enable/disable symmetric normalization. If set True normalization is performed in the range [-k, k] else
+ [0, k], Defaults to True.
+
+ max_norm (float):
+ ```k``` defining the normalization range. Defaults to 4.0.
+
+ clip_norm (bool):
+ enable/disable clipping the out of range values in the normalized audio signal. Defaults to True.
+
+ stats_path (str):
+ Path to the computed stats file. Defaults to None.
+ """
+
+ # stft parameters
+ fft_size: int = 1024
+ win_length: int = 1024
+ hop_length: int = 256
+ frame_shift_ms: int = None
+ frame_length_ms: int = None
+ stft_pad_mode: str = "reflect"
+ # audio processing parameters
+ sample_rate: int = 22050
+ resample: bool = False
+ preemphasis: float = 0.0
+ ref_level_db: int = 20
+ do_sound_norm: bool = False
+ log_func: str = "np.log10"
+ # silence trimming
+ do_trim_silence: bool = True
+ trim_db: int = 45
+ # rms volume normalization
+ do_rms_norm: bool = False
+ db_level: float = None
+ # griffin-lim params
+ power: float = 1.5
+ griffin_lim_iters: int = 60
+ # mel-spec params
+ num_mels: int = 80
+ mel_fmin: float = 0.0
+ mel_fmax: float = None
+ spec_gain: int = 20
+ do_amp_to_db_linear: bool = True
+ do_amp_to_db_mel: bool = True
+ # f0 params
+ pitch_fmax: float = 640.0
+ pitch_fmin: float = 1.0
+ # normalization params
+ signal_norm: bool = True
+ min_level_db: int = -100
+ symmetric_norm: bool = True
+ max_norm: float = 4.0
+ clip_norm: bool = True
+ stats_path: str = None
+
+ def check_values(
+ self,
+ ):
+ """Validate the config fields with coqpit's ``check_argument``."""
+ c = asdict(self)
+ check_argument("num_mels", c, restricted=True, min_val=10, max_val=2056)
+ # NOTE(review): the upper bound 4058 looks like a typo for 4096 - confirm before changing
+ check_argument("fft_size", c, restricted=True, min_val=128, max_val=4058)
+ check_argument("sample_rate", c, restricted=True, min_val=512, max_val=100000)
+ # the millisecond based settings name the sample based field as their alternative
+ check_argument(
+ "frame_length_ms",
+ c,
+ restricted=True,
+ min_val=10,
+ max_val=1000,
+ alternative="win_length",
+ )
+ check_argument("frame_shift_ms", c, restricted=True, min_val=1, max_val=1000, alternative="hop_length")
+ check_argument("preemphasis", c, restricted=True, min_val=0, max_val=1)
+ check_argument("min_level_db", c, restricted=True, min_val=-1000, max_val=10)
+ check_argument("ref_level_db", c, restricted=True, min_val=0, max_val=1000)
+ check_argument("power", c, restricted=True, min_val=1, max_val=5)
+ check_argument("griffin_lim_iters", c, restricted=True, min_val=10, max_val=1000)
+
+ # normalization parameters
+ check_argument("signal_norm", c, restricted=True)
+ check_argument("symmetric_norm", c, restricted=True)
+ check_argument("max_norm", c, restricted=True, min_val=0.1, max_val=1000)
+ check_argument("clip_norm", c, restricted=True)
+ check_argument("mel_fmin", c, restricted=True, min_val=0.0, max_val=1000)
+ check_argument("mel_fmax", c, restricted=True, min_val=500.0, allow_none=True)
+ check_argument("spec_gain", c, restricted=True, min_val=1, max_val=100)
+ check_argument("do_trim_silence", c, restricted=True)
+ check_argument("trim_db", c, restricted=True)
+
+
+@dataclass
+class BaseDatasetConfig(Coqpit):
+ """Base config for TTS datasets.
+
+ Args:
+ formatter (str):
+ Formatter name that defines used formatter in ```TTS.tts.datasets.formatter```. Defaults to `""`.
+
+ dataset_name (str):
+ Unique name for the dataset. Defaults to `""`.
+
+ path (str):
+ Root path to the dataset files. Defaults to `""`.
+
+ meta_file_train (str):
+ Name of the dataset meta file. Or a list of speakers to be ignored at training for multi-speaker datasets.
+ Defaults to `""`.
+
+ ignored_speakers (List):
+ List of speakers IDs that are not used at the training. Default None.
+
+ language (str):
+ Language code of the dataset. If defined, it overrides `phoneme_language`. Defaults to `""`.
+
+ phonemizer (str):
+ Phonemizer used for that dataset's language. By default it uses `DEF_LANG_TO_PHONEMIZER`. Defaults to `""`.
+
+ meta_file_val (str):
+ Name of the dataset meta file that defines the instances used at validation. Defaults to `""`.
+
+ meta_file_attn_mask (str):
+ Path to the file that lists the attention mask files used with models that require attention masks to
+ train the duration predictor. Defaults to `""`.
+ """
+
+ formatter: str = ""
+ dataset_name: str = ""
+ path: str = ""
+ meta_file_train: str = ""
+ ignored_speakers: List[str] = None
+ language: str = ""
+ phonemizer: str = ""
+ meta_file_val: str = ""
+ meta_file_attn_mask: str = ""
+
+ def check_values(
+ self,
+ ):
+ """Validate the dataset fields with coqpit's ``check_argument``."""
+ c = asdict(self)
+ # NOTE(review): presumably `restricted=True` marks a field as mandatory while the two
+ # `restricted=False` fields are optional - confirm against coqpit's `check_argument`
+ check_argument("formatter", c, restricted=True)
+ check_argument("path", c, restricted=True)
+ check_argument("meta_file_train", c, restricted=True)
+ check_argument("meta_file_val", c, restricted=False)
+ check_argument("meta_file_attn_mask", c, restricted=False)
+
+
+@dataclass
+class BaseTrainingConfig(TrainerConfig):
+ """Base config to define the basic 🐸TTS training parameters that are shared
+ among all the models. It extends ```trainer.TrainerConfig```.
+
+ Args:
+ model (str):
+ Name of the model that is used in the training. Defaults to None.
+
+ num_loader_workers (int):
+ Number of workers for training time dataloader. Defaults to 0.
+
+ num_eval_loader_workers (int):
+ Number of workers for evaluation time dataloader. Defaults to 0.
+
+ use_noise_augment (bool):
+ Presumably enables noise augmentation of the training audio (confirm with the dataset code).
+ Defaults to False.
+ """
+
+ model: str = None
+ # dataloading
+ num_loader_workers: int = 0
+ num_eval_loader_workers: int = 0
+ use_noise_augment: bool = False
diff --git a/TTS/encoder/README.md b/TTS/encoder/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b38b20052b707b0358068bc0ce58bc300a149def
--- /dev/null
+++ b/TTS/encoder/README.md
@@ -0,0 +1,18 @@
+### Speaker Encoder
+
+This is an implementation of https://arxiv.org/abs/1710.10467. This model can be used for voice and speaker embedding.
+
+With the code here you can generate d-vectors for both multi-speaker and single-speaker TTS datasets, then visualise and explore them along with the associated audio files in an interactive chart.
+
+Below is an example showing embedding results of various speakers. You can generate the same plot with the provided notebook as demonstrated in [this video](https://youtu.be/KW3oO7JVa7Q).
+
+![](umap.png)
+
+Download a pretrained model from [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page.
+
+To run the code, you need to follow the same flow as in TTS.
+
+- Define 'config.json' for your needs. Note that the audio parameters should match your TTS model.
+- Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360```
+- Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files.
+- Watch training on TensorBoard as in TTS.
diff --git a/TTS/encoder/__init__.py b/TTS/encoder/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/TTS/encoder/configs/base_encoder_config.py b/TTS/encoder/configs/base_encoder_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..ebbaa0457bb55aef70d54dd36fd9b2b7f7c702bb
--- /dev/null
+++ b/TTS/encoder/configs/base_encoder_config.py
@@ -0,0 +1,61 @@
+from dataclasses import asdict, dataclass, field
+from typing import Dict, List
+
+from coqpit import MISSING
+
+from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig
+
+
+@dataclass
+class BaseEncoderConfig(BaseTrainingConfig):
+ """Defines parameters for a Generic Encoder model.
+
+ Holds the audio and dataset configs, the model hyper-parameters (`model_params`), the
+ optimization and logging settings and the batch composition used by the data loader.
+ """
+
+ model: str = None
+ audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
+ datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
+ # model params
+ # `input_dim` must match `audio.num_mels`, see `check_values`
+ model_params: Dict = field(
+ default_factory=lambda: {
+ "model_name": "lstm",
+ "input_dim": 80,
+ "proj_dim": 256,
+ "lstm_dim": 768,
+ "num_lstm_layers": 3,
+ "use_lstm_with_projection": True,
+ }
+ )
+
+ # empty by default, i.e. no augmentation settings
+ audio_augmentation: Dict = field(default_factory=lambda: {})
+
+ # training params
+ epochs: int = 10000
+ loss: str = "angleproto"
+ grad_clip: float = 3.0
+ lr: float = 0.0001
+ optimizer: str = "radam"
+ optimizer_params: Dict = field(default_factory=lambda: {"betas": [0.9, 0.999], "weight_decay": 0})
+ lr_decay: bool = False
+ warmup_steps: int = 4000
+
+ # logging params
+ tb_model_param_stats: bool = False
+ steps_plot_stats: int = 10
+ save_step: int = 1000
+ print_step: int = 20
+ run_eval: bool = False
+
+ # data loader
+ # NOTE(review): `MISSING` fields have no default here - presumably they must be set explicitly
+ # by the user; confirm with coqpit
+ num_classes_in_batch: int = MISSING
+ num_utter_per_class: int = MISSING
+ eval_num_classes_in_batch: int = None
+ eval_num_utter_per_class: int = None
+
+ num_loader_workers: int = MISSING
+ # length of the voice segment, presumably in seconds (confirm with the dataset code)
+ voice_len: float = 1.6
+
+ def check_values(self):
+ """Run the parent checks and verify that the model input size equals the number of mel bins."""
+ super().check_values()
+ c = asdict(self)
+ assert (
+ c["model_params"]["input_dim"] == self.audio.num_mels
+ ), " [!] model input dimendion must be equal to melspectrogram dimension."
diff --git a/TTS/encoder/configs/emotion_encoder_config.py b/TTS/encoder/configs/emotion_encoder_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..5eda2671be980abce4a0506a075387b601a1596c
--- /dev/null
+++ b/TTS/encoder/configs/emotion_encoder_config.py
@@ -0,0 +1,12 @@
+from dataclasses import asdict, dataclass
+
+from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig
+
+
+@dataclass
+class EmotionEncoderConfig(BaseEncoderConfig):
+ """Defines parameters for Emotion Encoder model.
+
+ Overrides the model name and sets `class_name_key` to `"emotion_name"`.
+ """
+
+ model: str = "emotion_encoder"
+ # optional mapping from class id to class name; None by default
+ map_classid_to_classname: dict = None
+ # name of the key that holds the class label
+ class_name_key: str = "emotion_name"
diff --git a/TTS/encoder/configs/speaker_encoder_config.py b/TTS/encoder/configs/speaker_encoder_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..6dceb00277ba68efe128936ff7f9456338f9753f
--- /dev/null
+++ b/TTS/encoder/configs/speaker_encoder_config.py
@@ -0,0 +1,11 @@
+from dataclasses import asdict, dataclass
+
+from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig
+
+
+@dataclass
+class SpeakerEncoderConfig(BaseEncoderConfig):
+ """Defines parameters for Speaker Encoder model.
+
+ Overrides the model name and sets `class_name_key` to `"speaker_name"`.
+ """
+
+ model: str = "speaker_encoder"
+ # name of the key that holds the class label
+ class_name_key: str = "speaker_name"
diff --git a/TTS/encoder/dataset.py b/TTS/encoder/dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..582b1fe9ca35cb9afbc20b8f72b6173282201272
--- /dev/null
+++ b/TTS/encoder/dataset.py
@@ -0,0 +1,147 @@
+import random
+
+import torch
+from torch.utils.data import Dataset
+
+from TTS.encoder.utils.generic_utils import AugmentWAV
+
+
+class EncoderDataset(Dataset):
+    """Dataset of audio clips grouped by class label, used to train encoder models.
+
+    Items are plain dicts with the clip path and class name; the audio is loaded, cropped to a
+    random fixed-length segment and converted to features in `collate_fn`.
+    """
+
+    def __init__(
+        self,
+        config,
+        ap,
+        meta_data,
+        voice_len=1.6,
+        num_classes_in_batch=64,
+        num_utter_per_class=10,
+        verbose=False,
+        augmentation_config=None,
+        use_torch_spec=None,
+    ):
+        """
+        Args:
+            config: encoder config; `config.class_name_key` names the item field that holds the class label.
+            ap (TTS.tts.utils.AudioProcessor): audio processor object.
+            meta_data (list): list of dataset instances (dicts with `"audio_file"` and the class label field).
+            voice_len (float): voice segment length in seconds.
+            num_classes_in_batch (int): only reported in the verbose summary printed by this constructor.
+            num_utter_per_class (int): classes with fewer utterances than this are dropped.
+            verbose (bool): print diagnostic information.
+            augmentation_config (dict): optional augmentation settings; `"p"` is the probability of
+                augmenting a clip, `"additive"` / `"rir"` enable `AugmentWAV`, `"gaussian"` is stored as is.
+            use_torch_spec (bool): if truthy, `collate_fn` returns raw waveform segments instead of
+                mel-spectrograms.
+        """
+        super().__init__()
+        self.config = config
+        self.items = meta_data
+        self.sample_rate = ap.sample_rate
+        self.seq_len = int(voice_len * self.sample_rate)
+        self.num_utter_per_class = num_utter_per_class
+        self.ap = ap
+        self.verbose = verbose
+        self.use_torch_spec = use_torch_spec
+        self.classes, self.items = self.__parse_items()
+
+        self.classname_to_classid = {key: i for i, key in enumerate(self.classes)}
+
+        # Data Augmentation
+        self.augmentator = None
+        self.gaussian_augmentation_config = None
+        # always defined so the attribute can be read even without an augmentation config
+        self.data_augmentation_p = None
+        if augmentation_config:
+            self.data_augmentation_p = augmentation_config["p"]
+            if self.data_augmentation_p and ("additive" in augmentation_config or "rir" in augmentation_config):
+                self.augmentator = AugmentWAV(ap, augmentation_config)
+
+            if "gaussian" in augmentation_config:
+                self.gaussian_augmentation_config = augmentation_config["gaussian"]
+
+        if self.verbose:
+            print("\n > DataLoader initialization")
+            print(f" | > Classes per Batch: {num_classes_in_batch}")
+            print(f" | > Number of instances : {len(self.items)}")
+            print(f" | > Sequence length: {self.seq_len}")
+            print(f" | > Num Classes: {len(self.classes)}")
+            print(f" | > Classes: {self.classes}")
+
+    def load_wav(self, filename):
+        """Load an audio file with the audio processor at its sample rate."""
+        audio = self.ap.load_wav(filename, sr=self.ap.sample_rate)
+        return audio
+
+    def __parse_items(self):
+        """Filter the raw items and return `(sorted class names, kept items)`.
+
+        Classes with fewer than `num_utter_per_class` clips and clips that are not longer than
+        the segment length are dropped. Every clip of a kept class is loaded once to measure it.
+        """
+        # the same label field is used for grouping and for the kept items
+        class_key = self.config.class_name_key
+
+        class_to_utters = {}
+        for item in self.items:
+            class_to_utters.setdefault(item[class_key], []).append(item["audio_file"])
+
+        # keep only the classes with at least `self.num_utter_per_class` samples
+        class_to_utters = {k: v for (k, v) in class_to_utters.items() if len(v) >= self.num_utter_per_class}
+
+        classes = sorted(class_to_utters)
+        kept_classes = set(classes)
+
+        new_items = []
+        for item in self.items:
+            path_ = item["audio_file"]
+            class_name = item[class_key]
+            # ignore filtered classes
+            if class_name not in kept_classes:
+                continue
+            # ignore small audios
+            if self.load_wav(path_).shape[0] - self.seq_len <= 0:
+                continue
+
+            new_items.append({"wav_file_path": path_, "class_name": class_name})
+
+        return classes, new_items
+
+    def __len__(self):
+        return len(self.items)
+
+    def get_num_classes(self):
+        """Return the number of classes kept in the dataset."""
+        return len(self.classes)
+
+    def get_class_list(self):
+        """Return the list of class names."""
+        return self.classes
+
+    def set_classes(self, classes):
+        """Replace the class list and rebuild the class-name to class-id mapping."""
+        self.classes = classes
+        self.classname_to_classid = {key: i for i, key in enumerate(self.classes)}
+
+    def get_map_classid_to_classname(self):
+        """Return the inverse mapping, class id to class name."""
+        return {c_id: c_n for c_n, c_id in self.classname_to_classid.items()}
+
+    def __getitem__(self, idx):
+        return self.items[idx]
+
+    def collate_fn(self, batch):
+        """Turn a list of items into `(features, labels)` tensors.
+
+        Each clip is loaded, cropped to a random segment of `seq_len` samples and optionally
+        augmented. Features are mel-spectrograms unless `use_torch_spec` is set, in which case
+        the raw waveform segments are returned.
+        """
+        # get the batch class_ids
+        labels = []
+        feats = []
+        for item in batch:
+            utter_path = item["wav_file_path"]
+            class_name = item["class_name"]
+
+            # get classid
+            class_id = self.classname_to_classid[class_name]
+            # load wav file
+            wav = self.load_wav(utter_path)
+            # random crop of exactly `seq_len` samples
+            offset = random.randint(0, wav.shape[0] - self.seq_len)
+            wav = wav[offset : offset + self.seq_len]
+
+            if self.augmentator is not None and self.data_augmentation_p:
+                if random.random() < self.data_augmentation_p:
+                    wav = self.augmentator.apply_one(wav)
+
+            if not self.use_torch_spec:
+                mel = self.ap.melspectrogram(wav)
+                feats.append(torch.FloatTensor(mel))
+            else:
+                feats.append(torch.FloatTensor(wav))
+
+            labels.append(class_id)
+
+        feats = torch.stack(feats)
+        labels = torch.LongTensor(labels)
+
+        return feats, labels
diff --git a/TTS/encoder/losses.py b/TTS/encoder/losses.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b5aa0fc48fe00aeedeff28ba48ed2af498ce582
--- /dev/null
+++ b/TTS/encoder/losses.py
@@ -0,0 +1,226 @@
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+
+# adapted from https://github.com/cvqluu/GE2E-Loss
+class GE2ELoss(nn.Module):
+    def __init__(self, init_w=10.0, init_b=-5.0, loss_method="softmax"):
+        """
+        Implementation of the Generalized End-to-End loss defined in https://arxiv.org/abs/1710.10467 [1]
+        Accepts an input of size (N, M, D)
+            where N is the number of speakers in the batch,
+            M is the number of utterances per speaker,
+            and D is the dimensionality of the embedding vector (e.g. d-vector)
+        Args:
+            - init_w (float): defines the initial value of w in Equation (5) of [1]
+            - init_b (float): defines the initial value of b in Equation (5) of [1]
+            - loss_method (str): "softmax" or "contrast", selects the per-embedding loss
+        """
+        super().__init__()
+        # pylint: disable=E1102
+        self.w = nn.Parameter(torch.tensor(init_w))
+        # pylint: disable=E1102
+        self.b = nn.Parameter(torch.tensor(init_b))
+        self.loss_method = loss_method
+
+        print(" > Initialized Generalized End-to-End loss")
+
+        assert self.loss_method in ["softmax", "contrast"]
+
+        if self.loss_method == "softmax":
+            self.embed_loss = self.embed_loss_softmax
+        if self.loss_method == "contrast":
+            self.embed_loss = self.embed_loss_contrast
+
+    # pylint: disable=R0201
+    def calc_new_centroids(self, dvecs, centroids, spkr, utt):
+        """
+        Calculates the new centroids excluding the reference utterance
+        """
+        # mean of the speaker's embeddings with utterance `utt` left out
+        excl = torch.cat((dvecs[spkr, :utt], dvecs[spkr, utt + 1 :]))
+        excl = torch.mean(excl, 0)
+        new_centroids = []
+        for i, centroid in enumerate(centroids):
+            if i == spkr:
+                new_centroids.append(excl)
+            else:
+                new_centroids.append(centroid)
+        return torch.stack(new_centroids)
+
+    def calc_cosine_sim(self, dvecs, centroids):
+        """
+        Make the cosine similarity matrix with dims (N,M,N)
+        """
+        cos_sim_matrix = []
+        for spkr_idx, speaker in enumerate(dvecs):
+            cs_row = []
+            for utt_idx, utterance in enumerate(speaker):
+                new_centroids = self.calc_new_centroids(dvecs, centroids, spkr_idx, utt_idx)
+                # vector based cosine similarity for speed; values below 1e-6 are clamped up
+                cs_row.append(
+                    torch.clamp(
+                        torch.mm(
+                            utterance.unsqueeze(1).transpose(0, 1),
+                            new_centroids.transpose(0, 1),
+                        )
+                        / (torch.norm(utterance) * torch.norm(new_centroids, dim=1)),
+                        1e-6,
+                    )
+                )
+            cs_row = torch.cat(cs_row, dim=0)
+            cos_sim_matrix.append(cs_row)
+        return torch.stack(cos_sim_matrix)
+
+    # pylint: disable=R0201
+    def embed_loss_softmax(self, dvecs, cos_sim_matrix):
+        """
+        Calculates the loss on each embedding $L(e_{ji})$ by taking softmax
+        """
+        N, M, _ = dvecs.shape
+        L = []
+        for j in range(N):
+            L_row = []
+            for i in range(M):
+                L_row.append(-F.log_softmax(cos_sim_matrix[j, i], 0)[j])
+            L_row = torch.stack(L_row)
+            L.append(L_row)
+        return torch.stack(L)
+
+    # pylint: disable=R0201
+    def embed_loss_contrast(self, dvecs, cos_sim_matrix):
+        """
+        Calculates the loss on each embedding $L(e_{ji})$ by contrast loss with closest centroid
+        """
+        N, M, _ = dvecs.shape
+        L = []
+        for j in range(N):
+            L_row = []
+            for i in range(M):
+                centroids_sigmoids = torch.sigmoid(cos_sim_matrix[j, i])
+                # similarities to all centroids except the utterance's own speaker
+                excl_centroids_sigmoids = torch.cat((centroids_sigmoids[:j], centroids_sigmoids[j + 1 :]))
+                L_row.append(1.0 - torch.sigmoid(cos_sim_matrix[j, i, j]) + torch.max(excl_centroids_sigmoids))
+            L_row = torch.stack(L_row)
+            L.append(L_row)
+        return torch.stack(L)
+
+    def forward(self, x, _label=None):
+        """
+        Calculates the GE2E loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
+
+        Returns the mean of the per-embedding losses as a scalar tensor. `_label` is ignored.
+        """
+
+        assert x.size()[1] >= 2
+
+        centroids = torch.mean(x, 1)
+        cos_sim_matrix = self.calc_cosine_sim(x, centroids)
+        # keep the learnable scale positive: `torch.clamp` is out-of-place, so the parameter
+        # has to be clamped in place for the constraint to take effect
+        with torch.no_grad():
+            self.w.clamp_(min=1e-6)
+        cos_sim_matrix = self.w * cos_sim_matrix + self.b
+        L = self.embed_loss(x, cos_sim_matrix)
+        return L.mean()
+
+
+# adapted from https://github.com/clovaai/voxceleb_trainer/blob/master/loss/angleproto.py
+class AngleProtoLoss(nn.Module):
+    """
+    Implementation of the Angular Prototypical loss defined in https://arxiv.org/abs/2003.11982
+        Accepts an input of size (N, M, D)
+            where N is the number of speakers in the batch,
+            M is the number of utterances per speaker,
+            and D is the dimensionality of the embedding vector
+        Args:
+            - init_w (float): defines the initial value of w
+            - init_b (float): defines the initial value of b
+    """
+
+    def __init__(self, init_w=10.0, init_b=-5.0):
+        super().__init__()
+        # pylint: disable=E1102
+        self.w = nn.Parameter(torch.tensor(init_w))
+        # pylint: disable=E1102
+        self.b = nn.Parameter(torch.tensor(init_b))
+        self.criterion = torch.nn.CrossEntropyLoss()
+
+        print(" > Initialized Angular Prototypical loss")
+
+    def forward(self, x, _label=None):
+        """
+        Calculates the AngleProto loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
+
+        The first utterance of every speaker is compared with the mean of that speaker's
+        remaining utterances. `_label` is ignored.
+        """
+
+        assert x.size()[1] >= 2
+
+        out_anchor = torch.mean(x[:, 1:, :], 1)
+        out_positive = x[:, 0, :]
+        num_speakers = out_anchor.size()[0]
+
+        # pairwise cosine similarity between every positive and every anchor: (N, N)
+        cos_sim_matrix = F.cosine_similarity(
+            out_positive.unsqueeze(-1).expand(-1, -1, num_speakers),
+            out_anchor.unsqueeze(-1).expand(-1, -1, num_speakers).transpose(0, 2),
+        )
+        # keep the learnable scale positive: `torch.clamp` is out-of-place, so the parameter
+        # has to be clamped in place for the constraint to take effect
+        with torch.no_grad():
+            self.w.clamp_(min=1e-6)
+        cos_sim_matrix = cos_sim_matrix * self.w + self.b
+        # the matching speaker sits on the diagonal
+        label = torch.arange(num_speakers).to(cos_sim_matrix.device)
+        L = self.criterion(cos_sim_matrix, label)
+        return L
+
+
+class SoftmaxLoss(nn.Module):
+    """
+    Softmax classification loss as defined in https://arxiv.org/abs/2003.11982
+        Args:
+            - embedding_dim (float): speaker embedding dim
+            - n_speakers (float): number of speakers
+    """
+
+    def __init__(self, embedding_dim, n_speakers):
+        super().__init__()
+
+        self.criterion = torch.nn.CrossEntropyLoss()
+        self.fc = nn.Linear(embedding_dim, n_speakers)
+
+        print("Initialised Softmax Loss")
+
+    def forward(self, x, label=None):
+        """Project the embeddings to class logits and return the cross-entropy against `label`."""
+        # flatten all leading dims so that every embedding is one row, same for the labels
+        flat_embeddings = x.reshape(-1, x.size()[-1])
+        flat_labels = label.reshape(-1)
+        logits = self.fc(flat_embeddings)
+        return self.criterion(logits, flat_labels)
+
+    def inference(self, embedding):
+        """Return the index of the most probable class for `embedding`."""
+        class_probs = torch.nn.functional.softmax(self.fc(embedding), dim=1).squeeze(0)
+        return torch.argmax(class_probs)
+
+
+class SoftmaxAngleProtoLoss(nn.Module):
+    """
+    Sum of the Softmax and Angular Prototypical losses as defined in https://arxiv.org/abs/2009.14153
+        Args:
+            - embedding_dim (float): speaker embedding dim
+            - n_speakers (float): number of speakers
+            - init_w (float): defines the initial value of w
+            - init_b (float): defines the initial value of b
+    """
+
+    def __init__(self, embedding_dim, n_speakers, init_w=10.0, init_b=-5.0):
+        super().__init__()
+
+        self.softmax = SoftmaxLoss(embedding_dim, n_speakers)
+        self.angleproto = AngleProtoLoss(init_w, init_b)
+
+        print("Initialised SoftmaxAnglePrototypical Loss")
+
+    def forward(self, x, label=None):
+        """
+        Calculates the SoftmaxAnglePrototypical loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
+        """
+        # the prototypical term needs no labels, the softmax term does
+        proto_term = self.angleproto(x)
+        softmax_term = self.softmax(x, label)
+        return softmax_term + proto_term
diff --git a/TTS/encoder/models/base_encoder.py b/TTS/encoder/models/base_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..957ea3c4ca719c2a054c93382787909e418288b2
--- /dev/null
+++ b/TTS/encoder/models/base_encoder.py
@@ -0,0 +1,161 @@
+import numpy as np
+import torch
+import torchaudio
+from coqpit import Coqpit
+from torch import nn
+
+from TTS.encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss
+from TTS.utils.generic_utils import set_init_dict
+from TTS.utils.io import load_fsspec
+
+
+class PreEmphasis(nn.Module):
+    """Pre-emphasis filter computing `y[t] = x[t] - coefficient * x[t - 1]`.
+
+    The two-tap kernel is stored as a buffer named "filter" so that it follows
+    the module across devices and is saved with its state.
+    """
+
+    def __init__(self, coefficient=0.97):
+        super().__init__()
+        self.coefficient = coefficient
+        taps = torch.FloatTensor([-self.coefficient, 1.0])
+        # conv1d expects a kernel shaped (out_channels, in_channels, width).
+        self.register_buffer("filter", taps.reshape(1, 1, -1))
+
+    def forward(self, x):
+        """Filter a 2-D input; the output has the same shape as the input."""
+        assert x.dim() == 2
+
+        # One reflected sample on the left keeps the output length unchanged.
+        padded = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), "reflect")
+        filtered = torch.nn.functional.conv1d(padded, self.filter)
+        return filtered.squeeze(1)
+
+
+class BaseEncoder(nn.Module):
+    """Base `encoder` class. Every new `encoder` model must inherit this.
+
+    It defines common `encoder` specific functions: the on-the-fly
+    mel-spectrogram front end, windowed embedding extraction, loss selection
+    and checkpoint loading. `forward` is not defined here and must come from
+    the subclass; `compute_embedding` additionally reads `self.use_torch_spec`
+    and `self.audio_config`, which the subclass is expected to set.
+    """
+
+    # pylint: disable=W0102
+    def __init__(self):
+        super(BaseEncoder, self).__init__()
+
+    def get_torch_mel_spectrogram_class(self, audio_config):
+        """Build the waveform -> mel-spectrogram front end.
+
+        Args:
+            audio_config: mapping read for the keys "preemphasis",
+                "sample_rate", "fft_size", "win_length", "hop_length" and
+                "num_mels".
+
+        Returns:
+            torch.nn.Sequential: pre-emphasis followed by a torchaudio
+            `MelSpectrogram` using a Hamming window.
+        """
+        return torch.nn.Sequential(
+            PreEmphasis(audio_config["preemphasis"]),
+            torchaudio.transforms.MelSpectrogram(
+                sample_rate=audio_config["sample_rate"],
+                n_fft=audio_config["fft_size"],
+                win_length=audio_config["win_length"],
+                hop_length=audio_config["hop_length"],
+                window_fn=torch.hamming_window,
+                n_mels=audio_config["num_mels"],
+            ),
+        )
+
+    @torch.no_grad()
+    def inference(self, x, l2_norm=True):
+        """Run `forward` with gradient tracking disabled."""
+        return self.forward(x, l2_norm)
+
+    @torch.no_grad()
+    def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True, l2_norm=True):
+        """Embed `num_eval` evenly spaced windows of `x` and optionally average them.
+
+        Args:
+            x: input sliced along dimension 1 (documented upstream as 1xTxD).
+            num_frames (int): window length. When `self.use_torch_spec` is set it
+                is multiplied by the hop length to map it to waveform samples.
+                It is clipped to the input length.
+            num_eval (int): number of windows taken between the start and the
+                last valid offset.
+            return_mean (bool): if True, average the window embeddings into a
+                single row.
+            l2_norm (bool): forwarded to `inference`.
+
+        Returns:
+            The window embeddings, or their mean with the first dimension kept.
+        """
+        # map to the waveform size
+        if self.use_torch_spec:
+            num_frames = num_frames * self.audio_config["hop_length"]
+
+        max_len = x.shape[1]
+        num_frames = min(num_frames, max_len)
+
+        windows = []
+        for offset in np.linspace(0, max_len - num_frames, num=num_eval):
+            start = int(offset)
+            windows.append(x[:, start : int(start + num_frames)])
+
+        # Windows are stacked along the batch dimension and embedded in one pass.
+        embeddings = self.inference(torch.cat(windows, dim=0), l2_norm=l2_norm)
+
+        if return_mean:
+            embeddings = torch.mean(embeddings, dim=0, keepdim=True)
+        return embeddings
+
+    def get_criterion(self, c: Coqpit, num_classes=None):
+        """Instantiate the loss named by `c.loss`.
+
+        Supported names are "ge2e", "angleproto" and "softmaxproto"; the last
+        one also needs `c.model_params["proj_dim"]` and `num_classes`.
+
+        Raises:
+            Exception: if `c.loss` is not one of the supported names.
+        """
+        if c.loss == "ge2e":
+            criterion = GE2ELoss(loss_method="softmax")
+        elif c.loss == "angleproto":
+            criterion = AngleProtoLoss()
+        elif c.loss == "softmaxproto":
+            criterion = SoftmaxAngleProtoLoss(c.model_params["proj_dim"], num_classes)
+        else:
+            raise Exception("The %s not is a loss supported" % c.loss)
+        return criterion
+
+    def load_checkpoint(
+        self,
+        config: Coqpit,
+        checkpoint_path: str,
+        eval: bool = False,
+        use_cuda: bool = False,
+        criterion=None,
+        cache=False,
+    ):
+        """Restore the model (and optionally the criterion) from a checkpoint.
+
+        Args:
+            config (Coqpit): model config; used for partial initialisation and to
+                rebuild the criterion at inference time.
+            checkpoint_path (str): checkpoint location, loaded onto the CPU.
+            eval (bool): if True, a state-dict mismatch is re-raised and the
+                model is switched to evaluation mode.
+            use_cuda (bool): move the model and the criterion to CUDA.
+            criterion: optional loss module whose state is restored when the
+                checkpoint holds a "criterion" entry.
+            cache (bool): forwarded to `load_fsspec`.
+
+        Returns:
+            `criterion` when `eval` is True, otherwise the tuple
+            `(criterion, state["step"])`.
+        """
+        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
+        try:
+            self.load_state_dict(state["model"])
+            print(" > Model fully restored. ")
+        except (KeyError, RuntimeError) as error:
+            # If eval raise the error
+            if eval:
+                raise error
+
+            print(" > Partial model initialization.")
+            model_dict = self.state_dict()
+            # The config must be passed here; the previous code referenced an
+            # undefined name `c`, so this fallback always died with a NameError.
+            model_dict = set_init_dict(model_dict, state["model"], config)
+            self.load_state_dict(model_dict)
+            del model_dict
+
+        # load the criterion for restore_path
+        if criterion is not None and "criterion" in state:
+            try:
+                criterion.load_state_dict(state["criterion"])
+            except (KeyError, RuntimeError) as error:
+                print(" > Criterion load ignored because of:", error)
+
+        # instance and load the criterion for the encoder classifier in inference time
+        if (
+            eval
+            and criterion is None
+            and "criterion" in state
+            and getattr(config, "map_classid_to_classname", None) is not None
+        ):
+            criterion = self.get_criterion(config, len(config.map_classid_to_classname))
+            criterion.load_state_dict(state["criterion"])
+
+        if use_cuda:
+            self.cuda()
+            if criterion is not None:
+                criterion = criterion.cuda()
+
+        if eval:
+            self.eval()
+            assert not self.training
+
+        if not eval:
+            return criterion, state["step"]
+        return criterion
diff --git a/TTS/encoder/models/lstm.py b/TTS/encoder/models/lstm.py
new file mode 100644
index 0000000000000000000000000000000000000000..51852b5b820d181824b0db1a205cd5d7bd4fb20d
--- /dev/null
+++ b/TTS/encoder/models/lstm.py
@@ -0,0 +1,99 @@
+import torch
+from torch import nn
+
+from TTS.encoder.models.base_encoder import BaseEncoder
+
+
+class LSTMWithProjection(nn.Module):
+    """Single batch-first LSTM layer followed by a bias-free linear projection."""
+
+    def __init__(self, input_size, hidden_size, proj_size):
+        super().__init__()
+        self.input_size, self.hidden_size, self.proj_size = input_size, hidden_size, proj_size
+        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
+        self.linear = nn.Linear(hidden_size, proj_size, bias=False)
+
+    def forward(self, x):
+        """Project the LSTM output of every time step to `proj_size` features."""
+        self.lstm.flatten_parameters()
+        # Only the per-step outputs are used; the final (h, c) state is dropped.
+        step_outputs, (_, _) = self.lstm(x)
+        return self.linear(step_outputs)
+
+
+class LSTMWithoutProjection(nn.Module):
+    """Multi-layer batch-first LSTM whose last hidden state is projected and rectified."""
+
+    def __init__(self, input_dim, lstm_dim, proj_dim, num_lstm_layers):
+        super().__init__()
+        self.lstm = nn.LSTM(
+            input_size=input_dim,
+            hidden_size=lstm_dim,
+            num_layers=num_lstm_layers,
+            batch_first=True,
+        )
+        self.linear = nn.Linear(lstm_dim, proj_dim, bias=True)
+        self.relu = nn.ReLU()
+
+    def forward(self, x):
+        """Return ReLU(linear(h)) where `h` is the final hidden state of the top layer."""
+        _, (hidden, _) = self.lstm(x)
+        top_layer_state = hidden[-1]
+        projected = self.linear(top_layer_state)
+        return self.relu(projected)
+
+
+class LSTMSpeakerEncoder(BaseEncoder):
+    """Speaker encoder built from a stack of LSTM layers.
+
+    With `use_lstm_with_projection` the stack is `num_lstm_layers` instances of
+    `LSTMWithProjection`; otherwise a single `LSTMWithoutProjection` holding
+    `num_lstm_layers` layers is used.
+    """
+
+    def __init__(
+        self,
+        input_dim,
+        proj_dim=256,
+        lstm_dim=768,
+        num_lstm_layers=3,
+        use_lstm_with_projection=True,
+        use_torch_spec=False,
+        audio_config=None,
+    ):
+        super().__init__()
+        self.use_lstm_with_projection = use_lstm_with_projection
+        self.use_torch_spec = use_torch_spec
+        self.audio_config = audio_config
+        self.proj_dim = proj_dim
+
+        # choose the LSTM layer type
+        if use_lstm_with_projection:
+            # The first layer consumes the input features; every later layer
+            # consumes the projection of the one before it.
+            stack = [LSTMWithProjection(input_dim, lstm_dim, proj_dim)]
+            stack.extend(LSTMWithProjection(proj_dim, lstm_dim, proj_dim) for _ in range(num_lstm_layers - 1))
+            self.layers = nn.Sequential(*stack)
+        else:
+            self.layers = LSTMWithoutProjection(input_dim, lstm_dim, proj_dim, num_lstm_layers)
+
+        self.instancenorm = nn.InstanceNorm1d(input_dim)
+
+        self.torch_spec = self.get_torch_mel_spectrogram_class(audio_config) if self.use_torch_spec else None
+
+        self._init_layers()
+
+    def _init_layers(self):
+        """Zero every bias and Xavier-initialise every weight of `self.layers`."""
+        for param_name, tensor in self.layers.named_parameters():
+            if "bias" in param_name:
+                nn.init.constant_(tensor, 0.0)
+                continue
+            if "weight" in param_name:
+                nn.init.xavier_normal_(tensor)
+
+    def forward(self, x, l2_norm=True):
+        """Forward pass of the model.
+
+        Args:
+            x (Tensor): Raw waveform signal or spectrogram frames. If input is a waveform, `torch_spec` must be `True`
+                to compute the spectrogram on-the-fly.
+            l2_norm (bool): Whether to L2-normalize the outputs.
+
+        Shapes:
+            - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})`
+        """
+        # Feature extraction and normalisation run without gradients and with
+        # autocast disabled; only the LSTM stack below is differentiated.
+        with torch.no_grad(), torch.cuda.amp.autocast(enabled=False):
+            if self.use_torch_spec:
+                # NOTE: in-place squeeze, the caller's tensor is modified too.
+                x.squeeze_(1)
+                x = self.torch_spec(x)
+            x = self.instancenorm(x).transpose(1, 2)
+
+        encoded = self.layers(x)
+        if self.use_lstm_with_projection:
+            # Keep only the output of the last time step.
+            encoded = encoded[:, -1]
+        if l2_norm:
+            encoded = torch.nn.functional.normalize(encoded, p=2, dim=1)
+        return encoded
diff --git a/TTS/encoder/models/resnet.py b/TTS/encoder/models/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..5eafcd6005739fcdc454fb20def3e66791766a53
--- /dev/null
+++ b/TTS/encoder/models/resnet.py
@@ -0,0 +1,198 @@
+import torch
+from torch import nn
+
+# from TTS.utils.audio.torch_transforms import TorchSTFT
+from TTS.encoder.models.base_encoder import BaseEncoder
+
+
+class SELayer(nn.Module):
+    """Squeeze-and-excitation gate: rescales each channel by a learned factor in (0, 1)."""
+
+    def __init__(self, channel, reduction=8):
+        super().__init__()
+        bottleneck = channel // reduction
+        self.avg_pool = nn.AdaptiveAvgPool2d(1)
+        self.fc = nn.Sequential(
+            nn.Linear(channel, bottleneck),
+            nn.ReLU(inplace=True),
+            nn.Linear(bottleneck, channel),
+            nn.Sigmoid(),
+        )
+
+    def forward(self, x):
+        """Multiply a 4-D input by its per-channel gates."""
+        batch, channels, _, _ = x.size()
+        # Squeeze: one average per (sample, channel) pair.
+        pooled = self.avg_pool(x).view(batch, channels)
+        # Excite: gates reshaped so they broadcast over the two trailing axes.
+        gates = self.fc(pooled).view(batch, channels, 1, 1)
+        return x * gates
+
+
+class SEBasicBlock(nn.Module):
+    """Residual block of two 3x3 convolutions with a squeeze-and-excitation gate.
+
+    `downsample`, when given, is applied to the block input before it is added
+    back as the residual.
+    """
+
+    expansion = 1
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8):
+        super().__init__()
+        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.relu = nn.ReLU(inplace=True)
+        self.se = SELayer(planes, reduction)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        """Return relu(se(bn2(conv2(bn1(relu(conv1(x)))))) + residual)."""
+        # In the first stage the activation precedes the batch norm.
+        out = self.bn1(self.relu(self.conv1(x)))
+        out = self.se(self.bn2(self.conv2(out)))
+
+        shortcut = x if self.downsample is None else self.downsample(x)
+
+        out += shortcut
+        return self.relu(out)
+
+
+class ResNetSpeakerEncoder(BaseEncoder):
+ """Implementation of the model H/ASP without batch normalization in speaker embedding. This model was proposed in: https://arxiv.org/abs/2009.14153
+ Adapted from: https://github.com/clovaai/voxceleb_trainer
+
+ The network is a 2-D convolution stem, four stages of `SEBasicBlock`s, an
+ attention-weighted pooling over the last axis ("SAP" or "ASP") and a final
+ linear projection to `proj_dim`.
+ """
+
+ # NOTE: the list defaults below are shared between calls; within this class
+ # they are only indexed, never modified.
+ # pylint: disable=W0102
+ def __init__(
+ self,
+ input_dim=64,
+ proj_dim=512,
+ layers=[3, 4, 6, 3],
+ num_filters=[32, 64, 128, 256],
+ encoder_type="ASP",
+ log_input=False,
+ use_torch_spec=False,
+ audio_config=None,
+ ):
+ super(ResNetSpeakerEncoder, self).__init__()
+
+ self.encoder_type = encoder_type
+ self.input_dim = input_dim
+ self.log_input = log_input
+ self.use_torch_spec = use_torch_spec
+ self.audio_config = audio_config
+ self.proj_dim = proj_dim
+
+ # Stem: single input channel -> num_filters[0] channels.
+ self.conv1 = nn.Conv2d(1, num_filters[0], kernel_size=3, stride=1, padding=1)
+ self.relu = nn.ReLU(inplace=True)
+ self.bn1 = nn.BatchNorm2d(num_filters[0])
+
+ # Running input-channel count; create_layer reads and updates it.
+ self.inplanes = num_filters[0]
+ self.layer1 = self.create_layer(SEBasicBlock, num_filters[0], layers[0])
+ self.layer2 = self.create_layer(SEBasicBlock, num_filters[1], layers[1], stride=(2, 2))
+ self.layer3 = self.create_layer(SEBasicBlock, num_filters[2], layers[2], stride=(2, 2))
+ self.layer4 = self.create_layer(SEBasicBlock, num_filters[3], layers[3], stride=(2, 2))
+
+ self.instancenorm = nn.InstanceNorm1d(input_dim)
+
+ if self.use_torch_spec:
+ self.torch_spec = self.get_torch_mel_spectrogram_class(audio_config)
+ else:
+ self.torch_spec = None
+
+ # NOTE(review): presumably the feature-axis size left after the three
+ # stride-2 stages (2 * 2 * 2 = 8) - confirm for input_dim not divisible by 8.
+ outmap_size = int(self.input_dim / 8)
+
+ # Produces one weight per flattened (channel, feature) row, normalised
+ # with a softmax over the last axis (dim=2).
+ self.attention = nn.Sequential(
+ nn.Conv1d(num_filters[3] * outmap_size, 128, kernel_size=1),
+ nn.ReLU(),
+ nn.BatchNorm1d(128),
+ nn.Conv1d(128, num_filters[3] * outmap_size, kernel_size=1),
+ nn.Softmax(dim=2),
+ )
+
+ # "SAP" pools a weighted mean only; "ASP" concatenates a weighted mean and
+ # a weighted standard deviation, hence twice the size.
+ if self.encoder_type == "SAP":
+ out_dim = num_filters[3] * outmap_size
+ elif self.encoder_type == "ASP":
+ out_dim = num_filters[3] * outmap_size * 2
+ else:
+ raise ValueError("Undefined encoder")
+
+ self.fc = nn.Linear(out_dim, proj_dim)
+
+ self._init_layers()
+
+ def _init_layers(self):
+ # Kaiming-normal init for every Conv2d weight; BatchNorm2d starts as the
+ # identity transform (weight 1, bias 0).
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
+ elif isinstance(m, nn.BatchNorm2d):
+ nn.init.constant_(m.weight, 1)
+ nn.init.constant_(m.bias, 0)
+
+ def create_layer(self, block, planes, blocks, stride=1):
+ # Builds one stage of `blocks` consecutive `block` modules. Only the first
+ # block receives `stride` and, when the stride or channel count changes,
+ # a 1x1 conv + batch-norm shortcut. Updates self.inplanes as a side effect.
+ downsample = None
+ if stride != 1 or self.inplanes != planes * block.expansion:
+ downsample = nn.Sequential(
+ nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False),
+ nn.BatchNorm2d(planes * block.expansion),
+ )
+
+ layers = []
+ layers.append(block(self.inplanes, planes, stride, downsample))
+ self.inplanes = planes * block.expansion
+ for _ in range(1, blocks):
+ layers.append(block(self.inplanes, planes))
+
+ return nn.Sequential(*layers)
+
+ # pylint: disable=R0201
+ def new_parameter(self, *size):
+ # Returns a new Xavier-normal initialised nn.Parameter of the given size.
+ out = nn.Parameter(torch.FloatTensor(*size))
+ nn.init.xavier_normal_(out)
+ return out
+
+ def forward(self, x, l2_norm=False):
+ """Forward pass of the model.
+
+ Args:
+ x (Tensor): Raw waveform signal or spectrogram frames. If input is a waveform, `torch_spec` must be `True`
+ to compute the spectrogram on-the-fly.
+ l2_norm (bool): Whether to L2-normalize the outputs.
+
+ Shapes:
+ - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})`
+ """
+ # NOTE: in-place squeeze, so the caller's tensor is modified as well.
+ x.squeeze_(1)
+ # if you torch spec compute it otherwise use the mel spec computed by the AP
+ if self.use_torch_spec:
+ x = self.torch_spec(x)
+
+ if self.log_input:
+ # The small offset keeps log() finite for zero-valued bins.
+ x = (x + 1e-6).log()
+ # Normalise, then add the single channel axis expected by conv1.
+ x = self.instancenorm(x).unsqueeze(1)
+
+ x = self.conv1(x)
+ x = self.relu(x)
+ x = self.bn1(x)
+
+ x = self.layer1(x)
+ x = self.layer2(x)
+ x = self.layer3(x)
+ x = self.layer4(x)
+
+ # Merge every axis between the batch axis and the last axis into one.
+ x = x.reshape(x.size()[0], -1, x.size()[-1])
+
+ w = self.attention(x)
+
+ if self.encoder_type == "SAP":
+ x = torch.sum(x * w, dim=2)
+ elif self.encoder_type == "ASP":
+ # Weighted mean and weighted standard deviation over the last axis;
+ # the clamp keeps the variance positive before the square root.
+ mu = torch.sum(x * w, dim=2)
+ sg = torch.sqrt((torch.sum((x**2) * w, dim=2) - mu**2).clamp(min=1e-5))
+ x = torch.cat((mu, sg), 1)
+
+ x = x.view(x.size()[0], -1)
+ x = self.fc(x)
+
+ if l2_norm:
+ x = torch.nn.functional.normalize(x, p=2, dim=1)
+ return x
diff --git a/TTS/encoder/requirements.txt b/TTS/encoder/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a486cc45ddb44591bd03c9c0df294fbe98c13884
--- /dev/null
+++ b/TTS/encoder/requirements.txt
@@ -0,0 +1,2 @@
+umap-learn
+numpy>=1.17.0
diff --git a/TTS/encoder/utils/__init__.py b/TTS/encoder/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/TTS/encoder/utils/generic_utils.py b/TTS/encoder/utils/generic_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..1da029611b5c9bd59b05d61189674832d50ed634
--- /dev/null
+++ b/TTS/encoder/utils/generic_utils.py
@@ -0,0 +1,182 @@
+import datetime
+import glob
+import os
+import random
+import re
+
+import numpy as np
+from scipy import signal
+
+from TTS.encoder.models.lstm import LSTMSpeakerEncoder
+from TTS.encoder.models.resnet import ResNetSpeakerEncoder
+from TTS.utils.io import save_fsspec
+
+
+class AugmentWAV(object):
+    """Waveform augmentation with additive noise and room impulse responses.
+
+    Args:
+        ap: audio processor; only `ap.load_wav(path, sr=...)` and
+            `ap.sample_rate` are used.
+        augmentation_config: mapping with optional "additive" and "rir" entries.
+            "additive" needs "sounds_path" plus one dict per noise type holding
+            "min_num_noises", "max_num_noises", "min_snr_in_db" and
+            "max_snr_in_db". "rir" needs "rir_path" and "conv_mode".
+    """
+
+    def __init__(self, ap, augmentation_config):
+        self.ap = ap
+        self.use_additive_noise = False
+
+        if "additive" in augmentation_config.keys():
+            self.additive_noise_config = augmentation_config["additive"]
+            additive_path = self.additive_noise_config["sounds_path"]
+            if additive_path:
+                self.use_additive_noise = True
+                # Every dict-valued entry of the config names one noise type.
+                self.additive_noise_types = [
+                    key
+                    for key in self.additive_noise_config.keys()
+                    if isinstance(self.additive_noise_config[key], dict)
+                ]
+
+                additive_files = glob.glob(os.path.join(additive_path, "**/*.wav"), recursive=True)
+
+                self.noise_list = {}
+
+                for wav_file in additive_files:
+                    # The noise type is the first directory below `sounds_path`.
+                    # relpath works whether or not the configured path ends with
+                    # a separator; stripping the prefix with str.replace did not.
+                    noise_dir = os.path.relpath(wav_file, additive_path).split(os.sep)[0]
+                    # ignore not listed directories
+                    if noise_dir not in self.additive_noise_types:
+                        continue
+                    self.noise_list.setdefault(noise_dir, []).append(wav_file)
+
+                print(
+                    f" | > Using Additive Noise Augmentation: with {len(additive_files)} audios instances from {self.additive_noise_types}"
+                )
+
+        self.use_rir = False
+
+        if "rir" in augmentation_config.keys():
+            self.rir_config = augmentation_config["rir"]
+            if self.rir_config["rir_path"]:
+                self.rir_files = glob.glob(os.path.join(self.rir_config["rir_path"], "**/*.wav"), recursive=True)
+                self.use_rir = True
+
+                print(f" | > Using RIR Noise Augmentation: with {len(self.rir_files)} audios instances")
+
+        self.create_augmentation_global_list()
+
+    def create_augmentation_global_list(self):
+        """Build `global_noise_list`, the augmentations `apply_one` picks from."""
+        # Copy the list: appending "RIR_AUG" to an alias would silently add it
+        # to `additive_noise_types` as well.
+        self.global_noise_list = list(self.additive_noise_types) if self.use_additive_noise else []
+        if self.use_rir:
+            self.global_noise_list.append("RIR_AUG")
+
+    def additive_noise(self, noise_type, audio):
+        """Return `audio` mixed with randomly chosen noises of `noise_type`.
+
+        Between "min_num_noises" and "max_num_noises" files are sampled; each is
+        scaled to a signal-to-noise ratio drawn uniformly from
+        ["min_snr_in_db", "max_snr_in_db"]. Noise files shorter than `audio`
+        are skipped.
+        """
+        type_config = self.additive_noise_config[noise_type]
+        clean_db = 10 * np.log10(np.mean(audio**2) + 1e-4)
+
+        noise_list = random.sample(
+            self.noise_list[noise_type],
+            random.randint(type_config["min_num_noises"], type_config["max_num_noises"]),
+        )
+
+        audio_len = audio.shape[0]
+        noises_wav = None
+        for noise in noise_list:
+            noiseaudio = self.ap.load_wav(noise, sr=self.ap.sample_rate)[:audio_len]
+
+            if noiseaudio.shape[0] < audio_len:
+                continue
+
+            # The upper bound is the maximum SNR; it used to read
+            # "max_num_noises", which made the SNR range meaningless.
+            noise_snr = random.uniform(type_config["min_snr_in_db"], type_config["max_snr_in_db"])
+            noise_db = 10 * np.log10(np.mean(noiseaudio**2) + 1e-4)
+            noise_wav = np.sqrt(10 ** ((clean_db - noise_db - noise_snr) / 10)) * noiseaudio
+
+            if noises_wav is None:
+                noises_wav = noise_wav
+            else:
+                noises_wav += noise_wav
+
+        # If every sampled file was shorter than the audio, sample again.
+        # NOTE(review): this never terminates normally when no file of this
+        # type is long enough (it ends in RecursionError).
+        if noises_wav is None:
+            return self.additive_noise(noise_type, audio)
+
+        return audio + noises_wav
+
+    def reverberate(self, audio):
+        """Convolve `audio` with a random, energy-normalised impulse response."""
+        audio_len = audio.shape[0]
+
+        rir_file = random.choice(self.rir_files)
+        rir = self.ap.load_wav(rir_file, sr=self.ap.sample_rate)
+        rir = rir / np.sqrt(np.sum(rir**2))
+        # The result is truncated to the length of the input.
+        return signal.convolve(audio, rir, mode=self.rir_config["conv_mode"])[:audio_len]
+
+    def apply_one(self, audio):
+        """Apply one augmentation chosen at random from `global_noise_list`."""
+        noise_type = random.choice(self.global_noise_list)
+        if noise_type == "RIR_AUG":
+            return self.reverberate(audio)
+
+        return self.additive_noise(noise_type, audio)
+
+
+def to_camel(text):
+    """Convert a snake_case name to CamelCase.
+
+    The text is first passed through `str.capitalize`, which upper-cases the
+    first character and lower-cases the rest. Then every underscore that is not
+    at the start and is followed by an ASCII letter is removed, and that letter
+    is upper-cased.
+    """
+
+    def _upper_letter(match):
+        return match.group(1).upper()
+
+    capitalized = text.capitalize()
+    return re.sub(r"(?!^)_([a-zA-Z])", _upper_letter, capitalized)
+
+
+def setup_encoder_model(config: "Coqpit"):
+    """Instantiate the speaker-encoder model selected by the config.
+
+    Args:
+        config: config whose `model_params["model_name"]` is "lstm" or "resnet"
+            (case-insensitive). `model_params` and `audio` supply the
+            constructor arguments.
+
+    Returns:
+        An `LSTMSpeakerEncoder` or a `ResNetSpeakerEncoder`.
+
+    Raises:
+        ValueError: if the model name is not supported. Previously this case
+            fell through to `return model` and raised UnboundLocalError.
+    """
+    model_name = config.model_params["model_name"].lower()
+    if model_name == "lstm":
+        model = LSTMSpeakerEncoder(
+            config.model_params["input_dim"],
+            config.model_params["proj_dim"],
+            config.model_params["lstm_dim"],
+            config.model_params["num_lstm_layers"],
+            use_torch_spec=config.model_params.get("use_torch_spec", False),
+            audio_config=config.audio,
+        )
+    elif model_name == "resnet":
+        model = ResNetSpeakerEncoder(
+            input_dim=config.model_params["input_dim"],
+            proj_dim=config.model_params["proj_dim"],
+            log_input=config.model_params.get("log_input", False),
+            use_torch_spec=config.model_params.get("use_torch_spec", False),
+            audio_config=config.audio,
+        )
+    else:
+        raise ValueError(f"Unknown encoder model name: {config.model_params['model_name']!r}")
+    return model
+
+
+def save_checkpoint(model, optimizer, criterion, model_loss, out_path, current_step, epoch):
+    """Write `checkpoint_<current_step>.pth` into `out_path`.
+
+    The saved dict holds the model, optimizer (None when no optimizer is given)
+    and criterion state dicts together with the step, epoch, loss and today's
+    date.
+    """
+    checkpoint_path = os.path.join(out_path, f"checkpoint_{current_step}.pth")
+    print(f" | | > Checkpoint saving : {checkpoint_path}")
+
+    state = {"model": model.state_dict()}
+    state["optimizer"] = None if optimizer is None else optimizer.state_dict()
+    state["criterion"] = criterion.state_dict()
+    state["step"] = current_step
+    state["epoch"] = epoch
+    state["loss"] = model_loss
+    state["date"] = datetime.date.today().strftime("%B %d, %Y")
+    save_fsspec(state, checkpoint_path)
+
+
+def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path, current_step, epoch):
+    """Save `best_model.pth` into `out_path` when `model_loss` beats `best_loss`.
+
+    Returns:
+        The new best loss: `model_loss` if a model was saved, otherwise the
+        unchanged `best_loss`.
+    """
+    # Written as a negated "<" so that incomparable values (e.g. NaN) are
+    # treated exactly like a loss that did not improve.
+    if not model_loss < best_loss:
+        return best_loss
+
+    state = {"model": model.state_dict()}
+    state["optimizer"] = optimizer.state_dict()
+    state["criterion"] = criterion.state_dict()
+    state["step"] = current_step
+    state["epoch"] = epoch
+    state["loss"] = model_loss
+    state["date"] = datetime.date.today().strftime("%B %d, %Y")
+
+    bestmodel_path = os.path.join(out_path, "best_model.pth")
+    print(f"\n > BEST MODEL ({model_loss:.5f}) : {bestmodel_path}")
+    save_fsspec(state, bestmodel_path)
+    return model_loss
diff --git a/TTS/encoder/utils/io.py b/TTS/encoder/utils/io.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1dad3e24d234cdcb9616fb14bc87919c7e20291
--- /dev/null
+++ b/TTS/encoder/utils/io.py
@@ -0,0 +1,38 @@
+import datetime
+import os
+
+from TTS.utils.io import save_fsspec
+
+
+def save_checkpoint(model, optimizer, model_loss, out_path, current_step):
+    """Write `checkpoint_<current_step>.pth` into `out_path`.
+
+    The saved dict holds the model and optimizer state dicts (the optimizer
+    entry is None when no optimizer is given), the step, the loss and today's
+    date.
+    """
+    checkpoint_path = os.path.join(out_path, f"checkpoint_{current_step}.pth")
+    print(f" | | > Checkpoint saving : {checkpoint_path}")
+
+    state = {"model": model.state_dict()}
+    state["optimizer"] = None if optimizer is None else optimizer.state_dict()
+    state["step"] = current_step
+    state["loss"] = model_loss
+    state["date"] = datetime.date.today().strftime("%B %d, %Y")
+    save_fsspec(state, checkpoint_path)
+
+
+def save_best_model(model, optimizer, model_loss, best_loss, out_path, current_step):
+    """Save `best_model.pth` into `out_path` when `model_loss` beats `best_loss`.
+
+    Returns:
+        The new best loss: `model_loss` if a model was saved, otherwise the
+        unchanged `best_loss`.
+    """
+    # Written as a negated "<" so that incomparable values (e.g. NaN) are
+    # treated exactly like a loss that did not improve.
+    if not model_loss < best_loss:
+        return best_loss
+
+    state = {"model": model.state_dict()}
+    state["optimizer"] = optimizer.state_dict()
+    state["step"] = current_step
+    state["loss"] = model_loss
+    state["date"] = datetime.date.today().strftime("%B %d, %Y")
+
+    bestmodel_path = os.path.join(out_path, "best_model.pth")
+    print(f"\n > BEST MODEL ({model_loss:.5f}) : {bestmodel_path}")
+    save_fsspec(state, bestmodel_path)
+    return model_loss
diff --git a/TTS/encoder/utils/prepare_voxceleb.py b/TTS/encoder/utils/prepare_voxceleb.py
new file mode 100644
index 0000000000000000000000000000000000000000..b93baf9e60f0d5c35a4e86f6746e29f6097174b5
--- /dev/null
+++ b/TTS/encoder/utils/prepare_voxceleb.py
@@ -0,0 +1,219 @@
+# coding=utf-8
+# Copyright (C) 2020 ATHENA AUTHORS; Yiping Peng; Ne Luo
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# Only support eager mode and TF>=2.0.0
+# pylint: disable=no-member, invalid-name, relative-beyond-top-level
+# pylint: disable=too-many-locals, too-many-statements, too-many-arguments, too-many-instance-attributes
+""" voxceleb 1 & 2 """
+
+import hashlib
+import os
+import subprocess
+import sys
+import zipfile
+
+import pandas
+import soundfile as sf
+from absl import logging
+
+# Download URLs per subset. A subset is either a single ".zip" archive or a list
+# of split parts ("..._partaa", "..._partab", ...) that download_and_extract
+# concatenates into one archive.
+# NOTE(review): the vox2 entries are also listed under the "vox1a/" path - confirm.
+SUBSETS = {
+ "vox1_dev_wav": [
+ "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partaa",
+ "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partab",
+ "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partac",
+ "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_partad",
+ ],
+ "vox1_test_wav": ["https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_test_wav.zip"],
+ "vox2_dev_aac": [
+ "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partaa",
+ "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partab",
+ "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partac",
+ "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partad",
+ "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partae",
+ "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partaf",
+ "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partag",
+ "https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_dev_aac_partah",
+ ],
+ "vox2_test_aac": ["https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox2_test_aac.zip"],
+}
+
+# Expected md5 digest of the assembled zip archive of each subset.
+MD5SUM = {
+ "vox1_dev_wav": "ae63e55b951748cc486645f532ba230b",
+ "vox2_dev_aac": "bbc063c46078a602ca71605645c2a402",
+ "vox1_test_wav": "185fdc63c3c739954633d50379a3d102",
+ "vox2_test_aac": "0d2b3ea430a821c33263b5ea37ede312",
+}
+
+# Download credentials; filled in from the command line by the __main__ block.
+USER = {"user": "", "password": ""}
+
+# Speaker name -> integer id, assigned in order of first appearance and shared
+# by every call of convert_audio_and_make_label in this process.
+speaker_id_dict = {}
+
+
+def download_and_extract(directory, subset, urls):
+    """Download and extract the given split of dataset.
+
+    Files that already exist in `directory` are not downloaded again. Split
+    archives ("..._partaa", "..._partab", ...) are concatenated into a single
+    zip before the checksum is verified and the archive is extracted.
+
+    Args:
+        directory: the directory where to put the downloaded data.
+        subset: subset name of the corpus; selects the expected md5 checksum.
+        urls: the list of urls to download the data file.
+
+    Raises:
+        ValueError: if `urls` is empty or the archive checksum does not match.
+    """
+    # Function-scope imports keep this block self-contained.
+    import glob
+    import shutil
+
+    if not urls:
+        raise ValueError("no download urls given for subset %s" % subset)
+
+    os.makedirs(directory, exist_ok=True)
+
+    for url in urls:
+        zip_filepath = os.path.join(directory, url.split("/")[-1])
+        if os.path.exists(zip_filepath):
+            continue
+        logging.info("Downloading %s to %s" % (url, zip_filepath))
+        # An argument list (no shell) keeps credentials and paths containing
+        # spaces or shell metacharacters intact.
+        subprocess.call(["wget", url, "--user", USER["user"], "--password", USER["password"], "-O", zip_filepath])
+
+        statinfo = os.stat(zip_filepath)
+        logging.info("Successfully downloaded %s, size(bytes): %d" % (url, statinfo.st_size))
+
+    # concatenate all parts into zip files
+    if not zip_filepath.endswith(".zip"):
+        # Drop the trailing "_partXX" component to get the archive stem.
+        archive_stem = "_".join(zip_filepath.split("_")[:-1])
+        zip_filepath = archive_stem + ".zip"
+        # Sorted glob order matches the order the shell would concatenate in.
+        part_paths = sorted(glob.glob(glob.escape(archive_stem) + "_*"))
+        with open(zip_filepath, "wb") as f_out:
+            for part_path in part_paths:
+                with open(part_path, "rb") as f_part:
+                    shutil.copyfileobj(f_part, f_out)
+    # Remove the ".zip" suffix. str.strip(".zip") must not be used here: it
+    # strips any of the characters ".", "z", "i", "p" from BOTH ends.
+    extract_path = zip_filepath[: -len(".zip")]
+
+    # check zip file md5sum, hashing in chunks so multi-gigabyte archives are
+    # never held in memory at once
+    digest = hashlib.md5()
+    with open(zip_filepath, "rb") as f_zip:
+        for chunk in iter(lambda: f_zip.read(1 << 20), b""):
+            digest.update(chunk)
+    if digest.hexdigest() != MD5SUM[subset]:
+        raise ValueError("md5sum of %s mismatch" % zip_filepath)
+
+    with zipfile.ZipFile(zip_filepath, "r") as zfile:
+        zfile.extractall(directory)
+        extract_path_ori = os.path.join(directory, zfile.infolist()[0].filename)
+
+    # Rename the first extracted entry to the subset name.
+    extract_path_ori = extract_path_ori.rstrip("/" + os.sep)
+    if os.path.normpath(extract_path_ori) != os.path.normpath(extract_path):
+        shutil.move(extract_path_ori, extract_path)
+
+
+def exec_cmd(cmd):
+    """Run a command line through the shell and report its exit status.
+
+    Args:
+        cmd: command line to be executed.
+    Return:
+        int, the return code of the command, or -999 when it could not be
+        started (OSError).
+    """
+    try:
+        status = subprocess.call(cmd, shell=True)
+    except OSError as err:
+        logging.info(f"Execution failed: {err}")
+        return -999
+    # A negative status means the child was terminated by a signal.
+    if status < 0:
+        logging.info(f"Child was terminated by signal {status}")
+    return status
+
+
+def decode_aac_with_ffmpeg(aac_file, wav_file):
+    """Decode a given AAC file into WAV using ffmpeg.
+    Args:
+        aac_file: file path to input AAC file.
+        wav_file: file path to output WAV file.
+    Return:
+        bool, True if success.
+    """
+    # Function-scope import keeps this block self-contained.
+    from shlex import quote
+
+    # The command is run through the shell, so both paths are quoted: an
+    # unquoted path containing a space or a shell metacharacter would be split
+    # or interpreted. Plain paths are left unchanged by quote().
+    cmd = f"ffmpeg -i {quote(aac_file)} {quote(wav_file)}"
+    logging.info(f"Decoding aac file using command line: {cmd}")
+    ret = exec_cmd(cmd)
+    if ret != 0:
+        logging.error(f"Failed to decode aac file with retcode {ret}")
+        logging.error("Please check your ffmpeg installation.")
+        return False
+    return True
+
+
+def convert_audio_and_make_label(input_dir, subset, output_dir, output_file):
+    """Optionally convert AAC to WAV and make speaker labels.
+
+    Walks `<input_dir>/<subset>`, decodes every ".m4a" file to "<name>.m4a.wav"
+    unless that file already exists, and writes one tab-separated row per
+    audio file.
+
+    Args:
+        input_dir: the directory which holds the input dataset.
+        subset: the name of the specified subset. e.g. vox1_dev_wav
+        output_dir: the directory to place the newly generated csv files.
+        output_file: the name of the newly generated csv file. e.g. vox1_dev_wav.csv
+
+    Raises:
+        RuntimeError: if an AAC file cannot be decoded.
+    """
+
+    logging.info(f"Preprocessing audio and label for subset {subset}")
+    source_dir = os.path.join(input_dir, subset)
+
+    rows = []
+    # Convert all AAC file into WAV format. At the same time, generate the csv
+    for root, _, filenames in os.walk(source_dir):
+        for filename in filenames:
+            stem, extension = os.path.splitext(filename)
+            extension = extension.lower()
+            if extension == ".wav":
+                # Names with a second extension (e.g. "x.m4a.wav") are decoder
+                # outputs; they are picked up through their ".m4a" source.
+                if os.path.splitext(stem)[1]:
+                    continue
+                wav_file = os.path.join(root, filename)
+            elif extension == ".m4a":
+                # Convert AAC to WAV.
+                aac_file = os.path.join(root, filename)
+                wav_file = aac_file + ".wav"
+                if not os.path.exists(wav_file) and not decode_aac_with_ffmpeg(aac_file, wav_file):
+                    raise RuntimeError("Audio decoding failed.")
+            else:
+                continue
+            # The speaker is taken from the directory two levels above the file.
+            speaker_name = root.split(os.path.sep)[-2]
+            # New speakers get the next free integer id.
+            speaker_id = speaker_id_dict.setdefault(speaker_name, len(speaker_id_dict))
+            # NOTE: this is the length of the array returned by sf.read, even
+            # though the csv column is named "wav_length_ms".
+            wav_length = len(sf.read(wav_file)[0])
+            rows.append((os.path.abspath(wav_file), wav_length, speaker_id, speaker_name))
+
+    # Write to CSV file which contains four columns:
+    # "wav_filename", "wav_length_ms", "speaker_id", "speaker_name".
+    csv_file_path = os.path.join(output_dir, output_file)
+    column_names = ["wav_filename", "wav_length_ms", "speaker_id", "speaker_name"]
+    pandas.DataFrame(data=rows, columns=column_names).to_csv(csv_file_path, index=False, sep="\t")
+    logging.info(f"Successfully generated csv file {csv_file_path}")
+
+
+def processor(directory, subset, force_process):
+    """Download and process one subset; return the path of its csv file.
+
+    When the csv already exists and `force_process` is false, nothing is
+    downloaded or converted.
+
+    Raises:
+        ValueError: if `subset` is not a key of SUBSETS.
+    """
+    if subset not in SUBSETS:
+        raise ValueError(subset, "is not in voxceleb")
+
+    csv_name = subset + ".csv"
+    subset_csv = os.path.join(directory, csv_name)
+    already_done = os.path.exists(subset_csv) if not force_process else False
+    if already_done:
+        return subset_csv
+
+    logging.info("Downloading and process the voxceleb in %s", directory)
+    logging.info("Preparing subset %s", subset)
+    download_and_extract(directory, subset, SUBSETS[subset])
+    convert_audio_and_make_label(directory, subset, directory, csv_name)
+    logging.info("Finished downloading and processing")
+    return subset_csv
+
+
+# Script entry point: download and process every subset listed in SUBSETS.
+if __name__ == "__main__":
+    logging.set_verbosity(logging.INFO)
+    if len(sys.argv) != 4:
+        # The usage text names this script; a usage error is a failure, so the
+        # process exits with a non-zero status.
+        print("Usage: python prepare_voxceleb.py save_directory user password")
+        sys.exit(1)
+
+    DIR, USER["user"], USER["password"] = sys.argv[1], sys.argv[2], sys.argv[3]
+    for SUBSET in SUBSETS:
+        processor(DIR, SUBSET, False)
diff --git a/TTS/encoder/utils/training.py b/TTS/encoder/utils/training.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c58a232e7a146bb24718700527ab80e62a1ab1a
--- /dev/null
+++ b/TTS/encoder/utils/training.py
@@ -0,0 +1,99 @@
+import os
+from dataclasses import dataclass, field
+
+from coqpit import Coqpit
+from trainer import TrainerArgs, get_last_checkpoint
+from trainer.logging import logger_factory
+from trainer.logging.console_logger import ConsoleLogger
+
+from TTS.config import load_config, register_config
+from TTS.tts.utils.text.characters import parse_symbols
+from TTS.utils.generic_utils import get_experiment_folder_path, get_git_branch
+from TTS.utils.io import copy_model_files
+
+
+# Trainer arguments extended with the path of the model config file.
+@dataclass
+class TrainArgs(TrainerArgs):
+ # Defaults to None, in which case process_args builds the config from the
+ # command-line overrides instead of loading it from a file.
+ config_path: str = field(default=None, metadata={"help": "Path to the config file."})
+
+
+def getarguments():
+    """Return an argument parser built from a default `TrainArgs` (no argument prefix)."""
+    return TrainArgs().init_argparse(arg_prefix="")
+
+
+def process_args(args, config=None):
+ """Process parsed command line arguments and initialize the config if not provided.
+ Args:
+ args (argparse.Namespace or dict like): Parsed input arguments. A tuple is
+ unpacked as (args, coqpit_overrides), i.e. the result of `parse_known_args`.
+ config (Coqpit): Model config. If none, it is generated from `args`. Defaults to None.
+ Returns:
+ c (TTS.utils.io.AttrDict): Config parameters.
+ out_path (str): Path to save models and logging.
+ audio_path (str): Path to save generated test audios.
+ c_logger (TTS.utils.console_logger.ConsoleLogger): Class that does
+ logging to the console.
+ dashboard_logger (WandbLogger or TensorboardLogger): Class that does the dashboard Logging
+ TODO:
+ - Interactive config definition.
+ """
+ # NOTE(review): `coqpit_overrides` is only bound when `args` is a tuple, yet it
+ # is used unconditionally further down - confirm that callers always pass the
+ # (namespace, overrides) pair.
+ if isinstance(args, tuple):
+ args, coqpit_overrides = args
+ if args.continue_path:
+ # continue a previous training from its output folder
+ experiment_path = args.continue_path
+ args.config_path = os.path.join(args.continue_path, "config.json")
+ args.restore_path, best_model = get_last_checkpoint(args.continue_path)
+ if not args.best_path:
+ args.best_path = best_model
+ # init config if not already defined
+ if config is None:
+ if args.config_path:
+ # init from a file
+ config = load_config(args.config_path)
+ else:
+ # init from console args
+ from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel
+
+ config_base = BaseTrainingConfig()
+ config_base.parse_known_args(coqpit_overrides)
+ config = register_config(config_base.model)()
+ # override values from command-line args
+ config.parse_known_args(coqpit_overrides, relaxed_parser=True)
+ # A fresh experiment folder is created only when not continuing a run.
+ experiment_path = args.continue_path
+ if not experiment_path:
+ experiment_path = get_experiment_folder_path(config.output_path, config.run_name)
+ audio_path = os.path.join(experiment_path, "test_audios")
+ config.output_log_path = experiment_path
+ # setup rank 0 process in distributed training
+ dashboard_logger = None
+ if args.rank == 0:
+ # Extra fields passed to copy_model_files together with the config.
+ new_fields = {}
+ if args.restore_path:
+ new_fields["restore_path"] = args.restore_path
+ new_fields["github_branch"] = get_git_branch()
+ # if model characters are not set in the config file
+ # save the default set to the config file for future
+ # compatibility.
+ if config.has("characters") and config.characters is None:
+ used_characters = parse_symbols()
+ new_fields["characters"] = used_characters
+ copy_model_files(config, experiment_path, new_fields)
+ dashboard_logger = logger_factory(config, experiment_path)
+ c_logger = ConsoleLogger()
+ return config, experiment_path, audio_path, c_logger, dashboard_logger
+
+
+def init_arguments():
+    """Return an argument parser built from a default `TrainArgs` (no argument prefix)."""
+    return TrainArgs().init_argparse(arg_prefix="")
+
+
+def init_training(config: Coqpit = None):
+    """Initialization of a training run.
+
+    Parses the command line and hands the result to `process_args`.
+
+    Returns:
+        The tuple (parsed args, config, output path, audio path, console
+        logger, dashboard logger).
+    """
+    # parse_known_args yields the (namespace, unparsed overrides) pair.
+    parsed = init_arguments().parse_known_args()
+    config, out_path, audio_path, console_logger, dashboard_logger = process_args(parsed, config)
+    namespace = parsed[0]
+    return namespace, config, out_path, audio_path, console_logger, dashboard_logger
diff --git a/TTS/encoder/utils/visual.py b/TTS/encoder/utils/visual.py
new file mode 100644
index 0000000000000000000000000000000000000000..f2db2f3fa3408f96a04f7932438f175c6ec19c51
--- /dev/null
+++ b/TTS/encoder/utils/visual.py
@@ -0,0 +1,50 @@
+import matplotlib
+import matplotlib.pyplot as plt
+import numpy as np
+import umap
+
+matplotlib.use("Agg")
+
+
+# RGB palette of 13 colors scaled to [0, 1]; `plot_embeddings` picks one row per class index.
+# NOTE: the builtin `float` is used as dtype because the `np.float` alias was removed in
+# NumPy 1.24 and raises AttributeError there.
+colormap = (
+    np.array(
+        [
+            [76, 255, 0],
+            [0, 127, 70],
+            [255, 0, 0],
+            [255, 217, 38],
+            [0, 135, 255],
+            [165, 0, 165],
+            [255, 167, 255],
+            [0, 255, 255],
+            [255, 96, 38],
+            [142, 76, 0],
+            [33, 0, 127],
+            [0, 0, 0],
+            [183, 183, 183],
+        ],
+        dtype=float,
+    )
+    / 255
+)
+
+
+def plot_embeddings(embeddings, num_classes_in_batch):
+    """Project embeddings to 2D with UMAP and scatter-plot them colored by class.
+
+    Labels are generated as ``num_classes_in_batch`` consecutive groups of equal size, so the rows
+    of ``embeddings`` are presumably ordered class by class (TODO confirm against the caller).
+    When the batch holds more than 10 classes only the rows of the first 10 are plotted.
+
+    Args:
+        embeddings: Array exposing ``shape[0]`` and row slicing, one row per utterance.
+        num_classes_in_batch (int): Number of classes contained in ``embeddings``.
+
+    Returns:
+        The matplotlib figure. As a side effect the plot is also saved as "umap" in the
+        current working directory.
+    """
+    max_classes = 10
+    # computed before any truncation so the group size reflects the full batch
+    per_class = embeddings.shape[0] // num_classes_in_batch
+
+    if num_classes_in_batch > max_classes:
+        num_classes_in_batch = max_classes
+        embeddings = embeddings[: num_classes_in_batch * per_class]
+
+    projection = umap.UMAP().fit_transform(embeddings)
+    labels = np.repeat(np.arange(num_classes_in_batch), per_class)
+    point_colors = [colormap[label] for label in labels]
+
+    fig, ax = plt.subplots(figsize=(16, 10))
+    ax.scatter(projection[:, 0], projection[:, 1], c=point_colors)
+    plt.gca().set_aspect("equal", "datalim")
+    plt.title("UMAP projection")
+    plt.tight_layout()
+    plt.savefig("umap")
+    return fig
diff --git a/TTS/model.py b/TTS/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae6be7b444695756c00c4faa8f2f6c787dfcf9d8
--- /dev/null
+++ b/TTS/model.py
@@ -0,0 +1,59 @@
+from abc import abstractmethod
+from typing import Dict
+
+import torch
+from coqpit import Coqpit
+from trainer import TrainerModel
+
+# pylint: skip-file
+
+
+class BaseTrainerModel(TrainerModel):
+    """Abstract base of 🐸TTS models: ``TrainerModel`` plus the hooks 🐸TTS requires.
+
+    Every new 🐸TTS model must inherit from it.
+    """
+
+    @staticmethod
+    @abstractmethod
+    def init_from_config(config: Coqpit):
+        """Build the model and all of its attributes from the given config.
+
+        Subclasses override this according to the needs of the model.
+
+        Args:
+            config (Coqpit): Model configuration.
+        """
+        ...
+
+    @abstractmethod
+    def inference(self, input: torch.Tensor, aux_input={}) -> Dict:
+        """Run the forward pass used at inference time.
+
+        The returned dictionary carries the main model output under the key ```model_outputs```;
+        any number of auxiliary outputs may be added under other keys.
+
+        `*kwargs` is deliberately not used since it is problematic with the TorchScript API.
+
+        Args:
+            input (torch.Tensor): Model input.
+            aux_input (Dict): Auxiliary inputs like speaker embeddings, durations etc.
+
+        Returns:
+            Dict: Model outputs, with the main output stored under ```model_outputs```.
+        """
+        ...
+        return {"model_outputs": None}
+
+    @abstractmethod
+    def load_checkpoint(
+        self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True, cache=False
+    ) -> None:
+        """Load a model checkpoint file and get the model ready for training or inference.
+
+        Args:
+            config (Coqpit): Model configuration.
+            checkpoint_path (str): Path to the model checkpoint file.
+            eval (bool, optional): If true, init model for inference else for training. Defaults to False.
+            strict (bool, optional): Match all checkpoint keys to model's keys. Defaults to True.
+            cache (bool, optional): If True, cache the file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to False.
+        """
+        ...
diff --git a/TTS/server/README.md b/TTS/server/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..270656c4e39dc11636efbb1ba51eba7c9b4a8f04
--- /dev/null
+++ b/TTS/server/README.md
@@ -0,0 +1,18 @@
+# :frog: TTS demo server
+Before you use the server, make sure you [install](https://github.com/coqui-ai/TTS/tree/dev#install-tts) :frog: TTS properly. Then, you can follow the steps below.
+
+**Note:** If you install :frog:TTS using ```pip```, you can also use the ```tts-server``` command on the terminal.
+
+Example runs:
+
+List officially released models.
+```python TTS/server/server.py --list_models ```
+
+Run the server with the official models.
+```python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan```
+
+Run the server with the official models on a GPU.
+```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan --use_cuda True```
+
+Run the server with custom models.
+```python TTS/server/server.py --model_path /path/to/tts/model.pth --config_path /path/to/tts/config.json --vocoder_path /path/to/vocoder/model.pth --vocoder_config_path /path/to/vocoder/config.json```
diff --git a/TTS/server/__init__.py b/TTS/server/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/TTS/server/conf.json b/TTS/server/conf.json
new file mode 100644
index 0000000000000000000000000000000000000000..49b6c09c3848a224dfb39a1f653aa1b289a4b6e5
--- /dev/null
+++ b/TTS/server/conf.json
@@ -0,0 +1,12 @@
+{
+ "tts_path":"/media/erogol/data_ssd/Models/libri_tts/5049/", // tts model root folder
+ "tts_file":"best_model.pth", // tts checkpoint file
+ "tts_config":"config.json", // tts config.json file
+ "tts_speakers": null, // json file listing speaker ids. null if no speaker embedding.
+ "vocoder_config":null,
+ "vocoder_file": null,
+ "is_wavernn_batched":true,
+ "port": 5002,
+ "use_cuda": true,
+ "debug": true
+}
diff --git a/TTS/server/server.py b/TTS/server/server.py
new file mode 100644
index 0000000000000000000000000000000000000000..7324e801114c56aaee5fc4c81a6523d34662419d
--- /dev/null
+++ b/TTS/server/server.py
@@ -0,0 +1,249 @@
+#!flask/bin/python
+import argparse
+import io
+import json
+import os
+import sys
+from pathlib import Path
+from threading import Lock
+from typing import Union
+from urllib.parse import parse_qs
+
+from flask import Flask, render_template, render_template_string, request, send_file
+
+from TTS.config import load_config
+from TTS.utils.manage import ModelManager
+from TTS.utils.synthesizer import Synthesizer
+
+
+def create_argparser():
+    """Build the command-line parser of the demo server.
+
+    Boolean options treat "true", "1" and "yes" (case-insensitive) as True and any other
+    value as False. ``--list_models`` may also be passed without a value, which means True.
+
+    Returns:
+        argparse.ArgumentParser: Parser holding every server option.
+    """
+
+    def convert_boolean(x):
+        return x.lower() in ["true", "1", "yes"]
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--list_models",
+        type=convert_boolean,
+        nargs="?",
+        const=True,
+        default=False,
+        help="list available pre-trained tts and vocoder models.",
+    )
+    parser.add_argument(
+        "--model_name",
+        type=str,
+        default="tts_models/en/ljspeech/tacotron2-DDC",
+        # The placeholders were missing from this message ("in format //"); they are spelled out
+        # to match the four-part shape of the default value above.
+        help="Name of one of the pre-trained tts models in format <model_type>/<language>/<dataset>/<model_name>",
+    )
+    parser.add_argument("--vocoder_name", type=str, default=None, help="name of one of the released vocoder models.")
+
+    # Args for running custom models
+    parser.add_argument("--config_path", default=None, type=str, help="Path to model config file.")
+    parser.add_argument(
+        "--model_path",
+        type=str,
+        default=None,
+        help="Path to model file.",
+    )
+    parser.add_argument(
+        "--vocoder_path",
+        type=str,
+        help="Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).",
+        default=None,
+    )
+    parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None)
+    parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
+    parser.add_argument("--port", type=int, default=5002, help="port to listen on.")
+    parser.add_argument("--use_cuda", type=convert_boolean, default=False, help="true to use CUDA.")
+    parser.add_argument("--debug", type=convert_boolean, default=False, help="true to enable Flask debug mode.")
+    parser.add_argument("--show_details", type=convert_boolean, default=False, help="Generate model detail page.")
+    return parser
+
+
+# parse the args
+args = create_argparser().parse_args()
+
+# `.models.json` is looked up one directory above the folder holding this file
+path = Path(__file__).parent / "../.models.json"
+manager = ModelManager(path)
+
+# CASE1: list pre-trained TTS models and exit.
+# (This check used to appear twice; the second copy could never run and was removed.)
+if args.list_models:
+    manager.list_models()
+    sys.exit()
+
+# update in-use models to the specified released models.
+model_path = None
+config_path = None
+speakers_file_path = None
+vocoder_path = None
+vocoder_config_path = None
+
+# CASE2: load pre-trained model paths
+if args.model_name is not None and not args.model_path:
+    model_path, config_path, model_item = manager.download_model(args.model_name)
+    # fall back to the vocoder listed for the model when none is requested explicitly
+    args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name
+
+if args.vocoder_name is not None and not args.vocoder_path:
+    vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)
+
+# CASE3: set custom model paths
+if args.model_path is not None:
+    model_path = args.model_path
+    config_path = args.config_path
+    speakers_file_path = args.speakers_file_path
+
+if args.vocoder_path is not None:
+    vocoder_path = args.vocoder_path
+    vocoder_config_path = args.vocoder_config_path
+
+# load models
+synthesizer = Synthesizer(
+    tts_checkpoint=model_path,
+    tts_config_path=config_path,
+    tts_speakers_file=speakers_file_path,
+    tts_languages_file=None,
+    vocoder_checkpoint=vocoder_path,
+    vocoder_config=vocoder_config_path,
+    encoder_checkpoint="",
+    encoder_config="",
+    use_cuda=args.use_cuda,
+)
+
+use_multi_speaker = hasattr(synthesizer.tts_model, "num_speakers") and (
+    synthesizer.tts_model.num_speakers > 1 or synthesizer.tts_speakers_file is not None
+)
+speaker_manager = getattr(synthesizer.tts_model, "speaker_manager", None)
+
+use_multi_language = hasattr(synthesizer.tts_model, "num_languages") and (
+    synthesizer.tts_model.num_languages > 1 or synthesizer.tts_languages_file is not None
+)
+language_manager = getattr(synthesizer.tts_model, "language_manager", None)
+
+# TODO: set this from SpeakerManager
+use_gst = synthesizer.tts_config.get("use_gst", False)
+app = Flask(__name__)
+
+
+def style_wav_uri_to_dict(style_wav: str) -> Union[str, dict]:
+    """Turn a ``style_wav`` uri into the value used for styling.
+
+    The uri is either the path of a wav file located on the server (used for style transfer)
+    or a JSON document describing gst tokens/values.
+
+    Args:
+        style_wav (str): uri
+
+    Returns:
+        Union[str, dict]: path to file (str) or gst style (dict). ``None`` when ``style_wav``
+            is empty.
+    """
+    if not style_wav:
+        return None
+
+    is_server_wav = os.path.isfile(style_wav) and style_wav.endswith(".wav")
+    if is_server_wav:
+        return style_wav  # an existing .wav file located on the server
+
+    # anything else is parsed as JSON: a gst dictionary {token1_id : token1_weight, ...}
+    return json.loads(style_wav)
+
+
+@app.route("/")
+def index():
+    """Render the landing page with the speaker, language and GST options of the loaded model."""
+    # the id maps are only available when the model exposes the matching manager
+    speaker_ids = None if speaker_manager is None else speaker_manager.name_to_id
+    language_ids = None if language_manager is None else language_manager.name_to_id
+    return render_template(
+        "index.html",
+        show_details=args.show_details,
+        use_multi_speaker=use_multi_speaker,
+        use_multi_language=use_multi_language,
+        speaker_ids=speaker_ids,
+        language_ids=language_ids,
+        use_gst=use_gst,
+    )
+
+
+@app.route("/details")
+def details():
+    """Render the details page with the CLI arguments and the model/vocoder configs.
+
+    A config is passed to the template as ``None`` when its path is unset or is not an existing
+    file.
+    """
+    # `args` has no `tts_config` / `vocoder_config` attributes (the parser defines `config_path`
+    # and `vocoder_config_path`), so reading them raised AttributeError. The module-level paths
+    # resolved at start-up are used instead; they also cover models downloaded by name.
+    model_config = None
+    if config_path is not None and os.path.isfile(config_path):
+        model_config = load_config(config_path)
+
+    vocoder_config = None
+    if vocoder_config_path is not None and os.path.isfile(vocoder_config_path):
+        vocoder_config = load_config(vocoder_config_path)
+
+    return render_template(
+        "details.html",
+        show_details=args.show_details,
+        model_config=model_config,
+        vocoder_config=vocoder_config,
+        args=args.__dict__,
+    )
+
+
+# Shared by the synthesis endpoints so that only one request is synthesized at a time.
+lock = Lock()
+
+
+@app.route("/api/tts", methods=["GET"])
+def tts():
+    """Synthesize the ``text`` query parameter and send the result back as WAV audio.
+
+    Optional query parameters: ``speaker_id``, ``language_id`` and ``style_wav`` (converted by
+    ``style_wav_uri_to_dict``).
+    """
+    with lock:
+        query = request.args
+        text = query.get("text")
+        speaker_idx = query.get("speaker_id", "")
+        language_idx = query.get("language_id", "")
+        style_wav = style_wav_uri_to_dict(query.get("style_wav", ""))
+        print(f" > Model input: {text}")
+        print(f" > Speaker Idx: {speaker_idx}")
+        print(f" > Language Idx: {language_idx}")
+        wavs = synthesizer.tts(text, speaker_name=speaker_idx, language_name=language_idx, style_wav=style_wav)
+        # the audio is written to an in-memory buffer rather than to a file
+        out = io.BytesIO()
+        synthesizer.save_wav(wavs, out)
+    return send_file(out, mimetype="audio/wav")
+
+
+# Basic MaryTTS compatibility layer
+
+
+@app.route("/locales", methods=["GET"])
+def mary_tts_api_locales():
+    """MaryTTS-compatible /locales endpoint: reports the second "/"-separated part of the model name."""
+    # NOTE: We currently assume there is only one model active at the same time
+    fallback = ["", "en", "", "default"]
+    model_details = fallback if args.model_name is None else args.model_name.split("/")
+    return render_template_string("{{ locale }}\n", locale=model_details[1])
+
+
+@app.route("/voices", methods=["GET"])
+def mary_tts_api_voices():
+    """MaryTTS-compatible /voices endpoint: reports one voice as "<name> <locale> <gender>".
+
+    Name and locale are the fourth and second "/"-separated parts of the model name; the gender
+    is always reported as "u".
+    """
+    # NOTE: We currently assume there is only one model active at the same time
+    fallback = ["", "en", "", "default"]
+    model_details = fallback if args.model_name is None else args.model_name.split("/")
+    voice_name = model_details[3]
+    voice_locale = model_details[1]
+    return render_template_string(
+        "{{ name }} {{ locale }} {{ gender }}\n", name=voice_name, locale=voice_locale, gender="u"
+    )
+
+
+@app.route("/process", methods=["GET", "POST"])
+def mary_tts_api_process():
+    """MaryTTS-compatible /process endpoint: synthesize ``INPUT_TEXT`` and return WAV audio.
+
+    The text is read from the url-encoded request body for POST requests and from the query
+    string otherwise.
+    """
+    with lock:
+        if request.method != "POST":
+            text = request.args.get("INPUT_TEXT", "")
+        else:
+            form = parse_qs(request.get_data(as_text=True))
+            # NOTE: we ignore param. LOCALE and VOICE for now since we have only one active model
+            text = form.get("INPUT_TEXT", [""])[0]
+        print(f" > Model input: {text}")
+        wavs = synthesizer.tts(text)
+        out = io.BytesIO()
+        synthesizer.save_wav(wavs, out)
+    return send_file(out, mimetype="audio/wav")
+
+
+def main():
+    """Start the Flask server on host "::" using the port and debug flag given on the command line."""
+    app.run(host="::", port=args.port, debug=args.debug)
+
+
+# allow running the module directly as a script
+if __name__ == "__main__":
+    main()
diff --git a/TTS/server/static/coqui-log-green-TTS.png b/TTS/server/static/coqui-log-green-TTS.png
new file mode 100644
index 0000000000000000000000000000000000000000..6ad188b8c03a170097c0393c6769996f03cf9054
Binary files /dev/null and b/TTS/server/static/coqui-log-green-TTS.png differ
diff --git a/TTS/server/templates/details.html b/TTS/server/templates/details.html
new file mode 100644
index 0000000000000000000000000000000000000000..51c9ed85a83ac0aab045623ee1e6c430fbe51b9d
--- /dev/null
+++ b/TTS/server/templates/details.html
@@ -0,0 +1,131 @@
+
+
+
+
+
+
+
+
+
+
+ TTS engine
+
+
+
+
+
+
+
+
+
+
+
+ {% if show_details == true %}
+
+
+ Model details
+
+
+
+
+ CLI arguments:
+
+
+
CLI key
+
Value
+
+
+ {% for key, value in args.items() %}
+
+
+
{{ key }}
+
{{ value }}
+
+
+ {% endfor %}
+
+
+
+
+
+
+ {% if model_config != None %}
+
+
+ Model config:
+
+
+
+
Key
+
Value
+
+
+
+ {% for key, value in model_config.items() %}
+
+
+
{{ key }}
+
{{ value }}
+
+
+ {% endfor %}
+
+
+
+
+ {% endif %}
+
+
+
+
+
+
+ {% if vocoder_config != None %}
+
+ Vocoder model config:
+
+
+
+
Key
+
Value
+
+
+
+ {% for key, value in vocoder_config.items() %}
+
+
+
{{ key }}
+
{{ value }}
+
+
+ {% endfor %}
+
+
+
+
+ {% endif %}
+
+
+ {% else %}
+
+ Please start server with --show_details=true to see details.
+