Upload 685 files
This view is limited to 50 files because it contains too many changes.
- .gitattributes +1 -0
- .gitignore +61 -0
- LICENSE +21 -0
- app.py +84 -0
- bins/calc_metrics.py +140 -0
- bins/svc/inference.py +262 -0
- bins/svc/preprocess.py +182 -0
- bins/svc/train.py +107 -0
- bins/tta/inference.py +94 -0
- bins/tta/preprocess.py +194 -0
- bins/tta/train_tta.py +77 -0
- bins/tts/inference.py +162 -0
- bins/tts/preprocess.py +253 -0
- bins/tts/train.py +107 -0
- bins/vocoder/inference.py +114 -0
- bins/vocoder/preprocess.py +150 -0
- bins/vocoder/train.py +90 -0
- config/audioldm.json +92 -0
- config/autoencoderkl.json +69 -0
- config/base.json +220 -0
- config/comosvc.json +216 -0
- config/diffusion.json +227 -0
- config/fs2.json +117 -0
- config/transformer.json +180 -0
- config/tts.json +26 -0
- config/valle.json +52 -0
- config/vits.json +101 -0
- config/vocoder.json +84 -0
- egs/datasets/README.md +381 -0
- egs/metrics/README.md +93 -0
- egs/metrics/run.sh +40 -0
- egs/svc/DiffComoSVC/README.md +234 -0
- egs/svc/DiffComoSVC/exp_config.json +143 -0
- egs/svc/DiffComoSVC/run.sh +150 -0
- egs/svc/MultipleContentsSVC/README.md +153 -0
- egs/svc/MultipleContentsSVC/exp_config.json +126 -0
- egs/svc/MultipleContentsSVC/run.sh +150 -0
- egs/svc/README.md +34 -0
- egs/svc/TransformerSVC/README.md +164 -0
- egs/svc/TransformerSVC/exp_config.json +108 -0
- egs/svc/TransformerSVC/run.sh +150 -0
- egs/svc/_template/run.sh +150 -0
- egs/tta/README.md +19 -0
- egs/tta/RECIPE.md +156 -0
- egs/tta/audioldm/exp_config.json +90 -0
- egs/tta/audioldm/exp_config_base.json +11 -0
- egs/tta/audioldm/exp_config_latent_4_10_78.json +88 -0
- egs/tta/audioldm/run_inference.sh +52 -0
- egs/tta/audioldm/run_inference_latent_4_10_78.sh +52 -0
- egs/tta/audioldm/run_train.sh +26 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+imgs/vocoder/gan/MSSBCQTD.png filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,61 @@
+# Mac OS files
+.DS_Store
+
+# IDEs
+.idea
+.vs
+.vscode
+.cache
+
+# GitHub files
+.github
+
+# Byte-compiled / optimized / DLL / cached files
+__pycache__/
+*.py[cod]
+*$py.class
+*.pyc
+.temp
+*.c
+*.so
+*.o
+
+# Developing mode
+_*.sh
+_*.json
+*.lst
+yard*
+*.out
+evaluation/evalset_selection
+mfa
+egs/svc/*wavmark
+egs/svc/custom
+egs/svc/*/dev*
+egs/svc/dev_exp_config.json
+bins/svc/demo*
+bins/svc/preprocess_custom.py
+data
+ckpts
+
+# Data and ckpt
+*.pkl
+*.pt
+*.npy
+*.npz
+*.tar.gz
+*.ckpt
+*.wav
+*.flac
+pretrained/wenet/*conformer_exp
+
+# Runtime data dirs
+processed_data
+data
+model_ckpt
+logs
+*.ipynb
+*.lst
+source_audio
+result
+conversion_results
+get_available_gpu.py
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Amphion
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
app.py
ADDED
@@ -0,0 +1,84 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import subprocess
+
+command_to_run = "cd ./modules/monotonic_align;mkdir -p monotonic_align;python setup.py build_ext --inplace;cd /home/user/app"
+
+try:
+    result = subprocess.check_output(command_to_run, shell=True, text=True)
+    print("Command output:")
+    print(result)
+except subprocess.CalledProcessError as e:
+    print(f"Command failed with return code {e.returncode}")
+
+import gradio as gr
+import os
+import inference
+
+SUPPORTED_SPEAKERS = {
+    "92": "hifitts_92",
+    "6097": "hifitts_6097",
+    "6670": "hifitts_6670",
+    "6671": "hifitts_6671",
+    "8051": "hifitts_8051",
+    "9017": "hifitts_9017",
+    "9136": "hifitts_9136",
+    "11614": "hifitts_11614",
+    "11697": "hifitts_11697",
+    "12787": "hifitts_12787",
+}
+
+
+def tts_inference(
+    input_text,
+    target_speaker
+):
+
+    args_list = ["--config", "./egs/tts/vits_hifitts/exp_config.json"]
+    args_list += ["--checkpoint_path", "./latest-checkpoint"]
+    args_list += ["--speaker_name", target_speaker]
+    args_list += ["--text", input_text]
+    args_list += ["--mode", "single"]
+    args_list += ["--output_dir", "result"]
+    args_list += ["--log_level", "debug"]
+
+    os.environ["WORK_DIR"] = "./"
+    inference.main(args_list)
+
+    ### Display ###
+    result_file = os.path.join(
+        "result/{}.wav".format(target_speaker)
+    )
+    return result_file
+
+
+demo_inputs = [
+    gr.Textbox(
+        label="Input text",
+        type="text",
+        lines=1,
+        max_lines=20
+    ),
+    gr.Radio(
+        choices=list(SUPPORTED_SPEAKERS.keys()),
+        label="Target Speaker",
+        value="92"
+    )
+]
+
+demo_output = gr.Audio(label="")
+
+
+demo = gr.Interface(
+    fn=tts_inference,
+    inputs=demo_inputs,
+    outputs=demo_output,
+    title="Amphion HifiTTS Text-to-Speech Demo",
+)
+
+if __name__ == "__main__":
+    demo.launch(share=True)
bins/calc_metrics.py
ADDED
@@ -0,0 +1,140 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+import numpy as np
+import json
+import argparse
+
+from glob import glob
+from tqdm import tqdm
+from collections import defaultdict
+
+from evaluation.metrics.energy.energy_rmse import extract_energy_rmse
+from evaluation.metrics.energy.energy_pearson_coefficients import (
+    extract_energy_pearson_coeffcients,
+)
+from evaluation.metrics.f0.f0_pearson_coefficients import extract_fpc
+from evaluation.metrics.f0.f0_periodicity_rmse import extract_f0_periodicity_rmse
+from evaluation.metrics.f0.f0_rmse import extract_f0rmse
+from evaluation.metrics.f0.v_uv_f1 import extract_f1_v_uv
+from evaluation.metrics.intelligibility.character_error_rate import extract_cer
+from evaluation.metrics.intelligibility.word_error_rate import extract_wer
+from evaluation.metrics.similarity.speaker_similarity import extract_speaker_similarity
+from evaluation.metrics.spectrogram.frechet_distance import extract_fad
+from evaluation.metrics.spectrogram.mel_cepstral_distortion import extract_mcd
+from evaluation.metrics.spectrogram.multi_resolution_stft_distance import extract_mstft
+from evaluation.metrics.spectrogram.pesq import extract_pesq
+from evaluation.metrics.spectrogram.scale_invariant_signal_to_distortion_ratio import (
+    extract_si_sdr,
+)
+from evaluation.metrics.spectrogram.scale_invariant_signal_to_noise_ratio import (
+    extract_si_snr,
+)
+from evaluation.metrics.spectrogram.short_time_objective_intelligibility import (
+    extract_stoi,
+)
+
+METRIC_FUNC = {
+    "energy_rmse": extract_energy_rmse,
+    "energy_pc": extract_energy_pearson_coeffcients,
+    "fpc": extract_fpc,
+    "f0_periodicity_rmse": extract_f0_periodicity_rmse,
+    "f0rmse": extract_f0rmse,
+    "v_uv_f1": extract_f1_v_uv,
+    "cer": extract_cer,
+    "wer": extract_wer,
+    "speaker_similarity": extract_speaker_similarity,
+    "fad": extract_fad,
+    "mcd": extract_mcd,
+    "mstft": extract_mstft,
+    "pesq": extract_pesq,
+    "si_sdr": extract_si_sdr,
+    "si_snr": extract_si_snr,
+    "stoi": extract_stoi,
+}
+
+
+def calc_metric(ref_dir, deg_dir, dump_dir, metrics, fs=None):
+    result = defaultdict()
+
+    for metric in tqdm(metrics):
+        if metric in ["fad", "speaker_similarity"]:
+            result[metric] = str(METRIC_FUNC[metric](ref_dir, deg_dir))
+            continue
+
+        audios_ref = []
+        audios_deg = []
+
+        files = glob(ref_dir + "/*.wav")
+
+        for file in files:
+            audios_ref.append(file)
+            uid = file.split("/")[-1].split(".wav")[0]
+            file_gt = deg_dir + "/{}.wav".format(uid)
+            audios_deg.append(file_gt)
+
+        if metric in ["v_uv_f1"]:
+            tp_total = 0
+            fp_total = 0
+            fn_total = 0
+
+            for i in tqdm(range(len(audios_ref))):
+                audio_ref = audios_ref[i]
+                audio_deg = audios_deg[i]
+                tp, fp, fn = METRIC_FUNC[metric](audio_ref, audio_deg, fs)
+                tp_total += tp
+                fp_total += fp
+                fn_total += fn
+
+            result[metric] = str(tp_total / (tp_total + (fp_total + fn_total) / 2))
+        else:
+            scores = []
+
+            for i in tqdm(range(len(audios_ref))):
+                audio_ref = audios_ref[i]
+                audio_deg = audios_deg[i]
+
+                score = METRIC_FUNC[metric](
+                    audio_ref=audio_ref, audio_deg=audio_deg, fs=fs
+                )
+                if not np.isnan(score):
+                    scores.append(score)
+
+            scores = np.array(scores)
+            result["{}_mean".format(metric)] = str(np.mean(scores))
+            result["{}_std".format(metric)] = str(np.std(scores))
+
+    data = json.dumps(result, indent=4)
+
+    with open(os.path.join(dump_dir, "result.json"), "w", newline="\n") as f:
+        f.write(data)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--ref_dir",
+        type=str,
+        help="Path to the target audio folder.",
+    )
+    parser.add_argument(
+        "--deg_dir",
+        type=str,
+        help="Path to the reference audio folder.",
+    )
+    parser.add_argument(
+        "--dump_dir",
+        type=str,
+        help="Path to dump the results.",
+    )
+    parser.add_argument(
+        "--metrics",
+        nargs="+",
+        help="Metrics used to evaluate.",
+    )
+    args = parser.parse_args()
+
+    calc_metric(args.ref_dir, args.deg_dir, args.dump_dir, args.metrics)
bins/svc/inference.py
ADDED
@@ -0,0 +1,262 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import os
+import glob
+from tqdm import tqdm
+import json
+import torch
+import time
+
+from models.svc.diffusion.diffusion_inference import DiffusionInference
+from models.svc.comosvc.comosvc_inference import ComoSVCInference
+from models.svc.transformer.transformer_inference import TransformerInference
+from utils.util import load_config
+from utils.audio_slicer import split_audio, merge_segments_encodec
+from processors import acoustic_extractor, content_extractor
+
+
+def build_inference(args, cfg, infer_type="from_dataset"):
+    supported_inference = {
+        "DiffWaveNetSVC": DiffusionInference,
+        "DiffComoSVC": ComoSVCInference,
+        "TransformerSVC": TransformerInference,
+    }
+
+    inference_class = supported_inference[cfg.model_type]
+    return inference_class(args, cfg, infer_type)
+
+
+def prepare_for_audio_file(args, cfg, num_workers=1):
+    preprocess_path = cfg.preprocess.processed_dir
+    audio_name = cfg.inference.source_audio_name
+    temp_audio_dir = os.path.join(preprocess_path, audio_name)
+
+    ### eval file
+    t = time.time()
+    eval_file = prepare_source_eval_file(cfg, temp_audio_dir, audio_name)
+    args.source = eval_file
+    with open(eval_file, "r") as f:
+        metadata = json.load(f)
+    print("Prepare for meta eval data: {:.1f}s".format(time.time() - t))
+
+    ### acoustic features
+    t = time.time()
+    acoustic_extractor.extract_utt_acoustic_features_serial(
+        metadata, temp_audio_dir, cfg
+    )
+    acoustic_extractor.cal_mel_min_max(
+        dataset=audio_name, output_path=preprocess_path, cfg=cfg, metadata=metadata
+    )
+    acoustic_extractor.cal_pitch_statistics_svc(
+        dataset=audio_name, output_path=preprocess_path, cfg=cfg, metadata=metadata
+    )
+    print("Prepare for acoustic features: {:.1f}s".format(time.time() - t))
+
+    ### content features
+    t = time.time()
+    content_extractor.extract_utt_content_features_dataloader(
+        cfg, metadata, num_workers
+    )
+    print("Prepare for content features: {:.1f}s".format(time.time() - t))
+    return args, cfg, temp_audio_dir
+
+
+def merge_for_audio_segments(audio_files, args, cfg):
+    audio_name = cfg.inference.source_audio_name
+    target_singer_name = args.target_singer
+
+    merge_segments_encodec(
+        wav_files=audio_files,
+        fs=cfg.preprocess.sample_rate,
+        output_path=os.path.join(
+            args.output_dir, "{}_{}.wav".format(audio_name, target_singer_name)
+        ),
+        overlap_duration=cfg.inference.segments_overlap_duration,
+    )
+
+    for tmp_file in audio_files:
+        os.remove(tmp_file)
+
+
+def prepare_source_eval_file(cfg, temp_audio_dir, audio_name):
+    """
+    Prepare the eval file (json) for an audio
+    """
+
+    audio_chunks_results = split_audio(
+        wav_file=cfg.inference.source_audio_path,
+        target_sr=cfg.preprocess.sample_rate,
+        output_dir=os.path.join(temp_audio_dir, "wavs"),
+        max_duration_of_segment=cfg.inference.segments_max_duration,
+        overlap_duration=cfg.inference.segments_overlap_duration,
+    )
+
+    metadata = []
+    for i, res in enumerate(audio_chunks_results):
+        res["index"] = i
+        res["Dataset"] = audio_name
+        res["Singer"] = audio_name
+        res["Uid"] = "{}_{}".format(audio_name, res["Uid"])
+        metadata.append(res)
+
+    eval_file = os.path.join(temp_audio_dir, "eval.json")
+    with open(eval_file, "w") as f:
+        json.dump(metadata, f, indent=4, ensure_ascii=False, sort_keys=True)
+
+    return eval_file
+
+
+def cuda_relevant(deterministic=False):
+    torch.cuda.empty_cache()
+    # TF32 on Ampere and above
+    torch.backends.cuda.matmul.allow_tf32 = True
+    torch.backends.cudnn.enabled = True
+    torch.backends.cudnn.allow_tf32 = True
+    # Deterministic
+    torch.backends.cudnn.deterministic = deterministic
+    torch.backends.cudnn.benchmark = not deterministic
+    torch.use_deterministic_algorithms(deterministic)
+
+
+def infer(args, cfg, infer_type):
+    # Build inference
+    t = time.time()
+    trainer = build_inference(args, cfg, infer_type)
+    print("Model Init: {:.1f}s".format(time.time() - t))
+
+    # Run inference
+    t = time.time()
+    output_audio_files = trainer.inference()
+    print("Model inference: {:.1f}s".format(time.time() - t))
+    return output_audio_files
+
+
+def build_parser():
+    r"""Build argument parser for inference.py.
+    Anything else should be put in an extra config YAML file.
+    """
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--config",
+        type=str,
+        required=True,
+        help="JSON/YAML file for configurations.",
+    )
+    parser.add_argument(
+        "--acoustics_dir",
+        type=str,
+        help="Acoustics model checkpoint directory. If a directory is given, "
+        "search for the latest checkpoint dir in the directory. If a specific "
+        "checkpoint dir is given, directly load the checkpoint.",
+    )
+    parser.add_argument(
+        "--vocoder_dir",
+        type=str,
+        required=True,
+        help="Vocoder checkpoint directory. Searching behavior is the same as "
+        "the acoustics one.",
+    )
+    parser.add_argument(
+        "--target_singer",
+        type=str,
+        required=True,
+        help="convert to a specific singer (e.g. --target_singers singer_id).",
+    )
+    parser.add_argument(
+        "--trans_key",
+        default=0,
+        help="0: no pitch shift; autoshift: pitch shift; int: key shift.",
+    )
+    parser.add_argument(
+        "--source",
+        type=str,
+        default="source_audio",
+        help="Source audio file or directory. If a JSON file is given, "
+        "inference from dataset is applied. If a directory is given, "
+        "inference from all wav/flac/mp3 audio files in the directory is applied. "
+        "Default: inference from all wav/flac/mp3 audio files in ./source_audio",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="conversion_results",
+        help="Output directory. Default: ./conversion_results",
+    )
+    parser.add_argument(
+        "--log_level",
+        type=str,
+        default="warning",
+        help="Logging level. Default: warning",
+    )
+    parser.add_argument(
+        "--keep_cache",
+        action="store_true",
+        default=True,
+        help="Keep cache files. Only applicable to inference from files.",
+    )
+    parser.add_argument(
+        "--diffusion_inference_steps",
+        type=int,
+        default=50,
+        help="Number of inference steps. Only applicable to diffusion inference.",
+    )
+    return parser
+
+
+def main():
+    ### Parse arguments and config
+    args = build_parser().parse_args()
+    cfg = load_config(args.config)
+
+    # CUDA settings
+    cuda_relevant()
+
+    if os.path.isdir(args.source):
+        ### Infer from file
+
+        # Get all the source audio files (.wav, .flac, .mp3)
+        source_audio_dir = args.source
+        audio_list = []
+        for suffix in ["wav", "flac", "mp3"]:
+            audio_list += glob.glob(
+                os.path.join(source_audio_dir, "**/*.{}".format(suffix)), recursive=True
+            )
+        print("There are {} source audios: ".format(len(audio_list)))
+
+        # Infer for every file as dataset
+        output_root_path = args.output_dir
+        for audio_path in tqdm(audio_list):
+            audio_name = audio_path.split("/")[-1].split(".")[0]
+            args.output_dir = os.path.join(output_root_path, audio_name)
+            print("\n{}\nConversion for {}...\n".format("*" * 10, audio_name))
+
+            cfg.inference.source_audio_path = audio_path
+            cfg.inference.source_audio_name = audio_name
+            cfg.inference.segments_max_duration = 10.0
+            cfg.inference.segments_overlap_duration = 1.0
+
+            # Prepare metadata and features
+            args, cfg, cache_dir = prepare_for_audio_file(args, cfg)
+
+            # Infer from file
+            output_audio_files = infer(args, cfg, infer_type="from_file")
+
+            # Merge the split segments
+            merge_for_audio_segments(output_audio_files, args, cfg)
+
+            # Keep or remove caches
+            if not args.keep_cache:
+                os.removedirs(cache_dir)
+
+    else:
+        ### Infer from dataset
+        infer(args, cfg, infer_type="from_dataset")
+
+
+if __name__ == "__main__":
+    main()
bins/svc/preprocess.py
ADDED
@@ -0,0 +1,182 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import faulthandler
+
+faulthandler.enable()
+
+import os
+import argparse
+import json
+from multiprocessing import cpu_count
+
+from utils.util import load_config
+from preprocessors.processor import preprocess_dataset
+from preprocessors.metadata import cal_metadata
+from processors import acoustic_extractor, content_extractor, data_augment
+
+
+def extract_acoustic_features(dataset, output_path, cfg, n_workers=1):
+    """Extract acoustic features of utterances in the dataset
+
+    Args:
+        dataset (str): name of dataset, e.g. opencpop
+        output_path (str): directory that stores train, test and feature files of datasets
+        cfg (dict): dictionary that stores configurations
+        n_workers (int, optional): num of processes to extract features in parallel. Defaults to 1.
+    """
+    types = ["train", "test"] if "eval" not in dataset else ["test"]
+    metadata = []
+    dataset_output = os.path.join(output_path, dataset)
+
+    for dataset_type in types:
+        dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
+        with open(dataset_file, "r") as f:
+            metadata.extend(json.load(f))
+
+    # acoustic_extractor.extract_utt_acoustic_features_parallel(
+    #     metadata, dataset_output, cfg, n_workers=n_workers
+    # )
+    acoustic_extractor.extract_utt_acoustic_features_serial(
+        metadata, dataset_output, cfg
+    )
+
+
+def extract_content_features(dataset, output_path, cfg, num_workers=1):
+    """Extract content features of utterances in the dataset
+
+    Args:
+        dataset (str): name of dataset, e.g. opencpop
+        output_path (str): directory that stores train, test and feature files of datasets
+        cfg (dict): dictionary that stores configurations
+    """
+    types = ["train", "test"] if "eval" not in dataset else ["test"]
+    metadata = []
+    for dataset_type in types:
+        dataset_output = os.path.join(output_path, dataset)
+        dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
+        with open(dataset_file, "r") as f:
+            metadata.extend(json.load(f))
+
+    content_extractor.extract_utt_content_features_dataloader(
+        cfg, metadata, num_workers
+    )
+
+
+def preprocess(cfg, args):
+    """Preprocess raw data of single or multiple datasets (in cfg.dataset)
+
+    Args:
+        cfg (dict): dictionary that stores configurations
+        args (ArgumentParser): specify the configuration file and num_workers
+    """
+    # Specify the output root path to save the processed data
+    output_path = cfg.preprocess.processed_dir
+    os.makedirs(output_path, exist_ok=True)
+
+    ## Split train and test sets
+    for dataset in cfg.dataset:
+        print("Preprocess {}...".format(dataset))
+        preprocess_dataset(
+            dataset,
+            cfg.dataset_path[dataset],
+            output_path,
+            cfg.preprocess,
+            is_custom_dataset=cfg.use_custom_dataset,
+        )
+
+    # Data augmentation: create new wav files with pitch shift, formant shift, equalizer, time stretch
+    try:
+        assert isinstance(
+            cfg.preprocess.data_augment, list
+        ), "Please provide a list of datasets that need to be augmented."
+        if len(cfg.preprocess.data_augment) > 0:
+            new_datasets_list = []
+            for dataset in cfg.preprocess.data_augment:
+                new_datasets = data_augment.augment_dataset(cfg, dataset)
+                new_datasets_list.extend(new_datasets)
+            cfg.dataset.extend(new_datasets_list)
+            print("Augmentation datasets: ", cfg.dataset)
+    except:
+        print("No Data Augmentation.")
+
+    # Dump metadata of datasets (singers, train/test durations, etc.)
+    cal_metadata(cfg)
+
+    ## Prepare the acoustic features
+    for dataset in cfg.dataset:
+        # Skip augmented datasets which do not need to extract acoustic features
+        # We will copy acoustic features from the original dataset later
+        if (
+            "pitch_shift" in dataset
+            or "formant_shift" in dataset
+            or "equalizer" in dataset
+        ):
+            continue
+        print(
+            "Extracting acoustic features for {} using {} workers ...".format(
+                dataset, args.num_workers
+            )
+        )
+        extract_acoustic_features(dataset, output_path, cfg, args.num_workers)
+        # Calculate the statistics of acoustic features
+        if cfg.preprocess.mel_min_max_norm:
+            acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)
+
+        if cfg.preprocess.extract_pitch:
+            acoustic_extractor.cal_pitch_statistics_svc(dataset, output_path, cfg)
+
+    # Copy acoustic features for augmented datasets by creating soft-links
+    for dataset in cfg.dataset:
+        if "pitch_shift" in dataset:
+            src_dataset = dataset.replace("_pitch_shift", "")
+            src_dataset_dir = os.path.join(output_path, src_dataset)
+        elif "formant_shift" in dataset:
+            src_dataset = dataset.replace("_formant_shift", "")
+            src_dataset_dir = os.path.join(output_path, src_dataset)
+        elif "equalizer" in dataset:
+            src_dataset = dataset.replace("_equalizer", "")
+            src_dataset_dir = os.path.join(output_path, src_dataset)
+        else:
+            continue
+        dataset_dir = os.path.join(output_path, dataset)
+        metadata = []
+        for split in ["train", "test"] if not "eval" in dataset else ["test"]:
+            metadata_file_path = os.path.join(src_dataset_dir, "{}.json".format(split))
+            with open(metadata_file_path, "r") as f:
+                metadata.extend(json.load(f))
+        print("Copying acoustic features for {}...".format(dataset))
+        acoustic_extractor.copy_acoustic_features(
+            metadata, dataset_dir, src_dataset_dir, cfg
+        )
+        if cfg.preprocess.mel_min_max_norm:
+            acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)
+
+        if cfg.preprocess.extract_pitch:
+            acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)
+
+    # Prepare the content features
+    for dataset in cfg.dataset:
+        print("Extracting content features for {}...".format(dataset))
+        extract_content_features(dataset, output_path, cfg, args.num_workers)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--config", default="config.json", help="json files for configurations."
+    )
+    parser.add_argument("--num_workers", type=int, default=int(cpu_count()))
+    parser.add_argument("--prepare_alignment", type=bool, default=False)
+
+    args = parser.parse_args()
+    cfg = load_config(args.config)
+
+    preprocess(cfg, args)
+
+
+if __name__ == "__main__":
+    main()
bins/svc/train.py
ADDED
@@ -0,0 +1,107 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+
+import torch
+
+from models.svc.diffusion.diffusion_trainer import DiffusionTrainer
+from models.svc.comosvc.comosvc_trainer import ComoSVCTrainer
+from models.svc.transformer.transformer_trainer import TransformerTrainer
+from utils.util import load_config
+
+
+def build_trainer(args, cfg):
+    supported_trainer = {
+        "DiffWaveNetSVC": DiffusionTrainer,
+        "DiffComoSVC": ComoSVCTrainer,
+        "TransformerSVC": TransformerTrainer,
+    }
+
+    trainer_class = supported_trainer[cfg.model_type]
+    trainer = trainer_class(args, cfg)
+    return trainer
+
+
+def cuda_relevant(deterministic=False):
+    torch.cuda.empty_cache()
+    # TF32 on Ampere and above
+    torch.backends.cuda.matmul.allow_tf32 = True
+    torch.backends.cudnn.enabled = True
+    torch.backends.cudnn.allow_tf32 = True
+    # Deterministic
+    torch.backends.cudnn.deterministic = deterministic
+    torch.backends.cudnn.benchmark = not deterministic
+    torch.use_deterministic_algorithms(deterministic)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--config",
+        default="config.json",
+        help="json files for configurations.",
+        required=True,
+    )
+    parser.add_argument(
+        "--exp_name",
+        type=str,
+        default="exp_name",
+        help="A specific name to note the experiment",
+        required=True,
+    )
+    parser.add_argument(
+        "--resume",
+        action="store_true",
+        help="If specified, to resume from the existing checkpoint.",
+    )
+    parser.add_argument(
+        "--resume_from_ckpt_path",
+        type=str,
+        default="",
+        help="The specific checkpoint path that you want to resume from.",
+    )
+    parser.add_argument(
+        "--resume_type",
+        type=str,
+        default="",
+        help="`resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights",
+    )
+
+    parser.add_argument(
+        "--log_level", default="warning", help="logging level (debug, info, warning)"
+    )
+    args = parser.parse_args()
+    cfg = load_config(args.config)
+
+    # Data Augmentation
+    if (
+        type(cfg.preprocess.data_augment) == list
+        and len(cfg.preprocess.data_augment) > 0
+    ):
+        new_datasets_list = []
+        for dataset in cfg.preprocess.data_augment:
+            new_datasets = [
+                f"{dataset}_pitch_shift" if cfg.preprocess.use_pitch_shift else None,
+                f"{dataset}_formant_shift"
+                if cfg.preprocess.use_formant_shift
+                else None,
+                f"{dataset}_equalizer" if cfg.preprocess.use_equalizer else None,
+                f"{dataset}_time_stretch" if cfg.preprocess.use_time_stretch else None,
+            ]
+            new_datasets_list.extend(filter(None, new_datasets))
+        cfg.dataset.extend(new_datasets_list)
+
+    # CUDA settings
+    cuda_relevant()
+
+    # Build trainer
+    trainer = build_trainer(args, cfg)
+
+    trainer.train_loop()
+
+
+if __name__ == "__main__":
+    main()
bins/tta/inference.py
ADDED
@@ -0,0 +1,94 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+from argparse import ArgumentParser
+import os
+
+from models.tta.ldm.audioldm_inference import AudioLDMInference
+from utils.util import save_config, load_model_config, load_config
+import numpy as np
+import torch
+
+
+def build_inference(args, cfg):
+    supported_inference = {
+        "AudioLDM": AudioLDMInference,
+    }
+
+    inference_class = supported_inference[cfg.model_type]
+    inference = inference_class(args, cfg)
+    return inference
+
+
+def build_parser():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--config",
+        type=str,
+        required=True,
+        help="JSON/YAML file for configurations.",
+    )
+    parser.add_argument(
+        "--text",
+        help="Text to be synthesized",
+        type=str,
+        default="Text to be synthesized.",
+    )
+    parser.add_argument(
+        "--checkpoint_path",
+        type=str,
+    )
+    parser.add_argument(
+        "--vocoder_path", type=str, help="Checkpoint path of the vocoder"
+    )
+    parser.add_argument(
+        "--vocoder_config_path", type=str, help="Config path of the vocoder"
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default=None,
+        help="Output dir for saving generated results",
+    )
+    parser.add_argument(
+        "--num_steps",
+        type=int,
+        default=200,
+        help="The total number of denoising steps",
+    )
+    parser.add_argument(
+        "--guidance_scale",
+        type=float,
+        default=4.0,
+        help="The scale of classifier-free guidance",
+    )
+    parser.add_argument("--local_rank", default=-1, type=int)
+    return parser
+
+
+def main():
+    # Parse arguments
+    args = build_parser().parse_args()
+    # args, infer_type = formulate_parser(args)
+
+    # Parse config
+    cfg = load_config(args.config)
+    if torch.cuda.is_available():
+        args.local_rank = torch.device("cuda")
+    else:
+        args.local_rank = torch.device("cpu")
+    print("args: ", args)
+
+    # Build inference
+    inferencer = build_inference(args, cfg)
+
+    # Run inference
+    inferencer.inference()
+
+
+if __name__ == "__main__":
+    main()
bins/tta/preprocess.py
ADDED
@@ -0,0 +1,194 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import faulthandler
+
+faulthandler.enable()
+
+import os
+import argparse
+import json
+import pyworld as pw
+from multiprocessing import cpu_count
+
+from utils.util import load_config
+from preprocessors.processor import preprocess_dataset, prepare_align
+from preprocessors.metadata import cal_metadata
+from processors import acoustic_extractor, content_extractor, data_augment
+
+
+def extract_acoustic_features(dataset, output_path, cfg, n_workers=1):
+    """Extract acoustic features of utterances in the dataset
+
+    Args:
+        dataset (str): name of dataset, e.g. opencpop
+        output_path (str): directory that stores train, test and feature files of datasets
+        cfg (dict): dictionary that stores configurations
+        n_workers (int, optional): num of processes to extract features in parallel. Defaults to 1.
+    """
+    types = ["train", "test"] if "eval" not in dataset else ["test"]
+    metadata = []
+    for dataset_type in types:
+        dataset_output = os.path.join(output_path, dataset)
+        dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
+        with open(dataset_file, "r") as f:
+            metadata.extend(json.load(f))
+
+    # acoustic_extractor.extract_utt_acoustic_features_parallel(
+    #     metadata, dataset_output, cfg, n_workers=n_workers
+    # )
+    acoustic_extractor.extract_utt_acoustic_features_serial(
+        metadata, dataset_output, cfg
+    )
+
+
+def extract_content_features(dataset, output_path, cfg, num_workers=1):
+    """Extract content features of utterances in the dataset
+
+    Args:
+        dataset (str): name of dataset, e.g. opencpop
+        output_path (str): directory that stores train, test and feature files of datasets
+        cfg (dict): dictionary that stores configurations
+    """
+    types = ["train", "test"] if "eval" not in dataset else ["test"]
+    metadata = []
+    for dataset_type in types:
+        dataset_output = os.path.join(output_path, dataset)
+        dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
+        with open(dataset_file, "r") as f:
+            metadata.extend(json.load(f))
+
+    content_extractor.extract_utt_content_features_dataloader(
+        cfg, metadata, num_workers
+    )
+
+
+def preprocess(cfg, args):
+    """Preprocess raw data of single or multiple datasets (in cfg.dataset)
+
+    Args:
+        cfg (dict): dictionary that stores configurations
+        args (ArgumentParser): specify the configuration file and num_workers
+    """
+    # Specify the output root path to save the processed data
+    output_path = cfg.preprocess.processed_dir
+    os.makedirs(output_path, exist_ok=True)
+
+    ## Split train and test sets
+    for dataset in cfg.dataset:
+        print("Preprocess {}...".format(dataset))
+
+        if args.prepare_alignment:
+            ## Prepare alignment with MFA
+            print("Prepare alignment {}...".format(dataset))
+            prepare_align(
+                dataset, cfg.dataset_path[dataset], cfg.preprocess, output_path
+            )
+        preprocess_dataset(
+            dataset,
+            cfg.dataset_path[dataset],
+            output_path,
+            cfg.preprocess,
+            is_custom_dataset=cfg.use_custom_dataset,
+        )
+
+    # Data augmentation: create new wav files with pitch shift, formant shift, equalizer, time stretch
+    try:
+        assert isinstance(
+            cfg.preprocess.data_augment, list
+        ), "Please provide a list of datasets that need to be augmented."
+        if len(cfg.preprocess.data_augment) > 0:
+            new_datasets_list = []
+            for dataset in cfg.preprocess.data_augment:
+                new_datasets = data_augment.augment_dataset(cfg, dataset)
+                new_datasets_list.extend(new_datasets)
+            cfg.dataset.extend(new_datasets_list)
+            print("Augmentation datasets: ", cfg.dataset)
+    except:
+        print("No Data Augmentation.")
+
+    # Dump metadata of datasets (singers, train/test durations, etc.)
+    cal_metadata(cfg)
+
+    ## Prepare the acoustic features
+    for dataset in cfg.dataset:
+        # Skip augmented datasets which do not need to extract acoustic features
+        # We will copy acoustic features from the original dataset later
+        if (
+            "pitch_shift" in dataset
+            or "formant_shift" in dataset
+            or "equalizer" in dataset
+        ):
+            continue
+        print(
+            "Extracting acoustic features for {} using {} workers ...".format(
+                dataset, args.num_workers
+            )
+        )
+        extract_acoustic_features(dataset, output_path, cfg, args.num_workers)
+        # Calculate the statistics of acoustic features
+        if cfg.preprocess.mel_min_max_norm:
+            acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)
+
+        if cfg.preprocess.extract_pitch:
+            acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)
+        if cfg.preprocess.extract_energy:
+            acoustic_extractor.cal_energy_statistics(dataset, output_path, cfg)
+
+        if cfg.preprocess.align_mel_duration:
+            acoustic_extractor.align_duration_mel(dataset, output_path, cfg)
+
+    # Copy acoustic features for augmented datasets by creating soft-links
+    for dataset in cfg.dataset:
+        if "pitch_shift" in dataset:
+            src_dataset = dataset.replace("_pitch_shift", "")
+            src_dataset_dir = os.path.join(output_path, src_dataset)
+        elif "formant_shift" in dataset:
+            src_dataset = dataset.replace("_formant_shift", "")
+            src_dataset_dir = os.path.join(output_path, src_dataset)
+        elif "equalizer" in dataset:
+            src_dataset = dataset.replace("_equalizer", "")
+            src_dataset_dir = os.path.join(output_path, src_dataset)
+        else:
+            continue
+        dataset_dir = os.path.join(output_path, dataset)
+        metadata = []
+        for split in ["train", "test"] if not "eval" in dataset else ["test"]:
+            metadata_file_path = os.path.join(src_dataset_dir, "{}.json".format(split))
+            with open(metadata_file_path, "r") as f:
+                metadata.extend(json.load(f))
+        print("Copying acoustic features for {}...".format(dataset))
+        acoustic_extractor.copy_acoustic_features(
+            metadata, dataset_dir, src_dataset_dir, cfg
+        )
+        if cfg.preprocess.mel_min_max_norm:
+            acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)
+
+        if cfg.preprocess.extract_pitch:
+            acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)
+
+    # Prepare the content features
+    for dataset in cfg.dataset:
+        print("Extracting content features for {}...".format(dataset))
+        extract_content_features(dataset, output_path, cfg, args.num_workers)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--config", default="config.json", help="json files for configurations."
+    )
+    parser.add_argument("--num_workers", type=int, default=int(cpu_count()))
+    parser.add_argument("--prepare_alignment", type=bool, default=False)
+
+    args = parser.parse_args()
+    cfg = load_config(args.config)
+
+    preprocess(cfg, args)
+
+
+if __name__ == "__main__":
+    main()
bins/tta/train_tta.py
ADDED
@@ -0,0 +1,77 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import os
+import torch
+
+from models.tta.autoencoder.autoencoder_trainer import AutoencoderKLTrainer
+from models.tta.ldm.audioldm_trainer import AudioLDMTrainer
+from utils.util import load_config
+
+
+def build_trainer(args, cfg):
+    supported_trainer = {
+        "AutoencoderKL": AutoencoderKLTrainer,
+        "AudioLDM": AudioLDMTrainer,
+    }
+
+    trainer_class = supported_trainer[cfg.model_type]
+    trainer = trainer_class(args, cfg)
+    return trainer
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--config",
+        default="config.json",
+        help="json files for configurations.",
+        required=True,
+    )
+    parser.add_argument(
+        "--num_workers", type=int, default=6, help="Number of dataloader workers."
+    )
+    parser.add_argument(
+        "--exp_name",
+        type=str,
+        default="exp_name",
+        help="A specific name to note the experiment",
+        required=True,
+    )
+    parser.add_argument(
+        "--resume",
+        type=str,
+        default=None,
+        # action="store_true",
+        help="The model name to restore",
+    )
+    parser.add_argument(
+        "--log_level", default="info", help="logging level (info, debug, warning)"
+    )
+    parser.add_argument("--stdout_interval", default=5, type=int)
+    parser.add_argument("--local_rank", default=-1, type=int)
+    args = parser.parse_args()
+    cfg = load_config(args.config)
+    cfg.exp_name = args.exp_name
+
+    # Model saving dir
+    args.log_dir = os.path.join(cfg.log_dir, args.exp_name)
+    os.makedirs(args.log_dir, exist_ok=True)
+
+    if not cfg.train.ddp:
+        args.local_rank = torch.device("cuda")
+
+    # Build trainer
+    trainer = build_trainer(args, cfg)
+
+    # Restore models
+    if args.resume:
+        trainer.restore()
+    trainer.train()
+
+
+if __name__ == "__main__":
+    main()
bins/tts/inference.py
ADDED
@@ -0,0 +1,162 @@
+# Copyright (c) 2023 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+from argparse import ArgumentParser
+import os
+
+from models.tts.fastspeech2.fs2_inference import FastSpeech2Inference
+from models.tts.vits.vits_inference import VitsInference
+from models.tts.valle.valle_inference import VALLEInference
+from utils.util import load_config
+import torch
+
+
+def build_inference(args, cfg):
+    supported_inference = {
+        "FastSpeech2": FastSpeech2Inference,
+        "VITS": VitsInference,
+        "VALLE": VALLEInference,
+    }
+
+    inference_class = supported_inference[cfg.model_type]
+    inference = inference_class(args, cfg)
+    return inference
+
+
+def cuda_relevant(deterministic=False):
+    torch.cuda.empty_cache()
+    # TF32 on Ampere and above
+    torch.backends.cuda.matmul.allow_tf32 = True
+    torch.backends.cudnn.enabled = True
+    torch.backends.cudnn.allow_tf32 = True
+    # Deterministic
+    torch.backends.cudnn.deterministic = deterministic
+    torch.backends.cudnn.benchmark = not deterministic
+    torch.use_deterministic_algorithms(deterministic)
+
+
+def build_parser():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--config",
+        type=str,
+        required=True,
+        help="JSON/YAML file for configurations.",
+    )
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        help="convert from the source data",
+        default=None,
+    )
+    parser.add_argument(
+        "--testing_set",
+        type=str,
+        help="train, test, golden_test",
+        default="test",
+    )
+    parser.add_argument(
+        "--test_list_file",
+        type=str,
+        help="convert from the test list file",
+        default=None,
+    )
+    parser.add_argument(
+        "--speaker_name",
+        type=str,
+        default=None,
+        help="speaker name for multi-speaker synthesis, for single-sentence mode only",
+    )
+    parser.add_argument(
+        "--text",
+        help="Text to be synthesized.",
+        type=str,
+        default="",
+    )
+    parser.add_argument(
+        "--vocoder_dir",
+        type=str,
+        default=None,
+        help="Vocoder checkpoint directory. Searching behavior is the same as "
+        "the acoustics one.",
+    )
+    parser.add_argument(
+        "--acoustics_dir",
+        type=str,
+        default=None,
+        help="Acoustic model checkpoint directory. If a directory is given, "
+        "search for the latest checkpoint dir in the directory. If a specific "
+        "checkpoint dir is given, directly load the checkpoint.",
+    )
+    parser.add_argument(
+        "--checkpoint_path",
+        type=str,
+        default=None,
+        help="Acoustic model checkpoint directory. If a directory is given, "
+        "search for the latest checkpoint dir in the directory. If a specific "
+        "checkpoint dir is given, directly load the checkpoint.",
+    )
+    parser.add_argument(
+        "--mode",
+        type=str,
+        choices=["batch", "single"],
+        required=True,
+        help="Synthesize a whole dataset or a single sentence",
+    )
+    parser.add_argument(
+        "--log_level",
+        type=str,
+        default="warning",
+        help="Logging level. Default: warning",
+    )
+    parser.add_argument(
+        "--pitch_control",
+        type=float,
+        default=1.0,
+        help="control the pitch of the whole utterance, larger value for higher pitch",
+    )
+    parser.add_argument(
+        "--energy_control",
+        type=float,
+        default=1.0,
+        help="control the energy of the whole utterance, larger value for larger volume",
+    )
+    parser.add_argument(
+        "--duration_control",
+        type=float,
+        default=1.0,
+        help="control the speed of the whole utterance, larger value for slower speaking rate",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default=None,
+        help="Output dir for saving generated results",
+    )
+    return parser
+
+
+def main():
+    # Parse arguments
+    parser = build_parser()
+    VALLEInference.add_arguments(parser)
+    args = parser.parse_args()
+
+    # Parse config
+    cfg = load_config(args.config)
+
+    # CUDA settings
+    cuda_relevant()
+
+    # Build inference
+    inferencer = build_inference(args, cfg)
+
+    # Run inference
+    inferencer.inference()
+
+
+if __name__ == "__main__":
+    main()
bins/tts/preprocess.py
ADDED
@@ -0,0 +1,253 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
import faulthandler
|
7 |
+
faulthandler.enable()
|
8 |
+
|
9 |
+
import os
|
10 |
+
import argparse
|
11 |
+
import json
|
12 |
+
import pyworld as pw
|
13 |
+
from multiprocessing import cpu_count
|
14 |
+
|
15 |
+
|
16 |
+
from utils.util import load_config
|
17 |
+
from preprocessors.processor import preprocess_dataset, prepare_align
|
18 |
+
from preprocessors.metadata import cal_metadata
|
19 |
+
from processors import acoustic_extractor, content_extractor, data_augment, phone_extractor
|
20 |
+
|
21 |
+
|
22 |
+
def extract_acoustic_features(dataset, output_path, cfg, num_workers=1):
|
23 |
+
"""Extract acoustic features of utterances in the dataset
|
24 |
+
|
25 |
+
Args:
|
26 |
+
dataset (str): name of dataset, e.g. opencpop
|
27 |
+
output_path (str): directory that stores train, test and feature files of datasets
|
28 |
+
cfg (dict): dictionary that stores configurations
|
29 |
+
n_workers (int, optional): num of processes to extract features in parallel. Defaults to 1.
|
30 |
+
"""
|
31 |
+
# types = ["train", "test"] if "eval" not in dataset else ["test"]
|
32 |
+
types = list()
|
33 |
+
types.append((cfg.preprocess.train_file).split('.')[0])
|
34 |
+
types.append((cfg.preprocess.valid_file).split('.')[0])
|
35 |
+
if 'test' not in types:
|
36 |
+
types.append('test')
|
37 |
+
if "eval" in dataset:
|
38 |
+
types = ["test"]
|
39 |
+
print('types: ', types)
|
40 |
+
metadata = []
|
41 |
+
for dataset_type in types:
|
42 |
+
dataset_output = os.path.join(output_path, dataset)
|
43 |
+
dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
|
44 |
+
with open(dataset_file, "r") as f:
|
45 |
+
metadata.extend(json.load(f))
|
46 |
+
|
47 |
+
if num_workers > 1:
|
48 |
+
acoustic_extractor.extract_utt_acoustic_features_parallel(
|
49 |
+
metadata, dataset_output, cfg, num_workers=num_workers
|
50 |
+
)
|
51 |
+
else:
|
52 |
+
acoustic_extractor.extract_utt_acoustic_features_serial(
|
53 |
+
metadata, dataset_output, cfg
|
54 |
+
)
|
55 |
+
|
56 |
+
def extract_content_features(dataset, output_path, cfg, num_workers=1):
|
57 |
+
"""Extract content features of utterances in the dataset
|
58 |
+
|
59 |
+
Args:
|
60 |
+
dataset (str): name of dataset, e.g. opencpop
|
61 |
+
output_path (str): directory that stores train, test and feature files of datasets
|
62 |
+
cfg (dict): dictionary that stores configurations
|
63 |
+
"""
|
64 |
+
# types = ["train", "test"] if "eval" not in dataset else ["test"]
|
65 |
+
|
66 |
+
types = list()
|
67 |
+
types.append((cfg.preprocess.train_file).split('.')[0])
|
68 |
+
types.append((cfg.preprocess.valid_file).split('.')[0])
|
69 |
+
if 'test' not in types:
|
70 |
+
types.append('test')
|
71 |
+
if "eval" in dataset:
|
72 |
+
types = ["test"]
|
73 |
+
|
74 |
+
metadata = []
|
75 |
+
for dataset_type in types:
|
76 |
+
dataset_output = os.path.join(output_path, dataset)
|
77 |
+
# dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
|
78 |
+
dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
|
79 |
+
with open(dataset_file, "r") as f:
|
80 |
+
metadata.extend(json.load(f))
|
81 |
+
|
82 |
+
content_extractor.extract_utt_content_features_dataloader(
|
83 |
+
cfg, metadata, num_workers
|
84 |
+
)
|
85 |
+
|
86 |
+
def extract_phonme_sequences(dataset, output_path, cfg):
|
87 |
+
"""Extract phoneme features of utterances in the dataset
|
88 |
+
|
89 |
+
Args:
|
90 |
+
dataset (str): name of dataset, e.g. opencpop
|
91 |
+
output_path (str): directory that stores train, test and feature files of datasets
|
92 |
+
cfg (dict): dictionary that stores configurations
|
93 |
+
|
94 |
+
"""
|
95 |
+
# types = ["train", "test"] if "eval" not in dataset else ["test"]
|
96 |
+
|
97 |
+
types = list()
|
98 |
+
types.append((cfg.preprocess.train_file).split('.')[0])
|
99 |
+
types.append((cfg.preprocess.valid_file).split('.')[0])
|
100 |
+
if 'test' not in types:
|
101 |
+
types.append('test')
|
102 |
+
if "eval" in dataset:
|
103 |
+
types = ["test"]
|
104 |
+
|
105 |
+
metadata = []
|
106 |
+
for dataset_type in types:
|
107 |
+
dataset_output = os.path.join(output_path, dataset)
|
108 |
+
dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
|
109 |
+
with open(dataset_file, "r") as f:
|
110 |
+
metadata.extend(json.load(f))
|
111 |
+
phone_extractor.extract_utt_phone_sequence(
|
112 |
+
cfg, metadata
|
113 |
+
)
|
114 |
+
|
115 |
+
|
116 |
+
def preprocess(cfg, args):
|
117 |
+
"""Proprocess raw data of single or multiple datasets (in cfg.dataset)
|
118 |
+
|
119 |
+
Args:
|
120 |
+
cfg (dict): dictionary that stores configurations
|
121 |
+
args (ArgumentParser): specify the configuration file and num_workers
|
122 |
+
"""
|
123 |
+
# Specify the output root path to save the processed data
|
124 |
+
output_path = cfg.preprocess.processed_dir
|
125 |
+
os.makedirs(output_path, exist_ok=True)
|
126 |
+
|
127 |
+
'''
|
128 |
+
|
129 |
+
## Split train and test sets
|
130 |
+
for dataset in cfg.dataset:
|
131 |
+
print("Preprocess {}...".format(dataset))
|
132 |
+
|
133 |
+
if args.prepare_alignment:
|
134 |
+
## Prepare alignment with MFA
|
135 |
+
print("Prepare alignment {}...".format(dataset))
|
136 |
+
prepare_align(
|
137 |
+
dataset, cfg.dataset_path[dataset], cfg.preprocess, output_path
|
138 |
+
)
|
139 |
+
|
140 |
+
preprocess_dataset(
|
141 |
+
dataset,
|
142 |
+
cfg.dataset_path[dataset],
|
143 |
+
output_path,
|
144 |
+
cfg.preprocess,
|
145 |
+
is_custom_dataset=cfg.use_custom_dataset,
|
146 |
+
)
|
147 |
+
|
148 |
+
# Data augmentation: create new wav files with pitch shift, formant shift, equalizer, time stretch
|
149 |
+
try:
|
150 |
+
assert isinstance(
|
151 |
+
cfg.preprocess.data_augment, list
|
152 |
+
), "Please provide a list of datasets need to be augmented."
|
153 |
+
if len(cfg.preprocess.data_augment) > 0:
|
154 |
+
new_datasets_list = []
|
155 |
+
for dataset in cfg.preprocess.data_augment:
|
156 |
+
new_datasets = data_augment.augment_dataset(cfg, dataset)
|
157 |
+
new_datasets_list.extend(new_datasets)
|
158 |
+
cfg.dataset.extend(new_datasets_list)
|
159 |
+
print("Augmentation datasets: ", cfg.dataset)
|
160 |
+
except:
|
161 |
+
print("No Data Augmentation.")
|
162 |
+
|
163 |
+
# Dump metadata of datasets (singers, train/test durations, etc.)
|
164 |
+
cal_metadata(cfg)
|
165 |
+
'''
|
166 |
+
## Prepare the acoustic features
|
167 |
+
for dataset in cfg.dataset:
|
168 |
+
# Skip augmented datasets which do not need to extract acoustic features
|
169 |
+
# We will copy acoustic features from the original dataset later
|
170 |
+
if (
|
171 |
+
"pitch_shift" in dataset
|
172 |
+
or "formant_shift" in dataset
|
173 |
+
or "equalizer" in dataset in dataset
|
174 |
+
):
|
175 |
+
continue
|
176 |
+
print(
|
177 |
+
"Extracting acoustic features for {} using {} workers ...".format(
|
178 |
+
dataset, args.num_workers
|
179 |
+
)
|
180 |
+
)
|
181 |
+
extract_acoustic_features(dataset, output_path, cfg, args.num_workers)
|
182 |
+
# Calculate the statistics of acoustic features
|
183 |
+
if cfg.preprocess.mel_min_max_norm:
|
184 |
+
acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)
|
185 |
+
|
186 |
+
if cfg.preprocess.extract_pitch:
|
187 |
+
acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)
|
188 |
+
|
189 |
+
if cfg.preprocess.extract_energy:
|
190 |
+
acoustic_extractor.cal_energy_statistics(dataset, output_path, cfg)
|
191 |
+
|
192 |
+
if cfg.preprocess.pitch_norm:
|
193 |
+
acoustic_extractor.normalize(dataset, cfg.preprocess.pitch_dir, cfg)
|
194 |
+
|
195 |
+
if cfg.preprocess.energy_norm:
|
196 |
+
acoustic_extractor.normalize(dataset, cfg.preprocess.energy_dir, cfg)
|
197 |
+
|
198 |
+
# Copy acoustic features for augmented datasets by creating soft-links
|
199 |
+
for dataset in cfg.dataset:
|
200 |
+
if "pitch_shift" in dataset:
|
201 |
+
src_dataset = dataset.replace("_pitch_shift", "")
|
202 |
+
src_dataset_dir = os.path.join(output_path, src_dataset)
|
203 |
+
elif "formant_shift" in dataset:
|
204 |
+
src_dataset = dataset.replace("_formant_shift", "")
|
205 |
+
src_dataset_dir = os.path.join(output_path, src_dataset)
|
206 |
+
elif "equalizer" in dataset:
|
207 |
+
src_dataset = dataset.replace("_equalizer", "")
|
208 |
+
src_dataset_dir = os.path.join(output_path, src_dataset)
|
209 |
+
else:
|
210 |
+
continue
|
211 |
+
dataset_dir = os.path.join(output_path, dataset)
|
212 |
+
metadata = []
|
213 |
+
for split in ["train", "test"] if not "eval" in dataset else ["test"]:
|
214 |
+
metadata_file_path = os.path.join(src_dataset_dir, "{}.json".format(split))
|
215 |
+
with open(metadata_file_path, "r") as f:
|
216 |
+
metadata.extend(json.load(f))
|
217 |
+
print("Copying acoustic features for {}...".format(dataset))
|
218 |
+
acoustic_extractor.copy_acoustic_features(
|
219 |
+
metadata, dataset_dir, src_dataset_dir, cfg
|
220 |
+
)
|
221 |
+
if cfg.preprocess.mel_min_max_norm:
|
222 |
+
acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)
|
223 |
+
|
224 |
+
if cfg.preprocess.extract_pitch:
|
225 |
+
acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)
|
226 |
+
|
227 |
+
# Prepare the content features
|
228 |
+
for dataset in cfg.dataset:
|
229 |
+
print("Extracting content features for {}...".format(dataset))
|
230 |
+
extract_content_features(dataset, output_path, cfg, args.num_workers)
|
231 |
+
|
232 |
+
# Prepare the phenome squences
|
233 |
+
if cfg.preprocess.extract_phone:
|
234 |
+
for dataset in cfg.dataset:
|
235 |
+
print("Extracting phoneme sequence for {}...".format(dataset))
|
236 |
+
extract_phonme_sequences(dataset, output_path, cfg)
|
237 |
+
|
238 |
+
def main():
|
239 |
+
parser = argparse.ArgumentParser()
|
240 |
+
parser.add_argument(
|
241 |
+
"--config", default="config.json", help="json files for configurations."
|
242 |
+
)
|
243 |
+
parser.add_argument("--num_workers", type=int, default=int(cpu_count()))
|
244 |
+
parser.add_argument("--prepare_alignment", type=bool, default=False)
|
245 |
+
|
246 |
+
args = parser.parse_args()
|
247 |
+
cfg = load_config(args.config)
|
248 |
+
|
249 |
+
preprocess(cfg, args)
|
250 |
+
|
251 |
+
|
252 |
+
if __name__ == "__main__":
|
253 |
+
main()
|
bins/tts/train.py
ADDED
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
import argparse
|
7 |
+
|
8 |
+
import torch
|
9 |
+
|
10 |
+
from models.tts.fastspeech2.fs2_trainer import FastSpeech2Trainer
|
11 |
+
from models.tts.vits.vits_trainer import VITSTrainer
|
12 |
+
from models.tts.valle.valle_trainer import VALLETrainer
|
13 |
+
from utils.util import load_config
|
14 |
+
|
15 |
+
|
16 |
+
def build_trainer(args, cfg):
|
17 |
+
supported_trainer = {
|
18 |
+
"FastSpeech2": FastSpeech2Trainer,
|
19 |
+
"VITS": VITSTrainer,
|
20 |
+
"VALLE": VALLETrainer,
|
21 |
+
}
|
22 |
+
|
23 |
+
trainer_class = supported_trainer[cfg.model_type]
|
24 |
+
trainer = trainer_class(args, cfg)
|
25 |
+
return trainer
|
26 |
+
|
27 |
+
|
28 |
+
def cuda_relevant(deterministic=False):
|
29 |
+
torch.cuda.empty_cache()
|
30 |
+
# TF32 on Ampere and above
|
31 |
+
torch.backends.cuda.matmul.allow_tf32 = True
|
32 |
+
torch.backends.cudnn.enabled = True
|
33 |
+
torch.backends.cudnn.allow_tf32 = True
|
34 |
+
# Deterministic
|
35 |
+
torch.backends.cudnn.deterministic = deterministic
|
36 |
+
torch.backends.cudnn.benchmark = not deterministic
|
37 |
+
torch.use_deterministic_algorithms(deterministic)
|
38 |
+
|
39 |
+
|
40 |
+
def main():
|
41 |
+
parser = argparse.ArgumentParser()
|
42 |
+
parser.add_argument(
|
43 |
+
"--config",
|
44 |
+
default="config.json",
|
45 |
+
help="json files for configurations.",
|
46 |
+
required=True,
|
47 |
+
)
|
48 |
+
parser.add_argument(
|
49 |
+
"--exp_name",
|
50 |
+
type=str,
|
51 |
+
default="exp_name",
|
52 |
+
help="A specific name to note the experiment",
|
53 |
+
required=True,
|
54 |
+
)
|
55 |
+
parser.add_argument(
|
56 |
+
"--resume", action="store_true", help="The model name to restore"
|
57 |
+
)
|
58 |
+
parser.add_argument(
|
59 |
+
"--log_level", default="warning", help="logging level (debug, info, warning)"
|
60 |
+
)
|
61 |
+
parser.add_argument(
|
62 |
+
"--resume_type",
|
63 |
+
type=str,
|
64 |
+
default="resume",
|
65 |
+
help="Resume training or finetuning.",
|
66 |
+
)
|
67 |
+
parser.add_argument(
|
68 |
+
"--checkpoint_path",
|
69 |
+
type=str,
|
70 |
+
default=None,
|
71 |
+
help="Checkpoint for resume training or finetuning.",
|
72 |
+
)
|
73 |
+
|
74 |
+
VALLETrainer.add_arguments(parser)
|
75 |
+
args = parser.parse_args()
|
76 |
+
cfg = load_config(args.config)
|
77 |
+
|
78 |
+
# Data Augmentation
|
79 |
+
if (
|
80 |
+
type(cfg.preprocess.data_augment) == list
|
81 |
+
and len(cfg.preprocess.data_augment) > 0
|
82 |
+
):
|
83 |
+
new_datasets_list = []
|
84 |
+
for dataset in cfg.preprocess.data_augment:
|
85 |
+
new_datasets = [
|
86 |
+
f"{dataset}_pitch_shift" if cfg.preprocess.use_pitch_shift else None,
|
87 |
+
f"{dataset}_formant_shift"
|
88 |
+
if cfg.preprocess.use_formant_shift
|
89 |
+
else None,
|
90 |
+
f"{dataset}_equalizer" if cfg.preprocess.use_equalizer else None,
|
91 |
+
f"{dataset}_time_stretch" if cfg.preprocess.use_time_stretch else None,
|
92 |
+
]
|
93 |
+
new_datasets_list.extend(filter(None, new_datasets))
|
94 |
+
cfg.dataset.extend(new_datasets_list)
|
95 |
+
|
96 |
+
# # CUDA settings
|
97 |
+
cuda_relevant()
|
98 |
+
|
99 |
+
# Build trainer
|
100 |
+
trainer = build_trainer(args, cfg)
|
101 |
+
torch.set_num_threads(1)
|
102 |
+
torch.set_num_interop_threads(1)
|
103 |
+
trainer.train_loop()
|
104 |
+
|
105 |
+
|
106 |
+
if __name__ == "__main__":
|
107 |
+
main()
|
bins/vocoder/inference.py
ADDED
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
import argparse
|
7 |
+
import os
|
8 |
+
|
9 |
+
import torch
|
10 |
+
|
11 |
+
from models.vocoders.vocoder_inference import VocoderInference
|
12 |
+
from utils.util import load_config
|
13 |
+
|
14 |
+
|
15 |
+
def build_inference(args, cfg, infer_type="infer_from_dataset"):
|
16 |
+
supported_inference = {
|
17 |
+
"GANVocoder": VocoderInference,
|
18 |
+
}
|
19 |
+
|
20 |
+
inference_class = supported_inference[cfg.model_type]
|
21 |
+
return inference_class(args, cfg, infer_type)
|
22 |
+
|
23 |
+
|
24 |
+
def cuda_relevant(deterministic=False):
|
25 |
+
torch.cuda.empty_cache()
|
26 |
+
# TF32 on Ampere and above
|
27 |
+
torch.backends.cuda.matmul.allow_tf32 = True
|
28 |
+
torch.backends.cudnn.enabled = True
|
29 |
+
torch.backends.cudnn.allow_tf32 = True
|
30 |
+
# Deterministic
|
31 |
+
torch.backends.cudnn.deterministic = deterministic
|
32 |
+
torch.backends.cudnn.benchmark = not deterministic
|
33 |
+
torch.use_deterministic_algorithms(deterministic)
|
34 |
+
|
35 |
+
|
36 |
+
def build_parser():
|
37 |
+
r"""Build argument parser for inference.py.
|
38 |
+
Anything else should be put in an extra config YAML file.
|
39 |
+
"""
|
40 |
+
|
41 |
+
parser = argparse.ArgumentParser()
|
42 |
+
parser.add_argument(
|
43 |
+
"--config",
|
44 |
+
type=str,
|
45 |
+
required=True,
|
46 |
+
help="JSON/YAML file for configurations.",
|
47 |
+
)
|
48 |
+
parser.add_argument(
|
49 |
+
"--infer_mode",
|
50 |
+
type=str,
|
51 |
+
required=None,
|
52 |
+
)
|
53 |
+
parser.add_argument(
|
54 |
+
"--infer_datasets",
|
55 |
+
nargs="+",
|
56 |
+
default=None,
|
57 |
+
)
|
58 |
+
parser.add_argument(
|
59 |
+
"--feature_folder",
|
60 |
+
type=str,
|
61 |
+
default=None,
|
62 |
+
)
|
63 |
+
parser.add_argument(
|
64 |
+
"--audio_folder",
|
65 |
+
type=str,
|
66 |
+
default=None,
|
67 |
+
)
|
68 |
+
parser.add_argument(
|
69 |
+
"--vocoder_dir",
|
70 |
+
type=str,
|
71 |
+
required=True,
|
72 |
+
help="Vocoder checkpoint directory. Searching behavior is the same as "
|
73 |
+
"the acoustics one.",
|
74 |
+
)
|
75 |
+
parser.add_argument(
|
76 |
+
"--output_dir",
|
77 |
+
type=str,
|
78 |
+
default="result",
|
79 |
+
help="Output directory. Default: ./result",
|
80 |
+
)
|
81 |
+
parser.add_argument(
|
82 |
+
"--log_level",
|
83 |
+
type=str,
|
84 |
+
default="warning",
|
85 |
+
help="Logging level. Default: warning",
|
86 |
+
)
|
87 |
+
parser.add_argument(
|
88 |
+
"--keep_cache",
|
89 |
+
action="store_true",
|
90 |
+
default=False,
|
91 |
+
help="Keep cache files. Only applicable to inference from files.",
|
92 |
+
)
|
93 |
+
return parser
|
94 |
+
|
95 |
+
|
96 |
+
def main():
|
97 |
+
# Parse arguments
|
98 |
+
args = build_parser().parse_args()
|
99 |
+
|
100 |
+
# Parse config
|
101 |
+
cfg = load_config(args.config)
|
102 |
+
|
103 |
+
# CUDA settings
|
104 |
+
cuda_relevant()
|
105 |
+
|
106 |
+
# Build inference
|
107 |
+
trainer = build_inference(args, cfg, args.infer_mode)
|
108 |
+
|
109 |
+
# Run inference
|
110 |
+
trainer.inference()
|
111 |
+
|
112 |
+
|
113 |
+
if __name__ == "__main__":
|
114 |
+
main()
|
bins/vocoder/preprocess.py
ADDED
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
import faulthandler
|
7 |
+
|
8 |
+
faulthandler.enable()
|
9 |
+
|
10 |
+
import os
|
11 |
+
import argparse
|
12 |
+
import json
|
13 |
+
import pyworld as pw
|
14 |
+
from multiprocessing import cpu_count
|
15 |
+
|
16 |
+
|
17 |
+
from utils.util import load_config
|
18 |
+
from preprocessors.processor import preprocess_dataset, prepare_align
|
19 |
+
from preprocessors.metadata import cal_metadata
|
20 |
+
from processors import acoustic_extractor, content_extractor, data_augment
|
21 |
+
|
22 |
+
|
23 |
+
def extract_acoustic_features(dataset, output_path, cfg, n_workers=1):
|
24 |
+
"""Extract acoustic features of utterances in the dataset
|
25 |
+
|
26 |
+
Args:
|
27 |
+
dataset (str): name of dataset, e.g. opencpop
|
28 |
+
output_path (str): directory that stores train, test and feature files of datasets
|
29 |
+
cfg (dict): dictionary that stores configurations
|
30 |
+
n_workers (int, optional): num of processes to extract features in parallel. Defaults to 1.
|
31 |
+
"""
|
32 |
+
types = ["train", "test"] if "eval" not in dataset else ["test"]
|
33 |
+
metadata = []
|
34 |
+
for dataset_type in types:
|
35 |
+
dataset_output = os.path.join(output_path, dataset)
|
36 |
+
dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
|
37 |
+
with open(dataset_file, "r") as f:
|
38 |
+
metadata.extend(json.load(f))
|
39 |
+
|
40 |
+
acoustic_extractor.extract_utt_acoustic_features_serial(
|
41 |
+
metadata, dataset_output, cfg
|
42 |
+
)
|
43 |
+
|
44 |
+
|
45 |
+
def preprocess(cfg, args):
|
46 |
+
"""Proprocess raw data of single or multiple datasets (in cfg.dataset)
|
47 |
+
|
48 |
+
Args:
|
49 |
+
cfg (dict): dictionary that stores configurations
|
50 |
+
args (ArgumentParser): specify the configuration file and num_workers
|
51 |
+
"""
|
52 |
+
# Specify the output root path to save the processed data
|
53 |
+
output_path = cfg.preprocess.processed_dir
|
54 |
+
os.makedirs(output_path, exist_ok=True)
|
55 |
+
|
56 |
+
## Split train and test sets
|
57 |
+
for dataset in cfg.dataset:
|
58 |
+
print("Preprocess {}...".format(dataset))
|
59 |
+
|
60 |
+
preprocess_dataset(
|
61 |
+
dataset,
|
62 |
+
cfg.dataset_path[dataset],
|
63 |
+
output_path,
|
64 |
+
cfg.preprocess,
|
65 |
+
is_custom_dataset=cfg.use_custom_dataset,
|
66 |
+
)
|
67 |
+
|
68 |
+
# Data augmentation: create new wav files with pitch shift, formant shift, equalizer, time stretch
|
69 |
+
try:
|
70 |
+
assert isinstance(
|
71 |
+
cfg.preprocess.data_augment, list
|
72 |
+
), "Please provide a list of datasets need to be augmented."
|
73 |
+
if len(cfg.preprocess.data_augment) > 0:
|
74 |
+
new_datasets_list = []
|
75 |
+
for dataset in cfg.preprocess.data_augment:
|
76 |
+
new_datasets = data_augment.augment_dataset(cfg, dataset)
|
77 |
+
new_datasets_list.extend(new_datasets)
|
78 |
+
cfg.dataset.extend(new_datasets_list)
|
79 |
+
print("Augmentation datasets: ", cfg.dataset)
|
80 |
+
except:
|
81 |
+
print("No Data Augmentation.")
|
82 |
+
|
83 |
+
# Dump metadata of datasets (singers, train/test durations, etc.)
|
84 |
+
cal_metadata(cfg)
|
85 |
+
|
86 |
+
## Prepare the acoustic features
|
87 |
+
for dataset in cfg.dataset:
|
88 |
+
# Skip augmented datasets which do not need to extract acoustic features
|
89 |
+
# We will copy acoustic features from the original dataset later
|
90 |
+
if (
|
91 |
+
"pitch_shift" in dataset
|
92 |
+
or "formant_shift" in dataset
|
93 |
+
or "equalizer" in dataset in dataset
|
94 |
+
):
|
95 |
+
continue
|
96 |
+
print(
|
97 |
+
"Extracting acoustic features for {} using {} workers ...".format(
|
98 |
+
dataset, args.num_workers
|
99 |
+
)
|
100 |
+
)
|
101 |
+
extract_acoustic_features(dataset, output_path, cfg, args.num_workers)
|
102 |
+
# Calculate the statistics of acoustic features
|
103 |
+
if cfg.preprocess.mel_min_max_norm:
|
104 |
+
acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)
|
105 |
+
|
106 |
+
# Copy acoustic features for augmented datasets by creating soft-links
|
107 |
+
for dataset in cfg.dataset:
|
108 |
+
if "pitch_shift" in dataset:
|
109 |
+
src_dataset = dataset.replace("_pitch_shift", "")
|
110 |
+
src_dataset_dir = os.path.join(output_path, src_dataset)
|
111 |
+
elif "formant_shift" in dataset:
|
112 |
+
src_dataset = dataset.replace("_formant_shift", "")
|
113 |
+
src_dataset_dir = os.path.join(output_path, src_dataset)
|
114 |
+
elif "equalizer" in dataset:
|
115 |
+
src_dataset = dataset.replace("_equalizer", "")
|
116 |
+
src_dataset_dir = os.path.join(output_path, src_dataset)
|
117 |
+
else:
|
118 |
+
continue
|
119 |
+
dataset_dir = os.path.join(output_path, dataset)
|
120 |
+
metadata = []
|
121 |
+
for split in ["train", "test"] if not "eval" in dataset else ["test"]:
|
122 |
+
metadata_file_path = os.path.join(src_dataset_dir, "{}.json".format(split))
|
123 |
+
with open(metadata_file_path, "r") as f:
|
124 |
+
metadata.extend(json.load(f))
|
125 |
+
print("Copying acoustic features for {}...".format(dataset))
|
126 |
+
acoustic_extractor.copy_acoustic_features(
|
127 |
+
metadata, dataset_dir, src_dataset_dir, cfg
|
128 |
+
)
|
129 |
+
if cfg.preprocess.mel_min_max_norm:
|
130 |
+
acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)
|
131 |
+
|
132 |
+
if cfg.preprocess.extract_pitch:
|
133 |
+
acoustic_extractor.cal_pitch_statistics(dataset, output_path, cfg)
|
134 |
+
|
135 |
+
|
136 |
+
def main():
|
137 |
+
parser = argparse.ArgumentParser()
|
138 |
+
parser.add_argument(
|
139 |
+
"--config", default="config.json", help="json files for configurations."
|
140 |
+
)
|
141 |
+
parser.add_argument("--num_workers", type=int, default=int(cpu_count()))
|
142 |
+
|
143 |
+
args = parser.parse_args()
|
144 |
+
cfg = load_config(args.config)
|
145 |
+
|
146 |
+
preprocess(cfg, args)
|
147 |
+
|
148 |
+
|
149 |
+
if __name__ == "__main__":
|
150 |
+
main()
|
bins/vocoder/train.py
ADDED
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
import argparse
|
7 |
+
|
8 |
+
import torch
|
9 |
+
|
10 |
+
from models.vocoders.gan.gan_vocoder_trainer import GANVocoderTrainer
|
11 |
+
from utils.util import load_config
|
12 |
+
|
13 |
+
|
14 |
+
def build_trainer(args, cfg):
|
15 |
+
supported_trainer = {
|
16 |
+
"GANVocoder": GANVocoderTrainer,
|
17 |
+
}
|
18 |
+
|
19 |
+
trainer_class = supported_trainer[cfg.model_type]
|
20 |
+
trainer = trainer_class(args, cfg)
|
21 |
+
return trainer
|
22 |
+
|
23 |
+
|
24 |
+
def cuda_relevant(deterministic=False):
|
25 |
+
torch.cuda.empty_cache()
|
26 |
+
# TF32 on Ampere and above
|
27 |
+
torch.backends.cuda.matmul.allow_tf32 = True
|
28 |
+
torch.backends.cudnn.enabled = True
|
29 |
+
torch.backends.cudnn.allow_tf32 = True
|
30 |
+
# Deterministic
|
31 |
+
torch.backends.cudnn.deterministic = deterministic
|
32 |
+
torch.backends.cudnn.benchmark = not deterministic
|
33 |
+
torch.use_deterministic_algorithms(deterministic)
|
34 |
+
|
35 |
+
|
36 |
+
def main():
|
37 |
+
parser = argparse.ArgumentParser()
|
38 |
+
parser.add_argument(
|
39 |
+
"--config",
|
40 |
+
default="config.json",
|
41 |
+
help="json files for configurations.",
|
42 |
+
required=True,
|
43 |
+
)
|
44 |
+
parser.add_argument(
|
45 |
+
"--exp_name",
|
46 |
+
type=str,
|
47 |
+
default="exp_name",
|
48 |
+
help="A specific name to note the experiment",
|
49 |
+
required=True,
|
50 |
+
)
|
51 |
+
parser.add_argument(
|
52 |
+
"--resume_type",
|
53 |
+
type=str,
|
54 |
+
help="resume for continue to train, finetune for finetuning",
|
55 |
+
)
|
56 |
+
parser.add_argument(
|
57 |
+
"--checkpoint",
|
58 |
+
type=str,
|
59 |
+
help="checkpoint to resume",
|
60 |
+
)
|
61 |
+
parser.add_argument(
|
62 |
+
"--log_level", default="warning", help="logging level (debug, info, warning)"
|
63 |
+
)
|
64 |
+
args = parser.parse_args()
|
65 |
+
cfg = load_config(args.config)
|
66 |
+
|
67 |
+
# Data Augmentation
|
68 |
+
if cfg.preprocess.data_augment:
|
69 |
+
new_datasets_list = []
|
70 |
+
for dataset in cfg.preprocess.data_augment:
|
71 |
+
new_datasets = [
|
72 |
+
# f"{dataset}_pitch_shift",
|
73 |
+
# f"{dataset}_formant_shift",
|
74 |
+
f"{dataset}_equalizer",
|
75 |
+
f"{dataset}_time_stretch",
|
76 |
+
]
|
77 |
+
new_datasets_list.extend(new_datasets)
|
78 |
+
cfg.dataset.extend(new_datasets_list)
|
79 |
+
|
80 |
+
# CUDA settings
|
81 |
+
cuda_relevant()
|
82 |
+
|
83 |
+
# Build trainer
|
84 |
+
trainer = build_trainer(args, cfg)
|
85 |
+
|
86 |
+
trainer.train_loop()
|
87 |
+
|
88 |
+
|
89 |
+
if __name__ == "__main__":
|
90 |
+
main()
|
config/audioldm.json
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/base.json",
|
3 |
+
"model_type": "AudioLDM",
|
4 |
+
"task_type": "tta",
|
5 |
+
"dataset": [
|
6 |
+
"AudioCaps"
|
7 |
+
],
|
8 |
+
"preprocess": {
|
9 |
+
// feature used for model training
|
10 |
+
"use_spkid": false,
|
11 |
+
"use_uv": false,
|
12 |
+
"use_frame_pitch": false,
|
13 |
+
"use_phone_pitch": false,
|
14 |
+
"use_frame_energy": false,
|
15 |
+
"use_phone_energy": false,
|
16 |
+
"use_mel": false,
|
17 |
+
"use_audio": false,
|
18 |
+
"use_label": false,
|
19 |
+
"use_one_hot": false,
|
20 |
+
"cond_mask_prob": 0.1
|
21 |
+
},
|
22 |
+
// model
|
23 |
+
"model": {
|
24 |
+
"audioldm": {
|
25 |
+
"image_size": 32,
|
26 |
+
"in_channels": 4,
|
27 |
+
"out_channels": 4,
|
28 |
+
"model_channels": 256,
|
29 |
+
"attention_resolutions": [
|
30 |
+
4,
|
31 |
+
2,
|
32 |
+
1
|
33 |
+
],
|
34 |
+
"num_res_blocks": 2,
|
35 |
+
"channel_mult": [
|
36 |
+
1,
|
37 |
+
2,
|
38 |
+
4
|
39 |
+
],
|
40 |
+
"num_heads": 8,
|
41 |
+
"use_spatial_transformer": true,
|
42 |
+
"transformer_depth": 1,
|
43 |
+
"context_dim": 768,
|
44 |
+
"use_checkpoint": true,
|
45 |
+
"legacy": false
|
46 |
+
},
|
47 |
+
"autoencoderkl": {
|
48 |
+
"ch": 128,
|
49 |
+
"ch_mult": [
|
50 |
+
1,
|
51 |
+
1,
|
52 |
+
2,
|
53 |
+
2,
|
54 |
+
4
|
55 |
+
],
|
56 |
+
"num_res_blocks": 2,
|
57 |
+
"in_channels": 1,
|
58 |
+
"z_channels": 4,
|
59 |
+
"out_ch": 1,
|
60 |
+
"double_z": true
|
61 |
+
},
|
62 |
+
"noise_scheduler": {
|
63 |
+
"num_train_timesteps": 1000,
|
64 |
+
"beta_start": 0.00085,
|
65 |
+
"beta_end": 0.012,
|
66 |
+
"beta_schedule": "scaled_linear",
|
67 |
+
"clip_sample": false,
|
68 |
+
"steps_offset": 1,
|
69 |
+
"set_alpha_to_one": false,
|
70 |
+
"skip_prk_steps": true,
|
71 |
+
"prediction_type": "epsilon"
|
72 |
+
}
|
73 |
+
},
|
74 |
+
// train
|
75 |
+
"train": {
|
76 |
+
"lronPlateau": {
|
77 |
+
"factor": 0.9,
|
78 |
+
"patience": 100,
|
79 |
+
"min_lr": 4.0e-5,
|
80 |
+
"verbose": true
|
81 |
+
},
|
82 |
+
"adam": {
|
83 |
+
"lr": 5.0e-5,
|
84 |
+
"betas": [
|
85 |
+
0.9,
|
86 |
+
0.999
|
87 |
+
],
|
88 |
+
"weight_decay": 1.0e-2,
|
89 |
+
"eps": 1.0e-8
|
90 |
+
}
|
91 |
+
}
|
92 |
+
}
|
config/autoencoderkl.json
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/base.json",
|
3 |
+
"model_type": "AutoencoderKL",
|
4 |
+
"task_type": "tta",
|
5 |
+
"dataset": [
|
6 |
+
"AudioCaps"
|
7 |
+
],
|
8 |
+
"preprocess": {
|
9 |
+
// feature used for model training
|
10 |
+
"use_spkid": false,
|
11 |
+
"use_uv": false,
|
12 |
+
"use_frame_pitch": false,
|
13 |
+
"use_phone_pitch": false,
|
14 |
+
"use_frame_energy": false,
|
15 |
+
"use_phone_energy": false,
|
16 |
+
"use_mel": false,
|
17 |
+
"use_audio": false,
|
18 |
+
"use_label": false,
|
19 |
+
"use_one_hot": false
|
20 |
+
},
|
21 |
+
// model
|
22 |
+
"model": {
|
23 |
+
"autoencoderkl": {
|
24 |
+
"ch": 128,
|
25 |
+
"ch_mult": [
|
26 |
+
1,
|
27 |
+
1,
|
28 |
+
2,
|
29 |
+
2,
|
30 |
+
4
|
31 |
+
],
|
32 |
+
"num_res_blocks": 2,
|
33 |
+
"in_channels": 1,
|
34 |
+
"z_channels": 4,
|
35 |
+
"out_ch": 1,
|
36 |
+
"double_z": true
|
37 |
+
},
|
38 |
+
"loss": {
|
39 |
+
"kl_weight": 1e-8,
|
40 |
+
"disc_weight": 0.5,
|
41 |
+
"disc_factor": 1.0,
|
42 |
+
"logvar_init": 0.0,
|
43 |
+
"min_adapt_d_weight": 0.0,
|
44 |
+
"max_adapt_d_weight": 10.0,
|
45 |
+
"disc_start": 50001,
|
46 |
+
"disc_in_channels": 1,
|
47 |
+
"disc_num_layers": 3,
|
48 |
+
"use_actnorm": false
|
49 |
+
}
|
50 |
+
},
|
51 |
+
// train
|
52 |
+
"train": {
|
53 |
+
"lronPlateau": {
|
54 |
+
"factor": 0.9,
|
55 |
+
"patience": 100,
|
56 |
+
"min_lr": 4.0e-5,
|
57 |
+
"verbose": true
|
58 |
+
},
|
59 |
+
"adam": {
|
60 |
+
"lr": 4.0e-4,
|
61 |
+
"betas": [
|
62 |
+
0.9,
|
63 |
+
0.999
|
64 |
+
],
|
65 |
+
"weight_decay": 1.0e-2,
|
66 |
+
"eps": 1.0e-8
|
67 |
+
}
|
68 |
+
}
|
69 |
+
}
|
config/base.json
ADDED
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"supported_model_type": [
|
3 |
+
"GANVocoder",
|
4 |
+
"Fastspeech2",
|
5 |
+
"DiffSVC",
|
6 |
+
"Transformer",
|
7 |
+
"EDM",
|
8 |
+
"CD"
|
9 |
+
],
|
10 |
+
"task_type": "",
|
11 |
+
"dataset": [],
|
12 |
+
"use_custom_dataset": false,
|
13 |
+
"preprocess": {
|
14 |
+
"phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon"
|
15 |
+
// trim audio silence
|
16 |
+
"data_augment": false,
|
17 |
+
"trim_silence": false,
|
18 |
+
"num_silent_frames": 8,
|
19 |
+
"trim_fft_size": 512, // fft size used in trimming
|
20 |
+
"trim_hop_size": 128, // hop size used in trimming
|
21 |
+
"trim_top_db": 30, // top db used in trimming sensitive to each dataset
|
22 |
+
// acoustic features
|
23 |
+
"extract_mel": false,
|
24 |
+
"mel_extract_mode": "",
|
25 |
+
"extract_linear_spec": false,
|
26 |
+
"extract_mcep": false,
|
27 |
+
"extract_pitch": false,
|
28 |
+
"extract_acoustic_token": false,
|
29 |
+
"pitch_remove_outlier": false,
|
30 |
+
"extract_uv": false,
|
31 |
+
"pitch_norm": false,
|
32 |
+
"extract_audio": false,
|
33 |
+
"extract_label": false,
|
34 |
+
"pitch_extractor": "parselmouth", // pyin, dio, pyworld, pyreaper, parselmouth, CWT (Continuous Wavelet Transform)
|
35 |
+
"extract_energy": false,
|
36 |
+
"energy_remove_outlier": false,
|
37 |
+
"energy_norm": false,
|
38 |
+
"energy_extract_mode": "from_mel",
|
39 |
+
"extract_duration": false,
|
40 |
+
"extract_amplitude_phase": false,
|
41 |
+
"mel_min_max_norm": false,
|
42 |
+
// lingusitic features
|
43 |
+
"extract_phone": false,
|
44 |
+
"lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
|
45 |
+
// content features
|
46 |
+
"extract_whisper_feature": false,
|
47 |
+
"extract_contentvec_feature": false,
|
48 |
+
"extract_mert_feature": false,
|
49 |
+
"extract_wenet_feature": false,
|
50 |
+
// Settings for data preprocessing
|
51 |
+
"n_mel": 80,
|
52 |
+
"win_size": 480,
|
53 |
+
"hop_size": 120,
|
54 |
+
"sample_rate": 24000,
|
55 |
+
"n_fft": 1024,
|
56 |
+
"fmin": 0,
|
57 |
+
"fmax": 12000,
|
58 |
+
"min_level_db": -115,
|
59 |
+
"ref_level_db": 20,
|
60 |
+
"bits": 8,
|
61 |
+
// Directory names of processed data or extracted features
|
62 |
+
"processed_dir": "processed_data",
|
63 |
+
"trimmed_wav_dir": "trimmed_wavs", // directory name of silence trimed wav
|
64 |
+
"raw_data": "raw_data",
|
65 |
+
"phone_dir": "phones",
|
66 |
+
"wav_dir": "wavs", // directory name of processed wav (such as downsampled waveform)
|
67 |
+
"audio_dir": "audios",
|
68 |
+
"log_amplitude_dir": "log_amplitudes",
|
69 |
+
"phase_dir": "phases",
|
70 |
+
"real_dir": "reals",
|
71 |
+
"imaginary_dir": "imaginarys",
|
72 |
+
"label_dir": "labels",
|
73 |
+
"linear_dir": "linears",
|
74 |
+
"mel_dir": "mels", // directory name of extraced mel features
|
75 |
+
"mcep_dir": "mcep", // directory name of extraced mcep features
|
76 |
+
"dur_dir": "durs",
|
77 |
+
"symbols_dict": "symbols.dict",
|
78 |
+
"lab_dir": "labs", // directory name of extraced label features
|
79 |
+
"wenet_dir": "wenet", // directory name of extraced wenet features
|
80 |
+
"contentvec_dir": "contentvec", // directory name of extraced wenet features
|
81 |
+
"pitch_dir": "pitches", // directory name of extraced pitch features
|
82 |
+
"energy_dir": "energys", // directory name of extracted energy features
|
83 |
+
"phone_pitch_dir": "phone_pitches", // directory name of extraced pitch features
|
84 |
+
"phone_energy_dir": "phone_energys", // directory name of extracted energy features
|
85 |
+
"uv_dir": "uvs", // directory name of extracted unvoiced features
|
86 |
+
"duration_dir": "duration", // ground-truth duration file
|
87 |
+
"phone_seq_file": "phone_seq_file", // phoneme sequence file
|
88 |
+
"file_lst": "file.lst",
|
89 |
+
"train_file": "train.json", // training set, the json file contains detailed information about the dataset, including dataset name, utterance id, duration of the utterance
|
90 |
+
"valid_file": "valid.json", // validattion set
|
91 |
+
"spk2id": "spk2id.json", // used for multi-speaker dataset
|
92 |
+
"utt2spk": "utt2spk", // used for multi-speaker dataset
|
93 |
+
"emo2id": "emo2id.json", // used for multi-emotion dataset
|
94 |
+
"utt2emo": "utt2emo", // used for multi-emotion dataset
|
95 |
+
// Features used for model training
|
96 |
+
"use_text": false,
|
97 |
+
"use_phone": false,
|
98 |
+
"use_phn_seq": false,
|
99 |
+
"use_lab": false,
|
100 |
+
"use_linear": false,
|
101 |
+
"use_mel": false,
|
102 |
+
"use_min_max_norm_mel": false,
|
103 |
+
"use_wav": false,
|
104 |
+
"use_phone_pitch": false,
|
105 |
+
"use_log_scale_pitch": false,
|
106 |
+
"use_phone_energy": false,
|
107 |
+
"use_phone_duration": false,
|
108 |
+
"use_log_scale_energy": false,
|
109 |
+
"use_wenet": false,
|
110 |
+
"use_dur": false,
|
111 |
+
"use_spkid": false, // True: use speaker id for multi-speaker dataset
|
112 |
+
"use_emoid": false, // True: use emotion id for multi-emotion dataset
|
113 |
+
"use_frame_pitch": false,
|
114 |
+
"use_uv": false,
|
115 |
+
"use_frame_energy": false,
|
116 |
+
"use_frame_duration": false,
|
117 |
+
"use_audio": false,
|
118 |
+
"use_label": false,
|
119 |
+
"use_one_hot": false,
|
120 |
+
"use_amplitude_phase": false,
|
121 |
+
"data_augment": false,
|
122 |
+
"align_mel_duration": false
|
123 |
+
},
|
124 |
+
"train": {
|
125 |
+
"ddp": true,
|
126 |
+
"random_seed": 970227,
|
127 |
+
"batch_size": 16,
|
128 |
+
"max_steps": 1000000,
|
129 |
+
// Trackers
|
130 |
+
"tracker": [
|
131 |
+
"tensorboard"
|
132 |
+
// "wandb",
|
133 |
+
// "cometml",
|
134 |
+
// "mlflow",
|
135 |
+
],
|
136 |
+
"max_epoch": -1,
|
137 |
+
// -1 means no limit
|
138 |
+
"save_checkpoint_stride": [
|
139 |
+
5,
|
140 |
+
20
|
141 |
+
],
|
142 |
+
// unit is epoch
|
143 |
+
"keep_last": [
|
144 |
+
3,
|
145 |
+
-1
|
146 |
+
],
|
147 |
+
// -1 means infinite, if one number will broadcast
|
148 |
+
"run_eval": [
|
149 |
+
false,
|
150 |
+
true
|
151 |
+
],
|
152 |
+
// if one number will broadcast
|
153 |
+
// Fix the random seed
|
154 |
+
"random_seed": 10086,
|
155 |
+
// Optimizer
|
156 |
+
"optimizer": "AdamW",
|
157 |
+
"adamw": {
|
158 |
+
"lr": 4.0e-4
|
159 |
+
// nn model lr
|
160 |
+
},
|
161 |
+
// LR Scheduler
|
162 |
+
"scheduler": "ReduceLROnPlateau",
|
163 |
+
"reducelronplateau": {
|
164 |
+
"factor": 0.8,
|
165 |
+
"patience": 10,
|
166 |
+
// unit is epoch
|
167 |
+
"min_lr": 1.0e-4
|
168 |
+
},
|
169 |
+
// Batchsampler
|
170 |
+
"sampler": {
|
171 |
+
"holistic_shuffle": true,
|
172 |
+
"drop_last": true
|
173 |
+
},
|
174 |
+
// Dataloader
|
175 |
+
"dataloader": {
|
176 |
+
"num_worker": 32,
|
177 |
+
"pin_memory": true
|
178 |
+
},
|
179 |
+
"gradient_accumulation_step": 1,
|
180 |
+
"total_training_steps": 50000,
|
181 |
+
"save_summary_steps": 500,
|
182 |
+
"save_checkpoints_steps": 10000,
|
183 |
+
"valid_interval": 10000,
|
184 |
+
"keep_checkpoint_max": 5,
|
185 |
+
"multi_speaker_training": false, // True: train multi-speaker model; False: training single-speaker model;
|
186 |
+
"max_epoch": -1,
|
187 |
+
// -1 means no limit
|
188 |
+
"save_checkpoint_stride": [
|
189 |
+
5,
|
190 |
+
20
|
191 |
+
],
|
192 |
+
// unit is epoch
|
193 |
+
"keep_last": [
|
194 |
+
3,
|
195 |
+
-1
|
196 |
+
],
|
197 |
+
// -1 means infinite, if one number will broadcast
|
198 |
+
"run_eval": [
|
199 |
+
false,
|
200 |
+
true
|
201 |
+
],
|
202 |
+
// Batchsampler
|
203 |
+
"sampler": {
|
204 |
+
"holistic_shuffle": true,
|
205 |
+
"drop_last": true
|
206 |
+
},
|
207 |
+
// Dataloader
|
208 |
+
"dataloader": {
|
209 |
+
"num_worker": 32,
|
210 |
+
"pin_memory": true
|
211 |
+
},
|
212 |
+
// Trackers
|
213 |
+
"tracker": [
|
214 |
+
"tensorboard"
|
215 |
+
// "wandb",
|
216 |
+
// "cometml",
|
217 |
+
// "mlflow",
|
218 |
+
],
|
219 |
+
},
|
220 |
+
}
|
config/comosvc.json
ADDED
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/base.json",
|
3 |
+
"model_type": "DiffComoSVC",
|
4 |
+
"task_type": "svc",
|
5 |
+
"use_custom_dataset": false,
|
6 |
+
"preprocess": {
|
7 |
+
// data augmentations
|
8 |
+
"use_pitch_shift": false,
|
9 |
+
"use_formant_shift": false,
|
10 |
+
"use_time_stretch": false,
|
11 |
+
"use_equalizer": false,
|
12 |
+
// acoustic features
|
13 |
+
"extract_mel": true,
|
14 |
+
"mel_min_max_norm": true,
|
15 |
+
"extract_pitch": true,
|
16 |
+
"pitch_extractor": "parselmouth",
|
17 |
+
"extract_uv": true,
|
18 |
+
"extract_energy": true,
|
19 |
+
// content features
|
20 |
+
"extract_whisper_feature": false,
|
21 |
+
"whisper_sample_rate": 16000,
|
22 |
+
"extract_contentvec_feature": false,
|
23 |
+
"contentvec_sample_rate": 16000,
|
24 |
+
"extract_wenet_feature": false,
|
25 |
+
"wenet_sample_rate": 16000,
|
26 |
+
"extract_mert_feature": false,
|
27 |
+
"mert_sample_rate": 16000,
|
28 |
+
// Default config for whisper
|
29 |
+
"whisper_frameshift": 0.01,
|
30 |
+
"whisper_downsample_rate": 2,
|
31 |
+
// Default config for content vector
|
32 |
+
"contentvec_frameshift": 0.02,
|
33 |
+
// Default config for mert
|
34 |
+
"mert_model": "m-a-p/MERT-v1-330M",
|
35 |
+
"mert_feature_layer": -1,
|
36 |
+
"mert_hop_size": 320,
|
37 |
+
// 24k
|
38 |
+
"mert_frameshit": 0.01333,
|
39 |
+
// 10ms
|
40 |
+
"wenet_frameshift": 0.01,
|
41 |
+
// wenetspeech is 4, gigaspeech is 6
|
42 |
+
"wenet_downsample_rate": 4,
|
43 |
+
// Default config
|
44 |
+
"n_mel": 100,
|
45 |
+
"win_size": 1024,
|
46 |
+
// todo
|
47 |
+
"hop_size": 256,
|
48 |
+
"sample_rate": 24000,
|
49 |
+
"n_fft": 1024,
|
50 |
+
// todo
|
51 |
+
"fmin": 0,
|
52 |
+
"fmax": 12000,
|
53 |
+
// todo
|
54 |
+
"f0_min": 50,
|
55 |
+
// ~C2
|
56 |
+
"f0_max": 1100,
|
57 |
+
//1100, // ~C6(1100), ~G5(800)
|
58 |
+
"pitch_bin": 256,
|
59 |
+
"pitch_max": 1100.0,
|
60 |
+
"pitch_min": 50.0,
|
61 |
+
"is_label": true,
|
62 |
+
"is_mu_law": true,
|
63 |
+
"bits": 8,
|
64 |
+
"mel_min_max_stats_dir": "mel_min_max_stats",
|
65 |
+
"whisper_dir": "whisper",
|
66 |
+
"contentvec_dir": "contentvec",
|
67 |
+
"wenet_dir": "wenet",
|
68 |
+
"mert_dir": "mert",
|
69 |
+
// Extract content features using dataloader
|
70 |
+
"pin_memory": true,
|
71 |
+
"num_workers": 8,
|
72 |
+
"content_feature_batch_size": 16,
|
73 |
+
// Features used for model training
|
74 |
+
"use_mel": true,
|
75 |
+
"use_min_max_norm_mel": true,
|
76 |
+
"use_frame_pitch": true,
|
77 |
+
"use_uv": true,
|
78 |
+
"use_frame_energy": true,
|
79 |
+
"use_log_scale_pitch": false,
|
80 |
+
"use_log_scale_energy": false,
|
81 |
+
"use_spkid": true,
|
82 |
+
// Meta file
|
83 |
+
"train_file": "train.json",
|
84 |
+
"valid_file": "test.json",
|
85 |
+
"spk2id": "singers.json",
|
86 |
+
"utt2spk": "utt2singer"
|
87 |
+
},
|
88 |
+
"model": {
|
89 |
+
"teacher_model_path": "[Your Teacher Model Path].bin",
|
90 |
+
"condition_encoder": {
|
91 |
+
"merge_mode": "add",
|
92 |
+
"input_melody_dim": 1,
|
93 |
+
"use_log_f0": true,
|
94 |
+
"n_bins_melody": 256,
|
95 |
+
//# Quantization (0 for not quantization)
|
96 |
+
"output_melody_dim": 384,
|
97 |
+
"input_loudness_dim": 1,
|
98 |
+
"use_log_loudness": true,
|
99 |
+
"n_bins_loudness": 256,
|
100 |
+
"output_loudness_dim": 384,
|
101 |
+
"use_whisper": false,
|
102 |
+
"use_contentvec": false,
|
103 |
+
"use_wenet": false,
|
104 |
+
"use_mert": false,
|
105 |
+
"whisper_dim": 1024,
|
106 |
+
"contentvec_dim": 256,
|
107 |
+
"mert_dim": 256,
|
108 |
+
"wenet_dim": 512,
|
109 |
+
"content_encoder_dim": 384,
|
110 |
+
"output_singer_dim": 384,
|
111 |
+
"singer_table_size": 512,
|
112 |
+
"output_content_dim": 384,
|
113 |
+
"use_spkid": true
|
114 |
+
},
|
115 |
+
"comosvc": {
|
116 |
+
"distill": false,
|
117 |
+
// conformer encoder
|
118 |
+
"input_dim": 384,
|
119 |
+
"output_dim": 100,
|
120 |
+
"n_heads": 2,
|
121 |
+
"n_layers": 6,
|
122 |
+
"filter_channels": 512,
|
123 |
+
"dropout": 0.1,
|
124 |
+
// karras diffusion
|
125 |
+
"P_mean": -1.2,
|
126 |
+
"P_std": 1.2,
|
127 |
+
"sigma_data": 0.5,
|
128 |
+
"sigma_min": 0.002,
|
129 |
+
"sigma_max": 80,
|
130 |
+
"rho": 7,
|
131 |
+
"n_timesteps": 40,
|
132 |
+
},
|
133 |
+
"diffusion": {
|
134 |
+
// Diffusion steps encoder
|
135 |
+
"step_encoder": {
|
136 |
+
"dim_raw_embedding": 128,
|
137 |
+
"dim_hidden_layer": 512,
|
138 |
+
"activation": "SiLU",
|
139 |
+
"num_layer": 2,
|
140 |
+
"max_period": 10000
|
141 |
+
},
|
142 |
+
// Diffusion decoder
|
143 |
+
"model_type": "bidilconv",
|
144 |
+
// bidilconv, unet2d, TODO: unet1d
|
145 |
+
"bidilconv": {
|
146 |
+
"base_channel": 384,
|
147 |
+
"n_res_block": 20,
|
148 |
+
"conv_kernel_size": 3,
|
149 |
+
"dilation_cycle_length": 4,
|
150 |
+
// specially, 1 means no dilation
|
151 |
+
"conditioner_size": 100
|
152 |
+
}
|
153 |
+
},
|
154 |
+
},
|
155 |
+
"train": {
|
156 |
+
// Basic settings
|
157 |
+
"fast_steps": 0,
|
158 |
+
"batch_size": 32,
|
159 |
+
"gradient_accumulation_step": 1,
|
160 |
+
"max_epoch": -1,
|
161 |
+
// -1 means no limit
|
162 |
+
"save_checkpoint_stride": [
|
163 |
+
10,
|
164 |
+
100
|
165 |
+
],
|
166 |
+
// unit is epoch
|
167 |
+
"keep_last": [
|
168 |
+
3,
|
169 |
+
-1
|
170 |
+
],
|
171 |
+
// -1 means infinite, if one number will broadcast
|
172 |
+
"run_eval": [
|
173 |
+
false,
|
174 |
+
true
|
175 |
+
],
|
176 |
+
// if one number will broadcast
|
177 |
+
// Fix the random seed
|
178 |
+
"random_seed": 10086,
|
179 |
+
// Batchsampler
|
180 |
+
"sampler": {
|
181 |
+
"holistic_shuffle": true,
|
182 |
+
"drop_last": true
|
183 |
+
},
|
184 |
+
// Dataloader
|
185 |
+
"dataloader": {
|
186 |
+
"num_worker": 32,
|
187 |
+
"pin_memory": true
|
188 |
+
},
|
189 |
+
// Trackers
|
190 |
+
"tracker": [
|
191 |
+
"tensorboard"
|
192 |
+
// "wandb",
|
193 |
+
// "cometml",
|
194 |
+
// "mlflow",
|
195 |
+
],
|
196 |
+
// Optimizer
|
197 |
+
"optimizer": "AdamW",
|
198 |
+
"adamw": {
|
199 |
+
"lr": 4.0e-4
|
200 |
+
// nn model lr
|
201 |
+
},
|
202 |
+
// LR Scheduler
|
203 |
+
"scheduler": "ReduceLROnPlateau",
|
204 |
+
"reducelronplateau": {
|
205 |
+
"factor": 0.8,
|
206 |
+
"patience": 10,
|
207 |
+
// unit is epoch
|
208 |
+
"min_lr": 1.0e-4
|
209 |
+
}
|
210 |
+
},
|
211 |
+
"inference": {
|
212 |
+
"comosvc": {
|
213 |
+
"inference_steps": 40
|
214 |
+
}
|
215 |
+
}
|
216 |
+
}
|
config/diffusion.json
ADDED
@@ -0,0 +1,227 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
// FIXME: THESE ARE LEGACY
|
3 |
+
"base_config": "config/base.json",
|
4 |
+
"model_type": "diffusion",
|
5 |
+
"task_type": "svc",
|
6 |
+
"use_custom_dataset": false,
|
7 |
+
"preprocess": {
|
8 |
+
// data augmentations
|
9 |
+
"use_pitch_shift": false,
|
10 |
+
"use_formant_shift": false,
|
11 |
+
"use_time_stretch": false,
|
12 |
+
"use_equalizer": false,
|
13 |
+
// acoustic features
|
14 |
+
"extract_mel": true,
|
15 |
+
"mel_min_max_norm": true,
|
16 |
+
"extract_pitch": true,
|
17 |
+
"pitch_extractor": "parselmouth",
|
18 |
+
"extract_uv": true,
|
19 |
+
"extract_energy": true,
|
20 |
+
// content features
|
21 |
+
"extract_whisper_feature": false,
|
22 |
+
"whisper_sample_rate": 16000,
|
23 |
+
"extract_contentvec_feature": false,
|
24 |
+
"contentvec_sample_rate": 16000,
|
25 |
+
"extract_wenet_feature": false,
|
26 |
+
"wenet_sample_rate": 16000,
|
27 |
+
"extract_mert_feature": false,
|
28 |
+
"mert_sample_rate": 16000,
|
29 |
+
// Default config for whisper
|
30 |
+
"whisper_frameshift": 0.01,
|
31 |
+
"whisper_downsample_rate": 2,
|
32 |
+
// Default config for content vector
|
33 |
+
"contentvec_frameshift": 0.02,
|
34 |
+
// Default config for mert
|
35 |
+
"mert_model": "m-a-p/MERT-v1-330M",
|
36 |
+
"mert_feature_layer": -1,
|
37 |
+
"mert_hop_size": 320,
|
38 |
+
// 24k
|
39 |
+
"mert_frameshit": 0.01333,
|
40 |
+
// 10ms
|
41 |
+
"wenet_frameshift": 0.01,
|
42 |
+
// wenetspeech is 4, gigaspeech is 6
|
43 |
+
"wenet_downsample_rate": 4,
|
44 |
+
// Default config
|
45 |
+
"n_mel": 100,
|
46 |
+
"win_size": 1024,
|
47 |
+
// todo
|
48 |
+
"hop_size": 256,
|
49 |
+
"sample_rate": 24000,
|
50 |
+
"n_fft": 1024,
|
51 |
+
// todo
|
52 |
+
"fmin": 0,
|
53 |
+
"fmax": 12000,
|
54 |
+
// todo
|
55 |
+
"f0_min": 50,
|
56 |
+
// ~C2
|
57 |
+
"f0_max": 1100,
|
58 |
+
//1100, // ~C6(1100), ~G5(800)
|
59 |
+
"pitch_bin": 256,
|
60 |
+
"pitch_max": 1100.0,
|
61 |
+
"pitch_min": 50.0,
|
62 |
+
"is_label": true,
|
63 |
+
"is_mu_law": true,
|
64 |
+
"bits": 8,
|
65 |
+
"mel_min_max_stats_dir": "mel_min_max_stats",
|
66 |
+
"whisper_dir": "whisper",
|
67 |
+
"contentvec_dir": "contentvec",
|
68 |
+
"wenet_dir": "wenet",
|
69 |
+
"mert_dir": "mert",
|
70 |
+
// Extract content features using dataloader
|
71 |
+
"pin_memory": true,
|
72 |
+
"num_workers": 8,
|
73 |
+
"content_feature_batch_size": 16,
|
74 |
+
// Features used for model training
|
75 |
+
"use_mel": true,
|
76 |
+
"use_min_max_norm_mel": true,
|
77 |
+
"use_frame_pitch": true,
|
78 |
+
"use_uv": true,
|
79 |
+
"use_frame_energy": true,
|
80 |
+
"use_log_scale_pitch": false,
|
81 |
+
"use_log_scale_energy": false,
|
82 |
+
"use_spkid": true,
|
83 |
+
// Meta file
|
84 |
+
"train_file": "train.json",
|
85 |
+
"valid_file": "test.json",
|
86 |
+
"spk2id": "singers.json",
|
87 |
+
"utt2spk": "utt2singer"
|
88 |
+
},
|
89 |
+
"model": {
|
90 |
+
"condition_encoder": {
|
91 |
+
"merge_mode": "add",
|
92 |
+
"input_melody_dim": 1,
|
93 |
+
"use_log_f0": true,
|
94 |
+
"n_bins_melody": 256,
|
95 |
+
//# Quantization (0 for not quantization)
|
96 |
+
"output_melody_dim": 384,
|
97 |
+
"input_loudness_dim": 1,
|
98 |
+
"use_log_loudness": true,
|
99 |
+
"n_bins_loudness": 256,
|
100 |
+
"output_loudness_dim": 384,
|
101 |
+
"use_whisper": false,
|
102 |
+
"use_contentvec": false,
|
103 |
+
"use_wenet": false,
|
104 |
+
"use_mert": false,
|
105 |
+
"whisper_dim": 1024,
|
106 |
+
"contentvec_dim": 256,
|
107 |
+
"mert_dim": 256,
|
108 |
+
"wenet_dim": 512,
|
109 |
+
"content_encoder_dim": 384,
|
110 |
+
"output_singer_dim": 384,
|
111 |
+
"singer_table_size": 512,
|
112 |
+
"output_content_dim": 384,
|
113 |
+
"use_spkid": true
|
114 |
+
},
|
115 |
+
// FIXME: FOLLOWING ARE NEW!!
|
116 |
+
"diffusion": {
|
117 |
+
"scheduler": "ddpm",
|
118 |
+
"scheduler_settings": {
|
119 |
+
"num_train_timesteps": 1000,
|
120 |
+
"beta_start": 1.0e-4,
|
121 |
+
"beta_end": 0.02,
|
122 |
+
"beta_schedule": "linear"
|
123 |
+
},
|
124 |
+
// Diffusion steps encoder
|
125 |
+
"step_encoder": {
|
126 |
+
"dim_raw_embedding": 128,
|
127 |
+
"dim_hidden_layer": 512,
|
128 |
+
"activation": "SiLU",
|
129 |
+
"num_layer": 2,
|
130 |
+
"max_period": 10000
|
131 |
+
},
|
132 |
+
// Diffusion decoder
|
133 |
+
"model_type": "bidilconv",
|
134 |
+
// bidilconv, unet2d, TODO: unet1d
|
135 |
+
"bidilconv": {
|
136 |
+
"base_channel": 384,
|
137 |
+
"n_res_block": 20,
|
138 |
+
"conv_kernel_size": 3,
|
139 |
+
"dilation_cycle_length": 4,
|
140 |
+
// specially, 1 means no dilation
|
141 |
+
"conditioner_size": 384
|
142 |
+
},
|
143 |
+
"unet2d": {
|
144 |
+
"in_channels": 1,
|
145 |
+
"out_channels": 1,
|
146 |
+
"down_block_types": [
|
147 |
+
"CrossAttnDownBlock2D",
|
148 |
+
"CrossAttnDownBlock2D",
|
149 |
+
"CrossAttnDownBlock2D",
|
150 |
+
"DownBlock2D"
|
151 |
+
],
|
152 |
+
"mid_block_type": "UNetMidBlock2DCrossAttn",
|
153 |
+
"up_block_types": [
|
154 |
+
"UpBlock2D",
|
155 |
+
"CrossAttnUpBlock2D",
|
156 |
+
"CrossAttnUpBlock2D",
|
157 |
+
"CrossAttnUpBlock2D"
|
158 |
+
],
|
159 |
+
"only_cross_attention": false
|
160 |
+
}
|
161 |
+
}
|
162 |
+
},
|
163 |
+
// FIXME: FOLLOWING ARE NEW!!
|
164 |
+
"train": {
|
165 |
+
// Basic settings
|
166 |
+
"batch_size": 64,
|
167 |
+
"gradient_accumulation_step": 1,
|
168 |
+
"max_epoch": -1,
|
169 |
+
// -1 means no limit
|
170 |
+
"save_checkpoint_stride": [
|
171 |
+
5,
|
172 |
+
20
|
173 |
+
],
|
174 |
+
// unit is epoch
|
175 |
+
"keep_last": [
|
176 |
+
3,
|
177 |
+
-1
|
178 |
+
],
|
179 |
+
// -1 means infinite, if one number will broadcast
|
180 |
+
"run_eval": [
|
181 |
+
false,
|
182 |
+
true
|
183 |
+
],
|
184 |
+
// if one number will broadcast
|
185 |
+
// Fix the random seed
|
186 |
+
"random_seed": 10086,
|
187 |
+
// Batchsampler
|
188 |
+
"sampler": {
|
189 |
+
"holistic_shuffle": true,
|
190 |
+
"drop_last": true
|
191 |
+
},
|
192 |
+
// Dataloader
|
193 |
+
"dataloader": {
|
194 |
+
"num_worker": 32,
|
195 |
+
"pin_memory": true
|
196 |
+
},
|
197 |
+
// Trackers
|
198 |
+
"tracker": [
|
199 |
+
"tensorboard"
|
200 |
+
// "wandb",
|
201 |
+
// "cometml",
|
202 |
+
// "mlflow",
|
203 |
+
],
|
204 |
+
// Optimizer
|
205 |
+
"optimizer": "AdamW",
|
206 |
+
"adamw": {
|
207 |
+
"lr": 4.0e-4
|
208 |
+
// nn model lr
|
209 |
+
},
|
210 |
+
// LR Scheduler
|
211 |
+
"scheduler": "ReduceLROnPlateau",
|
212 |
+
"reducelronplateau": {
|
213 |
+
"factor": 0.8,
|
214 |
+
"patience": 10,
|
215 |
+
// unit is epoch
|
216 |
+
"min_lr": 1.0e-4
|
217 |
+
}
|
218 |
+
},
|
219 |
+
"inference": {
|
220 |
+
"diffusion": {
|
221 |
+
"scheduler": "pndm",
|
222 |
+
"scheduler_settings": {
|
223 |
+
"num_inference_timesteps": 1000
|
224 |
+
}
|
225 |
+
}
|
226 |
+
}
|
227 |
+
}
|
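For orientation, here is a minimal sketch of how the `scheduler`/`scheduler_settings` fields above could map onto the `diffusers` schedulers they name (`ddpm` for training, `pndm` for inference). This is an illustrative reading of the config, not Amphion's actual training loop; the variable names and the toy mel tensor are assumptions.

```python
# Sketch only: turn the "diffusion" and "inference" blocks above into
# diffusers scheduler objects. Not Amphion's actual wiring.
import torch
from diffusers import DDPMScheduler, PNDMScheduler

# Training-time scheduler, mirroring "scheduler": "ddpm" and its settings.
train_scheduler = DDPMScheduler(
    num_train_timesteps=1000,
    beta_start=1.0e-4,
    beta_end=0.02,
    beta_schedule="linear",
)

# Forward-diffuse a (fake) mel-spectrogram batch to random timesteps.
mel = torch.randn(4, 100, 256)   # [batch, n_mel, frames], toy data
noise = torch.randn_like(mel)
t = torch.randint(0, train_scheduler.config.num_train_timesteps, (mel.shape[0],))
noisy_mel = train_scheduler.add_noise(mel, noise, t)  # the model would learn to predict `noise`

# Inference-time scheduler, mirroring "inference.diffusion.scheduler": "pndm".
infer_scheduler = PNDMScheduler(num_train_timesteps=1000, beta_schedule="linear")
infer_scheduler.set_timesteps(num_inference_steps=1000)  # "num_inference_timesteps"
print(noisy_mel.shape, len(infer_scheduler.timesteps))
```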
config/fs2.json
ADDED
@@ -0,0 +1,117 @@
{
    "base_config": "config/tts.json",
    "model_type": "FastSpeech2",
    "task_type": "tts",
    "dataset": ["LJSpeech"],
    "preprocess": {
        // acoustic features
        "extract_audio": true,
        "extract_mel": true,
        "mel_extract_mode": "taco",
        "mel_min_max_norm": false,
        "extract_pitch": true,
        "extract_uv": false,
        "pitch_extractor": "dio",
        "extract_energy": true,
        "energy_extract_mode": "from_tacotron_stft",
        "extract_duration": true,
        "use_phone": true,
        "pitch_norm": true,
        "energy_norm": true,
        "pitch_remove_outlier": true,
        "energy_remove_outlier": true,

        // Default config
        "n_mel": 80,
        "win_size": 1024, // todo
        "hop_size": 256,
        "sample_rate": 22050,
        "n_fft": 1024, // todo
        "fmin": 0,
        "fmax": 8000, // todo
        "raw_data": "raw_data",
        "text_cleaners": ["english_cleaners"],
        "f0_min": 71, // ~C2
        "f0_max": 800, //1100, // ~C6(1100), ~G5(800)
        "pitch_bin": 256,
        "pitch_max": 1100.0,
        "pitch_min": 50.0,
        "is_label": true,
        "is_mu_law": true,
        "bits": 8,

        "mel_min_max_stats_dir": "mel_min_max_stats",
        "whisper_dir": "whisper",
        "content_vector_dir": "content_vector",
        "wenet_dir": "wenet",
        "mert_dir": "mert",
        "spk2id": "spk2id.json",
        "utt2spk": "utt2spk",

        // Features used for model training
        "use_mel": true,
        "use_min_max_norm_mel": false,
        "use_frame_pitch": false,
        "use_frame_energy": false,
        "use_phone_pitch": true,
        "use_phone_energy": true,
        "use_log_scale_pitch": false,
        "use_log_scale_energy": false,
        "use_spkid": false,
        "align_mel_duration": true,
        "text_cleaners": ["english_cleaners"]
    },
    "model": {
        // Settings for transformer
        "transformer": {
            "encoder_layer": 4,
            "encoder_head": 2,
            "encoder_hidden": 256,
            "decoder_layer": 6,
            "decoder_head": 2,
            "decoder_hidden": 256,
            "conv_filter_size": 1024,
            "conv_kernel_size": [9, 1],
            "encoder_dropout": 0.2,
            "decoder_dropout": 0.2
        },

        // Settings for variance_predictor
        "variance_predictor": {
            "filter_size": 256,
            "kernel_size": 3,
            "dropout": 0.5
        },
        "variance_embedding": {
            "pitch_quantization": "linear", // support 'linear' or 'log', 'log' is allowed only if the pitch values are not normalized during preprocessing
            "energy_quantization": "linear", // support 'linear' or 'log', 'log' is allowed only if the energy values are not normalized during preprocessing
            "n_bins": 256
        },
        "max_seq_len": 1000
    },
    "train": {
        "batch_size": 16,
        "sort_sample": true,
        "drop_last": true,
        "group_size": 4,
        "grad_clip_thresh": 1.0,
        "dataloader": {
            "num_worker": 8,
            "pin_memory": true
        },
        "lr_scheduler": {
            "num_warmup": 4000
        },
        // LR Scheduler
        "scheduler": "NoamLR",
        // Optimizer
        "optimizer": "Adam",
        "adam": {
            "lr": 0.0625,
            "betas": [0.9, 0.98],
            "eps": 0.000000001,
            "weight_decay": 0.0
        },
    }

}
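The `variance_embedding` block in `config/fs2.json` above is the standard FastSpeech2 device of bucketizing pitch/energy values into `n_bins` bins and looking each bucket up in an embedding table, with `pitch_quantization` choosing linear or log-spaced bin edges. A minimal PyTorch sketch of that idea, assuming 256 bins, a 256-dim hidden size, and the pitch range from the config; everything else (class and variable names) is illustrative:

```python
# Sketch of FastSpeech2-style variance embedding: quantize a pitch/energy
# contour into n_bins buckets and embed each bucket. Illustrative only.
import math
import torch
import torch.nn as nn

class VarianceEmbedding(nn.Module):
    def __init__(self, n_bins=256, hidden=256, vmin=50.0, vmax=1100.0, quantization="linear"):
        super().__init__()
        if quantization == "log":
            bins = torch.exp(torch.linspace(math.log(vmin), math.log(vmax), n_bins - 1))
        else:  # "linear"
            bins = torch.linspace(vmin, vmax, n_bins - 1)
        self.register_buffer("bins", bins)          # bin edges
        self.embedding = nn.Embedding(n_bins, hidden)

    def forward(self, values):                       # values: [batch, length], e.g. pitch in Hz
        idx = torch.bucketize(values, self.bins)     # bucket index per frame/phone
        return self.embedding(idx)                   # [batch, length, hidden]

emb = VarianceEmbedding()
pitch = torch.rand(2, 100) * 300 + 71                # toy pitch contour
print(emb(pitch).shape)                              # torch.Size([2, 100, 256])
```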
config/transformer.json
ADDED
@@ -0,0 +1,180 @@
{
    "base_config": "config/base.json",
    "model_type": "Transformer",
    "task_type": "svc",
    "use_custom_dataset": false,
    "preprocess": {
        // data augmentations
        "use_pitch_shift": false,
        "use_formant_shift": false,
        "use_time_stretch": false,
        "use_equalizer": false,
        // acoustic features
        "extract_mel": true,
        "mel_min_max_norm": true,
        "extract_pitch": true,
        "pitch_extractor": "parselmouth",
        "extract_uv": true,
        "extract_energy": true,
        // content features
        "extract_whisper_feature": false,
        "whisper_sample_rate": 16000,
        "extract_contentvec_feature": false,
        "contentvec_sample_rate": 16000,
        "extract_wenet_feature": false,
        "wenet_sample_rate": 16000,
        "extract_mert_feature": false,
        "mert_sample_rate": 16000,
        // Default config for whisper
        "whisper_frameshift": 0.01,
        "whisper_downsample_rate": 2,
        // Default config for content vector
        "contentvec_frameshift": 0.02,
        // Default config for mert
        "mert_model": "m-a-p/MERT-v1-330M",
        "mert_feature_layer": -1,
        "mert_hop_size": 320,
        // 24k
        "mert_frameshit": 0.01333,
        // 10ms
        "wenet_frameshift": 0.01,
        // wenetspeech is 4, gigaspeech is 6
        "wenet_downsample_rate": 4,
        // Default config
        "n_mel": 100,
        "win_size": 1024,
        // todo
        "hop_size": 256,
        "sample_rate": 24000,
        "n_fft": 1024,
        // todo
        "fmin": 0,
        "fmax": 12000,
        // todo
        "f0_min": 50,
        // ~C2
        "f0_max": 1100,
        //1100, // ~C6(1100), ~G5(800)
        "pitch_bin": 256,
        "pitch_max": 1100.0,
        "pitch_min": 50.0,
        "is_label": true,
        "is_mu_law": true,
        "bits": 8,
        "mel_min_max_stats_dir": "mel_min_max_stats",
        "whisper_dir": "whisper",
        "contentvec_dir": "contentvec",
        "wenet_dir": "wenet",
        "mert_dir": "mert",
        // Extract content features using dataloader
        "pin_memory": true,
        "num_workers": 8,
        "content_feature_batch_size": 16,
        // Features used for model training
        "use_mel": true,
        "use_min_max_norm_mel": true,
        "use_frame_pitch": true,
        "use_uv": true,
        "use_frame_energy": true,
        "use_log_scale_pitch": false,
        "use_log_scale_energy": false,
        "use_spkid": true,
        // Meta file
        "train_file": "train.json",
        "valid_file": "test.json",
        "spk2id": "singers.json",
        "utt2spk": "utt2singer"
    },
    "model": {
        "condition_encoder": {
            "merge_mode": "add",
            "input_melody_dim": 1,
            "use_log_f0": true,
            "n_bins_melody": 256,
            //# Quantization (0 for not quantization)
            "output_melody_dim": 384,
            "input_loudness_dim": 1,
            "use_log_loudness": true,
            "n_bins_loudness": 256,
            "output_loudness_dim": 384,
            "use_whisper": false,
            "use_contentvec": true,
            "use_wenet": false,
            "use_mert": false,
            "whisper_dim": 1024,
            "contentvec_dim": 256,
            "mert_dim": 256,
            "wenet_dim": 512,
            "content_encoder_dim": 384,
            "output_singer_dim": 384,
            "singer_table_size": 512,
            "output_content_dim": 384,
            "use_spkid": true
        },
        "transformer": {
            "type": "conformer",
            // 'conformer' or 'transformer'
            "input_dim": 384,
            "output_dim": 100,
            "n_heads": 2,
            "n_layers": 6,
            "filter_channels": 512,
            "dropout": 0.1,
        }
    },
    "train": {
        // Basic settings
        "batch_size": 64,
        "gradient_accumulation_step": 1,
        "max_epoch": -1,
        // -1 means no limit
        "save_checkpoint_stride": [
            10,
            100
        ],
        // unit is epoch
        "keep_last": [
            3,
            -1
        ],
        // -1 means infinite, if one number will broadcast
        "run_eval": [
            false,
            true
        ],
        // if one number will broadcast
        // Fix the random seed
        "random_seed": 10086,
        // Batchsampler
        "sampler": {
            "holistic_shuffle": true,
            "drop_last": true
        },
        // Dataloader
        "dataloader": {
            "num_worker": 32,
            "pin_memory": true
        },
        // Trackers
        "tracker": [
            "tensorboard"
            // "wandb",
            // "cometml",
            // "mlflow",
        ],
        // Optimizer
        "optimizer": "AdamW",
        "adamw": {
            "lr": 4.0e-4
            // nn model lr
        },
        // LR Scheduler
        "scheduler": "ReduceLROnPlateau",
        "reducelronplateau": {
            "factor": 0.8,
            "patience": 10,
            // unit is epoch
            "min_lr": 1.0e-4
        }
    }
}
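The `condition_encoder` block above describes merging quantized melody (log-F0), loudness, a content feature (here ContentVec, 256-dim) and a singer look-up embedding into a single 384-dim conditioning stream, with `"merge_mode": "add"` meaning the projected features are summed frame by frame. A rough sketch of that wiring under those assumptions; the real Amphion module also handles uv masks, quantization details, and optional Whisper/WeNet/MERT inputs:

```python
# Rough sketch of an "add"-style condition encoder: every conditioning stream
# is mapped to output dim 384 and the projections are summed. Illustrative only.
import torch
import torch.nn as nn

class AddConditionEncoder(nn.Module):
    def __init__(self, content_dim=256, out_dim=384, n_bins_melody=256,
                 n_bins_loudness=256, n_singers=512):
        super().__init__()
        self.melody_emb = nn.Embedding(n_bins_melody, out_dim)     # quantized log-F0 bins
        self.loudness_emb = nn.Embedding(n_bins_loudness, out_dim) # quantized loudness bins
        self.content_proj = nn.Linear(content_dim, out_dim)        # e.g. ContentVec 256 -> 384
        self.singer_emb = nn.Embedding(n_singers, out_dim)         # singer look-up table

    def forward(self, melody_ids, loudness_ids, content, singer_id):
        # melody_ids/loudness_ids: [B, T] ints; content: [B, T, content_dim]; singer_id: [B]
        cond = self.melody_emb(melody_ids) + self.loudness_emb(loudness_ids)
        cond = cond + self.content_proj(content)
        cond = cond + self.singer_emb(singer_id).unsqueeze(1)       # broadcast over time
        return cond                                                 # [B, T, 384]

enc = AddConditionEncoder()
out = enc(torch.randint(0, 256, (2, 50)), torch.randint(0, 256, (2, 50)),
          torch.randn(2, 50, 256), torch.tensor([3, 7]))
print(out.shape)  # torch.Size([2, 50, 384])
```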
config/tts.json
ADDED
@@ -0,0 +1,26 @@
{
    "base_config": "config/base.json",
    "supported_model_type": [
        "Fastspeech2",
        "VITS",
        "VALLE",
    ],
    "task_type": "tts",
    "preprocess": {
        "language": "en-us",
        // linguistic features
        "extract_phone": true,
        "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)"
        "lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
        // Directory names of processed data or extracted features
        "phone_dir": "phones",
        "use_phone": true,
        // "spk2id": "spk2id.json", // used for multi-speaker dataset
        // "utt2spk": "utt2spk", // used for multi-speaker dataset
        "add_blank": true
    },
    "model": {
        "text_token_num": 512,
    }

}
config/valle.json
ADDED
@@ -0,0 +1,52 @@
{
    "base_config": "config/tts.json",
    "model_type": "VALLE",
    "task_type": "tts",
    "dataset": [
        "libritts"
    ],
    "preprocess": {
        "extract_phone": true,
        "phone_extractor": "espeak", // phoneme extractor: espeak, pypinyin, pypinyin_initials_finals or lexicon
        "extract_acoustic_token": true,
        "acoustic_token_extractor": "Encodec", // acoustic token extractor: encodec, dac(todo)
        "acoustic_token_dir": "acoutic_tokens",
        "use_text": false,
        "use_phone": true,
        "use_acoustic_token": true,
        "symbols_dict": "symbols.dict",
        "min_duration": 0.5, // the duration lower bound to filter the audio with duration < min_duration
        "max_duration": 14, // the duration upper bound to filter the audio with duration > max_duration
        "sampling_rate": 24000,
    },
    "model": {
        "text_token_num": 512,
        "audio_token_num": 1024,
        "decoder_dim": 1024, // embedding dimension of the decoder model
        "nhead": 16, // number of attention heads in the decoder layers
        "num_decoder_layers": 12, // number of decoder layers
        "norm_first": true, // pre or post Normalization.
        "add_prenet": false, // whether add PreNet after Inputs
        "prefix_mode": 0, // mode for how to prefix VALL-E NAR Decoder, 0: no prefix, 1: 0 to random, 2: random to random, 4: chunk of pre or post utterance
        "share_embedding": true, // share the parameters of the output projection layer with the parameters of the acoustic embedding
        "nar_scale_factor": 1, // model scale factor which will be assigned different meanings in different models
        "prepend_bos": false, // whether prepend <BOS> to the acoustic tokens -> AR Decoder inputs
        "num_quantizers": 8, // number of the audio quantization layers
        // "scaling_xformers": false, // Apply Reworked Conformer scaling on Transformers
    },
    "train": {
        "ddp": false,
        "train_stage": 1, // 0: train all modules, For VALL_E, support 1: AR Decoder 2: NAR Decoder(s)
        "max_epoch": 20,
        "optimizer": "ScaledAdam",
        "scheduler": "Eden",
        "warmup_steps": 200, // number of steps that affects how rapidly the learning rate decreases
        "base_lr": 0.05, // base learning rate
        "valid_interval": 1000,
        "log_epoch_step": 1000,
        "save_checkpoint_stride": [
            1,
            1
        ]
    }
}
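In `config/valle.json`, `"acoustic_token_extractor": "Encodec"` together with `"num_quantizers": 8` and 24 kHz audio corresponds to extracting eight codebook streams of discrete tokens per frame (6 kbps at 24 kHz yields 8 quantizers in EnCodec). A hedged sketch using the `encodec` package, assuming a mono 24 kHz waveform tensor; Amphion's own extractor may wrap this differently:

```python
# Sketch: extract EnCodec acoustic tokens roughly matching the config above
# (24 kHz model, 8 quantizers <-> 6 kbps). Illustrative, not Amphion's extractor.
import torch
from encodec import EncodecModel

model = EncodecModel.encodec_model_24khz()
model.set_target_bandwidth(6.0)             # 6 kbps -> 8 codebooks at 24 kHz

wav = torch.randn(1, 1, 24000 * 3)          # [batch, channels, samples], toy 3-second clip
with torch.no_grad():
    encoded_frames = model.encode(wav)      # list of (codes, scale) tuples
codes = torch.cat([c for c, _ in encoded_frames], dim=-1)
print(codes.shape)                           # [1, 8, n_frames]: 8 quantizer streams
```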
config/vits.json
ADDED
@@ -0,0 +1,101 @@
{
    "base_config": "config/tts.json",
    "model_type": "VITS",
    "task_type": "tts",
    "preprocess": {
        "extract_phone": true,
        "extract_mel": true,
        "n_mel": 80,
        "fmin": 0,
        "fmax": null,
        "extract_linear_spec": true,
        "extract_audio": true,
        "use_linear": true,
        "use_mel": true,
        "use_audio": true,
        "use_text": false,
        "use_phone": true,
        "lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
        "n_fft": 1024,
        "win_size": 1024,
        "hop_size": 256,
        "segment_size": 8192,
        "text_cleaners": [
            "english_cleaners"
        ]
    },
    "model": {
        "text_token_num": 512,
        "inter_channels": 192,
        "hidden_channels": 192,
        "filter_channels": 768,
        "n_heads": 2,
        "n_layers": 6,
        "kernel_size": 3,
        "p_dropout": 0.1,
        "resblock": "1",
        "resblock_kernel_sizes": [
            3,
            7,
            11
        ],
        "resblock_dilation_sizes": [
            [
                1,
                3,
                5
            ],
            [
                1,
                3,
                5
            ],
            [
                1,
                3,
                5
            ]
        ],
        "upsample_rates": [
            8,
            8,
            2,
            2
        ],
        "upsample_initial_channel": 512,
        "upsample_kernel_sizes": [
            16,
            16,
            4,
            4
        ],
        "n_layers_q": 3,
        "use_spectral_norm": false,
        "n_speakers": 10, // number of speakers, while be automatically set if n_speakers is 0 and multi_speaker_training is true
        "gin_channels": 256,
        "use_sdp": true
    },
    "train": {
        "fp16_run": true,
        "learning_rate": 2e-4,
        "betas": [
            0.8,
            0.99
        ],
        "eps": 1e-9,
        "batch_size": 16,
        "lr_decay": 0.999875,
        // "segment_size": 8192,
        "init_lr_ratio": 1,
        "warmup_epochs": 0,
        "c_mel": 45,
        "c_kl": 1.0,
        "AdamW": {
            "betas": [
                0.8,
                0.99
            ],
            "eps": 1e-9,
        }
    }
}
config/vocoder.json
ADDED
@@ -0,0 +1,84 @@
{
    "base_config": "config/base.json",
    "dataset": [
        "LJSpeech",
        "LibriTTS",
        "opencpop",
        "m4singer",
        "svcc",
        "svcceval",
        "pjs",
        "opensinger",
        "popbutfy",
        "nus48e",
        "popcs",
        "kising",
        "csd",
        "opera",
        "vctk",
        "lijian",
        "cdmusiceval"
    ],
    "task_type": "vocoder",
    "preprocess": {
        // acoustic features
        "extract_mel": true,
        "extract_pitch": false,
        "extract_uv": false,
        "extract_audio": true,
        "extract_label": false,
        "extract_one_hot": false,
        "extract_amplitude_phase": false,
        "pitch_extractor": "parselmouth",
        // Settings for data preprocessing
        "n_mel": 100,
        "win_size": 1024,
        "hop_size": 256,
        "sample_rate": 24000,
        "n_fft": 1024,
        "fmin": 0,
        "fmax": 12000,
        "f0_min": 50,
        "f0_max": 1100,
        "pitch_bin": 256,
        "pitch_max": 1100.0,
        "pitch_min": 50.0,
        "is_mu_law": false,
        "bits": 8,
        "cut_mel_frame": 32,
        // Directory names of processed data or extracted features
        "spk2id": "singers.json",
        // Features used for model training
        "use_mel": true,
        "use_frame_pitch": false,
        "use_uv": false,
        "use_audio": true,
        "use_label": false,
        "use_one_hot": false,
        "train_file": "train.json",
        "valid_file": "test.json"
    },
    "train": {
        "random_seed": 114514,
        "batch_size": 64,
        "gradient_accumulation_step": 1,
        "max_epoch": 1000000,
        "save_checkpoint_stride": [
            20
        ],
        "run_eval": [
            true
        ],
        "sampler": {
            "holistic_shuffle": true,
            "drop_last": true
        },
        "dataloader": {
            "num_worker": 4,
            "pin_memory": true
        },
        "tracker": [
            "tensorboard"
        ],
    }
}
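In `config/vocoder.json`, `"cut_mel_frame": 32` together with `"hop_size": 256` implies that vocoder training crops a random 32-frame mel window and the aligned 32 × 256 = 8192-sample audio chunk. A small sketch of that pairing; the array and function names are assumptions, not Amphion's data loader:

```python
# Sketch: crop a random mel segment and its aligned waveform chunk for vocoder
# training, following cut_mel_frame=32 and hop_size=256 above. Illustrative only.
import numpy as np

def random_mel_audio_segment(mel, audio, cut_mel_frame=32, hop_size=256, rng=None):
    """mel: [n_mel, T_frames]; audio: [T_frames * hop_size] samples, already aligned."""
    rng = rng or np.random.default_rng()
    start = rng.integers(0, mel.shape[1] - cut_mel_frame + 1)
    mel_seg = mel[:, start:start + cut_mel_frame]
    audio_seg = audio[start * hop_size:(start + cut_mel_frame) * hop_size]
    return mel_seg, audio_seg

mel = np.random.randn(100, 120)                  # n_mel=100, 120 frames (toy)
audio = np.random.randn(120 * 256)
m, a = random_mel_audio_segment(mel, audio)
print(m.shape, a.shape)                           # (100, 32) (8192,)
```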
egs/datasets/README.md
ADDED
@@ -0,0 +1,381 @@
# Datasets Format

Amphion supports the following academic datasets (sorted alphabetically):

- [Datasets Format](#datasets-format)
  - [AudioCaps](#audiocaps)
  - [CSD](#csd)
  - [KiSing](#kising)
  - [LibriTTS](#libritts)
  - [LJSpeech](#ljspeech)
  - [M4Singer](#m4singer)
  - [NUS-48E](#nus-48e)
  - [Opencpop](#opencpop)
  - [OpenSinger](#opensinger)
  - [Opera](#opera)
  - [PopBuTFy](#popbutfy)
  - [PopCS](#popcs)
  - [PJS](#pjs)
  - [SVCC](#svcc)
  - [VCTK](#vctk)

The download link and the file structure tree of each dataset are displayed as follows.

## AudioCaps

AudioCaps is a dataset of around 44K audio-caption pairs, where each audio clip corresponds to a caption with rich semantic information. You can download the dataset [here](https://github.com/cdjkim/audiocaps). The file structure tree is like:

```plaintext
[AudioCaps dataset path]
┣ AudioCaps
┃ ┣ wav
┃ ┃ ┣ ---1_cCGK4M_0_10000.wav
┃ ┃ ┣ ---lTs1dxhU_30000_40000.wav
┃ ┃ ┣ ...
```

## CSD

The official CSD dataset can be downloaded [here](https://zenodo.org/records/4785016). The file structure tree is like:

```plaintext
[CSD dataset path]
┣ english
┣ korean
┣ utterances
┃ ┣ en001a
┃ ┃ ┣ {UtteranceID}.wav
┃ ┣ en001b
┃ ┣ en002a
┃ ┣ en002b
┃ ┣ ...
┣ README
```

## KiSing

The official KiSing dataset can be downloaded [here](http://shijt.site/index.php/2021/05/16/kising-the-first-open-source-mandarin-singing-voice-synthesis-corpus/). The file structure tree is like:

```plaintext
[KiSing dataset path]
┣ clean
┃ ┣ 421
┃ ┣ 422
┃ ┣ ...
```

## LibriTTS

The official LibriTTS dataset can be downloaded [here](https://www.openslr.org/60/). The file structure tree is like:

```plaintext
[LibriTTS dataset path]
┣ BOOKS.txt
┣ CHAPTERS.txt
┣ eval_sentences10.tsv
┣ LICENSE.txt
┣ NOTE.txt
┣ reader_book.tsv
┣ README_librispeech.txt
┣ README_libritts.txt
┣ speakers.tsv
┣ SPEAKERS.txt
┣ dev-clean (Subset)
┃ ┣ 1272{Speaker_ID}
┃ ┃ ┣ 128104 {Chapter_ID}
┃ ┃ ┃ ┣ 1272_128104_000001_000000.normalized.txt
┃ ┃ ┃ ┣ 1272_128104_000001_000000.original.txt
┃ ┃ ┃ ┣ 1272_128104_000001_000000.wav
┃ ┃ ┃ ┣ ...
┃ ┃ ┃ ┣ 1272_128104.book.tsv
┃ ┃ ┃ ┣ 1272_128104.trans.tsv
┃ ┃ ┣ ...
┃ ┣ ...
┣ dev-other (Subset)
┃ ┣ 116 (Speaker)
┃ ┃ ┣ 288045 {Chapter_ID}
┃ ┃ ┃ ┣ 116_288045_000003_000000.normalized.txt
┃ ┃ ┃ ┣ 116_288045_000003_000000.original.txt
┃ ┃ ┃ ┣ 116_288045_000003_000000.wav
┃ ┃ ┃ ┣ ...
┃ ┃ ┃ ┣ 116_288045.book.tsv
┃ ┃ ┃ ┣ 116_288045.trans.tsv
┃ ┃ ┣ ...
┃ ┣ ...
┃ ┣ ...
┣ test-clean (Subset)
┃ ┣ {Speaker_ID}
┃ ┃ ┣ {Chapter_ID}
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
┃ ┃ ┃ ┣ ...
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
┃ ┃ ┣ ...
┃ ┣ ...
┣ test-other
┃ ┣ {Speaker_ID}
┃ ┃ ┣ {Chapter_ID}
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
┃ ┃ ┃ ┣ ...
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
┃ ┃ ┣ ...
┃ ┣ ...
┣ train-clean-100
┃ ┣ {Speaker_ID}
┃ ┃ ┣ {Chapter_ID}
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
┃ ┃ ┃ ┣ ...
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
┃ ┃ ┣ ...
┃ ┣ ...
┣ train-clean-360
┃ ┣ {Speaker_ID}
┃ ┃ ┣ {Chapter_ID}
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
┃ ┃ ┃ ┣ ...
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
┃ ┃ ┣ ...
┃ ┣ ...
┣ train-other-500
┃ ┣ {Speaker_ID}
┃ ┃ ┣ {Chapter_ID}
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.normalized.txt
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.original.txt
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}_{Utterance_ID}.wav
┃ ┃ ┃ ┣ ...
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.book.tsv
┃ ┃ ┃ ┣ {Speaker_ID}_{Chapter_ID}.trans.tsv
┃ ┃ ┣ ...
┃ ┣ ...
```


## LJSpeech

The official LJSpeech dataset can be downloaded [here](https://keithito.com/LJ-Speech-Dataset/). The file structure tree is like:

```plaintext
[LJSpeech dataset path]
┣ metadata.csv
┣ wavs
┃ ┣ LJ001-0001.wav
┃ ┣ LJ001-0002.wav
┃ ┣ ...
┣ README
```

## M4Singer

The official M4Singer dataset can be downloaded [here](https://drive.google.com/file/d/1xC37E59EWRRFFLdG3aJkVqwtLDgtFNqW/view). The file structure tree is like:

```plaintext
[M4Singer dataset path]
┣ {Singer_1}#{Song_1}
┃ ┣ 0000.mid
┃ ┣ 0000.TextGrid
┃ ┣ 0000.wav
┃ ┣ ...
┣ {Singer_1}#{Song_2}
┣ ...
┣ {Singer_2}#{Song_1}
┣ {Singer_2}#{Song_2}
┣ ...
┗ meta.json
```

## NUS-48E

The official NUS-48E dataset can be downloaded [here](https://drive.google.com/drive/folders/12pP9uUl0HTVANU3IPLnumTJiRjPtVUMx). The file structure tree is like:

```plaintext
[NUS-48E dataset path]
┣ {SpeakerID}
┃ ┣ read
┃ ┃ ┣ {SongID}.txt
┃ ┃ ┣ {SongID}.wav
┃ ┃ ┣ ...
┃ ┣ sing
┃ ┃ ┣ {SongID}.txt
┃ ┃ ┣ {SongID}.wav
┃ ┃ ┣ ...
┣ ...
┣ README.txt

```

## Opencpop

The official Opencpop dataset can be downloaded [here](https://wenet.org.cn/opencpop/). The file structure tree is like:

```plaintext
[Opencpop dataset path]
┣ midis
┃ ┣ 2001.midi
┃ ┣ 2002.midi
┃ ┣ 2003.midi
┃ ┣ ...
┣ segments
┃ ┣ wavs
┃ ┃ ┣ 2001000001.wav
┃ ┃ ┣ 2001000002.wav
┃ ┃ ┣ 2001000003.wav
┃ ┃ ┣ ...
┃ ┣ test.txt
┃ ┣ train.txt
┃ ┗ transcriptions.txt
┣ textgrids
┃ ┣ 2001.TextGrid
┃ ┣ 2002.TextGrid
┃ ┣ 2003.TextGrid
┃ ┣ ...
┣ wavs
┃ ┣ 2001.wav
┃ ┣ 2002.wav
┃ ┣ 2003.wav
┃ ┣ ...
┣ TERMS_OF_ACCESS
┗ readme.md
```

## OpenSinger

The official OpenSinger dataset can be downloaded [here](https://drive.google.com/file/d/1EofoZxvalgMjZqzUEuEdleHIZ6SHtNuK/view). The file structure tree is like:

```plaintext
[OpenSinger dataset path]
┣ ManRaw
┃ ┣ {Singer_1}_{Song_1}
┃ ┃ ┣ {Singer_1}_{Song_1}_0.lab
┃ ┃ ┣ {Singer_1}_{Song_1}_0.txt
┃ ┃ ┣ {Singer_1}_{Song_1}_0.wav
┃ ┃ ┣ ...
┃ ┣ {Singer_1}_{Song_2}
┃ ┣ ...
┣ WomanRaw
┣ LICENSE
┗ README.md
```

## Opera

The official Opera dataset can be downloaded [here](http://isophonics.net/SingingVoiceDataset). The file structure tree is like:

```plaintext
[Opera dataset path]
┣ monophonic
┃ ┣ chinese
┃ ┃ ┣ {Gender}_{SingerID}
┃ ┃ ┃ ┣ {Emotion}_{SongID}.wav
┃ ┃ ┃ ┣ ...
┃ ┃ ┣ ...
┃ ┣ western
┣ polyphonic
┃ ┣ chinese
┃ ┣ western
┣ CrossculturalDataSet.xlsx
```

## PopBuTFy

The official PopBuTFy dataset can be downloaded [here](https://github.com/MoonInTheRiver/NeuralSVB). The file structure tree is like:

```plaintext
[PopBuTFy dataset path]
┣ data
┃ ┣ {SingerID}#singing#{SongName}_Amateur
┃ ┃ ┣ {SingerID}#singing#{SongName}_Amateur_{UtteranceID}.mp3
┃ ┃ ┣ ...
┃ ┣ {SingerID}#singing#{SongName}_Professional
┃ ┃ ┣ {SingerID}#singing#{SongName}_Professional_{UtteranceID}.mp3
┃ ┃ ┣ ...
┣ text_labels
┗ TERMS_OF_ACCESS
```

## PopCS

The official PopCS dataset can be downloaded [here](https://github.com/MoonInTheRiver/DiffSinger/blob/master/resources/apply_form.md). The file structure tree is like:

```plaintext
[PopCS dataset path]
┣ popcs
┃ ┣ popcs-{SongName}
┃ ┃ ┣ {UtteranceID}_ph.txt
┃ ┃ ┣ {UtteranceID}_wf0.wav
┃ ┃ ┣ {UtteranceID}.TextGrid
┃ ┃ ┣ {UtteranceID}.txt
┃ ┃ ┣ ...
┃ ┣ ...
┗ TERMS_OF_ACCESS
```

## PJS

The official PJS dataset can be downloaded [here](https://sites.google.com/site/shinnosuketakamichi/research-topics/pjs_corpus). The file structure tree is like:

```plaintext
[PJS dataset path]
┣ PJS_corpus_ver1.1
┃ ┣ background_noise
┃ ┣ pjs{SongID}
┃ ┃ ┣ pjs{SongID}_song.wav
┃ ┃ ┣ pjs{SongID}_speech.wav
┃ ┃ ┣ pjs{SongID}.lab
┃ ┃ ┣ pjs{SongID}.mid
┃ ┃ ┣ pjs{SongID}.musicxml
┃ ┃ ┣ pjs{SongID}.txt
┃ ┣ ...
```

## SVCC

The official SVCC dataset can be downloaded [here](https://github.com/lesterphillip/SVCC23_FastSVC/tree/main/egs/generate_dataset). The file structure tree is like:

```plaintext
[SVCC dataset path]
┣ Data
┃ ┣ CDF1
┃ ┃ ┣ 10001.wav
┃ ┃ ┣ 10002.wav
┃ ┃ ┣ ...
┃ ┣ CDM1
┃ ┣ IDF1
┃ ┣ IDM1
┗ README.md
```

## VCTK

The official VCTK dataset can be downloaded [here](https://datashare.ed.ac.uk/handle/10283/3443). The file structure tree is like:

```plaintext
[VCTK dataset path]
┣ txt
┃ ┣ {Speaker_1}
┃ ┃ ┣ {Speaker_1}_001.txt
┃ ┃ ┣ {Speaker_1}_002.txt
┃ ┃ ┣ ...
┃ ┣ {Speaker_2}
┃ ┣ ...
┣ wav48_silence_trimmed
┃ ┣ {Speaker_1}
┃ ┃ ┣ {Speaker_1}_001_mic1.flac
┃ ┃ ┣ {Speaker_1}_001_mic2.flac
┃ ┃ ┣ {Speaker_1}_002_mic1.flac
┃ ┃ ┣ ...
┃ ┣ {Speaker_2}
┃ ┣ ...
┣ speaker-info.txt
┗ update.txt
```
egs/metrics/README.md
ADDED
@@ -0,0 +1,93 @@
# Amphion Evaluation Recipe

## Supported Evaluation Metrics

Until now, Amphion Evaluation has supported the following objective metrics:

- **F0 Modeling**:
  - F0 Pearson Coefficients (FPC)
  - F0 Periodicity Root Mean Square Error (PeriodicityRMSE)
  - F0 Root Mean Square Error (F0RMSE)
  - Voiced/Unvoiced F1 Score (V/UV F1)
- **Energy Modeling**:
  - Energy Root Mean Square Error (EnergyRMSE)
  - Energy Pearson Coefficients (EnergyPC)
- **Intelligibility**:
  - Character Error Rate (CER) based on [Whisper](https://github.com/openai/whisper)
  - Word Error Rate (WER) based on [Whisper](https://github.com/openai/whisper)
- **Spectrogram Distortion**:
  - Frechet Audio Distance (FAD)
  - Mel Cepstral Distortion (MCD)
  - Multi-Resolution STFT Distance (MSTFT)
  - Perceptual Evaluation of Speech Quality (PESQ)
  - Short Time Objective Intelligibility (STOI)
  - Scale Invariant Signal to Distortion Ratio (SISDR)
  - Scale Invariant Signal to Noise Ratio (SISNR)
- **Speaker Similarity**:
  - Cosine similarity based on [Rawnet3](https://github.com/Jungjee/RawNet)
  - Cosine similarity based on [WeSpeaker](https://github.com/wenet-e2e/wespeaker) (👨💻 developing)

We provide a recipe to demonstrate how to objectively evaluate your generated audios. There are three steps in total:

1. Pretrained Models Preparation
2. Audio Data Preparation
3. Evaluation

## 1. Pretrained Models Preparation

If you want to calculate `RawNet3` based speaker similarity, you need to download the pretrained model first, as illustrated [here](../../pretrained/README.md).

## 2. Audio Data Preparation

Prepare reference audios and generated audios in two folders: the `ref_dir` contains the reference audios and the `gen_dir` contains the generated audios. Here is an example.

```plaintext
┣ {ref_dir}
┃ ┣ sample1.wav
┃ ┣ sample2.wav
┣ {gen_dir}
┃ ┣ sample1.wav
┃ ┣ sample2.wav
```

You have to make sure that the pairwise **reference audio and generated audio are named the same**, as illustrated above (sample1 to sample1, sample2 to sample2).

## 3. Evaluation

Run `run.sh` with the specified reference folder, generated folder, dump folder, and metrics.

```bash
cd Amphion
sh egs/metrics/run.sh \
    --reference_folder [Your path to the reference audios] \
    --generated_folder [Your path to the generated audios] \
    --dump_folder [Your path to dump the objective results] \
    --metrics [The metrics you need] \
```

As for the metrics, an example is provided below:

```bash
--metrics "mcd pesq fad"
```

All currently available metrics keywords are listed below:

| Keys | Description |
| --------------------- | ------------------------------------------ |
| `fpc` | F0 Pearson Coefficients |
| `f0_periodicity_rmse` | F0 Periodicity Root Mean Square Error |
| `f0rmse` | F0 Root Mean Square Error |
| `v_uv_f1` | Voiced/Unvoiced F1 Score |
| `energy_rmse` | Energy Root Mean Square Error |
| `energy_pc` | Energy Pearson Coefficients |
| `cer` | Character Error Rate |
| `wer` | Word Error Rate |
| `speaker_similarity` | Cos Similarity based on RawNet3 |
| `fad` | Frechet Audio Distance |
| `mcd` | Mel Cepstral Distortion |
| `mstft` | Multi-Resolution STFT Distance |
| `pesq` | Perceptual Evaluation of Speech Quality |
| `si_sdr` | Scale Invariant Signal to Distortion Ratio |
| `si_snr` | Scale Invariant Signal to Noise Ratio |
| `stoi` | Short Time Objective Intelligibility |
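Since every metric in this recipe is computed pairwise by filename, it can save a failed run to sanity-check that `ref_dir` and `gen_dir` really contain the same stems before calling `run.sh`. A small helper sketch; the example paths and the `.wav`-only assumption are illustrative, and this helper is not part of Amphion itself:

```python
# Sketch: verify that reference and generated folders contain matching filenames
# before launching egs/metrics/run.sh. Illustrative helper only.
from pathlib import Path

def check_pairs(ref_dir: str, gen_dir: str, suffix: str = ".wav") -> None:
    ref = {p.stem for p in Path(ref_dir).glob(f"*{suffix}")}
    gen = {p.stem for p in Path(gen_dir).glob(f"*{suffix}")}
    missing_gen = sorted(ref - gen)
    missing_ref = sorted(gen - ref)
    if missing_gen or missing_ref:
        raise SystemExit(f"Unpaired files. Missing in gen_dir: {missing_gen}; "
                         f"missing in ref_dir: {missing_ref}")
    print(f"{len(ref)} paired utterances found.")

# Example (hypothetical paths): check_pairs("/data/eval/ref", "/data/eval/gen")
```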
egs/metrics/run.sh
ADDED
@@ -0,0 +1,40 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

######## Build Experiment Environment ###########
exp_dir=$(cd `dirname $0`; pwd)
work_dir=$(dirname $(dirname $exp_dir))

export WORK_DIR=$work_dir
export PYTHONPATH=$work_dir
export PYTHONIOENCODING=UTF-8

######## Parse the Given Parameters from the Command ###########
options=$(getopt -o c:n:s --long gpu:,reference_folder:,generated_folder:,dump_folder:,metrics: -- "$@")
eval set -- "$options"

while true; do
  case $1 in
    # Reference Audio Folder
    --reference_folder) shift; ref_dir=$1 ; shift ;;
    # Generated Audio Folder
    --generated_folder) shift; deg_dir=$1 ; shift ;;
    # Result Dumping Folder
    --dump_folder) shift; dump_dir=$1 ; shift ;;
    # Metrics to Compute
    --metrics) shift; metrics=$1 ; shift ;;

    --) shift ; break ;;
    *) echo "Invalid option: $1"; exit 1 ;;
  esac
done

######## Calculate Objective Metrics ###########
CUDA_VISIBLE_DEVICES=$gpu python "$work_dir"/bins/calc_metrics.py \
    --ref_dir $ref_dir \
    --deg_dir $deg_dir \
    --dump_dir $dump_dir \
    --metrics $metrics \
    --fs
egs/svc/DiffComoSVC/README.md
ADDED
@@ -0,0 +1,234 @@
# Accelerating Diffusion-based Singing Voice Conversion through Consistency Distillation
<br>
<div align="center">
<img src="../../../imgs/svc/DiffComoSVC.png" width="90%">
</div>
<br>

This is an implementation of [Consistency Models](https://arxiv.org/abs/2303.01469) for accelerating diffusion-based singing voice conversion. The overall architecture follows "[Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (NeurIPS 2023 Workshop on Machine Learning for Audio); only a slight modification is applied to the acoustic model. Specifically,

* The acoustic model is a conformer that generates a coarse spectrogram, plus a diffusion decoder based on a Bidirectional Non-Causal Dilated CNN that polishes the coarse spectrogram. This is similar to [CoMoSpeech: One-Step Speech and Singing Voice Synthesis via Consistency Model](https://comospeech.github.io/)
* To accelerate the diffusion model, we apply consistency distillation from [Consistency Models](https://arxiv.org/abs/2303.01469). For the teacher model, the diffusion schedule of the diffusion decoder follows [Karras diffusion](https://arxiv.org/abs/2206.00364). When distilling the teacher model, the condition encoder and the conformer part of the acoustic model are frozen, while the diffusion decoder is updated via exponential moving average. See the figure above for details.

There are five stages in total:

1. Data preparation
2. Features extraction
3. Teacher Model Training
4. Consistency Distillation
5. Inference/conversion

## 1. Data Preparation

### Dataset Download

By default, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).

### Configuration

Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.

```json
    "dataset": [
        "m4singer",
        "opencpop",
        "opensinger",
        "svcc",
        "vctk"
    ],
    "dataset_path": {
        // TODO: Fill in your dataset path
        "m4singer": "[M4Singer dataset path]",
        "opencpop": "[Opencpop dataset path]",
        "opensinger": "[OpenSinger dataset path]",
        "svcc": "[SVCC dataset path]",
        "vctk": "[VCTK dataset path]"
    },
```

## 2. Features Extraction

### Content-based Pretrained Models Download

By default, we utilize Whisper and ContentVec to extract content features. How to download them is detailed [here](../../../pretrained/README.md).

### Configuration

Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:

```json
    // TODO: Fill in the output log path
    "log_dir": "[Your path to save logs and checkpoints]",
    "preprocess": {
        // TODO: Fill in the output data path
        "processed_dir": "[Your path to save processed data]",
        ...
    },
```

### Run

Run `run.sh` as the preprocessing stage (set `--stage 1`).

```bash
cd Amphion
sh egs/svc/DiffComoSVC/run.sh --stage 1
```

Note: The `CUDA_VISIBLE_DEVICES` is set as `"0"` by default. You can change it when running `run.sh` by specifying such as `--gpu "1"`.

## 3. Teacher Model Training

### Configuration

Set `distill` in `config/comosvc.json` to `false` for teacher model training. You can also specify the detailed configuration for the conformer encoder and the diffusion process here:

```JSON
"comosvc":{
    "distill": false,
    // conformer encoder
    "input_dim": 384,
    "output_dim": 100,
    "n_heads": 2,
    "n_layers": 6,
    "filter_channels":512,
    // karras diffusion
    "P_mean": -1.2,
    "P_std": 1.2,
    "sigma_data": 0.5,
    "sigma_min": 0.002,
    "sigma_max": 80,
    "rho": 7,
    "n_timesteps": 40,
},
```

We provide the default hyperparameters in `exp_config.json`. They work on a single 24 GB NVIDIA GPU. You can adjust them based on your GPU machines.

```json
"train": {
    "batch_size": 32,
    ...
    "adamw": {
        "lr": 2.0e-4
    },
    ...
}
```

### Run

Run `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The TensorBoard logs and checkpoints will be saved in `[Your path to save logs and checkpoints]/[YourExptName]`.

```bash
cd Amphion
sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName]
```

Note: The `CUDA_VISIBLE_DEVICES` is set as `"0"` by default. You can specify it when running `run.sh` such as:

```bash
cd Amphion
sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName] --gpu "0,1,2,3"
```

## 4. Consistency Distillation

### Configuration

Set `distill` in `config/comosvc.json` to `true` for consistency distillation, and specify the `teacher_model_path`. You can also specify the detailed configuration for the conformer encoder and the diffusion process here:

```JSON
"model": {
    "teacher_model_path":"[Your_teacher_model_checkpoint].bin",
    ...
    "comosvc":{
        "distill": true,
        // conformer encoder
        "input_dim": 384,
        "output_dim": 100,
        "n_heads": 2,
        "n_layers": 6,
        "filter_channels":512,
        // karras diffusion
        "P_mean": -1.2,
        "P_std": 1.2,
        "sigma_data": 0.5,
        "sigma_min": 0.002,
        "sigma_max": 80,
        "rho": 7,
        "n_timesteps": 40,
    },
```

We provide the default hyperparameters in `exp_config.json`. They work on a single 24 GB NVIDIA GPU. You can adjust them based on your GPU machines.

```json
"train": {
    "batch_size": 32,
    ...
    "adamw": {
        "lr": 2.0e-4
    },
    ...
}
```

### Run

Run `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The TensorBoard logs and checkpoints will be saved in `[Your path to save logs and checkpoints]/[YourExptName]`.

```bash
cd Amphion
sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName]
```

Note: The `CUDA_VISIBLE_DEVICES` is set as `"0"` by default. You can specify it when running `run.sh` such as:

```bash
cd Amphion
sh egs/svc/DiffComoSVC/run.sh --stage 2 --name [YourExptName] --gpu "0,1,2,3"
```

## 5. Inference/Conversion

### Pretrained Vocoder Download

We fine-tune the official BigVGAN pretrained model with over 120 hours of singing voice data. The benefits of fine-tuning have been investigated in our paper (see this [demo page](https://www.zhangxueyao.com/data/MultipleContentsSVC/vocoder.html)). The final pretrained singing voice vocoder is released [here](../../../pretrained/README.md#amphion-singing-bigvgan) (called `Amphion Singing BigVGAN`).

### Run

For inference/conversion, you need to specify the following configurations when running `run.sh`:

| Parameters | Description | Example |
| --- | --- | --- |
| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `[Your path to save logs and checkpoints]/[YourExptName]` |
| `--infer_output_dir` | The output directory to save inferred audios. | `[Your path to save logs and checkpoints]/[YourExptName]/result` |
| `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `[Your path to save processed data]/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). |
| `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `[Your path to save logs and checkpoints]/[YourExptName]/singers.json` to choose a trained speaker. | For opencpop dataset, the speaker name would be `opencpop_female1`. |
| `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |

For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run:

```bash
cd Amphion
sh egs/svc/DiffComoSVC/run.sh --stage 3 --gpu "0" \
    --infer_expt_dir [Your path to save logs and checkpoints]/[YourExptName] \
    --infer_output_dir [Your path to save logs and checkpoints]/[YourExptName]/result \
    --infer_source_audio_dir [Your Audios Folder] \
    --infer_target_speaker "opencpop_female1" \
    --infer_key_shift "autoshift"
```
In particular, you can configure the inference steps for the teacher model by setting `inference` in `exp_config` (the student model always uses one-step sampling):
```json
"inference": {
    "comosvc": {
        "inference_steps": 40
    }
}
```

# Reference
https://github.com/zhenye234/CoMoSpeech

https://github.com/openai/consistency_models
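The recipe above says that, during consistency distillation, the student's diffusion decoder is tracked with an exponential moving average while the condition encoder and conformer stay frozen. A bare-bones sketch of such an EMA target update; the decay value, the stand-in module, and the loop structure are assumptions, not Amphion's exact implementation:

```python
# Sketch: EMA update of a target decoder during consistency distillation,
# with the rest of the network frozen. Illustrative only.
import copy
import torch

@torch.no_grad()
def ema_update(target, online, decay=0.999):
    # target <- decay * target + (1 - decay) * online, parameter by parameter
    for p_t, p_o in zip(target.parameters(), online.parameters()):
        p_t.mul_(decay).add_(p_o, alpha=1.0 - decay)

decoder = torch.nn.Linear(384, 100)                      # stand-in for the diffusion decoder
target_decoder = copy.deepcopy(decoder).requires_grad_(False)

for step in range(3):                                    # inside the distillation loop
    # ... compute the consistency loss with `decoder` and `target_decoder`,
    #     backpropagate through `decoder` only ...
    ema_update(target_decoder, decoder)
```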
egs/svc/DiffComoSVC/exp_config.json
ADDED
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"base_config": "config/comosvc.json",
|
3 |
+
"model_type": "DiffComoSVC",
|
4 |
+
"dataset": [
|
5 |
+
"m4singer",
|
6 |
+
"opencpop",
|
7 |
+
"opensinger",
|
8 |
+
"svcc",
|
9 |
+
"vctk"
|
10 |
+
],
|
11 |
+
"dataset_path": {
|
12 |
+
// TODO: Fill in your dataset path
|
13 |
+
"m4singer": "[M4Singer dataset path]",
|
14 |
+
"opencpop": "[Opencpop dataset path]",
|
15 |
+
"opensinger": "[OpenSinger dataset path]",
|
16 |
+
"svcc": "[SVCC dataset path]",
|
17 |
+
"vctk": "[VCTK dataset path]"
|
18 |
+
},
|
19 |
+
// TODO: Fill in the output log path
|
20 |
+
"log_dir": "[Your path to save logs and checkpoints]",
|
21 |
+
"preprocess": {
|
22 |
+
// TODO: Fill in the output data path
|
23 |
+
"processed_dir": "[Your path to save processed data]",
|
24 |
+
// Config for features extraction
|
25 |
+
"extract_mel": true,
|
26 |
+
"extract_pitch": true,
|
27 |
+
"extract_energy": true,
|
28 |
+
"extract_whisper_feature": true,
|
29 |
+
"extract_contentvec_feature": true,
|
30 |
+
"extract_wenet_feature": false,
|
31 |
+
"whisper_batch_size": 30, // decrease it if your GPU is out of memory
|
32 |
+
"contentvec_batch_size": 1,
|
33 |
+
// Fill in the content-based pretrained model's path
|
34 |
+
"contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
|
35 |
+
"wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
|
36 |
+
"wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
|
37 |
+
"whisper_model": "medium",
|
38 |
+
"whisper_model_path": "pretrained/whisper/medium.pt",
|
39 |
+
// Config for features usage
|
40 |
+
"use_mel": true,
|
41 |
+
"use_min_max_norm_mel": true,
|
42 |
+
"use_frame_pitch": true,
|
43 |
+
"use_frame_energy": true,
|
44 |
+
"use_spkid": true,
|
45 |
+
"use_whisper": true,
|
46 |
+
"use_contentvec": true,
|
47 |
+
"use_wenet": false,
|
48 |
+
"n_mel": 100,
|
49 |
+
"sample_rate": 24000
|
50 |
+
},
|
51 |
+
"model": {
|
52 |
+
"teacher_model_path":"[Your_teacher_model_checkpoint].bin",
|
53 |
+
"condition_encoder": {
|
54 |
+
// Config for features usage
|
55 |
+
"use_whisper": true,
|
56 |
+
"use_contentvec": true,
|
57 |
+
"use_wenet": false,
|
58 |
+
"whisper_dim": 1024,
|
59 |
+
"contentvec_dim": 256,
|
60 |
+
"wenet_dim": 512,
|
61 |
+
"use_singer_encoder": false,
|
62 |
+
"pitch_min": 50,
|
63 |
+
"pitch_max": 1100
|
64 |
+
},
|
65 |
+
"comosvc":{
|
66 |
+
"distill": false,
|
67 |
+
// conformer encoder
|
68 |
+
"input_dim": 384,
|
69 |
+
"output_dim": 100,
|
70 |
+
"n_heads": 2,
|
71 |
+
"n_layers": 6,
|
72 |
+
"filter_channels":512,
|
73 |
+
"dropout":0.1,
|
74 |
+
// karras diffusion
|
75 |
+
"P_mean": -1.2,
|
76 |
+
"P_std": 1.2,
|
77 |
+
"sigma_data": 0.5,
|
78 |
+
"sigma_min": 0.002,
|
79 |
+
"sigma_max": 80,
|
80 |
+
"rho": 7,
|
81 |
+
"n_timesteps": 40,
|
82 |
+
},
|
83 |
+
"diffusion": {
|
84 |
+
// Diffusion steps encoder
|
85 |
+
"step_encoder": {
|
86 |
+
"dim_raw_embedding": 128,
|
87 |
+
"dim_hidden_layer": 512,
|
88 |
+
"activation": "SiLU",
|
89 |
+
"num_layer": 2,
|
90 |
+
"max_period": 10000
|
91 |
+
},
|
92 |
+
// Diffusion decoder
|
93 |
+
"model_type": "bidilconv",
|
94 |
+
// bidilconv, unet2d, TODO: unet1d
|
95 |
+
"bidilconv": {
|
96 |
+
"base_channel": 384,
|
97 |
+
"n_res_block": 20,
|
98 |
+
"conv_kernel_size": 3,
|
99 |
+
"dilation_cycle_length": 4,
|
100 |
+
// specially, 1 means no dilation
|
101 |
+
"conditioner_size": 100
|
102 |
+
}
|
103 |
+
}
|
104 |
+
},
|
105 |
+
"train": {
|
106 |
+
"batch_size": 64,
|
107 |
+
"gradient_accumulation_step": 1,
|
108 |
+
"max_epoch": -1, // -1 means no limit
|
109 |
+
"save_checkpoint_stride": [
|
110 |
+
50,
|
111 |
+
50
|
112 |
+
],
|
113 |
+
"keep_last": [
|
114 |
+
5,
|
115 |
+
-1
|
116 |
+
],
|
117 |
+
"run_eval": [
|
118 |
+
false,
|
119 |
+
true
|
120 |
+
],
|
121 |
+
"adamw": {
|
122 |
+
"lr": 4.0e-4
|
123 |
+
},
|
124 |
+
"reducelronplateau": {
|
125 |
+
"factor": 0.8,
|
126 |
+
"patience": 10,
|
127 |
+
"min_lr": 1.0e-4
|
128 |
+
},
|
129 |
+
"dataloader": {
|
130 |
+
"num_worker": 8,
|
131 |
+
"pin_memory": true
|
132 |
+
},
|
133 |
+
"sampler": {
|
134 |
+
"holistic_shuffle": false,
|
135 |
+
"drop_last": true
|
136 |
+
}
|
137 |
+
},
|
138 |
+
"inference": {
|
139 |
+
"comosvc": {
|
140 |
+
"inference_steps": 40
|
141 |
+
}
|
142 |
+
}
|
143 |
+
}
|
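The `comosvc` block above ends with a set of Karras-style diffusion constants (`sigma_min`, `sigma_max`, `rho`, `n_timesteps`). As a quick illustration (not Amphion's code), this is how such fields are typically expanded into a discrete noise schedule following Karras et al. (2022):

```python
# Sketch of the rho-warped sigma schedule implied by the config values above.
import numpy as np

def karras_sigmas(n_timesteps=40, sigma_min=0.002, sigma_max=80.0, rho=7.0):
    """Return n_timesteps noise levels from sigma_max down to sigma_min."""
    ramp = np.linspace(0.0, 1.0, n_timesteps)
    min_inv_rho = sigma_min ** (1.0 / rho)
    max_inv_rho = sigma_max ** (1.0 / rho)
    return (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho

sigmas = karras_sigmas()
print(sigmas[0], sigmas[-1])  # ~80.0 ... ~0.002
```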
egs/svc/DiffComoSVC/run.sh
ADDED
@@ -0,0 +1,150 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

######## Build Experiment Environment ###########
exp_dir=$(cd `dirname $0`; pwd)
work_dir=$(dirname $(dirname $(dirname $exp_dir)))

export WORK_DIR=$work_dir
export PYTHONPATH=$work_dir
export PYTHONIOENCODING=UTF-8

######## Parse the Given Parameters from the Command ###########
options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,resume_from_ckpt_path:,resume_type:,infer_expt_dir:,infer_output_dir:,infer_source_file:,infer_source_audio_dir:,infer_target_speaker:,infer_key_shift:,infer_vocoder_dir: -- "$@")
eval set -- "$options"

while true; do
  case $1 in
    # Experimental Configuration File
    -c | --config) shift; exp_config=$1 ; shift ;;
    # Experimental Name
    -n | --name) shift; exp_name=$1 ; shift ;;
    # Running Stage
    -s | --stage) shift; running_stage=$1 ; shift ;;
    # Visible GPU machines. The default value is "0".
    --gpu) shift; gpu=$1 ; shift ;;

    # [Only for Training] Resume configuration
    --resume) shift; resume=$1 ; shift ;;
    # [Only for Training] The specific checkpoint path that you want to resume from.
    --resume_from_ckpt_path) shift; resume_from_ckpt_path=$1 ; shift ;;
    # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
    --resume_type) shift; resume_type=$1 ; shift ;;

    # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
    --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
    # [Only for Inference] The output dir to save inferred audios. Its default value is "$infer_expt_dir/result"
    --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
    # [Only for Inference] The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir can be "$work_dir/source_audio" which includes several audio files (*.wav, *.mp3 or *.flac).
    --infer_source_file) shift; infer_source_file=$1 ; shift ;;
    --infer_source_audio_dir) shift; infer_source_audio_dir=$1 ; shift ;;
    # [Only for Inference] Specify the target speaker you want to convert into. You can refer to "[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for the opencpop dataset, the speaker name would be "opencpop_female1".
    --infer_target_speaker) shift; infer_target_speaker=$1 ; shift ;;
    # [Only for Inference] For advanced users, you can set the trans_key parameter to an integer (the number of semitones you want to transpose). Its default value is "autoshift".
    --infer_key_shift) shift; infer_key_shift=$1 ; shift ;;
    # [Only for Inference] The vocoder dir. Its default value is Amphion/pretrained/bigvgan. See Amphion/pretrained/README.md to download the pretrained BigVGAN vocoders.
    --infer_vocoder_dir) shift; infer_vocoder_dir=$1 ; shift ;;

    --) shift ; break ;;
    *) echo "Invalid option: $1"; exit 1 ;;
  esac
done


### Value check ###
if [ -z "$running_stage" ]; then
  echo "[Error] Please specify the running stage"
  exit 1
fi

if [ -z "$exp_config" ]; then
  exp_config="${exp_dir}"/exp_config.json
fi
echo "Experimental Configuration File: $exp_config"

if [ -z "$gpu" ]; then
  gpu="0"
fi

######## Features Extraction ###########
if [ $running_stage -eq 1 ]; then
  CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/svc/preprocess.py \
    --config $exp_config \
    --num_workers 4
fi

######## Training ###########
if [ $running_stage -eq 2 ]; then
  if [ -z "$exp_name" ]; then
    echo "[Error] Please specify the experiment name"
    exit 1
  fi
  echo "Experimental Name: $exp_name"

  if [ "$resume" = true ]; then
    echo "Automatically resume from the experimental dir..."
    CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/svc/train.py \
      --config "$exp_config" \
      --exp_name "$exp_name" \
      --log_level info \
      --resume
  else
    CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/svc/train.py \
      --config "$exp_config" \
      --exp_name "$exp_name" \
      --log_level info \
      --resume_from_ckpt_path "$resume_from_ckpt_path" \
      --resume_type "$resume_type"
  fi
fi

######## Inference/Conversion ###########
if [ $running_stage -eq 3 ]; then
  if [ -z "$infer_expt_dir" ]; then
    echo "[Error] Please specify the experimental directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
    exit 1
  fi

  if [ -z "$infer_output_dir" ]; then
    infer_output_dir="$infer_expt_dir/result"
  fi

  if [ -z "$infer_source_file" ] && [ -z "$infer_source_audio_dir" ]; then
    echo "[Error] Please specify the source file/dir. The inference source can be a json file or a dir. For example, the source_file can be '[Your path to save processed data]/[YourDataset]/test.json', and the source_audio_dir should include several audio files (*.wav, *.mp3 or *.flac)."
    exit 1
  fi

  if [ -z "$infer_source_file" ]; then
    infer_source=$infer_source_audio_dir
  fi

  if [ -z "$infer_source_audio_dir" ]; then
    infer_source=$infer_source_file
  fi

  if [ -z "$infer_target_speaker" ]; then
    echo "[Error] Please specify the target speaker. You can refer to '[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json'. In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for the opencpop dataset, the speaker name would be 'opencpop_female1'."
    exit 1
  fi

  if [ -z "$infer_key_shift" ]; then
    infer_key_shift="autoshift"
  fi

  if [ -z "$infer_vocoder_dir" ]; then
    infer_vocoder_dir="$work_dir"/pretrained/bigvgan
    echo "[Warning] You did not specify infer_vocoder_dir. It is set to $infer_vocoder_dir by default. Make sure that you have followed Amphion/pretrained/README.md to download the pretrained BigVGAN vocoder checkpoint."
  fi

  CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/svc/inference.py \
    --config $exp_config \
    --acoustics_dir $infer_expt_dir \
    --vocoder_dir $infer_vocoder_dir \
    --target_singer $infer_target_speaker \
    --trans_key $infer_key_shift \
    --source $infer_source \
    --output_dir $infer_output_dir \
    --log_level debug
fi
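The driver script above only dispatches three stages to the Python entry points under `bins/svc/`. A hypothetical Python equivalent of that dispatch, shown purely to make the control flow explicit (paths and flags are copied from the script; the function itself is not an Amphion API):

```python
# Minimal sketch of the stage dispatch performed by run.sh.
import os
import subprocess

def run_stage(stage: int, exp_config: str, exp_name: str = "", gpu: str = "0") -> None:
    """Dispatch one of the three recipe stages, mirroring run.sh."""
    env = dict(os.environ, CUDA_VISIBLE_DEVICES=gpu)
    if stage == 1:    # features extraction
        cmd = ["python", "bins/svc/preprocess.py", "--config", exp_config, "--num_workers", "4"]
    elif stage == 2:  # training
        cmd = ["accelerate", "launch", "bins/svc/train.py",
               "--config", exp_config, "--exp_name", exp_name, "--log_level", "info"]
    elif stage == 3:  # inference/conversion
        cmd = ["accelerate", "launch", "bins/svc/inference.py", "--config", exp_config]
    else:
        raise ValueError(f"Unknown stage: {stage}")
    subprocess.run(cmd, env=env, check=True)
```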
egs/svc/MultipleContentsSVC/README.md
ADDED
@@ -0,0 +1,153 @@
# Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion

[![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2310.11160)
[![demo](https://img.shields.io/badge/SVC-Demo-red)](https://www.zhangxueyao.com/data/MultipleContentsSVC/index.html)

<br>
<div align="center">
<img src="../../../imgs/svc/MultipleContentsSVC.png" width="85%">
</div>
<br>

This is the official implementation of the paper "[Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (NeurIPS 2023 Workshop on Machine Learning for Audio). Specifically,

- The multiple content features are from [Whisper](https://github.com/openai/whisper) and [ContentVec](https://github.com/auspicious3000/contentvec).
- The acoustic model is based on a Bidirectional Non-Causal Dilated CNN (called `DiffWaveNetSVC` in Amphion), which is similar to [WaveNet](https://arxiv.org/pdf/1609.03499.pdf), [DiffWave](https://openreview.net/forum?id=a-xFK8Ymz5J), and [DiffSVC](https://ieeexplore.ieee.org/document/9688219).
- The vocoder uses the [BigVGAN](https://github.com/NVIDIA/BigVGAN) architecture, and we fine-tuned it on over 120 hours of singing voice data.

There are four stages in total:

1. Data preparation
2. Features extraction
3. Training
4. Inference/conversion

> **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
> ```bash
> cd Amphion
> ```

## 1. Data Preparation

### Dataset Download

By default, we utilize five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).

### Configuration

Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.

```json
    "dataset": [
        "m4singer",
        "opencpop",
        "opensinger",
        "svcc",
        "vctk"
    ],
    "dataset_path": {
        // TODO: Fill in your dataset path
        "m4singer": "[M4Singer dataset path]",
        "opencpop": "[Opencpop dataset path]",
        "opensinger": "[OpenSinger dataset path]",
        "svcc": "[SVCC dataset path]",
        "vctk": "[VCTK dataset path]"
    },
```

## 2. Features Extraction

### Content-based Pretrained Models Download

By default, we utilize Whisper and ContentVec to extract content features. How to download them is detailed [here](../../../pretrained/README.md).

### Configuration

Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:

```json
    // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
    "log_dir": "ckpts/svc",
    "preprocess": {
        // TODO: Fill in the output data path. The default value is "Amphion/data"
        "processed_dir": "data",
        ...
    },
```

### Run

Run `run.sh` as the preprocessing stage (set `--stage 1`):

```bash
sh egs/svc/MultipleContentsSVC/run.sh --stage 1
```

> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "1"`.

## 3. Training

### Configuration

We provide the default hyperparameters in `exp_config.json`. They can work on a single NVIDIA 24GB GPU. You can adjust them based on your GPU machines.

```json
    "train": {
        "batch_size": 32,
        ...
        "adamw": {
            "lr": 2.0e-4
        },
        ...
    }
```

### Run

Run `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`.

```bash
sh egs/svc/MultipleContentsSVC/run.sh --stage 2 --name [YourExptName]
```

> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "0,1,2,3"`.

## 4. Inference/Conversion

### Pretrained Vocoder Download

We fine-tune the official BigVGAN pretrained model with over 120 hours of singing voice data. The benefits of fine-tuning have been investigated in our paper (see this [demo page](https://www.zhangxueyao.com/data/MultipleContentsSVC/vocoder.html)). The final pretrained singing voice vocoder is released [here](../../../pretrained/README.md#amphion-singing-bigvgan) (called `Amphion Singing BigVGAN`).

### Run

For inference/conversion, you need to specify the following configurations when running `run.sh`:

| Parameters | Description | Example |
| --- | --- | --- |
| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `Amphion/ckpts/svc/[YourExptName]` |
| `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/svc/[YourExptName]/result` |
| `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `Amphion/data/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). |
| `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `Amphion/ckpts/svc/[YourExptName]/singers.json` to choose a trained speaker. | For the opencpop dataset, the speaker name would be `opencpop_female1`. |
| `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |

For example, if you want to make `opencpop_female1` sing the songs in `[Your Audios Folder]`, just run:

```bash
sh egs/svc/MultipleContentsSVC/run.sh --stage 3 --gpu "0" \
    --infer_expt_dir Amphion/ckpts/svc/[YourExptName] \
    --infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \
    --infer_source_audio_dir [Your Audios Folder] \
    --infer_target_speaker "opencpop_female1" \
    --infer_key_shift "autoshift"
```

## Citations

```bibtex
@article{zhang2023leveraging,
  title={Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion},
  author={Zhang, Xueyao and Gu, Yicheng and Chen, Haopeng and Fang, Zihao and Zou, Lexiao and Xue, Liumeng and Wu, Zhizheng},
  journal={Machine Learning for Audio Workshop, NeurIPS 2023},
  year={2023}
}
```
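The recipe's core idea is to feed several content features (Whisper, 1024-dim; ContentVec, 256-dim, per `exp_config.json`) into one condition encoder. The sketch below only illustrates a "project then add" style of fusion; the shared size of 384 and the fusion rule are assumptions for illustration, not Amphion's actual `condition_encoder`:

```python
# Illustrative fusion of two content-feature streams into one conditioning sequence.
import torch
import torch.nn as nn

class ContentFusion(nn.Module):
    def __init__(self, whisper_dim=1024, contentvec_dim=256, hidden=384):
        super().__init__()
        self.whisper_proj = nn.Linear(whisper_dim, hidden)
        self.contentvec_proj = nn.Linear(contentvec_dim, hidden)

    def forward(self, whisper_feat, contentvec_feat):
        # Both inputs: (batch, frames, dim); output: one (batch, frames, hidden) condition.
        return self.whisper_proj(whisper_feat) + self.contentvec_proj(contentvec_feat)

fusion = ContentFusion()
cond = fusion(torch.randn(1, 100, 1024), torch.randn(1, 100, 256))
print(cond.shape)  # torch.Size([1, 100, 384])
```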
egs/svc/MultipleContentsSVC/exp_config.json
ADDED
@@ -0,0 +1,126 @@
{
    "base_config": "config/diffusion.json",
    "model_type": "DiffWaveNetSVC",
    "dataset": [
        "m4singer",
        "opencpop",
        "opensinger",
        "svcc",
        "vctk"
    ],
    "dataset_path": {
        // TODO: Fill in your dataset path
        "m4singer": "[M4Singer dataset path]",
        "opencpop": "[Opencpop dataset path]",
        "opensinger": "[OpenSinger dataset path]",
        "svcc": "[SVCC dataset path]",
        "vctk": "[VCTK dataset path]"
    },
    // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
    "log_dir": "ckpts/svc",
    "preprocess": {
        // TODO: Fill in the output data path. The default value is "Amphion/data"
        "processed_dir": "data",
        // Config for features extraction
        "extract_mel": true,
        "extract_pitch": true,
        "extract_energy": true,
        "extract_whisper_feature": true,
        "extract_contentvec_feature": true,
        "extract_wenet_feature": false,
        "whisper_batch_size": 30, // decrease it if your GPU is out of memory
        "contentvec_batch_size": 1,
        // Fill in the content-based pretrained model's path
        "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
        "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
        "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
        "whisper_model": "medium",
        "whisper_model_path": "pretrained/whisper/medium.pt",
        // Config for features usage
        "use_mel": true,
        "use_min_max_norm_mel": true,
        "use_frame_pitch": true,
        "use_frame_energy": true,
        "use_spkid": true,
        "use_whisper": true,
        "use_contentvec": true,
        "use_wenet": false,
        "n_mel": 100,
        "sample_rate": 24000
    },
    "model": {
        "condition_encoder": {
            // Config for features usage
            "use_whisper": true,
            "use_contentvec": true,
            "use_wenet": false,
            "whisper_dim": 1024,
            "contentvec_dim": 256,
            "wenet_dim": 512,
            "use_singer_encoder": false,
            "pitch_min": 50,
            "pitch_max": 1100
        },
        "diffusion": {
            "scheduler": "ddpm",
            "scheduler_settings": {
                "num_train_timesteps": 1000,
                "beta_start": 1.0e-4,
                "beta_end": 0.02,
                "beta_schedule": "linear"
            },
            // Diffusion steps encoder
            "step_encoder": {
                "dim_raw_embedding": 128,
                "dim_hidden_layer": 512,
                "activation": "SiLU",
                "num_layer": 2,
                "max_period": 10000
            },
            // Diffusion decoder
            "model_type": "bidilconv",
            // bidilconv, unet2d, TODO: unet1d
            "bidilconv": {
                "base_channel": 512,
                "n_res_block": 40,
                "conv_kernel_size": 3,
                "dilation_cycle_length": 4,
                // specifically, 1 means no dilation
                "conditioner_size": 384
            }
        }
    },
    "train": {
        "batch_size": 32,
        "gradient_accumulation_step": 1,
        "max_epoch": -1, // -1 means no limit
        "save_checkpoint_stride": [
            3,
            50
        ],
        "keep_last": [
            3,
            2
        ],
        "run_eval": [
            true,
            true
        ],
        "adamw": {
            "lr": 2.0e-4
        },
        "reducelronplateau": {
            "factor": 0.8,
            "patience": 30,
            "min_lr": 1.0e-4
        },
        "dataloader": {
            "num_worker": 8,
            "pin_memory": true
        },
        "sampler": {
            "holistic_shuffle": false,
            "drop_last": true
        }
    }
}
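The `scheduler_settings` above describe a standard DDPM linear beta schedule. A small NumPy sketch (not the trainer code) of what those numbers imply for the forward noising process:

```python
# Linear beta ramp and cumulative alpha products from the config values above.
import numpy as np

num_train_timesteps, beta_start, beta_end = 1000, 1.0e-4, 0.02
betas = np.linspace(beta_start, beta_end, num_train_timesteps)
alphas_cumprod = np.cumprod(1.0 - betas)

# Forward process: x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * noise
t = 500
print(np.sqrt(alphas_cumprod[t]), np.sqrt(1.0 - alphas_cumprod[t]))
```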
egs/svc/MultipleContentsSVC/run.sh
ADDED
@@ -0,0 +1,150 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

######## Build Experiment Environment ###########
exp_dir=$(cd `dirname $0`; pwd)
work_dir=$(dirname $(dirname $(dirname $exp_dir)))

export WORK_DIR=$work_dir
export PYTHONPATH=$work_dir
export PYTHONIOENCODING=UTF-8

######## Parse the Given Parameters from the Command ###########
options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,resume_from_ckpt_path:,resume_type:,infer_expt_dir:,infer_output_dir:,infer_source_file:,infer_source_audio_dir:,infer_target_speaker:,infer_key_shift:,infer_vocoder_dir: -- "$@")
eval set -- "$options"

while true; do
  case $1 in
    # Experimental Configuration File
    -c | --config) shift; exp_config=$1 ; shift ;;
    # Experimental Name
    -n | --name) shift; exp_name=$1 ; shift ;;
    # Running Stage
    -s | --stage) shift; running_stage=$1 ; shift ;;
    # Visible GPU machines. The default value is "0".
    --gpu) shift; gpu=$1 ; shift ;;

    # [Only for Training] Resume configuration
    --resume) shift; resume=$1 ; shift ;;
    # [Only for Training] The specific checkpoint path that you want to resume from.
    --resume_from_ckpt_path) shift; resume_from_ckpt_path=$1 ; shift ;;
    # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
    --resume_type) shift; resume_type=$1 ; shift ;;

    # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
    --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
    # [Only for Inference] The output dir to save inferred audios. Its default value is "$infer_expt_dir/result"
    --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
    # [Only for Inference] The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir can be "$work_dir/source_audio" which includes several audio files (*.wav, *.mp3 or *.flac).
    --infer_source_file) shift; infer_source_file=$1 ; shift ;;
    --infer_source_audio_dir) shift; infer_source_audio_dir=$1 ; shift ;;
    # [Only for Inference] Specify the target speaker you want to convert into. You can refer to "[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for the opencpop dataset, the speaker name would be "opencpop_female1".
    --infer_target_speaker) shift; infer_target_speaker=$1 ; shift ;;
    # [Only for Inference] For advanced users, you can set the trans_key parameter to an integer (the number of semitones you want to transpose). Its default value is "autoshift".
    --infer_key_shift) shift; infer_key_shift=$1 ; shift ;;
    # [Only for Inference] The vocoder dir. Its default value is Amphion/pretrained/bigvgan. See Amphion/pretrained/README.md to download the pretrained BigVGAN vocoders.
    --infer_vocoder_dir) shift; infer_vocoder_dir=$1 ; shift ;;

    --) shift ; break ;;
    *) echo "Invalid option: $1"; exit 1 ;;
  esac
done


### Value check ###
if [ -z "$running_stage" ]; then
  echo "[Error] Please specify the running stage"
  exit 1
fi

if [ -z "$exp_config" ]; then
  exp_config="${exp_dir}"/exp_config.json
fi
echo "Experimental Configuration File: $exp_config"

if [ -z "$gpu" ]; then
  gpu="0"
fi

######## Features Extraction ###########
if [ $running_stage -eq 1 ]; then
  CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/svc/preprocess.py \
    --config $exp_config \
    --num_workers 4
fi

######## Training ###########
if [ $running_stage -eq 2 ]; then
  if [ -z "$exp_name" ]; then
    echo "[Error] Please specify the experiment name"
    exit 1
  fi
  echo "Experimental Name: $exp_name"

  if [ "$resume" = true ]; then
    echo "Automatically resume from the experimental dir..."
    CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/svc/train.py \
      --config "$exp_config" \
      --exp_name "$exp_name" \
      --log_level info \
      --resume
  else
    CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/svc/train.py \
      --config "$exp_config" \
      --exp_name "$exp_name" \
      --log_level info \
      --resume_from_ckpt_path "$resume_from_ckpt_path" \
      --resume_type "$resume_type"
  fi
fi

######## Inference/Conversion ###########
if [ $running_stage -eq 3 ]; then
  if [ -z "$infer_expt_dir" ]; then
    echo "[Error] Please specify the experimental directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
    exit 1
  fi

  if [ -z "$infer_output_dir" ]; then
    infer_output_dir="$infer_expt_dir/result"
  fi

  if [ -z "$infer_source_file" ] && [ -z "$infer_source_audio_dir" ]; then
    echo "[Error] Please specify the source file/dir. The inference source can be a json file or a dir. For example, the source_file can be '[Your path to save processed data]/[YourDataset]/test.json', and the source_audio_dir should include several audio files (*.wav, *.mp3 or *.flac)."
    exit 1
  fi

  if [ -z "$infer_source_file" ]; then
    infer_source=$infer_source_audio_dir
  fi

  if [ -z "$infer_source_audio_dir" ]; then
    infer_source=$infer_source_file
  fi

  if [ -z "$infer_target_speaker" ]; then
    echo "[Error] Please specify the target speaker. You can refer to '[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json'. In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for the opencpop dataset, the speaker name would be 'opencpop_female1'."
    exit 1
  fi

  if [ -z "$infer_key_shift" ]; then
    infer_key_shift="autoshift"
  fi

  if [ -z "$infer_vocoder_dir" ]; then
    infer_vocoder_dir="$work_dir"/pretrained/bigvgan
    echo "[Warning] You did not specify infer_vocoder_dir. It is set to $infer_vocoder_dir by default. Make sure that you have followed Amphion/pretrained/README.md to download the pretrained BigVGAN vocoder checkpoint."
  fi

  CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/svc/inference.py \
    --config $exp_config \
    --acoustics_dir $infer_expt_dir \
    --vocoder_dir $infer_vocoder_dir \
    --target_singer $infer_target_speaker \
    --trans_key $infer_key_shift \
    --source $infer_source \
    --output_dir $infer_output_dir \
    --log_level debug
fi
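`--infer_key_shift` accepts either `"autoshift"` or an integer number of semitones. Transposing by n semitones scales F0 by 2^(n/12); a tiny sketch:

```python
# Semitone transposition of a fundamental frequency.
def transpose_f0(f0_hz: float, semitones: int) -> float:
    """Transpose a fundamental frequency by a signed number of semitones."""
    return f0_hz * (2.0 ** (semitones / 12.0))

print(transpose_f0(220.0, 3))   # ~261.6 Hz (up a minor third)
print(transpose_f0(220.0, -3))  # ~185.0 Hz
```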
egs/svc/README.md
ADDED
@@ -0,0 +1,34 @@
# Amphion Singing Voice Conversion (SVC) Recipe

## Quick Start

We provide a **[beginner recipe](MultipleContentsSVC)** to demonstrate how to train a cutting-edge SVC model. Specifically, it is also an official implementation of the paper "[Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (NeurIPS 2023 Workshop on Machine Learning for Audio). Some demos can be seen [here](https://www.zhangxueyao.com/data/MultipleContentsSVC/index.html).

## Supported Model Architectures

The main idea of SVC is to first disentangle the speaker-agnostic representations from the source audio, and then inject the desired speaker information to synthesize the target, which usually utilizes an acoustic decoder and a subsequent waveform synthesizer (vocoder):

<br>
<div align="center">
<img src="../../imgs/svc/pipeline.png" width="70%">
</div>
<br>

Until now, Amphion SVC has supported the following features and models:

- **Speaker-agnostic Representations**:
  - Content Features: Sourced from [WeNet](https://github.com/wenet-e2e/wenet), [Whisper](https://github.com/openai/whisper), and [ContentVec](https://github.com/auspicious3000/contentvec).
  - Prosody Features: F0 and energy.
- **Speaker Embeddings**:
  - Speaker Look-Up Table.
  - Reference Encoder (👨💻 developing): It can be used for zero-shot SVC.
- **Acoustic Decoders**:
  - Diffusion-based models:
    - **[DiffWaveNetSVC](MultipleContentsSVC)**: The encoder is based on a Bidirectional Non-Causal Dilated CNN, which is similar to [WaveNet](https://arxiv.org/pdf/1609.03499.pdf), [DiffWave](https://openreview.net/forum?id=a-xFK8Ymz5J), and [DiffSVC](https://ieeexplore.ieee.org/document/9688219).
    - **[DiffComoSVC](DiffComoSVC)** (👨💻 developing): The diffusion framework is based on the [Consistency Model](https://proceedings.mlr.press/v202/song23a.html). It can significantly accelerate the inference process of the diffusion model.
  - Transformer-based models:
    - **[TransformerSVC](TransformerSVC)**: Encoder-only and Non-autoregressive Transformer Architecture.
  - VAE- and Flow-based models:
    - **[VitsSVC]()** (👨💻 developing): It is designed as a [VITS](https://arxiv.org/abs/2106.06103)-like model whose textual input is replaced by the content features, which is similar to [so-vits-svc](https://github.com/svc-develop-team/so-vits-svc).
- **Waveform Synthesizers (Vocoders)**:
  - The supported vocoders can be seen in the [Amphion Vocoder Recipe](../vocoder/README.md).
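The pipeline described above can be summarized as a few function calls. The sketch below is schematic only; every callable is a placeholder passed in by the caller, not an Amphion API:

```python
# Schematic SVC conversion: speaker-agnostic analysis -> speaker injection -> decoding -> vocoding.
def convert(source_wav, target_speaker, extract_content, extract_prosody,
            speaker_table, acoustic_decoder, vocoder):
    content = extract_content(source_wav)                  # e.g. Whisper / ContentVec / WeNet features
    f0, energy = extract_prosody(source_wav)               # frame-level F0 and energy
    spk_emb = speaker_table[target_speaker]                # speaker look-up table
    mel = acoustic_decoder(content, f0, energy, spk_emb)   # e.g. DiffWaveNetSVC
    return vocoder(mel)                                    # e.g. BigVGAN
```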
egs/svc/TransformerSVC/README.md
ADDED
@@ -0,0 +1,164 @@
# Transformer for Singing Voice Conversion

This is an implementation of a **vanilla transformer encoder**/**conformer** as the acoustic model for singing voice conversion.

There are four stages in total:

1. Data preparation
2. Features extraction
3. Training
4. Inference/conversion

> **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
> ```bash
> cd Amphion
> ```

## 1. Data Preparation

### Dataset Download

By default, we utilize five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).

### Configuration

Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.

```json
    "dataset": [
        "m4singer",
        "opencpop",
        "opensinger",
        "svcc",
        "vctk"
    ],
    "dataset_path": {
        // TODO: Fill in your dataset path
        "m4singer": "[M4Singer dataset path]",
        "opencpop": "[Opencpop dataset path]",
        "opensinger": "[OpenSinger dataset path]",
        "svcc": "[SVCC dataset path]",
        "vctk": "[VCTK dataset path]"
    },
```

## 2. Features Extraction

### Content-based Pretrained Models Download

By default, we utilize Whisper and ContentVec to extract content features. How to download them is detailed [here](../../../pretrained/README.md).

### Configuration

Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:

```json
    // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
    "log_dir": "ckpts/svc",
    "preprocess": {
        // TODO: Fill in the output data path. The default value is "Amphion/data"
        "processed_dir": "data",
        ...
    },
```

### Run

Run `run.sh` as the preprocessing stage (set `--stage 1`):

```bash
sh egs/svc/TransformerSVC/run.sh --stage 1
```

> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "1"`.

## 3. Training

### Configuration

Specify the detailed configuration for the transformer block in `exp_config.json`. For the key `type`, `conformer` and `transformer` are supported:

```json
    "model": {
        ...
        "transformer": {
            // 'conformer' or 'transformer'
            "type": "conformer",
            "input_dim": 384,
            "output_dim": 100,
            "n_heads": 2,
            "n_layers": 6,
            "filter_channels": 512,
            "dropout": 0.1
        }
    }
```

We provide the default hyperparameters in `exp_config.json`. They can work on a single NVIDIA 24GB GPU. You can adjust them based on your GPU machines.

```json
    "train": {
        "batch_size": 32,
        ...
        "adamw": {
            "lr": 2.0e-4
        },
        ...
    }
```

### Run

Run `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`.

```bash
sh egs/svc/TransformerSVC/run.sh --stage 2 --name [YourExptName]
```

> **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, for example, `--gpu "0,1,2,3"`.

## 4. Inference/Conversion

### Pretrained Vocoder Download

We fine-tune the official BigVGAN pretrained model with over 120 hours of singing voice data. The benefits of fine-tuning have been investigated in our paper (see this [demo page](https://www.zhangxueyao.com/data/MultipleContentsSVC/vocoder.html)). The final pretrained singing voice vocoder is released [here](../../../pretrained/README.md#amphion-singing-bigvgan) (called `Amphion Singing BigVGAN`).

### Run

For inference/conversion, you need to specify the following configurations when running `run.sh`:

| Parameters | Description | Example |
| --- | --- | --- |
| `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `Amphion/ckpts/svc/[YourExptName]` |
| `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/svc/[YourExptName]/result` |
| `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `Amphion/data/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). |
| `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `Amphion/ckpts/svc/[YourExptName]/singers.json` to choose a trained speaker. | For the opencpop dataset, the speaker name would be `opencpop_female1`. |
| `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |

For example, if you want to make `opencpop_female1` sing the songs in `[Your Audios Folder]`, just run:

```bash
cd Amphion
sh egs/svc/TransformerSVC/run.sh --stage 3 --gpu "0" \
    --infer_expt_dir Amphion/ckpts/svc/[YourExptName] \
    --infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \
    --infer_source_audio_dir [Your Audios Folder] \
    --infer_target_speaker "opencpop_female1" \
    --infer_key_shift "autoshift"
```

## Citations

```bibtex
@inproceedings{transformer,
  author    = {Ashish Vaswani and
               Noam Shazeer and
               Niki Parmar and
               Jakob Uszkoreit and
               Llion Jones and
               Aidan N. Gomez and
               Lukasz Kaiser and
               Illia Polosukhin},
  title     = {Attention is All you Need},
  booktitle = {{NIPS}},
  pages     = {5998--6008},
  year      = {2017}
}
```
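For the `"type": "transformer"` setting, the hyperparameters above (384-dim input, 100-dim mel output, 2 heads, 6 layers, 512 filter channels) roughly describe an encoder-only, non-autoregressive stack. A minimal PyTorch sketch under those assumptions (the conformer variant and Amphion's exact layers differ; this is only illustrative):

```python
# Toy encoder-only mapping from conditioning frames to mel frames.
import torch
import torch.nn as nn

class TinyTransformerSVC(nn.Module):
    def __init__(self, input_dim=384, output_dim=100, n_heads=2, n_layers=6,
                 filter_channels=512, dropout=0.1):
        super().__init__()
        layer = nn.TransformerEncoderLayer(
            d_model=input_dim, nhead=n_heads, dim_feedforward=filter_channels,
            dropout=dropout, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers=n_layers)
        self.out = nn.Linear(input_dim, output_dim)

    def forward(self, x):                  # x: (batch, frames, 384)
        return self.out(self.encoder(x))   # (batch, frames, 100) mel frames

model = TinyTransformerSVC()
print(model(torch.randn(2, 50, 384)).shape)  # torch.Size([2, 50, 100])
```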
egs/svc/TransformerSVC/exp_config.json
ADDED
@@ -0,0 +1,108 @@
{
    "base_config": "config/transformer.json",
    "model_type": "TransformerSVC",
    "dataset": [
        "m4singer",
        "opencpop",
        "opensinger",
        "svcc",
        "vctk"
    ],
    "dataset_path": {
        // TODO: Fill in your dataset path
        "m4singer": "[M4Singer dataset path]",
        "opencpop": "[Opencpop dataset path]",
        "opensinger": "[OpenSinger dataset path]",
        "svcc": "[SVCC dataset path]",
        "vctk": "[VCTK dataset path]"
    },
    // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
    "log_dir": "ckpts/svc",
    "preprocess": {
        // TODO: Fill in the output data path. The default value is "Amphion/data"
        "processed_dir": "data",
        // Config for features extraction
        "extract_mel": true,
        "extract_pitch": true,
        "extract_energy": true,
        "extract_whisper_feature": true,
        "extract_contentvec_feature": true,
        "extract_wenet_feature": false,
        "whisper_batch_size": 30, // decrease it if your GPU is out of memory
        "contentvec_batch_size": 1,
        // Fill in the content-based pretrained model's path
        "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
        "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
        "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
        "whisper_model": "medium",
        "whisper_model_path": "pretrained/whisper/medium.pt",
        // Config for features usage
        "use_mel": true,
        "use_min_max_norm_mel": true,
        "use_frame_pitch": true,
        "use_frame_energy": true,
        "use_spkid": true,
        "use_whisper": true,
        "use_contentvec": true,
        "use_wenet": false,
        "n_mel": 100,
        "sample_rate": 24000
    },
    "model": {
        "condition_encoder": {
            // Config for features usage
            "use_whisper": true,
            "use_contentvec": true,
            "use_wenet": false,
            "whisper_dim": 1024,
            "contentvec_dim": 256,
            "wenet_dim": 512,
            "use_singer_encoder": false,
            "pitch_min": 50,
            "pitch_max": 1100
        },
        "transformer": {
            // 'conformer' or 'transformer'
            "type": "conformer",
            "input_dim": 384,
            "output_dim": 100,
            "n_heads": 2,
            "n_layers": 6,
            "filter_channels": 512,
            "dropout": 0.1
        }
    },
    "train": {
        "batch_size": 64,
        "gradient_accumulation_step": 1,
        "max_epoch": -1, // -1 means no limit
        "save_checkpoint_stride": [
            50,
            50
        ],
        "keep_last": [
            5,
            -1
        ],
        "run_eval": [
            false,
            true
        ],
        "adamw": {
            "lr": 4.0e-4
        },
        "reducelronplateau": {
            "factor": 0.8,
            "patience": 10,
            "min_lr": 1.0e-4
        },
        "dataloader": {
            "num_worker": 8,
            "pin_memory": true
        },
        "sampler": {
            "holistic_shuffle": false,
            "drop_last": true
        }
    }
}
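One way to read the paired `save_checkpoint_stride` / `keep_last` / `run_eval` lists above is as two independent checkpointing rules, one per index. The sketch below encodes that reading; the exact semantics in Amphion's trainer may differ, so treat this as an assumption:

```python
# Assumed interpretation of the paired checkpointing lists (not Amphion's trainer code).
def checkpoint_plan(epoch, strides=(50, 50), keep_last=(5, -1), run_eval=(False, True)):
    """Return the rules that fire at a given epoch; keep_last = -1 means keep all."""
    actions = []
    for stride, keep, evaluate in zip(strides, keep_last, run_eval):
        if epoch > 0 and epoch % stride == 0:
            actions.append({"save": True, "keep_last": keep, "run_eval": evaluate})
    return actions

print(checkpoint_plan(100))  # both rules fire at epoch 100
print(checkpoint_plan(101))  # neither fires
```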
egs/svc/TransformerSVC/run.sh
ADDED
@@ -0,0 +1,150 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

######## Build Experiment Environment ###########
exp_dir=$(cd `dirname $0`; pwd)
work_dir=$(dirname $(dirname $(dirname $exp_dir)))

export WORK_DIR=$work_dir
export PYTHONPATH=$work_dir
export PYTHONIOENCODING=UTF-8

######## Parse the Given Parameters from the Command ###########
options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,resume_from_ckpt_path:,resume_type:,infer_expt_dir:,infer_output_dir:,infer_source_file:,infer_source_audio_dir:,infer_target_speaker:,infer_key_shift:,infer_vocoder_dir: -- "$@")
eval set -- "$options"

while true; do
  case $1 in
    # Experimental Configuration File
    -c | --config) shift; exp_config=$1 ; shift ;;
    # Experimental Name
    -n | --name) shift; exp_name=$1 ; shift ;;
    # Running Stage
    -s | --stage) shift; running_stage=$1 ; shift ;;
    # Visible GPU machines. The default value is "0".
    --gpu) shift; gpu=$1 ; shift ;;

    # [Only for Training] Resume configuration
    --resume) shift; resume=$1 ; shift ;;
    # [Only for Training] The specific checkpoint path that you want to resume from.
    --resume_from_ckpt_path) shift; resume_from_ckpt_path=$1 ; shift ;;
    # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
    --resume_type) shift; resume_type=$1 ; shift ;;

    # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
    --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
    # [Only for Inference] The output dir to save inferred audios. Its default value is "$infer_expt_dir/result"
    --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
    # [Only for Inference] The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir can be "$work_dir/source_audio" which includes several audio files (*.wav, *.mp3 or *.flac).
    --infer_source_file) shift; infer_source_file=$1 ; shift ;;
    --infer_source_audio_dir) shift; infer_source_audio_dir=$1 ; shift ;;
    # [Only for Inference] Specify the target speaker you want to convert into. You can refer to "[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for the opencpop dataset, the speaker name would be "opencpop_female1".
    --infer_target_speaker) shift; infer_target_speaker=$1 ; shift ;;
    # [Only for Inference] For advanced users, you can set the trans_key parameter to an integer (the number of semitones you want to transpose). Its default value is "autoshift".
    --infer_key_shift) shift; infer_key_shift=$1 ; shift ;;
    # [Only for Inference] The vocoder dir. Its default value is Amphion/pretrained/bigvgan. See Amphion/pretrained/README.md to download the pretrained BigVGAN vocoders.
    --infer_vocoder_dir) shift; infer_vocoder_dir=$1 ; shift ;;

    --) shift ; break ;;
    *) echo "Invalid option: $1"; exit 1 ;;
  esac
done


### Value check ###
if [ -z "$running_stage" ]; then
  echo "[Error] Please specify the running stage"
  exit 1
fi

if [ -z "$exp_config" ]; then
  exp_config="${exp_dir}"/exp_config.json
fi
echo "Experimental Configuration File: $exp_config"

if [ -z "$gpu" ]; then
  gpu="0"
fi

######## Features Extraction ###########
if [ $running_stage -eq 1 ]; then
  CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/svc/preprocess.py \
    --config $exp_config \
    --num_workers 4
fi

######## Training ###########
if [ $running_stage -eq 2 ]; then
  if [ -z "$exp_name" ]; then
    echo "[Error] Please specify the experiment name"
    exit 1
  fi
  echo "Experimental Name: $exp_name"

  if [ "$resume" = true ]; then
    echo "Automatically resume from the experimental dir..."
    CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/svc/train.py \
      --config "$exp_config" \
      --exp_name "$exp_name" \
      --log_level info \
      --resume
  else
    CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/svc/train.py \
      --config "$exp_config" \
      --exp_name "$exp_name" \
      --log_level info \
      --resume_from_ckpt_path "$resume_from_ckpt_path" \
      --resume_type "$resume_type"
  fi
fi

######## Inference/Conversion ###########
if [ $running_stage -eq 3 ]; then
  if [ -z "$infer_expt_dir" ]; then
    echo "[Error] Please specify the experimental directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
    exit 1
  fi

  if [ -z "$infer_output_dir" ]; then
    infer_output_dir="$infer_expt_dir/result"
  fi

  if [ -z "$infer_source_file" ] && [ -z "$infer_source_audio_dir" ]; then
    echo "[Error] Please specify the source file/dir. The inference source can be a json file or a dir. For example, the source_file can be '[Your path to save processed data]/[YourDataset]/test.json', and the source_audio_dir should include several audio files (*.wav, *.mp3 or *.flac)."
    exit 1
  fi

  if [ -z "$infer_source_file" ]; then
    infer_source=$infer_source_audio_dir
  fi

  if [ -z "$infer_source_audio_dir" ]; then
    infer_source=$infer_source_file
  fi

  if [ -z "$infer_target_speaker" ]; then
    echo "[Error] Please specify the target speaker. You can refer to '[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json'. In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for the opencpop dataset, the speaker name would be 'opencpop_female1'."
    exit 1
  fi

  if [ -z "$infer_key_shift" ]; then
    infer_key_shift="autoshift"
  fi

  if [ -z "$infer_vocoder_dir" ]; then
    infer_vocoder_dir="$work_dir"/pretrained/bigvgan
    echo "[Warning] You did not specify infer_vocoder_dir. It is set to $infer_vocoder_dir by default. Make sure that you have followed Amphion/pretrained/README.md to download the pretrained BigVGAN vocoder checkpoint."
  fi

  CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/svc/inference.py \
    --config $exp_config \
    --acoustics_dir $infer_expt_dir \
    --vocoder_dir $infer_vocoder_dir \
    --target_singer $infer_target_speaker \
    --trans_key $infer_key_shift \
    --source $infer_source \
    --output_dir $infer_output_dir \
    --log_level debug
fi
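Per the comments in the script, `--resume_type` distinguishes `resume` (restore model, optimizer, scheduler, and random states) from `finetune` (restore only the model weights). A hedged sketch of that split with a generic checkpoint dict whose keys (`"model"`, `"optimizer"`) are assumptions, not Amphion's checkpoint format:

```python
# Illustrative split between "resume" and "finetune" checkpoint loading.
import torch

def load_checkpoint(path, model, optimizer=None, resume_type="resume"):
    ckpt = torch.load(path, map_location="cpu")
    model.load_state_dict(ckpt["model"])
    if resume_type == "resume" and optimizer is not None:
        optimizer.load_state_dict(ckpt["optimizer"])  # restore the full training state
    # "finetune": keep the fresh optimizer/scheduler and only reuse the weights
    return model, optimizer
```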
egs/svc/_template/run.sh
ADDED
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2023 Amphion.
|
2 |
+
#
|
3 |
+
# This source code is licensed under the MIT license found in the
|
4 |
+
# LICENSE file in the root directory of this source tree.
|
5 |
+
|
6 |
+
######## Build Experiment Environment ###########
|
7 |
+
exp_dir=$(cd `dirname $0`; pwd)
|
8 |
+
work_dir=$(dirname $(dirname $(dirname $exp_dir)))
|
9 |
+
|
10 |
+
export WORK_DIR=$work_dir
|
11 |
+
export PYTHONPATH=$work_dir
|
12 |
+
export PYTHONIOENCODING=UTF-8
|
13 |
+
|
14 |
+
######## Parse the Given Parameters from the Commond ###########
|
15 |
+
options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,resume_from_ckpt_path:,resume_type:,infer_expt_dir:,infer_output_dir:,infer_source_file:,infer_source_audio_dir:,infer_target_speaker:,infer_key_shift:,infer_vocoder_dir: -- "$@")
|
16 |
+
eval set -- "$options"
|
17 |
+
|
18 |
+
while true; do
|
19 |
+
case $1 in
|
20 |
+
# Experimental Configuration File
|
21 |
+
-c | --config) shift; exp_config=$1 ; shift ;;
|
22 |
+
# Experimental Name
|
23 |
+
-n | --name) shift; exp_name=$1 ; shift ;;
|
24 |
+
# Running Stage
|
25 |
+
-s | --stage) shift; running_stage=$1 ; shift ;;
|
26 |
+
# Visible GPU machines. The default value is "0".
|
27 |
+
--gpu) shift; gpu=$1 ; shift ;;
|
28 |
+
|
29 |
+
# [Only for Training] Resume configuration
|
30 |
+
--resume) shift; resume=$1 ; shift ;;
|
31 |
+
# [Only for Training] The specific checkpoint path that you want to resume from.
|
32 |
+
--resume_from_ckpt_path) shift; resume_from_ckpt_path=$1 ; shift ;;
|
33 |
+
# [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
|
34 |
+
--resume_type) shift; resume_type=$1 ; shift ;;
|
35 |
+
|
36 |
+
# [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
|
37 |
+
--infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
|
38 |
+
# [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
|
39 |
+
--infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
|
40 |
+
# [Only for Inference] The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir can be "$work_dir/source_audio" which includes several audio files (*.wav, *.mp3 or *.flac).
|
41 |
+
--infer_source_file) shift; infer_source_file=$1 ; shift ;;
|
42 |
+
--infer_source_audio_dir) shift; infer_source_audio_dir=$1 ; shift ;;
|
43 |
+
# [Only for Inference] Specify the target speaker you want to convert into. You can refer to "[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for opencpop dataset, the speaker name would be "opencpop_female1".
|
44 |
+
--infer_target_speaker) shift; infer_target_speaker=$1 ; shift ;;
|
45 |
+
# [Only for Inference] For advanced users, you can modify the trans_key parameters into an integer (which means the semitones you want to transpose). Its default value is "autoshift".
|
46 |
+
--infer_key_shift) shift; infer_key_shift=$1 ; shift ;;
|
47 |
+
# [Only for Inference] The vocoder dir. Its default value is Amphion/pretrained/bigvgan. See Amphion/pretrained/README.md to download the pretrained BigVGAN vocoders.
|
48 |
+
--infer_vocoder_dir) shift; infer_vocoder_dir=$1 ; shift ;;
|
49 |
+
|
50 |
+
--) shift ; break ;;
|
51 |
+
*) echo "Invalid option: $1" exit 1 ;;
|
52 |
+
esac
|
53 |
+
done

### Value check ###
if [ -z "$running_stage" ]; then
    echo "[Error] Please specify the running stage"
    exit 1
fi

if [ -z "$exp_config" ]; then
    exp_config="${exp_dir}"/exp_config.json
fi
echo "Experimental Configuration File: $exp_config"

if [ -z "$gpu" ]; then
    gpu="0"
fi

######## Features Extraction ###########
if [ $running_stage -eq 1 ]; then
    CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/svc/preprocess.py \
        --config $exp_config \
        --num_workers 4
fi

######## Training ###########
if [ $running_stage -eq 2 ]; then
    if [ -z "$exp_name" ]; then
        echo "[Error] Please specify the experiment name"
        exit 1
    fi
    echo "Experimental Name: $exp_name"

    if [ "$resume" = true ]; then
        echo "Automatically resume from the experimental dir..."
        CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/svc/train.py \
            --config "$exp_config" \
            --exp_name "$exp_name" \
            --log_level info \
            --resume
    else
        CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/svc/train.py \
            --config "$exp_config" \
            --exp_name "$exp_name" \
            --log_level info \
            --resume_from_ckpt_path "$resume_from_ckpt_path" \
            --resume_type "$resume_type"
    fi
fi

######## Inference/Conversion ###########
if [ $running_stage -eq 3 ]; then
    if [ -z "$infer_expt_dir" ]; then
        echo "[Error] Please specify the experiment directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
        exit 1
    fi

    if [ -z "$infer_output_dir" ]; then
        infer_output_dir="$infer_expt_dir/result"
    fi

    if [ -z "$infer_source_file" ] && [ -z "$infer_source_audio_dir" ]; then
        echo "[Error] Please specify the source file/dir. The inference source can be a json file or a dir. For example, the source_file can be '[Your path to save processed data]/[YourDataset]/test.json', and the source_audio_dir should include several audio files (*.wav, *.mp3 or *.flac)."
        exit 1
    fi

    if [ -z "$infer_source_file" ]; then
        infer_source=$infer_source_audio_dir
    fi

    if [ -z "$infer_source_audio_dir" ]; then
        infer_source=$infer_source_file
    fi

    if [ -z "$infer_target_speaker" ]; then
        echo "[Error] Please specify the target speaker. You can refer to '[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json'. In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for the opencpop dataset, the speaker name would be 'opencpop_female1'."
        exit 1
    fi

    if [ -z "$infer_key_shift" ]; then
        infer_key_shift="autoshift"
    fi

    if [ -z "$infer_vocoder_dir" ]; then
        infer_vocoder_dir="$work_dir"/pretrained/bigvgan
        echo "[Warning] infer_vocoder_dir is not specified. It is set to $infer_vocoder_dir by default. Make sure that you have followed Amphion/pretrained/README.md to download the pretrained BigVGAN vocoder checkpoint."
    fi

    CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/svc/inference.py \
        --config $exp_config \
        --acoustics_dir $infer_expt_dir \
        --vocoder_dir $infer_vocoder_dir \
        --target_singer $infer_target_speaker \
        --trans_key $infer_key_shift \
        --source $infer_source \
        --output_dir $infer_output_dir \
        --log_level debug
fi
egs/tta/README.md
ADDED
@@ -0,0 +1,19 @@
# Amphion Text-to-Audio (TTA) Recipe

## Quick Start

We provide a **[beginner recipe](RECIPE.md)** to demonstrate how to train a cutting-edge TTA model. Specifically, it is designed as a latent diffusion model like [AudioLDM](https://arxiv.org/abs/2301.12503), [Make-an-Audio](https://arxiv.org/abs/2301.12661), and [AUDIT](https://arxiv.org/abs/2304.00830).

## Supported Model Architectures

So far, Amphion has supported a latent-diffusion-based text-to-audio model:

<br>
<div align="center">
<img src="../../imgs/tta/DiffusionTTA.png" width="65%">
</div>
<br>

Similar to [AUDIT](https://arxiv.org/abs/2304.00830), we implement it with two-stage training:
1. Training the VAE, which is called `AutoencoderKL` in Amphion.
2. Training the conditional latent diffusion model, which is called `AudioLDM` in Amphion.
egs/tta/RECIPE.md
ADDED
@@ -0,0 +1,156 @@
# Text-to-Audio with Latent Diffusion Model

This is the quick tour for training a text-to-audio model with the popular and powerful generative model: [Latent Diffusion Model](https://arxiv.org/abs/2112.10752). Specifically, this recipe is also the official implementation of the text-to-audio generation part of our NeurIPS 2023 paper "[AUDIT: Audio Editing by Following Instructions with Latent Diffusion Models](https://arxiv.org/abs/2304.00830)". You can check the last part of the [AUDIT demos](https://audit-demo.github.io/) to see some text-to-audio examples.

<br>
<div align="center">
<img src="../../imgs/tta/DiffusionTTA.png" width="65%">
</div>
<br>

We train this latent diffusion model in two stages:
1. In the first stage, we aim to obtain a high-quality VAE (called `AutoencoderKL` in Amphion), so that we can project the input mel-spectrograms into an efficient, low-dimensional latent space. Specifically, we train the VAE with a GAN loss to improve the reconstruction quality.
2. In the second stage, we aim to obtain a text-controllable diffusion model (called `AudioLDM` in Amphion). We use a U-Net-based diffusion model and a T5 encoder as the text encoder.

There are four stages in total for training the text-to-audio model:

1. Data preparation and processing
2. Train the VAE model
3. Train the latent diffusion model
4. Inference

> **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
> ```bash
> cd Amphion
> ```

## Overview

```sh
# Train the VAE model
sh egs/tta/autoencoderkl/run_train.sh

# Train the latent diffusion model
sh egs/tta/audioldm/run_train.sh

# Inference
sh egs/tta/audioldm/run_inference.sh
```

## 1. Data preparation and processing

### Dataset Download

We take [AudioCaps](https://audiocaps.github.io/) as an example. AudioCaps is a dataset of around 44K audio-caption pairs, where each audio clip corresponds to a caption with rich semantic information. You can download the dataset [here](https://github.com/cdjkim/audiocaps).

<!-- How to download AudioCaps is detailed [here](../datasets/README.md) -->
<!-- You can download the dataset [here](https://github.com/cdjkim/audiocaps). -->

### Data Processing

- Download the AudioCaps dataset to `[Your path to save tta dataset]` and modify `preprocess.processed_dir` in `egs/tta/.../exp_config.json`.

```json
{
    "dataset": [
        "AudioCaps"
    ],
    "preprocess": {
        // Specify the output root path to save the processed data
        "processed_dir": "[Your path to save tta dataset]",
        ...
    }
}
```

The folder structure of your downloaded data should be similar to:

```plaintext
.../[Your path to save tta dataset]
┣ AudioCaps
┃ ┣ wav
┃ ┃ ┣ ---1_cCGK4M_0_10000.wav
┃ ┃ ┣ ---lTs1dxhU_30000_40000.wav
┃ ┃ ┣ ...
```

- Then you may process the data into mel-spectrograms and save them in `.npy` format (a minimal sketch of this step and the next one follows this list). If you use the data we provide, all the wav data has already been processed.

- Generate a json file to save the metadata; the json file looks like:

```json
[
    {
        "Dataset": "AudioCaps",
        "Uid": "---1_cCGK4M_0_10000",
        "Caption": "Idling car, train blows horn and passes"
    },
    {
        "Dataset": "AudioCaps",
        "Uid": "---lTs1dxhU_30000_40000",
        "Caption": "A racing vehicle engine is heard passing by"
    },
    ...
]
```
- Finally, the folder structure is like:

```plaintext
.../[Your path to save tta dataset]
┣ AudioCaps
┃ ┣ wav
┃ ┃ ┣ ---1_cCGK4M_0_10000.wav
┃ ┃ ┣ ---lTs1dxhU_30000_40000.wav
┃ ┃ ┣ ...
┃ ┣ mel
┃ ┃ ┣ ---1_cCGK4M_0_10000.npy
┃ ┃ ┣ ---lTs1dxhU_30000_40000.npy
┃ ┃ ┣ ...
┃ ┣ train.json
┃ ┣ valid.json
┃ ┣ ...
```
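
For reference, here is a minimal sketch of the two processing steps above (mel-spectrogram extraction and metadata generation). It is not the Amphion preprocessing entry point: the mel parameters (`sr`, `n_fft`, `hop_length`, `n_mels`) and the assumed `captions.csv` file mapping each uid to its caption are illustrative; take the actual feature parameters from your `exp_config.json`.

```python
# Illustrative preprocessing sketch (not the Amphion preprocessing entry point).
# Assumptions: wavs live in <root>/AudioCaps/wav, and captions.csv maps uid -> caption.
import csv
import json
from pathlib import Path

import librosa
import numpy as np

root = Path("[Your path to save tta dataset]") / "AudioCaps"
mel_dir = root / "mel"
mel_dir.mkdir(exist_ok=True)

# 1. Compute a log-mel-spectrogram for every wav and save it as .npy.
#    The STFT/mel parameters below are placeholders; use the values from exp_config.json.
for wav_path in sorted((root / "wav").glob("*.wav")):
    y, sr = librosa.load(wav_path, sr=16000)
    mel = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=1024, hop_length=256, n_mels=80)
    log_mel = np.log(np.clip(mel, a_min=1e-5, a_max=None))
    np.save(mel_dir / f"{wav_path.stem}.npy", log_mel)

# 2. Build the metadata json in the format shown above.
#    valid.json should be built the same way from a held-out split.
with open(root / "captions.csv") as f:
    captions = {row["uid"]: row["caption"] for row in csv.DictReader(f)}

metadata = [
    {"Dataset": "AudioCaps", "Uid": uid, "Caption": caption}
    for uid, caption in captions.items()
    if (mel_dir / f"{uid}.npy").exists()
]
with open(root / "train.json", "w") as f:
    json.dump(metadata, f, indent=4)
```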

## 2. Training the VAE Model

The first-stage model is a VAE trained with a GAN loss (called `AutoencoderKL` in Amphion). Run the following commands:

```sh
sh egs/tta/autoencoderkl/run_train.sh
```

## 3. Training the Latent Diffusion Model

The second-stage model is a conditional diffusion model with a T5 text encoder (called `AudioLDM` in Amphion). Run the following commands:

```sh
sh egs/tta/audioldm/run_train.sh
```

## 4. Inference

Now you can generate audio with your trained latent diffusion model. Run the following commands and modify the `text` argument:

```sh
sh egs/tta/audioldm/run_inference.sh \
--text "A man is whistling"
```

## Citations

```bibtex
@article{wang2023audit,
  title={AUDIT: Audio Editing by Following Instructions with Latent Diffusion Models},
  author={Wang, Yuancheng and Ju, Zeqian and Tan, Xu and He, Lei and Wu, Zhizheng and Bian, Jiang and Zhao, Sheng},
  journal={NeurIPS 2023},
  year={2023}
}

@article{liu2023audioldm,
  title={{AudioLDM}: Text-to-Audio Generation with Latent Diffusion Models},
  author={Liu, Haohe and Chen, Zehua and Yuan, Yi and Mei, Xinhao and Liu, Xubo and Mandic, Danilo and Wang, Wenwu and Plumbley, Mark D},
  journal={Proceedings of the International Conference on Machine Learning},
  year={2023}
}
```
egs/tta/audioldm/exp_config.json
ADDED
@@ -0,0 +1,90 @@
{
    "base_config": "egs/tta/audioldm/exp_config_base.json",
    "dataset": [
        "AudioCaps"
    ],
    "preprocess": {
        // Specify the output root path to save the processed data
        "processed_dir": "data",
        // For example: "/home/TTADataset/processed_data"

        // feature
        "use_spkid": false,
        "use_uv": false,
        "use_frame_pitch": false,
        "use_phone_pitch": false,
        "use_frame_energy": false,
        "use_phone_energy": false,
        "use_mel": false,
        "use_audio": false,
        "use_label": false,
        "use_one_hot": false,
        // feature for text to audio
        "use_caption": true,
        "use_melspec": true,
        "use_wav": false,
        // feature dir
        "melspec_dir": "mel",
        "wav_dir": "wav"
    },
    // Specify the output root path to save model ckpts and logs
    "log_dir": "ckpts/tta",
    // For example: "/home/TTADataset/processed_data/logs"

    // model
    "model": {
        "audioldm": {
            "image_size": 32,
            "in_channels": 4,
            "out_channels": 4,
            "model_channels": 256,
            "attention_resolutions": [4, 2, 1],
            "num_res_blocks": 2,
            "channel_mult": [1, 2, 4],
            "num_heads": 8,
            "use_spatial_transformer": true,
            "transformer_depth": 1,
            "context_dim": 768,
            "use_checkpoint": true,
            "legacy": false
        },
        "autoencoderkl": {
            "ch": 128,
            "ch_mult": [1, 1, 2, 2, 4],
            "num_res_blocks": 2,
            "in_channels": 1,
            "z_channels": 4,
            "out_ch": 1,
            "double_z": true
        },
        "noise_scheduler": {
            "num_train_timesteps": 1000,
            "beta_start": 0.00085,
            "beta_end": 0.012,
            "beta_schedule": "scaled_linear",
            "clip_sample": false,
            "steps_offset": 1,
            "set_alpha_to_one": false,
            "skip_prk_steps": true,
            "prediction_type": "epsilon"
        },
        "autoencoder_path": "ckpts/tta/autoencoder_kl_debug/checkpoints/step-0445000_loss-0.3306.pt"
    },

    // train
    "train": {
        "adam": {
            "lr": 5.0e-5
        },
        "ddp": false,
        "random_seed": 12345,
        "batch_size": 12,
        "epochs": 50000,
        "max_steps": 1000000,
        "total_training_steps": 800000,
        "save_summary_steps": 1000,
        "save_checkpoints_steps": 5000,
        "valid_interval": 5000,
        "keep_checkpoint_max": 100
    }
}
egs/tta/audioldm/exp_config_base.json
ADDED
@@ -0,0 +1,11 @@
{
    "base_config": "config/audioldm.json",
    "model_type": "AudioLDM",
    "dataset": [
        "AudioCaps"
    ],
    "preprocess": {
        "train_file": "train.json",
        "valid_file": "valid.json"
    }
}
egs/tta/audioldm/exp_config_latent_4_10_78.json
ADDED
@@ -0,0 +1,88 @@
{
    "base_config": "egs/tta/audioldm/exp_config_base.json",
    "dataset": [
        "AudioCaps"
    ],
    "preprocess": {
        // Specify the output root path to save the processed data
        "processed_dir": "data",

        // feature
        "use_spkid": false,
        "use_uv": false,
        "use_frame_pitch": false,
        "use_phone_pitch": false,
        "use_frame_energy": false,
        "use_phone_energy": false,
        "use_mel": false,
        "use_audio": false,
        "use_label": false,
        "use_one_hot": false,
        // feature for text to audio
        "use_caption": true,
        "use_melspec": true,
        "use_wav": false,
        // feature dir
        "melspec_dir": "mel",
        "wav_dir": "wav"
    },
    // Specify the output root path to save model ckpts and logs
    "log_dir": "ckpts/tta",

    // model
    "model": {
        "audioldm": {
            "image_size": 32,
            "in_channels": 4,
            "out_channels": 4,
            "model_channels": 256,
            "attention_resolutions": [4, 2, 1],
            "num_res_blocks": 2,
            "channel_mult": [1, 2, 4],
            "num_heads": 8,
            "use_spatial_transformer": true,
            "transformer_depth": 1,
            "context_dim": 768,
            "use_checkpoint": true,
            "legacy": false
        },
        "autoencoderkl": {
            "ch": 128,
            "ch_mult": [1, 2, 2, 4],
            "num_res_blocks": 2,
            "in_channels": 1,
            "z_channels": 4,
            "out_ch": 1,
            "double_z": true
        },
        "noise_scheduler": {
            "num_train_timesteps": 1000,
            "beta_start": 0.00085,
            "beta_end": 0.012,
            "beta_schedule": "scaled_linear",
            "clip_sample": false,
            "steps_offset": 1,
            "set_alpha_to_one": false,
            "skip_prk_steps": true,
            "prediction_type": "epsilon"
        },
        "autoencoder_path": "ckpts/tta/autoencoder_kl_debug_latent_size_4_10_78/checkpoints/step-0390000_loss-0.2876.pt"
    },

    // train
    "train": {
        "adam": {
            "lr": 2.0e-5
        },
        "ddp": false,
        "random_seed": 12345,
        "batch_size": 12,
        "epochs": 50000,
        "max_steps": 1000000,
        "total_training_steps": 800000,
        "save_summary_steps": 1000,
        "save_checkpoints_steps": 5000,
        "valid_interval": 5000,
        "keep_checkpoint_max": 100
    }
}
egs/tta/audioldm/run_inference.sh
ADDED
@@ -0,0 +1,52 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

######## Build Experiment Environment ###########
exp_dir=$(cd `dirname $0`; pwd)
work_dir=$(dirname $(dirname $(dirname $exp_dir)))

export WORK_DIR=$work_dir
export PYTHONPATH=$work_dir
export PYTHONIOENCODING=UTF-8

######## Set Experiment Configuration ###########
exp_config="$exp_dir/exp_config.json"
exp_name="audioldm_debug_latent_size_4_5_39"
checkpoint_path="$work_dir/ckpts/tta/audioldm_debug_latent_size_4_5_39/checkpoints/step-0570000_loss-0.2521.pt"
output_dir="$work_dir/temp"
vocoder_config_path="$work_dir/ckpts/tta/hifigan_checkpoints/config.json"
vocoder_path="$work_dir/ckpts/tta/hifigan_checkpoints/g_01250000"
num_steps=200
guidance_scale=4.0

export CUDA_VISIBLE_DEVICES="0"

######## Parse Command Line Arguments ###########
while [[ $# -gt 0 ]]
do
    key="$1"

    case $key in
        --text)
            text="$2"
            shift # past argument
            shift # past value
            ;;
        *) # unknown option
            shift # past argument
            ;;
    esac
done
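
# NOTE: Minimal usage example (as shown in egs/tta/RECIPE.md), assuming the default
# checkpoint and vocoder paths configured above exist:
#   sh egs/tta/audioldm/run_inference.sh --text "A man is whistling"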

######## Run inference ###########
python "${work_dir}"/bins/tta/inference.py \
    --config=$exp_config \
    --checkpoint_path=$checkpoint_path \
    --text="$text" \
    --vocoder_path=$vocoder_path \
    --vocoder_config_path=$vocoder_config_path \
    --num_steps=$num_steps \
    --guidance_scale=$guidance_scale \
    --output_dir=$output_dir
egs/tta/audioldm/run_inference_latent_4_10_78.sh
ADDED
@@ -0,0 +1,52 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

######## Build Experiment Environment ###########
exp_dir=$(cd `dirname $0`; pwd)
work_dir=$(dirname $(dirname $(dirname $exp_dir)))

export WORK_DIR=$work_dir
export PYTHONPATH=$work_dir
export PYTHONIOENCODING=UTF-8

######## Set Experiment Configuration ###########
exp_config="$exp_dir/exp_config_latent_4_10_78.json"
exp_name="audioldm_debug_latent_size_4_10_78"
checkpoint_path="$work_dir/ckpts/tta/audioldm_debug_latent_size_4_10_78/checkpoints/step-0325000_loss-0.1936.pt"
output_dir="$work_dir/temp"
vocoder_config_path="$work_dir/ckpts/tta/hifigan_checkpoints/config.json"
vocoder_path="$work_dir/ckpts/tta/hifigan_checkpoints/g_01250000"
num_steps=200
guidance_scale=4.0

export CUDA_VISIBLE_DEVICES="0"

######## Parse Command Line Arguments ###########
while [[ $# -gt 0 ]]
do
    key="$1"

    case $key in
        --text)
            text="$2"
            shift # past argument
            shift # past value
            ;;
        *) # unknown option
            shift # past argument
            ;;
    esac
done

######## Run inference ###########
python "${work_dir}"/bins/tta/inference.py \
    --config=$exp_config \
    --checkpoint_path=$checkpoint_path \
    --text="A man is whistling" \
    --vocoder_path=$vocoder_path \
    --vocoder_config_path=$vocoder_config_path \
    --num_steps=$num_steps \
    --guidance_scale=$guidance_scale \
    --output_dir=$output_dir
egs/tta/audioldm/run_train.sh
ADDED
@@ -0,0 +1,26 @@
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

######## Build Experiment Environment ###########
exp_dir=$(cd `dirname $0`; pwd)
work_dir=$(dirname $(dirname $(dirname $exp_dir)))

export WORK_DIR=$work_dir
export PYTHONPATH=$work_dir
export PYTHONIOENCODING=UTF-8

######## Set Experiment Configuration ###########
exp_config="$exp_dir/exp_config.json"
exp_name="audioldm_debug_latent_size_4_5_39"

num_workers=8
export CUDA_VISIBLE_DEVICES="0"

######## Train Model ###########
python "${work_dir}"/bins/tta/train_tta.py \
    --config=$exp_config \
    --num_workers=$num_workers \
    --exp_name=$exp_name \
    --stdout_interval=25 \