import os
import json
import logging
import torch
import config
import numpy as np
from utils.utils import check_is_none
from vits import VITS
from voice import TTS
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Map each text-cleaner name to the language codes it supports.
lang_dict = {
    "english_cleaners": ["en"],
    "english_cleaners2": ["en"],
    "japanese_cleaners": ["ja"],
    "japanese_cleaners2": ["ja"],
    "korean_cleaners": ["ko"],
    "chinese_cleaners": ["zh"],
    "zh_ja_mixture_cleaners": ["zh", "ja"],
    "sanskrit_cleaners": ["sa"],
    "cjks_cleaners": ["zh", "ja", "ko", "sa"],
    "cjke_cleaners": ["zh", "ja", "ko", "en"],
    "cjke_cleaners2": ["zh", "ja", "ko", "en"],
    "cje_cleaners": ["zh", "ja", "en"],
    "cje_cleaners2": ["zh", "ja", "en"],
    "thai_cleaners": ["th"],
    "shanghainese_cleaners": ["sh"],
    "chinese_dialect_cleaners": ["zh", "ja", "sh", "gd", "en", "SZ", "WX", "CZ", "HZ", "SX", "NB", "JJ", "YX", "JD",
                                 "ZR", "PH", "TX", "JS", "HN", "LP", "XS", "FY", "RA", "CX", "SM", "TT", "WZ", "SC",
                                 "YB"],
    "bert_chinese_cleaners": ["zh"],
}
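
# Example lookup (illustrative only): merge_model() below resolves a model's
# language list from its cleaner name, falling back to ["unknown"] for any
# cleaner not listed above.
#
#   lang_dict.get("cjke_cleaners2", ["unknown"])    # -> ["zh", "ja", "ko", "en"]
#   lang_dict.get("some_new_cleaner", ["unknown"])  # -> ["unknown"]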


def analysis(model_config_json):
    """Infer the model type (vits / hubert / w2v2 / bert_vits2) from an open config.json file object."""
    model_config = json.load(model_config_json)
    symbols = model_config.get("symbols", None)
    emotion_embedding = model_config.get("data").get("emotion_embedding", False)
    # Bert-VITS2 configs are identified by a dedicated flag in the "model" section
    if "use_spk_conditioned_encoder" in model_config.get("model"):
        model_type = "bert_vits2"
        return model_type
    if symbols is not None:
        if not emotion_embedding:
            model_type = "vits"
        else:
            model_type = "w2v2"
    else:
        model_type = "hubert"
    return model_type
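
# Usage sketch for analysis() (hypothetical config file, for illustration only):
#
#   with open("config.json", "r", encoding="utf-8") as f:
#       model_type = analysis(f)
#
# Classification, informally:
#   * config["model"] contains "use_spk_conditioned_encoder"      -> "bert_vits2"
#   * "symbols" present, data.emotion_embedding false or missing  -> "vits"
#   * "symbols" present, data.emotion_embedding true              -> "w2v2"
#   * "symbols" missing                                           -> "hubert"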


def load_npy(model_):
    """Load dimensional-emotion reference vectors from a .npy file, a list of .npy files, or a directory."""
    if isinstance(model_, list):
        # check that every entry is a .npy file
        for i in model_:
            _model_extension = os.path.splitext(i)[1]
            if _model_extension != ".npy":
                raise ValueError(f"Unsupported model type: {_model_extension}")
        # merge npy files
        emotion_reference = np.empty((0, 1024))
        for i in model_:
            tmp = np.load(i).reshape(-1, 1024)
            emotion_reference = np.append(emotion_reference, tmp, axis=0)
    elif os.path.isdir(model_):
        # walk the directory and merge every .npy file found
        emotion_reference = np.empty((0, 1024))
        for root, dirs, files in os.walk(model_):
            for file_name in files:
                # skip anything that is not a .npy file
                _model_extension = os.path.splitext(file_name)[1]
                if _model_extension != ".npy":
                    continue
                file_path = os.path.join(root, file_name)
                # merge npy files
                tmp = np.load(file_path).reshape(-1, 1024)
                emotion_reference = np.append(emotion_reference, tmp, axis=0)
    elif os.path.isfile(model_):
        # single .npy file, loaded as-is
        _model_extension = os.path.splitext(model_)[1]
        if _model_extension != ".npy":
            raise ValueError(f"Unsupported model type: {_model_extension}")
        emotion_reference = np.load(model_)
    logging.info(f"Loaded dimensional emotion npy, reference count: {len(emotion_reference)}")
    return emotion_reference
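
# Usage sketch for load_npy() (paths are hypothetical):
#
#   refs = load_npy("emotions/sample.npy")    # single .npy file, loaded as-is
#   refs = load_npy("emotions/")              # every .npy found under a directory
#   refs = load_npy(["a.npy", "b.npy"])       # explicit list of .npy files
#
# Directory and list inputs are reshaped to (-1, 1024) and concatenated, so the
# result is an (N, 1024) array of dimensional-emotion reference vectors.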


def merge_model(merging_model):
    """Group (model, config) pairs by model type, load each model, and build a single TTS front end."""
    vits_obj = []
    vits_speakers = []
    hubert_vits_obj = []
    hubert_vits_speakers = []
    w2v2_vits_obj = []
    w2v2_vits_speakers = []
    bert_vits2_obj = []
    bert_vits2_speakers = []

    # model list
    vits_list = []
    hubert_vits_list = []
    w2v2_vits_list = []
    bert_vits2_list = []

    # classify each (model_path, config_path) pair by the model type found in its config
    for item in merging_model:
        with open(item[1], 'r', encoding='utf-8') as model_config:
            model_type = analysis(model_config)
            if model_type == "vits":
                vits_list.append(item)
            elif model_type == "hubert":
                hubert_vits_list.append(item)
            elif model_type == "w2v2":
                w2v2_vits_list.append(item)
            elif model_type == "bert_vits2":
                bert_vits2_list.append(item)

    # merge vits
    new_id = 0
    for obj_id, i in enumerate(vits_list):
        obj = VITS(model=i[0], config=i[1], model_type="vits", device=device)
        lang = lang_dict.get(obj.get_cleaner(), ["unknown"])
        for speaker_id, name in enumerate(obj.get_speakers()):
            vits_obj.append([int(speaker_id), obj, obj_id])
            vits_speakers.append({"id": new_id, "name": name, "lang": lang})
            new_id += 1

    # merge hubert-vits
    if len(hubert_vits_list) != 0:
        if getattr(config, "HUBERT_SOFT_MODEL", None) is None or check_is_none(config.HUBERT_SOFT_MODEL):
            raise ValueError("Please configure HUBERT_SOFT_MODEL path in config.py")
        try:
            from vits.hubert_model import hubert_soft
            hubert = hubert_soft(config.HUBERT_SOFT_MODEL)
        except Exception as e:
            raise ValueError(f"Load HUBERT_SOFT_MODEL failed {e}")
        new_id = 0
        for obj_id, i in enumerate(hubert_vits_list):
            obj = VITS(model=i[0], config=i[1], model_=hubert, model_type="hubert", device=device)
            lang = lang_dict.get(obj.get_cleaner(), ["unknown"])
            for speaker_id, name in enumerate(obj.get_speakers()):
                hubert_vits_obj.append([int(speaker_id), obj, obj_id])
                hubert_vits_speakers.append({"id": new_id, "name": name, "lang": lang})
                new_id += 1

    # merge w2v2-vits
    emotion_reference = None
    if len(w2v2_vits_list) != 0:
        if getattr(config, "DIMENSIONAL_EMOTION_NPY", None) is None or check_is_none(config.DIMENSIONAL_EMOTION_NPY):
            raise ValueError("Please configure DIMENSIONAL_EMOTION_NPY path in config.py")
        try:
            emotion_reference = load_npy(config.DIMENSIONAL_EMOTION_NPY)
        except Exception as e:
            raise ValueError(f"Load DIMENSIONAL_EMOTION_NPY failed {e}")
        new_id = 0
        for obj_id, i in enumerate(w2v2_vits_list):
            obj = VITS(model=i[0], config=i[1], model_=emotion_reference, model_type="w2v2", device=device)
            lang = lang_dict.get(obj.get_cleaner(), ["unknown"])
            for speaker_id, name in enumerate(obj.get_speakers()):
                w2v2_vits_obj.append([int(speaker_id), obj, obj_id])
                w2v2_vits_speakers.append({"id": new_id, "name": name, "lang": lang})
                new_id += 1

    # merge Bert_VITS2
    if len(bert_vits2_list) != 0:
        from bert_vits2 import Bert_VITS2
        new_id = 0
        for obj_id, i in enumerate(bert_vits2_list):
            obj = Bert_VITS2(model=i[0], config=i[1], device=device)
            lang = ["ZH"]
            for speaker_id, name in enumerate(obj.get_speakers()):
                bert_vits2_obj.append([int(speaker_id), obj, obj_id])
                bert_vits2_speakers.append({"id": new_id, "name": name, "lang": lang})
                new_id += 1

    voice_obj = {"VITS": vits_obj, "HUBERT-VITS": hubert_vits_obj, "W2V2-VITS": w2v2_vits_obj,
                 "BERT-VITS2": bert_vits2_obj}
    voice_speakers = {"VITS": vits_speakers, "HUBERT-VITS": hubert_vits_speakers, "W2V2-VITS": w2v2_vits_speakers,
                      "BERT-VITS2": bert_vits2_speakers}

    w2v2_emotion_count = len(emotion_reference) if emotion_reference is not None else 0
    tts = TTS(voice_obj, voice_speakers, w2v2_emotion_count=w2v2_emotion_count, device=device)
    return tts
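
# Usage sketch for merge_model() (hypothetical model/config paths; in practice the
# pairs are expected to come from a MODEL_LIST-style entry in config.py):
#
#   tts = merge_model([
#       ["Model/vits/G_0.pth", "Model/vits/config.json"],
#       ["Model/bert_vits2/G_0.pth", "Model/bert_vits2/config.json"],
#   ])
#
# The returned TTS object wraps all loaded models behind one merged speaker table,
# grouped by model type ("VITS", "HUBERT-VITS", "W2V2-VITS", "BERT-VITS2").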