Spaces:

Hilley
/

ChatVC

Running

App Files Files Community

ChatVC / api.py

Hilley

Update api.py

55f0e01 verified 5 months ago

raw

history blame

8.75 kB

	import torch
	import numpy as np
	import re
	import soundfile
	import utils
	import commons
	import os
	import librosa
	from text import text_to_sequence
	from mel_processing import spectrogram_torch
	from models import SynthesizerTrn


	class OpenVoiceBaseClass(object):
	    """Common base that owns a ``SynthesizerTrn`` instance, its hparams and device.

	    Attributes set by ``__init__``:
	        model: the ``SynthesizerTrn`` instance, moved to ``device`` and put in eval mode.
	        hps: the object returned by ``utils.get_hparams_from_file``.
	        device: the device string the caller passed in.
	    """

	    def __init__(self, config_path, device='cuda:0'):
	        """Build the model described by ``config_path`` and move it to ``device``.

	        Args:
	            config_path: Path handed to ``utils.get_hparams_from_file``.
	            device: Target passed to ``Module.to``. CUDA availability is not
	                checked here, so a CUDA device on a CPU-only host fails inside
	                ``.to(device)``.
	        """
	        hparams = utils.get_hparams_from_file(config_path)

	        # ``symbols`` is optional on the hparams object; a missing attribute
	        # is treated as an empty collection, giving a count of 0.
	        synthesizer = SynthesizerTrn(
	            len(getattr(hparams, 'symbols', [])),
	            hparams.data.filter_length // 2 + 1,
	            n_speakers=hparams.data.n_speakers,
	            **hparams.model,
	        )
	        synthesizer = synthesizer.to(device)
	        synthesizer.eval()

	        self.model = synthesizer
	        self.hps = hparams
	        self.device = device

	    def load_ckpt(self, ckpt_path):
	        """Load the ``'model'`` entry of a checkpoint file into ``self.model``.

	        The checkpoint is read onto the CPU and applied with ``strict=False``;
	        the missing and unexpected keys are printed rather than raised.

	        Args:
	            ckpt_path: Path of a file readable by ``torch.load`` that holds a
	                mapping with a ``'model'`` key.
	        """
	        # NOTE: torch.load may unpickle arbitrary objects depending on the
	        # torch version / ``weights_only`` setting - only load trusted files.
	        cpu = torch.device('cpu')
	        state = torch.load(ckpt_path, map_location=cpu)
	        missing, unexpected = self.model.load_state_dict(state['model'], strict=False)
	        print("Loaded checkpoint '{}'".format(ckpt_path))
	        print('missing/unexpected keys:', missing, unexpected)


	class BaseSpeakerTTS(OpenVoiceBaseClass):
	    """Sentence-by-sentence text-to-speech on top of the base model.

	    ``tts`` splits the input text, runs ``self.model.infer`` on every piece
	    and joins the resulting waveforms with a short silence after each one.
	    """

	    # Lower-cased language name -> tag wrapped around every sentence
	    # (``[EN]...[EN]``) before it is handed to ``text_to_sequence``.
	    language_marks = {
	        "english": "EN",
	        "chinese": "ZH",
	    }

	    @staticmethod
	    def get_text(text, hps, is_symbol):
	        """Turn ``text`` into a ``torch.LongTensor`` via ``text_to_sequence``.

	        Args:
	            text: Input string.
	            hps: Hparams object providing ``symbols``, ``data.text_cleaners``
	                and ``data.add_blank``.
	            is_symbol: When true an empty cleaner list is passed instead of
	                ``hps.data.text_cleaners``.

	        Returns:
	            ``torch.LongTensor`` of the sequence, with ``0`` interspersed by
	            ``commons.intersperse`` when ``hps.data.add_blank`` is set.
	        """
	        text_norm = text_to_sequence(text, hps.symbols, [] if is_symbol else hps.data.text_cleaners)
	        if hps.data.add_blank:
	            text_norm = commons.intersperse(text_norm, 0)
	        return torch.LongTensor(text_norm)

	    @staticmethod
	    def audio_numpy_concat(segment_data_list, sr, speed=1.):
	        """Flatten and join audio segments, appending a silence after each one.

	        Args:
	            segment_data_list: Iterable of array-likes; each is flattened.
	            sr: Sampling rate. The silence appended after every segment is
	                ``int((sr * 0.05) / speed)`` zero samples long.
	            speed: Divisor applied to the silence length.

	        Returns:
	            One-dimensional ``float32`` numpy array; empty when there are no
	            segments.
	        """
	        # Work on float32 arrays directly instead of round-tripping every
	        # sample through a Python list.
	        segments = [
	            np.asarray(segment_data, dtype=np.float32).reshape(-1)
	            for segment_data in segment_data_list
	        ]
	        if not segments:
	            # np.concatenate rejects an empty sequence.
	            return np.zeros(0, dtype=np.float32)

	        # A negative length (negative speed) yields no padding, like ``[0] * n``.
	        gap = np.zeros(max(int((sr * 0.05) / speed), 0), dtype=np.float32)
	        pieces = []
	        for segment in segments:
	            pieces.append(segment)
	            pieces.append(gap)
	        return np.concatenate(pieces)

	    @staticmethod
	    def split_sentences_into_pieces(text, language_str):
	        """Split ``text`` with ``utils.split_sentence`` and print the pieces.

	        Args:
	            text: Text to split.
	            language_str: Forwarded as ``language_str`` to ``utils.split_sentence``.

	        Returns:
	            The value returned by ``utils.split_sentence``.
	        """
	        texts = utils.split_sentence(text, language_str=language_str)
	        print(" > Text splitted to sentences.")
	        print('\n'.join(texts))
	        print(" > ===========================")
	        return texts

	    def tts(self, text, output_path, speaker, language='English', speed=1.0):
	        """Synthesize ``text`` and either return the waveform or write it to disk.

	        Args:
	            text: Text to speak.
	            output_path: Destination passed to ``soundfile.write``; ``None``
	                returns the waveform instead of writing it.
	            speaker: Key looked up in ``self.hps.speakers``.
	            language: Case-insensitive key of ``language_marks``.
	            speed: ``1.0 / speed`` is passed to the model as ``length_scale``
	                and the inter-sentence silence is divided by it.

	        Returns:
	            The ``float32`` numpy waveform when ``output_path`` is ``None``,
	            otherwise ``None`` after writing the file.

	        Raises:
	            AssertionError: If ``language`` is not a key of ``language_marks``.
	        """
	        mark = self.language_marks.get(language.lower(), None)
	        if mark is None:
	            # Raised explicitly so the check survives ``python -O`` while the
	            # exception type stays what callers already see.
	            raise AssertionError(f"language {language} is not supported")

	        texts = self.split_sentences_into_pieces(text, mark)

	        # Neither value changes between sentences, so look them up once.
	        device = self.device
	        speaker_id = self.hps.speakers[speaker]

	        audio_list = []
	        for t in texts:
	            # Put a space between a lowercase letter directly followed by an
	            # uppercase one.
	            t = re.sub(r'([a-z])([A-Z])', r'\1 \2', t)
	            t = f'[{mark}]{t}[{mark}]'
	            stn_tst = self.get_text(t, self.hps, False)
	            with torch.no_grad():
	                x_tst = stn_tst.unsqueeze(0).to(device)
	                x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
	                sid = torch.LongTensor([speaker_id]).to(device)
	                audio = self.model.infer(
	                    x_tst, x_tst_lengths, sid=sid, noise_scale=0.667, noise_scale_w=0.6,
	                    length_scale=1.0 / speed,
	                )[0][0, 0].data.cpu().float().numpy()
	            audio_list.append(audio)
	        audio = self.audio_numpy_concat(audio_list, sr=self.hps.data.sampling_rate, speed=speed)

	        if output_path is None:
	            return audio
	        soundfile.write(output_path, audio, self.hps.data.sampling_rate)


	class ToneColorConverter(OpenVoiceBaseClass):
	    """Voice conversion through ``self.model.voice_conversion`` with optional watermarking.

	    ``watermark_model`` is a ``wavmark`` model moved to ``self.device``, or
	    ``None`` when the converter was built with ``enable_watermark=False``.
	    """

	    # Chunk layout shared by add_watermark and detect_watermark: chunk ``n``
	    # covers samples [_WM_STRIDE * n * _WM_CHUNK, _WM_STRIDE * n * _WM_CHUNK + _WM_CHUNK).
	    _WM_CHUNK = 16000
	    _WM_STRIDE = 2
	    # Number of message bits add_watermark embeds per chunk.
	    _WM_BITS = 32

	    def __init__(self, *args, **kwargs):
	        """Build the base model and, unless disabled, load the watermark model.

	        Args:
	            *args, **kwargs: Forwarded to ``OpenVoiceBaseClass.__init__``.
	            enable_watermark: Keyword-only flag (default ``True``). It is
	                removed from ``kwargs`` before forwarding because the base
	                initializer does not accept it.
	        """
	        enable_watermark = kwargs.pop('enable_watermark', True)
	        super().__init__(*args, **kwargs)

	        if enable_watermark:
	            import wavmark
	            self.watermark_model = wavmark.load_model().to(self.device)
	        else:
	            self.watermark_model = None

	    def extract_se(self, ref_wav_list, se_save_path=None):
	        """Run ``self.model.ref_enc`` on each reference file and average the results.

	        Args:
	            ref_wav_list: One path or a list of paths loadable by ``librosa.load``.
	            se_save_path: Optional file path; when given the averaged tensor is
	                moved to the CPU and stored with ``torch.save``.

	        Returns:
	            The mean over all files of the ``ref_enc`` outputs (on ``self.device``).
	        """
	        if isinstance(ref_wav_list, str):
	            ref_wav_list = [ref_wav_list]

	        device = self.device
	        hps = self.hps
	        gs = []

	        for fname in ref_wav_list:
	            audio_ref, _ = librosa.load(fname, sr=hps.data.sampling_rate)
	            y = torch.FloatTensor(audio_ref).to(device).unsqueeze(0)
	            y = spectrogram_torch(y, hps.data.filter_length,
	                                  hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
	                                  center=False).to(device)
	            with torch.no_grad():
	                g = self.model.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
	            gs.append(g.detach())
	        gs = torch.stack(gs).mean(0)

	        if se_save_path is not None:
	            # A bare file name has an empty dirname, which os.makedirs rejects.
	            save_dir = os.path.dirname(se_save_path)
	            if save_dir:
	                os.makedirs(save_dir, exist_ok=True)
	            torch.save(gs.cpu(), se_save_path)

	        return gs

	    def _convert_waveform(self, audio, src_se, tgt_se, output_path, tau, message):
	        """Shared body of ``convert`` and ``convert_data`` for an in-memory waveform."""
	        hps = self.hps
	        with torch.no_grad():
	            y = torch.as_tensor(audio, dtype=torch.float32).to(self.device).unsqueeze(0)
	            spec = spectrogram_torch(y, hps.data.filter_length,
	                                     hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
	                                     center=False).to(self.device)
	            spec_lengths = torch.LongTensor([spec.size(-1)]).to(self.device)
	            converted = self.model.voice_conversion(
	                spec, spec_lengths, sid_src=src_se, sid_tgt=tgt_se, tau=tau,
	            )[0][0, 0].data.cpu().float().numpy()
	            converted = self.add_watermark(converted, message)

	        if output_path is None:
	            return converted
	        soundfile.write(output_path, converted, hps.data.sampling_rate)

	    def convert(self, audio_src_path, src_se, tgt_se, output_path=None, tau=0.3, message="default"):
	        """Convert the audio file at ``audio_src_path``.

	        Args:
	            audio_src_path: File loaded with ``librosa.load`` at
	                ``hps.data.sampling_rate``.
	            src_se: Passed to ``voice_conversion`` as ``sid_src``.
	            tgt_se: Passed to ``voice_conversion`` as ``sid_tgt``.
	            output_path: Destination for ``soundfile.write``; ``None`` returns
	                the waveform instead.
	            tau: Forwarded to ``voice_conversion``.
	            message: Text handed to ``add_watermark``.

	        Returns:
	            The converted numpy waveform when ``output_path`` is ``None``,
	            otherwise ``None`` after writing the file.
	        """
	        audio, _ = librosa.load(audio_src_path, sr=self.hps.data.sampling_rate)
	        return self._convert_waveform(audio, src_se, tgt_se, output_path, tau, message)

	    def convert_data(self, audio, sample_rate, src_se, tgt_se, output_path=None, tau=0.3, message="default"):
	        """Convert an already loaded waveform; same contract as ``convert``.

	        Args:
	            audio: Waveform samples (anything ``torch.as_tensor`` accepts).
	            sample_rate: Currently unused.
	            src_se, tgt_se, output_path, tau, message: See ``convert``.

	        Returns:
	            The converted numpy waveform when ``output_path`` is ``None``,
	            otherwise ``None`` after writing the file.
	        """
	        # NOTE(review): ``sample_rate`` is ignored - the spectrogram is computed
	        # with hps.data.sampling_rate, so ``audio`` presumably must already be
	        # at that rate. Confirm with callers before adding resampling.
	        return self._convert_waveform(audio, src_se, tgt_se, output_path, tau, message)

	    def add_watermark(self, audio, message):
	        """Embed ``message`` into ``audio`` in place, one chunk per 32 message bits.

	        Args:
	            audio: One-dimensional numpy waveform; modified in place.
	            message: Text converted to bits with ``utils.string_to_bits``.

	        Returns:
	            ``audio`` itself. It is returned untouched when no watermark model
	            is loaded; embedding stops at the first chunk the audio is too
	            short to fill.
	        """
	        if self.watermark_model is None:
	            return audio

	        device = self.device
	        bits = utils.string_to_bits(message).reshape(-1)
	        n_repeat = len(bits) // self._WM_BITS

	        for n in range(n_repeat):
	            start = self._WM_STRIDE * n * self._WM_CHUNK
	            stop = start + self._WM_CHUNK
	            chunk = audio[start:stop]
	            if len(chunk) != self._WM_CHUNK:
	                print('Audio too short, fail to add watermark')
	                break
	            message_npy = bits[n * self._WM_BITS: (n + 1) * self._WM_BITS]

	            with torch.no_grad():
	                signal = torch.FloatTensor(chunk).to(device)[None]
	                message_tensor = torch.FloatTensor(message_npy).to(device)[None]
	                signal_wmd_tensor = self.watermark_model.encode(signal, message_tensor)
	                signal_wmd_npy = signal_wmd_tensor.detach().cpu().squeeze().numpy()
	            audio[start:stop] = signal_wmd_npy
	        return audio

	    def detect_watermark(self, audio, n_repeat):
	        """Decode a watermark message from the first ``n_repeat`` chunks of ``audio``.

	        Requires ``self.watermark_model`` to be loaded.

	        Args:
	            audio: One-dimensional waveform.
	            n_repeat: Number of chunks to decode; must be at least 1 because
	                ``np.stack`` rejects an empty list.

	        Returns:
	            The string produced by ``utils.bits_to_string``, or the literal
	            ``'Fail'`` when the audio is too short for a requested chunk.
	        """
	        bits = []
	        for n in range(n_repeat):
	            start = self._WM_STRIDE * n * self._WM_CHUNK
	            chunk = audio[start:start + self._WM_CHUNK]
	            if len(chunk) != self._WM_CHUNK:
	                print('Audio too short, fail to detect watermark')
	                return 'Fail'
	            with torch.no_grad():
	                signal = torch.FloatTensor(chunk).to(self.device).unsqueeze(0)
	                # Threshold the decoder output at 0.5 to obtain 0/1 bits.
	                message_decoded_npy = (self.watermark_model.decode(signal) >= 0.5).int().detach().cpu().numpy().squeeze()
	            bits.append(message_decoded_npy)
	        bits = np.stack(bits).reshape(-1, 8)
	        message = utils.bits_to_string(bits)
	        return message