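"""RVC (Retrieval-based Voice Conversion) tab for the web UI.

Handles audio input, optional denoising and background separation, and
inference through the bundled rvc implementation.
"""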
import os
import torch
import torchaudio
import gradio
from webui.modules import util
from webui.modules.download import fill_models
flag_strings = ['denoise', 'denoise output', 'separate background']
def flatten_audio(audio_tensor: torch.Tensor | tuple[torch.Tensor, int] | tuple[int, torch.Tensor], add_batch=True):
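    """Normalize audio to a mono float tensor.

    Accepts a raw tensor or an (sr, tensor) / (tensor, sr) tuple and normalizes
    the tensor part: integer PCM is scaled to [-1, 1], stereo is downmixed to
    mono, and the result is flattened to 1-D (optionally with a leading batch
    dimension).
    """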
if isinstance(audio_tensor, tuple):
if isinstance(audio_tensor[0], int):
return audio_tensor[0], flatten_audio(audio_tensor[1])
elif torch.is_tensor(audio_tensor[0]):
return flatten_audio(audio_tensor[0]), audio_tensor[1]
if audio_tensor.dtype == torch.int16:
audio_tensor = audio_tensor.float() / 32767.0
if audio_tensor.dtype == torch.int32:
audio_tensor = audio_tensor.float() / 2147483647.0
if len(audio_tensor.shape) == 2:
if audio_tensor.shape[0] == 2:
# audio_tensor = audio_tensor[0, :].div(2).add(audio_tensor[1, :].div(2))
audio_tensor = audio_tensor.mean(0)
elif audio_tensor.shape[1] == 2:
# audio_tensor = audio_tensor[:, 0].div(2).add(audio_tensor[:, 1].div(2))
audio_tensor = audio_tensor.mean(1)
audio_tensor = audio_tensor.flatten()
if add_batch:
audio_tensor = audio_tensor.unsqueeze(0)
return audio_tensor
def merge_and_match(x, y, sr):
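    """Mix two mono signals at half gain each.

    y is resampled so its length roughly matches x, both are trimmed to the
    shorter length, and the sum is returned. Used to recombine converted vocals
    with the separated background track.
    """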
x = x / 2
y = y / 2
import torchaudio.functional as F
y = F.resample(y, sr, int(sr * (x.shape[-1] / y.shape[-1])))
if x.shape[0] > y.shape[0]:
x = x[-y.shape[0]:]
else:
y = y[-x.shape[0]:]
return x.add(y)
def get_models_installed():
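    """Return gradio updates: refreshed RVC model choices and an unchanged speaker slider."""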
return [gradio.update(choices=fill_models('rvc')), gradio.update()]
def unload_rvc():
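    """Unload the active RVC model and reset the model dropdown and speaker id slider."""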
import webui.modules.implementations.rvc.rvc as rvc
rvc.unload_rvc()
return [gradio.update(value=''), gradio.update(maximum=0, value=0, visible=False)]
def load_rvc(model):
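    """Load the selected RVC model; the speaker id slider is shown only for multi-speaker models."""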
if not model:
return unload_rvc()
import webui.modules.implementations.rvc.rvc as rvc
maximum = rvc.load_rvc(model)
return [gradio.update(), gradio.update(maximum=maximum, value=0, visible=maximum > 0)]
def denoise(sr, audio):
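    """Reduce noise with noisereduce's spectral gating and return (sr, tensor)."""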
if not torch.is_tensor(audio):
audio = torch.tensor(audio)
if len(audio.shape) == 1:
audio = audio.unsqueeze(0)
audio = audio.detach().cpu().numpy()
import noisereduce.noisereduce as noisereduce
audio = torch.tensor(noisereduce.reduce_noise(y=audio, sr=sr))
return sr, audio
def gen(rvc_model_selected, speaker_id, pitch_extract, audio_in, up_key, index_rate, filter_radius, protect, crepe_hop_length, flag):
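    """Run the RVC conversion pipeline.

    Optionally separates the background from the vocals and denoises the input,
    writes the processed input to speakeraudio.wav, converts it with
    rvc.vc_single, then optionally remixes the background and denoises the
    output. Returns the output audio, a video rendered with util.make_waveform,
    and the separated background and vocal tracks (when available).
    """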
background = None
audio = None
sr, audio_in = audio_in
audio_tuple = (sr, torch.tensor(audio_in))
audio_tuple = flatten_audio(audio_tuple)
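    # Optionally split the input into vocals and background before conversion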
if 'separate background' in flag:
if not torch.is_tensor(audio_tuple[1]):
audio_tuple = (audio_tuple[0], torch.tensor(audio_tuple[1]).to(torch.float32))
if len(audio_tuple[1].shape) != 1:
audio_tuple = (audio_tuple[0], audio_tuple[1].flatten())
import webui.modules.implementations.rvc.split_audio as split_audio
foreground, background, sr = split_audio.split(*audio_tuple)
audio_tuple = flatten_audio((sr, foreground))
background = flatten_audio(background)
if 'denoise' in flag:
audio_tuple = denoise(*audio_tuple)
if rvc_model_selected:
print('Selected model', rvc_model_selected)
if len(audio_tuple[1].shape) == 1:
audio_tuple = (audio_tuple[0], audio_tuple[1].unsqueeze(0))
torchaudio.save('speakeraudio.wav', audio_tuple[1], audio_tuple[0])
import webui.modules.implementations.rvc.rvc as rvc
rvc.load_rvc(rvc_model_selected)
index_file = ''
try:
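            # Prefer an index file containing "added" (the trained feature index), else fall back to the first .index file found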
model_basedir = os.path.join('data', 'models', 'rvc', os.path.dirname(rvc_model_selected))
index_files = [f for f in os.listdir(model_basedir) if f.endswith('.index')]
if len(index_files) > 0:
for f in index_files:
full_path = os.path.join(model_basedir, f)
if 'added' in f:
index_file = full_path
if not index_file:
index_file = os.path.join(model_basedir, index_files[0])
        except OSError:
            # Model folder missing or unreadable: run without feature retrieval.
            pass
out1, out2 = rvc.vc_single(speaker_id, 'speakeraudio.wav', up_key, None, pitch_extract, index_file, '', index_rate, filter_radius, 0, 1, protect, crepe_hop_length)
print(out1)
audio_tuple = out2
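    # Remix the separated background with the converted vocals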
if background is not None and 'separate background' in flag:
audio = audio_tuple[1] if torch.is_tensor(audio_tuple[1]) else torch.tensor(audio_tuple[1])
audio_tuple = (audio_tuple[0], flatten_audio(audio, False))
background = flatten_audio(background if torch.is_tensor(background) else torch.tensor(background), False)
if audio_tuple[1].dtype == torch.int16:
audio = audio_tuple[1]
audio = audio.float() / 32767.0
audio_tuple = (audio_tuple[0], audio)
audio = audio_tuple[1]
audio_tuple = (audio_tuple[0], merge_and_match(audio_tuple[1], background, audio_tuple[0]))
if 'denoise output' in flag:
audio_tuple = denoise(*audio_tuple)
if torch.is_tensor(audio_tuple[1]):
audio_tuple = (audio_tuple[0], audio_tuple[1].flatten().detach().cpu().numpy())
sr = audio_tuple[0]
audio = (sr, audio.detach().cpu().numpy()) if audio is not None else None
background = (sr, background.detach().cpu().numpy()) if background is not None else None
return [audio_tuple, util.make_waveform(audio_tuple), background, audio]
def rvc():
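    """Build the RVC tab: audio input, model selection, inference settings, and outputs wired to gen()."""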
with gradio.Row():
with gradio.Column():
use_microphone = gradio.Checkbox(label='Use microphone')
audio_el = gradio.Audio(label='Audio input')
from webui.ui.tabs.text_to_speech import to_rvc, audio_out
from webui.ui.ui import tabs_el
def to_rvc_func(audio):
return gradio.update(selected='πŸ—£β–ΆπŸ—£ RVC'), audio
to_rvc.click(fn=to_rvc_func, inputs=audio_out, outputs=[tabs_el, audio_el])
def update_audio_input(use_mic):
return gradio.update(source='microphone' if use_mic else 'upload')
use_microphone.change(fn=update_audio_input, inputs=use_microphone, outputs=audio_el)
with gradio.Accordion('πŸ—£ RVC'):
with gradio.Row():
selected = gradio.Dropdown(get_models_installed()[0]['choices'], label='RVC Model')
with gradio.Column(elem_classes='smallsplit'):
refresh = gradio.Button('πŸ”ƒ', variant='tool secondary')
unload = gradio.Button('πŸ’£', variant='tool primary')
speaker_id = gradio.Slider(value=0, step=1, maximum=0, visible=False, label='Speaker id', info='For multi speaker models, the speaker to use.')
                pitch_extract = gradio.CheckboxGroup(choices=["dio", "pm", "harvest", "torchcrepe", "torchcrepe tiny", "mangio-crepe", "mangio-crepe tiny", "rmvpe"], label='Pitch extraction', value=['harvest'], interactive=True, info='Default: harvest. dio and pm are faster, harvest is slower but sounds better. Crepe is good but uses the GPU.')
crepe_hop_length = gradio.Slider(visible=False, minimum=64, maximum=512, step=64, value=128, label='torchcrepe hop length', info='The length of the hops used for torchcrepe\'s crepe implementation')
                def update_crepe_hop_length_visible(pitch_mode: list[str]):
                    return gradio.update(visible=any('crepe' in v for v in pitch_mode))
pitch_extract.change(fn=update_crepe_hop_length_visible, inputs=pitch_extract, outputs=crepe_hop_length)
refresh.click(fn=get_models_installed, outputs=[selected, speaker_id], show_progress=True)
unload.click(fn=unload_rvc, outputs=[selected, speaker_id], show_progress=True)
selected.select(fn=load_rvc, inputs=selected, outputs=[selected, speaker_id], show_progress=True)
                index_rate = gradio.Slider(0, 1, 0.88, step=0.01, label='Index rate for feature retrieval', info='Default: 0.88. Higher relies more on the feature index; slower, but can improve quality.')
                filter_radius = gradio.Slider(0, 7, 3, step=1, label='Filter radius', info='Default: 3. Smooths the extracted pitch; should yield fewer voice cracks.')
up_key = gradio.Number(value=0, label='Pitch offset', info='Default: 0. Shift the pitch up or down')
                protect = gradio.Slider(0, 0.5, 0.33, step=0.01, label='Protect amount', info='Default: 0.33. Protects voiceless consonants and breaths from artifacts; lower values protect more.')
flags = gradio.Dropdown(flag_strings, label='Flags', info='Things to apply on the audio input/output', multiselect=True)
with gradio.Column():
with gradio.Row():
generate = gradio.Button('Generate', variant='primary', elem_id='rvc-generate')
with gradio.Row():
audio_out = gradio.Audio(label='output audio', interactive=False)
with gradio.Row():
video_out = gradio.Video(label='output spectrogram video', interactive=False)
with gradio.Row():
audio_bg = gradio.Audio(label='background', interactive=False)
with gradio.Row():
audio_vocal = gradio.Audio(label='vocals', interactive=False)
generate.click(fn=gen, inputs=[selected, speaker_id, pitch_extract, audio_el,
up_key, index_rate, filter_radius, protect, crepe_hop_length, flags], outputs=[audio_out, video_out, audio_bg, audio_vocal])