import os
import logging
import subprocess

import librosa
import soundfile
from flask import Flask, request, send_file, render_template

# Silence noisy third-party loggers.
logging.getLogger('numba').setLevel(logging.WARNING)
logging.getLogger('markdown_it').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)
logging.getLogger('matplotlib').setLevel(logging.WARNING)

template_dir = os.path.abspath('.')  # Path to the project root folder
app = Flask(__name__, template_folder=template_dir)

# Limit audio length when running in Hugging Face Spaces.
limitation = os.getenv("SYSTEM") == "spaces"


@app.route('/', methods=['GET'])
def index():
    speakers = ["chapaev", "petka", "anka", "narrator", "floppa"]
    return render_template('index.html', speakers=speakers, limitation=limitation)


@app.route('/inference', methods=['POST'])
def run_inference():
    # Validate the upload before touching it; request.files['audio'] would
    # raise a KeyError instead of returning a clean 400.
    input_audio = request.files.get('audio')
    if input_audio is None:
        return "You need to upload an audio file", 400
    speaker = request.form['speaker']

    audio, sampling_rate = librosa.load(input_audio, sr=None)
    duration = audio.shape[0] / sampling_rate
    if duration > 120 and limitation:
        return ("Please upload an audio file that is less than 120 seconds long. "
                "If you need to generate a longer audio file, please use Colab."), 400

    # Downmix to mono and resample to the 16 kHz rate the model expects.
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != 16000:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)

    # TODO: make these configurable from the GUI
    cluster_ratio = 1
    noise_scale = 2
    is_pitch_prediction_enabled = True
    f0_method = "dio"
    transpose = 0

    model_path = f"./models/{speaker}/{speaker}.pth"
    config_path = f"./models/{speaker}/config.json"
    cluster_path = ""

    raw_path = 'tmp.wav'
    soundfile.write(raw_path, audio, 16000, format="wav")

    # Build the `svc infer` command as an argument list so that paths
    # containing spaces survive intact (str.split() would break them).
    inference_cmd = [
        "svc", "infer", raw_path,
        "-m", model_path,
        "-c", config_path,
        "-t", str(transpose),
        "--f0-method", f0_method,
        "-n", str(noise_scale),
        "-o", "out.wav",
    ]
    if cluster_path != "" and cluster_ratio > 0:
        inference_cmd += ["-k", cluster_path, "-r", str(cluster_ratio)]
    if not is_pitch_prediction_enabled:
        inference_cmd.append("--no-auto-predict-f0")
    print(inference_cmd)

    result = subprocess.run(
        inference_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
    )
    # If inference failed, out.wav may be missing or stale; surface the
    # CLI output instead of letting send_file raise a 500.
    if result.returncode != 0:
        return f"Inference failed:\n{result.stdout}", 500

    return send_file('out.wav', mimetype='audio/wav')


if __name__ == '__main__':
    app.run(host="0.0.0.0", port=7860)
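# Example client call against the /inference endpoint (a minimal sketch,
# assuming the server is running locally on port 7860 and a hypothetical
# "input.wav" exists; the speaker must match a folder under ./models/):
#
#   import requests
#   with open("input.wav", "rb") as f:
#       resp = requests.post(
#           "http://localhost:7860/inference",
#           files={"audio": f},
#           data={"speaker": "chapaev"},
#       )
#   with open("converted.wav", "wb") as f:
#       f.write(resp.content)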