import time

import gradio as gr
import torch
import scipy.io.wavfile
from espnet2.bin.tts_inference import Text2Speech
from espnet2.utils.types import str_or_none

# Model and vocoder tags handed to Text2Speech.from_pretrained below.
tagen = 'espnet/english_male_ryanspeech_tacotron'
vocoder_tagen = "parallel_wavegan/ljspeech_melgan.v1.long"

# Loaded once at import time and reused by every request; runs on CPU.
text2speechen = Text2Speech.from_pretrained(
    model_tag=str_or_none(tagen),
    vocoder_tag=str_or_none(vocoder_tagen),
    device="cpu",
)


def inference(text: str, gender: str) -> str:
    """Synthesize *text* and return the path of the WAV file that was written.

    Args:
        text: The text to convert to speech.
        gender: The voice chosen in the UI. Only ``"male"`` has a model
            loaded in this script.

    Returns:
        ``"out.wav"``, the file the synthesized waveform was written to.

    Raises:
        ValueError: If *gender* is anything other than ``"male"``. The UI
            offers ``"female"`` as a choice, but no model is loaded for it;
            previously this path failed with an ``UnboundLocalError``.
    """
    if gender != "male":
        raise ValueError(
            f"The {gender!r} voice is not available yet; please select 'male'."
        )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        wav = text2speechen(text)["wav"]

    # Flatten the waveform tensor and write it at the model's sampling rate.
    scipy.io.wavfile.write(
        "out.wav", text2speechen.fs, wav.view(-1).cpu().numpy()
    )
    return "out.wav"


title = "RyanSpeech TTS"
description = "Gradio demo for RyanSpeech: First high quality speech dataset in the domain of conversation. (the female voice will be added in future).You get much better outputs when you use our pre-trained vocoder. To use it, simply input a text, or click one of the examples to load. Please cite our work"
# NOTE(review): this literal was split across physical lines (a syntax error)
# and looks like it once carried link markup around each label. The visible
# text is preserved here; confirm and restore the original links if needed.
article = "\nRyanSpeech-TTS | Website | Download Dataset | Github\n"

# Each example row is [input text, gender], matching the two inputs below.
examples = [
    ['When he reached the suburbs, the light of homes was shining through curtains of all colors', "male"],
    ['I am a fully autonomous social robot. I can talk, listen, express, understand, and remember. My programming lets me have a conversation with just about anyone.', "male"],
    ['When in the very midst of our victory, here comes an order to halt.', "male"],
]

gr.Interface(
    inference,
    [
        gr.inputs.Textbox(label="input text", lines=10),
        gr.inputs.Radio(
            choices=["male", "female"],
            type="value",
            default="male",
            label="Gender",
        ),
    ],
    gr.outputs.Audio(type="file", label="Output"),
    title=title,
    description=description,
    article=article,
    enable_queue=True,
    examples=examples,
).launch(debug=True)