import gradio as gr
import numpy as np
from audioldm import text_to_audio, build_model
from share_btn import community_icon_html, loading_icon_html, share_js
model_id = "haoheliu/AudioLDM-S-Full"  # AudioLDM checkpoint on the Hugging Face Hub (not referenced below)
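
# Module-level cache: the AudioLDM model is heavy, so it is built lazily on
# the first request, kept alive across calls, and rebuilt only when a
# different checkpoint is requested.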
audioldm = None
current_model_name = None
def text2audio(text, duration, guidance_scale, random_seed, n_candidates, model_name="audioldm-m-text-ft"):
    """Generate audio from a text prompt with AudioLDM and render it as a waveform video."""
    global audioldm, current_model_name
    if audioldm is None or model_name != current_model_name:
        audioldm = build_model(model_name=model_name)
        current_model_name = model_name
    waveform = text_to_audio(
        latent_diffusion=audioldm,
        text=text,
        seed=random_seed,
        duration=duration,
        guidance_scale=guidance_scale,
        n_candidate_gen_per_text=int(n_candidates),
    )  # [bs, 1, samples]
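    # gr.make_waveform turns each 16 kHz waveform into a short video with an
    # animated waveform visualization (over bg.png) and returns the video's
    # filepath.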
    waveform = [
        gr.make_waveform((16000, wave[0]), bg_image="bg.png") for wave in waveform
    ]
    # Debug stub: bypass the model with random noise while testing the UI.
    # waveform = [(16000, np.random.randn(16000)), (16000, np.random.randn(16000))]
    # The interface exposes a single output component, so unwrap singleton lists.
    if len(waveform) == 1:
        waveform = waveform[0]
    return waveform
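
# A minimal sketch of invoking text2audio directly, without the Gradio UI.
# The prompt and parameter values are illustrative only; build_model()
# downloads the checkpoint on first use, so the initial call is slow.
#
#   video_path = text2audio(
#       text="A hammer is hitting a wooden surface",
#       duration=5,
#       guidance_scale=2.5,
#       random_seed=42,
#       n_candidates=3,
#   )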
iface = gr.Interface(
    fn=text2audio,
    inputs=[
        gr.Textbox(value="A man is speaking in a huge room", max_lines=1, label="Text prompt"),
        gr.Slider(2.5, 10, value=5, step=2.5, label="Duration (seconds)"),
        gr.Slider(0, 5, value=2.5, step=0.5, label="Guidance scale"),
        gr.Number(value=42, label="Random seed"),
        gr.Number(value=3, label="Number of candidates"),
    ],
    # text2audio returns filepaths from gr.make_waveform, which are videos,
    # so use a video output component rather than "audio".
    outputs="video",
    allow_flagging="never",
)
iface.launch(share=False)
# Alternatives: queue long-running generations, or expose a public link.
# iface.queue(max_size=10).launch(debug=True)
# iface.launch(debug=True, share=True)