Spaces:
Running
on
A10G
Running
on
A10G
import os | |
import shutil | |
from huggingface_hub import snapshot_download | |
import gradio as gr | |
from gradio_client import Client, handle_file | |
from mutagen.mp3 import MP3 | |
from pydub import AudioSegment | |
from PIL import Image | |
import ffmpeg | |
os.chdir(os.path.dirname(os.path.abspath(__file__))) | |
from scripts.inference import inference_process | |
import argparse | |
import uuid | |
is_shared_ui = True if "fffiloni/tts-hallo-talking-portrait" in os.environ['SPACE_ID'] else False | |
hallo_dir = snapshot_download(repo_id="fudan-generative-ai/hallo", local_dir="pretrained_models") | |
AUDIO_MAX_DURATION = 4000 | |
############# | |
# UTILITIES # | |
############# | |
def is_mp3(file_path): | |
try: | |
audio = MP3(file_path) | |
return True | |
except Exception as e: | |
return False | |
def convert_mp3_to_wav(mp3_file_path, wav_file_path): | |
# Load the MP3 file | |
audio = AudioSegment.from_mp3(mp3_file_path) | |
# Export as WAV file | |
audio.export(wav_file_path, format="wav") | |
return wav_file_path | |
def trim_audio(file_path, output_path, max_duration): | |
# Load the audio file | |
audio = AudioSegment.from_wav(file_path) | |
# Check the length of the audio in milliseconds | |
audio_length = len(audio) | |
# If the audio is longer than the maximum duration, trim it | |
if audio_length > max_duration: | |
trimmed_audio = audio[:max_duration] | |
else: | |
trimmed_audio = audio | |
# Export the trimmed audio to a new file | |
trimmed_audio.export(output_path, format="wav") | |
return output_path | |
def add_silence_to_wav(wav_file_path, duration_s=1): | |
# Load the WAV file | |
audio = AudioSegment.from_wav(wav_file_path) | |
# Create 1 second of silence | |
silence = AudioSegment.silent(duration=duration_s * 1000) # duration is in milliseconds | |
# Add silence to the end of the audio file | |
audio_with_silence = audio + silence | |
# Export the modified audio | |
audio_with_silence.export(wav_file_path, format="wav") | |
return wav_file_path | |
def check_mp3(file_path): | |
if is_mp3(file_path): | |
unique_id = uuid.uuid4() | |
wav_file_path = f"{os.path.splitext(file_path)[0]}-{unique_id}.wav" | |
converted_audio = convert_mp3_to_wav(file_path, wav_file_path) | |
print(f"File converted to {wav_file_path}") | |
return converted_audio, gr.update(value=converted_audio, visible=True) | |
else: | |
print("The file is not an MP3 file.") | |
return file_path, gr.update(value=file_path, visible=True) | |
def check_and_convert_webp_to_png(input_path, output_path): | |
try: | |
# Open the image file | |
with Image.open(input_path) as img: | |
# Check if the image is in WebP format | |
if img.format == 'WEBP': | |
# Convert and save as PNG | |
img.save(output_path, 'PNG') | |
print(f"Converted {input_path} to {output_path}") | |
return output_path | |
else: | |
print(f"The file {input_path} is not in WebP format.") | |
return input_path | |
except IOError: | |
print(f"Cannot open {input_path}. The file might not exist or is not an image.") | |
def convert_user_uploded_webp(input_path): | |
# convert to png if necessary | |
input_file = input_path | |
unique_id = uuid.uuid4() | |
output_file = f"converted_to_png_portrait-{unique_id}.png" | |
ready_png = check_and_convert_webp_to_png(input_file, output_file) | |
print(f"PORTRAIT PNG FILE: {ready_png}") | |
return ready_png | |
def clear_audio_elms(): | |
return gr.update(value=None, visible=False) | |
def change_video_codec(input_file, output_file, codec='libx264', audio_codec='aac'): | |
try: | |
( | |
ffmpeg | |
.input(input_file) | |
.output(output_file, vcodec=codec, acodec=audio_codec) | |
.run(overwrite_output=True) | |
) | |
print(f'Successfully changed codec of {input_file} and saved as {output_file}') | |
except ffmpeg.Error as e: | |
print(f'Error occurred: {e.stderr.decode()}') | |
####################################################### | |
# Gradio APIs for optional image and voice generation # | |
####################################################### | |
def generate_portrait(prompt_image): | |
if prompt_image is None or prompt_image == "": | |
raise gr.Error("Can't generate a portrait without a prompt !") | |
try: | |
client = Client("ByteDance/SDXL-Lightning") | |
except: | |
raise gr.Error(f"ByteDance/SDXL-Lightning space's api might not be ready, please wait, or upload an image instead.") | |
result = client.predict( | |
prompt = prompt_image, | |
ckpt = "4-Step", | |
api_name = "/generate_image" | |
) | |
print(result) | |
# convert to png if necessary | |
input_file = result | |
unique_id = uuid.uuid4() | |
output_file = f"converted_to_png_portrait-{unique_id}.png" | |
ready_png = check_and_convert_webp_to_png(input_file, output_file) | |
print(f"PORTRAIT PNG FILE: {ready_png}") | |
return ready_png | |
def generate_voice_with_parler(prompt_audio, voice_description): | |
if prompt_audio is None or prompt_audio == "" : | |
raise gr.Error(f"Can't generate a voice without text to synthetize !") | |
if voice_description is None or voice_description == "": | |
gr.Info( | |
"For better control, You may want to provide a voice character description next time.", | |
duration = 10, | |
visible = True | |
) | |
try: | |
client = Client("parler-tts/parler_tts_mini") | |
except: | |
raise gr.Error(f"parler-tts/parler_tts_mini space's api might not be ready, please wait, or upload an audio instead.") | |
result = client.predict( | |
text = prompt_audio, | |
description = voice_description, | |
api_name = "/gen_tts" | |
) | |
print(result) | |
return result, gr.update(value=result, visible=True) | |
def get_whisperspeech(prompt_audio_whisperspeech, audio_to_clone): | |
try: | |
client = Client("collabora/WhisperSpeech") | |
except: | |
raise gr.Error(f"collabora/WhisperSpeech space's api might not be ready, please wait, or upload an audio instead.") | |
result = client.predict( | |
multilingual_text = prompt_audio_whisperspeech, | |
speaker_audio = handle_file(audio_to_clone), | |
speaker_url = "", | |
cps = 14, | |
api_name = "/whisper_speech_demo" | |
) | |
print(result) | |
return result, gr.update(value=result, visible=True) | |
def get_maskGCT_TTS(prompt_audio_maskGCT, audio_to_clone): | |
try: | |
client = Client("amphion/maskgct") | |
except: | |
raise gr.Error(f"amphion/maskgct space's api might not be ready, please wait, or upload an audio instead.") | |
result = client.predict( | |
prompt_wav = handle_file(audio_to_clone), | |
target_text = prompt_audio_maskGCT, | |
target_len=-1, | |
n_timesteps=25, | |
api_name="/predict" | |
) | |
print(result) | |
return result, gr.update(value=result, visible=True) | |
######################## | |
# TALKING PORTRAIT GEN # | |
######################## | |
def run_hallo(source_image, driving_audio, progress=gr.Progress(track_tqdm=True)): | |
unique_id = uuid.uuid4() | |
args = argparse.Namespace( | |
config = 'configs/inference/default.yaml', | |
source_image = source_image, | |
driving_audio = driving_audio, | |
output = f'output-{unique_id}.mp4', | |
pose_weight = 1.0, | |
face_weight = 1.0, | |
lip_weight = 1.0, | |
face_expand_ratio = 1.2, | |
checkpoint = None | |
) | |
inference_process(args) | |
return f'output-{unique_id}.mp4' | |
def generate_talking_portrait(portrait, voice, progress=gr.Progress(track_tqdm=True)): | |
if portrait is None: | |
raise gr.Error("Please provide a portrait to animate.") | |
if voice is None: | |
raise gr.Error("Please provide audio (4 seconds max).") | |
if is_shared_ui : | |
# Trim audio to AUDIO_MAX_DURATION for better shared experience with community | |
input_file = voice | |
unique_id = uuid.uuid4() | |
trimmed_output_file = f"-{unique_id}.wav" | |
trimmed_output_file = trim_audio(input_file, trimmed_output_file, AUDIO_MAX_DURATION) | |
voice = trimmed_output_file | |
# Add 1 second of silence at the end to avoid last word being cut by hallo | |
ready_audio = add_silence_to_wav(voice) | |
print(f"1 second of silence added to {voice}") | |
# Call hallo | |
talking_portrait_vid = run_hallo(portrait, ready_audio) | |
# Convert video to readable format | |
final_output_file = f"converted_{talking_portrait_vid}" | |
change_video_codec(talking_portrait_vid, final_output_file) | |
return final_output_file | |
css = ''' | |
#col-container { | |
margin: 0 auto; | |
} | |
#column-names { | |
margin-top: 50px; | |
} | |
#main-group { | |
background-color: none; | |
} | |
.tabs { | |
background-color: unset; | |
} | |
#image-block { | |
flex: 1; | |
} | |
#video-block { | |
flex: 9; | |
} | |
#audio-block, #audio-clone-elm, audio-clone-elm-maskGCT { | |
flex: 1; | |
} | |
div#audio-clone-elm > .audio-container > button { | |
height: 180px!important; | |
} | |
div#audio-clone-elm > .audio-container > button > .wrap { | |
font-size: 0.9em; | |
} | |
div#audio-clone-elm-maskGCT > .audio-container > button { | |
height: 180px!important; | |
} | |
div#audio-clone-elm-maskGCT > .audio-container > button > .wrap { | |
font-size: 0.9em; | |
} | |
#text-synth, #voice-desc{ | |
height: 130px; | |
} | |
#text-synth-wsp { | |
height: 120px; | |
} | |
#text-synth-maskGCT { | |
height: 120px; | |
} | |
#audio-column, #result-column { | |
display: flex; | |
} | |
#gen-voice-btn { | |
flex: 1; | |
} | |
#parler-tab, #whisperspeech-tab, #maskGCT-tab { | |
padding: 0; | |
} | |
#main-submit{ | |
flex: 1; | |
} | |
#pro-tips { | |
margin-top: 50px; | |
} | |
div#warning-ready { | |
background-color: #ecfdf5; | |
padding: 0 16px 16px; | |
margin: 20px 0; | |
color: #030303!important; | |
} | |
div#warning-ready > .gr-prose > h2, div#warning-ready > .gr-prose > p { | |
color: #057857!important; | |
} | |
div#warning-duplicate { | |
background-color: #ebf5ff; | |
padding: 0 16px 16px; | |
margin: 20px 0; | |
color: #030303!important; | |
} | |
div#warning-duplicate > .gr-prose > h2, div#warning-duplicate > .gr-prose > p { | |
color: #0f4592!important; | |
} | |
div#warning-duplicate strong { | |
color: #0f4592; | |
} | |
p.actions { | |
display: flex; | |
align-items: center; | |
margin: 20px 0; | |
} | |
div#warning-duplicate .actions a { | |
display: inline-block; | |
margin-right: 10px; | |
} | |
.dark #warning-duplicate { | |
background-color: #0c0c0c !important; | |
border: 1px solid white !important; | |
} | |
div#component-8 { | |
align-items: stretch; | |
} | |
''' | |
with gr.Blocks(css=css) as demo: | |
with gr.Column(elem_id="col-container"): | |
gr.Markdown(""" | |
# TTS x Hallo Talking Portrait Generator | |
This demo allows you to generate a talking portrait with the help of several open-source projects: SDXL Lightning | Parler TTS | WhisperSpeech | Hallo | |
To let the community try and enjoy this demo, video length is limited to 4 seconds audio maximum. | |
Duplicate this space to skip the queue and get unlimited video duration. 4-5 seconds of audio will take ~5 minutes per inference, please be patient. | |
""") | |
with gr.Row(elem_id="column-names"): | |
gr.Markdown("## 1. Load Portrait") | |
gr.Markdown("## 2. Load Voice") | |
gr.Markdown("## 3. Result") | |
with gr.Group(elem_id="main-group"): | |
with gr.Row(): | |
with gr.Column(): | |
portrait = gr.Image( | |
sources = ["upload"], | |
type = "filepath", | |
format = "png", | |
elem_id = "image-block" | |
) | |
prompt_image = gr.Textbox( | |
label = "Generate image", | |
lines = 2, | |
max_lines = 2 | |
) | |
gen_image_btn = gr.Button("Generate portrait (optional)") | |
with gr.Column(elem_id="audio-column"): | |
voice = gr.Audio( | |
type = "filepath", | |
elem_id = "audio-block" | |
) | |
preprocess_audio_file = gr.File(visible=False) | |
with gr.Tab("Parler TTS", elem_id="parler-tab"): | |
prompt_audio = gr.Textbox( | |
label = "Text to synthetize", | |
lines = 3, | |
max_lines = 3, | |
elem_id = "text-synth" | |
) | |
voice_description = gr.Textbox( | |
label = "Voice description", | |
lines = 3, | |
max_lines = 3, | |
elem_id = "voice-desc" | |
) | |
gen_voice_btn = gr.Button("Generate voice (optional)") | |
with gr.Tab("WhisperSpeech", elem_id="whisperspeech-tab"): | |
prompt_audio_whisperspeech = gr.Textbox( | |
label = "Text to synthetize", | |
lines = 2, | |
max_lines = 2, | |
elem_id = "text-synth-wsp" | |
) | |
audio_to_clone = gr.Audio( | |
label = "Voice to clone", | |
type = "filepath", | |
elem_id = "audio-clone-elm" | |
) | |
gen_wsp_voice_btn = gr.Button("Generate voice clone (optional)") | |
with gr.Tab("MaskGCT TTS", elem_id="maskGCT-tab"): | |
prompt_audio_maskGCT = gr.Textbox( | |
label = "Text to synthetize", | |
lines = 2, | |
max_lines = 2, | |
elem_id = "text-synth-maskGCT" | |
) | |
audio_to_clone_maskGCT = gr.Audio( | |
label = "Voice to clone", | |
type = "filepath", | |
elem_id = "audio-clone-elm-maskGCT" | |
) | |
gen_maskGCT_voice_btn = gr.Button("Generate voice clone (optional)") | |
with gr.Column(elem_id="result-column"): | |
result = gr.Video( | |
elem_id="video-block" | |
) | |
submit_btn = gr.Button("Go talking Portrait !", elem_id="main-submit") | |
with gr.Row(elem_id="pro-tips"): | |
gr.Markdown(""" | |
# Hallo Pro Tips: | |
Hallo has a few simple requirements for input data: | |
For the source image: | |
1. It should be cropped into squares. | |
2. The face should be the main focus, making up 50%-70% of the image. | |
3. The face should be facing forward, with a rotation angle of less than 30° (no side profiles). | |
For the driving audio: | |
1. It must be in WAV format. | |
2. It must be in English since our training datasets are only in this language. | |
3. Ensure the vocals are clear; background music is acceptable. | |
""") | |
gr.Markdown(""" | |
# TTS Pro Tips: | |
For Parler TTS: | |
- Include the term "very clear audio" to generate the highest quality audio, and "very noisy audio" for high levels of background noise | |
- Punctuation can be used to control the prosody of the generations, e.g. use commas to add small breaks in speech | |
- The remaining speech features (gender, speaking rate, pitch and reverberation) can be controlled directly through the prompt | |
For WhisperSpeech: | |
WhisperSpeech is able to quickly clone a voice from an audio sample. | |
- Upload a voice sample in the WhisperSpeech tab | |
- Add text to synthetize, hit Generate voice clone button | |
""") | |
portrait.upload( | |
fn = convert_user_uploded_webp, | |
inputs = [portrait], | |
outputs = [portrait], | |
queue = False, | |
show_api = False | |
) | |
voice.upload( | |
fn = check_mp3, | |
inputs = [voice], | |
outputs = [voice, preprocess_audio_file], | |
queue = False, | |
show_api = False | |
) | |
voice.clear( | |
fn = clear_audio_elms, | |
inputs = None, | |
outputs = [preprocess_audio_file], | |
queue = False, | |
show_api = False | |
) | |
gen_image_btn.click( | |
fn = generate_portrait, | |
inputs = [prompt_image], | |
outputs = [portrait], | |
queue = False, | |
show_api = False | |
) | |
gen_voice_btn.click( | |
fn = generate_voice_with_parler, | |
inputs = [prompt_audio, voice_description], | |
outputs = [voice, preprocess_audio_file], | |
queue = False, | |
show_api = False | |
) | |
gen_wsp_voice_btn.click( | |
fn = get_whisperspeech, | |
inputs = [prompt_audio_whisperspeech, audio_to_clone], | |
outputs = [voice, preprocess_audio_file], | |
queue = False, | |
show_api = False | |
) | |
gen_maskGCT_voice_btn.click( | |
fn = get_maskGCT_TTS, | |
inputs = [prompt_audio_maskGCT, audio_to_clone_maskGCT], | |
outputs = [voice, preprocess_audio_file], | |
queue = False, | |
show_api = False | |
) | |
submit_btn.click( | |
fn = generate_talking_portrait, | |
inputs = [portrait, voice], | |
outputs = [result], | |
show_api = False | |
) | |
demo.queue(max_size=2).launch(show_error=True, show_api=False) |