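"""
Gradio app for generating talking portraits.

Pipeline: optionally generate a portrait with the SDXL-Lightning Space and a voice
with the Parler-TTS or WhisperSpeech Spaces, then animate the portrait with the
fudan-generative-ai/hallo audio-driven portrait animation model.
"""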
import argparse
import os
import shutil
import uuid

from huggingface_hub import snapshot_download
import gradio as gr
from gradio_client import Client, handle_file
from mutagen.mp3 import MP3
from pydub import AudioSegment
from PIL import Image

# Make sure relative paths (configs, pretrained_models, outputs) resolve from the app directory.
os.chdir(os.path.dirname(os.path.abspath(__file__)))

from scripts.inference import inference_process

# Detect whether we are running on the shared Space; inference is only enabled on duplicated instances.
is_shared_ui = "fudan-generative-ai/hallo" in os.environ.get("SPACE_ID", "")

if not is_shared_ui:
    hallo_dir = snapshot_download(repo_id="fudan-generative-ai/hallo", local_dir="pretrained_models")

def is_mp3(file_path):
    """Return True if the file can be parsed as an MP3."""
    try:
        MP3(file_path)
        return True
    except Exception:
        return False

def convert_mp3_to_wav(mp3_file_path, wav_file_path):
    # Load the MP3 file
    audio = AudioSegment.from_mp3(mp3_file_path)
    # Export as WAV file
    audio.export(wav_file_path, format="wav")
    return wav_file_path

def trim_audio(file_path, output_path, max_duration=4000):
    # Load the audio file
    audio = AudioSegment.from_wav(file_path)
    # Check the length of the audio in milliseconds
    audio_length = len(audio)
    # If the audio is longer than the maximum duration, trim it
    if audio_length > max_duration:
        trimmed_audio = audio[:max_duration]
    else:
        trimmed_audio = audio
    # Export the trimmed audio to a new file
    trimmed_audio.export(output_path, format="wav")
    return output_path

def add_silence_to_wav(wav_file_path, duration_s=1):
    # Load the WAV file
    audio = AudioSegment.from_wav(wav_file_path)
    # Create the requested amount of silence (AudioSegment.silent takes milliseconds)
    silence = AudioSegment.silent(duration=duration_s * 1000)
    # Add silence to the end of the audio file
    audio_with_silence = audio + silence
    # Export the modified audio, overwriting the input file
    audio_with_silence.export(wav_file_path, format="wav")
    return wav_file_path

def check_mp3(file_path):
    # Convert MP3 uploads to WAV; leave other formats untouched.
    if is_mp3(file_path):
        wav_file_path = os.path.splitext(file_path)[0] + ".wav"
        converted_audio = convert_mp3_to_wav(file_path, wav_file_path)
        print(f"File converted to {wav_file_path}")
        return converted_audio
    else:
        print("The file is not an MP3 file.")
        return file_path

def convert_webp_to_png(webp_file):
    # Open the WebP image
    webp_image = Image.open(webp_file)
    # Convert and save as PNG
    webp_image.save("png_converted_image.png", "PNG")
    return "png_converted_image.png"

def generate_portrait(prompt_image):
    if prompt_image is None or prompt_image == "":
        raise gr.Error("Can't generate a portrait without a prompt!")
    # Generate a portrait image via the SDXL-Lightning Space.
    client = Client("AP123/SDXL-Lightning")
    result = client.predict(
        prompt_image,
        "4-Step",
        api_name="/generate_image"
    )
    print(result)
    return result

def generate_voice(prompt_audio, voice_description):
    if prompt_audio is None or prompt_audio == "":
        raise gr.Error("Can't generate a voice without text to synthesize!")
    if voice_description is None or voice_description == "":
        gr.Info(
            "For better control, you may want to provide a voice character description next time.",
            duration=10,
            visible=True
        )
    # Synthesize speech via the Parler-TTS mini Space.
    client = Client("parler-tts/parler_tts_mini")
    result = client.predict(
        text=prompt_audio,
        description=voice_description,
        api_name="/gen_tts"
    )
    print(result)
    return result

def get_whisperspeech(prompt_audio_whisperspeech, audio_to_clone):
    # Clone the reference voice and synthesize the text via the WhisperSpeech Space.
    client = Client("collabora/WhisperSpeech")
    result = client.predict(
        multilingual_text=prompt_audio_whisperspeech,
        speaker_audio=handle_file(audio_to_clone),
        speaker_url="",
        cps=14,
        api_name="/whisper_speech_demo"
    )
    print(result)
    return result

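# run_hallo drives the Hallo inference pipeline directly: it packs the arguments that
# scripts/inference.py would otherwise receive on the command line into an
# argparse.Namespace and writes the result to a uniquely named MP4 file.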
def run_hallo(source_image, driving_audio, progress=gr.Progress(track_tqdm=True)):
    if is_shared_ui:
        raise gr.Error("This Space only works in duplicated instances")
    unique_id = uuid.uuid4()
    args = argparse.Namespace(
        config='configs/inference/default.yaml',
        source_image=source_image,
        driving_audio=driving_audio,
        output=f'output-{unique_id}.mp4',
        pose_weight=1.0,
        face_weight=1.0,
        lip_weight=1.0,
        face_expand_ratio=1.2,
        checkpoint=None
    )
    inference_process(args)
    return f'output-{unique_id}.mp4'

def generate_talking_portrait(portrait, voice):
    if portrait is None:
        raise gr.Error("Please provide a portrait to animate.")
    if voice is None:
        raise gr.Error("Please provide audio (4 seconds max).")
    # Trim the audio to 4 seconds and pad it with a second of trailing silence.
    input_file = voice
    trimmed_output_file = "trimmed_audio.wav"
    trimmed_output_file = trim_audio(input_file, trimmed_output_file)
    voice = trimmed_output_file
    ready_audio = add_silence_to_wav(voice)
    print(f"1 second of silence added to {voice}")
    # Call Hallo to animate the portrait with the prepared audio.
    talking_portrait_vid = run_hallo(portrait, ready_audio)
    return talking_portrait_vid

css = '''
#col-container {
    margin: 0 auto;
}
#main-group {
    background-color: transparent;
}
.tabs {
    background-color: unset;
}
#image-block {
    flex: 1;
}
#video-block {
    flex: 9;
}
#audio-block, #audio-clone-elm {
    flex: 1;
}
#text-synth, #voice-desc, #text-synth-wsp {
    height: 180px;
}
#audio-column, #result-column {
    display: flex;
}
#gen-voice-btn {
    flex: 1;
}
#parler-tab, #whisperspeech-tab {
    padding: 0;
}
#main-submit {
    flex: 1;
}
div#warning-ready {
    background-color: #ecfdf5;
    padding: 0 16px 16px;
    margin: 20px 0;
    color: #030303 !important;
}
div#warning-ready > .gr-prose > h2, div#warning-ready > .gr-prose > p {
    color: #057857 !important;
}
div#warning-duplicate {
    background-color: #ebf5ff;
    padding: 0 16px 16px;
    margin: 20px 0;
    color: #030303 !important;
}
div#warning-duplicate > .gr-prose > h2, div#warning-duplicate > .gr-prose > p {
    color: #0f4592 !important;
}
div#warning-duplicate strong {
    color: #0f4592;
}
p.actions {
    display: flex;
    align-items: center;
    margin: 20px 0;
}
div#warning-duplicate .actions a {
    display: inline-block;
    margin-right: 10px;
}
.dark #warning-duplicate {
    background-color: #0c0c0c !important;
    border: 1px solid white !important;
}
'''

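# UI layout: portrait image and text-to-image prompt on the left, the audio input with
# Parler TTS and WhisperSpeech tabs in the middle, and the resulting video on the right.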
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown("""
        # Parler X Hallo
        Generate talking portraits
        """)
        with gr.Group(elem_id="main-group"):
            with gr.Row():
                with gr.Column():
                    portrait = gr.Image(
                        sources=["upload"],
                        type="filepath",
                        format="png",
                        elem_id="image-block"
                    )
                    prompt_image = gr.Textbox(
                        label="Generate image",
                        lines=3
                    )
                    gen_image_btn = gr.Button("Generate portrait (optional)")
                with gr.Column(elem_id="audio-column"):
                    voice = gr.Audio(
                        type="filepath",
                        max_length=4,  # accept at most 4 seconds of audio (max_length is in seconds)
                        elem_id="audio-block"
                    )
with gr.Tab("Parler TTS", elem_id="parler-tab"): | |
prompt_audio = gr.Textbox( | |
label="Text to synthetize", | |
lines=4, | |
max_lines=4, | |
elem_id="text-synth" | |
) | |
voice_description = gr.Textbox( | |
label="Voice description", | |
lines=4, | |
max_lines=4, | |
elem_id="voice-desc" | |
) | |
gen_voice_btn = gr.Button("Generate voice (optional)") | |
with gr.Tab("WhisperSpeech", elem_id="whisperspeech-tab"): | |
prompt_audio_whisperspeech = gr.Textbox( | |
label="Text to synthetize", | |
lines=4, | |
max_lines=4, | |
elem_id="text-synth-wsp" | |
) | |
audio_to_clone = gr.Audio( | |
label="Voice to clone", | |
type="filepath", | |
elem_id="audio-clone-elm" | |
) | |
gen_wsp_voice_btn = gr.Button("Generate voice clone (optional)") | |
with gr.Column(elem_id="result-column"): | |
result = gr.Video( | |
elem_id="video-block" | |
) | |
submit_btn = gr.Button("Submit", elem_id="main-submit") | |
    voice.upload(
        fn=check_mp3,
        inputs=[voice],
        outputs=[voice],
        queue=False,
        show_api=False
    )

    gen_image_btn.click(
        fn=generate_portrait,
        inputs=[prompt_image],
        outputs=[portrait],
        queue=False,
        show_api=False
    )

    gen_voice_btn.click(
        fn=generate_voice,
        inputs=[prompt_audio, voice_description],
        outputs=[voice],
        queue=False,
        show_api=False
    )

    gen_wsp_voice_btn.click(
        fn=get_whisperspeech,
        inputs=[prompt_audio_whisperspeech, audio_to_clone],
        outputs=[voice],
        queue=False,
        show_api=False
    )

    submit_btn.click(
        fn=generate_talking_portrait,
        inputs=[portrait, voice],
        outputs=[result],
        show_api=False
    )

demo.queue(max_size=2).launch(show_error=True, show_api=False)