"""Gradio app that animates a portrait with a short voice clip using Hallo.

The UI lets the user upload (or generate) a portrait and a voice sample,
trims the audio to a few seconds, pads it with silence and runs the Hallo
inference pipeline to produce a talking-portrait video.
"""
import os
import shutil
from huggingface_hub import snapshot_download
import gradio as gr
from gradio_client import Client, handle_file
from mutagen.mp3 import MP3
from pydub import AudioSegment
from PIL import Image

# Work from this file's directory so the relative paths used below
# ("pretrained_models", "configs/inference/default.yaml", generated outputs)
# resolve next to the script. The project import is kept after the chdir,
# as in the original file.
os.chdir(os.path.dirname(os.path.abspath(__file__)))

from scripts.inference import inference_process
import argparse
import uuid

# SPACE_ID may be absent (e.g. when running outside a Space); treat a missing
# value as "not the shared UI" instead of crashing with KeyError at import.
is_shared_ui = "fudan-generative-ai/hallo" in os.environ.get("SPACE_ID", "")

if not is_shared_ui:
    hallo_dir = snapshot_download(
        repo_id="fudan-generative-ai/hallo",
        local_dir="pretrained_models",
    )


def is_mp3(file_path):
    """Return True if mutagen can parse ``file_path`` as an MP3, else False."""
    try:
        MP3(file_path)
    except Exception:
        # Best-effort probe: any parsing/IO failure means "not an MP3".
        return False
    return True


def convert_mp3_to_wav(mp3_file_path, wav_file_path):
    """Decode ``mp3_file_path`` and write it as WAV to ``wav_file_path``.

    Returns:
        ``wav_file_path``.
    """
    audio = AudioSegment.from_mp3(mp3_file_path)
    audio.export(wav_file_path, format="wav")
    return wav_file_path


def trim_audio(file_path, output_path, max_duration=4000):
    """Write a copy of a WAV file limited to ``max_duration`` milliseconds.

    Args:
        file_path: Path of the WAV file to read.
        output_path: Path the (possibly trimmed) WAV is written to.
        max_duration: Maximum length to keep, in milliseconds.

    Returns:
        ``output_path``.
    """
    audio = AudioSegment.from_wav(file_path)

    # len() of an AudioSegment is its duration in milliseconds.
    if len(audio) > max_duration:
        trimmed_audio = audio[:max_duration]
    else:
        trimmed_audio = audio

    trimmed_audio.export(output_path, format="wav")
    return output_path


def add_silence_to_wav(wav_file_path, duration_s=1):
    """Append ``duration_s`` seconds of silence to a WAV file, in place.

    Returns:
        ``wav_file_path`` (the file is overwritten with the padded audio).
    """
    audio = AudioSegment.from_wav(wav_file_path)

    # AudioSegment.silent takes its duration in milliseconds.
    silence = AudioSegment.silent(duration=duration_s * 1000)

    audio_with_silence = audio + silence
    audio_with_silence.export(wav_file_path, format="wav")
    return wav_file_path


def check_mp3(file_path):
    """Convert an uploaded MP3 to WAV; pass any other file through unchanged.

    Returns:
        Path of the converted ``.wav`` file (same base name as the input) when
        the input is an MP3, otherwise ``file_path`` itself.
    """
    if is_mp3(file_path):
        wav_file_path = os.path.splitext(file_path)[0] + '.wav'
        converted_audio = convert_mp3_to_wav(file_path, wav_file_path)
        print(f"File converted to {wav_file_path}")
        return converted_audio

    print("The file is not an MP3 file.")
    return file_path


def convert_webp_to_png(webp_file):
    """Save ``webp_file`` as ``png_converted_image.png`` and return that path."""
    png_path = "png_converted_image.png"
    # Use a context manager so the source image's file handle is released.
    with Image.open(webp_file) as webp_image:
        webp_image.save(png_path, "PNG")
    return png_path


def generate_portrait(prompt_image):
    """Generate a portrait from a text prompt via the AP123/SDXL-Lightning Space.

    Raises:
        gr.Error: If the prompt is missing or empty.

    Returns:
        Whatever the remote ``/generate_image`` endpoint returns.
    """
    if not prompt_image:
        raise gr.Error("Can't generate a portrait without a prompt !")

    client = Client("AP123/SDXL-Lightning")
    result = client.predict(
        prompt_image,
        "4-Step",
        api_name="/generate_image"
    )
    print(result)
    return result


def generate_voice(prompt_audio, voice_description):
    """Synthesize speech for ``prompt_audio`` via the parler-tts Space.

    A missing voice description is allowed, but the user is shown a hint.

    Raises:
        gr.Error: If the text to synthesize is missing or empty.

    Returns:
        Whatever the remote ``/gen_tts`` endpoint returns.
    """
    if not prompt_audio:
        raise gr.Error("Can't generate a voice without text to synthetize !")

    if not voice_description:
        gr.Info(
            "For better control, You may want to provide a voice character description next time.",
            duration=10,
            visible=True
        )

    client = Client("parler-tts/parler_tts_mini")
    result = client.predict(
        text=prompt_audio,
        description=voice_description,
        api_name="/gen_tts"
    )
    print(result)
    return result


def get_whisperspeech(prompt_audio_whisperspeech, audio_to_clone):
    """Synthesize speech in a cloned voice via the collabora/WhisperSpeech Space.

    Args:
        prompt_audio_whisperspeech: Text to speak.
        audio_to_clone: Path of the reference audio whose voice is cloned.

    Raises:
        gr.Error: If no reference audio was provided.

    Returns:
        Whatever the remote ``/whisper_speech_demo`` endpoint returns.
    """
    if not audio_to_clone:
        # Fail with a readable message instead of handing a missing path to
        # handle_file().
        raise gr.Error("Please provide a voice sample to clone.")

    client = Client("collabora/WhisperSpeech")
    result = client.predict(
        multilingual_text=prompt_audio_whisperspeech,
        speaker_audio=handle_file(audio_to_clone),
        speaker_url="",
        cps=14,
        api_name="/whisper_speech_demo"
    )
    print(result)
    return result


def run_hallo(source_image, driving_audio, progress=gr.Progress(track_tqdm=True)):
    """Run Hallo inference and return the path of the generated video.

    Args:
        source_image: Path of the portrait image.
        driving_audio: Path of the audio driving the animation.
        progress: Gradio progress tracker (not used directly here; presumably
            lets Gradio surface tqdm progress from the pipeline in the UI).

    Raises:
        gr.Error: When running in the shared Space, where inference is disabled.

    Returns:
        Path of the output ``.mp4`` file, unique per call.
    """
    if is_shared_ui:
        raise gr.Error("This Space only works in duplicated instances")

    # A fresh id per call keeps results of different requests apart.
    output_path = f'output-{uuid.uuid4()}.mp4'

    args = argparse.Namespace(
        config='configs/inference/default.yaml',
        source_image=source_image,
        driving_audio=driving_audio,
        output=output_path,
        pose_weight=1.0,
        face_weight=1.0,
        lip_weight=1.0,
        face_expand_ratio=1.2,
        checkpoint=None
    )

    inference_process(args)
    return output_path


def generate_talking_portrait(portrait, voice):
    """Validate inputs, prepare the audio and produce the talking-portrait video.

    The voice clip is trimmed (default limit of ``trim_audio``), padded with
    one second of silence, then passed with the portrait to ``run_hallo``.

    Raises:
        gr.Error: If the portrait or the voice is missing.

    Returns:
        Path of the generated video.
    """
    if portrait is None:
        raise gr.Error("Please provide a portrait to animate.")
    if voice is None:
        raise gr.Error("Please provide audio (4 seconds max).")

    # trim audio
    trimmed_output_file = trim_audio(voice, "trimmed_audio.wav")
    voice = trimmed_output_file

    ready_audio = add_silence_to_wav(voice)
    print(f"1 second of silence added to {voice}")

    # call hallo
    talking_portrait_vid = run_hallo(portrait, ready_audio)
    return talking_portrait_vid


css = '''
#col-container {
    margin: 0 auto;
}
#main-group {
    background-color: none;
}
.tabs {
    background-color: unset;
}
#image-block {
    flex: 1;
}
#video-block {
    flex: 9;
}
#audio-block, #audio-clone-elm {
    flex: 1;
}
#text-synth, #voice-desc, #text-synth-wsp{
    height: 180px;
}
#audio-column, #result-column {
    display: flex;
}
#gen-voice-btn {
    flex: 1;
}
#parler-tab, #whisperspeech-tab {
    padding: 0;
}
#main-submit{
    flex: 1;
}
div#warning-ready {
    background-color: #ecfdf5;
    padding: 0 16px 16px;
    margin: 20px 0;
    color: #030303!important;
}
div#warning-ready > .gr-prose > h2, div#warning-ready > .gr-prose > p {
    color: #057857!important;
}
div#warning-duplicate {
    background-color: #ebf5ff;
    padding: 0 16px 16px;
    margin: 20px 0;
    color: #030303!important;
}
div#warning-duplicate > .gr-prose > h2, div#warning-duplicate > .gr-prose > p {
    color: #0f4592!important;
}
div#warning-duplicate strong {
    color: #0f4592;
}
p.actions {
    display: flex;
    align-items: center;
    margin: 20px 0;
}
div#warning-duplicate .actions a {
    display: inline-block;
    margin-right: 10px;
}
.dark #warning-duplicate {
    background-color: #0c0c0c !important;
    border: 1px solid white !important;
}
'''

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown("""
        # Parler X Hallo
        Generate talking portraits
        """)
        with gr.Group(elem_id="main-group"):
            with gr.Row():
                # Left column: portrait upload / generation.
                with gr.Column():
                    portrait = gr.Image(
                        sources=["upload"],
                        type="filepath",
                        format="png",
                        elem_id="image-block"
                    )
                    prompt_image = gr.Textbox(
                        label="Generate image",
                        lines=3
                    )
                    gen_image_btn = gr.Button("Generate portrait (optional)")

                # Middle column: voice upload / generation.
                with gr.Column(elem_id="audio-column"):
                    # NOTE(review): Gradio documents Audio.max_length in
                    # seconds, so 4000 is likely not the intended 4-second
                    # cap; the effective limit is enforced by trim_audio().
                    # Confirm before changing, since a stricter value would
                    # reject longer clips instead of trimming them.
                    voice = gr.Audio(
                        type="filepath",
                        max_length=4000,
                        elem_id="audio-block"
                    )
                    with gr.Tab("Parler TTS", elem_id="parler-tab"):
                        prompt_audio = gr.Textbox(
                            label="Text to synthetize",
                            lines=4,
                            max_lines=4,
                            elem_id="text-synth"
                        )
                        voice_description = gr.Textbox(
                            label="Voice description",
                            lines=4,
                            max_lines=4,
                            elem_id="voice-desc"
                        )
                        gen_voice_btn = gr.Button("Generate voice (optional)")
                    with gr.Tab("WhisperSpeech", elem_id="whisperspeech-tab"):
                        prompt_audio_whisperspeech = gr.Textbox(
                            label="Text to synthetize",
                            lines=4,
                            max_lines=4,
                            elem_id="text-synth-wsp"
                        )
                        audio_to_clone = gr.Audio(
                            label="Voice to clone",
                            type="filepath",
                            elem_id="audio-clone-elm"
                        )
                        gen_wsp_voice_btn = gr.Button("Generate voice clone (optional)")

                # Right column: result video and main action.
                with gr.Column(elem_id="result-column"):
                    result = gr.Video(
                        elem_id="video-block"
                    )
                    submit_btn = gr.Button("Submit", elem_id="main-submit")

    # Uploaded MP3 files are converted to WAV and written back to the player.
    voice.upload(
        fn=check_mp3,
        inputs=[voice],
        outputs=[voice],
        queue=False,
        show_api=False
    )

    gen_image_btn.click(
        fn=generate_portrait,
        inputs=[prompt_image],
        outputs=[portrait],
        queue=False,
        show_api=False
    )

    gen_voice_btn.click(
        fn=generate_voice,
        inputs=[prompt_audio, voice_description],
        outputs=[voice],
        queue=False,
        show_api=False
    )

    gen_wsp_voice_btn.click(
        fn=get_whisperspeech,
        inputs=[prompt_audio_whisperspeech, audio_to_clone],
        outputs=[voice],
        queue=False,
        show_api=False
    )

    submit_btn.click(
        fn=generate_talking_portrait,
        inputs=[portrait, voice],
        outputs=[result],
        show_api=False
    )

demo.queue(max_size=2).launch(show_error=True, show_api=False)