import os
import shutil
from huggingface_hub import snapshot_download
import gradio as gr
from gradio_client import Client, handle_file
from mutagen.mp3 import MP3
from pydub import AudioSegment
from PIL import Image
os.chdir(os.path.dirname(os.path.abspath(__file__)))
from scripts.inference import inference_process
import argparse
import uuid
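# The public shared Space only exposes the UI; the Hallo weights are downloaded when the Space is duplicated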
is_shared_ui = "fudan-generative-ai/hallo" in os.environ.get('SPACE_ID', '')
if not is_shared_ui:
hallo_dir = snapshot_download(repo_id="fudan-generative-ai/hallo", local_dir="pretrained_models")
def is_mp3(file_path):
    # Return True if the file parses as a valid MP3, False otherwise
    try:
        MP3(file_path)
        return True
    except Exception:
        return False
def convert_mp3_to_wav(mp3_file_path, wav_file_path):
# Load the MP3 file
audio = AudioSegment.from_mp3(mp3_file_path)
# Export as WAV file
audio.export(wav_file_path, format="wav")
return wav_file_path
def trim_audio(file_path, output_path, max_duration=4000):
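    # max_duration is expressed in milliseconds (default 4000 ms = 4 seconds)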
# Load the audio file
audio = AudioSegment.from_wav(file_path)
# Check the length of the audio in milliseconds
audio_length = len(audio)
# If the audio is longer than the maximum duration, trim it
if audio_length > max_duration:
trimmed_audio = audio[:max_duration]
else:
trimmed_audio = audio
# Export the trimmed audio to a new file
trimmed_audio.export(output_path, format="wav")
return output_path
def add_silence_to_wav(wav_file_path, duration_s=1):
# Load the WAV file
audio = AudioSegment.from_wav(wav_file_path)
    # Create duration_s seconds of silence (pydub durations are in milliseconds)
    silence = AudioSegment.silent(duration=duration_s * 1000)
# Add silence to the end of the audio file
audio_with_silence = audio + silence
# Export the modified audio
audio_with_silence.export(wav_file_path, format="wav")
return wav_file_path
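# Gradio upload hook: if the uploaded audio is an MP3, convert it to WAV; otherwise return it unchanged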
def check_mp3(file_path):
if is_mp3(file_path):
wav_file_path = os.path.splitext(file_path)[0] + '.wav'
converted_audio = convert_mp3_to_wav(file_path, wav_file_path)
print(f"File converted to {wav_file_path}")
return converted_audio
else:
print("The file is not an MP3 file.")
return file_path
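# Convert a WebP image to a PNG file saved as png_converted_image.png in the working directory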
def convert_webp_to_png(webp_file):
# Open the WebP image
webp_image = Image.open(webp_file)
# Convert and save as PNG
webp_image.save("png_converted_image.png", "PNG")
return "png_converted_image.png"
def generate_portrait(prompt_image):
if prompt_image is None or prompt_image == "":
raise gr.Error("Can't generate a portrait without a prompt !")
client = Client("AP123/SDXL-Lightning")
result = client.predict(
prompt_image,
"4-Step",
api_name="/generate_image"
)
print(result)
return result
def generate_voice(prompt_audio, voice_description):
    if prompt_audio is None or prompt_audio == "":
        raise gr.Error("Can't generate a voice without text to synthesize!")
    if voice_description is None or voice_description == "":
        gr.Info(
            "For better control, you may want to provide a voice character description next time.",
            duration=10,
            visible=True
        )
client = Client("parler-tts/parler_tts_mini")
result = client.predict(
text=prompt_audio,
description=voice_description,
api_name="/gen_tts"
)
print(result)
return result
def get_whisperspeech(prompt_audio_whisperspeech, audio_to_clone):
client = Client("collabora/WhisperSpeech")
result = client.predict(
multilingual_text=prompt_audio_whisperspeech,
speaker_audio=handle_file(audio_to_clone),
speaker_url="",
cps=14,
api_name="/whisper_speech_demo"
)
print(result)
return result
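# Run Hallo inference locally with the default config; refuses to run on the shared demo Space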
def run_hallo(source_image, driving_audio, progress=gr.Progress(track_tqdm=True)):
if is_shared_ui:
raise gr.Error("This Space only works in duplicated instances")
unique_id = uuid.uuid4()
args = argparse.Namespace(
config='configs/inference/default.yaml',
source_image=source_image,
driving_audio=driving_audio,
output=f'output-{unique_id}.mp4',
pose_weight=1.0,
face_weight=1.0,
lip_weight=1.0,
face_expand_ratio=1.2,
checkpoint=None
)
inference_process(args)
return f'output-{unique_id}.mp4'
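# End-to-end pipeline: trim the audio to 4 seconds, append 1 second of silence, then animate the portrait with Hallo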
def generate_talking_portrait(portrait, voice):
if portrait is None:
raise gr.Error("Please provide a portrait to animate.")
if voice is None:
raise gr.Error("Please provide audio (4 seconds max).")
# trim audio
input_file = voice
trimmed_output_file = "trimmed_audio.wav"
trimmed_output_file = trim_audio(input_file, trimmed_output_file)
voice = trimmed_output_file
ready_audio = add_silence_to_wav(voice)
print(f"1 second of silence added to {voice}")
# call hallo
talking_portrait_vid = run_hallo(portrait, ready_audio)
return talking_portrait_vid
css = '''
#col-container {
margin: 0 auto;
}
#main-group {
    background-color: transparent;
}
.tabs {
background-color: unset;
}
#image-block {
flex: 1;
}
#video-block {
flex: 9;
}
#audio-block, #audio-clone-elm {
flex: 1;
}
#text-synth, #voice-desc, #text-synth-wsp {
height: 180px;
}
#audio-column, #result-column {
display: flex;
}
#gen-voice-btn {
flex: 1;
}
#parler-tab, #whisperspeech-tab {
padding: 0;
}
#main-submit {
flex: 1;
}
div#warning-ready {
background-color: #ecfdf5;
padding: 0 16px 16px;
margin: 20px 0;
color: #030303!important;
}
div#warning-ready > .gr-prose > h2, div#warning-ready > .gr-prose > p {
color: #057857!important;
}
div#warning-duplicate {
background-color: #ebf5ff;
padding: 0 16px 16px;
margin: 20px 0;
color: #030303!important;
}
div#warning-duplicate > .gr-prose > h2, div#warning-duplicate > .gr-prose > p {
color: #0f4592!important;
}
div#warning-duplicate strong {
color: #0f4592;
}
p.actions {
display: flex;
align-items: center;
margin: 20px 0;
}
div#warning-duplicate .actions a {
display: inline-block;
margin-right: 10px;
}
.dark #warning-duplicate {
background-color: #0c0c0c !important;
border: 1px solid white !important;
}
'''
with gr.Blocks(css=css) as demo:
with gr.Column(elem_id="col-container"):
gr.Markdown("""
# Parler X Hallo
Generate talking portraits
""")
with gr.Group(elem_id="main-group"):
with gr.Row():
with gr.Column():
portrait = gr.Image(
sources=["upload"],
type="filepath",
format="png",
elem_id="image-block"
)
prompt_image = gr.Textbox(
label="Generate image",
lines=3
)
gen_image_btn = gr.Button("Generate portrait (optional)")
with gr.Column(elem_id="audio-column"):
voice = gr.Audio(
type="filepath",
                        max_length=4,  # gr.Audio max_length is in seconds; keep uploads to the 4-second limit
elem_id="audio-block"
)
with gr.Tab("Parler TTS", elem_id="parler-tab"):
prompt_audio = gr.Textbox(
label="Text to synthetize",
lines=4,
max_lines=4,
elem_id="text-synth"
)
voice_description = gr.Textbox(
label="Voice description",
lines=4,
max_lines=4,
elem_id="voice-desc"
)
gen_voice_btn = gr.Button("Generate voice (optional)")
with gr.Tab("WhisperSpeech", elem_id="whisperspeech-tab"):
prompt_audio_whisperspeech = gr.Textbox(
label="Text to synthetize",
lines=4,
max_lines=4,
elem_id="text-synth-wsp"
)
audio_to_clone = gr.Audio(
label="Voice to clone",
type="filepath",
elem_id="audio-clone-elm"
)
gen_wsp_voice_btn = gr.Button("Generate voice clone (optional)")
with gr.Column(elem_id="result-column"):
result = gr.Video(
elem_id="video-block"
)
submit_btn = gr.Button("Submit", elem_id="main-submit")
voice.upload(
fn = check_mp3,
inputs = [voice],
outputs = [voice],
queue = False,
show_api = False
)
gen_image_btn.click(
fn = generate_portrait,
inputs = [prompt_image],
outputs = [portrait],
queue=False,
show_api = False
)
gen_voice_btn.click(
fn = generate_voice,
inputs = [prompt_audio, voice_description],
outputs = [voice],
queue=False,
show_api = False
)
gen_wsp_voice_btn.click(
fn = get_whisperspeech,
inputs = [prompt_audio_whisperspeech, audio_to_clone],
outputs = [voice],
queue=False,
show_api = False
)
submit_btn.click(
fn = generate_talking_portrait,
inputs = [portrait, voice],
outputs = [result],
show_api = False
)
demo.queue(max_size=2).launch(show_error=True, show_api=False)