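"""
Gradio app for generating talking portraits.

Pipeline: optionally generate a portrait with the SDXL-Lightning Space and a voice
with the Parler-TTS or WhisperSpeech Spaces, then animate the portrait with the
fudan-generative-ai/hallo audio-driven portrait animation model.
"""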
import argparse
import os
import shutil
import uuid

from huggingface_hub import snapshot_download
import gradio as gr
from gradio_client import Client, handle_file
from mutagen.mp3 import MP3
from pydub import AudioSegment
from PIL import Image

# Make sure relative paths (configs, pretrained_models, outputs) resolve from the app directory.
os.chdir(os.path.dirname(os.path.abspath(__file__)))

from scripts.inference import inference_process

# Detect whether we are running on the shared Space; inference is only enabled on duplicated instances.
is_shared_ui = "fudan-generative-ai/hallo" in os.environ.get("SPACE_ID", "")

if not is_shared_ui:
    hallo_dir = snapshot_download(repo_id="fudan-generative-ai/hallo", local_dir="pretrained_models")

def is_mp3(file_path):
    """Return True if the file can be parsed as an MP3."""
    try:
        MP3(file_path)
        return True
    except Exception:
        return False

def convert_mp3_to_wav(mp3_file_path, wav_file_path):
    # Load the MP3 file
    audio = AudioSegment.from_mp3(mp3_file_path)
    # Export as WAV file
    audio.export(wav_file_path, format="wav")
    return wav_file_path

def trim_audio(file_path, output_path, max_duration=4000):
    # Load the audio file
    audio = AudioSegment.from_wav(file_path)
    # Check the length of the audio in milliseconds
    audio_length = len(audio)
    # If the audio is longer than the maximum duration, trim it
    if audio_length > max_duration:
        trimmed_audio = audio[:max_duration]
    else:
        trimmed_audio = audio
    # Export the trimmed audio to a new file
    trimmed_audio.export(output_path, format="wav")
    return output_path

def add_silence_to_wav(wav_file_path, duration_s=1):
    # Load the WAV file
    audio = AudioSegment.from_wav(wav_file_path)
    # Create the requested amount of silence (AudioSegment.silent takes milliseconds)
    silence = AudioSegment.silent(duration=duration_s * 1000)
    # Add silence to the end of the audio file
    audio_with_silence = audio + silence
    # Export the modified audio, overwriting the input file
    audio_with_silence.export(wav_file_path, format="wav")
    return wav_file_path

def check_mp3(file_path):
    # Convert MP3 uploads to WAV; leave other formats untouched.
    if is_mp3(file_path):
        wav_file_path = os.path.splitext(file_path)[0] + ".wav"
        converted_audio = convert_mp3_to_wav(file_path, wav_file_path)
        print(f"File converted to {wav_file_path}")
        return converted_audio
    else:
        print("The file is not an MP3 file.")
        return file_path

def convert_webp_to_png(webp_file):
    # Open the WebP image
    webp_image = Image.open(webp_file)
    # Convert and save as PNG
    webp_image.save("png_converted_image.png", "PNG")
    return "png_converted_image.png"

def generate_portrait(prompt_image):
    if prompt_image is None or prompt_image == "":
        raise gr.Error("Can't generate a portrait without a prompt!")
    # Generate a portrait image via the SDXL-Lightning Space.
    client = Client("AP123/SDXL-Lightning")
    result = client.predict(
        prompt_image,
        "4-Step",
        api_name="/generate_image"
    )
    print(result)
    return result

def generate_voice(prompt_audio, voice_description):
    if prompt_audio is None or prompt_audio == "":
        raise gr.Error("Can't generate a voice without text to synthesize!")
    if voice_description is None or voice_description == "":
        gr.Info(
            "For better control, you may want to provide a voice character description next time.",
            duration=10,
            visible=True
        )
    # Synthesize speech via the Parler-TTS mini Space.
    client = Client("parler-tts/parler_tts_mini")
    result = client.predict(
        text=prompt_audio,
        description=voice_description,
        api_name="/gen_tts"
    )
    print(result)
    return result

def get_whisperspeech(prompt_audio_whisperspeech, audio_to_clone):
    # Clone the reference voice and synthesize the text via the WhisperSpeech Space.
    client = Client("collabora/WhisperSpeech")
    result = client.predict(
        multilingual_text=prompt_audio_whisperspeech,
        speaker_audio=handle_file(audio_to_clone),
        speaker_url="",
        cps=14,
        api_name="/whisper_speech_demo"
    )
    print(result)
    return result

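# run_hallo drives the Hallo inference pipeline directly: it packs the arguments that
# scripts/inference.py would otherwise receive on the command line into an
# argparse.Namespace and writes the result to a uniquely named MP4 file.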
def run_hallo(source_image, driving_audio, progress=gr.Progress(track_tqdm=True)):
    if is_shared_ui:
        raise gr.Error("This Space only works in duplicated instances")
    unique_id = uuid.uuid4()
    args = argparse.Namespace(
        config='configs/inference/default.yaml',
        source_image=source_image,
        driving_audio=driving_audio,
        output=f'output-{unique_id}.mp4',
        pose_weight=1.0,
        face_weight=1.0,
        lip_weight=1.0,
        face_expand_ratio=1.2,
        checkpoint=None
    )
    inference_process(args)
    return f'output-{unique_id}.mp4'

def generate_talking_portrait(portrait, voice):
    if portrait is None:
        raise gr.Error("Please provide a portrait to animate.")
    if voice is None:
        raise gr.Error("Please provide audio (4 seconds max).")
    # Trim the audio to 4 seconds and pad it with a second of trailing silence.
    input_file = voice
    trimmed_output_file = "trimmed_audio.wav"
    trimmed_output_file = trim_audio(input_file, trimmed_output_file)
    voice = trimmed_output_file
    ready_audio = add_silence_to_wav(voice)
    print(f"1 second of silence added to {voice}")
    # Call Hallo to animate the portrait with the prepared audio.
    talking_portrait_vid = run_hallo(portrait, ready_audio)
    return talking_portrait_vid

css = '''
#col-container {
    margin: 0 auto;
}
#main-group {
    background-color: transparent;
}
.tabs {
    background-color: unset;
}
#image-block {
    flex: 1;
}
#video-block {
    flex: 9;
}
#audio-block, #audio-clone-elm {
    flex: 1;
}
#text-synth, #voice-desc, #text-synth-wsp {
    height: 180px;
}
#audio-column, #result-column {
    display: flex;
}
#gen-voice-btn {
    flex: 1;
}
#parler-tab, #whisperspeech-tab {
    padding: 0;
}
#main-submit {
    flex: 1;
}
div#warning-ready {
    background-color: #ecfdf5;
    padding: 0 16px 16px;
    margin: 20px 0;
    color: #030303 !important;
}
div#warning-ready > .gr-prose > h2, div#warning-ready > .gr-prose > p {
    color: #057857 !important;
}
div#warning-duplicate {
    background-color: #ebf5ff;
    padding: 0 16px 16px;
    margin: 20px 0;
    color: #030303 !important;
}
div#warning-duplicate > .gr-prose > h2, div#warning-duplicate > .gr-prose > p {
    color: #0f4592 !important;
}
div#warning-duplicate strong {
    color: #0f4592;
}
p.actions {
    display: flex;
    align-items: center;
    margin: 20px 0;
}
div#warning-duplicate .actions a {
    display: inline-block;
    margin-right: 10px;
}
.dark #warning-duplicate {
    background-color: #0c0c0c !important;
    border: 1px solid white !important;
}
'''

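# UI layout: portrait image and text-to-image prompt on the left, the audio input with
# Parler TTS and WhisperSpeech tabs in the middle, and the resulting video on the right.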
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown("""
        # Parler X Hallo
        Generate talking portraits
        """)
        with gr.Group(elem_id="main-group"):
            with gr.Row():
                with gr.Column():
                    portrait = gr.Image(
                        sources=["upload"],
                        type="filepath",
                        format="png",
                        elem_id="image-block"
                    )
                    prompt_image = gr.Textbox(
                        label="Generate image",
                        lines=3
                    )
                    gen_image_btn = gr.Button("Generate portrait (optional)")
                with gr.Column(elem_id="audio-column"):
                    voice = gr.Audio(
                        type="filepath",
                        max_length=4,  # accept at most 4 seconds of audio (max_length is in seconds)
                        elem_id="audio-block"
                    )
with gr.Tab("Parler TTS", elem_id="parler-tab"): | |
prompt_audio = gr.Textbox( | |
label="Text to synthetize", | |
lines=4, | |
max_lines=4, | |
elem_id="text-synth" | |
) | |
voice_description = gr.Textbox( | |
label="Voice description", | |
lines=4, | |
max_lines=4, | |
elem_id="voice-desc" | |
) | |
gen_voice_btn = gr.Button("Generate voice (optional)") | |
with gr.Tab("WhisperSpeech", elem_id="whisperspeech-tab"): | |
prompt_audio_whisperspeech = gr.Textbox( | |
label="Text to synthetize", | |
lines=4, | |
max_lines=4, | |
elem_id="text-synth-wsp" | |
) | |
audio_to_clone = gr.Audio( | |
label="Voice to clone", | |
type="filepath", | |
elem_id="audio-clone-elm" | |
) | |
gen_wsp_voice_btn = gr.Button("Generate voice clone (optional)") | |
with gr.Column(elem_id="result-column"): | |
result = gr.Video( | |
elem_id="video-block" | |
) | |
submit_btn = gr.Button("Submit", elem_id="main-submit") | |
    voice.upload(
        fn=check_mp3,
        inputs=[voice],
        outputs=[voice],
        queue=False,
        show_api=False
    )

    gen_image_btn.click(
        fn=generate_portrait,
        inputs=[prompt_image],
        outputs=[portrait],
        queue=False,
        show_api=False
    )

    gen_voice_btn.click(
        fn=generate_voice,
        inputs=[prompt_audio, voice_description],
        outputs=[voice],
        queue=False,
        show_api=False
    )

    gen_wsp_voice_btn.click(
        fn=get_whisperspeech,
        inputs=[prompt_audio_whisperspeech, audio_to_clone],
        outputs=[voice],
        queue=False,
        show_api=False
    )

    submit_btn.click(
        fn=generate_talking_portrait,
        inputs=[portrait, voice],
        outputs=[result],
        show_api=False
    )

demo.queue(max_size=2).launch(show_error=True, show_api=False)