speech-to-speech

Paused

App Files Files Community

speech-to-speech / app.py

zongxiao

Update app.py

08a0218 about 1 year ago

raw

history blame contribute delete

7.3 kB

	import torch
	import numpy as np
	import soundfile as sf
	from transformers import pipeline
	from transformers import BarkModel
	from transformers import AutoProcessor

	# Use the first CUDA device when one is available, otherwise fall back to CPU.
	if torch.cuda.is_available():
	    device = "cuda:0"
	else:
	    device = "cpu"

	# Speech-recognition pipeline (Whisper large-v2 checkpoint).
	pipe = pipeline(
	    "automatic-speech-recognition",
	    model="openai/whisper-large-v2",
	    device=device,
	)

	# Audio-classification pipeline whose predictions are shown as the
	# "Language prediction" output.
	label = pipeline(
	    "audio-classification",
	    model="facebook/mms-lid-126",
	    device=device,
	)

	# Bark text-to-speech: the processor prepares prompts, the model generates audio.
	processor = AutoProcessor.from_pretrained("suno/bark")
	model = BarkModel.from_pretrained("suno/bark").to(device)

	# Sample rate reported alongside the generated waveform.
	synthesised_rate = model.generation_config.sample_rate

	def translate(audio_file):
	    """Run speech recognition and language classification on one audio file.

	    Args:
	        audio_file: Anything ``soundfile.read`` can open (the Gradio inputs
	            in this file pass a file path).

	    Returns:
	        A ``(text, label_outputs)`` tuple: the ``"text"`` field of the ASR
	        pipeline result, and a dict mapping every predicted label to its score.
	    """
	    audio, sampling_rate = sf.read(audio_file, dtype="float32")

	    # soundfile yields a (frames, channels) array for multi-channel files,
	    # while both pipelines expect a single-channel 1-D signal: downmix to mono.
	    if audio.ndim > 1:
	        audio = audio.mean(axis=1)

	    # Hand the real sampling rate to the ASR pipeline as well; a bare array
	    # would be treated as if it were already at the model's expected rate.
	    # A separate dict is built per call because a pipeline may consume
	    # (pop) the keys of the dict it is given.
	    outputs = pipe(
	        {"array": audio, "sampling_rate": sampling_rate},
	        max_new_tokens=256,
	        generate_kwargs={"task": "transcribe", "language": "chinese"},
	    )
	    language_prediction = label({"array": audio, "sampling_rate": sampling_rate})

	    label_outputs = {pred["label"]: pred["score"] for pred in language_prediction}
	    return outputs["text"], label_outputs
	def synthesise(text_prompt,voice_preset="v2/zh_speaker_1"):
	    """Generate speech for ``text_prompt`` with the Bark model.

	    Args:
	        text_prompt: Text to be spoken.
	        voice_preset: Bark voice preset name forwarded to the processor.

	    Returns:
	        The raw result of ``model.generate``; it is not moved off ``device``.
	    """
	    bark_inputs = processor(text_prompt, voice_preset=voice_preset).to(device)
	    # pad_token_id is passed explicitly, exactly as the original call did.
	    return model.generate(**bark_inputs, pad_token_id=10000)
	def speech_to_speech_translation(audio,voice_preset="v2/zh_speaker_1"):
	    """Transcribe ``audio``, re-synthesise the text as speech, and return both.

	    Args:
	        audio: Audio input forwarded unchanged to ``translate``.
	        voice_preset: Bark voice preset forwarded to ``synthesise``.

	    Returns:
	        A 3-tuple ``((sample_rate, int16_waveform), text, label_outputs)``
	        matching the three output components of the Gradio interfaces.
	    """
	    translated_text, label_outputs = translate(audio)
	    synthesised_speech = synthesise(translated_text, voice_preset)

	    # The generated tensor lives on ``device``; ``.numpy()`` only works on
	    # host tensors, so copy it to the CPU first (a no-op when already there).
	    waveform = synthesised_speech.cpu().numpy()

	    # Saturate to [-1, 1] before scaling so stray out-of-range samples do not
	    # wrap around when cast to 16-bit integers.
	    waveform = (np.clip(waveform, -1.0, 1.0) * 32767).astype(np.int16)

	    return (synthesised_rate, waveform.T), translated_text, label_outputs

	# Title passed to both Gradio interfaces (user-facing text, kept in Chinese).
	title = "外国话转普通话"
	# Markdown description passed to both Gradio interfaces. It is user-facing
	# text: a Chinese introduction to the demo followed by a pipeline diagram.
	description = """
	作为[Hugging Face Audio course](https://github.com/danfouer/HFAudioCourse) 的结课大作业，本演示调用了三个自然语言处理的大模型，一个用于将外国话翻译成中文，一个用于判断说的哪个国家的话，一个用于将中文转成普通话语音输出。演示同时支持语音上传和麦克风输入，转换速度比较慢因为租不起GPU的服务器（支出增加20倍），建议您通过已经缓存Examples体验效果。欢迎添加我的微信号：ESGGTP 与我的平行人交流。

	![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
	"""

	# Example rows handed to the file-upload interface. Each row is
	# [audio path, None]; the commented-out rows are currently disabled.
	examples = [
	# ["./en.mp3", None],
	# ["./de.mp3", None],
	["./fr.mp3", None],
	["./it.mp3", None],
	["./nl.mp3", None],
	["./fi.mp3", None],
	# ["./cs.mp3", None],
	# ["./pl.mp3", None],
	]
	import gradio as gr

	def _result_components():
	    """Create a fresh list of output components for one interface."""
	    return [
	        gr.Audio(label="Generated Speech", type="numpy"),
	        gr.Text(label="Transcription"),
	        gr.Label(label="Language prediction"),
	    ]


	# Keyword arguments common to the upload tab and the microphone tab.
	_shared_options = dict(
	    fn=speech_to_speech_translation,
	    title=title,
	    description=description,
	)

	demo = gr.Blocks()

	# Tab 1: translate an uploaded audio file (the only tab that offers examples).
	file_transcribe = gr.Interface(
	    inputs=gr.Audio(source="upload", type="filepath"),
	    outputs=_result_components(),
	    examples=examples,
	    **_shared_options,
	)

	# Tab 2: translate audio recorded from the microphone.
	mic_transcribe = gr.Interface(
	    inputs=gr.Audio(source="microphone", type="filepath"),
	    outputs=_result_components(),
	    **_shared_options,
	)

	# Present both interfaces as tabs inside a single Blocks app.
	with demo:
	    gr.TabbedInterface(
	        [file_transcribe, mic_transcribe],
	        ["Transcribe Audio File", "Transcribe Microphone"],
	    )

	demo.launch()
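	###########################################################################################################################
	# Disabled alternate version of the app above, kept for reference only (never executed).
	# It skips the language-identification model, which its own description says was too slow for the CPU demo,
	# and passes the audio file straight to the speech-recognition pipeline.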
	# import torch
	# import numpy as np
	# import soundfile as sf
	# from transformers import pipeline
	# from transformers import BarkModel
	# from transformers import AutoProcessor

	# device = "cuda:0" if torch.cuda.is_available() else "cpu"

	# pipe = pipeline(
	# "automatic-speech-recognition", model="openai/whisper-large-v2", device=device
	# )
	# #label = pipeline("audio-classification", model="facebook/mms-lid-126", device=device)
	# processor = AutoProcessor.from_pretrained("suno/bark")
	# model = BarkModel.from_pretrained("suno/bark")
	# model = model.to(device)
	# synthesised_rate = model.generation_config.sample_rate

	# def translate(audio_file):
	# # audio, sampling_rate = sf.read(audio_file)
	# outputs = pipe(audio_file, max_new_tokens=256, generate_kwargs={"task": "transcribe","language":"chinese"})
	# # language_prediction = label({"array": audio, "sampling_rate": sampling_rate})
	# # label_outputs = {}
	# # for pred in language_prediction:
	# # label_outputs[pred["label"]] = pred["score"]
	# return outputs["text"]#,label_outputs
	# def synthesise(text_prompt,voice_preset="v2/zh_speaker_1"):
	# inputs = processor(text_prompt, voice_preset=voice_preset)
	# speech_output = model.generate(**inputs.to(device),pad_token_id=10000)
	# return speech_output
	# def speech_to_speech_translation(audio,voice_preset="v2/zh_speaker_1"):
	# #translated_text, label_outputs= translate(audio)
	# translated_text = translate(audio)
	# synthesised_speech = synthesise(translated_text,voice_preset)
	# synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
	# return (synthesised_rate , synthesised_speech.T),translated_text#,label_outputs

	# title = "外国话转中文话"
	# description = """
	# 作为[Hugging Face Audio course](https://github.com/danfouer/HFAudioCourse) 的结课大作业，本演示调用了三个自然语言处理的大模型，一个用于将外国话翻译成中文，一个用于判断说的哪个国家的话（CPU演示太慢暂时先去掉了），一个用于将中文转成语音输出。演示同时支持语音上传和麦克风输入，转换速度比较慢因为租不起GPU的服务器（支出增加20倍），建议您通过已经缓存Examples体验效果。欢迎添加我的微信号：ESGGTP 与我的平行人交流。

	# ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
	# """

	# examples = [
	# ["./en.mp3", None],
	# ["./de.mp3", None],
	# ["./fr.mp3", None],
	# ["./it.mp3", None],
	# ["./nl.mp3", None],
	# ["./fi.mp3", None],
	# ["./cs.mp3", None],
	# ["./pl.mp3", None],
	# ]
	# import gradio as gr

	# demo = gr.Blocks()
	# file_transcribe = gr.Interface(
	# fn=speech_to_speech_translation,
	# inputs=gr.Audio(source="upload", type="filepath"),
	# outputs=[
	# gr.Audio(label="Generated Speech", type="numpy"),
	# gr.Text(label="Transcription"),
	# # gr.Label(label="Language prediction"),
	# ],
	# title=title,
	# description=description,
	# examples=examples,
	# )
	# mic_transcribe = gr.Interface(
	# fn=speech_to_speech_translation,
	# inputs=gr.Audio(source="microphone", type="filepath"),
	# outputs=[
	# gr.Audio(label="Generated Speech", type="numpy"),
	# gr.Text(label="Transcription"),
	# # gr.Label(label="Language prediction"),
	# ],
	# title=title,
	# description=description,
	# )
	# with demo:
	# gr.TabbedInterface(
	# [file_transcribe, mic_transcribe],
	# ["Transcribe Audio File", "Transcribe Microphone"],
	# )

	# demo.launch()