# Spaces: Running
import gradio as gr | |
import torchaudio | |
import torch | |
import torch.nn.functional as F | |
from speechbrain.inference.speaker import EncoderClassifier | |
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan | |
import noisereduce as nr | |
import librosa | |
# Load the pretrained speaker-embedding classifier once, at import time, so
# every request reuses the same instance. ``savedir`` is the local directory
# handed to SpeechBrain for the model files.
classifier = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",
    savedir="pretrained_models/spkrec-xvect-voxceleb",
)
def f2embed(wav_file, classifier, size_embed):
    """Compute an L2-normalized speaker embedding for an audio file.

    The file is loaded as mono via ``stereo_to_mono`` and resampled to
    16 kHz via ``resample_to_16000`` when its sample rate differs.

    Args:
        wav_file: Path of the audio file to embed.
        classifier: Object exposing ``encode_batch(signal)``; the module-level
            SpeechBrain ``EncoderClassifier`` is what callers pass here.
        size_embed: Expected length of the resulting embedding vector.

    Returns:
        A 1-D NumPy array of length ``size_embed``, or ``None`` when the file
        could not be loaded or resampled.

    Raises:
        ValueError: If the sample rate is not 16 kHz after resampling, or the
            embedding does not have the expected shape.
    """
    signal, fs = stereo_to_mono(wav_file)
    if signal is None:
        return None

    if fs != 16000:
        signal, fs = resample_to_16000(signal, fs)
        if signal is None:
            return None

    # Explicit checks instead of ``assert``: assertions are stripped when
    # Python runs with -O, which would silently disable this validation.
    if fs != 16000:
        raise ValueError(f"Expected a 16000 Hz signal, got {fs}")

    with torch.no_grad():
        embeddings = classifier.encode_batch(signal)
        embeddings = F.normalize(embeddings, dim=2)
        embeddings = embeddings.squeeze().cpu().numpy()

    # A single mono utterance must squeeze down to one flat vector; anything
    # else (e.g. one row per channel) is not a usable speaker embedding.
    if embeddings.ndim != 1 or embeddings.shape[0] != size_embed:
        raise ValueError(
            f"Unexpected embedding shape {embeddings.shape}; "
            f"expected ({size_embed},)"
        )
    return embeddings
def stereo_to_mono(wav_file):
    """Load an audio file and downmix it to a single channel.

    Args:
        wav_file: Path of the audio file to load with ``torchaudio.load``.

    Returns:
        ``(signal, fs)`` where ``signal`` is a tensor with one channel on
        axis 0 and ``fs`` is the sample rate reported by torchaudio, or
        ``(None, None)`` if loading or conversion fails.
    """
    try:
        signal, fs = torchaudio.load(wav_file)
        if signal.shape[0] > 1:
            # Average over the channel axis. This covers stereo as well as
            # files with more than two channels, which previously passed
            # through unchanged and broke the embedding step downstream.
            signal_mono = signal.mean(dim=0, keepdim=True)
        else:
            signal_mono = signal  # Already mono
        print(f"Converted to mono: {signal_mono.shape}")
        return signal_mono, fs
    except Exception as e:
        # Best-effort boundary: callers test for ``None`` instead of catching.
        print(f"Error in stereo_to_mono: {e}")
        return None, None
def resample_to_16000(signal, original_sr):
    """Resample an audio tensor to 16 kHz using librosa.

    Args:
        signal: Audio tensor; it is flattened to one dimension before
            resampling.
        original_sr: Sample rate of ``signal`` in Hz.

    Returns:
        ``(resampled, 16000)`` where ``resampled`` is a tensor with a leading
        dimension of size 1, or ``(None, None)`` if anything fails.
    """
    target_rate = 16000
    try:
        flat_samples = signal.numpy().flatten()
        resampled_np = librosa.resample(
            flat_samples, orig_sr=original_sr, target_sr=target_rate
        )
        signal_resampled = torch.from_numpy(resampled_np).unsqueeze(0)
        print(f"Resampled to 16000 Hz: {signal_resampled.shape}")
    except Exception as e:
        print(f"Error in resample_to_16000: {e}")
        return None, None
    return signal_resampled, target_rate
def reduce_noise(speech, noise_reduction_amount=0.5):
    """Denoise 16 kHz speech with ``noisereduce``.

    Args:
        speech: Audio samples as a NumPy array or a torch tensor. Tensors are
            converted to NumPy first, so the result is an array on both the
            success path and the fallback path.
        noise_reduction_amount: Accepted for backward compatibility.
            NOTE(review): this value is not forwarded to ``nr.reduce_noise``,
            so it has no effect; wiring it in (e.g. as ``prop_decrease``)
            would change the current output and needs a deliberate decision.

    Returns:
        The denoised samples, or the (NumPy-converted) input unchanged if
        noise reduction raises.
    """
    if isinstance(speech, torch.Tensor):
        # The TTS model returns a tensor, while the Gradio output downstream
        # is declared as ``type="numpy"``; convert once here so a failed
        # denoise does not hand a tensor back to the caller.
        speech = speech.detach().cpu().numpy()
    try:
        return nr.reduce_noise(y=speech, sr=16000)
    except Exception as e:
        # Deliberate best effort: un-denoised audio beats no audio at all.
        print(f"Error in reduce_noise: {e}")
        return speech
# Lazily populated cache of the SpeechT5 components so they are loaded once
# per process instead of on every request.
_TTS_COMPONENTS = {}


def _get_tts_components():
    """Return ``(processor, model, vocoder)``, loading them on first use."""
    if not _TTS_COMPONENTS:
        processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
        vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
        # Publish all three together so a failed load never leaves a
        # half-filled cache behind.
        _TTS_COMPONENTS.update(processor=processor, model=model, vocoder=vocoder)
    return (
        _TTS_COMPONENTS["processor"],
        _TTS_COMPONENTS["model"],
        _TTS_COMPONENTS["vocoder"],
    )


def process_audio(wav_file, text):
    """Synthesize ``text`` in the voice of the speaker recorded in ``wav_file``.

    Args:
        wav_file: Path of the reference speaker recording.
        text: Text to convert to speech.

    Returns:
        ``(samples, 16000)`` on success, where ``samples`` is the denoised
        waveform as a NumPy array; ``(None, message)`` on failure, where
        ``message`` is a short description of the error.
    """
    if not text or not text.strip():
        # Nothing to synthesize; bail out before any model work.
        return None, "Error: no text provided"

    try:
        # Extract speaker embeddings
        speaker_embeddings = f2embed(wav_file, classifier, 512)
        if speaker_embeddings is None:
            return None, "Error in speaker embedding extraction"
        embeddings = torch.tensor(speaker_embeddings).unsqueeze(0)

        # Convert text to speech using the speaker embeddings
        processor, model, vocoder = _get_tts_components()
        inputs = processor(text=text, return_tensors="pt")
        speech = model.generate_speech(
            inputs["input_ids"],
            speaker_embeddings=embeddings,
            vocoder=vocoder,
        )
        print(f"Generated speech, shape: {speech.shape}")

        # Reduce noise. The denoiser and the Gradio output both work with
        # NumPy arrays, so convert the generated tensor first.
        speech_denoised = reduce_noise(speech.detach().cpu().numpy())
        print(f"Reduced noise, signal shape: {speech_denoised.shape}")
        return speech_denoised, 16000
    except Exception as e:
        print(f"Error in process_audio: {e}")
        return None, "Error in audio processing"
# Gradio interface | |
def gradio_interface(wav_file, text):
    """Gradio callback: synthesize ``text`` in the uploaded speaker's voice.

    Args:
        wav_file: File path of the uploaded speaker audio.
        text: Text to convert to speech.

    Returns:
        ``(sample_rate, samples)`` for the ``gr.Audio(type="numpy")`` output.

    Raises:
        gr.Error: If processing fails. The output component is an Audio
            widget, so an error string cannot be returned as its value;
            raising ``gr.Error`` shows the message in the UI instead.
    """
    default_message = "Error occurred during processing"
    try:
        processed_audio, rate = process_audio(wav_file, text)
    except Exception as e:
        print(f"Error in gradio_interface: {e}")
        raise gr.Error(default_message) from e

    if processed_audio is None:
        # On failure ``process_audio`` puts a description in the second slot.
        raise gr.Error(rate if isinstance(rate, str) else default_message)

    if isinstance(processed_audio, torch.Tensor):
        # The output component is declared with type="numpy".
        processed_audio = processed_audio.detach().cpu().numpy()
    return (rate, processed_audio)
# Build the web UI: a speaker recording plus the text to speak go in, the
# synthesized audio comes out. The app is served as soon as it is built.
gr_interface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Audio(type="filepath"),
        gr.Textbox(lines=2, placeholder="Enter text here..."),
    ],
    outputs=gr.Audio(type="numpy"),
    title="Text-to-Speech with Speaker Embeddings",
    description="Upload a speaker audio file and enter text to convert the text to speech using the speaker's voice.",
)
gr_interface.launch()
# process_audio("/content/Network Chunck.mp3","Hello this network chunk") |