fix
.gitignore ADDED
@@ -0,0 +1 @@
+__pycache__
demo.py ADDED
@@ -0,0 +1,24 @@
+import gradio as gr
+
+# Assumed default reference clip; the path pattern mirrors static/zh/ in main.py.
+default_voice_wav = "static/zh/新闻小说主播-女士.mp3"
+
+
+class Demo:
+
+    def render(self):
+        with gr.Row():
+            with gr.Column(scale=2):
+                with gr.Row():
+                    self.text = gr.Textbox(label="Text to convert")
+                    self.voice = gr.Dropdown(
+                        ["新闻小说主播-女士", "温柔女士"],
+                        value="新闻小说主播-女士", label="Select a voice")
+                gr.Audio(label="Reference audio", type="filepath",
+                         value=default_voice_wav, scale=3)
+
+
+if __name__ == "__main__":
+    with gr.Blocks() as demo_app:
+        Demo().render()
+    demo_app.launch(share=True, server_port=40000, server_name="0.0.0.0")
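As committed, demo.py only lays out inputs; no event drives the synthesis backend. A minimal sketch of how a button could call the text_to_speech helper this commit adds to model.py; the Button, callback, and output Audio widget below are assumptions, not part of the commit:

import gradio as gr
from model import text_to_speech  # helper added by this commit

with gr.Blocks() as app:
    text = gr.Textbox(label="Text to convert")
    voice = gr.Dropdown(["新闻小说主播-女士", "温柔女士"],
                        value="新闻小说主播-女士", label="Select a voice")
    result = gr.Audio(label="Result", type="filepath")
    # Hypothetical trigger: text_to_speech returns a wav path, which
    # gr.Audio(type="filepath") plays directly.
    gr.Button("Synthesize").click(
        lambda v, t: text_to_speech(voice=v, text=t),
        inputs=[voice, text], outputs=result)

if __name__ == "__main__":
    app.launch()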
main.py CHANGED
@@ -1,7 +1,7 @@
 from fastapi import FastAPI, Body, File, Form, UploadFile, Response, Request
 from fastapi.responses import FileResponse, StreamingResponse
 from fastapi.staticfiles import StaticFiles
-from model import clone_voice
+from model import text_to_speech
 import os
 from enum import Enum
 import uvicorn
@@ -39,30 +39,14 @@ class DefaultVoice(str, Enum):
 
 @app.post("/tts")
 async def tts(
-    # custom_voice_file: UploadFile = File(None, description="user-defined voice"),
     voice: DefaultVoice = Form("新闻女士"),
-    text: str = Form(..., description="text to convert")
-    wav_filename: str = Form("result.wav", description="output filename"),
+    text: str = Form(..., description="text to convert")
 ):
-
-    custom_voice_file = None
-    if custom_voice_file is not None:
-        os.makedirs("static/tmp", exist_ok=True)
-        content = await file.read()
-        filename = f"static/tmp/{file.filename}"
-        with open(filename, "wb") as f:
-            f.write(content)
-        voice = filename
-    if language == Language.en.value:
-        voice = f"static/en/{voice}.mp3"
-    else:
-        voice = f"static/zh/{voice}.mp3"
+    wav_path = text_to_speech(voice=voice, text=text)
     headers = {
-        "Content-Disposition": f"attachment; filename={wav_filename}",
+        "Content-Disposition": f"attachment; filename={wav_path}",
         "Content-Type": "audio/wav",
     }
 
     with open(wav_path, "rb") as audio_file:
         audio_content = audio_file.read()
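After this change, /tts takes just two form fields and streams back the wav bytes, with the generated filename echoed in Content-Disposition. A minimal client sketch; the host and port are assumptions, since the uvicorn invocation is not shown in this diff:

import requests

# URL is an assumption; use wherever uvicorn serves main.py.
resp = requests.post(
    "http://localhost:8000/tts",
    data={"voice": "新闻小说主播-女士", "text": "你好，世界"},
)
resp.raise_for_status()

# The server puts the generated wav name in Content-Disposition.
print(resp.headers.get("Content-Disposition"))
with open("result.wav", "wb") as f:
    f.write(resp.content)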
model.py CHANGED
@@ -16,6 +16,7 @@ from transformers.pipelines.audio_utils import ffmpeg_read
 import os,re,sys,LangSegment,librosa,pdb,torch,pytz,random
 from transformers import AutoModelForMaskedLM, AutoTokenizer
 from AR.models.t2s_lightning_module import Text2SemanticLightningModule
+from enum import Enum
 from env import *
 
 import logging
@@ -491,7 +492,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
     audio_data = (np.concatenate(audio_opt, 0) * 32768).astype(np.int16)
 
     audio_data = (audio_data.astype(np.float32) * volume_scale).astype(np.int16)
-    output_wav = "
+    output_wav = f"{str(int(ttime.time()))}-{random.randint(1000, 9999)}.wav"
     sf.write(output_wav, audio_data, hps.data.sampling_rate)
     endTime=timer()
     tprint(f'🆗TTS COMPLETE,{round(endTime-startTime,4)}s')
@@ -750,3 +751,24 @@ def clone_voice(user_voice,user_text,user_lang):
     return output_wav
 
 
+class Language(str, Enum):
+    en = "English"
+    zh = "中文"
+
+
+class DefaultVoice(str, Enum):
+    voice1 = "新闻小说主播-女士"
+    voice2 = "温柔女士"
+
+
+def text_to_speech(voice, text):
+    language = lang_detector(text)
+    if language == Language.en.value:
+        voice = f"static/en/{voice}.mp3"
+    else:
+        voice = f"static/zh/{voice}.mp3"
+
+    wav_path = clone_voice(
+        user_voice=voice, user_text=text, user_lang=language)
+    return wav_path
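text_to_speech bundles what the endpoint previously did inline: detect the text's language, resolve the voice name to a bundled reference clip under static/en or static/zh, and delegate to clone_voice. A hedged direct-use sketch, assuming lang_detector returns values matching the Language enum:

# Hypothetical direct use of the new helper, outside FastAPI.
from model import text_to_speech, DefaultVoice

# Chinese input takes the static/zh branch; the output name follows the
# new "<unix-time>-<random>.wav" scheme from get_tts_wav.
wav_path = text_to_speech(voice=DefaultVoice.voice1.value, text="你好，世界")
print(wav_path)  # e.g. "1700000000-4242.wav"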