lijiacai commited on
Commit
e0786ec
1 Parent(s): 88c1edb
Files changed (4) hide show
  1. .gitignore +1 -0
  2. demo.py +21 -0
  3. main.py +4 -20
  4. model.py +23 -1
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__
demo.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from enum import Enum
3
+
4
+
5
+
6
class Demo:
    """Gradio UI fragment for the text-to-speech demo.

    ``render`` must be called inside an active ``gr.Blocks`` context; it
    creates the input widgets and stores them on the instance so callers
    can wire them to event handlers.
    """

    # Default choices mirror model.DefaultVoice's values.
    VOICE_CHOICES = ["新闻小说主播-女士", "温柔女士"]

    def render(self):
        """Build the input row: text box, voice selector, reference audio."""
        with gr.Row():
            with gr.Column(scale=2):
                with gr.Row():
                    self.text = gr.Textbox(label="请输入需要转换的文本")
                    # BUG FIX: the original passed value=["swam", "slept"],
                    # a list of items that are not among the choices (and a
                    # list is invalid for a single-select dropdown). Default
                    # to the first declared voice instead.
                    self.voice = gr.Dropdown(
                        self.VOICE_CHOICES,
                        value=self.VOICE_CHOICES[0],
                        label="选择音色")
                # BUG FIX: `default_voice_wav` was never defined anywhere in
                # this file (NameError on render). Start with no reference
                # clip; the component still accepts uploads via filepath.
                # NOTE(review): if a default clip is intended, confirm its
                # path with the static/ assets referenced by model.py.
                gr.Audio(label="Reference audio", type="filepath",
                         value=None, scale=3)
16
+
17
+
18
+
19
+
20
if __name__ == "__main__":
    # BUG FIX: the original called ui(), a name never defined in this file
    # (NameError at startup). Build the interface from the Demo class that
    # IS defined here: render it inside a Blocks context, then launch.
    with gr.Blocks() as app:
        Demo().render()
    app.launch(share=True, server_port=40000, server_name="0.0.0.0")
main.py CHANGED
@@ -1,7 +1,7 @@
1
  from fastapi import FastAPI, Body, File, Form, UploadFile, Response, Request
2
  from fastapi.responses import FileResponse, StreamingResponse
3
  from fastapi.staticfiles import StaticFiles
4
- from model import clone_voice, lang_detector, cut3, cut5
5
  import os
6
  from enum import Enum
7
  import uvicorn
@@ -39,30 +39,14 @@ class DefaultVoice(str, Enum):
39
 
40
  @app.post("/tts")
41
  async def tts(
42
- # custom_voice_file: UploadFile = File(None, description="用户自定义声音"),
43
  voice: DefaultVoice = Form("新闻女士"),
44
- text: str = Form(..., description="转换文本"),
45
- wav_filename: str = Form("result.wav", description="输出流-文件名称"),
46
  ):
47
- language = lang_detector(text)
48
- custom_voice_file = None
49
- if custom_voice_file is not None:
50
- os.makedirs("static/tmp", exist_ok=True)
51
- content = await file.read()
52
- filename = f"static/tmp/{file.filename}"
53
- with open(filename, "wb") as f:
54
- f.write(content)
55
- voice = filename
56
- if language == Language.en.value:
57
- voice = f"static/en/{voice}.mp3"
58
- else:
59
- voice = f"static/zh/{voice}.mp3"
60
  headers = {
61
- "Content-Disposition": f"attachment; filename={wav_filename}",
62
  "Content-Type": "audio/wav",
63
  }
64
- wav_path = clone_voice(
65
- user_voice=voice, user_text=text, user_lang=language)
66
 
67
  with open(wav_path, "rb") as audio_file:
68
  audio_content = audio_file.read()
 
1
  from fastapi import FastAPI, Body, File, Form, UploadFile, Response, Request
2
  from fastapi.responses import FileResponse, StreamingResponse
3
  from fastapi.staticfiles import StaticFiles
4
+ from model import text_to_speech
5
  import os
6
  from enum import Enum
7
  import uvicorn
 
39
 
40
  @app.post("/tts")
41
  async def tts(
 
42
  voice: DefaultVoice = Form("新闻女士"),
43
+ text: str = Form(..., description="转换文本")
 
44
  ):
45
+ wav_path = text_to_speech(voice=voice, text=text)
 
 
 
 
 
 
 
 
 
 
 
 
46
  headers = {
47
+ "Content-Disposition": f"attachment; filename={wav_path}",
48
  "Content-Type": "audio/wav",
49
  }
 
 
50
 
51
  with open(wav_path, "rb") as audio_file:
52
  audio_content = audio_file.read()
model.py CHANGED
@@ -16,6 +16,7 @@ from transformers.pipelines.audio_utils import ffmpeg_read
16
  import os,re,sys,LangSegment,librosa,pdb,torch,pytz,random
17
  from transformers import AutoModelForMaskedLM, AutoTokenizer
18
  from AR.models.t2s_lightning_module import Text2SemanticLightningModule
 
19
  from env import *
20
 
21
  import logging
@@ -491,7 +492,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
491
  audio_data = (np.concatenate(audio_opt, 0) * 32768).astype(np.int16)
492
 
493
  audio_data = (audio_data.astype(np.float32) * volume_scale).astype(np.int16)
494
- output_wav = "output_audio.wav"
495
  sf.write(output_wav, audio_data, hps.data.sampling_rate)
496
  endTime=timer()
497
  tprint(f'🆗TTS COMPLETE,{round(endTime-startTime,4)}s')
@@ -750,3 +751,24 @@ def clone_voice(user_voice,user_text,user_lang):
750
  return output_wav
751
 
752
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  import os,re,sys,LangSegment,librosa,pdb,torch,pytz,random
17
  from transformers import AutoModelForMaskedLM, AutoTokenizer
18
  from AR.models.t2s_lightning_module import Text2SemanticLightningModule
19
+ from enum import Enum
20
  from env import *
21
 
22
  import logging
 
492
  audio_data = (np.concatenate(audio_opt, 0) * 32768).astype(np.int16)
493
 
494
  audio_data = (audio_data.astype(np.float32) * volume_scale).astype(np.int16)
495
+ output_wav = f"{str(int(ttime.time()))}-{random.randint(1000, 9999)}.wav"
496
  sf.write(output_wav, audio_data, hps.data.sampling_rate)
497
  endTime=timer()
498
  tprint(f'🆗TTS COMPLETE,{round(endTime-startTime,4)}s')
 
751
  return output_wav
752
 
753
 
754
class Language(str, Enum):
    """Closed set of languages the TTS pipeline distinguishes.

    Inherits from ``str`` so members compare equal to their plain string
    values (e.g. the output of the language detector).
    """

    en = "English"
    zh = "中文"
757
+
758
+
759
class DefaultVoice(str, Enum):
    """Built-in reference voices shipped with the service.

    Member values are the display names; they double as the basename of
    the reference clip under ``static/{en,zh}/``.
    """

    voice1 = "新闻小说主播-女士"
    voice2 = "温柔女士"
762
+
763
+
764
def text_to_speech(voice, text):
    """Synthesize *text* using the named reference *voice*.

    Detects the text's language, resolves the voice name to the matching
    reference clip under ``static/en`` or ``static/zh``, and delegates to
    :func:`clone_voice`.

    Returns the path of the generated wav file.
    """
    language = lang_detector(text)
    # Pick the clip from the language-specific asset folder; anything the
    # detector does not classify as English falls back to the zh assets.
    if language == Language.en.value:
        voice_clip = f"static/en/{voice}.mp3"
    else:
        voice_clip = f"static/zh/{voice}.mp3"
    return clone_voice(
        user_voice=voice_clip, user_text=text, user_lang=language)
774
+