lijiacai commited on
Commit
e0786ec
1 Parent(s): 88c1edb
Files changed (4) hide show
  1. .gitignore +1 -0
  2. demo.py +21 -0
  3. main.py +4 -20
  4. model.py +23 -1
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__
demo.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from enum import Enum
3
+
4
+
5
+
6
class Demo:
    """Gradio UI fragment for the text-to-speech demo.

    ``render`` must be called inside an active ``gr.Blocks`` context; it
    creates the input widgets and stores them on the instance so callers
    can wire them to event handlers.
    """

    # Default choices mirror model.DefaultVoice's values.
    VOICE_CHOICES = ["新闻小说主播-女士", "温柔女士"]

    def render(self):
        """Build the input row: text box, voice selector, reference audio."""
        with gr.Row():
            with gr.Column(scale=2):
                with gr.Row():
                    self.text = gr.Textbox(label="请输入需要转换的文本")
                    # BUG FIX: the original passed value=["swam", "slept"],
                    # a list of items that are not among the choices (and a
                    # list is invalid for a single-select dropdown). Default
                    # to the first declared voice instead.
                    self.voice = gr.Dropdown(
                        self.VOICE_CHOICES,
                        value=self.VOICE_CHOICES[0],
                        label="选择音色")
                # BUG FIX: `default_voice_wav` was never defined anywhere in
                # this file (NameError on render). Start with no reference
                # clip; the component still accepts uploads via filepath.
                # NOTE(review): if a default clip is intended, confirm its
                # path with the static/ assets referenced by model.py.
                gr.Audio(label="Reference audio", type="filepath",
                         value=None, scale=3)
16
+
17
+
18
+
19
+
20
if __name__ == "__main__":
    # BUG FIX: the original called ui(), a name never defined in this file
    # (NameError at startup). Build the interface from the Demo class that
    # IS defined here: render it inside a Blocks context, then launch.
    with gr.Blocks() as app:
        Demo().render()
    app.launch(share=True, server_port=40000, server_name="0.0.0.0")
main.py CHANGED
@@ -1,7 +1,7 @@
1
  from fastapi import FastAPI, Body, File, Form, UploadFile, Response, Request
2
  from fastapi.responses import FileResponse, StreamingResponse
3
  from fastapi.staticfiles import StaticFiles
4
- from model import clone_voice, lang_detector, cut3, cut5
5
  import os
6
  from enum import Enum
7
  import uvicorn
@@ -39,30 +39,14 @@ class DefaultVoice(str, Enum):
39
 
40
  @app.post("/tts")
41
  async def tts(
42
- # custom_voice_file: UploadFile = File(None, description="用户自定义声音"),
43
  voice: DefaultVoice = Form("新闻女士"),
44
- text: str = Form(..., description="转换文本"),
45
- wav_filename: str = Form("result.wav", description="输出流-文件名称"),
46
  ):
47
- language = lang_detector(text)
48
- custom_voice_file = None
49
- if custom_voice_file is not None:
50
- os.makedirs("static/tmp", exist_ok=True)
51
- content = await file.read()
52
- filename = f"static/tmp/{file.filename}"
53
- with open(filename, "wb") as f:
54
- f.write(content)
55
- voice = filename
56
- if language == Language.en.value:
57
- voice = f"static/en/{voice}.mp3"
58
- else:
59
- voice = f"static/zh/{voice}.mp3"
60
  headers = {
61
- "Content-Disposition": f"attachment; filename={wav_filename}",
62
  "Content-Type": "audio/wav",
63
  }
64
- wav_path = clone_voice(
65
- user_voice=voice, user_text=text, user_lang=language)
66
 
67
  with open(wav_path, "rb") as audio_file:
68
  audio_content = audio_file.read()
 
1
  from fastapi import FastAPI, Body, File, Form, UploadFile, Response, Request
2
  from fastapi.responses import FileResponse, StreamingResponse
3
  from fastapi.staticfiles import StaticFiles
4
+ from model import text_to_speech
5
  import os
6
  from enum import Enum
7
  import uvicorn
 
39
 
40
  @app.post("/tts")
41
  async def tts(
 
42
  voice: DefaultVoice = Form("新闻女士"),
43
+ text: str = Form(..., description="转换文本")
 
44
  ):
45
+ wav_path = text_to_speech(voice=voice, text=text)
 
 
 
 
 
 
 
 
 
 
 
 
46
  headers = {
47
+ "Content-Disposition": f"attachment; filename={wav_path}",
48
  "Content-Type": "audio/wav",
49
  }
 
 
50
 
51
  with open(wav_path, "rb") as audio_file:
52
  audio_content = audio_file.read()
model.py CHANGED
@@ -16,6 +16,7 @@ from transformers.pipelines.audio_utils import ffmpeg_read
16
  import os,re,sys,LangSegment,librosa,pdb,torch,pytz,random
17
  from transformers import AutoModelForMaskedLM, AutoTokenizer
18
  from AR.models.t2s_lightning_module import Text2SemanticLightningModule
 
19
  from env import *
20
 
21
  import logging
@@ -491,7 +492,7 @@ def get_tts_wav(ref_wav_path, prompt_text, prompt_language, text, text_language,
491
  audio_data = (np.concatenate(audio_opt, 0) * 32768).astype(np.int16)
492
 
493
  audio_data = (audio_data.astype(np.float32) * volume_scale).astype(np.int16)
494
- output_wav = "output_audio.wav"
495
  sf.write(output_wav, audio_data, hps.data.sampling_rate)
496
  endTime=timer()
497
  tprint(f'🆗TTS COMPLETE,{round(endTime-startTime,4)}s')
@@ -750,3 +751,24 @@ def clone_voice(user_voice,user_text,user_lang):
750
  return output_wav
751
 
752
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  import os,re,sys,LangSegment,librosa,pdb,torch,pytz,random
17
  from transformers import AutoModelForMaskedLM, AutoTokenizer
18
  from AR.models.t2s_lightning_module import Text2SemanticLightningModule
19
+ from enum import Enum
20
  from env import *
21
 
22
  import logging
 
492
  audio_data = (np.concatenate(audio_opt, 0) * 32768).astype(np.int16)
493
 
494
  audio_data = (audio_data.astype(np.float32) * volume_scale).astype(np.int16)
495
+ output_wav = f"{str(int(ttime.time()))}-{random.randint(1000, 9999)}.wav"
496
  sf.write(output_wav, audio_data, hps.data.sampling_rate)
497
  endTime=timer()
498
  tprint(f'🆗TTS COMPLETE,{round(endTime-startTime,4)}s')
 
751
  return output_wav
752
 
753
 
754
class Language(str, Enum):
    """Closed set of languages the TTS pipeline distinguishes.

    Inherits from ``str`` so members compare equal to their plain string
    values (e.g. the output of the language detector).
    """

    en = "English"
    zh = "中文"
757
+
758
+
759
class DefaultVoice(str, Enum):
    """Built-in reference voices shipped with the service.

    Member values are the display names; they double as the basename of
    the reference clip under ``static/{en,zh}/``.
    """

    voice1 = "新闻小说主播-女士"
    voice2 = "温柔女士"
762
+
763
+
764
def text_to_speech(voice, text):
    """Synthesize *text* using the named reference *voice*.

    Detects the text's language, resolves the voice name to the matching
    reference clip under ``static/en`` or ``static/zh``, and delegates to
    :func:`clone_voice`.

    Returns the path of the generated wav file.
    """
    language = lang_detector(text)
    # Pick the clip from the language-specific asset folder; anything the
    # detector does not classify as English falls back to the zh assets.
    if language == Language.en.value:
        voice_clip = f"static/en/{voice}.mp3"
    else:
        voice_clip = f"static/zh/{voice}.mp3"
    return clone_voice(
        user_voice=voice_clip, user_text=text, user_lang=language)
774
+