Spaces:
Running
Running
Update ONNXVITS_infer.py
Browse files- ONNXVITS_infer.py +81 -130
ONNXVITS_infer.py
CHANGED
@@ -13,17 +13,18 @@ from torch.nn import Conv1d, ConvTranspose1d, Conv2d
|
|
13 |
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
|
14 |
from commons import init_weights, get_padding
|
15 |
|
|
|
16 |
class TextEncoder(nn.Module):
|
17 |
def __init__(self,
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
super().__init__()
|
28 |
self.n_vocab = n_vocab
|
29 |
self.out_channels = out_channels
|
@@ -34,12 +35,12 @@ class TextEncoder(nn.Module):
|
|
34 |
self.kernel_size = kernel_size
|
35 |
self.p_dropout = p_dropout
|
36 |
self.emotion_embedding = emotion_embedding
|
37 |
-
|
38 |
-
if self.n_vocab!=0:
|
39 |
self.emb = nn.Embedding(n_vocab, hidden_channels)
|
40 |
if emotion_embedding:
|
41 |
self.emo_proj = nn.Linear(1024, hidden_channels)
|
42 |
-
nn.init.normal_(self.emb.weight, 0.0, hidden_channels
|
43 |
|
44 |
self.encoder = attentions.Encoder(
|
45 |
hidden_channels,
|
@@ -48,15 +49,15 @@ class TextEncoder(nn.Module):
|
|
48 |
n_layers,
|
49 |
kernel_size,
|
50 |
p_dropout)
|
51 |
-
self.proj= nn.Conv1d(hidden_channels, out_channels * 2, 1)
|
52 |
|
53 |
def forward(self, x, x_lengths, emotion_embedding=None):
|
54 |
-
if self.n_vocab!=0:
|
55 |
-
x = self.emb(x) * math.sqrt(self.hidden_channels)
|
56 |
if emotion_embedding is not None:
|
57 |
print("emotion added")
|
58 |
x = x + self.emo_proj(emotion_embedding.unsqueeze(1))
|
59 |
-
x = torch.transpose(x, 1, -1)
|
60 |
x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
|
61 |
|
62 |
x = self.encoder(x * x_mask, x_mask)
|
@@ -65,15 +66,16 @@ class TextEncoder(nn.Module):
|
|
65 |
m, logs = torch.split(stats, self.out_channels, dim=1)
|
66 |
return x, m, logs, x_mask
|
67 |
|
|
|
68 |
class PosteriorEncoder(nn.Module):
|
69 |
def __init__(self,
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
super().__init__()
|
78 |
self.in_channels = in_channels
|
79 |
self.out_channels = out_channels
|
@@ -96,35 +98,36 @@ class PosteriorEncoder(nn.Module):
|
|
96 |
z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
|
97 |
return z, m, logs, x_mask
|
98 |
|
|
|
99 |
class SynthesizerTrn(models.SynthesizerTrn):
|
100 |
"""
|
101 |
Synthesizer for Training
|
102 |
"""
|
103 |
|
104 |
-
def __init__(self,
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
super().__init__(
|
128 |
n_vocab,
|
129 |
spec_channels,
|
130 |
segment_size,
|
@@ -135,11 +138,11 @@ class SynthesizerTrn(models.SynthesizerTrn):
|
|
135 |
n_layers,
|
136 |
kernel_size,
|
137 |
p_dropout,
|
138 |
-
resblock,
|
139 |
-
resblock_kernel_sizes,
|
140 |
-
resblock_dilation_sizes,
|
141 |
-
upsample_rates,
|
142 |
-
upsample_initial_channel,
|
143 |
upsample_kernel_sizes,
|
144 |
n_speakers=n_speakers,
|
145 |
gin_channels=gin_channels,
|
@@ -147,27 +150,28 @@ class SynthesizerTrn(models.SynthesizerTrn):
|
|
147 |
**kwargs
|
148 |
)
|
149 |
self.enc_p = TextEncoder(n_vocab,
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
|
159 |
|
160 |
-
def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None,
|
|
|
161 |
from ONNXVITS_utils import runonnx
|
162 |
-
|
163 |
-
|
164 |
|
165 |
if self.n_speakers > 0:
|
166 |
-
g = self.emb_g(sid).unsqueeze(-1)
|
167 |
else:
|
168 |
g = None
|
169 |
|
170 |
-
#logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
|
171 |
logw = runonnx("ONNX_net/dp.onnx", x=x.numpy(), x_mask=x_mask.numpy(), g=g.numpy())
|
172 |
logw = torch.from_numpy(logw[0])
|
173 |
|
@@ -178,26 +182,27 @@ class SynthesizerTrn(models.SynthesizerTrn):
|
|
178 |
attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
|
179 |
attn = commons.generate_path(w_ceil, attn_mask)
|
180 |
|
181 |
-
m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2)
|
182 |
-
logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1,
|
|
|
183 |
|
184 |
z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
|
185 |
-
|
186 |
-
#z = self.flow(z_p, y_mask, g=g, reverse=True)
|
187 |
z = runonnx("ONNX_net/flow.onnx", z_p=z_p.numpy(), y_mask=y_mask.numpy(), g=g.numpy())
|
188 |
z = torch.from_numpy(z[0])
|
189 |
|
190 |
-
#o = self.dec((z * y_mask)[:,:,:max_len], g=g)
|
191 |
-
o = runonnx("ONNX_net/dec.onnx", z_in=(z * y_mask)[
|
192 |
o = torch.from_numpy(o[0])
|
193 |
|
194 |
return o, attn, y_mask, (z, z_p, m_p, logs_p)
|
195 |
|
196 |
def predict_duration(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None,
|
197 |
-
|
198 |
from ONNXVITS_utils import runonnx
|
199 |
|
200 |
-
#x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
|
201 |
x, m_p, logs_p, x_mask = runonnx("ONNX_net/enc_p.onnx", x=x.numpy(), x_lengths=x_lengths.numpy())
|
202 |
x = torch.from_numpy(x)
|
203 |
m_p = torch.from_numpy(m_p)
|
@@ -205,68 +210,14 @@ class SynthesizerTrn(models.SynthesizerTrn):
|
|
205 |
x_mask = torch.from_numpy(x_mask)
|
206 |
|
207 |
if self.n_speakers > 0:
|
208 |
-
g = self.emb_g(sid).unsqueeze(-1)
|
209 |
else:
|
210 |
g = None
|
211 |
|
212 |
-
#logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
|
213 |
logw = runonnx("ONNX_net/dp.onnx", x=x.numpy(), x_mask=x_mask.numpy(), g=g.numpy())
|
214 |
logw = torch.from_numpy(logw[0])
|
215 |
|
216 |
w = torch.exp(logw) * x_mask * length_scale
|
217 |
w_ceil = torch.ceil(w)
|
218 |
-
return list(w_ceil.squeeze())
|
219 |
-
|
220 |
-
def infer_with_duration(self, x, x_lengths, w_ceil, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None,
|
221 |
-
emotion_embedding=None):
|
222 |
-
from ONNXVITS_utils import runonnx
|
223 |
-
|
224 |
-
#x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
|
225 |
-
x, m_p, logs_p, x_mask = runonnx("ONNX_net/enc_p.onnx", x=x.numpy(), x_lengths=x_lengths.numpy())
|
226 |
-
x = torch.from_numpy(x)
|
227 |
-
m_p = torch.from_numpy(m_p)
|
228 |
-
logs_p = torch.from_numpy(logs_p)
|
229 |
-
x_mask = torch.from_numpy(x_mask)
|
230 |
-
|
231 |
-
if self.n_speakers > 0:
|
232 |
-
g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
|
233 |
-
else:
|
234 |
-
g = None
|
235 |
-
assert len(w_ceil) == x.shape[2]
|
236 |
-
w_ceil = torch.FloatTensor(w_ceil).reshape(1, 1, -1)
|
237 |
-
y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
|
238 |
-
y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype)
|
239 |
-
attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
|
240 |
-
attn = commons.generate_path(w_ceil, attn_mask)
|
241 |
-
|
242 |
-
m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t']
|
243 |
-
logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t']
|
244 |
-
|
245 |
-
z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
|
246 |
-
|
247 |
-
#z = self.flow(z_p, y_mask, g=g, reverse=True)
|
248 |
-
z = runonnx("ONNX_net/flow.onnx", z_p=z_p.numpy(), y_mask=y_mask.numpy(), g=g.numpy())
|
249 |
-
z = torch.from_numpy(z[0])
|
250 |
-
|
251 |
-
#o = self.dec((z * y_mask)[:,:,:max_len], g=g)
|
252 |
-
o = runonnx("ONNX_net/dec.onnx", z_in=(z * y_mask)[:,:,:max_len].numpy(), g=g.numpy())
|
253 |
-
o = torch.from_numpy(o[0])
|
254 |
-
|
255 |
-
return o, attn, y_mask, (z, z_p, m_p, logs_p)
|
256 |
-
|
257 |
-
def voice_conversion(self, y, y_lengths, sid_src, sid_tgt):
|
258 |
-
from ONNXVITS_utils import runonnx
|
259 |
-
assert self.n_speakers > 0, "n_speakers have to be larger than 0."
|
260 |
-
g_src = self.emb_g(sid_src).unsqueeze(-1)
|
261 |
-
g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
|
262 |
-
z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src)
|
263 |
-
# z_p = self.flow(z, y_mask, g=g_src)
|
264 |
-
z_p = runonnx("ONNX_net/flow.onnx", z_p=z.numpy(), y_mask=y_mask.numpy(), g=g_src.numpy())
|
265 |
-
z_p = torch.from_numpy(z_p[0])
|
266 |
-
# z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
|
267 |
-
z_hat = runonnx("ONNX_net/flow.onnx", z_p=z_p.numpy(), y_mask=y_mask.numpy(), g=g_tgt.numpy())
|
268 |
-
z_hat = torch.from_numpy(z_hat[0])
|
269 |
-
# o_hat = self.dec(z_hat * y_mask, g=g_tgt)
|
270 |
-
o_hat = runonnx("ONNX_net/dec.onnx", z_in=(z_hat * y_mask).numpy(), g=g_tgt.numpy())
|
271 |
-
o_hat = torch.from_numpy(o_hat[0])
|
272 |
-
return o_hat, y_mask, (z, z_p, z_hat)
|
|
|
13 |
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
|
14 |
from commons import init_weights, get_padding
|
15 |
|
16 |
+
|
17 |
class TextEncoder(nn.Module):
|
18 |
def __init__(self,
|
19 |
+
n_vocab,
|
20 |
+
out_channels,
|
21 |
+
hidden_channels,
|
22 |
+
filter_channels,
|
23 |
+
n_heads,
|
24 |
+
n_layers,
|
25 |
+
kernel_size,
|
26 |
+
p_dropout,
|
27 |
+
emotion_embedding):
|
28 |
super().__init__()
|
29 |
self.n_vocab = n_vocab
|
30 |
self.out_channels = out_channels
|
|
|
35 |
self.kernel_size = kernel_size
|
36 |
self.p_dropout = p_dropout
|
37 |
self.emotion_embedding = emotion_embedding
|
38 |
+
|
39 |
+
if self.n_vocab != 0:
|
40 |
self.emb = nn.Embedding(n_vocab, hidden_channels)
|
41 |
if emotion_embedding:
|
42 |
self.emo_proj = nn.Linear(1024, hidden_channels)
|
43 |
+
nn.init.normal_(self.emb.weight, 0.0, hidden_channels ** -0.5)
|
44 |
|
45 |
self.encoder = attentions.Encoder(
|
46 |
hidden_channels,
|
|
|
49 |
n_layers,
|
50 |
kernel_size,
|
51 |
p_dropout)
|
52 |
+
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
|
53 |
|
54 |
def forward(self, x, x_lengths, emotion_embedding=None):
|
55 |
+
if self.n_vocab != 0:
|
56 |
+
x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h]
|
57 |
if emotion_embedding is not None:
|
58 |
print("emotion added")
|
59 |
x = x + self.emo_proj(emotion_embedding.unsqueeze(1))
|
60 |
+
x = torch.transpose(x, 1, -1) # [b, h, t]
|
61 |
x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
|
62 |
|
63 |
x = self.encoder(x * x_mask, x_mask)
|
|
|
66 |
m, logs = torch.split(stats, self.out_channels, dim=1)
|
67 |
return x, m, logs, x_mask
|
68 |
|
69 |
+
|
70 |
class PosteriorEncoder(nn.Module):
|
71 |
def __init__(self,
|
72 |
+
in_channels,
|
73 |
+
out_channels,
|
74 |
+
hidden_channels,
|
75 |
+
kernel_size,
|
76 |
+
dilation_rate,
|
77 |
+
n_layers,
|
78 |
+
gin_channels=0):
|
79 |
super().__init__()
|
80 |
self.in_channels = in_channels
|
81 |
self.out_channels = out_channels
|
|
|
98 |
z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
|
99 |
return z, m, logs, x_mask
|
100 |
|
101 |
+
|
102 |
class SynthesizerTrn(models.SynthesizerTrn):
|
103 |
"""
|
104 |
Synthesizer for Training
|
105 |
"""
|
106 |
|
107 |
+
def __init__(self,
|
108 |
+
n_vocab,
|
109 |
+
spec_channels,
|
110 |
+
segment_size,
|
111 |
+
inter_channels,
|
112 |
+
hidden_channels,
|
113 |
+
filter_channels,
|
114 |
+
n_heads,
|
115 |
+
n_layers,
|
116 |
+
kernel_size,
|
117 |
+
p_dropout,
|
118 |
+
resblock,
|
119 |
+
resblock_kernel_sizes,
|
120 |
+
resblock_dilation_sizes,
|
121 |
+
upsample_rates,
|
122 |
+
upsample_initial_channel,
|
123 |
+
upsample_kernel_sizes,
|
124 |
+
n_speakers=0,
|
125 |
+
gin_channels=0,
|
126 |
+
use_sdp=True,
|
127 |
+
emotion_embedding=False,
|
128 |
+
**kwargs):
|
129 |
+
|
130 |
+
super().__init__(
|
131 |
n_vocab,
|
132 |
spec_channels,
|
133 |
segment_size,
|
|
|
138 |
n_layers,
|
139 |
kernel_size,
|
140 |
p_dropout,
|
141 |
+
resblock,
|
142 |
+
resblock_kernel_sizes,
|
143 |
+
resblock_dilation_sizes,
|
144 |
+
upsample_rates,
|
145 |
+
upsample_initial_channel,
|
146 |
upsample_kernel_sizes,
|
147 |
n_speakers=n_speakers,
|
148 |
gin_channels=gin_channels,
|
|
|
150 |
**kwargs
|
151 |
)
|
152 |
self.enc_p = TextEncoder(n_vocab,
|
153 |
+
inter_channels,
|
154 |
+
hidden_channels,
|
155 |
+
filter_channels,
|
156 |
+
n_heads,
|
157 |
+
n_layers,
|
158 |
+
kernel_size,
|
159 |
+
p_dropout,
|
160 |
+
emotion_embedding)
|
161 |
self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
|
162 |
|
163 |
+
def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None,
|
164 |
+
emotion_embedding=None):
|
165 |
from ONNXVITS_utils import runonnx
|
166 |
+
with torch.no_grad():
|
167 |
+
x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, emotion_embedding)
|
168 |
|
169 |
if self.n_speakers > 0:
|
170 |
+
g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
|
171 |
else:
|
172 |
g = None
|
173 |
|
174 |
+
# logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
|
175 |
logw = runonnx("ONNX_net/dp.onnx", x=x.numpy(), x_mask=x_mask.numpy(), g=g.numpy())
|
176 |
logw = torch.from_numpy(logw[0])
|
177 |
|
|
|
182 |
attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
|
183 |
attn = commons.generate_path(w_ceil, attn_mask)
|
184 |
|
185 |
+
m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t']
|
186 |
+
logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1,
|
187 |
+
2) # [b, t', t], [b, t, d] -> [b, d, t']
|
188 |
|
189 |
z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
|
190 |
+
|
191 |
+
# z = self.flow(z_p, y_mask, g=g, reverse=True)
|
192 |
z = runonnx("ONNX_net/flow.onnx", z_p=z_p.numpy(), y_mask=y_mask.numpy(), g=g.numpy())
|
193 |
z = torch.from_numpy(z[0])
|
194 |
|
195 |
+
# o = self.dec((z * y_mask)[:,:,:max_len], g=g)
|
196 |
+
o = runonnx("ONNX_net/dec.onnx", z_in=(z * y_mask)[:, :, :max_len].numpy(), g=g.numpy())
|
197 |
o = torch.from_numpy(o[0])
|
198 |
|
199 |
return o, attn, y_mask, (z, z_p, m_p, logs_p)
|
200 |
|
201 |
def predict_duration(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None,
|
202 |
+
emotion_embedding=None):
|
203 |
from ONNXVITS_utils import runonnx
|
204 |
|
205 |
+
# x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
|
206 |
x, m_p, logs_p, x_mask = runonnx("ONNX_net/enc_p.onnx", x=x.numpy(), x_lengths=x_lengths.numpy())
|
207 |
x = torch.from_numpy(x)
|
208 |
m_p = torch.from_numpy(m_p)
|
|
|
210 |
x_mask = torch.from_numpy(x_mask)
|
211 |
|
212 |
if self.n_speakers > 0:
|
213 |
+
g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
|
214 |
else:
|
215 |
g = None
|
216 |
|
217 |
+
# logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
|
218 |
logw = runonnx("ONNX_net/dp.onnx", x=x.numpy(), x_mask=x_mask.numpy(), g=g.numpy())
|
219 |
logw = torch.from_numpy(logw[0])
|
220 |
|
221 |
w = torch.exp(logw) * x_mask * length_scale
|
222 |
w_ceil = torch.ceil(w)
|
223 |
+
return list(w_ceil.squeeze())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|