
Commit c88e891

Merge branch 'openvpi:master' into master
2 parents bd7f94f + fd2a9c9 commit c88e891

5 files changed

Lines changed: 638 additions & 18 deletions


acoustic/dfs_models.py

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+from usr.diff.shallow_diffusion_tts import GaussianDiffusion
+
+import torch
+
+device = 'cpu'
+
+
+class GaussianDiffusionFS(GaussianDiffusion):
+    def forward(self, txt_tokens, mel2ph=None, spk_embed=None,
+                ref_mels=None, f0=None, uv=None, energy=None, infer=False, **kwargs):
+        ret = self.fs2(txt_tokens, mel2ph, spk_embed, ref_mels, f0, uv, energy,
+                       skip_decoder=True, infer=infer, **kwargs)
+        return ret['decoder_inp']
+
+
+class GaussianDiffusionDenoise(GaussianDiffusion):
+    def forward(self, x, t, cond):
+        x = self.p_sample(x, t, cond)
+        return [x, cond]
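
Note: these two wrappers split GaussianDiffusion into ONNX-exportable halves. GaussianDiffusionFS runs the FastSpeech2 front end (skip_decoder=True) and returns decoder_inp, the conditioning sequence for the diffusion decoder; GaussianDiffusionDenoise wraps a single p_sample denoising step and echoes cond back so it can be re-fed on the next step. A minimal sketch of how the halves chain at inference time; the helper name, the standalone call signature, and the K=100 step count are illustrative assumptions (cf. the commented-out lines in onnx_export_singer.py below), not part of this commit:

import torch

def chain_fs_and_denoise(fs_model, denoise_model, txt_tokens, K=100):
    # Front end: [B, T_txt] tokens -> decoder_inp, transposed to [B, H, frames].
    cond = fs_model(txt_tokens).transpose(1, 2)
    # Start from Gaussian noise shaped like a normalized mel: [B, 1, mel_bins, frames].
    x = torch.randn(cond.shape[0], 1, denoise_model.mel_bins, cond.shape[2])
    # One p_sample call per diffusion step, from t = K-1 down to 0.
    for t in reversed(range(K)):
        x, cond = denoise_model(x, torch.full((1,), t, dtype=torch.long), cond)
    return x  # still in the normalized spec domain (spec_min/spec_max)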

onnx_export_singer.py

Lines changed: 106 additions & 18 deletions
@@ -9,7 +9,10 @@
 from utils.audio import save_wav
 from utils.hparams import set_hparams, hparams
 
+import acoustic.dfs_models as adm
+
 import torch
+import numpy as np
 
 from utils.text_encoder import TokenTextEncoder
 from usr.diffsinger_task import DIFF_DECODERS
@@ -26,19 +29,30 @@
 ]
 
 
-class GaussianDiffusionWrap(GaussianDiffusion):
-    def forward(self, txt_tokens, mel2ph,
+class GaussianDiffusionWrap(adm.GaussianDiffusionFS):
+    def forward(self, txt_tokens,
                 # Wrapped Arguments
                 spk_id,
                 pitch_midi,
                 midi_dur,
                 is_slur,
+                mel2ph,
                 ):
 
+        print(f"txt_tokens: {txt_tokens}")
+        print(f"spk_id: {spk_id}")
+        print(f"pitch_midi: {pitch_midi}")
+        print(f"midi_dur: {midi_dur}")
+        print(f"is_slur: {is_slur}")
+        print(f"mel2ph: {mel2ph}")
+
+        if (mel2ph[0].item() == 0):
+            mel2ph = None
+        else:
+            mel2ph = mel2ph[1].item()
+
         if (torch.numel(txt_tokens) == 0):
             txt_tokens = None
-        if (torch.numel(mel2ph) == 0):
-            mel2ph = None
         if (torch.numel(spk_id) == 0):
             spk_id = None
         if (torch.numel(pitch_midi) == 0):
@@ -57,7 +71,8 @@ class DFSInferWrapped(e2e.DiffSingerE2EInfer):
     def build_model(self):
         model = GaussianDiffusionWrap(
             phone_encoder=self.ph_encoder,
-            out_dims=hparams['audio_num_mel_bins'], denoise_fn=DIFF_DECODERS[hparams['diff_decoder_type']](hparams),
+            out_dims=hparams['audio_num_mel_bins'], denoise_fn=DIFF_DECODERS[hparams['diff_decoder_type']](
+                hparams),
             timesteps=hparams['timesteps'],
             K_step=hparams['K_step'],
             loss_type=hparams['diff_loss_type'],
@@ -71,9 +86,33 @@ def build_model(self):
             self.pe = PitchExtractor().to(self.device)
             load_ckpt(self.pe, hparams['pe_ckpt'], 'model', strict=True)
             self.pe.eval()
-
+
         return model
 
+
+class DFSInferWrapped2(e2e.DiffSingerE2EInfer):
+    def build_model(self):
+        model = adm.GaussianDiffusionDenoise(
+            phone_encoder=self.ph_encoder,
+            out_dims=hparams['audio_num_mel_bins'], denoise_fn=DIFF_DECODERS[hparams['diff_decoder_type']](
+                hparams),
+            timesteps=hparams['timesteps'],
+            K_step=hparams['K_step'],
+            loss_type=hparams['diff_loss_type'],
+            spec_min=hparams['spec_min'], spec_max=hparams['spec_max'],
+        )
+
+        model.eval()
+        load_ckpt(model, hparams['work_dir'], 'model')
+
+        if hparams.get('pe_enable') is not None and hparams['pe_enable']:
+            self.pe = PitchExtractor().to(self.device)
+            load_ckpt(self.pe, hparams['pe_ckpt'], 'model', strict=True)
+            self.pe.eval()
+
+        return model
+
+
 if __name__ == '__main__':
 
     inp = {
@@ -90,25 +129,43 @@ def build_model(self):
     infer_ins = DFSInferWrapped(hparams)
     infer_ins.model.to(dev)
 
+    infer_ins2 = DFSInferWrapped2(hparams)
+    infer_ins2.model.to(dev)
+
+    adm.device = dev
+
     with torch.no_grad():
-        inp = infer_ins.preprocess_input(inp, input_type=inp['input_type'] if inp.get('input_type') else 'word')
+        inp = infer_ins.preprocess_input(
+            inp, input_type=inp['input_type'] if inp.get('input_type') else 'word')
         sample = infer_ins.input_to_batch(inp)
         txt_tokens = sample['txt_tokens']  # [B, T_t]
         spk_id = sample.get('spk_ids')
 
+        print(txt_tokens)
+        print(spk_id)
+        print(sample['pitch_midi'])
+        print(sample['midi_dur'])
+        print(sample['is_slur'])
+        print(sample['mel2ph'])
+
         torch.onnx.export(
             infer_ins.model,
             (
                 txt_tokens.to(dev),
-                {
-                    'spk_id': spk_id.to(dev),
-                    'pitch_midi': sample['pitch_midi'].to(dev),
-                    'midi_dur': sample['midi_dur'].to(dev),
-                    'is_slur': spk_id.to(dev),
-                    'mel2ph': spk_id.to(dev)
-                }
+                # {
+                #     'spk_id': spk_id.to(dev),
+                #     'pitch_midi': sample['pitch_midi'].to(dev),
+                #     'midi_dur': sample['midi_dur'].to(dev),
+                #     'is_slur': spk_id.to(dev),
+                #     'mel2ph': spk_id.to(dev)
+                # }
+                spk_id.to(dev),
+                sample['pitch_midi'].to(dev),
+                sample['midi_dur'].to(dev),
+                sample['is_slur'].to(dev),
+                torch.from_numpy(np.array([0, 0]).astype(np.int64)).to(dev),
             ),
-            "singer.onnx",
+            "singer_fs.onnx",
             # verbose=True,
             input_names=["txt_tokens", "spk_id",
                          "pitch_midi", "midi_dur", "is_slur", "mel2ph"],
@@ -132,10 +189,41 @@ def build_model(self):
                 "is_slur": {
                     0: "a",
                     1: "b",
+                }
+            },
+            opset_version=11
+        )
+
+        # fs_res = infer_ins.model(txt_tokens, spk_id=spk_id, ref_mels=None, infer=True,
+        #                          pitch_midi=sample['pitch_midi'], midi_dur=sample['midi_dur'],
+        #                          is_slur=sample['is_slur'], mel2ph=sample['mel2ph'])
+        # cond = fs_res.transpose(1, 2)
+        # shape = (cond.shape[0], 1, infer_ins.model.mel_bins, cond.shape[2])
+        # x = torch.randn(shape, device=dev)
+
+        torch.onnx.export(
+            infer_ins2.model,
+            (
+                torch.rand(1, 1, 80, 967).to(dev),
+                torch.full((1,), 1, dtype=torch.long).to(dev),
+                torch.rand(1, 256, 967).to(dev),
+            ),
+            "singer_denoise.onnx",
+            input_names=[
+                "x",
+                "t",
+                "cond",
+            ],
+            dynamic_axes={
+                "x": {
+                    0: "batch_size",
+                    2: "num_mel_bin",
+                    3: "frames",
                 },
-                "mel2ph": {
-                    0: "a",
-                    1: "b",
+                "cond": {
+                    0: "batch_size",
+                    1: "what",
+                    2: "frames",
                 }
             },
             opset_version=11
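
Note: the export now produces two graphs: singer_fs.onnx (the text/MIDI front end that yields the diffusion condition) and singer_denoise.onnx (one denoising step). A hedged onnxruntime sketch of driving the pair end to end; the dummy shapes and dtypes, the [0, 0] mel2ph sentinel, and the 100-step loop are illustrative assumptions, not part of this commit (real inputs come from input_to_batch as in the script):

import numpy as np
import onnxruntime as ort

fs = ort.InferenceSession("singer_fs.onnx")
denoise = ort.InferenceSession("singer_denoise.onnx")

T = 14  # token count, placeholder
fs_inputs = {
    'txt_tokens': np.zeros((1, T), dtype=np.int64),
    'spk_id': np.zeros((1,), dtype=np.int64),
    'pitch_midi': np.zeros((1, T), dtype=np.int64),
    'midi_dur': np.zeros((1, T), dtype=np.float32),
    'is_slur': np.zeros((1, T), dtype=np.int64),
    'mel2ph': np.array([0, 0], dtype=np.int64),  # leading 0 selects mel2ph=None in the wrapper
}

# Front end: decoder_inp, transposed to [B, H, frames] as the commented-out lines above do.
cond = fs.run(None, fs_inputs)[0].transpose(0, 2, 1)

# Iterative denoising: one p_sample step per call, t = 99 .. 0 (assumed K_step).
x = np.random.randn(1, 1, 80, cond.shape[2]).astype(np.float32)
for t in reversed(range(100)):
    x, cond = denoise.run(None, {
        'x': x,
        't': np.full((1,), t, dtype=np.int64),
        'cond': cond,
    })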

onnx_test_pe_gpu.py

Lines changed: 117 additions & 0 deletions
@@ -0,0 +1,117 @@
+# coding=utf8
+
+import os
+import sys
+import inference.svs.ds_e2e as e2e
+from utils.audio import save_wav
+from utils.hparams import set_hparams, hparams
+
+import numpy as np
+
+import torch
+import onnxruntime as ort
+
+root_dir = os.path.dirname(os.path.abspath(__file__))
+os.environ['PYTHONPATH'] = f'"{root_dir}"'
+
+sys.argv = [
+    f'{root_dir}/inference/svs/ds_e2e.py',
+    '--config',
+    f'{root_dir}/usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml',
+    '--exp_name',
+    '0228_opencpop_ds100_rel'
+]
+
+
+def to_numpy(tensor):
+    return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()
+
+
+class TestHifiganInfer(e2e.DiffSingerE2EInfer):
+    def __init__(self, hparams, device=None):
+        super().__init__(hparams, device)
+
+        self.pe2 = ort.InferenceSession("xiaoma_pe.onnx", providers=["CUDAExecutionProvider"])
+        self.vocoder2 = ort.InferenceSession("hifigan.onnx", providers=["CUDAExecutionProvider"])
+
+    def run_vocoder(self, c, **kwargs):
+        c = c.transpose(2, 1)  # [B, 80, T]
+        f0 = kwargs.get('f0')  # [B, T]
+
+        if f0 is not None and hparams.get('use_nsf'):
+            ort_inputs = {
+                'x': to_numpy(c),
+                'f0': to_numpy(f0)
+            }
+        else:
+            ort_inputs = {
+                'x': to_numpy(c),
+                'f0': {}
+            }
+        # [T]
+
+        ort_out = self.vocoder2.run(None, ort_inputs)
+        y = torch.from_numpy(ort_out[0]).to(self.device)
+
+        return y[None]
+
+    def forward_model(self, inp):
+        sample = self.input_to_batch(inp)
+        txt_tokens = sample['txt_tokens']  # [B, T_t]
+        spk_id = sample.get('spk_ids')
+
+        print(txt_tokens.shape)
+        print(spk_id.shape)
+        print(sample['pitch_midi'].shape)
+        print(sample['midi_dur'].shape)
+        if (sample['is_slur'] is not None):
+            print(sample['is_slur'].shape)
+        if (sample['mel2ph'] is not None):
+            print(sample['mel2ph'].shape)
+
+        with torch.no_grad():
+            output = self.model(txt_tokens, spk_id=spk_id, ref_mels=None, infer=True,
+                                pitch_midi=sample['pitch_midi'], midi_dur=sample['midi_dur'],
+                                is_slur=sample['is_slur'], mel2ph=sample['mel2ph'])
+
+            mel_out = output['mel_out']  # [B, T, 80]
+
+            if hparams.get('pe_enable') is not None and hparams['pe_enable']:
+                pe2_res = self.pe2.run(None,
+                                       {
+                                           'mel_input': to_numpy(mel_out)
+                                       }
+                                       )
+
+                # pe predict from Pred mel
+                f0_pred = torch.from_numpy(pe2_res[1])
+
+            else:
+                f0_pred = output['f0_denorm']
+
+            # Run Vocoder
+            wav_out = self.run_vocoder(mel_out, f0=f0_pred)
+            wav_out = wav_out.cpu().numpy()
+            return wav_out[0]
+
+
+if __name__ == '__main__':
+    c = {
+        'text': '小酒窝长睫毛AP是你最美的记号',
+        'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
+        'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340',
+        'input_type': 'word'
+    }  # user input: Chinese characters
+
+    target = "./infer_out/onnx_test_res.wav"
+
+    set_hparams(print_hparams=False)
+    infer_ins = TestHifiganInfer(hparams)
+
+    out = infer_ins.infer_once(c)
+    os.makedirs(os.path.dirname(target), exist_ok=True)
+    print(f'| save audio: {target}')
+    save_wav(out, target, hparams['audio_sample_rate'])
+
+    print(infer_ins.pe)
+    print("OK")
