Skip to content

Commit fd2a9c9

Browse files
committed
add onnx gpu
1 parent 19355a9 commit fd2a9c9

4 files changed

Lines changed: 324 additions & 5 deletions

File tree

onnx_export_singer.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -123,8 +123,6 @@ def build_model(self):
123123
} # user input: Chinese characters
124124

125125
set_hparams(print_hparams=False)
126-
spec_min= torch.FloatTensor(hparams['spec_min'])[None, None, :hparams['keep_bins']]
127-
spec_max= torch.FloatTensor(hparams['spec_max'])[None, None, :hparams['keep_bins']]
128126

129127
dev = 'cuda'
130128

onnx_test_pe_gpu.py

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
# coding=utf8
2+
3+
import os
4+
import sys
5+
import inference.svs.ds_e2e as e2e
6+
from utils.audio import save_wav
7+
from utils.hparams import set_hparams, hparams
8+
9+
import numpy as np
10+
11+
import torch
12+
import onnxruntime as ort
13+
14+
# Absolute directory that contains this script; the config path below is
# built from it.
root_dir = os.path.dirname(os.path.abspath(__file__))

# Export the project root through PYTHONPATH. The previous value wrapped the
# path in literal double quotes; environment values are not shell-parsed, so
# the quotes became part of the path and the entry pointed nowhere.
os.environ['PYTHONPATH'] = root_dir

# Overwrite the process command line with a fixed configuration.
# NOTE(review): presumably set_hparams() reads these from sys.argv - confirm
# against utils.hparams.
sys.argv = [
    f'{root_dir}/inference/svs/ds_e2e.py',
    '--config',
    f'{root_dir}/usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml',
    '--exp_name',
    '0228_opencpop_ds100_rel',
]
24+
25+
26+
def to_numpy(tensor):
    """Return the data of a torch tensor as a NumPy array on the host.

    Args:
        tensor: A ``torch.Tensor`` on any device, with or without gradient
            tracking.

    Returns:
        A ``numpy.ndarray`` holding the tensor's values.
    """
    # detach() is valid on tensors that do not require grad as well, so the
    # former requires_grad branch was unnecessary.
    return tensor.detach().cpu().numpy()
28+
29+
30+
class TestHifiganInfer(e2e.DiffSingerE2EInfer):
    """DiffSinger end-to-end inferer with ONNX Runtime pitch extractor and vocoder.

    The acoustic model is still ``self.model`` as provided by the base class;
    only pitch extraction (``self.pe2``) and vocoding (``self.vocoder2``) are
    executed through ONNX Runtime sessions requested on the CUDA provider.
    """

    def __init__(self, hparams, device=None):
        """Initialise the base inferer, then open the two ONNX sessions.

        Args:
            hparams: Hyper-parameter mapping forwarded to the base class.
            device: Optional device forwarded to the base class.
        """
        super().__init__(hparams, device)

        # Both model files are resolved relative to the current working directory.
        self.pe2 = ort.InferenceSession("xiaoma_pe.onnx", providers=["CUDAExecutionProvider"])
        self.vocoder2 = ort.InferenceSession("hifigan.onnx", providers=["CUDAExecutionProvider"])

    def run_vocoder(self, c, **kwargs):
        """Run the ONNX vocoder on a mel tensor.

        Args:
            c: Mel tensor; axes 1 and 2 are swapped before it is fed as ``x``.
            **kwargs: ``f0`` (optional) is fed as well when it is given and
                ``hparams['use_nsf']`` is truthy.

        Returns:
            The first session output as a torch tensor on ``self.device`` with
            one extra leading axis.
        """
        c = c.transpose(2, 1)  # [B, 80, T]
        f0 = kwargs.get('f0')  # [B, T]

        ort_inputs = {'x': to_numpy(c)}
        if f0 is not None and hparams.get('use_nsf'):
            ort_inputs['f0'] = to_numpy(f0)
        # Otherwise 'f0' is left out of the feed. The previous code passed an
        # empty dict for it, which ONNX Runtime cannot accept as an input value.

        ort_out = self.vocoder2.run(None, ort_inputs)
        y = torch.from_numpy(ort_out[0]).to(self.device)

        return y[None]

    def forward_model(self, inp):
        """Synthesise a waveform for one preprocessed input item.

        Args:
            inp: Item accepted by ``self.input_to_batch``.

        Returns:
            ``numpy.ndarray``: element 0 of the vocoder output.
        """
        sample = self.input_to_batch(inp)
        txt_tokens = sample['txt_tokens']  # [B, T_t]
        spk_id = sample.get('spk_ids')

        # Debug output: shapes of the batch entries that are present.
        print(txt_tokens.shape)
        if spk_id is not None:  # .get() may return None; do not crash on a debug print
            print(spk_id.shape)
        print(sample['pitch_midi'].shape)
        print(sample['midi_dur'].shape)
        for key in ('is_slur', 'mel2ph'):
            if sample[key] is not None:
                print(sample[key].shape)

        with torch.no_grad():
            output = self.model(txt_tokens, spk_id=spk_id, ref_mels=None, infer=True,
                                pitch_midi=sample['pitch_midi'], midi_dur=sample['midi_dur'],
                                is_slur=sample['is_slur'], mel2ph=sample['mel2ph'])

        mel_out = output['mel_out']  # [B, T,80]

        if hparams.get('pe_enable'):
            # Pitch is predicted from the generated mel; the second session
            # output is the one used as f0.
            pe2_res = self.pe2.run(None, {'mel_input': to_numpy(mel_out)})
            f0_pred = torch.from_numpy(pe2_res[1])
        else:
            f0_pred = output['f0_denorm']

        # Run Vocoder
        wav_out = self.run_vocoder(mel_out, f0=f0_pred)
        wav_out = wav_out.cpu().numpy()
        return wav_out[0]
96+
97+
98+
if __name__ == '__main__':
    # Demo request (user input: Chinese characters).
    request = dict(
        text='小酒窝长睫毛AP是你最美的记号',
        notes='C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
        notes_duration='0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340',
        input_type='word',
    )

    target = "./infer_out/onnx_test_res.wav"

    # Load hyper-parameters, then build the ONNX-backed inferer.
    set_hparams(print_hparams=False)
    infer_ins = TestHifiganInfer(hparams)

    # Synthesise, make sure the output directory exists, then write the wav.
    wav = infer_ins.infer_once(request)
    out_dir = os.path.dirname(target)
    os.makedirs(out_dir, exist_ok=True)
    print(f'| save audio: {target}')
    save_wav(wav, target, hparams['audio_sample_rate'])

    print(infer_ins.pe)
    print("OK")

onnx_test_singer.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,18 +65,17 @@ def __init__(self, hparams, device=None):
6565
self.vocoder2 = ort.InferenceSession("hifigan.onnx")
6666
print("load singer_fs")
6767
self.model2 = ort.InferenceSession("singer_fs.onnx")
68-
6968
ips = self.model2.get_inputs()
7069
print(len(ips))
7170
for i in range(0, len(ips)):
72-
print(ips[i].name)
71+
print(f'{i}. {ips[i].name}')
7372

7473
print("load singer_denoise")
7574
self.model3 = ort.InferenceSession("singer_denoise.onnx")
7675
ips = self.model3.get_inputs()
7776
print(len(ips))
7877
for i in range(0, len(ips)):
79-
print(ips[i].name)
78+
print(f'{i}. {ips[i].name}')
8079

8180
print("load over")
8281

@@ -124,11 +123,14 @@ def forward_model(self, inp):
124123

125124
cond = torch.from_numpy(decoder_inp[0]).transpose(1, 2)
126125

126+
print(f'cond2: {cond}')
127+
127128
t = hparams['K_step']
128129
print('===> gaussion start.')
129130
shape = (cond.shape[0], 1,
130131
hparams['audio_num_mel_bins'], cond.shape[2])
131132
x = torch.randn(shape, device=device)
133+
# x = torch.zeros(shape, device=device)
132134

133135
for i in tqdm(reversed(range(0, t)), desc='sample time step', total=t):
134136
res2 = self.model3.run(
@@ -182,6 +184,10 @@ def forward_model(self, inp):
182184
target = "./infer_out/onnx_test_singer_res.wav"
183185

184186
set_hparams(print_hparams=False)
187+
188+
spec_min= torch.FloatTensor(hparams['spec_min'])[None, None, :hparams['keep_bins']]
189+
spec_max= torch.FloatTensor(hparams['spec_max'])[None, None, :hparams['keep_bins']]
190+
185191
infer_ins = TestAllInfer(hparams)
186192

187193
out = infer_ins.infer_once(c)

onnx_test_singer_gpu.py

Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
# coding=utf8
2+
3+
import os
4+
from pyexpat import model
5+
import sys
6+
import inference.svs.ds_e2e as e2e
7+
from inference.svs.opencpop.map import cpop_pinyin2ph_func
8+
from utils.audio import save_wav
9+
from utils.hparams import set_hparams, hparams
10+
11+
import numpy as np
12+
13+
import torch
14+
import onnxruntime as ort
15+
16+
from tqdm import tqdm
17+
18+
from utils.text_encoder import TokenTextEncoder
19+
20+
# Absolute directory that contains this script; the config path below is
# built from it.
root_dir = os.path.dirname(os.path.abspath(__file__))

# Export the project root through PYTHONPATH. The previous value wrapped the
# path in literal double quotes; environment values are not shell-parsed, so
# the quotes became part of the path and the entry pointed nowhere.
os.environ['PYTHONPATH'] = root_dir

# Overwrite the process command line with a fixed configuration.
# NOTE(review): presumably set_hparams() reads these from sys.argv - confirm
# against utils.hparams.
sys.argv = [
    f'{root_dir}/inference/svs/ds_e2e.py',
    '--config',
    f'{root_dir}/usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml',
    '--exp_name',
    '0228_opencpop_ds100_rel',
]
30+
31+
32+
def to_numpy(tensor):
    """Return the data of a torch tensor as a NumPy array on the host.

    Args:
        tensor: A ``torch.Tensor`` on any device, or ``None``.

    Returns:
        A ``numpy.ndarray`` with the tensor's values. ``None`` yields an empty
        array of shape ``(1, 0)`` so that an absent optional input can still
        be passed on as an array.
    """
    if tensor is None:
        return np.array([[]])
    # detach() is valid on tensors that do not require grad as well, so the
    # former requires_grad branch was unnecessary.
    return tensor.detach().cpu().numpy()
36+
37+
38+
# Module-level normalisation bounds read by denorm_spec(). They start as 0
# placeholders; the script entry point of this file rebinds both from hparams.
spec_max = spec_min = 0


def denorm_spec(x):
    """Linearly rescale ``x`` so that -1 maps to ``spec_min`` and 1 to ``spec_max``.

    Args:
        x: Number or tensor to rescale.

    Returns:
        The rescaled value, computed from the current module-level bounds.
    """
    span = spec_max - spec_min
    unit = (x + 1) / 2
    return unit * span + spec_min
44+
45+
46+
class TestAllInfer(e2e.DiffSingerE2EInfer):
    """DiffSinger inference in which every network runs through ONNX Runtime.

    Four exported graphs are opened from the current working directory, all
    requested on the CUDA provider: ``xiaoma_pe.onnx`` (``self.pe2``),
    ``hifigan.onnx`` (``self.vocoder2``), ``singer_fs.onnx`` (``self.model2``)
    and ``singer_denoise.onnx`` (``self.model3``).
    """

    def __init__(self, hparams, device=None):
        """Set up text preprocessing state and open the ONNX sessions.

        Args:
            hparams: Hyper-parameter mapping, stored as ``self.hparams``.
            device: Torch device string; when ``None``, ``'cuda'`` is used if
                available, otherwise ``'cpu'``.
        """
        # NOTE(review): the base-class constructor is not called; the
        # attributes below are assigned by hand instead - presumably to avoid
        # loading the PyTorch checkpoints. Confirm against DiffSingerE2EInfer.
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.hparams = hparams
        self.device = device

        phone_list = ["AP", "SP", "a", "ai", "an", "ang", "ao", "b", "c", "ch", "d", "e", "ei", "en", "eng", "er", "f", "g",
                      "h", "i", "ia", "ian", "iang", "iao", "ie", "in", "ing", "iong", "iu", "j", "k", "l", "m", "n", "o",
                      "ong", "ou", "p", "q", "r", "s", "sh", "t", "u", "ua", "uai", "uan", "uang", "ui", "un", "uo", "v",
                      "van", "ve", "vn", "w", "x", "y", "z", "zh"]
        self.ph_encoder = TokenTextEncoder(
            None, vocab_list=phone_list, replace_oov=',')
        self.pinyin2phs = cpop_pinyin2ph_func()
        self.spk_map = {'opencpop': 0}

        print("load pe")
        self.pe2 = ort.InferenceSession("xiaoma_pe.onnx", providers=["CUDAExecutionProvider"])
        print("load hifigan")
        self.vocoder2 = ort.InferenceSession("hifigan.onnx", providers=["CUDAExecutionProvider"])
        print("load singer_fs")
        self.model2 = ort.InferenceSession("singer_fs.onnx", providers=["CUDAExecutionProvider"])
        self._print_inputs(self.model2)

        print("load singer_denoise")
        self.model3 = ort.InferenceSession("singer_denoise.onnx", providers=["CUDAExecutionProvider"])
        self._print_inputs(self.model3)

        print("load over")

    @staticmethod
    def _print_inputs(session):
        """Print the number of graph inputs of ``session``, then each as ``index. name``."""
        graph_inputs = session.get_inputs()
        print(len(graph_inputs))
        for idx, graph_input in enumerate(graph_inputs):
            print(f'{idx}. {graph_input.name}')

    def run_vocoder(self, c, **kwargs):
        """Run the ONNX vocoder on a mel tensor.

        Args:
            c: Mel tensor; axes 1 and 2 are swapped before it is fed as ``x``.
            **kwargs: ``f0`` (optional) is fed as well when it is given and
                ``hparams['use_nsf']`` is truthy.

        Returns:
            The first session output as a torch tensor on ``self.device`` with
            one extra leading axis.
        """
        c = c.transpose(2, 1)  # [B, 80, T]
        f0 = kwargs.get('f0')  # [B, T]

        ort_inputs = {'x': to_numpy(c)}
        if f0 is not None and hparams.get('use_nsf'):
            ort_inputs['f0'] = to_numpy(f0)
        # Otherwise 'f0' is left out of the feed. The previous code passed an
        # empty dict for it, which ONNX Runtime cannot accept as an input value.

        ort_out = self.vocoder2.run(None, ort_inputs)
        y = torch.from_numpy(ort_out[0]).to(self.device)

        return y[None]

    def forward_model(self, inp):
        """Synthesise a waveform for one preprocessed input item.

        Runs ``singer_fs`` once to obtain the conditioning, iterates
        ``singer_denoise`` for ``hparams['K_step']`` steps starting from
        Gaussian noise, de-normalises the result and vocodes it.

        Args:
            inp: Item accepted by ``self.input_to_batch``.

        Returns:
            ``numpy.ndarray``: element 0 of the vocoder output.
        """
        sample = self.input_to_batch(inp)
        txt_tokens = sample['txt_tokens']  # [B, T_t]
        mel2ph = sample['mel2ph']

        device = txt_tokens.device

        # Only these four inputs are fed; spk_id and mel2ph were already
        # disabled in the original feed.
        decoder_inp = self.model2.run(
            None,
            {
                "txt_tokens": to_numpy(txt_tokens),
                "pitch_midi": to_numpy(sample['pitch_midi']).astype(np.int64),
                "midi_dur": to_numpy(sample['midi_dur']),
                "is_slur": to_numpy(sample['is_slur']).astype(np.int64),
            }
        )

        cond = torch.from_numpy(decoder_inp[0]).transpose(1, 2)

        print(f'cond2: {cond}')

        t = hparams['K_step']
        print('===> gaussion start.')
        shape = (cond.shape[0], 1,
                 hparams['audio_num_mel_bins'], cond.shape[2])

        # Draw the starting noise with torch (same RNG as before), then keep
        # the loop state as NumPy arrays: the session consumes and produces
        # NumPy, so converting to tensors and back on every step was wasted work.
        x_np = to_numpy(torch.randn(shape, device=device))
        cond_np = to_numpy(cond)

        for step in tqdm(reversed(range(0, t)), desc='sample time step', total=t):
            step_out = self.model3.run(
                None,
                {
                    "x": x_np,
                    "t": np.array([step]).astype(np.int64),
                    "cond": cond_np,
                }
            )
            # Output 0 is the updated sample, output 1 the conditioning to
            # feed into the next step.
            x_np, cond_np = step_out[0], step_out[1]

        x = torch.from_numpy(x_np)[:, 0].transpose(1, 2)

        mel_out = denorm_spec(x)
        if mel2ph is not None:  # for singing
            # The denoised mel lives on the CPU while mel2ph is on the batch
            # device; move the mask so the product does not fail on CUDA.
            mask = (mel2ph > 0).float()[:, :, None]
            mel_out = mel_out * mask.to(mel_out.device)

        if hparams.get('pe_enable'):
            # Pitch is predicted from the generated mel; the second session
            # output is the one used as f0.
            pe2_res = self.pe2.run(None, {'mel_input': to_numpy(mel_out)})
            f0_pred = torch.from_numpy(pe2_res[1])
        else:
            f0_pred = None

        # Run Vocoder
        wav_out = self.run_vocoder(mel_out, f0=f0_pred)
        wav_out = wav_out.cpu().numpy()
        return wav_out[0]
174+
175+
176+
if __name__ == '__main__':
    # Demo request (user input: Chinese characters).
    request = dict(
        text='小酒窝长睫毛AP是你最美的记号',
        notes='C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
        notes_duration='0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340',
        input_type='word',
    )

    target = "./infer_out/onnx_test_singer_res.wav"

    set_hparams(print_hparams=False)

    # Rebind the module-level bounds read by denorm_spec(): each becomes a
    # [1, 1, keep_bins] float tensor sliced from the hparams lists.
    keep_bins = hparams['keep_bins']
    spec_min = torch.FloatTensor(hparams['spec_min'])[None, None, :keep_bins]
    spec_max = torch.FloatTensor(hparams['spec_max'])[None, None, :keep_bins]

    infer_ins = TestAllInfer(hparams)

    # Synthesise, make sure the output directory exists, then write the wav.
    wav = infer_ins.infer_once(request)
    out_dir = os.path.dirname(target)
    os.makedirs(out_dir, exist_ok=True)
    print(f'| save audio: {target}')
    save_wav(wav, target, hparams['audio_sample_rate'])

    print("OK")

0 commit comments

Comments
 (0)