Skip to content

Commit c8835da

Browse files
committed
add export singer script
1 parent e1b9d86 commit c8835da

5 files changed

Lines changed: 325 additions & 16 deletions

File tree

modules/fastspeech/pe.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@ def forward(self, x):
2626
:param x: [B, T, 80]
2727
:return: [L, B, T, H], [B, T, H]
2828
"""
29-
padding_mask = x.abs().sum(-1).eq(0).data # [B, T]
29+
# padding_mask = x.abs().sum(-1).eq(0).data # [B, T]
30+
padding_mask = x.abs().sum(-1).eq(0).detach()
3031
nonpadding_mask_TB = 1 - padding_mask.float()[:, None, :] # [B, 1, T]
3132
x = x.transpose(1, 2)
3233
hiddens = []

onnx_export_hifigan.py

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,14 @@
2222
if __name__ == '__main__':
2323

2424
set_hparams(print_hparams=False)
25-
infer_ins = e2e.DiffSingerE2EInfer(hparams)
2625

27-
infer_ins.vocoder.to('cpu')
26+
dev = 'cuda'
27+
28+
infer_ins = e2e.DiffSingerE2EInfer(hparams)
29+
infer_ins.vocoder.to(dev)
2830
with torch.no_grad():
29-
x = torch.rand(1, 80, 100)
30-
f0 = torch.rand(1, 100)
31+
x = torch.rand(1, 80, 968).to(dev)
32+
f0 = torch.rand(1, 968).to(dev)
3133

3234
torch.onnx.export(
3335
infer_ins.vocoder,
@@ -36,25 +38,20 @@
3638
f0
3739
),
3840
"hifigan.onnx",
41+
verbose=True,
3942
input_names=["x", "f0"],
40-
output_names=["y"],
4143
dynamic_axes={
4244
"x": {
43-
0: "hop_size",
44-
1: "win_size",
45-
2: "fft_size",
45+
0: "batch_size",
46+
1: "num_mel_bin",
47+
2: "frames",
4648
},
4749
"f0": {
48-
0: "len",
50+
0: "batch_size",
4951
1: "frames"
50-
},
51-
"y": {
52-
0: "len",
53-
1: "frames",
54-
2: "batch_size"
5552
}
5653
},
57-
opset_version=11
54+
opset_version=11,
5855
)
5956

6057
print("OK")

onnx_export_pe.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# coding=utf8
2+
3+
import os
4+
import sys
5+
import inference.svs.ds_e2e as e2e
6+
from utils.audio import save_wav
7+
from utils.hparams import set_hparams, hparams
8+
9+
import torch
10+
11+
# Resolve the repository root from this script's own location so the config
# and checkpoint paths below are independent of the current working directory.
root_dir = os.path.dirname(os.path.abspath(__file__))
os.environ['PYTHONPATH'] = f'"{root_dir}"'

# set_hparams() parses command-line flags, so fake argv to point it at the
# OpenCpop e2e config and the pretrained experiment name.
sys.argv = [
    f'{root_dir}/inference/svs/ds_e2e.py',
    '--config',
    f'{root_dir}/usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml',
    '--exp_name',
    '0228_opencpop_ds100_rel',
]
21+
22+
if __name__ == '__main__':
    # Export the pitch extractor (xiaoma PE) to ONNX.
    set_hparams(print_hparams=False)

    dev = 'cuda'

    # Build the end-to-end inference helper and move its PE onto the export device.
    infer_ins = e2e.DiffSingerE2EInfer(hparams)
    infer_ins.pe.to(dev)

    with torch.no_grad():
        # Dummy mel spectrogram used only for tracing: [batch, frames, mel bins].
        dummy_mel = torch.rand(1, 968, 80).to(dev)

        torch.onnx.export(
            infer_ins.pe,
            (
                dummy_mel
            ),
            "xiaoma_pe.onnx",
            verbose=True,
            input_names=["mel_input"],
            # Mark every input axis dynamic so the exported model accepts
            # arbitrary batch sizes / frame counts at inference time.
            dynamic_axes={
                "mel_input": {
                    0: "batch_size",
                    1: "frames",
                    2: "num_mel_bin",
                }
            },
            opset_version=11
        )

    print("OK")

onnx_export_singer.py

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
# coding=utf8
2+
3+
import os
4+
import sys
5+
import inference.svs.ds_e2e as e2e
6+
from modules.fastspeech.pe import PitchExtractor
7+
from usr.diff.shallow_diffusion_tts import GaussianDiffusion
8+
from utils import load_ckpt
9+
from utils.audio import save_wav
10+
from utils.hparams import set_hparams, hparams
11+
12+
import torch
13+
14+
from utils.text_encoder import TokenTextEncoder
15+
from usr.diffsinger_task import DIFF_DECODERS
16+
17+
# Resolve the repository root from this script's own location so the config
# and checkpoint paths below are independent of the current working directory.
root_dir = os.path.dirname(os.path.abspath(__file__))
os.environ['PYTHONPATH'] = f'"{root_dir}"'

# set_hparams() parses command-line flags, so fake argv to point it at the
# OpenCpop e2e config and the pretrained experiment name.
sys.argv = [
    f'{root_dir}/inference/svs/ds_e2e.py',
    '--config',
    f'{root_dir}/usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml',
    '--exp_name',
    '0228_opencpop_ds100_rel',
]
27+
28+
29+
class GaussianDiffusionWrap(GaussianDiffusion):
    """ONNX-export wrapper around GaussianDiffusion.

    torch.onnx.export cannot trace None arguments, so callers pass
    zero-element placeholder tensors for inputs they want to omit. This
    wrapper maps every empty tensor back to None, then delegates to the
    parent forward in inference mode with no reference mels.
    """

    def forward(self, txt_tokens, mel2ph,
                # Wrapped Arguments
                spk_id,
                pitch_midi,
                midi_dur,
                is_slur,
                ):
        # A zero-element tensor is the ONNX-side stand-in for "not provided".
        def _none_if_empty(t):
            return None if torch.numel(t) == 0 else t

        txt_tokens = _none_if_empty(txt_tokens)
        mel2ph = _none_if_empty(mel2ph)
        spk_id = _none_if_empty(spk_id)
        pitch_midi = _none_if_empty(pitch_midi)
        midi_dur = _none_if_empty(midi_dur)
        is_slur = _none_if_empty(is_slur)

        return super().forward(txt_tokens, spk_id=spk_id, ref_mels=None, infer=True,
                               pitch_midi=pitch_midi, midi_dur=midi_dur,
                               is_slur=is_slur, mel2ph=mel2ph)
54+
55+
56+
class DFSInferWrapped(e2e.DiffSingerE2EInfer):
    """DiffSingerE2EInfer variant that builds the export-friendly diffusion wrapper."""

    def build_model(self):
        """Construct GaussianDiffusionWrap from hparams, load its checkpoint,
        and optionally attach a separately trained pitch extractor."""
        model = GaussianDiffusionWrap(
            phone_encoder=self.ph_encoder,
            out_dims=hparams['audio_num_mel_bins'],
            denoise_fn=DIFF_DECODERS[hparams['diff_decoder_type']](hparams),
            timesteps=hparams['timesteps'],
            K_step=hparams['K_step'],
            loss_type=hparams['diff_loss_type'],
            spec_min=hparams['spec_min'],
            spec_max=hparams['spec_max'],
        )

        # Inference only: freeze into eval mode, then restore trained weights.
        model.eval()
        load_ckpt(model, hparams['work_dir'], 'model')

        # When pe_enable is set, F0 is predicted from mel by a dedicated
        # PitchExtractor instead of the diffusion model's own output.
        if hparams.get('pe_enable') is not None and hparams['pe_enable']:
            self.pe = PitchExtractor().to(self.device)
            load_ckpt(self.pe, hparams['pe_ckpt'], 'model', strict=True)
            self.pe.eval()

        return model
76+
77+
if __name__ == '__main__':
    # Sample song used only to drive the tracer: lyrics, per-character note
    # pitches, and per-note durations in seconds ('|' separates characters).
    inp = {
        'text': '小酒窝长睫毛AP是你最美的记号',
        'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
        'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340',
        'input_type': 'word'
    }  # user input: Chinese characters

    set_hparams(print_hparams=False)

    dev = 'cuda'

    infer_ins = DFSInferWrapped(hparams)
    infer_ins.model.to(dev)

    with torch.no_grad():
        inp = infer_ins.preprocess_input(inp, input_type=inp['input_type'] if inp.get('input_type') else 'word')
        sample = infer_ins.input_to_batch(inp)
        txt_tokens = sample['txt_tokens']  # [B, T_t]
        spk_id = sample.get('spk_ids')

        def as_placeholder(t):
            # GaussianDiffusionWrap.forward maps a zero-element tensor back to
            # None, so an EMPTY tensor — not some unrelated tensor — is the
            # correct stand-in for a missing optional input.
            return torch.empty(0).to(dev) if t is None else t.to(dev)

        torch.onnx.export(
            infer_ins.model,
            (
                txt_tokens.to(dev),
                {
                    'spk_id': as_placeholder(spk_id),
                    'pitch_midi': as_placeholder(sample['pitch_midi']),
                    'midi_dur': as_placeholder(sample['midi_dur']),
                    # Fix: these two previously reused spk_id as the dummy
                    # input, so the graph was traced with the wrong shapes and
                    # values for is_slur / mel2ph. Feed the real sample
                    # tensors, falling back to an empty placeholder when the
                    # batch does not provide them.
                    'is_slur': as_placeholder(sample.get('is_slur')),
                    'mel2ph': as_placeholder(sample.get('mel2ph'))
                }
            ),
            "singer.onnx",
            # verbose=True,
            input_names=["txt_tokens", "spk_id",
                         "pitch_midi", "midi_dur", "is_slur", "mel2ph"],
            dynamic_axes={
                # NOTE(review): axis labels "a"/"b" are kept as-is so the
                # exported graph metadata is unchanged; descriptive names
                # (batch/frames) would be clearer.
                "txt_tokens": {
                    0: "a",
                    1: "b",
                },
                "spk_id": {
                    0: "a",
                    1: "b",
                },
                "pitch_midi": {
                    0: "a",
                    1: "b",
                },
                "midi_dur": {
                    0: "a",
                    1: "b",
                },
                "is_slur": {
                    0: "a",
                    1: "b",
                },
                "mel2ph": {
                    0: "a",
                    1: "b",
                }
            },
            opset_version=11
        )

    print("OK")

onnx_test_pe.py

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
# coding=utf8
2+
3+
import os
4+
import sys
5+
import inference.svs.ds_e2e as e2e
6+
from utils.audio import save_wav
7+
from utils.hparams import set_hparams, hparams
8+
9+
import numpy as np
10+
11+
import torch
12+
import onnxruntime as ort
13+
14+
# Resolve the repository root from this script's own location so the config
# and checkpoint paths below are independent of the current working directory.
root_dir = os.path.dirname(os.path.abspath(__file__))
os.environ['PYTHONPATH'] = f'"{root_dir}"'

# set_hparams() parses command-line flags, so fake argv to point it at the
# OpenCpop e2e config and the pretrained experiment name.
sys.argv = [
    f'{root_dir}/inference/svs/ds_e2e.py',
    '--config',
    f'{root_dir}/usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml',
    '--exp_name',
    '0228_opencpop_ds100_rel',
]
24+
25+
26+
def to_numpy(tensor):
    """Convert a torch tensor to a numpy array on the CPU, detaching it from
    the autograd graph first when it tracks gradients."""
    if tensor.requires_grad:
        tensor = tensor.detach()
    return tensor.cpu().numpy()
28+
29+
30+
class TestHifiganInfer(e2e.DiffSingerE2EInfer):
    """End-to-end smoke test: run DiffSinger inference but replace the torch
    pitch extractor and vocoder with the exported ONNX models, to compare the
    ONNX pipeline against the native one."""

    def __init__(self, hparams, device=None):
        super().__init__(hparams, device)

        # ONNX Runtime sessions for the exported PE and vocoder; the .onnx
        # files are expected in the current working directory.
        self.pe2 = ort.InferenceSession("xiaoma_pe.onnx")
        self.vocoder2 = ort.InferenceSession("hifigan.onnx")

    def run_vocoder(self, c, **kwargs):
        # Mel comes in as [B, T, 80]; the vocoder graph expects [B, 80, T].
        c = c.transpose(2, 1)  # [B, 80, T]
        f0 = kwargs.get('f0')  # [B, T]

        if f0 is not None and hparams.get('use_nsf'):
            ort_inputs = {
                'x': to_numpy(c),
                'f0': to_numpy(f0)
            }
        else:
            # NOTE(review): feeding an empty dict for 'f0' looks suspicious —
            # InferenceSession.run expects ndarray inputs; confirm this branch
            # is ever taken (i.e. use_nsf disabled) before relying on it.
            ort_inputs = {
                'x': to_numpy(c),
                'f0': {}
            }
        # [T]

        ort_out = self.vocoder2.run(None, ort_inputs)
        # First output is the waveform; move it back onto the torch device and
        # add a leading batch dim to match the native vocoder's return shape.
        y = torch.from_numpy(ort_out[0]).to(self.device)

        return y[None]

    def forward_model(self, inp):
        sample = self.input_to_batch(inp)
        txt_tokens = sample['txt_tokens']  # [B, T_t]
        spk_id = sample.get('spk_ids')

        # Debug prints of every model input's shape.
        # NOTE(review): spk_id/is_slur/mel2ph may be None depending on the
        # batch; only the last two are guarded here — confirm spk_id is
        # always present for this config.
        print(txt_tokens.shape)
        print(spk_id.shape)
        print(sample['pitch_midi'].shape)
        print(sample['midi_dur'].shape)
        if (sample['is_slur'] is not None):
            print(sample['is_slur'].shape)
        if (sample['mel2ph'] is not None):
            print(sample['mel2ph'].shape)

        with torch.no_grad():
            # Native (torch) acoustic model produces the mel spectrogram.
            output = self.model(txt_tokens, spk_id=spk_id, ref_mels=None, infer=True,
                                pitch_midi=sample['pitch_midi'], midi_dur=sample['midi_dur'],
                                is_slur=sample['is_slur'], mel2ph=sample['mel2ph'])

            mel_out = output['mel_out']  # [B, T,80]

            if hparams.get('pe_enable') is not None and hparams['pe_enable']:
                # Predict F0 from the predicted mel with the ONNX pitch
                # extractor; output index 1 is assumed to be the denormalized
                # F0 track (matches the torch PE's f0_denorm_pred).
                pe2_res = self.pe2.run(None,
                                       {
                                           'mel_input': to_numpy(mel_out)
                                       }
                                       )

                # pe predict from Pred mel
                f0_pred = torch.from_numpy(pe2_res[1])

            else:
                f0_pred = output['f0_denorm']

        # Run Vocoder (ONNX hifigan) and return a single [T] waveform.
        wav_out = self.run_vocoder(mel_out, f0=f0_pred)
        wav_out = wav_out.cpu().numpy()
        return wav_out[0]
96+
97+
98+
if __name__ == '__main__':
    # Sample song: lyrics, per-character note pitches, and per-note durations
    # in seconds ('|' separates characters).
    c = {
        'text': '小酒窝长睫毛AP是你最美的记号',
        'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
        'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340',
        'input_type': 'word'
    }  # user input: Chinese characters

    # Output path for the synthesized audio.
    target = "./infer_out/onnx_test_res.wav"

    set_hparams(print_hparams=False)
    infer_ins = TestHifiganInfer(hparams)

    # Run the full pipeline (torch acoustic model + ONNX PE/vocoder) and save.
    out = infer_ins.infer_once(c)
    os.makedirs(os.path.dirname(target), exist_ok=True)
    print(f'| save audio: {target}')
    save_wav(out, target, hparams['audio_sample_rate'])

    # Print the (unused here) torch pitch extractor for manual inspection.
    print(infer_ins.pe)
    print("OK")

0 commit comments

Comments
 (0)