Skip to content

Commit bd7f94f

Browse files
Merge branch 'openvpi:master' into master
2 parents 78b20d4 + c8835da commit bd7f94f

6 files changed

Lines changed: 444 additions & 1 deletion

File tree

modules/fastspeech/pe.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@ def forward(self, x):
2626
:param x: [B, T, 80]
2727
:return: [L, B, T, H], [B, T, H]
2828
"""
29-
padding_mask = x.abs().sum(-1).eq(0).data # [B, T]
29+
# padding_mask = x.abs().sum(-1).eq(0).data # [B, T]
30+
padding_mask = x.abs().sum(-1).eq(0).detach()
3031
nonpadding_mask_TB = 1 - padding_mask.float()[:, None, :] # [B, 1, T]
3132
x = x.transpose(1, 2)
3233
hiddens = []

onnx_export_hifigan.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# coding=utf8
"""Export the HiFi-GAN vocoder of a DiffSinger checkpoint to ONNX (hifigan.onnx)."""

import os
import sys
import inference.svs.ds_e2e as e2e
from utils.audio import save_wav
from utils.hparams import set_hparams, hparams

import torch

root_dir = os.path.dirname(os.path.abspath(__file__))
# NOTE(review): setting PYTHONPATH here does not change this process's
# sys.path (only child processes would see it) — confirm it is needed.
os.environ['PYTHONPATH'] = f'"{root_dir}"'

# Emulate the CLI invocation that set_hparams() parses.
sys.argv = [
    f'{root_dir}/inference/svs/ds_e2e.py',
    '--config',
    f'{root_dir}/usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml',
    '--exp_name',
    '0228_opencpop_ds100_rel'
]

if __name__ == '__main__':

    set_hparams(print_hparams=False)

    # Fix: fall back to CPU so the export also runs on machines without CUDA
    # (the original hard-coded 'cuda' and crashed on CPU-only hosts).
    dev = 'cuda' if torch.cuda.is_available() else 'cpu'

    infer_ins = e2e.DiffSingerE2EInfer(hparams)
    infer_ins.vocoder.to(dev)
    with torch.no_grad():
        # Dummy vocoder inputs: mel spectrogram x [B, 80, T] and f0 curve [B, T].
        x = torch.rand(1, 80, 968).to(dev)
        f0 = torch.rand(1, 968).to(dev)

        torch.onnx.export(
            infer_ins.vocoder,
            (
                x,
                f0
            ),
            "hifigan.onnx",
            verbose=True,
            input_names=["x", "f0"],
            dynamic_axes={
                "x": {
                    0: "batch_size",
                    1: "num_mel_bin",
                    2: "frames",
                },
                "f0": {
                    0: "batch_size",
                    1: "frames"
                }
            },
            opset_version=11,
        )

    print("OK")

onnx_export_pe.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# coding=utf8
"""Export the xiaoma pitch extractor (PE) used by DiffSinger to ONNX (xiaoma_pe.onnx)."""

import os
import sys
import inference.svs.ds_e2e as e2e
from utils.audio import save_wav
from utils.hparams import set_hparams, hparams

import torch

root_dir = os.path.dirname(os.path.abspath(__file__))
# NOTE(review): setting PYTHONPATH here does not change this process's
# sys.path (only child processes would see it) — confirm it is needed.
os.environ['PYTHONPATH'] = f'"{root_dir}"'

# Emulate the CLI invocation that set_hparams() parses.
sys.argv = [
    f'{root_dir}/inference/svs/ds_e2e.py',
    '--config',
    f'{root_dir}/usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml',
    '--exp_name',
    '0228_opencpop_ds100_rel'
]

if __name__ == '__main__':
    set_hparams(print_hparams=False)

    # Fix: fall back to CPU so the export also runs on machines without CUDA
    # (the original hard-coded 'cuda' and crashed on CPU-only hosts).
    dev = 'cuda' if torch.cuda.is_available() else 'cpu'

    infer_ins = e2e.DiffSingerE2EInfer(hparams)
    infer_ins.pe.to(dev)
    with torch.no_grad():
        # Dummy PE input: mel spectrogram [B, T, 80].
        mel_input = torch.rand(1, 968, 80).to(dev)

        torch.onnx.export(
            infer_ins.pe,
            (
                mel_input,
            ),
            "xiaoma_pe.onnx",
            verbose=True,
            input_names=["mel_input"],
            dynamic_axes={
                "mel_input": {
                    0: "batch_size",
                    1: "frames",
                    2: "num_mel_bin",
                }
            },
            opset_version=11
        )

    print("OK")

onnx_export_singer.py

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
# coding=utf8
import os
import sys
import inference.svs.ds_e2e as e2e
from modules.fastspeech.pe import PitchExtractor
from usr.diff.shallow_diffusion_tts import GaussianDiffusion
from utils import load_ckpt
from utils.audio import save_wav
from utils.hparams import set_hparams, hparams

import torch

from utils.text_encoder import TokenTextEncoder
from usr.diffsinger_task import DIFF_DECODERS

# Repository root directory, derived from this file's location.
root_dir = os.path.dirname(os.path.abspath(__file__))
os.environ['PYTHONPATH'] = f'"{root_dir}"'

# Emulate the CLI invocation that set_hparams() parses.
sys.argv = [
    f'{root_dir}/inference/svs/ds_e2e.py',
    '--config',
    f'{root_dir}/usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml',
    '--exp_name',
    '0228_opencpop_ds100_rel'
]
27+
28+
29+
class GaussianDiffusionWrap(GaussianDiffusion):
    """ONNX-export wrapper around GaussianDiffusion.

    ONNX graph inputs cannot be omitted, so callers feed zero-element
    tensors to mean "argument absent"; this forward maps those back to
    None and delegates to the parent's forward in inference mode.
    """

    def forward(self, txt_tokens, mel2ph,
                # Wrapped Arguments
                spk_id,
                pitch_midi,
                midi_dur,
                is_slur,
                ):

        def _none_if_empty(t):
            # A zero-element tensor is the ONNX stand-in for None.
            return None if torch.numel(t) == 0 else t

        txt_tokens = _none_if_empty(txt_tokens)
        mel2ph = _none_if_empty(mel2ph)
        spk_id = _none_if_empty(spk_id)
        pitch_midi = _none_if_empty(pitch_midi)
        midi_dur = _none_if_empty(midi_dur)
        is_slur = _none_if_empty(is_slur)

        return super().forward(txt_tokens, spk_id=spk_id, ref_mels=None, infer=True,
                               pitch_midi=pitch_midi, midi_dur=midi_dur,
                               is_slur=is_slur, mel2ph=mel2ph)
54+
55+
56+
class DFSInferWrapped(e2e.DiffSingerE2EInfer):
    """DiffSingerE2EInfer variant whose model is the export-friendly
    GaussianDiffusionWrap instead of the plain GaussianDiffusion."""

    def build_model(self):
        # Denoiser network selected by the experiment's hparams.
        denoiser = DIFF_DECODERS[hparams['diff_decoder_type']](hparams)
        model = GaussianDiffusionWrap(
            phone_encoder=self.ph_encoder,
            out_dims=hparams['audio_num_mel_bins'],
            denoise_fn=denoiser,
            timesteps=hparams['timesteps'],
            K_step=hparams['K_step'],
            loss_type=hparams['diff_loss_type'],
            spec_min=hparams['spec_min'],
            spec_max=hparams['spec_max'],
        )
        model.eval()
        load_ckpt(model, hparams['work_dir'], 'model')

        # Optionally load the standalone pitch extractor checkpoint.
        if hparams.get('pe_enable'):
            self.pe = PitchExtractor().to(self.device)
            load_ckpt(self.pe, hparams['pe_ckpt'], 'model', strict=True)
            self.pe.eval()

        return model
76+
77+
if __name__ == '__main__':

    # Demo input: Chinese lyrics with aligned note names and durations.
    inp = {
        'text': '小酒窝长睫毛AP是你最美的记号',
        'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
        'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340',
        'input_type': 'word'
    }  # user input: Chinese characters

    set_hparams(print_hparams=False)

    # Fix: fall back to CPU so the export also runs on machines without CUDA
    # (the original hard-coded 'cuda' and crashed on CPU-only hosts).
    dev = 'cuda' if torch.cuda.is_available() else 'cpu'

    infer_ins = DFSInferWrapped(hparams)
    infer_ins.model.to(dev)

    with torch.no_grad():
        inp = infer_ins.preprocess_input(inp, input_type=inp['input_type'] if inp.get('input_type') else 'word')
        sample = infer_ins.input_to_batch(inp)
        txt_tokens = sample['txt_tokens']  # [B, T_t]
        spk_id = sample.get('spk_ids')

        torch.onnx.export(
            infer_ins.model,
            (
                txt_tokens.to(dev),
                {
                    'spk_id': spk_id.to(dev),
                    'pitch_midi': sample['pitch_midi'].to(dev),
                    'midi_dur': sample['midi_dur'].to(dev),
                    # Bug fix: the slur flags were mistakenly fed from spk_id;
                    # the graph declares an 'is_slur' input and the batch
                    # carries one.
                    'is_slur': sample['is_slur'].to(dev),
                    # NOTE(review): mel2ph is fed from spk_id here, as in the
                    # original export. The wrapper only maps zero-element
                    # tensors to None, so a non-empty spk_id would be passed
                    # through as mel2ph — confirm whether an empty tensor is
                    # the real intent.
                    'mel2ph': spk_id.to(dev)
                }
            ),
            "singer.onnx",
            # verbose=True,
            input_names=["txt_tokens", "spk_id",
                         "pitch_midi", "midi_dur", "is_slur", "mel2ph"],
            # Every input is dynamic in both batch ("a") and length ("b").
            dynamic_axes={
                name: {0: "a", 1: "b"}
                for name in ("txt_tokens", "spk_id", "pitch_midi",
                             "midi_dur", "is_slur", "mel2ph")
            },
            opset_version=11
        )

    print("OK")

onnx_test_hifigan.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
# coding=utf8
import os
import sys
import inference.svs.ds_e2e as e2e
from utils.audio import save_wav
from utils.hparams import set_hparams, hparams

import torch
import onnxruntime as ort

# Repository root directory, derived from this file's location.
root_dir = os.path.dirname(os.path.abspath(__file__))
os.environ['PYTHONPATH'] = f'"{root_dir}"'

# Emulate the CLI invocation that set_hparams() parses.
sys.argv = [
    f'{root_dir}/inference/svs/ds_e2e.py',
    '--config',
    f'{root_dir}/usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml',
    '--exp_name',
    '0228_opencpop_ds100_rel'
]
22+
23+
24+
def to_numpy(tensor):
    """Convert a torch tensor to a numpy array on the CPU, detaching it
    from the autograd graph first when it requires grad."""
    t = tensor.detach() if tensor.requires_grad else tensor
    return t.cpu().numpy()
26+
27+
28+
class TestHifiganInfer(e2e.DiffSingerE2EInfer):
    """DiffSinger inference that routes vocoding through the exported ONNX
    HiFi-GAN graph ("hifigan.onnx") to validate the export end-to-end."""

    def __init__(self, hparams, device=None):
        super().__init__(hparams, device)
        # ONNX Runtime session for the graph produced by onnx_export_hifigan.py.
        self.vocoder2 = ort.InferenceSession("hifigan.onnx")

    def run_vocoder(self, c, **kwargs):
        c = c.transpose(2, 1)  # [B, 80, T]
        f0 = kwargs.get('f0')  # [B, T]

        if f0 is not None and hparams.get('use_nsf'):
            ort_inputs = {
                'x': to_numpy(c),
                'f0': to_numpy(f0)
            }
        else:
            # Bug fix: the exported graph always declares an 'f0' input and
            # InferenceSession.run rejects non-array feeds (the original
            # passed a bare {} here). Feed an all-zero f0 curve with the
            # matching [B, T] shape instead.
            ort_inputs = {
                'x': to_numpy(c),
                'f0': to_numpy(torch.zeros_like(c[:, 0, :]))
            }

        # Run the ONNX vocoder; first output is the waveform [T].
        ort_out = self.vocoder2.run(None, ort_inputs)
        y = torch.from_numpy(ort_out[0]).to(self.device)

        return y[None]
54+
55+
56+
if __name__ == '__main__':
    # Demo input: Chinese lyrics with aligned note names and durations.
    c = {
        'text': '小酒窝长睫毛AP是你最美的记号',
        'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
        'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340',
        'input_type': 'word'
    }  # user input: Chinese characters

    target = "./infer_out/onnx_test_hifigan_res.wav"

    set_hparams(print_hparams=False)
    infer_ins = TestHifiganInfer(hparams)

    # Synthesize, then write the waveform next to other inference outputs.
    out = infer_ins.infer_once(c)
    os.makedirs(os.path.dirname(target), exist_ok=True)
    print(f'| save audio: {target}')
    save_wav(out, target, hparams['audio_sample_rate'])

    print("OK")

0 commit comments

Comments
 (0)