|
| 1 | +# coding=utf8 |
| 2 | + |
| 3 | +import os |
| 4 | +import sys |
| 5 | +import inference.svs.ds_e2e as e2e |
| 6 | +from utils.audio import save_wav |
| 7 | +from utils.hparams import set_hparams, hparams |
| 8 | + |
| 9 | +import torch |
| 10 | +import onnxruntime as ort |
| 11 | + |
# Absolute directory that holds this script; every path below is built from it.
root_dir = os.path.dirname(os.path.abspath(__file__))

# NOTE(review): the value is stored with literal double quotes around the path.
# That is the existing behaviour and is kept as-is — confirm it is intended.
os.environ.update(PYTHONPATH=f'"{root_dir}"')

# Replace the process arguments with a fixed invocation of ds_e2e.py
# (presumably so that set_hparams() reads the config and experiment name
# from the command line — TODO confirm against utils.hparams).
sys.argv = [
    f'{root_dir}/inference/svs/ds_e2e.py',
    *('--config', f'{root_dir}/usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml'),
    *('--exp_name', '0228_opencpop_ds100_rel'),
]
| 22 | + |
| 23 | + |
def to_numpy(tensor):
    """Return the contents of a torch tensor as a NumPy array on the CPU.

    A tensor that requires grad is detached first, because ``Tensor.numpy()``
    cannot be called on a tensor that is still part of the autograd graph.
    """
    if tensor.requires_grad:
        tensor = tensor.detach()
    return tensor.cpu().numpy()
| 26 | + |
| 27 | + |
class TestHifiganInfer(e2e.DiffSingerE2EInfer):
    """DiffSinger end-to-end inference whose vocoder step runs in ONNX Runtime.

    The acoustic pipeline is inherited unchanged from
    ``e2e.DiffSingerE2EInfer``; only ``run_vocoder`` is overridden so that the
    waveform is produced by an exported ONNX model.
    """

    def __init__(self, hparams, device=None, vocoder_path="hifigan.onnx"):
        """Build the base inferer and open the ONNX vocoder session.

        Args:
            hparams: Hyper-parameter mapping forwarded to the base class.
            device: Device forwarded to the base class.
            vocoder_path: Path of the exported vocoder model. Defaults to
                ``"hifigan.onnx"`` (relative to the working directory), which
                is the path that was previously hard-coded.
        """
        super().__init__(hparams, device)

        self.vocoder2 = ort.InferenceSession(vocoder_path)

    def run_vocoder(self, c, **kwargs):
        """Run the ONNX vocoder on a mel-spectrogram batch.

        Args:
            c: Mel tensor; axes 1 and 2 are swapped before it is fed to the
                model (per the original note the result is ``[B, 80, T]``).
            **kwargs: May contain ``f0`` (per the original note ``[B, T]``).

        Returns:
            The first model output as a torch tensor on ``self.device`` with
            one extra leading axis added.
        """
        c = c.transpose(2, 1)  # [B, 80, T]
        f0 = kwargs.get('f0')  # [B, T]

        # Only feed `f0` when it is really used. The previous code passed an
        # empty dict as the `f0` value in the non-NSF case, which is not a
        # valid ONNX Runtime input and made that branch fail unconditionally.
        ort_inputs = {'x': to_numpy(c)}
        if f0 is not None and hparams.get('use_nsf'):
            ort_inputs['f0'] = to_numpy(f0)

        # Passing None as the output list asks the session for every output.
        ort_out = self.vocoder2.run(None, ort_inputs)
        y = torch.from_numpy(ort_out[0]).to(self.device)

        return y[None]
| 54 | + |
| 55 | + |
if __name__ == '__main__':
    # Destination of the synthesized waveform.
    target = "./infer_out/onnx_test_hifigan_res.wav"

    # user input: Chinese characters
    c = dict(
        text='小酒窝长睫毛AP是你最美的记号',
        notes='C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
        notes_duration='0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340',
        input_type='word',
    )

    # Load the hyper-parameters, then build the ONNX-backed inferer.
    set_hparams(print_hparams=False)
    infer_ins = TestHifiganInfer(hparams)

    out = infer_ins.infer_once(c)

    # Create the output directory only after inference has succeeded.
    out_dir = os.path.dirname(target)
    os.makedirs(out_dir, exist_ok=True)
    print(f'| save audio: {target}')
    save_wav(out, target, hparams['audio_sample_rate'])

    print("OK")
0 commit comments