Skip to content

Commit c8835da

Browse files
committed
add export singer script
1 parent e1b9d86 commit c8835da

5 files changed

Lines changed: 325 additions & 16 deletions

File tree

modules/fastspeech/pe.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@ def forward(self, x):
2626
:param x: [B, T, 80]
2727
:return: [L, B, T, H], [B, T, H]
2828
"""
29-
padding_mask = x.abs().sum(-1).eq(0).data # [B, T]
29+
# padding_mask = x.abs().sum(-1).eq(0).data # [B, T]
30+
padding_mask = x.abs().sum(-1).eq(0).detach()
3031
nonpadding_mask_TB = 1 - padding_mask.float()[:, None, :] # [B, 1, T]
3132
x = x.transpose(1, 2)
3233
hiddens = []

onnx_export_hifigan.py

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,14 @@
2222
if __name__ == '__main__':
2323

2424
set_hparams(print_hparams=False)
25-
infer_ins = e2e.DiffSingerE2EInfer(hparams)
2625

27-
infer_ins.vocoder.to('cpu')
26+
dev = 'cuda'
27+
28+
infer_ins = e2e.DiffSingerE2EInfer(hparams)
29+
infer_ins.vocoder.to(dev)
2830
with torch.no_grad():
29-
x = torch.rand(1, 80, 100)
30-
f0 = torch.rand(1, 100)
31+
x = torch.rand(1, 80, 968).to(dev)
32+
f0 = torch.rand(1, 968).to(dev)
3133

3234
torch.onnx.export(
3335
infer_ins.vocoder,
@@ -36,25 +38,20 @@
3638
f0
3739
),
3840
"hifigan.onnx",
41+
verbose=True,
3942
input_names=["x", "f0"],
40-
output_names=["y"],
4143
dynamic_axes={
4244
"x": {
43-
0: "hop_size",
44-
1: "win_size",
45-
2: "fft_size",
45+
0: "batch_size",
46+
1: "num_mel_bin",
47+
2: "frames",
4648
},
4749
"f0": {
48-
0: "len",
50+
0: "batch_size",
4951
1: "frames"
50-
},
51-
"y": {
52-
0: "len",
53-
1: "frames",
54-
2: "batch_size"
5552
}
5653
},
57-
opset_version=11
54+
opset_version=11,
5855
)
5956

6057
print("OK")

onnx_export_pe.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
# coding=utf8
2+
3+
import os
4+
import sys
5+
import inference.svs.ds_e2e as e2e
6+
from utils.audio import save_wav
7+
from utils.hparams import set_hparams, hparams
8+
9+
import torch
10+
11+
# Resolve the repository root from this script's own location so the config
# and checkpoint paths below are independent of the current working directory.
root_dir = os.path.dirname(os.path.abspath(__file__))
os.environ['PYTHONPATH'] = f'"{root_dir}"'

# set_hparams() parses command-line flags, so fake argv to point it at the
# OpenCpop e2e config and the pretrained experiment name.
sys.argv = [
    f'{root_dir}/inference/svs/ds_e2e.py',
    '--config',
    f'{root_dir}/usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml',
    '--exp_name',
    '0228_opencpop_ds100_rel',
]
21+
22+
if __name__ == '__main__':
    # Export the pitch extractor (xiaoma PE) to ONNX.
    set_hparams(print_hparams=False)

    dev = 'cuda'

    # Build the end-to-end inference helper and move its PE onto the export device.
    infer_ins = e2e.DiffSingerE2EInfer(hparams)
    infer_ins.pe.to(dev)

    with torch.no_grad():
        # Dummy mel spectrogram used only for tracing: [batch, frames, mel bins].
        dummy_mel = torch.rand(1, 968, 80).to(dev)

        torch.onnx.export(
            infer_ins.pe,
            (
                dummy_mel
            ),
            "xiaoma_pe.onnx",
            verbose=True,
            input_names=["mel_input"],
            # Mark every input axis dynamic so the exported model accepts
            # arbitrary batch sizes / frame counts at inference time.
            dynamic_axes={
                "mel_input": {
                    0: "batch_size",
                    1: "frames",
                    2: "num_mel_bin",
                }
            },
            opset_version=11
        )

    print("OK")

onnx_export_singer.py

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
# coding=utf8
2+
3+
import os
4+
import sys
5+
import inference.svs.ds_e2e as e2e
6+
from modules.fastspeech.pe import PitchExtractor
7+
from usr.diff.shallow_diffusion_tts import GaussianDiffusion
8+
from utils import load_ckpt
9+
from utils.audio import save_wav
10+
from utils.hparams import set_hparams, hparams
11+
12+
import torch
13+
14+
from utils.text_encoder import TokenTextEncoder
15+
from usr.diffsinger_task import DIFF_DECODERS
16+
17+
# Resolve the repository root from this script's own location so the config
# and checkpoint paths below are independent of the current working directory.
root_dir = os.path.dirname(os.path.abspath(__file__))
os.environ['PYTHONPATH'] = f'"{root_dir}"'

# set_hparams() parses command-line flags, so fake argv to point it at the
# OpenCpop e2e config and the pretrained experiment name.
sys.argv = [
    f'{root_dir}/inference/svs/ds_e2e.py',
    '--config',
    f'{root_dir}/usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml',
    '--exp_name',
    '0228_opencpop_ds100_rel',
]
27+
28+
29+
class GaussianDiffusionWrap(GaussianDiffusion):
    """ONNX-export wrapper around GaussianDiffusion.

    torch.onnx.export cannot trace None arguments, so callers pass
    zero-element placeholder tensors for inputs they want to omit. This
    wrapper maps every empty tensor back to None, then delegates to the
    parent forward in inference mode with no reference mels.
    """

    def forward(self, txt_tokens, mel2ph,
                # Wrapped Arguments
                spk_id,
                pitch_midi,
                midi_dur,
                is_slur,
                ):
        # A zero-element tensor is the ONNX-side stand-in for "not provided".
        def _none_if_empty(t):
            return None if torch.numel(t) == 0 else t

        txt_tokens = _none_if_empty(txt_tokens)
        mel2ph = _none_if_empty(mel2ph)
        spk_id = _none_if_empty(spk_id)
        pitch_midi = _none_if_empty(pitch_midi)
        midi_dur = _none_if_empty(midi_dur)
        is_slur = _none_if_empty(is_slur)

        return super().forward(txt_tokens, spk_id=spk_id, ref_mels=None, infer=True,
                               pitch_midi=pitch_midi, midi_dur=midi_dur,
                               is_slur=is_slur, mel2ph=mel2ph)
54+
55+
56+
class DFSInferWrapped(e2e.DiffSingerE2EInfer):
    """DiffSingerE2EInfer variant that builds the export-friendly diffusion wrapper."""

    def build_model(self):
        """Construct GaussianDiffusionWrap from hparams, load its checkpoint,
        and optionally attach a separately trained pitch extractor."""
        model = GaussianDiffusionWrap(
            phone_encoder=self.ph_encoder,
            out_dims=hparams['audio_num_mel_bins'],
            denoise_fn=DIFF_DECODERS[hparams['diff_decoder_type']](hparams),
            timesteps=hparams['timesteps'],
            K_step=hparams['K_step'],
            loss_type=hparams['diff_loss_type'],
            spec_min=hparams['spec_min'],
            spec_max=hparams['spec_max'],
        )

        # Inference only: freeze into eval mode, then restore trained weights.
        model.eval()
        load_ckpt(model, hparams['work_dir'], 'model')

        # When pe_enable is set, F0 is predicted from mel by a dedicated
        # PitchExtractor instead of the diffusion model's own output.
        if hparams.get('pe_enable') is not None and hparams['pe_enable']:
            self.pe = PitchExtractor().to(self.device)
            load_ckpt(self.pe, hparams['pe_ckpt'], 'model', strict=True)
            self.pe.eval()

        return model
76+
77+
if __name__ == '__main__':
    # Sample song used only to drive the tracer: lyrics, per-character note
    # pitches, and per-note durations in seconds ('|' separates characters).
    inp = {
        'text': '小酒窝长睫毛AP是你最美的记号',
        'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
        'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340',
        'input_type': 'word'
    }  # user input: Chinese characters

    set_hparams(print_hparams=False)

    dev = 'cuda'

    infer_ins = DFSInferWrapped(hparams)
    infer_ins.model.to(dev)

    with torch.no_grad():
        inp = infer_ins.preprocess_input(inp, input_type=inp['input_type'] if inp.get('input_type') else 'word')
        sample = infer_ins.input_to_batch(inp)
        txt_tokens = sample['txt_tokens']  # [B, T_t]
        spk_id = sample.get('spk_ids')

        def as_placeholder(t):
            # GaussianDiffusionWrap.forward maps a zero-element tensor back to
            # None, so an EMPTY tensor — not some unrelated tensor — is the
            # correct stand-in for a missing optional input.
            return torch.empty(0).to(dev) if t is None else t.to(dev)

        torch.onnx.export(
            infer_ins.model,
            (
                txt_tokens.to(dev),
                {
                    'spk_id': as_placeholder(spk_id),
                    'pitch_midi': as_placeholder(sample['pitch_midi']),
                    'midi_dur': as_placeholder(sample['midi_dur']),
                    # Fix: these two previously reused spk_id as the dummy
                    # input, so the graph was traced with the wrong shapes and
                    # values for is_slur / mel2ph. Feed the real sample
                    # tensors, falling back to an empty placeholder when the
                    # batch does not provide them.
                    'is_slur': as_placeholder(sample.get('is_slur')),
                    'mel2ph': as_placeholder(sample.get('mel2ph'))
                }
            ),
            "singer.onnx",
            # verbose=True,
            input_names=["txt_tokens", "spk_id",
                         "pitch_midi", "midi_dur", "is_slur", "mel2ph"],
            dynamic_axes={
                # NOTE(review): axis labels "a"/"b" are kept as-is so the
                # exported graph metadata is unchanged; descriptive names
                # (batch/frames) would be clearer.
                "txt_tokens": {
                    0: "a",
                    1: "b",
                },
                "spk_id": {
                    0: "a",
                    1: "b",
                },
                "pitch_midi": {
                    0: "a",
                    1: "b",
                },
                "midi_dur": {
                    0: "a",
                    1: "b",
                },
                "is_slur": {
                    0: "a",
                    1: "b",
                },
                "mel2ph": {
                    0: "a",
                    1: "b",
                }
            },
            opset_version=11
        )

    print("OK")

onnx_test_pe.py

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
# coding=utf8
2+
3+
import os
4+
import sys
5+
import inference.svs.ds_e2e as e2e
6+
from utils.audio import save_wav
7+
from utils.hparams import set_hparams, hparams
8+
9+
import numpy as np
10+
11+
import torch
12+
import onnxruntime as ort
13+
14+
# Resolve the repository root from this script's own location so the config
# and checkpoint paths below are independent of the current working directory.
root_dir = os.path.dirname(os.path.abspath(__file__))
os.environ['PYTHONPATH'] = f'"{root_dir}"'

# set_hparams() parses command-line flags, so fake argv to point it at the
# OpenCpop e2e config and the pretrained experiment name.
sys.argv = [
    f'{root_dir}/inference/svs/ds_e2e.py',
    '--config',
    f'{root_dir}/usr/configs/midi/e2e/opencpop/ds100_adj_rel.yaml',
    '--exp_name',
    '0228_opencpop_ds100_rel',
]
24+
25+
26+
def to_numpy(tensor):
    """Convert a torch tensor to a numpy array on the CPU, detaching it from
    the autograd graph first when it tracks gradients."""
    if tensor.requires_grad:
        tensor = tensor.detach()
    return tensor.cpu().numpy()
28+
29+
30+
class TestHifiganInfer(e2e.DiffSingerE2EInfer):
    """End-to-end smoke test: run DiffSinger inference but replace the torch
    pitch extractor and vocoder with the exported ONNX models, to compare the
    ONNX pipeline against the native one."""

    def __init__(self, hparams, device=None):
        super().__init__(hparams, device)

        # ONNX Runtime sessions for the exported PE and vocoder; the .onnx
        # files are expected in the current working directory.
        self.pe2 = ort.InferenceSession("xiaoma_pe.onnx")
        self.vocoder2 = ort.InferenceSession("hifigan.onnx")

    def run_vocoder(self, c, **kwargs):
        # Mel comes in as [B, T, 80]; the vocoder graph expects [B, 80, T].
        c = c.transpose(2, 1)  # [B, 80, T]
        f0 = kwargs.get('f0')  # [B, T]

        if f0 is not None and hparams.get('use_nsf'):
            ort_inputs = {
                'x': to_numpy(c),
                'f0': to_numpy(f0)
            }
        else:
            # NOTE(review): feeding an empty dict for 'f0' looks suspicious —
            # InferenceSession.run expects ndarray inputs; confirm this branch
            # is ever taken (i.e. use_nsf disabled) before relying on it.
            ort_inputs = {
                'x': to_numpy(c),
                'f0': {}
            }
        # [T]

        ort_out = self.vocoder2.run(None, ort_inputs)
        # First output is the waveform; move it back onto the torch device and
        # add a leading batch dim to match the native vocoder's return shape.
        y = torch.from_numpy(ort_out[0]).to(self.device)

        return y[None]

    def forward_model(self, inp):
        sample = self.input_to_batch(inp)
        txt_tokens = sample['txt_tokens']  # [B, T_t]
        spk_id = sample.get('spk_ids')

        # Debug prints of every model input's shape.
        # NOTE(review): spk_id/is_slur/mel2ph may be None depending on the
        # batch; only the last two are guarded here — confirm spk_id is
        # always present for this config.
        print(txt_tokens.shape)
        print(spk_id.shape)
        print(sample['pitch_midi'].shape)
        print(sample['midi_dur'].shape)
        if (sample['is_slur'] is not None):
            print(sample['is_slur'].shape)
        if (sample['mel2ph'] is not None):
            print(sample['mel2ph'].shape)

        with torch.no_grad():
            # Native (torch) acoustic model produces the mel spectrogram.
            output = self.model(txt_tokens, spk_id=spk_id, ref_mels=None, infer=True,
                                pitch_midi=sample['pitch_midi'], midi_dur=sample['midi_dur'],
                                is_slur=sample['is_slur'], mel2ph=sample['mel2ph'])

            mel_out = output['mel_out']  # [B, T,80]

            if hparams.get('pe_enable') is not None and hparams['pe_enable']:
                # Predict F0 from the predicted mel with the ONNX pitch
                # extractor; output index 1 is assumed to be the denormalized
                # F0 track (matches the torch PE's f0_denorm_pred).
                pe2_res = self.pe2.run(None,
                                       {
                                           'mel_input': to_numpy(mel_out)
                                       }
                                       )

                # pe predict from Pred mel
                f0_pred = torch.from_numpy(pe2_res[1])

            else:
                f0_pred = output['f0_denorm']

        # Run Vocoder (ONNX hifigan) and return a single [T] waveform.
        wav_out = self.run_vocoder(mel_out, f0=f0_pred)
        wav_out = wav_out.cpu().numpy()
        return wav_out[0]
96+
97+
98+
if __name__ == '__main__':
    # Sample song: lyrics, per-character note pitches, and per-note durations
    # in seconds ('|' separates characters).
    c = {
        'text': '小酒窝长睫毛AP是你最美的记号',
        'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4',
        'notes_duration': '0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340',
        'input_type': 'word'
    }  # user input: Chinese characters

    # Output path for the synthesized audio.
    target = "./infer_out/onnx_test_res.wav"

    set_hparams(print_hparams=False)
    infer_ins = TestHifiganInfer(hparams)

    # Run the full pipeline (torch acoustic model + ONNX PE/vocoder) and save.
    out = infer_ins.infer_once(c)
    os.makedirs(os.path.dirname(target), exist_ok=True)
    print(f'| save audio: {target}')
    save_wav(out, target, hparams['audio_sample_rate'])

    # Print the (unused here) torch pitch extractor for manual inspection.
    print(infer_ins.pe)
    print("OK")

0 commit comments

Comments
 (0)