Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
prophesier
GitHub Repository: prophesier/diff-svc
Path: blob/main/infer_tools/infer_tool.py
694 views
1
import hashlib
2
import json
3
import os
4
import time
5
from io import BytesIO
6
from pathlib import Path
7
8
import librosa
9
import numpy as np
10
import soundfile
11
import torch
12
13
import utils
14
from modules.fastspeech.pe import PitchExtractor
15
from network.diff.candidate_decoder import FFT
16
from network.diff.diffusion import GaussianDiffusion
17
from network.diff.net import DiffNet
18
from network.vocoders.base_vocoder import VOCODERS, get_vocoder_cls
19
from preprocessing.data_gen_utils import get_pitch_parselmouth, get_pitch_crepe
20
from preprocessing.hubertinfer import Hubertencoder
21
from utils.hparams import hparams, set_hparams
22
from utils.pitch_utils import denorm_f0, norm_interp_f0
23
from modules.diff.diffusion_V2 import GaussianDiffusionOnnx
24
25
# A chunk cache left over from a previous run would serve stale slicing
# results, so delete it at import time.
if os.path.exists("chunks_temp.json"):
    os.remove("chunks_temp.json")
27
28
29
def read_temp(file_name):
    """Load a JSON temp-cache file, creating it if absent.

    Returns the parsed dict (``{}`` on first creation).  When the file
    exceeds 50 MB, cached entries older than 14 days are pruned from the
    returned dict.  On any read/parse error the cache is reset to a fresh
    ``{"info": "temp_dict"}`` marker dict.
    """
    if not os.path.exists(file_name):
        with open(file_name, "w") as f:
            f.write(json.dumps({"info": "temp_dict"}))
        return {}
    try:
        with open(file_name, "r") as f:
            data_dict = json.loads(f.read())
        if os.path.getsize(file_name) > 50 * 1024 * 1024:
            f_name = file_name.split("/")[-1]
            print(f"clean {f_name}")
            for wav_hash in list(data_dict.keys()):
                entry = data_dict[wav_hash]
                # BUG FIX: the "info" marker (a plain string) has no
                # "time" field; indexing it crashed the clean-up and the
                # broad except below then wiped the entire cache.  Skip
                # any entry that is not a timestamped dict.
                if not isinstance(entry, dict) or "time" not in entry:
                    continue
                if int(time.time()) - int(entry["time"]) > 14 * 24 * 3600:
                    del data_dict[wav_hash]
    except Exception as e:
        print(e)
        print(f"{file_name} error,auto rebuild file")
        data_dict = {"info": "temp_dict"}
    return data_dict
50
51
52
# Module-level f0 cache: maps "<md5>_gt" / "<md5>_coarse" keys to
# {"f0": [...], "time": <unix ts>} entries, persisted to disk via write_temp.
f0_dict = read_temp("./infer_tools/f0_temp.json")
53
54
55
def write_temp(file_name, data):
    """Serialize *data* as JSON and overwrite *file_name* with it."""
    serialized = json.dumps(data)
    with open(file_name, "w") as out_file:
        out_file.write(serialized)
58
59
60
def timeit(func):
    """Decorator that prints the wall-clock time of every call to *func*."""
    from functools import wraps

    @wraps(func)  # preserve the wrapped function's __name__/__doc__
    def run(*args, **kwargs):
        t = time.time()
        res = func(*args, **kwargs)
        print('executing \'%s\' costed %.3fs' % (func.__name__, time.time() - t))
        return res

    return run
68
69
70
def format_wav(audio_path):
    """Write a .wav copy of *audio_path* next to it; no-op for .wav input."""
    src = Path(audio_path)
    if src.suffix == '.wav':
        return
    audio_data, sample_rate = librosa.load(audio_path, mono=True, sr=None)
    soundfile.write(src.with_suffix(".wav"), audio_data, sample_rate)
75
76
77
def fill_a_to_b(a, b):
    """Pad list *a* in place with copies of its first element until len(a) >= len(b)."""
    while len(a) < len(b):
        a.append(a[0])
81
82
83
def get_end_file(dir_path, end):
    """Recursively collect files under *dir_path* whose names end with *end*.

    Hidden files and directories (leading '.') are skipped; returned
    paths use forward slashes.
    """
    matches = []
    for root, dirs, files in os.walk(dir_path):
        # prune hidden directories in place so os.walk never descends into them
        dirs[:] = [d for d in dirs if not d.startswith('.')]
        visible = (name for name in files if not name.startswith('.'))
        matches.extend(
            os.path.join(root, name).replace("\\", "/")
            for name in visible
            if name.endswith(end)
        )
    return matches
92
93
94
def mkdir(paths: list):
    """Create each directory in *paths* if it does not already exist.

    Uses ``os.makedirs`` with ``exist_ok=True`` so nested paths work
    (plain ``os.mkdir`` failed when a parent was missing) and the
    check-then-create race of the original is avoided.
    """
    for path in paths:
        os.makedirs(path, exist_ok=True)
98
99
100
def get_md5(content):
    """Return the hexadecimal MD5 digest of *content* (a bytes-like object)."""
    return hashlib.md5(content).hexdigest()
102
103
104
class Svc:
    """End-to-end diff-svc inference wrapper for one trained project.

    Loads the Gaussian-diffusion acoustic model, the hubert content
    encoder, the pitch extractor and the vocoder, and exposes ``infer``
    to convert an input wav (with optional pitch shift) into audio.

    NOTE(review): CUDA is assumed available throughout (unconditional
    ``.cuda()`` calls); there is no CPU fallback in this class.
    """

    def __init__(self, project_name, config_name, hubert_gpu, model_path):
        self.project_name = project_name
        # Factories for the two supported denoiser backbones, keyed by
        # hparams['diff_decoder_type'].
        self.DIFF_DECODERS = {
            'wavenet': lambda hp: DiffNet(hp['audio_num_mel_bins']),
            'fft': lambda hp: FFT(
                hp['hidden_size'], hp['dec_layers'], hp['dec_ffn_kernel_size'], hp['num_heads']),
        }

        self.model_path = model_path
        self.dev = torch.device("cuda")

        # Populates the module-global ``hparams`` dict from the project config;
        # everything below reads from that global.
        self._ = set_hparams(config=config_name, exp_name=self.project_name, infer=True,
                             reset=True,
                             hparams_str='',
                             print_hparams=False)

        self.mel_bins = hparams['audio_num_mel_bins']
        self.model = GaussianDiffusion(
            phone_encoder=Hubertencoder(hparams['hubert_path']),
            out_dims=self.mel_bins, denoise_fn=self.DIFF_DECODERS[hparams['diff_decoder_type']](hparams),
            timesteps=hparams['timesteps'],
            K_step=hparams['K_step'],
            loss_type=hparams['diff_loss_type'],
            spec_min=hparams['spec_min'], spec_max=hparams['spec_max'],
        )
        self.load_ckpt()
        self.model.cuda()
        # Record caller's choice so the hubert encode step knows which device to use.
        hparams['hubert_gpu'] = hubert_gpu
        self.hubert = Hubertencoder(hparams['hubert_path'])
        self.pe = PitchExtractor().cuda()
        utils.load_ckpt(self.pe, hparams['pe_ckpt'], 'model', strict=True)
        self.pe.eval()
        self.vocoder = get_vocoder_cls(hparams)()

    def load_ckpt(self, model_name='model', force=True, strict=True):
        """Load the diffusion model weights from ``self.model_path``."""
        utils.load_ckpt(self.model, self.model_path, model_name, force, strict)

    def infer(self, in_path, key, acc, use_pe=True, use_crepe=True, thre=0.05, singer=False, **kwargs):
        """Run voice conversion on one input.

        in_path: wav path or BytesIO; key: pitch shift in semitones;
        acc: pndm speedup factor; use_pe: re-estimate f0 from the
        predicted mel with the pitch extractor; use_crepe: use crepe
        (vs parselmouth) for source f0; thre: crepe confidence
        threshold; singer: additionally dump mel/f0 .npy files.
        Returns ``(f0_gt, f0_pred, wav_pred)`` via ``after_infer``.
        """
        batch = self.pre(in_path, acc, use_crepe, thre)
        spk_embed = batch.get('spk_embed') if not hparams['use_spk_id'] else batch.get('spk_ids')
        hubert = batch['hubert']
        ref_mels = batch["mels"]
        energy = batch['energy']
        mel2ph = batch['mel2ph']
        # Shift pitch by ``key`` semitones (f0 here is on the normalized
        # log2 scale produced by norm_interp_f0, so +key/12 is one semitone per key).
        batch['f0'] = batch['f0'] + (key / 12)
        # Anything shifted above f0_max is zeroed (treated as unvoiced).
        batch['f0'][batch['f0'] > np.log2(hparams['f0_max'])] = 0
        f0 = batch['f0']
        uv = batch['uv']

        @timeit
        def diff_infer():
            # Forward pass through the diffusion model (timed).
            outputs = self.model(
                hubert.cuda(), spk_embed=spk_embed, mel2ph=mel2ph.cuda(), f0=f0.cuda(), uv=uv.cuda(), energy=energy.cuda(),
                ref_mels=ref_mels.cuda(),
                infer=True, **kwargs)
            return outputs

        outputs = diff_infer()
        batch['outputs'] = self.model.out2mel(outputs['mel_out'])
        batch['mel2ph_pred'] = outputs['mel2ph']
        batch['f0_gt'] = denorm_f0(batch['f0'], batch['uv'], hparams)
        if use_pe:
            # Re-extract f0 from the *predicted* mel for vocoding.
            batch['f0_pred'] = self.pe(outputs['mel_out'])['f0_denorm_pred'].detach()
        else:
            batch['f0_pred'] = outputs.get('f0_denorm')
        return self.after_infer(batch, singer, in_path)

    @timeit
    def after_infer(self, prediction, singer, in_path):
        """Post-process model outputs: move to numpy, strip padding,
        optionally dump .npy files, and vocode.

        Returns ``(f0_gt, f0_pred, wav_pred)``.
        """
        for k, v in prediction.items():
            if type(v) is torch.Tensor:
                prediction[k] = v.cpu().numpy()

        # remove paddings
        mel_gt = prediction["mels"]
        mel_gt_mask = np.abs(mel_gt).sum(-1) > 0

        mel_pred = prediction["outputs"]
        mel_pred_mask = np.abs(mel_pred).sum(-1) > 0
        mel_pred = mel_pred[mel_pred_mask]
        mel_pred = np.clip(mel_pred, hparams['mel_vmin'], hparams['mel_vmax'])

        f0_gt = prediction.get("f0_gt")
        f0_pred = prediction.get("f0_pred")
        if f0_pred is not None:
            f0_gt = f0_gt[mel_gt_mask]
            if len(f0_pred) > len(mel_pred_mask):
                # The pitch extractor may emit extra frames; trim to mel length.
                f0_pred = f0_pred[:len(mel_pred_mask)]
            f0_pred = f0_pred[mel_pred_mask]
            torch.cuda.is_available() and torch.cuda.empty_cache()

        if singer:
            data_path = in_path.replace("batch", "singer_data")
            mel_path = data_path[:-4] + "_mel.npy"
            f0_path = data_path[:-4] + "_f0.npy"
            np.save(mel_path, mel_pred)
            np.save(f0_path, f0_pred)
        wav_pred = self.vocoder.spec2wav(mel_pred, f0=f0_pred)
        return f0_gt, f0_pred, wav_pred

    def temporary_dict2processed_input(self, item_name, temp_dict, use_crepe=True, thre=0.05):
        '''
            process data in temporary_dicts: compute mel spectrogram,
            f0 (crepe or parselmouth), hubert features and the
            mel-to-phone alignment for one item
        '''

        binarization_args = hparams['binarization_args']

        @timeit
        def get_pitch(wav, mel):
            # get ground truth f0 by self.get_pitch_algorithm
            global f0_dict
            if use_crepe:
                # Crepe results are cached in the module-level f0_dict,
                # keyed by the md5 of the raw wav bytes.
                md5 = get_md5(wav)
                if f"{md5}_gt" in f0_dict.keys():
                    print("load temp crepe f0")
                    gt_f0 = np.array(f0_dict[f"{md5}_gt"]["f0"])
                    coarse_f0 = np.array(f0_dict[f"{md5}_coarse"]["f0"])
                else:
                    torch.cuda.is_available() and torch.cuda.empty_cache()
                    gt_f0, coarse_f0 = get_pitch_crepe(wav, mel, hparams, thre)
                    # Persist new results so repeated runs on the same
                    # audio skip the expensive crepe pass.
                    f0_dict[f"{md5}_gt"] = {"f0": gt_f0.tolist(), "time": int(time.time())}
                    f0_dict[f"{md5}_coarse"] = {"f0": coarse_f0.tolist(), "time": int(time.time())}
                    write_temp("./infer_tools/f0_temp.json", f0_dict)
            else:
                gt_f0, coarse_f0 = get_pitch_parselmouth(wav, mel, hparams)
            processed_input['f0'] = gt_f0
            processed_input['pitch'] = coarse_f0

        def get_align(mel, phone_encoded):
            # Distribute hubert frames evenly over mel frames (1-indexed ids;
            # 0 is reserved for padding).
            mel2ph = np.zeros([mel.shape[0]], int)
            start_frame = 0
            ph_durs = mel.shape[0] / phone_encoded.shape[0]
            if hparams['debug']:
                print(mel.shape, phone_encoded.shape, mel.shape[0] / phone_encoded.shape[0])
            for i_ph in range(phone_encoded.shape[0]):
                end_frame = int(i_ph * ph_durs + ph_durs + 0.5)
                mel2ph[start_frame:end_frame + 1] = i_ph + 1
                start_frame = end_frame + 1

            processed_input['mel2ph'] = mel2ph

        if hparams['vocoder'] in VOCODERS:
            wav, mel = VOCODERS[hparams['vocoder']].wav2spec(temp_dict['wav_fn'])
        else:
            wav, mel = VOCODERS[hparams['vocoder'].split('.')[-1]].wav2spec(temp_dict['wav_fn'])
        processed_input = {
            'item_name': item_name, 'mel': mel,
            'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0]
        }
        processed_input = {**temp_dict, **processed_input}  # merge two dicts

        if binarization_args['with_f0']:
            get_pitch(wav, mel)
        if binarization_args['with_hubert']:
            st = time.time()
            hubert_encoded = processed_input['hubert'] = self.hubert.encode(temp_dict['wav_fn'])
            et = time.time()
            dev = 'cuda' if hparams['hubert_gpu'] and torch.cuda.is_available() else 'cpu'
            print(f'hubert (on {dev}) time used {et - st}')

        # NOTE(review): with_align requires with_hubert, otherwise
        # ``hubert_encoded`` is unbound here — confirm configs always set both.
        if binarization_args['with_align']:
            get_align(mel, hubert_encoded)
        return processed_input

    def pre(self, wav_fn, accelerate, use_crepe=True, thre=0.05):
        """Prepare a collated single-item batch from a wav path or BytesIO."""
        if isinstance(wav_fn, BytesIO):
            item_name = self.project_name
        else:
            # Item name is the file's base name without its extension.
            song_info = wav_fn.split('/')
            item_name = song_info[-1].split('.')[-2]
        temp_dict = {'wav_fn': wav_fn, 'spk_id': self.project_name}

        temp_dict = self.temporary_dict2processed_input(item_name, temp_dict, use_crepe, thre)
        hparams['pndm_speedup'] = accelerate
        batch = processed_input2batch([getitem(temp_dict)])
        return batch
279
280
281
def getitem(item):
    """Build one model-input sample (tensors) from a processed-input dict.

    Mel, energy, f0, uv, pitch and mel2ph are truncated to
    hparams['max_frames']; hubert features to hparams['max_input_tokens'].
    """
    frame_cap = hparams['max_frames']
    mel_spec = torch.Tensor(item['mel'])[:frame_cap]
    f0, uv = norm_interp_f0(item["f0"][:frame_cap], hparams)
    sample = {
        "item_name": item['item_name'],
        "hubert": torch.Tensor(item['hubert'][:hparams['max_input_tokens']]),
        "mel": mel_spec,
        "pitch": torch.LongTensor(item.get("pitch"))[:frame_cap],
        # per-frame energy derived from the (log-)mel spectrogram
        "energy": (mel_spec.exp() ** 2).sum(-1).sqrt(),
        "f0": f0,
        "uv": uv,
        "mel2ph": torch.LongTensor(item['mel2ph'])[:frame_cap] if 'mel2ph' in item else None,
        "mel_nonpadding": mel_spec.abs().sum(-1) > 0,
    }
    return sample
301
302
303
def processed_input2batch(samples):
    '''
        Collate a list of getitem() samples into one padded batch dict.
    NOTE:
        the batch size is controlled by hparams['max_sentences']
    '''
    if not samples:
        return {}
    first = samples[0]
    batch = {
        'item_name': [s['item_name'] for s in samples],
        'nsamples': len(samples),
        'hubert': utils.collate_2d([s['hubert'] for s in samples], 0.0),
        'mels': utils.collate_2d([s['mel'] for s in samples], 0.0),
        'mel_lengths': torch.LongTensor([s['mel'].shape[0] for s in samples]),
        'mel2ph': (utils.collate_1d([s['mel2ph'] for s in samples], 0.0)
                   if first['mel2ph'] is not None else None),
        'energy': utils.collate_1d([s['energy'] for s in samples], 0.0),
        'pitch': utils.collate_1d([s['pitch'] for s in samples]),
        'f0': utils.collate_1d([s['f0'] for s in samples], 0.0),
        'uv': utils.collate_1d([s['uv'] for s in samples]),
    }
    return batch
336
337
class SvcOnnx:
    """ONNX-export variant of ``Svc``.

    Setup is identical to ``Svc`` except the acoustic model is
    ``GaussianDiffusionOnnx`` (the ONNX-exportable diffusion wrapper).
    NOTE(review): this duplicates ``Svc.__init__`` almost verbatim —
    keep the two in sync when either changes.
    """

    def __init__(self, project_name, config_name, hubert_gpu, model_path):
        self.project_name = project_name
        # Factories for the two supported denoiser backbones, keyed by
        # hparams['diff_decoder_type'].
        self.DIFF_DECODERS = {
            'wavenet': lambda hp: DiffNet(hp['audio_num_mel_bins']),
            'fft': lambda hp: FFT(
                hp['hidden_size'], hp['dec_layers'], hp['dec_ffn_kernel_size'], hp['num_heads']),
        }

        self.model_path = model_path
        self.dev = torch.device("cuda")

        # Populates the module-global ``hparams`` dict from the project config.
        self._ = set_hparams(config=config_name, exp_name=self.project_name, infer=True,
                             reset=True,
                             hparams_str='',
                             print_hparams=False)

        self.mel_bins = hparams['audio_num_mel_bins']
        self.model = GaussianDiffusionOnnx(
            phone_encoder=Hubertencoder(hparams['hubert_path']),
            out_dims=self.mel_bins, denoise_fn=self.DIFF_DECODERS[hparams['diff_decoder_type']](hparams),
            timesteps=hparams['timesteps'],
            K_step=hparams['K_step'],
            loss_type=hparams['diff_loss_type'],
            spec_min=hparams['spec_min'], spec_max=hparams['spec_max'],
        )
        self.load_ckpt()
        self.model.cuda()
        # Record caller's choice so the hubert encode step knows which device to use.
        hparams['hubert_gpu'] = hubert_gpu
        self.hubert = Hubertencoder(hparams['hubert_path'])
        self.pe = PitchExtractor().cuda()
        utils.load_ckpt(self.pe, hparams['pe_ckpt'], 'model', strict=True)
        self.pe.eval()
        self.vocoder = get_vocoder_cls(hparams)()

    def load_ckpt(self, model_name='model', force=True, strict=True):
        """Load the diffusion model weights from ``self.model_path``."""
        utils.load_ckpt(self.model, self.model_path, model_name, force, strict)
374
375