Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
prophesier
GitHub Repository: prophesier/diff-svc
Path: blob/main/infer_tools/infer_tool.py
694 views
1
import hashlib
2
import json
3
import os
4
import time
5
from io import BytesIO
6
from pathlib import Path
7
8
import librosa
9
import numpy as np
10
import soundfile
11
import torch
12
13
import utils
14
from modules.fastspeech.pe import PitchExtractor
15
from network.diff.candidate_decoder import FFT
16
from network.diff.diffusion import GaussianDiffusion
17
from network.diff.net import DiffNet
18
from network.vocoders.base_vocoder import VOCODERS, get_vocoder_cls
19
from preprocessing.data_gen_utils import get_pitch_parselmouth, get_pitch_crepe
20
from preprocessing.hubertinfer import Hubertencoder
21
from utils.hparams import hparams, set_hparams
22
from utils.pitch_utils import denorm_f0, norm_interp_f0
23
from modules.diff.diffusion_V2 import GaussianDiffusionOnnx
24
25
# A chunk cache left over from a previous run would serve stale slicing
# results, so delete it at import time.
if os.path.exists("chunks_temp.json"):
    os.remove("chunks_temp.json")
27
28
29
def read_temp(file_name):
    """Load a JSON temp-cache file, creating it if absent.

    Returns the parsed dict (``{}`` on first creation).  When the file
    exceeds 50 MB, cached entries older than 14 days are pruned from the
    returned dict.  On any read/parse error the cache is reset to a fresh
    ``{"info": "temp_dict"}`` marker dict.
    """
    if not os.path.exists(file_name):
        with open(file_name, "w") as f:
            f.write(json.dumps({"info": "temp_dict"}))
        return {}
    try:
        with open(file_name, "r") as f:
            data_dict = json.loads(f.read())
        if os.path.getsize(file_name) > 50 * 1024 * 1024:
            f_name = file_name.split("/")[-1]
            print(f"clean {f_name}")
            for wav_hash in list(data_dict.keys()):
                entry = data_dict[wav_hash]
                # BUG FIX: the "info" marker (a plain string) has no
                # "time" field; indexing it crashed the clean-up and the
                # broad except below then wiped the entire cache.  Skip
                # any entry that is not a timestamped dict.
                if not isinstance(entry, dict) or "time" not in entry:
                    continue
                if int(time.time()) - int(entry["time"]) > 14 * 24 * 3600:
                    del data_dict[wav_hash]
    except Exception as e:
        print(e)
        print(f"{file_name} error,auto rebuild file")
        data_dict = {"info": "temp_dict"}
    return data_dict
50
51
52
# Module-level f0 cache: maps "<md5>_gt" / "<md5>_coarse" keys to
# {"f0": [...], "time": <unix ts>} entries, persisted to disk via write_temp.
f0_dict = read_temp("./infer_tools/f0_temp.json")
53
54
55
def write_temp(file_name, data):
    """Serialize *data* as JSON and overwrite *file_name* with it."""
    serialized = json.dumps(data)
    with open(file_name, "w") as out_file:
        out_file.write(serialized)
58
59
60
def timeit(func):
    """Decorator that prints the wall-clock time of every call to *func*."""
    from functools import wraps

    @wraps(func)  # preserve the wrapped function's __name__/__doc__
    def run(*args, **kwargs):
        t = time.time()
        res = func(*args, **kwargs)
        print('executing \'%s\' costed %.3fs' % (func.__name__, time.time() - t))
        return res

    return run
68
69
70
def format_wav(audio_path):
    """Write a .wav copy of *audio_path* next to it; no-op for .wav input."""
    src = Path(audio_path)
    if src.suffix == '.wav':
        return
    audio_data, sample_rate = librosa.load(audio_path, mono=True, sr=None)
    soundfile.write(src.with_suffix(".wav"), audio_data, sample_rate)
75
76
77
def fill_a_to_b(a, b):
    """Pad list *a* in place with copies of its first element until len(a) >= len(b)."""
    while len(a) < len(b):
        a.append(a[0])
81
82
83
def get_end_file(dir_path, end):
    """Recursively collect files under *dir_path* whose names end with *end*.

    Hidden files and directories (leading '.') are skipped; returned
    paths use forward slashes.
    """
    matches = []
    for root, dirs, files in os.walk(dir_path):
        # prune hidden directories in place so os.walk never descends into them
        dirs[:] = [d for d in dirs if not d.startswith('.')]
        visible = (name for name in files if not name.startswith('.'))
        matches.extend(
            os.path.join(root, name).replace("\\", "/")
            for name in visible
            if name.endswith(end)
        )
    return matches
92
93
94
def mkdir(paths: list):
    """Create each directory in *paths* if it does not already exist.

    Uses ``os.makedirs`` with ``exist_ok=True`` so nested paths work
    (plain ``os.mkdir`` failed when a parent was missing) and the
    check-then-create race of the original is avoided.
    """
    for path in paths:
        os.makedirs(path, exist_ok=True)
98
99
100
def get_md5(content):
    """Return the hexadecimal MD5 digest of *content* (a bytes-like object)."""
    return hashlib.md5(content).hexdigest()
102
103
104
class Svc:
    """End-to-end diff-svc inference wrapper for one trained project.

    Loads the Gaussian-diffusion acoustic model, the hubert content
    encoder, the pitch extractor and the vocoder, and exposes ``infer``
    to convert an input wav (with optional pitch shift) into audio.

    NOTE(review): CUDA is assumed available throughout (unconditional
    ``.cuda()`` calls); there is no CPU fallback in this class.
    """

    def __init__(self, project_name, config_name, hubert_gpu, model_path):
        self.project_name = project_name
        # Factories for the two supported denoiser backbones, keyed by
        # hparams['diff_decoder_type'].
        self.DIFF_DECODERS = {
            'wavenet': lambda hp: DiffNet(hp['audio_num_mel_bins']),
            'fft': lambda hp: FFT(
                hp['hidden_size'], hp['dec_layers'], hp['dec_ffn_kernel_size'], hp['num_heads']),
        }

        self.model_path = model_path
        self.dev = torch.device("cuda")

        # Populates the module-global ``hparams`` dict from the project config;
        # everything below reads from that global.
        self._ = set_hparams(config=config_name, exp_name=self.project_name, infer=True,
                             reset=True,
                             hparams_str='',
                             print_hparams=False)

        self.mel_bins = hparams['audio_num_mel_bins']
        self.model = GaussianDiffusion(
            phone_encoder=Hubertencoder(hparams['hubert_path']),
            out_dims=self.mel_bins, denoise_fn=self.DIFF_DECODERS[hparams['diff_decoder_type']](hparams),
            timesteps=hparams['timesteps'],
            K_step=hparams['K_step'],
            loss_type=hparams['diff_loss_type'],
            spec_min=hparams['spec_min'], spec_max=hparams['spec_max'],
        )
        self.load_ckpt()
        self.model.cuda()
        # Record caller's choice so the hubert encode step knows which device to use.
        hparams['hubert_gpu'] = hubert_gpu
        self.hubert = Hubertencoder(hparams['hubert_path'])
        self.pe = PitchExtractor().cuda()
        utils.load_ckpt(self.pe, hparams['pe_ckpt'], 'model', strict=True)
        self.pe.eval()
        self.vocoder = get_vocoder_cls(hparams)()

    def load_ckpt(self, model_name='model', force=True, strict=True):
        """Load the diffusion model weights from ``self.model_path``."""
        utils.load_ckpt(self.model, self.model_path, model_name, force, strict)

    def infer(self, in_path, key, acc, use_pe=True, use_crepe=True, thre=0.05, singer=False, **kwargs):
        """Run voice conversion on one input.

        in_path: wav path or BytesIO; key: pitch shift in semitones;
        acc: pndm speedup factor; use_pe: re-estimate f0 from the
        predicted mel with the pitch extractor; use_crepe: use crepe
        (vs parselmouth) for source f0; thre: crepe confidence
        threshold; singer: additionally dump mel/f0 .npy files.
        Returns ``(f0_gt, f0_pred, wav_pred)`` via ``after_infer``.
        """
        batch = self.pre(in_path, acc, use_crepe, thre)
        spk_embed = batch.get('spk_embed') if not hparams['use_spk_id'] else batch.get('spk_ids')
        hubert = batch['hubert']
        ref_mels = batch["mels"]
        energy = batch['energy']
        mel2ph = batch['mel2ph']
        # Shift pitch by ``key`` semitones (f0 here is on the normalized
        # log2 scale produced by norm_interp_f0, so +key/12 is one semitone per key).
        batch['f0'] = batch['f0'] + (key / 12)
        # Anything shifted above f0_max is zeroed (treated as unvoiced).
        batch['f0'][batch['f0'] > np.log2(hparams['f0_max'])] = 0
        f0 = batch['f0']
        uv = batch['uv']

        @timeit
        def diff_infer():
            # Forward pass through the diffusion model (timed).
            outputs = self.model(
                hubert.cuda(), spk_embed=spk_embed, mel2ph=mel2ph.cuda(), f0=f0.cuda(), uv=uv.cuda(), energy=energy.cuda(),
                ref_mels=ref_mels.cuda(),
                infer=True, **kwargs)
            return outputs

        outputs = diff_infer()
        batch['outputs'] = self.model.out2mel(outputs['mel_out'])
        batch['mel2ph_pred'] = outputs['mel2ph']
        batch['f0_gt'] = denorm_f0(batch['f0'], batch['uv'], hparams)
        if use_pe:
            # Re-extract f0 from the *predicted* mel for vocoding.
            batch['f0_pred'] = self.pe(outputs['mel_out'])['f0_denorm_pred'].detach()
        else:
            batch['f0_pred'] = outputs.get('f0_denorm')
        return self.after_infer(batch, singer, in_path)

    @timeit
    def after_infer(self, prediction, singer, in_path):
        """Post-process model outputs: move to numpy, strip padding,
        optionally dump .npy files, and vocode.

        Returns ``(f0_gt, f0_pred, wav_pred)``.
        """
        for k, v in prediction.items():
            if type(v) is torch.Tensor:
                prediction[k] = v.cpu().numpy()

        # remove paddings
        mel_gt = prediction["mels"]
        mel_gt_mask = np.abs(mel_gt).sum(-1) > 0

        mel_pred = prediction["outputs"]
        mel_pred_mask = np.abs(mel_pred).sum(-1) > 0
        mel_pred = mel_pred[mel_pred_mask]
        mel_pred = np.clip(mel_pred, hparams['mel_vmin'], hparams['mel_vmax'])

        f0_gt = prediction.get("f0_gt")
        f0_pred = prediction.get("f0_pred")
        if f0_pred is not None:
            f0_gt = f0_gt[mel_gt_mask]
            if len(f0_pred) > len(mel_pred_mask):
                # The pitch extractor may emit extra frames; trim to mel length.
                f0_pred = f0_pred[:len(mel_pred_mask)]
            f0_pred = f0_pred[mel_pred_mask]
            torch.cuda.is_available() and torch.cuda.empty_cache()

        if singer:
            data_path = in_path.replace("batch", "singer_data")
            mel_path = data_path[:-4] + "_mel.npy"
            f0_path = data_path[:-4] + "_f0.npy"
            np.save(mel_path, mel_pred)
            np.save(f0_path, f0_pred)
        wav_pred = self.vocoder.spec2wav(mel_pred, f0=f0_pred)
        return f0_gt, f0_pred, wav_pred

    def temporary_dict2processed_input(self, item_name, temp_dict, use_crepe=True, thre=0.05):
        '''
            process data in temporary_dicts: compute mel spectrogram,
            f0 (crepe or parselmouth), hubert features and the
            mel-to-phone alignment for one item
        '''

        binarization_args = hparams['binarization_args']

        @timeit
        def get_pitch(wav, mel):
            # get ground truth f0 by self.get_pitch_algorithm
            global f0_dict
            if use_crepe:
                # Crepe results are cached in the module-level f0_dict,
                # keyed by the md5 of the raw wav bytes.
                md5 = get_md5(wav)
                if f"{md5}_gt" in f0_dict.keys():
                    print("load temp crepe f0")
                    gt_f0 = np.array(f0_dict[f"{md5}_gt"]["f0"])
                    coarse_f0 = np.array(f0_dict[f"{md5}_coarse"]["f0"])
                else:
                    torch.cuda.is_available() and torch.cuda.empty_cache()
                    gt_f0, coarse_f0 = get_pitch_crepe(wav, mel, hparams, thre)
                    # Persist new results so repeated runs on the same
                    # audio skip the expensive crepe pass.
                    f0_dict[f"{md5}_gt"] = {"f0": gt_f0.tolist(), "time": int(time.time())}
                    f0_dict[f"{md5}_coarse"] = {"f0": coarse_f0.tolist(), "time": int(time.time())}
                    write_temp("./infer_tools/f0_temp.json", f0_dict)
            else:
                gt_f0, coarse_f0 = get_pitch_parselmouth(wav, mel, hparams)
            processed_input['f0'] = gt_f0
            processed_input['pitch'] = coarse_f0

        def get_align(mel, phone_encoded):
            # Distribute hubert frames evenly over mel frames (1-indexed ids;
            # 0 is reserved for padding).
            mel2ph = np.zeros([mel.shape[0]], int)
            start_frame = 0
            ph_durs = mel.shape[0] / phone_encoded.shape[0]
            if hparams['debug']:
                print(mel.shape, phone_encoded.shape, mel.shape[0] / phone_encoded.shape[0])
            for i_ph in range(phone_encoded.shape[0]):
                end_frame = int(i_ph * ph_durs + ph_durs + 0.5)
                mel2ph[start_frame:end_frame + 1] = i_ph + 1
                start_frame = end_frame + 1

            processed_input['mel2ph'] = mel2ph

        if hparams['vocoder'] in VOCODERS:
            wav, mel = VOCODERS[hparams['vocoder']].wav2spec(temp_dict['wav_fn'])
        else:
            wav, mel = VOCODERS[hparams['vocoder'].split('.')[-1]].wav2spec(temp_dict['wav_fn'])
        processed_input = {
            'item_name': item_name, 'mel': mel,
            'sec': len(wav) / hparams['audio_sample_rate'], 'len': mel.shape[0]
        }
        processed_input = {**temp_dict, **processed_input}  # merge two dicts

        if binarization_args['with_f0']:
            get_pitch(wav, mel)
        if binarization_args['with_hubert']:
            st = time.time()
            hubert_encoded = processed_input['hubert'] = self.hubert.encode(temp_dict['wav_fn'])
            et = time.time()
            dev = 'cuda' if hparams['hubert_gpu'] and torch.cuda.is_available() else 'cpu'
            print(f'hubert (on {dev}) time used {et - st}')

        # NOTE(review): with_align requires with_hubert, otherwise
        # ``hubert_encoded`` is unbound here — confirm configs always set both.
        if binarization_args['with_align']:
            get_align(mel, hubert_encoded)
        return processed_input

    def pre(self, wav_fn, accelerate, use_crepe=True, thre=0.05):
        """Prepare a collated single-item batch from a wav path or BytesIO."""
        if isinstance(wav_fn, BytesIO):
            item_name = self.project_name
        else:
            # Item name is the file's base name without its extension.
            song_info = wav_fn.split('/')
            item_name = song_info[-1].split('.')[-2]
        temp_dict = {'wav_fn': wav_fn, 'spk_id': self.project_name}

        temp_dict = self.temporary_dict2processed_input(item_name, temp_dict, use_crepe, thre)
        hparams['pndm_speedup'] = accelerate
        batch = processed_input2batch([getitem(temp_dict)])
        return batch
279
280
281
def getitem(item):
    """Build one model-input sample (tensors) from a processed-input dict.

    Mel, energy, f0, uv, pitch and mel2ph are truncated to
    hparams['max_frames']; hubert features to hparams['max_input_tokens'].
    """
    frame_cap = hparams['max_frames']
    mel_spec = torch.Tensor(item['mel'])[:frame_cap]
    f0, uv = norm_interp_f0(item["f0"][:frame_cap], hparams)
    sample = {
        "item_name": item['item_name'],
        "hubert": torch.Tensor(item['hubert'][:hparams['max_input_tokens']]),
        "mel": mel_spec,
        "pitch": torch.LongTensor(item.get("pitch"))[:frame_cap],
        # per-frame energy derived from the (log-)mel spectrogram
        "energy": (mel_spec.exp() ** 2).sum(-1).sqrt(),
        "f0": f0,
        "uv": uv,
        "mel2ph": torch.LongTensor(item['mel2ph'])[:frame_cap] if 'mel2ph' in item else None,
        "mel_nonpadding": mel_spec.abs().sum(-1) > 0,
    }
    return sample
301
302
303
def processed_input2batch(samples):
    '''
        Collate a list of getitem() samples into one padded batch dict.
    NOTE:
        the batch size is controlled by hparams['max_sentences']
    '''
    if not samples:
        return {}
    first = samples[0]
    batch = {
        'item_name': [s['item_name'] for s in samples],
        'nsamples': len(samples),
        'hubert': utils.collate_2d([s['hubert'] for s in samples], 0.0),
        'mels': utils.collate_2d([s['mel'] for s in samples], 0.0),
        'mel_lengths': torch.LongTensor([s['mel'].shape[0] for s in samples]),
        'mel2ph': (utils.collate_1d([s['mel2ph'] for s in samples], 0.0)
                   if first['mel2ph'] is not None else None),
        'energy': utils.collate_1d([s['energy'] for s in samples], 0.0),
        'pitch': utils.collate_1d([s['pitch'] for s in samples]),
        'f0': utils.collate_1d([s['f0'] for s in samples], 0.0),
        'uv': utils.collate_1d([s['uv'] for s in samples]),
    }
    return batch
336
337
class SvcOnnx:
    """ONNX-export variant of ``Svc``.

    Setup is identical to ``Svc`` except the acoustic model is
    ``GaussianDiffusionOnnx`` (the ONNX-exportable diffusion wrapper).
    NOTE(review): this duplicates ``Svc.__init__`` almost verbatim —
    keep the two in sync when either changes.
    """

    def __init__(self, project_name, config_name, hubert_gpu, model_path):
        self.project_name = project_name
        # Factories for the two supported denoiser backbones, keyed by
        # hparams['diff_decoder_type'].
        self.DIFF_DECODERS = {
            'wavenet': lambda hp: DiffNet(hp['audio_num_mel_bins']),
            'fft': lambda hp: FFT(
                hp['hidden_size'], hp['dec_layers'], hp['dec_ffn_kernel_size'], hp['num_heads']),
        }

        self.model_path = model_path
        self.dev = torch.device("cuda")

        # Populates the module-global ``hparams`` dict from the project config.
        self._ = set_hparams(config=config_name, exp_name=self.project_name, infer=True,
                             reset=True,
                             hparams_str='',
                             print_hparams=False)

        self.mel_bins = hparams['audio_num_mel_bins']
        self.model = GaussianDiffusionOnnx(
            phone_encoder=Hubertencoder(hparams['hubert_path']),
            out_dims=self.mel_bins, denoise_fn=self.DIFF_DECODERS[hparams['diff_decoder_type']](hparams),
            timesteps=hparams['timesteps'],
            K_step=hparams['K_step'],
            loss_type=hparams['diff_loss_type'],
            spec_min=hparams['spec_min'], spec_max=hparams['spec_max'],
        )
        self.load_ckpt()
        self.model.cuda()
        # Record caller's choice so the hubert encode step knows which device to use.
        hparams['hubert_gpu'] = hubert_gpu
        self.hubert = Hubertencoder(hparams['hubert_path'])
        self.pe = PitchExtractor().cuda()
        utils.load_ckpt(self.pe, hparams['pe_ckpt'], 'model', strict=True)
        self.pe.eval()
        self.vocoder = get_vocoder_cls(hparams)()

    def load_ckpt(self, model_name='model', force=True, strict=True):
        """Load the diffusion model weights from ``self.model_path``."""
        utils.load_ckpt(self.model, self.model_path, model_name, force, strict)
374
375