Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
prophesier
GitHub Repository: prophesier/diff-svc
Path: blob/main/modules/parallel_wavegan/models/source.py
694 views
1
import torch
2
import numpy as np
3
import sys
4
import torch.nn.functional as torch_nn_func
5
6
7
class SineGen(torch.nn.Module):
    """ Sine-waveform generator driven by an F0 contour

    SineGen(samp_rate, harmonic_num = 0,
            sine_amp = 0.1, noise_std = 0.003,
            voiced_threshold = 0,
            flag_for_pulse=False)

    samp_rate: sampling rate in Hz
    harmonic_num: number of harmonic overtones (default 0)
    sine_amp: amplitude of the sine waveform (default 0.1)
    noise_std: std of the Gaussian noise added on voiced steps
               (default 0.003)
    voiced_threshold: F0 threshold for U/V classification (default 0)
    flag_for_pulse: set when this SineGen is used inside PulseGen
                    (default False)

    Note: when flag_for_pulse is True the accumulated phase is reset to
    zero at the last step of every unvoiced run and cos() is used
    instead of sin().
    """

    def __init__(self, samp_rate, harmonic_num=0,
                 sine_amp=0.1, noise_std=0.003,
                 voiced_threshold=0,
                 flag_for_pulse=False):
        super().__init__()
        self.sampling_rate = samp_rate
        self.harmonic_num = harmonic_num
        # one channel for the fundamental plus one per overtone
        self.dim = self.harmonic_num + 1
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.voiced_threshold = voiced_threshold
        self.flag_for_pulse = flag_for_pulse

    def _f02uv(self, f0):
        """ Return a mask shaped like f0: 1 where f0 exceeds the
        voiced threshold, 0 elsewhere
        """
        voiced = f0 > self.voiced_threshold
        return torch.ones_like(f0) * voiced

    def _wrapped_sine(self, phase_step):
        """ sin(2*pi * running phase) for the normal (non-pulse) case

        phase_step: (batchsize, length, dim), phase increment in cycles
        """
        # The running sum is kept small by adding -1 at every step where
        # its fractional part wraps around. Dropping a whole cycle does
        # not change the sine value.
        fractional = torch.cumsum(phase_step, 1) % 1
        wrapped = (fractional[:, 1:, :] - fractional[:, :-1, :]) < 0
        correction = torch.zeros_like(phase_step)
        correction[:, 1:, :] = wrapped * -1.0

        phase = torch.cumsum(phase_step + correction, dim=1)
        return torch.sin(phase * 2 * np.pi)

    def _pulse_cosine(self, f0_values, phase_step):
        """ cos(2*pi * running phase) with the phase restarted at the
        last step of every unvoiced run (used for pulse trains)
        """
        uv = self._f02uv(f0_values)
        # voicing flag of the following step; the final step is treated
        # as if a voiced step followed it
        next_uv = torch.roll(uv, shifts=-1, dims=1)
        next_uv[:, -1, :] = 1
        # unvoiced steps that are immediately followed by a voiced one
        boundary = (uv < 1) * (next_uv > 0)

        running = torch.cumsum(phase_step, dim=1)
        # every batch item has its own set of boundaries
        for item in range(f0_values.shape[0]):
            where = boundary[item, :, 0]
            per_segment = running[item, where, :]
            # phase accumulated between consecutive boundaries
            per_segment[1:, :] = per_segment[1:, :] - per_segment[0:-1, :]
            running[item, :, :] = 0
            running[item, where, :] = per_segment

        # subtracting the per-segment totals at the boundaries brings
        # the running phase back to zero there
        i_phase = torch.cumsum(phase_step - running, dim=1)
        return torch.cos(i_phase * 2 * np.pi)

    def _f02sine(self, f0_values):
        """ f0_values: (batchsize, length, dim)
        where dim indicates fundamental tone and overtones

        returns the unit-amplitude waveforms, same shape as f0_values
        """
        # F0 expressed as a phase increment in cycles per sample; the
        # integer part is dropped because whole cycles do not affect
        # the phase
        phase_step = (f0_values / self.sampling_rate) % 1

        # random initial phase, added at the first time step only
        # (channel 0, the fundamental, gets no offset)
        offset = torch.rand(f0_values.shape[0], f0_values.shape[2],
                            device=f0_values.device)
        offset[:, 0] = 0
        phase_step[:, 0, :] = phase_step[:, 0, :] + offset

        if self.flag_for_pulse:
            return self._pulse_cosine(f0_values, phase_step)
        return self._wrapped_sine(phase_step)

    def forward(self, f0):
        """ sine_tensor, uv, noise = forward(f0)
        input F0: tensor(batchsize=1, length, dim=1)
                  f0 for unvoiced steps should be 0
        output sine_tensor: tensor(batchsize=1, length, dim)
        output uv: tensor(batchsize=1, length, 1)
        output noise: tensor(batchsize=1, length, dim), the additive
                      noise already contained in sine_tensor
        """
        with torch.no_grad():
            harmonics = torch.zeros(f0.shape[0], f0.shape[1], self.dim,
                                    device=f0.device)
            # channel 0 carries the fundamental
            harmonics[:, :, 0] = f0[:, :, 0]
            for overtone in range(self.harmonic_num):
                # channel overtone + 1 holds the (overtone + 2)-th harmonic
                harmonics[:, :, overtone + 1] = (
                    harmonics[:, :, 0] * (overtone + 2))

            sine_waves = self._f02sine(harmonics) * self.sine_amp

            uv = self._f02uv(f0)

            # noise std: noise_std on voiced steps, sine_amp / 3 on
            # unvoiced steps
            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
            noise = noise_amp * torch.randn_like(sine_waves)

            # zero the sines on unvoiced steps, then add the noise
            sine_waves = sine_waves * uv + noise
        return sine_waves, uv, noise
138
139
140
class PulseGen(torch.nn.Module):
    """ Pulse-train generator

    PulseGen(samp_rate, pulse_amp = 0.1,
             noise_std = 0.003, voiced_threshold = 0)

    The pulse train is derived from a noise-free SineGen running with
    flag_for_pulse=True: a pulse is kept wherever that waveform has a
    local maximum inside a voiced segment, and at the first step of
    every voiced segment.
    """
    def __init__(self, samp_rate, pulse_amp = 0.1,
                 noise_std = 0.003, voiced_threshold = 0):
        super().__init__()
        self.sampling_rate = samp_rate
        self.pulse_amp = pulse_amp
        self.noise_std = noise_std
        self.voiced_threshold = voiced_threshold
        # the inner generator adds no noise on voiced steps (noise_std=0)
        self.l_sinegen = SineGen(self.sampling_rate, harmonic_num=0,
                                 sine_amp=self.pulse_amp, noise_std=0,
                                 voiced_threshold=self.voiced_threshold,
                                 flag_for_pulse=True)

    def forward(self, f0):
        """ Pulse train generator
        pulse_train, sine_wav, uv, pulse_noise = forward(f0)
        input F0: tensor(batchsize=1, length, dim=1)
                  f0 for unvoiced steps should be 0
        output pulse_train: tensor(batchsize=1, length, dim)
        output sine_wav: output of the inner SineGen (noise included)
        output uv: tensor(batchsize=1, length, 1)
        output pulse_noise: Gaussian noise scaled by noise_std, same
                            shape as pulse_train

        Note: the inner SineGen does not guarantee that a voiced
        segment starts exactly on a waveform peak, so the first peak
        may not fall on the first voiced step.
        """
        with torch.no_grad():
            sine_wav, uv, noise = self.l_sinegen(f0)

            # waveform with the generator's additive noise removed
            clean = sine_wav - noise

            # neighbours along the time axis via torch.roll; the
            # wrapped-around uv entries are forced to "unvoiced"
            prev_sine = torch.roll(clean, shifts=1, dims=1)
            next_sine = torch.roll(clean, shifts=-1, dims=1)
            prev_uv = torch.roll(uv, shifts=1, dims=1)
            prev_uv[:, 0, :] = 0
            next_uv = torch.roll(uv, shifts=-1, dims=1)
            next_uv[:, -1, :] = 0

            # step t carries a pulse if
            #   it is a strict local maximum and t-1, t, t+1 are voiced
            # or
            #   t is voiced while t-1 is unvoiced (segment onset)
            is_peak = (clean > prev_sine) * (clean > next_sine)
            all_voiced = (prev_uv > 0) * (next_uv > 0) * (uv > 0)
            onset = (prev_uv < 1) * (uv > 0)
            loc = is_peak * all_voiced + onset

            # noise-free pulse train
            pulse_train = clean * loc

            # fresh noise, applied on the pulses and on unvoiced steps
            pulse_noise = torch.randn_like(clean) * self.noise_std
            pulse_train += pulse_noise * loc + pulse_noise * (1 - uv)
        return pulse_train, sine_wav, uv, pulse_noise
203
204
205
class SignalsConv1d(torch.nn.Module):
    """ Filtering input signal with time invariant filter
    Note: FIRFilter conducted filtering given fixed FIR weight
          SignalsConv1d convolves two signals
    Note: this is based on torch.nn.functional.conv1d

    Each channel of the signal is convolved with the matching channel
    of the impulse response (grouped convolution, one group per
    channel). The signal is zero-padded on the left only, so the output
    has the same length as the input.
    """

    def __init__(self):
        super(SignalsConv1d, self).__init__()

    def forward(self, signal, system_ir):
        """ output = forward(signal, system_ir)

        signal:    (batchsize, length1, dim)
        system_ir: (length2, dim)

        output:    (batchsize, length1, dim)

        Prints a diagnostic and calls sys.exit(1) when the last
        dimensions of signal and system_ir differ.
        """
        if signal.shape[-1] != system_ir.shape[-1]:
            # The message describes the shapes this method actually
            # consumes: system_ir is 2-D, its first axis is the length.
            print("Error: SignalsConv1d expects shape:")
            print("signal    (batchsize, length1, dim)")
            print("system_ir (length2, dim)")
            print("But received signal: {:s}".format(str(signal.shape)))
            print(" system_ir: {:s}".format(str(system_ir.shape)))
            sys.exit(1)
        padding_length = system_ir.shape[0] - 1
        groups = signal.shape[-1]

        # (batchsize, dim, length1), zero-padded on the left
        signal_pad = torch_nn_func.pad(signal.permute(0, 2, 1),
                                       (padding_length, 0))
        # impulse response as (dim, 1, length2), flipped along time
        # because conv1d computes a cross-correlation
        ir = torch.flip(system_ir.unsqueeze(1).permute(2, 1, 0),
                        dims=[2])
        # one group per channel, so channels are filtered independently
        output = torch_nn_func.conv1d(signal_pad, ir, groups=groups)
        return output.permute(0, 2, 1)
244
245
246
class CyclicNoiseGen_v1(torch.nn.Module):
    """ CyclicnoiseGen_v1
    Cyclic noise with a single parameter of beta.
    Pytorch v1 implementation assumes f_t is also fixed

    A pulse train from PulseGen is convolved with a short burst of
    exponentially decaying Gaussian noise.
    """

    def __init__(self, samp_rate,
                 noise_std=0.003, voiced_threshold=0):
        super(CyclicNoiseGen_v1, self).__init__()
        self.samp_rate = samp_rate
        self.noise_std = noise_std
        self.voiced_threshold = voiced_threshold

        self.l_pulse = PulseGen(samp_rate, pulse_amp=1.0,
                                noise_std=noise_std,
                                voiced_threshold=voiced_threshold)
        self.l_conv = SignalsConv1d()

    def noise_decay(self, beta, f0mean):
        """ decayed_noise = noise_decay(beta, f0mean)
        decayed_noise =  n[t]exp(-t * f_mean / beta / samp_rate)

        beta: (dim=1) or (batchsize=1, 1, dim=1)
        f0mean: single-element tensor, e.g. (batchsize=1, 1, dim=1)

        decayed_noise (batchsize=1, length, dim=1)
        """
        with torch.no_grad():
            # A beta with fewer than three axes is lifted to
            # (1, 1, dim) so that the shape[0] / shape[2] lookups below
            # are valid for the documented (dim,) form as well.
            if beta.dim() < 3:
                beta = beta.reshape(1, 1, -1)

            # exp(-1.0 n / T) < 0.01 => n > -log(0.01)*T = 4.60*T
            # truncate the noise when decayed by -40 dB
            length = int(4.6 * self.samp_rate / f0mean)
            time_idx = torch.arange(0, length, device=beta.device)
            time_idx = time_idx.unsqueeze(0).unsqueeze(2)
            time_idx = time_idx.repeat(beta.shape[0], 1, beta.shape[2])

            noise = torch.randn(time_idx.shape, device=beta.device)

            # due to Pytorch implementation, use f0_mean as the f0 factor
            decay = torch.exp(-time_idx * f0mean / beta / self.samp_rate)
            return noise * self.noise_std * decay

    def forward(self, f0s, beta):
        """ Produce cyclic noise

        cyc_noise, pulse_train, sine_wav, uv, noise = forward(f0s, beta)
        f0s: F0 contour passed on to PulseGen
        beta: decay parameter passed on to noise_decay
        """
        # pulse train
        pulse_train, sine_wav, uv, noise = self.l_pulse(f0s)
        pure_pulse = pulse_train - noise

        # decayed_noise (length, dim=1)
        if (uv < 1).all():
            # all unvoiced
            cyc_noise = torch.zeros_like(sine_wav)
        else:
            # mean F0 over the voiced steps only
            f0mean = f0s[uv > 0].mean()

            decayed_noise = self.noise_decay(beta, f0mean)[0, :, :]
            # convolute
            cyc_noise = self.l_conv(pure_pulse, decayed_noise)

        # add noise in unvoiced segments
        cyc_noise = cyc_noise + noise * (1.0 - uv)
        return cyc_noise, pulse_train, sine_wav, uv, noise
309
310
311
# NOTE: this definition shadows the earlier SineGen in this module.
class SineGen(torch.nn.Module):
    """ Definition of sine generator
    SineGen(samp_rate, harmonic_num = 0,
            sine_amp = 0.1, noise_std = 0.003,
            voiced_threshold = 0,
            flag_for_pulse=False)

    samp_rate: sampling rate in Hz
    harmonic_num: number of harmonic overtones (default 0)
    sine_amp: amplitude of sine-waveform (default 0.1)
    noise_std: std of Gaussian noise (default 0.003)
    voiced_threshold: F0 threshold for U/V classification (default 0)
    flag_for_pulse: this SineGen is used inside PulseGen (default False)

    Note: with flag_for_pulse=True the waveform is cos() of a phase
    that is brought back to zero on the last step of each unvoiced run.
    """

    def __init__(self, samp_rate, harmonic_num=0,
                 sine_amp=0.1, noise_std=0.003,
                 voiced_threshold=0,
                 flag_for_pulse=False):
        super(SineGen, self).__init__()
        self.sampling_rate = samp_rate
        self.voiced_threshold = voiced_threshold
        self.flag_for_pulse = flag_for_pulse
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.harmonic_num = harmonic_num
        # fundamental + overtones
        self.dim = self.harmonic_num + 1

    def _f02uv(self, f0):
        # voiced/unvoiced flag: 1 where f0 is above the threshold
        flag = torch.ones_like(f0)
        return flag * (f0 > self.voiced_threshold)

    def _f02sine(self, f0_values):
        """ f0_values: (batchsize, length, dim)
        where dim indicates fundamental tone and overtones
        """
        n_batch = f0_values.shape[0]
        n_chan = f0_values.shape[2]

        # F0 as a per-sample phase increment in cycles. Only the
        # fractional part matters because 2 * np.pi * n leaves the
        # phase unchanged.
        cycles = (f0_values / self.sampling_rate) % 1

        # random start phase at t = 0 (none for the fundamental)
        start = torch.rand(n_batch, n_chan, device=f0_values.device)
        start[:, 0] = 0
        cycles[:, 0, :] = cycles[:, 0, :] + start

        if self.flag_for_pulse:
            # Pulse-train mode.
            # Mark the last step of each unvoiced run: the step itself
            # is unvoiced and the next one is voiced. The final step is
            # treated as being followed by a voiced step.
            uv = self._f02uv(f0_values)
            uv_next = torch.roll(uv, shifts=-1, dims=1)
            uv_next[:, -1, :] = 1
            run_end = (uv < 1) * (uv_next > 0)

            acc = torch.cumsum(cycles, dim=1)
            # the marked positions differ from one batch item to another
            for b in range(n_batch):
                marks = run_end[b, :, 0]
                at_marks = acc[b, marks, :]
                # phase gathered since the previous mark
                at_marks[1:, :] = at_marks[1:, :] - at_marks[0:-1, :]
                acc[b, :, :] = 0
                acc[b, marks, :] = at_marks

            # cycles - acc: cancel, at each mark, the phase gathered
            # since the previous mark
            i_phase = torch.cumsum(cycles - acc, dim=1)
            sines = torch.cos(i_phase * 2 * np.pi)
        else:
            # Normal mode.
            # Add -1 on each step where the fractional running sum
            # wraps, which keeps the cumulative sum small without
            # changing the sine: (x-1) * 2*pi and x * 2*pi are the
            # same angle.
            frac = torch.cumsum(cycles, 1) % 1
            wraps = (frac[:, 1:, :] - frac[:, :-1, :]) < 0
            shift = torch.zeros_like(cycles)
            shift[:, 1:, :] = wraps * -1.0

            total = torch.cumsum(cycles + shift, dim=1)
            sines = torch.sin(total * 2 * np.pi)
        return sines

    def forward(self, f0):
        """ sine_tensor, uv, noise = forward(f0)
        input F0: tensor(batchsize=1, length, dim=1)
                  f0 for unvoiced steps should be 0
        output sine_tensor: tensor(batchsize=1, length, dim)
        output uv: tensor(batchsize=1, length, 1)
        output noise: tensor(batchsize=1, length, dim)
        """
        with torch.no_grad():
            f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim,
                                 device=f0.device)
            # fundamental component
            f0_buf[:, :, 0] = f0[:, :, 0]
            # channel h - 1 carries the h-th harmonic
            for h in range(2, self.harmonic_num + 2):
                f0_buf[:, :, h - 1] = f0_buf[:, :, 0] * h

            # sine waveforms scaled to the requested amplitude
            sine_waves = self._f02sine(f0_buf) * self.sine_amp

            uv = self._f02uv(f0)

            # noise level: noise_std where voiced, sine_amp / 3 where
            # unvoiced
            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
            noise = noise_amp * torch.randn_like(sine_waves)

            # mute unvoiced steps first, then add the noise
            sine_waves = sine_waves * uv + noise
        return sine_waves, uv, noise
442
443
444
class SourceModuleCycNoise_v1(torch.nn.Module):
    """ SourceModuleCycNoise_v1
    SourceModule(sampling_rate, noise_std=0.003, voiced_threshod=0)
    sampling_rate: sampling_rate in Hz

    noise_std: std of Gaussian noise (default: 0.003)
    voiced_threshod: threshold to set U/V given F0 (default: 0)
                     (keyword spelling kept as in the signature)

    cyc, noise, uv = SourceModuleCycNoise_v1(F0_upsampled, beta)
    F0_upsampled (batchsize, length, 1)
    beta (1)
    cyc (batchsize, length, 1)
    noise (batchsize, length, 1)
    uv (batchsize, length, 1)
    """

    def __init__(self, sampling_rate, noise_std=0.003, voiced_threshod=0):
        super().__init__()
        self.sampling_rate = sampling_rate
        self.noise_std = noise_std
        # generator of the harmonic-branch excitation
        self.l_cyc_gen = CyclicNoiseGen_v1(sampling_rate, noise_std,
                                           voiced_threshod)

    def forward(self, f0_upsamped, beta):
        """
        cyc, noise, uv = SourceModuleCycNoise_v1(F0, beta)
        F0_upsampled (batchsize, length, 1)
        beta (1)
        cyc (batchsize, length, 1)
        noise (batchsize, length, 1)
        uv (batchsize, length, 1)
        """
        # harmonic branch: only the cyclic noise and the voicing flag
        # are used; the other generator outputs are discarded
        cyc, _, _, uv, _ = self.l_cyc_gen(f0_upsamped, beta)

        # noise branch: Gaussian noise shaped like uv, std noise_std / 3
        noise = torch.randn_like(uv) * self.noise_std / 3
        return cyc, noise, uv
482
483
484
class SourceModuleHnNSF(torch.nn.Module):
    """ SourceModule for hn-nsf
    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
                 add_noise_std=0.003, voiced_threshod=0)
    sampling_rate: sampling_rate in Hz
    harmonic_num: number of harmonics above F0 (default: 0)
    sine_amp: amplitude of sine source signal (default: 0.1)
    add_noise_std: std of additive Gaussian noise (default: 0.003)
        note that amplitude of noise in unvoiced is decided
        by sine_amp
    voiced_threshod: threshold to set U/V given F0 (default: 0)
                     (keyword spelling kept as in the signature)

    Sine_source, noise_source, uv = SourceModuleHnNSF(F0_sampled)
    F0_sampled (batchsize, length, 1)
    Sine_source (batchsize, length, 1)
    noise_source (batchsize, length, 1)
    uv (batchsize, length, 1)
    """

    def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1,
                 add_noise_std=0.003, voiced_threshod=0):
        super().__init__()

        self.sine_amp = sine_amp
        self.noise_std = add_noise_std

        # produces one sine waveform per harmonic
        self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
                                 sine_amp, add_noise_std, voiced_threshod)

        # collapses the harmonic channels into a single excitation
        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = torch.nn.Tanh()

    def forward(self, x):
        """
        Sine_source, noise_source, uv = SourceModuleHnNSF(F0_sampled)
        F0_sampled (batchsize, length, 1)
        Sine_source (batchsize, length, 1)
        noise_source (batchsize, length, 1)
        uv (batchsize, length, 1)
        """
        # harmonic branch: merge all harmonics, then squash with tanh
        sine_wavs, uv, _ = self.l_sin_gen(x)
        merged = self.l_linear(sine_wavs)
        sine_merge = self.l_tanh(merged)

        # noise branch: Gaussian noise shaped like uv, std sine_amp / 3
        noise = torch.randn_like(uv) * self.sine_amp / 3
        return sine_merge, noise, uv
532
533
534
if __name__ == '__main__':
    # Script entry point: builds a 24 kHz cyclic-noise source module and
    # a random (16, 25600, 1) tensor. Neither is used further here.
    source = SourceModuleCycNoise_v1(24000)
    x = torch.randn(16, 25600, 1)
537
538
539
540