# Path: blob/main/modules/parallel_wavegan/models/source.py
# 694 views
import sys

import numpy as np
import torch
import torch.nn.functional as torch_nn_func


class SineGen(torch.nn.Module):
    """Sine-waveform generator driven by an F0 contour.

    SineGen(samp_rate, harmonic_num=0, sine_amp=0.1, noise_std=0.003,
            voiced_threshold=0, flag_for_pulse=False)

    Args:
        samp_rate: sampling rate in Hz.
        harmonic_num: number of harmonic overtones (default 0).
        sine_amp: amplitude of the sine waveform (default 0.1).
        noise_std: std of the Gaussian noise added in voiced regions
            (default 0.003).
        voiced_threshold: F0 threshold for U/V classification (default 0).
        flag_for_pulse: set when this SineGen is used inside PulseGen
            (default False).

    Note:
        When ``flag_for_pulse`` is True the phase accumulated before each
        voiced segment is removed, and ``cos`` is used instead of ``sin``.
    """

    def __init__(self, samp_rate, harmonic_num=0,
                 sine_amp=0.1, noise_std=0.003,
                 voiced_threshold=0,
                 flag_for_pulse=False):
        super().__init__()
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.harmonic_num = harmonic_num
        # fundamental + overtones
        self.dim = self.harmonic_num + 1
        self.sampling_rate = samp_rate
        self.voiced_threshold = voiced_threshold
        self.flag_for_pulse = flag_for_pulse

    def _f02uv(self, f0):
        """Return the U/V mask: 1 where ``f0 > voiced_threshold``, else 0."""
        uv = torch.ones_like(f0)
        uv = uv * (f0 > self.voiced_threshold)
        return uv

    def _f02sine(self, f0_values):
        """Convert F0 values into unit-amplitude sinusoids.

        Args:
            f0_values: tensor (batchsize, length, dim), where dim indexes the
                fundamental tone and its overtones.

        Returns:
            Tensor with the same shape as ``f0_values``.
        """
        # Convert F0 to phase increments in cycles per sample. The integer
        # part n can be ignored because 2 * pi * n does not affect the phase.
        rad_values = (f0_values / self.sampling_rate) % 1

        # Random initial phase (no phase noise for the fundamental component).
        rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2],
                              device=f0_values.device)
        rand_ini[:, 0] = 0
        rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini

        # Instantaneous phase: sine[t] = sin(2 * pi * sum_{i=1}^{t} rad[i])
        if not self.flag_for_pulse:
            # Normal case.
            # To keep torch.cumsum from growing without bound, add -1 at
            # every step where the running sum wraps past 1. This does not
            # change the sine because (x - 1) * 2 * pi == x * 2 * pi (mod 2pi).
            tmp_over_one = torch.cumsum(rad_values, 1) % 1
            # True at step t (t >= 1) where the wrapped phase decreased.
            tmp_over_one_idx = (tmp_over_one[:, 1:, :]
                                - tmp_over_one[:, :-1, :]) < 0
            cumsum_shift = torch.zeros_like(rad_values)
            cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0

            sines = torch.sin(
                torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi)
        else:
            # Pulse-train case: reset the accumulated phase at the boundary
            # preceding each voiced segment.

            # Identify the last time step of every unvoiced segment (the
            # final step of the sequence is treated as followed by voiced).
            uv = self._f02uv(f0_values)
            uv_1 = torch.roll(uv, shifts=-1, dims=1)
            uv_1[:, -1, :] = 1
            u_loc = (uv < 1) * (uv_1 > 0)

            # Accumulated phase over the whole sequence.
            tmp_cumsum = torch.cumsum(rad_values, dim=1)
            # Each batch item has its own boundary positions, so process
            # them one at a time.
            for idx in range(f0_values.shape[0]):
                temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
                # Phase accumulated between consecutive boundaries.
                temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
                # Keep only the per-segment accumulation, placed at the
                # boundary positions; zero everywhere else.
                tmp_cumsum[idx, :, :] = 0
                tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum

            # Subtracting tmp_cumsum removes the phase accumulated up to
            # each boundary, so the phase restarts afterwards.
            i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)

            sines = torch.cos(i_phase * 2 * np.pi)
        return sines

    def forward(self, f0):
        """Generate the sine source, the U/V mask and the additive noise.

        Args:
            f0: tensor (batchsize, length, 1); F0 of unvoiced steps should
                be 0.

        Returns:
            Tuple ``(sine_waves, uv, noise)`` where ``sine_waves`` and
            ``noise`` are (batchsize, length, dim) and ``uv`` is
            (batchsize, length, 1).
        """
        with torch.no_grad():
            f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim,
                                 device=f0.device)
            # fundamental component
            f0_buf[:, :, 0] = f0[:, :, 0]
            for idx in range(self.harmonic_num):
                # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
                f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (idx + 2)

            # generate sine waveforms
            sine_waves = self._f02sine(f0_buf) * self.sine_amp

            # generate uv signal
            uv = self._f02uv(f0)

            # Noise amplitude: in unvoiced regions std = sine_amp / 3, so
            # the peak is roughly sine_amp; in voiced regions std = noise_std.
            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
            noise = noise_amp * torch.randn_like(sine_waves)

            # First zero the unvoiced part via uv, then add the noise.
            sine_waves = sine_waves * uv + noise
        return sine_waves, uv, noise


class PulseGen(torch.nn.Module):
    """Pulse-train generator built on top of SineGen.

    A pulse is placed at local maxima of a cosine produced by an internal
    SineGen (created with ``flag_for_pulse=True``) and at the first step of
    each voiced segment.

    Args:
        samp_rate: sampling rate in Hz.
        pulse_amp: amplitude passed to the internal SineGen (default 0.1).
        noise_std: std of the Gaussian noise added to the pulse train
            (default 0.003).
        voiced_threshold: F0 threshold for U/V classification (default 0).
    """

    def __init__(self, samp_rate, pulse_amp=0.1,
                 noise_std=0.003, voiced_threshold=0):
        super().__init__()
        self.pulse_amp = pulse_amp
        self.sampling_rate = samp_rate
        self.voiced_threshold = voiced_threshold
        self.noise_std = noise_std
        # noise_std=0 here: noise on the pulse train is added in forward().
        self.l_sinegen = SineGen(self.sampling_rate, harmonic_num=0,
                                 sine_amp=self.pulse_amp, noise_std=0,
                                 voiced_threshold=self.voiced_threshold,
                                 flag_for_pulse=True)

    def forward(self, f0):
        """Generate a pulse train from F0.

        Args:
            f0: tensor (batchsize, length, 1); F0 of unvoiced steps should
                be 0.

        Returns:
            Tuple ``(pulse_train, sine_wav, uv, pulse_noise)``; ``uv`` is
            (batchsize, length, 1), the others follow the SineGen output
            shape.
        """
        with torch.no_grad():
            sine_wav, uv, noise = self.l_sinegen(f0)

            # sine without additive noise
            pure_sine = sine_wav - noise

            # Step t corresponds to a pulse if
            #   sine[t] > sine[t+1] and sine[t] > sine[t-1]
            #   and sine[t-1], sine[t], sine[t+1] are all voiced,
            # or
            #   sine[t] is voiced and sine[t-1] is unvoiced.
            # torch.roll provides sine[t-1] / sine[t+1]; the wrapped-around
            # edge entries of the uv copies are forced to unvoiced.
            sine_1 = torch.roll(pure_sine, shifts=1, dims=1)
            uv_1 = torch.roll(uv, shifts=1, dims=1)
            uv_1[:, 0, :] = 0
            sine_2 = torch.roll(pure_sine, shifts=-1, dims=1)
            uv_2 = torch.roll(uv, shifts=-1, dims=1)
            uv_2[:, -1, :] = 0

            is_peak = ((pure_sine > sine_1) & (pure_sine > sine_2)
                       & (uv_1 > 0) & (uv_2 > 0) & (uv > 0))
            is_onset = (uv_1 < 1) & (uv > 0)
            loc = is_peak | is_onset

            # pulse train without noise
            pulse_train = pure_sine * loc

            # Additive noise for the pulse train (the noise returned by the
            # internal SineGen is zero in voiced regions).
            pulse_noise = torch.randn_like(pure_sine) * self.noise_std

            # Add noise on the pulses and in unvoiced regions.
            pulse_train += pulse_noise * loc + pulse_noise * (1 - uv)
        return pulse_train, sine_wav, uv, pulse_noise


class SignalsConv1d(torch.nn.Module):
    """Causal per-channel convolution of a signal with an impulse response.

    Unlike a fixed-weight FIR filter, both operands are given at call time.
    Implemented with ``torch.nn.functional.conv1d``.
    """

    def __init__(self):
        super().__init__()

    def forward(self, signal, system_ir):
        """Convolve ``signal`` with ``system_ir`` along the time axis.

        Args:
            signal: tensor (batchsize, length1, dim).
            system_ir: tensor (length2, dim).

        Returns:
            Tensor (batchsize, length1, dim).

        Raises:
            ValueError: if the last dimensions of the two inputs differ.
        """
        if signal.shape[-1] != system_ir.shape[-1]:
            raise ValueError(
                "SignalsConv1d expects signal (batchsize, length1, dim) and "
                "system_ir (length2, dim) with matching dim, but received "
                f"signal: {tuple(signal.shape)}, "
                f"system_ir: {tuple(system_ir.shape)}"
            )
        padding_length = system_ir.shape[0] - 1
        groups = signal.shape[-1]

        # Pad the signal on the left so the output is causal and keeps
        # the input length.
        signal_pad = torch_nn_func.pad(signal.permute(0, 2, 1),
                                       (padding_length, 0))
        # Arrange the impulse response as (dim, 1, length2) and flip it in
        # time, because conv1d computes a cross-correlation.
        ir = torch.flip(system_ir.unsqueeze(1).permute(2, 1, 0), dims=[2])
        # One group per channel: each channel uses its own response.
        output = torch_nn_func.conv1d(signal_pad, ir, groups=groups)
        return output.permute(0, 2, 1)


class CyclicNoiseGen_v1(torch.nn.Module):
    """Cyclic-noise generator controlled by a single parameter beta.

    The pulse train from PulseGen is convolved with exponentially decayed
    Gaussian noise. This version uses one mean F0 value for the decay.

    Args:
        samp_rate: sampling rate in Hz.
        noise_std: std of the Gaussian noise (default 0.003).
        voiced_threshold: F0 threshold for U/V classification (default 0).
    """

    def __init__(self, samp_rate,
                 noise_std=0.003, voiced_threshold=0):
        super().__init__()
        self.samp_rate = samp_rate
        self.noise_std = noise_std
        self.voiced_threshold = voiced_threshold

        self.l_pulse = PulseGen(samp_rate, pulse_amp=1.0,
                                noise_std=noise_std,
                                voiced_threshold=voiced_threshold)
        self.l_conv = SignalsConv1d()

    def noise_decay(self, beta, f0mean):
        """Return exponentially decayed Gaussian noise.

        decayed_noise[t] = n[t] * exp(-t * f0mean / beta / samp_rate)

        Args:
            beta: tensor of shape (dim,) or (batchsize, 1, dim); a 0-D or
                1-D tensor is reshaped to (1, 1, dim).
            f0mean: single-element tensor or Python number, mean F0 in Hz.

        Returns:
            Tensor (batchsize, length, dim), where
            ``length = int(4.6 * samp_rate / f0mean)``.
        """
        with torch.no_grad():
            # Accept the documented 1-D form of beta as well as the 3-D one.
            if beta.dim() <= 1:
                beta = beta.reshape(1, 1, -1)

            # exp(-1.0 n / T) < 0.01 => n > -log(0.01) * T = 4.60 * T
            # Truncate the noise once it has decayed by -40 dB.
            length = int(4.6 * self.samp_rate / f0mean)
            time_idx = torch.arange(0, length, device=beta.device)
            time_idx = time_idx.unsqueeze(0).unsqueeze(2)
            time_idx = time_idx.repeat(beta.shape[0], 1, beta.shape[2])

            noise = torch.randn(time_idx.shape, device=beta.device)

            # A single mean F0 is used as the decay factor.
            decay = torch.exp(-time_idx * f0mean / beta / self.samp_rate)
            return noise * self.noise_std * decay

    def forward(self, f0s, beta):
        """Produce cyclic noise.

        Args:
            f0s: tensor (batchsize, length, 1).
            beta: tensor accepted by :meth:`noise_decay`.

        Returns:
            Tuple ``(cyc_noise, pulse_train, sine_wav, uv, noise)``.
        """
        # pulse train
        pulse_train, sine_wav, uv, noise = self.l_pulse(f0s)
        pure_pulse = pulse_train - noise

        if (uv < 1).all():
            # all unvoiced
            cyc_noise = torch.zeros_like(sine_wav)
        else:
            f0mean = f0s[uv > 0].mean()

            # decayed_noise: (length, dim); only the first batch entry is
            # used as the impulse response.
            decayed_noise = self.noise_decay(beta, f0mean)[0, :, :]
            # convolve
            cyc_noise = self.l_conv(pure_pulse, decayed_noise)

        # add noise in unvoiced segments
        cyc_noise = cyc_noise + noise * (1.0 - uv)
        return cyc_noise, pulse_train, sine_wav, uv, noise


class SourceModuleCycNoise_v1(torch.nn.Module):
    """Source module producing cyclic noise and a noise-branch source.

    SourceModuleCycNoise_v1(sampling_rate, noise_std=0.003, voiced_threshod=0)

    Args:
        sampling_rate: sampling rate in Hz.
        noise_std: std of Gaussian noise (default 0.003).
        voiced_threshod: threshold to set U/V given F0 (default 0).

    Usage:
        cyc, noise, uv = SourceModuleCycNoise_v1(...)(F0_upsampled, beta)
        F0_upsampled (batchsize, length, 1)
        beta (1)
        cyc (batchsize, length, 1)
        noise (batchsize, length, 1)
        uv (batchsize, length, 1)
    """

    def __init__(self, sampling_rate, noise_std=0.003, voiced_threshod=0):
        super().__init__()
        self.sampling_rate = sampling_rate
        self.noise_std = noise_std
        self.l_cyc_gen = CyclicNoiseGen_v1(sampling_rate, noise_std,
                                           voiced_threshod)

    def forward(self, f0_upsamped, beta):
        """Return ``(cyc, noise, uv)`` for the given F0 and beta.

        Args:
            f0_upsamped: tensor (batchsize, length, 1).
            beta: tensor accepted by ``CyclicNoiseGen_v1.noise_decay``.

        Returns:
            Tuple ``(cyc, noise, uv)``; ``noise`` has the same shape as
            ``uv``, which is (batchsize, length, 1).
        """
        # source for harmonic branch
        cyc, pulse, sine, uv, add_noi = self.l_cyc_gen(f0_upsamped, beta)

        # source for noise branch, in the same shape as uv
        noise = torch.randn_like(uv) * self.noise_std / 3
        return cyc, noise, uv


class SourceModuleHnNSF(torch.nn.Module):
    """Source module for hn-NSF.

    SourceModuleHnNSF(sampling_rate, harmonic_num=0, sine_amp=0.1,
                      add_noise_std=0.003, voiced_threshod=0)

    Args:
        sampling_rate: sampling rate in Hz.
        harmonic_num: number of harmonics above F0 (default 0).
        sine_amp: amplitude of the sine source signal (default 0.1).
        add_noise_std: std of additive Gaussian noise (default 0.003);
            the noise amplitude in unvoiced regions is decided by sine_amp.
        voiced_threshod: threshold to set U/V given F0 (default 0).

    Usage:
        sine_source, noise_source, uv = SourceModuleHnNSF(...)(F0_sampled)
        F0_sampled (batchsize, length, 1)
        sine_source (batchsize, length, 1)
        noise_source (batchsize, length, 1)
        uv (batchsize, length, 1)
    """

    def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1,
                 add_noise_std=0.003, voiced_threshod=0):
        super().__init__()

        self.sine_amp = sine_amp
        self.noise_std = add_noise_std

        # to produce sine waveforms
        self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
                                 sine_amp, add_noise_std, voiced_threshod)

        # to merge source harmonics into a single excitation
        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = torch.nn.Tanh()

    def forward(self, x):
        """Return ``(sine_merge, noise, uv)`` for the given F0.

        Args:
            x: F0 tensor (batchsize, length, 1).

        Returns:
            Tuple ``(sine_merge, noise, uv)``, each (batchsize, length, 1).
        """
        # source for harmonic branch
        sine_wavs, uv, _ = self.l_sin_gen(x)
        sine_merge = self.l_tanh(self.l_linear(sine_wavs))

        # source for noise branch, in the same shape as uv
        noise = torch.randn_like(uv) * self.sine_amp / 3
        return sine_merge, noise, uv


if __name__ == '__main__':
    source = SourceModuleCycNoise_v1(24000)
    x = torch.randn(16, 25600, 1)