Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
shivamshrirao
GitHub Repository: shivamshrirao/diffusers
Path: blob/main/tests/pipelines/audio_diffusion/test_audio_diffusion.py
1450 views
1
# coding=utf-8
2
# Copyright 2023 HuggingFace Inc.
3
#
4
# Licensed under the Apache License, Version 2.0 (the "License");
5
# you may not use this file except in compliance with the License.
6
# You may obtain a copy of the License at
7
#
8
# http://www.apache.org/licenses/LICENSE-2.0
9
#
10
# Unless required by applicable law or agreed to in writing, software
11
# distributed under the License is distributed on an "AS IS" BASIS,
12
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
# See the License for the specific language governing permissions and
14
# limitations under the License.
15
16
import gc
17
import unittest
18
19
import numpy as np
20
import torch
21
22
from diffusers import (
23
AudioDiffusionPipeline,
24
AutoencoderKL,
25
DDIMScheduler,
26
DDPMScheduler,
27
DiffusionPipeline,
28
Mel,
29
UNet2DConditionModel,
30
UNet2DModel,
31
)
32
from diffusers.utils import slow, torch_device
33
from diffusers.utils.testing_utils import require_torch_gpu
34
35
36
# Turn off TF32 for CUDA matrix multiplications for the whole test module.
# NOTE(review): presumably this keeps the hard-coded expected pixel values in the
# tests below reproducible on GPUs that would otherwise use TF32 - confirm.
torch.backends.cuda.matmul.allow_tf32 = False
37
38
39
class PipelineFastTests(unittest.TestCase):
    """Tests of ``AudioDiffusionPipeline`` built from small, locally constructed models.

    The ``dummy_*`` properties are factories: every access calls ``torch.manual_seed(0)``
    and constructs brand-new model instances. Read them once into a local and reuse the
    local unless a fresh instance (or the reseeding side effect) is explicitly wanted.
    """

    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    @property
    def dummy_unet(self):
        """Return a new unconditional ``UNet2DModel``; reseeds the global torch RNG with 0."""
        torch.manual_seed(0)
        model = UNet2DModel(
            sample_size=(32, 64),
            in_channels=1,
            out_channels=1,
            layers_per_block=2,
            block_out_channels=(128, 128),
            down_block_types=("AttnDownBlock2D", "DownBlock2D"),
            up_block_types=("UpBlock2D", "AttnUpBlock2D"),
        )
        return model

    @property
    def dummy_unet_condition(self):
        """Return a new ``UNet2DConditionModel``; reseeds the global torch RNG with 0."""
        torch.manual_seed(0)
        model = UNet2DConditionModel(
            sample_size=(64, 32),
            in_channels=1,
            out_channels=1,
            layers_per_block=2,
            block_out_channels=(128, 128),
            down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
            up_block_types=("UpBlock2D", "CrossAttnUpBlock2D"),
            cross_attention_dim=10,
        )
        return model

    @property
    def dummy_vqvae_and_unet(self):
        """Return a new ``(AutoencoderKL, UNet2DModel)`` pair; reseeds the global torch RNG with 0.

        The autoencoder is constructed first, then the UNet, both after a single seed call.
        """
        torch.manual_seed(0)
        vqvae = AutoencoderKL(
            sample_size=(128, 64),
            in_channels=1,
            out_channels=1,
            latent_channels=1,
            layers_per_block=2,
            block_out_channels=(128, 128),
            down_block_types=("DownEncoderBlock2D", "DownEncoderBlock2D"),
            up_block_types=("UpDecoderBlock2D", "UpDecoderBlock2D"),
        )
        unet = UNet2DModel(
            sample_size=(64, 32),
            in_channels=1,
            out_channels=1,
            layers_per_block=2,
            block_out_channels=(128, 128),
            down_block_types=("AttnDownBlock2D", "DownBlock2D"),
            up_block_types=("UpBlock2D", "AttnUpBlock2D"),
        )
        return vqvae, unet

    @slow
    def test_audio_diffusion(self):
        """Run three pipeline configurations on CPU and compare outputs with stored values.

        1. DDPM scheduler, no autoencoder, unconditional UNet (dict and tuple outputs).
        2. DDIM scheduler with autoencoder, starting from ``raw_audio`` at ``start_step=5``.
        3. DDIM scheduler with autoencoder and a conditional UNet driven by ``encoding``.
        """
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        mel = Mel()

        # 1) DDPM, no autoencoder.
        # Build the model once: each `self.dummy_unet` access would construct another one.
        unet = self.dummy_unet
        scheduler = DDPMScheduler()
        pipe = AudioDiffusionPipeline(vqvae=None, unet=unet, mel=mel, scheduler=scheduler)
        pipe = pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        generator = torch.Generator(device=device).manual_seed(42)
        output = pipe(generator=generator, steps=4)
        audio = output.audios[0]
        image = output.images[0]

        # Same seed again, so the tuple output must match the dict output above.
        generator = torch.Generator(device=device).manual_seed(42)
        output = pipe(generator=generator, steps=4, return_dict=False)
        image_from_tuple = output[0][0]

        assert audio.shape == (1, (unet.sample_size[1] - 1) * mel.hop_length)
        assert image.height == unet.sample_size[0] and image.width == unet.sample_size[1]
        image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10]
        image_from_tuple_slice = np.frombuffer(image_from_tuple.tobytes(), dtype="uint8")[:10]
        expected_slice = np.array([69, 255, 255, 255, 0, 0, 77, 181, 12, 127])

        assert np.abs(image_slice.flatten() - expected_slice).max() == 0
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() == 0

        # 2) DDIM with autoencoder, seeded from raw audio.
        # Take both models from a single property evaluation instead of rebuilding the pair.
        scheduler = DDIMScheduler()
        vqvae, unet = self.dummy_vqvae_and_unet
        pipe = AudioDiffusionPipeline(vqvae=vqvae, unet=unet, mel=mel, scheduler=scheduler)
        pipe = pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        np.random.seed(0)
        raw_audio = np.random.uniform(-1, 1, ((vqvae.sample_size[1] - 1) * mel.hop_length,))
        generator = torch.Generator(device=device).manual_seed(42)
        output = pipe(raw_audio=raw_audio, generator=generator, start_step=5, steps=10)
        image = output.images[0]

        assert image.height == vqvae.sample_size[0] and image.width == vqvae.sample_size[1]
        image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10]
        expected_slice = np.array([120, 117, 110, 109, 138, 167, 138, 148, 132, 121])

        assert np.abs(image_slice.flatten() - expected_slice).max() == 0

        # 3) DDIM with autoencoder and a conditional UNet.
        dummy_unet_condition = self.dummy_unet_condition
        # The property is deliberately evaluated again here rather than reusing `vqvae`:
        # it reseeds the global torch RNG and builds both models, which determines the RNG
        # state `torch.rand` draws `encoding` from. Reusing the local would change `encoding`
        # and therefore the expected slice below.
        pipe = AudioDiffusionPipeline(
            vqvae=self.dummy_vqvae_and_unet[0], unet=dummy_unet_condition, mel=mel, scheduler=scheduler
        )

        # NOTE(review): this seeds NumPy only; `torch.rand` below uses torch's global RNG.
        np.random.seed(0)
        encoding = torch.rand((1, 1, 10))
        # `generator` is the one from step 2 and is not reseeded; presumably the expected
        # values were recorded with this carried-over state, so it is left untouched.
        output = pipe(generator=generator, encoding=encoding)
        image = output.images[0]
        image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10]
        expected_slice = np.array([120, 139, 147, 123, 124, 96, 115, 121, 126, 144])

        assert np.abs(image_slice.flatten() - expected_slice).max() == 0
163
164
165
@slow
@require_torch_gpu
class PipelineIntegrationTests(unittest.TestCase):
    """Checks the pretrained ``teticio/audio-diffusion-ddim-256`` pipeline against stored values."""

    def tearDown(self):
        """Run the base tearDown, then collect garbage and empty the CUDA cache."""
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def test_audio_diffusion(self):
        """Generate one sample with seed 42 and compare its shape and first ten image bytes."""
        pipeline = DiffusionPipeline.from_pretrained("teticio/audio-diffusion-ddim-256").to(torch_device)
        pipeline.set_progress_bar_config(disable=None)

        seeded_generator = torch.Generator(device=torch_device).manual_seed(42)
        result = pipeline(generator=seeded_generator)
        generated_audio = result.audios[0]
        generated_image = result.images[0]

        # The audio length and the image dimensions are both derived from the UNet sample size.
        expected_audio_shape = (1, (pipeline.unet.sample_size[1] - 1) * pipeline.mel.hop_length)
        assert generated_audio.shape == expected_audio_shape
        assert generated_image.height == pipeline.unet.sample_size[0]
        assert generated_image.width == pipeline.unet.sample_size[1]

        # Compare the first ten raw bytes of the image with the stored reference.
        leading_bytes = np.frombuffer(generated_image.tobytes(), dtype="uint8")[:10]
        reference_bytes = np.array([151, 167, 154, 144, 122, 134, 121, 105, 70, 26])
        max_deviation = np.abs(leading_bytes.flatten() - reference_bytes).max()

        assert max_deviation == 0
192
193