GitHub Repository: shivamshrirao/diffusers
Path: blob/main/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py

# coding=utf-8
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

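# Tests for StableDiffusionImageVariationPipeline: fast CPU smoke tests built
# from tiny dummy models, plus slow/nightly GPU checks against reference
# outputs. A minimal usage sketch of the pipeline under test (assuming network
# access to the public lambdalabs checkpoint; `some_pil_image` is a placeholder):
#
#     pipe = StableDiffusionImageVariationPipeline.from_pretrained(
#         "lambdalabs/sd-image-variations-diffusers"
#     )
#     variation = pipe(image=some_pil_image, num_inference_steps=25).images[0]
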
import gc
import random
import unittest

import numpy as np
import torch
from PIL import Image
from transformers import CLIPImageProcessor, CLIPVisionConfig, CLIPVisionModelWithProjection

from diffusers import (
    AutoencoderKL,
    DPMSolverMultistepScheduler,
    PNDMScheduler,
    StableDiffusionImageVariationPipeline,
    UNet2DConditionModel,
)
from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device
from diffusers.utils.testing_utils import require_torch_gpu

from ...pipeline_params import IMAGE_VARIATION_BATCH_PARAMS, IMAGE_VARIATION_PARAMS
from ...test_pipelines_common import PipelineTesterMixin


torch.backends.cuda.matmul.allow_tf32 = False
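# TF32 matmuls are disabled above so fp32 matrix products are not silently
# rounded on Ampere+ GPUs, keeping the numeric slice checks below reproducible.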


class StableDiffusionImageVariationPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = StableDiffusionImageVariationPipeline
    params = IMAGE_VARIATION_PARAMS
    batch_params = IMAGE_VARIATION_BATCH_PARAMS

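    # Builds a tiny UNet/VAE/CLIP-vision stack (32-channel blocks, 32x32 image
    # encoder) so the fast tests run on CPU in seconds, with seeded weights.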
    def get_dummy_components(self):
        torch.manual_seed(0)
        unet = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
        )
        scheduler = PNDMScheduler(skip_prk_steps=True)
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
        )
        torch.manual_seed(0)
        image_encoder_config = CLIPVisionConfig(
            hidden_size=32,
            projection_dim=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            image_size=32,
            patch_size=4,
        )
        image_encoder = CLIPVisionModelWithProjection(image_encoder_config)
        feature_extractor = CLIPImageProcessor(crop_size=32, size=32)

        components = {
            "unet": unet,
            "scheduler": scheduler,
            "vae": vae,
            "image_encoder": image_encoder,
            "feature_extractor": feature_extractor,
            "safety_checker": None,
        }
        return components

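    # Produces a deterministic 32x32 RGB PIL image plus a seeded generator; the
    # MPS branch falls back to the global torch.manual_seed because older torch
    # builds do not support a per-device torch.Generator on MPS.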
    def get_dummy_inputs(self, device, seed=0):
        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed))
        image = image.cpu().permute(0, 2, 3, 1)[0]
        image = Image.fromarray(np.uint8(image)).convert("RGB").resize((32, 32))
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        inputs = {
            "image": image,
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "output_type": "numpy",
        }
        return inputs

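    # The expected output is 64x64: height/width default to the UNet sample
    # size (32) times the VAE scale factor (2), and a 3x3 corner slice pins
    # the exact pixel values.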
    def test_stable_diffusion_img_variation_default_case(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionImageVariationPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array([0.5167, 0.5746, 0.4835, 0.4914, 0.5605, 0.4691, 0.5201, 0.4898, 0.4958])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3

    def test_stable_diffusion_img_variation_multiple_images(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionImageVariationPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        inputs["image"] = 2 * [inputs["image"]]
        output = sd_pipe(**inputs)

        image = output.images

        image_slice = image[-1, -3:, -3:, -1]

        assert image.shape == (2, 64, 64, 3)
        expected_slice = np.array([0.6568, 0.5470, 0.5684, 0.5444, 0.5945, 0.6221, 0.5508, 0.5531, 0.5263])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3


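# Slow tests load real checkpoints and require a CUDA GPU; they compare 3x3
# image corners and intermediate latents against recorded reference values.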
@slow
@require_torch_gpu
class StableDiffusionImageVariationPipelineSlowTests(unittest.TestCase):
    def tearDown(self):
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

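    # Supplying pre-made numpy latents makes the pipeline skip its own random
    # latent initialization, so outputs are comparable regardless of device RNG.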
    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
        generator = torch.Generator(device=generator_device).manual_seed(seed)
        init_image = load_image(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_imgvar/input_image_vermeer.png"
        )
        latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64))
        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
        inputs = {
            "image": init_image,
            "latents": latents,
            "generator": generator,
            "num_inference_steps": 3,
            "guidance_scale": 7.5,
            "output_type": "numpy",
        }
        return inputs

    def test_stable_diffusion_img_variation_pipeline_default(self):
        sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained(
            "lambdalabs/sd-image-variations-diffusers", safety_checker=None
        )
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.84491, 0.90789, 0.75708, 0.78734, 0.83485, 0.70099, 0.66938, 0.68727, 0.61379])
        assert np.abs(image_slice - expected_slice).max() < 1e-4

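    # The pipeline invokes `callback` with (step, timestep, latents) every
    # `callback_steps` steps, so latents can be checked mid-denoising here.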
    def test_stable_diffusion_img_variation_intermediate_state(self):
        number_of_steps = 0

        def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None:
            callback_fn.has_been_called = True
            nonlocal number_of_steps
            number_of_steps += 1
            if step == 1:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 64)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array(
                    [-0.1621, 0.2837, -0.7979, -0.1221, -1.3057, 0.7681, -2.1191, 0.0464, 1.6309]
                )

                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2
            elif step == 2:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 64)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array([0.6299, 1.7500, 1.1992, -2.1582, -1.8994, 0.7334, -0.7090, 1.0137, 1.5273])

                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2

        callback_fn.has_been_called = False

        pipe = StableDiffusionImageVariationPipeline.from_pretrained(
            "fusing/sd-image-variations-diffusers",
            safety_checker=None,
            torch_dtype=torch.float16,
        )
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        pipe(**inputs, callback=callback_fn, callback_steps=1)
        assert callback_fn.has_been_called
        assert number_of_steps == inputs["num_inference_steps"]

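    # enable_sequential_cpu_offload() keeps submodules on CPU and moves each to
    # the GPU only while it runs, trading speed for a much smaller peak VRAM.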
    def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        model_id = "fusing/sd-image-variations-diffusers"
        pipe = StableDiffusionImageVariationPipeline.from_pretrained(
            model_id, safety_checker=None, torch_dtype=torch.float16
        )
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing(1)
        pipe.enable_sequential_cpu_offload()

        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        _ = pipe(**inputs)

        mem_bytes = torch.cuda.max_memory_allocated()
        # make sure that less than 2.6 GB is allocated
        assert mem_bytes < 2.6 * 10**9


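# Nightly tests run full 25-50 step generations and diff the entire output
# image against reference arrays stored in the diffusers test-arrays dataset.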
@nightly
@require_torch_gpu
class StableDiffusionImageVariationPipelineNightlyTests(unittest.TestCase):
    def tearDown(self):
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
        generator = torch.Generator(device=generator_device).manual_seed(seed)
        init_image = load_image(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_imgvar/input_image_vermeer.png"
        )
        latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64))
        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
        inputs = {
            "image": init_image,
            "latents": latents,
            "generator": generator,
            "num_inference_steps": 50,
            "guidance_scale": 7.5,
            "output_type": "numpy",
        }
        return inputs

    def test_img_variation_pndm(self):
        sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained("fusing/sd-image-variations-diffusers")
        sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_imgvar/lambdalabs_variations_pndm.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3

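    # Schedulers are hot-swappable: DPMSolverMultistepScheduler.from_config()
    # rebuilds the solver from the existing scheduler's config, and the faster
    # solver converges in fewer steps, hence 25 instead of 50 below.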
    def test_img_variation_dpm(self):
        sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained("fusing/sd-image-variations-diffusers")
        sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        inputs["num_inference_steps"] = 25
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_imgvar/lambdalabs_variations_dpm_multi.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3