GitHub Repository: shivamshrirao/diffusers
Path: blob/main/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py
# coding=utf-8
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import random
import unittest

import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

from diffusers import (
    AutoencoderKL,
    DDIMScheduler,
    DPMSolverMultistepScheduler,
    LMSDiscreteScheduler,
    PNDMScheduler,
    StableDiffusionImg2ImgPipeline,
    UNet2DConditionModel,
)
from diffusers.image_processor import VaeImageProcessor
from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device
from diffusers.utils.testing_utils import require_torch_gpu, skip_mps

from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
from ...test_pipelines_common import PipelineTesterMixin


torch.backends.cuda.matmul.allow_tf32 = False


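# Fast tests: run the img2img pipeline end to end on CPU with tiny, randomly
# initialized components, so each test stays cheap and deterministic.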
class StableDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = StableDiffusionImg2ImgPipeline
    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"}
    required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
    batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS

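    # Tiny UNet / VAE / CLIP components with seeded weights; the safety checker
    # and feature extractor are disabled for the fast tests.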
    def get_dummy_components(self):
        torch.manual_seed(0)
        unet = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
        )
        scheduler = PNDMScheduler(skip_prk_steps=True)
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
        )
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
        )
        text_encoder = CLIPTextModel(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        components = {
            "unet": unet,
            "scheduler": scheduler,
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "safety_checker": None,
            "feature_extractor": None,
        }
        return components

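    # Builds a random 32x32 init image (as a torch tensor, NumPy array, or PIL
    # image) plus a seeded generator and the standard call kwargs.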
    def get_dummy_inputs(self, device, seed=0, input_image_type="pt", output_type="np"):
        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)

        if input_image_type == "pt":
            input_image = image
        elif input_image_type == "np":
            input_image = image.cpu().numpy().transpose(0, 2, 3, 1)
        elif input_image_type == "pil":
            input_image = image.cpu().numpy().transpose(0, 2, 3, 1)
            input_image = VaeImageProcessor.numpy_to_pil(input_image)
        else:
            raise ValueError(f"unsupported input_image_type {input_image_type}.")

        if output_type not in ["pt", "np", "pil"]:
            raise ValueError(f"unsupported output_type {output_type}")

        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "image": input_image,
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "output_type": output_type,
        }
        return inputs

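    # Each fast test runs the pipeline for 2 steps on CPU and compares a 3x3
    # slice of the output image against a hard-coded reference slice.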
    def test_stable_diffusion_img2img_default_case(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionImg2ImgPipeline(**components)
        sd_pipe.image_processor = VaeImageProcessor(vae_scale_factor=sd_pipe.vae_scale_factor, do_normalize=False)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 32, 32, 3)
        expected_slice = np.array([0.4492, 0.3865, 0.4222, 0.5854, 0.5139, 0.4379, 0.4193, 0.48, 0.4218])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3

    def test_stable_diffusion_img2img_negative_prompt(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionImg2ImgPipeline(**components)
        sd_pipe.image_processor = VaeImageProcessor(vae_scale_factor=sd_pipe.vae_scale_factor, do_normalize=False)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        negative_prompt = "french fries"
        output = sd_pipe(**inputs, negative_prompt=negative_prompt)
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 32, 32, 3)
        expected_slice = np.array([0.4065, 0.3783, 0.4050, 0.5266, 0.4781, 0.4252, 0.4203, 0.4692, 0.4365])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3

    def test_stable_diffusion_img2img_multiple_init_images(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionImg2ImgPipeline(**components)
        sd_pipe.image_processor = VaeImageProcessor(vae_scale_factor=sd_pipe.vae_scale_factor, do_normalize=False)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        inputs["prompt"] = [inputs["prompt"]] * 2
        inputs["image"] = inputs["image"].repeat(2, 1, 1, 1)
        image = sd_pipe(**inputs).images
        image_slice = image[-1, -3:, -3:, -1]

        assert image.shape == (2, 32, 32, 3)
        expected_slice = np.array([0.5144, 0.4447, 0.4735, 0.6676, 0.5526, 0.5454, 0.645, 0.5149, 0.4689])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3

    def test_stable_diffusion_img2img_k_lms(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        components["scheduler"] = LMSDiscreteScheduler(
            beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"
        )
        sd_pipe = StableDiffusionImg2ImgPipeline(**components)
        sd_pipe.image_processor = VaeImageProcessor(vae_scale_factor=sd_pipe.vae_scale_factor, do_normalize=False)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 32, 32, 3)
        expected_slice = np.array([0.4367, 0.4986, 0.4372, 0.6706, 0.5665, 0.444, 0.5864, 0.6019, 0.5203])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3

    @skip_mps
    def test_save_load_local(self):
        return super().test_save_load_local()

    @skip_mps
    def test_dict_tuple_outputs_equivalent(self):
        return super().test_dict_tuple_outputs_equivalent()

    @skip_mps
    def test_save_load_optional_components(self):
        return super().test_save_load_optional_components()

    @skip_mps
    def test_attention_slicing_forward_pass(self):
        return super().test_attention_slicing_forward_pass()

    @skip_mps
    def test_pt_np_pil_outputs_equivalent(self):
        device = "cpu"
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionImg2ImgPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        output_pt = sd_pipe(**self.get_dummy_inputs(device, output_type="pt"))[0]
        output_np = sd_pipe(**self.get_dummy_inputs(device, output_type="np"))[0]
        output_pil = sd_pipe(**self.get_dummy_inputs(device, output_type="pil"))[0]

        assert np.abs(output_pt.cpu().numpy().transpose(0, 2, 3, 1) - output_np).max() <= 1e-4
        assert np.abs(np.array(output_pil[0]) - (output_np * 255).round()).max() <= 1e-4

    @skip_mps
    def test_image_types_consistent(self):
        device = "cpu"
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionImg2ImgPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        output_pt = sd_pipe(**self.get_dummy_inputs(device, input_image_type="pt"))[0]
        output_np = sd_pipe(**self.get_dummy_inputs(device, input_image_type="np"))[0]
        output_pil = sd_pipe(**self.get_dummy_inputs(device, input_image_type="pil"))[0]

        assert np.abs(output_pt - output_np).max() <= 1e-4
        assert np.abs(output_pil - output_np).max() <= 1e-2


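# Slow tests: download and run the full CompVis/stable-diffusion-v1-4 checkpoint
# for a few denoising steps; the @slow marker keeps them out of the default
# (fast) test run, and @require_torch_gpu skips them without a CUDA device.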
@slow
@require_torch_gpu
class StableDiffusionImg2ImgPipelineSlowTests(unittest.TestCase):
    def tearDown(self):
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
        generator = torch.Generator(device=generator_device).manual_seed(seed)
        init_image = load_image(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_img2img/sketch-mountains-input.png"
        )
        inputs = {
            "prompt": "a fantasy landscape, concept art, high resolution",
            "image": init_image,
            "generator": generator,
            "num_inference_steps": 3,
            "strength": 0.75,
            "guidance_scale": 7.5,
            "output_type": "np",
        }
        return inputs

    def test_stable_diffusion_img2img_default(self):
        pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs(torch_device)
        image = pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 768, 3)
        expected_slice = np.array([0.4300, 0.4662, 0.4930, 0.3990, 0.4307, 0.4525, 0.3719, 0.4064, 0.3923])

        assert np.abs(expected_slice - image_slice).max() < 1e-3

    def test_stable_diffusion_img2img_k_lms(self):
        pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
        pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs(torch_device)
        image = pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 768, 3)
        expected_slice = np.array([0.0389, 0.0346, 0.0415, 0.0290, 0.0218, 0.0210, 0.0408, 0.0567, 0.0271])

        assert np.abs(expected_slice - image_slice).max() < 1e-3

    def test_stable_diffusion_img2img_ddim(self):
        pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs(torch_device)
        image = pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 768, 3)
        expected_slice = np.array([0.0593, 0.0607, 0.0851, 0.0582, 0.0636, 0.0721, 0.0751, 0.0981, 0.0781])

        assert np.abs(expected_slice - image_slice).max() < 1e-3

    def test_stable_diffusion_img2img_intermediate_state(self):
        number_of_steps = 0

        def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None:
            callback_fn.has_been_called = True
            nonlocal number_of_steps
            number_of_steps += 1
            if step == 1:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 96)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array([-0.4958, 0.5107, 1.1045, 2.7539, 4.6680, 3.8320, 1.5049, 1.8633, 2.6523])

                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2
            elif step == 2:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 96)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array([-0.4956, 0.5078, 1.0918, 2.7520, 4.6484, 3.8125, 1.5146, 1.8633, 2.6367])

                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2

        callback_fn.has_been_called = False

        pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
            "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16
        )
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        pipe(**inputs, callback=callback_fn, callback_steps=1)
        assert callback_fn.has_been_called
        assert number_of_steps == 2

    def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
            "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16
        )
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing(1)
        pipe.enable_sequential_cpu_offload()

        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        _ = pipe(**inputs)

        mem_bytes = torch.cuda.max_memory_allocated()
        # make sure that less than 2.2 GB is allocated
        assert mem_bytes < 2.2 * 10**9

    def test_stable_diffusion_pipeline_with_model_offloading(self):
        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        inputs = self.get_inputs(torch_device, dtype=torch.float16)

        # Normal inference

        pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
            "CompVis/stable-diffusion-v1-4",
            safety_checker=None,
            torch_dtype=torch.float16,
        )
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe(**inputs)
        mem_bytes = torch.cuda.max_memory_allocated()

        # With model offloading

        # Reload but don't move to cuda
        pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
            "CompVis/stable-diffusion-v1-4",
            safety_checker=None,
            torch_dtype=torch.float16,
        )

        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        pipe.enable_model_cpu_offload()
        pipe.set_progress_bar_config(disable=None)
        _ = pipe(**inputs)
        mem_bytes_offloaded = torch.cuda.max_memory_allocated()

        assert mem_bytes_offloaded < mem_bytes
        for module in pipe.text_encoder, pipe.unet, pipe.vae:
            assert module.device == torch.device("cpu")

    def test_stable_diffusion_img2img_pipeline_multiple_of_8(self):
        init_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/img2img/sketch-mountains-input.jpg"
        )
        # resize to resolution that is divisible by 8 but not 16 or 32
        init_image = init_image.resize((760, 504))

        model_id = "CompVis/stable-diffusion-v1-4"
        pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
            model_id,
            safety_checker=None,
        )
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        prompt = "A fantasy landscape, trending on artstation"

        generator = torch.manual_seed(0)
        output = pipe(
            prompt=prompt,
            image=init_image,
            strength=0.75,
            guidance_scale=7.5,
            generator=generator,
            output_type="np",
        )
        image = output.images[0]

        image_slice = image[255:258, 383:386, -1]

        assert image.shape == (504, 760, 3)
        expected_slice = np.array([0.9393, 0.9500, 0.9399, 0.9438, 0.9458, 0.9400, 0.9455, 0.9414, 0.9423])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-3


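# Nightly tests: run the full 50-step (30 for DPM) img2img pipelines on GPU and
# compare the whole output image against reference arrays hosted on the Hub.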
@nightly
@require_torch_gpu
class StableDiffusionImg2ImgPipelineNightlyTests(unittest.TestCase):
    def tearDown(self):
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
        generator = torch.Generator(device=generator_device).manual_seed(seed)
        init_image = load_image(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_img2img/sketch-mountains-input.png"
        )
        inputs = {
            "prompt": "a fantasy landscape, concept art, high resolution",
            "image": init_image,
            "generator": generator,
            "num_inference_steps": 50,
            "strength": 0.75,
            "guidance_scale": 7.5,
            "output_type": "np",
        }
        return inputs

    def test_img2img_pndm(self):
        sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
        sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_img2img/stable_diffusion_1_5_pndm.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3

    def test_img2img_ddim(self):
        sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
        sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_img2img/stable_diffusion_1_5_ddim.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3

    def test_img2img_lms(self):
        sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
        sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_img2img/stable_diffusion_1_5_lms.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3

    def test_img2img_dpm(self):
        sd_pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
        sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        inputs["num_inference_steps"] = 30
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_img2img/stable_diffusion_1_5_dpm.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3