GitHub Repository: shivamshrirao/diffusers
Path: blob/main/tests/pipelines/stable_diffusion/test_stable_diffusion.py
# coding=utf-8
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import gc
import tempfile
import time
import unittest

import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

from diffusers import (
    AutoencoderKL,
    DDIMScheduler,
    DPMSolverMultistepScheduler,
    EulerAncestralDiscreteScheduler,
    EulerDiscreteScheduler,
    LMSDiscreteScheduler,
    PNDMScheduler,
    StableDiffusionPipeline,
    UNet2DConditionModel,
    logging,
)
from diffusers.models.attention_processor import AttnProcessor
from diffusers.utils import load_numpy, nightly, slow, torch_device
from diffusers.utils.testing_utils import CaptureLogger, require_torch_gpu

from ...models.test_models_unet_2d_condition import create_lora_layers
from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
from ...test_pipelines_common import PipelineTesterMixin


torch.backends.cuda.matmul.allow_tf32 = False

class StableDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = StableDiffusionPipeline
    params = TEXT_TO_IMAGE_PARAMS
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS

    def get_dummy_components(self):
        torch.manual_seed(0)
        unet = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
        )
        scheduler = DDIMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            clip_sample=False,
            set_alpha_to_one=False,
        )
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
        )
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
        )
        text_encoder = CLIPTextModel(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        components = {
            "unet": unet,
            "scheduler": scheduler,
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "safety_checker": None,
            "feature_extractor": None,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "output_type": "numpy",
        }
        return inputs

    def test_stable_diffusion_ddim(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs)
        image = output.images

        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array([0.5643, 0.6017, 0.4799, 0.5267, 0.5584, 0.4641, 0.5159, 0.4963, 0.4791])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_lora(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        # forward 1
        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs)
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        # set lora layers
        lora_attn_procs = create_lora_layers(sd_pipe.unet)
        sd_pipe.unet.set_attn_processor(lora_attn_procs)
        sd_pipe = sd_pipe.to(torch_device)

        # forward 2
        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs, cross_attention_kwargs={"scale": 0.0})
        image = output.images
        image_slice_1 = image[0, -3:, -3:, -1]

        # forward 3
        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs, cross_attention_kwargs={"scale": 0.5})
        image = output.images
        image_slice_2 = image[0, -3:, -3:, -1]

        assert np.abs(image_slice - image_slice_1).max() < 1e-2
        assert np.abs(image_slice - image_slice_2).max() > 1e-2

    def test_stable_diffusion_prompt_embeds(self):
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(torch_device)
        inputs["prompt"] = 3 * [inputs["prompt"]]

        # forward
        output = sd_pipe(**inputs)
        image_slice_1 = output.images[0, -3:, -3:, -1]

        inputs = self.get_dummy_inputs(torch_device)
        prompt = 3 * [inputs.pop("prompt")]

        text_inputs = sd_pipe.tokenizer(
            prompt,
            padding="max_length",
            max_length=sd_pipe.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        )
        text_inputs = text_inputs["input_ids"].to(torch_device)

        prompt_embeds = sd_pipe.text_encoder(text_inputs)[0]

        inputs["prompt_embeds"] = prompt_embeds

        # forward
        output = sd_pipe(**inputs)
        image_slice_2 = output.images[0, -3:, -3:, -1]

        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4

    def test_stable_diffusion_negative_prompt_embeds(self):
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(torch_device)
        negative_prompt = 3 * ["this is a negative prompt"]
        inputs["negative_prompt"] = negative_prompt
        inputs["prompt"] = 3 * [inputs["prompt"]]

        # forward
        output = sd_pipe(**inputs)
        image_slice_1 = output.images[0, -3:, -3:, -1]

        inputs = self.get_dummy_inputs(torch_device)
        prompt = 3 * [inputs.pop("prompt")]

        embeds = []
        for p in [prompt, negative_prompt]:
            text_inputs = sd_pipe.tokenizer(
                p,
                padding="max_length",
                max_length=sd_pipe.tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
            )
            text_inputs = text_inputs["input_ids"].to(torch_device)

            embeds.append(sd_pipe.text_encoder(text_inputs)[0])

        inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds

        # forward
        output = sd_pipe(**inputs)
        image_slice_2 = output.images[0, -3:, -3:, -1]

        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4

    def test_stable_diffusion_ddim_factor_8(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs, height=136, width=136)
        image = output.images

        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 136, 136, 3)
        expected_slice = np.array([0.5524, 0.5626, 0.6069, 0.4727, 0.386, 0.3995, 0.4613, 0.4328, 0.4269])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_pndm(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe.scheduler = PNDMScheduler(skip_prk_steps=True)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs)
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array([0.5094, 0.5674, 0.4667, 0.5125, 0.5696, 0.4674, 0.5277, 0.4964, 0.4945])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_no_safety_checker(self):
        pipe = StableDiffusionPipeline.from_pretrained(
            "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None
        )
        assert isinstance(pipe, StableDiffusionPipeline)
        assert isinstance(pipe.scheduler, LMSDiscreteScheduler)
        assert pipe.safety_checker is None

        image = pipe("example prompt", num_inference_steps=2).images[0]
        assert image is not None

        # check that there's no error when saving a pipeline with one of the models being None
        with tempfile.TemporaryDirectory() as tmpdirname:
            pipe.save_pretrained(tmpdirname)
            pipe = StableDiffusionPipeline.from_pretrained(tmpdirname)

        # sanity check that the pipeline still works
        assert pipe.safety_checker is None
        image = pipe("example prompt", num_inference_steps=2).images[0]
        assert image is not None

    def test_stable_diffusion_k_lms(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs)
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array(
            [
                0.47082293033599854,
                0.5371589064598083,
                0.4562119245529175,
                0.5220914483070374,
                0.5733777284622192,
                0.4795039892196655,
                0.5465868711471558,
                0.5074326395988464,
                0.5042197108268738,
            ]
        )

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_k_euler_ancestral(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs)
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array(
            [
                0.4707113206386566,
                0.5372191071510315,
                0.4563021957874298,
                0.5220003724098206,
                0.5734264850616455,
                0.4794946610927582,
                0.5463782548904419,
                0.5074145197868347,
                0.504422664642334,
            ]
        )

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_k_euler(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs)
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array(
            [
                0.47082313895225525,
                0.5371587872505188,
                0.4562119245529175,
                0.5220913887023926,
                0.5733776688575745,
                0.47950395941734314,
                0.546586811542511,
                0.5074326992034912,
                0.5042197108268738,
            ]
        )

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_vae_slicing(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config)
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        image_count = 4

        inputs = self.get_dummy_inputs(device)
        inputs["prompt"] = [inputs["prompt"]] * image_count
        output_1 = sd_pipe(**inputs)

        # make sure sliced vae decode yields the same result
        sd_pipe.enable_vae_slicing()
        inputs = self.get_dummy_inputs(device)
        inputs["prompt"] = [inputs["prompt"]] * image_count
        output_2 = sd_pipe(**inputs)

        # there is a small discrepancy at image borders vs. full batch decode
        assert np.abs(output_2.images.flatten() - output_1.images.flatten()).max() < 3e-3

    def test_stable_diffusion_vae_tiling(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()

        # the safety checker is not needed for this test
        components["safety_checker"] = None
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"

        # Test that tiled decode at 512x512 yields the same result as the non-tiled decode
        generator = torch.Generator(device=device).manual_seed(0)
        output_1 = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")

        # make sure tiled vae decode yields the same result
        sd_pipe.enable_vae_tiling()
        generator = torch.Generator(device=device).manual_seed(0)
        output_2 = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")

        assert np.abs(output_2.images.flatten() - output_1.images.flatten()).max() < 5e-1

        # test that tiled decode works with various shapes
        shapes = [(1, 4, 73, 97), (1, 4, 97, 73), (1, 4, 49, 65), (1, 4, 65, 49)]
        for shape in shapes:
            zeros = torch.zeros(shape).to(device)
            sd_pipe.vae.decode(zeros)

    def test_stable_diffusion_negative_prompt(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        components["scheduler"] = PNDMScheduler(skip_prk_steps=True)
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        negative_prompt = "french fries"
        output = sd_pipe(**inputs, negative_prompt=negative_prompt)

        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array(
            [
                0.5108221173286438,
                0.5688379406929016,
                0.4685141146183014,
                0.5098261833190918,
                0.5657756328582764,
                0.4631010890007019,
                0.5226285457611084,
                0.49129390716552734,
                0.4899061322212219,
            ]
        )

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_long_prompt(self):
        components = self.get_dummy_components()
        components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config)
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        do_classifier_free_guidance = True
        negative_prompt = None
        num_images_per_prompt = 1
        logger = logging.get_logger("diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion")

        prompt = 25 * "@"
        with CaptureLogger(logger) as cap_logger_3:
            text_embeddings_3 = sd_pipe._encode_prompt(
                prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
            )

        prompt = 100 * "@"
        with CaptureLogger(logger) as cap_logger:
            text_embeddings = sd_pipe._encode_prompt(
                prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
            )

        negative_prompt = "Hello"
        with CaptureLogger(logger) as cap_logger_2:
            text_embeddings_2 = sd_pipe._encode_prompt(
                prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
            )

        assert text_embeddings_3.shape == text_embeddings_2.shape == text_embeddings.shape
        assert text_embeddings.shape[1] == 77

        assert cap_logger.out == cap_logger_2.out
        # 100 - 77 + 1 (BOS token) + 1 (EOS token) = 25
        assert cap_logger.out.count("@") == 25
        assert cap_logger_3.out == ""

    def test_stable_diffusion_height_width_opt(self):
        components = self.get_dummy_components()
        components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config)
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "hey"

        output = sd_pipe(prompt, num_inference_steps=1, output_type="np")
        image_shape = output.images[0].shape[:2]
        assert image_shape == (64, 64)

        output = sd_pipe(prompt, num_inference_steps=1, height=96, width=96, output_type="np")
        image_shape = output.images[0].shape[:2]
        assert image_shape == (96, 96)

        config = dict(sd_pipe.unet.config)
        config["sample_size"] = 96
        sd_pipe.unet = UNet2DConditionModel.from_config(config).to(torch_device)
        output = sd_pipe(prompt, num_inference_steps=1, output_type="np")
        image_shape = output.images[0].shape[:2]
        assert image_shape == (192, 192)


@slow
@require_torch_gpu
class StableDiffusionPipelineSlowTests(unittest.TestCase):
    def tearDown(self):
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
        generator = torch.Generator(device=generator_device).manual_seed(seed)
        latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64))
        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
        inputs = {
            "prompt": "a photograph of an astronaut riding a horse",
            "latents": latents,
            "generator": generator,
            "num_inference_steps": 3,
            "guidance_scale": 7.5,
            "output_type": "numpy",
        }
        return inputs

    def test_stable_diffusion_1_1_pndm(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-1")
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.43625, 0.43554, 0.36670, 0.40660, 0.39703, 0.38658, 0.43936, 0.43557, 0.40592])
        assert np.abs(image_slice - expected_slice).max() < 1e-4

    def test_stable_diffusion_1_4_pndm(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.57400, 0.47841, 0.31625, 0.63583, 0.58306, 0.55056, 0.50825, 0.56306, 0.55748])
        assert np.abs(image_slice - expected_slice).max() < 1e-4

    def test_stable_diffusion_ddim(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
        sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.38019, 0.28647, 0.27321, 0.40377, 0.38290, 0.35446, 0.39218, 0.38165, 0.42239])
        assert np.abs(image_slice - expected_slice).max() < 1e-4

    def test_stable_diffusion_lms(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
        sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.10542, 0.09620, 0.07332, 0.09015, 0.09382, 0.07597, 0.08496, 0.07806, 0.06455])
        assert np.abs(image_slice - expected_slice).max() < 1e-4

    def test_stable_diffusion_dpm(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
        sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.03503, 0.03494, 0.01087, 0.03128, 0.02552, 0.00803, 0.00742, 0.00372, 0.00000])
        assert np.abs(image_slice - expected_slice).max() < 1e-4

    def test_stable_diffusion_attention_slicing(self):
        torch.cuda.reset_peak_memory_stats()
        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        # enable attention slicing
        pipe.enable_attention_slicing()
        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        image_sliced = pipe(**inputs).images

        mem_bytes = torch.cuda.max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()
        # make sure that less than 3.75 GB is allocated
        assert mem_bytes < 3.75 * 10**9

        # disable slicing
        pipe.disable_attention_slicing()
        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        image = pipe(**inputs).images

        # make sure that more than 3.75 GB is allocated
        mem_bytes = torch.cuda.max_memory_allocated()
        assert mem_bytes > 3.75 * 10**9
        assert np.abs(image_sliced - image).max() < 1e-3

    def test_stable_diffusion_vae_slicing(self):
        torch.cuda.reset_peak_memory_stats()
        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        # enable vae slicing
        pipe.enable_vae_slicing()
        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        inputs["prompt"] = [inputs["prompt"]] * 4
        inputs["latents"] = torch.cat([inputs["latents"]] * 4)
        image_sliced = pipe(**inputs).images

        mem_bytes = torch.cuda.max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()
        # make sure that less than 4 GB is allocated
        assert mem_bytes < 4e9

        # disable vae slicing
        pipe.disable_vae_slicing()
        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        inputs["prompt"] = [inputs["prompt"]] * 4
        inputs["latents"] = torch.cat([inputs["latents"]] * 4)
        image = pipe(**inputs).images

        # make sure that more than 4 GB is allocated
        mem_bytes = torch.cuda.max_memory_allocated()
        assert mem_bytes > 4e9
        # There is a small discrepancy at the image borders vs. a fully batched version.
        assert np.abs(image_sliced - image).max() < 1e-2

    def test_stable_diffusion_vae_tiling(self):
        torch.cuda.reset_peak_memory_stats()
        model_id = "CompVis/stable-diffusion-v1-4"
        pipe = StableDiffusionPipeline.from_pretrained(model_id, revision="fp16", torch_dtype=torch.float16)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()
        pipe.unet = pipe.unet.to(memory_format=torch.channels_last)
        pipe.vae = pipe.vae.to(memory_format=torch.channels_last)

        prompt = "a photograph of an astronaut riding a horse"

        # enable vae tiling
        pipe.enable_vae_tiling()
        pipe.enable_model_cpu_offload()
        generator = torch.Generator(device="cpu").manual_seed(0)
        output_chunked = pipe(
            [prompt],
            width=1024,
            height=1024,
            generator=generator,
            guidance_scale=7.5,
            num_inference_steps=2,
            output_type="numpy",
        )
        image_chunked = output_chunked.images

        mem_bytes = torch.cuda.max_memory_allocated()

        # disable vae tiling
        pipe.disable_vae_tiling()
        generator = torch.Generator(device="cpu").manual_seed(0)
        output = pipe(
            [prompt],
            width=1024,
            height=1024,
            generator=generator,
            guidance_scale=7.5,
            num_inference_steps=2,
            output_type="numpy",
        )
        image = output.images

        assert mem_bytes < 1e10
        assert np.abs(image_chunked.flatten() - image.flatten()).max() < 1e-2

    def test_stable_diffusion_fp16_vs_autocast(self):
        # this test makes sure that the original model with autocast
        # and the new model with fp16 yield the same result
        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        image_fp16 = pipe(**inputs).images

        with torch.autocast(torch_device):
            inputs = self.get_inputs(torch_device)
            image_autocast = pipe(**inputs).images

        # Make sure results are close enough
        diff = np.abs(image_fp16.flatten() - image_autocast.flatten())
        # They ARE different since ops are not run always at the same precision
        # however, they should be extremely close.
        assert diff.mean() < 2e-2

    def test_stable_diffusion_intermediate_state(self):
        number_of_steps = 0

        def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None:
            callback_fn.has_been_called = True
            nonlocal number_of_steps
            number_of_steps += 1
            if step == 1:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 64)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array(
                    [-0.5693, -0.3018, -0.9746, 0.0518, -0.8770, 0.7559, -1.7402, 0.1022, 1.1582]
                )

                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2
            elif step == 2:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 64)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array(
                    [-0.1958, -0.2993, -1.0166, -0.5005, -0.4810, 0.6162, -0.9492, 0.6621, 1.4492]
                )

                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2

        callback_fn.has_been_called = False

        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        pipe(**inputs, callback=callback_fn, callback_steps=1)
        assert callback_fn.has_been_called
        assert number_of_steps == inputs["num_inference_steps"]

    def test_stable_diffusion_low_cpu_mem_usage(self):
        pipeline_id = "CompVis/stable-diffusion-v1-4"

        start_time = time.time()
        pipeline_low_cpu_mem_usage = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16)
        pipeline_low_cpu_mem_usage.to(torch_device)
        low_cpu_mem_usage_time = time.time() - start_time

        start_time = time.time()
        _ = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16, low_cpu_mem_usage=False)
        normal_load_time = time.time() - start_time

        assert 2 * low_cpu_mem_usage_time < normal_load_time

    def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing(1)
        pipe.enable_sequential_cpu_offload()

        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        _ = pipe(**inputs)

        mem_bytes = torch.cuda.max_memory_allocated()
        # make sure that less than 2.8 GB is allocated
        assert mem_bytes < 2.8 * 10**9

    def test_stable_diffusion_pipeline_with_model_offloading(self):
        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        inputs = self.get_inputs(torch_device, dtype=torch.float16)

        # Normal inference

        pipe = StableDiffusionPipeline.from_pretrained(
            "CompVis/stable-diffusion-v1-4",
            torch_dtype=torch.float16,
        )
        pipe.unet.set_attn_processor(AttnProcessor())
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        outputs = pipe(**inputs)
        mem_bytes = torch.cuda.max_memory_allocated()

        # With model offloading

        # Reload but don't move to cuda
        pipe = StableDiffusionPipeline.from_pretrained(
            "CompVis/stable-diffusion-v1-4",
            torch_dtype=torch.float16,
        )
        pipe.unet.set_attn_processor(AttnProcessor())

        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        pipe.enable_model_cpu_offload()
        pipe.set_progress_bar_config(disable=None)
        inputs = self.get_inputs(torch_device, dtype=torch.float16)

        outputs_offloaded = pipe(**inputs)
        mem_bytes_offloaded = torch.cuda.max_memory_allocated()

        assert np.abs(outputs.images - outputs_offloaded.images).max() < 1e-3
        assert mem_bytes_offloaded < mem_bytes
        assert mem_bytes_offloaded < 3.5 * 10**9
        for module in pipe.text_encoder, pipe.unet, pipe.vae, pipe.safety_checker:
            assert module.device == torch.device("cpu")

        # With attention slicing
        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        pipe.enable_attention_slicing()
        _ = pipe(**inputs)
        mem_bytes_slicing = torch.cuda.max_memory_allocated()

        assert mem_bytes_slicing < mem_bytes_offloaded
        assert mem_bytes_slicing < 3 * 10**9


@nightly
@require_torch_gpu
class StableDiffusionPipelineNightlyTests(unittest.TestCase):
    def tearDown(self):
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
        generator = torch.Generator(device=generator_device).manual_seed(seed)
        latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64))
        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
        inputs = {
            "prompt": "a photograph of an astronaut riding a horse",
            "latents": latents,
            "generator": generator,
            "num_inference_steps": 50,
            "guidance_scale": 7.5,
            "output_type": "numpy",
        }
        return inputs

    def test_stable_diffusion_1_4_pndm(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_text2img/stable_diffusion_1_4_pndm.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3

    def test_stable_diffusion_1_5_pndm(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5").to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_text2img/stable_diffusion_1_5_pndm.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3

    def test_stable_diffusion_ddim(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device)
        sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_text2img/stable_diffusion_1_4_ddim.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3

    def test_stable_diffusion_lms(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device)
        sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_text2img/stable_diffusion_1_4_lms.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3

    def test_stable_diffusion_euler(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device)
        sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_text2img/stable_diffusion_1_4_euler.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3

    def test_stable_diffusion_dpm(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device)
        sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        inputs["num_inference_steps"] = 25
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_text2img/stable_diffusion_1_4_dpm_multi.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3