Path: tests/pipelines/stable_diffusion/test_stable_diffusion.py
# coding=utf-8
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import gc
import tempfile
import time
import unittest

import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

from diffusers import (
    AutoencoderKL,
    DDIMScheduler,
    DPMSolverMultistepScheduler,
    EulerAncestralDiscreteScheduler,
    EulerDiscreteScheduler,
    LMSDiscreteScheduler,
    PNDMScheduler,
    StableDiffusionPipeline,
    UNet2DConditionModel,
    logging,
)
from diffusers.models.attention_processor import AttnProcessor
from diffusers.utils import load_numpy, nightly, slow, torch_device
from diffusers.utils.testing_utils import CaptureLogger, require_torch_gpu

from ...models.test_models_unet_2d_condition import create_lora_layers
from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
from ...test_pipelines_common import PipelineTesterMixin


torch.backends.cuda.matmul.allow_tf32 = False


class StableDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = StableDiffusionPipeline
    params = TEXT_TO_IMAGE_PARAMS
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS

    def get_dummy_components(self):
        torch.manual_seed(0)
        unet = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
        )
        scheduler = DDIMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            clip_sample=False,
            set_alpha_to_one=False,
        )
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
        )
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
        )
        text_encoder = CLIPTextModel(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        components = {
            "unet": unet,
            "scheduler": scheduler,
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "safety_checker": None,
            "feature_extractor": None,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "output_type": "numpy",
        }
        return inputs

    def test_stable_diffusion_ddim(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs)
        image = output.images

        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array([0.5643, 0.6017, 0.4799, 0.5267, 0.5584, 0.4641, 0.5159, 0.4963, 0.4791])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_lora(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        # forward 1
        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs)
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        # set lora layers
        lora_attn_procs = create_lora_layers(sd_pipe.unet)
        sd_pipe.unet.set_attn_processor(lora_attn_procs)
        sd_pipe = sd_pipe.to(torch_device)

        # forward 2
        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs, cross_attention_kwargs={"scale": 0.0})
        image = output.images
        image_slice_1 = image[0, -3:, -3:, -1]

        # forward 3
        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs, cross_attention_kwargs={"scale": 0.5})
        image = output.images
        image_slice_2 = image[0, -3:, -3:, -1]

        assert np.abs(image_slice - image_slice_1).max() < 1e-2
        assert np.abs(image_slice - image_slice_2).max() > 1e-2

    def test_stable_diffusion_prompt_embeds(self):
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(torch_device)
        inputs["prompt"] = 3 * [inputs["prompt"]]

        # forward
        output = sd_pipe(**inputs)
        image_slice_1 = output.images[0, -3:, -3:, -1]

        inputs = self.get_dummy_inputs(torch_device)
        prompt = 3 * [inputs.pop("prompt")]

        text_inputs = sd_pipe.tokenizer(
            prompt,
            padding="max_length",
            max_length=sd_pipe.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        )
        text_inputs = text_inputs["input_ids"].to(torch_device)

        prompt_embeds = sd_pipe.text_encoder(text_inputs)[0]

        inputs["prompt_embeds"] = prompt_embeds

        # forward
        output = sd_pipe(**inputs)
        image_slice_2 = output.images[0, -3:, -3:, -1]

        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4

    def test_stable_diffusion_negative_prompt_embeds(self):
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(torch_device)
        negative_prompt = 3 * ["this is a negative prompt"]
        inputs["negative_prompt"] = negative_prompt
        inputs["prompt"] = 3 * [inputs["prompt"]]

        # forward
        output = sd_pipe(**inputs)
        image_slice_1 = output.images[0, -3:, -3:, -1]

        inputs = self.get_dummy_inputs(torch_device)
        prompt = 3 * [inputs.pop("prompt")]

        embeds = []
        for p in [prompt, negative_prompt]:
            text_inputs = sd_pipe.tokenizer(
                p,
                padding="max_length",
                max_length=sd_pipe.tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
            )
            text_inputs = text_inputs["input_ids"].to(torch_device)

            embeds.append(sd_pipe.text_encoder(text_inputs)[0])

        inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds

        # forward
        output = sd_pipe(**inputs)
        image_slice_2 = output.images[0, -3:, -3:, -1]

        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4

    def test_stable_diffusion_ddim_factor_8(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs, height=136, width=136)
        image = output.images

        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 136, 136, 3)
        expected_slice = np.array([0.5524, 0.5626, 0.6069, 0.4727, 0.386, 0.3995, 0.4613, 0.4328, 0.4269])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_pndm(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe.scheduler = PNDMScheduler(skip_prk_steps=True)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs)
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array([0.5094, 0.5674, 0.4667, 0.5125, 0.5696, 0.4674, 0.5277, 0.4964, 0.4945])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_no_safety_checker(self):
        pipe = StableDiffusionPipeline.from_pretrained(
            "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None
        )
        assert isinstance(pipe, StableDiffusionPipeline)
        assert isinstance(pipe.scheduler, LMSDiscreteScheduler)
        assert pipe.safety_checker is None

        image = pipe("example prompt", num_inference_steps=2).images[0]
        assert image is not None

        # check that there's no error when saving a pipeline with one of the models being None
        with tempfile.TemporaryDirectory() as tmpdirname:
            pipe.save_pretrained(tmpdirname)
            pipe = StableDiffusionPipeline.from_pretrained(tmpdirname)

        # sanity check that the pipeline still works
        assert pipe.safety_checker is None
        image = pipe("example prompt", num_inference_steps=2).images[0]
        assert image is not None

    def test_stable_diffusion_k_lms(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs)
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array(
            [
                0.47082293033599854,
                0.5371589064598083,
                0.4562119245529175,
                0.5220914483070374,
                0.5733777284622192,
                0.4795039892196655,
                0.5465868711471558,
                0.5074326395988464,
                0.5042197108268738,
            ]
        )

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_k_euler_ancestral(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs)
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array(
            [
                0.4707113206386566,
                0.5372191071510315,
                0.4563021957874298,
                0.5220003724098206,
                0.5734264850616455,
                0.4794946610927582,
                0.5463782548904419,
                0.5074145197868347,
                0.504422664642334,
            ]
        )

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_k_euler(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs)
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array(
            [
                0.47082313895225525,
                0.5371587872505188,
                0.4562119245529175,
                0.5220913887023926,
                0.5733776688575745,
                0.47950395941734314,
                0.546586811542511,
                0.5074326992034912,
                0.5042197108268738,
            ]
        )

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_vae_slicing(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config)
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        image_count = 4

        inputs = self.get_dummy_inputs(device)
        inputs["prompt"] = [inputs["prompt"]] * image_count
        output_1 = sd_pipe(**inputs)

        # make sure sliced vae decode yields the same result
        sd_pipe.enable_vae_slicing()
        inputs = self.get_dummy_inputs(device)
        inputs["prompt"] = [inputs["prompt"]] * image_count
        output_2 = sd_pipe(**inputs)

        # there is a small discrepancy at image borders vs. full batch decode
        assert np.abs(output_2.images.flatten() - output_1.images.flatten()).max() < 3e-3

    def test_stable_diffusion_vae_tiling(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()

        # make sure the pipeline runs without a safety checker
        components["safety_checker"] = None
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"

        # Test that tiled decode at 512x512 yields the same result as the non-tiled decode
        generator = torch.Generator(device=device).manual_seed(0)
        output_1 = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")

        # make sure tiled vae decode yields the same result
        sd_pipe.enable_vae_tiling()
        generator = torch.Generator(device=device).manual_seed(0)
        output_2 = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")

        assert np.abs(output_2.images.flatten() - output_1.images.flatten()).max() < 5e-1

        # test that tiled decode works with various shapes
        shapes = [(1, 4, 73, 97), (1, 4, 97, 73), (1, 4, 49, 65), (1, 4, 65, 49)]
        for shape in shapes:
            zeros = torch.zeros(shape).to(device)
            sd_pipe.vae.decode(zeros)

    def test_stable_diffusion_negative_prompt(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        components["scheduler"] = PNDMScheduler(skip_prk_steps=True)
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        negative_prompt = "french fries"
        output = sd_pipe(**inputs, negative_prompt=negative_prompt)

        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array(
            [
                0.5108221173286438,
                0.5688379406929016,
                0.4685141146183014,
                0.5098261833190918,
                0.5657756328582764,
                0.4631010890007019,
                0.5226285457611084,
                0.49129390716552734,
                0.4899061322212219,
            ]
        )

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_long_prompt(self):
        components = self.get_dummy_components()
        components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config)
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        do_classifier_free_guidance = True
        negative_prompt = None
        num_images_per_prompt = 1
        logger = logging.get_logger("diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion")

        prompt = 25 * "@"
        with CaptureLogger(logger) as cap_logger_3:
            text_embeddings_3 = sd_pipe._encode_prompt(
                prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
            )

        prompt = 100 * "@"
        with CaptureLogger(logger) as cap_logger:
            text_embeddings = sd_pipe._encode_prompt(
                prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
            )

        negative_prompt = "Hello"
        with CaptureLogger(logger) as cap_logger_2:
            text_embeddings_2 = sd_pipe._encode_prompt(
                prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
            )

        assert text_embeddings_3.shape == text_embeddings_2.shape == text_embeddings.shape
        assert text_embeddings.shape[1] == 77

        assert cap_logger.out == cap_logger_2.out
        # 100 - 77 + 1 (BOS token) + 1 (EOS token) = 25
        assert cap_logger.out.count("@") == 25
        assert cap_logger_3.out == ""

    def test_stable_diffusion_height_width_opt(self):
        components = self.get_dummy_components()
        components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config)
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "hey"

        output = sd_pipe(prompt, num_inference_steps=1, output_type="np")
        image_shape = output.images[0].shape[:2]
        assert image_shape == (64, 64)

        output = sd_pipe(prompt, num_inference_steps=1, height=96, width=96, output_type="np")
        image_shape = output.images[0].shape[:2]
        assert image_shape == (96, 96)

        config = dict(sd_pipe.unet.config)
        config["sample_size"] = 96
        sd_pipe.unet = UNet2DConditionModel.from_config(config).to(torch_device)
        output = sd_pipe(prompt, num_inference_steps=1, output_type="np")
        image_shape = output.images[0].shape[:2]
        assert image_shape == (192, 192)


@slow
@require_torch_gpu
class StableDiffusionPipelineSlowTests(unittest.TestCase):
    def tearDown(self):
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
        generator = torch.Generator(device=generator_device).manual_seed(seed)
        latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64))
        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
        inputs = {
            "prompt": "a photograph of an astronaut riding a horse",
            "latents": latents,
            "generator": generator,
            "num_inference_steps": 3,
            "guidance_scale": 7.5,
            "output_type": "numpy",
        }
        return inputs

    def test_stable_diffusion_1_1_pndm(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-1")
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.43625, 0.43554, 0.36670, 0.40660, 0.39703, 0.38658, 0.43936, 0.43557, 0.40592])
        assert np.abs(image_slice - expected_slice).max() < 1e-4

    def test_stable_diffusion_1_4_pndm(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.57400, 0.47841, 0.31625, 0.63583, 0.58306, 0.55056, 0.50825, 0.56306, 0.55748])
        assert np.abs(image_slice - expected_slice).max() < 1e-4

    def test_stable_diffusion_ddim(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
        sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.38019, 0.28647, 0.27321, 0.40377, 0.38290, 0.35446, 0.39218, 0.38165, 0.42239])
        assert np.abs(image_slice - expected_slice).max() < 1e-4

    def test_stable_diffusion_lms(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
        sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.10542, 0.09620, 0.07332, 0.09015, 0.09382, 0.07597, 0.08496, 0.07806, 0.06455])
        assert np.abs(image_slice - expected_slice).max() < 1e-4

    def test_stable_diffusion_dpm(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
        sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.03503, 0.03494, 0.01087, 0.03128, 0.02552, 0.00803, 0.00742, 0.00372, 0.00000])
        assert np.abs(image_slice - expected_slice).max() < 1e-4

    def test_stable_diffusion_attention_slicing(self):
        torch.cuda.reset_peak_memory_stats()
        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        # enable attention slicing
        pipe.enable_attention_slicing()
        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        image_sliced = pipe(**inputs).images

        mem_bytes = torch.cuda.max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()
        # make sure that less than 3.75 GB is allocated
        assert mem_bytes < 3.75 * 10**9

        # disable slicing
        pipe.disable_attention_slicing()
        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        image = pipe(**inputs).images

        # make sure that more than 3.75 GB is allocated
        mem_bytes = torch.cuda.max_memory_allocated()
        assert mem_bytes > 3.75 * 10**9
        assert np.abs(image_sliced - image).max() < 1e-3

    def test_stable_diffusion_vae_slicing(self):
        torch.cuda.reset_peak_memory_stats()
        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        # enable vae slicing
        pipe.enable_vae_slicing()
        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        inputs["prompt"] = [inputs["prompt"]] * 4
        inputs["latents"] = torch.cat([inputs["latents"]] * 4)
        image_sliced = pipe(**inputs).images

        mem_bytes = torch.cuda.max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()
        # make sure that less than 4 GB is allocated
        assert mem_bytes < 4e9

        # disable vae slicing
        pipe.disable_vae_slicing()
        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        inputs["prompt"] = [inputs["prompt"]] * 4
        inputs["latents"] = torch.cat([inputs["latents"]] * 4)
        image = pipe(**inputs).images

        # make sure that more than 4 GB is allocated
        mem_bytes = torch.cuda.max_memory_allocated()
        assert mem_bytes > 4e9
        # There is a small discrepancy at the image borders vs. a fully batched version.
        assert np.abs(image_sliced - image).max() < 1e-2

    def test_stable_diffusion_vae_tiling(self):
        torch.cuda.reset_peak_memory_stats()
        model_id = "CompVis/stable-diffusion-v1-4"
        pipe = StableDiffusionPipeline.from_pretrained(model_id, revision="fp16", torch_dtype=torch.float16)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()
        pipe.unet = pipe.unet.to(memory_format=torch.channels_last)
        pipe.vae = pipe.vae.to(memory_format=torch.channels_last)

        prompt = "a photograph of an astronaut riding a horse"

        # enable vae tiling
        pipe.enable_vae_tiling()
        pipe.enable_model_cpu_offload()
        generator = torch.Generator(device="cpu").manual_seed(0)
        output_chunked = pipe(
            [prompt],
            width=1024,
            height=1024,
            generator=generator,
            guidance_scale=7.5,
            num_inference_steps=2,
            output_type="numpy",
        )
        image_chunked = output_chunked.images

        mem_bytes = torch.cuda.max_memory_allocated()

        # disable vae tiling
        pipe.disable_vae_tiling()
        generator = torch.Generator(device="cpu").manual_seed(0)
        output = pipe(
            [prompt],
            width=1024,
            height=1024,
            generator=generator,
            guidance_scale=7.5,
            num_inference_steps=2,
            output_type="numpy",
        )
        image = output.images

        assert mem_bytes < 1e10
        assert np.abs(image_chunked.flatten() - image.flatten()).max() < 1e-2

    def test_stable_diffusion_fp16_vs_autocast(self):
        # this test makes sure that the original model with autocast
        # and the new model with fp16 yield the same result
        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        image_fp16 = pipe(**inputs).images

        with torch.autocast(torch_device):
            inputs = self.get_inputs(torch_device)
            image_autocast = pipe(**inputs).images

        # Make sure results are close enough
        diff = np.abs(image_fp16.flatten() - image_autocast.flatten())
        # They ARE different since ops are not run always at the same precision
        # however, they should be extremely close.
        assert diff.mean() < 2e-2

    def test_stable_diffusion_intermediate_state(self):
        number_of_steps = 0

        def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None:
            callback_fn.has_been_called = True
            nonlocal number_of_steps
            number_of_steps += 1
            if step == 1:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 64)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array(
                    [-0.5693, -0.3018, -0.9746, 0.0518, -0.8770, 0.7559, -1.7402, 0.1022, 1.1582]
                )

                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2
            elif step == 2:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 64)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array(
                    [-0.1958, -0.2993, -1.0166, -0.5005, -0.4810, 0.6162, -0.9492, 0.6621, 1.4492]
                )

                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2

        callback_fn.has_been_called = False

        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        pipe(**inputs, callback=callback_fn, callback_steps=1)
        assert callback_fn.has_been_called
        assert number_of_steps == inputs["num_inference_steps"]

    def test_stable_diffusion_low_cpu_mem_usage(self):
        pipeline_id = "CompVis/stable-diffusion-v1-4"

        start_time = time.time()
        pipeline_low_cpu_mem_usage = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16)
        pipeline_low_cpu_mem_usage.to(torch_device)
        low_cpu_mem_usage_time = time.time() - start_time

        start_time = time.time()
        _ = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16, low_cpu_mem_usage=False)
        normal_load_time = time.time() - start_time

        assert 2 * low_cpu_mem_usage_time < normal_load_time

    def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing(1)
        pipe.enable_sequential_cpu_offload()

        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        _ = pipe(**inputs)

        mem_bytes = torch.cuda.max_memory_allocated()
        # make sure that less than 2.8 GB is allocated
        assert mem_bytes < 2.8 * 10**9

    def test_stable_diffusion_pipeline_with_model_offloading(self):
        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        inputs = self.get_inputs(torch_device, dtype=torch.float16)

        # Normal inference

        pipe = StableDiffusionPipeline.from_pretrained(
            "CompVis/stable-diffusion-v1-4",
            torch_dtype=torch.float16,
        )
        pipe.unet.set_attn_processor(AttnProcessor())
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        outputs = pipe(**inputs)
        mem_bytes = torch.cuda.max_memory_allocated()

        # With model offloading

        # Reload but don't move to cuda
        pipe = StableDiffusionPipeline.from_pretrained(
            "CompVis/stable-diffusion-v1-4",
            torch_dtype=torch.float16,
        )
        pipe.unet.set_attn_processor(AttnProcessor())

        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        pipe.enable_model_cpu_offload()
        pipe.set_progress_bar_config(disable=None)
        inputs = self.get_inputs(torch_device, dtype=torch.float16)

        outputs_offloaded = pipe(**inputs)
        mem_bytes_offloaded = torch.cuda.max_memory_allocated()

        assert np.abs(outputs.images - outputs_offloaded.images).max() < 1e-3
        assert mem_bytes_offloaded < mem_bytes
        assert mem_bytes_offloaded < 3.5 * 10**9
        for module in pipe.text_encoder, pipe.unet, pipe.vae, pipe.safety_checker:
            assert module.device == torch.device("cpu")

        # With attention slicing
        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        pipe.enable_attention_slicing()
        _ = pipe(**inputs)
        mem_bytes_slicing = torch.cuda.max_memory_allocated()

        assert mem_bytes_slicing < mem_bytes_offloaded
        assert mem_bytes_slicing < 3 * 10**9


@nightly
@require_torch_gpu
class StableDiffusionPipelineNightlyTests(unittest.TestCase):
    def tearDown(self):
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
        generator = torch.Generator(device=generator_device).manual_seed(seed)
        latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64))
        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
        inputs = {
            "prompt": "a photograph of an astronaut riding a horse",
            "latents": latents,
            "generator": generator,
            "num_inference_steps": 50,
            "guidance_scale": 7.5,
            "output_type": "numpy",
        }
        return inputs

    def test_stable_diffusion_1_4_pndm(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_text2img/stable_diffusion_1_4_pndm.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3

    def test_stable_diffusion_1_5_pndm(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5").to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_text2img/stable_diffusion_1_5_pndm.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3

    def test_stable_diffusion_ddim(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device)
        sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_text2img/stable_diffusion_1_4_ddim.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3

    def test_stable_diffusion_lms(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device)
        sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_text2img/stable_diffusion_1_4_lms.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3

    def test_stable_diffusion_euler(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device)
        sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_text2img/stable_diffusion_1_4_euler.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3

    def test_stable_diffusion_dpm(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device)
        sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        inputs["num_inference_steps"] = 25
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_text2img/stable_diffusion_1_4_dpm_multi.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3