CoCalc -- test_semantic

GitHub Repository: shivamshrirao/diffusers
Path: blob/main/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py
¹⁴⁵⁰ views
1
# coding=utf-8
2
# Copyright 2023 HuggingFace Inc.
3
#
4
# Licensed under the Apache License, Version 2.0 (the "License");
5
# you may not use this file except in compliance with the License.
6
# You may obtain a copy of the License at
7
#
8
#     http://www.apache.org/licenses/LICENSE-2.0
9
#
10
# Unless required by applicable law or agreed to in writing, software
11
# distributed under the License is distributed on an "AS IS" BASIS,
12
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
# See the License for the specific language governing permissions and
14
# limitations under the License.
15

16
import gc
17
import random
18
import tempfile
19
import unittest
20

21
import numpy as np
22
import torch
23
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
24

25
from diffusers import AutoencoderKL, DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel
26
from diffusers.pipelines.semantic_stable_diffusion import SemanticStableDiffusionPipeline as StableDiffusionPipeline
27
from diffusers.utils import floats_tensor, nightly, torch_device
28
from diffusers.utils.testing_utils import require_torch_gpu
29

30

31
torch.backends.cuda.matmul.allow_tf32 = False
32

33

34
class SafeDiffusionPipelineFastTests(unittest.TestCase):
35
    def tearDown(self):
36
        # clean up the VRAM after each test
37
        super().tearDown()
38
        gc.collect()
39
        torch.cuda.empty_cache()
40

41
    @property
42
    def dummy_image(self):
43
        batch_size = 1
44
        num_channels = 3
45
        sizes = (32, 32)
46

47
        image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
48
        return image
49

50
    @property
51
    def dummy_cond_unet(self):
52
        torch.manual_seed(0)
53
        model = UNet2DConditionModel(
54
            block_out_channels=(32, 64),
55
            layers_per_block=2,
56
            sample_size=32,
57
            in_channels=4,
58
            out_channels=4,
59
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
60
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
61
            cross_attention_dim=32,
62
        )
63
        return model
64

65
    @property
66
    def dummy_vae(self):
67
        torch.manual_seed(0)
68
        model = AutoencoderKL(
69
            block_out_channels=[32, 64],
70
            in_channels=3,
71
            out_channels=3,
72
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
73
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
74
            latent_channels=4,
75
        )
76
        return model
77

78
    @property
79
    def dummy_text_encoder(self):
80
        torch.manual_seed(0)
81
        config = CLIPTextConfig(
82
            bos_token_id=0,
83
            eos_token_id=2,
84
            hidden_size=32,
85
            intermediate_size=37,
86
            layer_norm_eps=1e-05,
87
            num_attention_heads=4,
88
            num_hidden_layers=5,
89
            pad_token_id=1,
90
            vocab_size=1000,
91
        )
92
        return CLIPTextModel(config)
93

94
    @property
95
    def dummy_extractor(self):
96
        def extract(*args, **kwargs):
97
            class Out:
98
                def __init__(self):
99
                    self.pixel_values = torch.ones([0])
100

101
                def to(self, device):
102
                    self.pixel_values.to(device)
103
                    return self
104

105
            return Out()
106

107
        return extract
108

109
    def test_semantic_diffusion_ddim(self):
110
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
111
        unet = self.dummy_cond_unet
112
        scheduler = DDIMScheduler(
113
            beta_start=0.00085,
114
            beta_end=0.012,
115
            beta_schedule="scaled_linear",
116
            clip_sample=False,
117
            set_alpha_to_one=False,
118
        )
119

120
        vae = self.dummy_vae
121
        bert = self.dummy_text_encoder
122
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
123

124
        # make sure here that pndm scheduler skips prk
125
        sd_pipe = StableDiffusionPipeline(
126
            unet=unet,
127
            scheduler=scheduler,
128
            vae=vae,
129
            text_encoder=bert,
130
            tokenizer=tokenizer,
131
            safety_checker=None,
132
            feature_extractor=self.dummy_extractor,
133
        )
134
        sd_pipe = sd_pipe.to(device)
135
        sd_pipe.set_progress_bar_config(disable=None)
136

137
        prompt = "A painting of a squirrel eating a burger"
138

139
        generator = torch.Generator(device=device).manual_seed(0)
140
        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
141
        image = output.images
142

143
        generator = torch.Generator(device=device).manual_seed(0)
144
        image_from_tuple = sd_pipe(
145
            [prompt],
146
            generator=generator,
147
            guidance_scale=6.0,
148
            num_inference_steps=2,
149
            output_type="np",
150
            return_dict=False,
151
        )[0]
152

153
        image_slice = image[0, -3:, -3:, -1]
154
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
155

156
        assert image.shape == (1, 64, 64, 3)
157
        expected_slice = np.array([0.5644, 0.6018, 0.4799, 0.5267, 0.5585, 0.4641, 0.516, 0.4964, 0.4792])
158

159
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
160
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
161

162
    def test_semantic_diffusion_pndm(self):
163
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
164
        unet = self.dummy_cond_unet
165
        scheduler = PNDMScheduler(skip_prk_steps=True)
166
        vae = self.dummy_vae
167
        bert = self.dummy_text_encoder
168
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
169

170
        # make sure here that pndm scheduler skips prk
171
        sd_pipe = StableDiffusionPipeline(
172
            unet=unet,
173
            scheduler=scheduler,
174
            vae=vae,
175
            text_encoder=bert,
176
            tokenizer=tokenizer,
177
            safety_checker=None,
178
            feature_extractor=self.dummy_extractor,
179
        )
180
        sd_pipe = sd_pipe.to(device)
181
        sd_pipe.set_progress_bar_config(disable=None)
182

183
        prompt = "A painting of a squirrel eating a burger"
184
        generator = torch.Generator(device=device).manual_seed(0)
185
        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
186

187
        image = output.images
188

189
        generator = torch.Generator(device=device).manual_seed(0)
190
        image_from_tuple = sd_pipe(
191
            [prompt],
192
            generator=generator,
193
            guidance_scale=6.0,
194
            num_inference_steps=2,
195
            output_type="np",
196
            return_dict=False,
197
        )[0]
198

199
        image_slice = image[0, -3:, -3:, -1]
200
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
201

202
        assert image.shape == (1, 64, 64, 3)
203
        expected_slice = np.array([0.5095, 0.5674, 0.4668, 0.5126, 0.5697, 0.4675, 0.5278, 0.4964, 0.4945])
204

205
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
206
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
207

208
    def test_semantic_diffusion_no_safety_checker(self):
209
        pipe = StableDiffusionPipeline.from_pretrained(
210
            "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None
211
        )
212
        assert isinstance(pipe, StableDiffusionPipeline)
213
        assert isinstance(pipe.scheduler, LMSDiscreteScheduler)
214
        assert pipe.safety_checker is None
215

216
        image = pipe("example prompt", num_inference_steps=2).images[0]
217
        assert image is not None
218

219
        # check that there's no error when saving a pipeline with one of the models being None
220
        with tempfile.TemporaryDirectory() as tmpdirname:
221
            pipe.save_pretrained(tmpdirname)
222
            pipe = StableDiffusionPipeline.from_pretrained(tmpdirname)
223

224
        # sanity check that the pipeline still works
225
        assert pipe.safety_checker is None
226
        image = pipe("example prompt", num_inference_steps=2).images[0]
227
        assert image is not None
228

229
    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
230
    def test_semantic_diffusion_fp16(self):
231
        """Test that stable diffusion works with fp16"""
232
        unet = self.dummy_cond_unet
233
        scheduler = PNDMScheduler(skip_prk_steps=True)
234
        vae = self.dummy_vae
235
        bert = self.dummy_text_encoder
236
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
237

238
        # put models in fp16
239
        unet = unet.half()
240
        vae = vae.half()
241
        bert = bert.half()
242

243
        # make sure here that pndm scheduler skips prk
244
        sd_pipe = StableDiffusionPipeline(
245
            unet=unet,
246
            scheduler=scheduler,
247
            vae=vae,
248
            text_encoder=bert,
249
            tokenizer=tokenizer,
250
            safety_checker=None,
251
            feature_extractor=self.dummy_extractor,
252
        )
253
        sd_pipe = sd_pipe.to(torch_device)
254
        sd_pipe.set_progress_bar_config(disable=None)
255

256
        prompt = "A painting of a squirrel eating a burger"
257
        image = sd_pipe([prompt], num_inference_steps=2, output_type="np").images
258

259
        assert image.shape == (1, 64, 64, 3)
260

261

262
@nightly
263
@require_torch_gpu
264
class SemanticDiffusionPipelineIntegrationTests(unittest.TestCase):
265
    def tearDown(self):
266
        # clean up the VRAM after each test
267
        super().tearDown()
268
        gc.collect()
269
        torch.cuda.empty_cache()
270

271
    def test_positive_guidance(self):
272
        torch_device = "cuda"
273
        pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
274
        pipe = pipe.to(torch_device)
275
        pipe.set_progress_bar_config(disable=None)
276

277
        prompt = "a photo of a cat"
278
        edit = {
279
            "editing_prompt": ["sunglasses"],
280
            "reverse_editing_direction": [False],
281
            "edit_warmup_steps": 10,
282
            "edit_guidance_scale": 6,
283
            "edit_threshold": 0.95,
284
            "edit_momentum_scale": 0.5,
285
            "edit_mom_beta": 0.6,
286
        }
287

288
        seed = 3
289
        guidance_scale = 7
290

291
        # no sega enabled
292
        generator = torch.Generator(torch_device)
293
        generator.manual_seed(seed)
294
        output = pipe(
295
            [prompt],
296
            generator=generator,
297
            guidance_scale=guidance_scale,
298
            num_inference_steps=50,
299
            output_type="np",
300
            width=512,
301
            height=512,
302
        )
303

304
        image = output.images
305
        image_slice = image[0, -3:, -3:, -1]
306
        expected_slice = [
307
            0.34673113,
308
            0.38492733,
309
            0.37597352,
310
            0.34086335,
311
            0.35650748,
312
            0.35579205,
313
            0.3384763,
314
            0.34340236,
315
            0.3573271,
316
        ]
317

318
        assert image.shape == (1, 512, 512, 3)
319

320
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
321

322
        # with sega enabled
323
        # generator = torch.manual_seed(seed)
324
        generator.manual_seed(seed)
325
        output = pipe(
326
            [prompt],
327
            generator=generator,
328
            guidance_scale=guidance_scale,
329
            num_inference_steps=50,
330
            output_type="np",
331
            width=512,
332
            height=512,
333
            **edit,
334
        )
335

336
        image = output.images
337
        image_slice = image[0, -3:, -3:, -1]
338
        expected_slice = [
339
            0.41887826,
340
            0.37728766,
341
            0.30138272,
342
            0.41416335,
343
            0.41664985,
344
            0.36283392,
345
            0.36191246,
346
            0.43364465,
347
            0.43001732,
348
        ]
349

350
        assert image.shape == (1, 512, 512, 3)
351

352
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
353

354
    def test_negative_guidance(self):
355
        torch_device = "cuda"
356
        pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
357
        pipe = pipe.to(torch_device)
358
        pipe.set_progress_bar_config(disable=None)
359

360
        prompt = "an image of a crowded boulevard, realistic, 4k"
361
        edit = {
362
            "editing_prompt": "crowd, crowded, people",
363
            "reverse_editing_direction": True,
364
            "edit_warmup_steps": 10,
365
            "edit_guidance_scale": 8.3,
366
            "edit_threshold": 0.9,
367
            "edit_momentum_scale": 0.5,
368
            "edit_mom_beta": 0.6,
369
        }
370

371
        seed = 9
372
        guidance_scale = 7
373

374
        # no sega enabled
375
        generator = torch.Generator(torch_device)
376
        generator.manual_seed(seed)
377
        output = pipe(
378
            [prompt],
379
            generator=generator,
380
            guidance_scale=guidance_scale,
381
            num_inference_steps=50,
382
            output_type="np",
383
            width=512,
384
            height=512,
385
        )
386

387
        image = output.images
388
        image_slice = image[0, -3:, -3:, -1]
389
        expected_slice = [
390
            0.43497998,
391
            0.91814065,
392
            0.7540739,
393
            0.55580205,
394
            0.8467265,
395
            0.5389691,
396
            0.62574506,
397
            0.58897763,
398
            0.50926757,
399
        ]
400

401
        assert image.shape == (1, 512, 512, 3)
402

403
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
404

405
        # with sega enabled
406
        # generator = torch.manual_seed(seed)
407
        generator.manual_seed(seed)
408
        output = pipe(
409
            [prompt],
410
            generator=generator,
411
            guidance_scale=guidance_scale,
412
            num_inference_steps=50,
413
            output_type="np",
414
            width=512,
415
            height=512,
416
            **edit,
417
        )
418

419
        image = output.images
420
        image_slice = image[0, -3:, -3:, -1]
421
        expected_slice = [
422
            0.3089719,
423
            0.30500144,
424
            0.29016042,
425
            0.30630964,
426
            0.325687,
427
            0.29419225,
428
            0.2908091,
429
            0.28723598,
430
            0.27696294,
431
        ]
432

433
        assert image.shape == (1, 512, 512, 3)
434

435
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
436

437
    def test_multi_cond_guidance(self):
438
        torch_device = "cuda"
439
        pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
440
        pipe = pipe.to(torch_device)
441
        pipe.set_progress_bar_config(disable=None)
442

443
        prompt = "a castle next to a river"
444
        edit = {
445
            "editing_prompt": ["boat on a river, boat", "monet, impression, sunrise"],
446
            "reverse_editing_direction": False,
447
            "edit_warmup_steps": [15, 18],
448
            "edit_guidance_scale": 6,
449
            "edit_threshold": [0.9, 0.8],
450
            "edit_momentum_scale": 0.5,
451
            "edit_mom_beta": 0.6,
452
        }
453

454
        seed = 48
455
        guidance_scale = 7
456

457
        # no sega enabled
458
        generator = torch.Generator(torch_device)
459
        generator.manual_seed(seed)
460
        output = pipe(
461
            [prompt],
462
            generator=generator,
463
            guidance_scale=guidance_scale,
464
            num_inference_steps=50,
465
            output_type="np",
466
            width=512,
467
            height=512,
468
        )
469

470
        image = output.images
471
        image_slice = image[0, -3:, -3:, -1]
472
        expected_slice = [
473
            0.75163555,
474
            0.76037145,
475
            0.61785,
476
            0.9189673,
477
            0.8627701,
478
            0.85189694,
479
            0.8512813,
480
            0.87012076,
481
            0.8312857,
482
        ]
483

484
        assert image.shape == (1, 512, 512, 3)
485

486
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
487

488
        # with sega enabled
489
        # generator = torch.manual_seed(seed)
490
        generator.manual_seed(seed)
491
        output = pipe(
492
            [prompt],
493
            generator=generator,
494
            guidance_scale=guidance_scale,
495
            num_inference_steps=50,
496
            output_type="np",
497
            width=512,
498
            height=512,
499
            **edit,
500
        )
501

502
        image = output.images
503
        image_slice = image[0, -3:, -3:, -1]
504
        expected_slice = [
505
            0.73553365,
506
            0.7537271,
507
            0.74341905,
508
            0.66480356,
509
            0.6472925,
510
            0.63039416,
511
            0.64812905,
512
            0.6749717,
513
            0.6517102,
514
        ]
515

516
        assert image.shape == (1, 512, 512, 3)
517

518
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
519

520
    def test_guidance_fp16(self):
521
        torch_device = "cuda"
522
        pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
523
        pipe = pipe.to(torch_device)
524
        pipe.set_progress_bar_config(disable=None)
525

526
        prompt = "a photo of a cat"
527
        edit = {
528
            "editing_prompt": ["sunglasses"],
529
            "reverse_editing_direction": [False],
530
            "edit_warmup_steps": 10,
531
            "edit_guidance_scale": 6,
532
            "edit_threshold": 0.95,
533
            "edit_momentum_scale": 0.5,
534
            "edit_mom_beta": 0.6,
535
        }
536

537
        seed = 3
538
        guidance_scale = 7
539

540
        # no sega enabled
541
        generator = torch.Generator(torch_device)
542
        generator.manual_seed(seed)
543
        output = pipe(
544
            [prompt],
545
            generator=generator,
546
            guidance_scale=guidance_scale,
547
            num_inference_steps=50,
548
            output_type="np",
549
            width=512,
550
            height=512,
551
        )
552

553
        image = output.images
554
        image_slice = image[0, -3:, -3:, -1]
555
        expected_slice = [
556
            0.34887695,
557
            0.3876953,
558
            0.375,
559
            0.34423828,
560
            0.3581543,
561
            0.35717773,
562
            0.3383789,
563
            0.34570312,
564
            0.359375,
565
        ]
566

567
        assert image.shape == (1, 512, 512, 3)
568

569
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
570

571
        # with sega enabled
572
        # generator = torch.manual_seed(seed)
573
        generator.manual_seed(seed)
574
        output = pipe(
575
            [prompt],
576
            generator=generator,
577
            guidance_scale=guidance_scale,
578
            num_inference_steps=50,
579
            output_type="np",
580
            width=512,
581
            height=512,
582
            **edit,
583
        )
584

585
        image = output.images
586
        image_slice = image[0, -3:, -3:, -1]
587
        expected_slice = [
588
            0.42285156,
589
            0.36914062,
590
            0.29077148,
591
            0.42041016,
592
            0.41918945,
593
            0.35498047,
594
            0.3618164,
595
            0.4423828,
596
            0.43115234,
597
        ]
598

599
        assert image.shape == (1, 512, 512, 3)
600

601
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
602

603
Product

Resources

Company