CoCalc -- test_stable_diffusion_pix2pix

GitHub Repository: shivamshrirao/diffusers
Path: blob/main/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py
1451 views
1
# coding=utf-8
2
# Copyright 2023 HuggingFace Inc.
3
#
4
# Licensed under the Apache License, Version 2.0 (the "License");
5
# you may not use this file except in compliance with the License.
6
# You may obtain a copy of the License at
7
#
8
#     http://www.apache.org/licenses/LICENSE-2.0
9
#
10
# Unless required by applicable law or agreed to in writing, software
11
# distributed under the License is distributed on an "AS IS" BASIS,
12
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
# See the License for the specific language governing permissions and
14
# limitations under the License.
15

16
import gc
17
import unittest
18

19
import numpy as np
20
import torch
21
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
22

23
from diffusers import (
24
    AutoencoderKL,
25
    DDIMInverseScheduler,
26
    DDIMScheduler,
27
    DDPMScheduler,
28
    EulerAncestralDiscreteScheduler,
29
    LMSDiscreteScheduler,
30
    StableDiffusionPix2PixZeroPipeline,
31
    UNet2DConditionModel,
32
)
33
from diffusers.utils import load_numpy, slow, torch_device
34
from diffusers.utils.testing_utils import load_image, load_pt, require_torch_gpu, skip_mps
35

36
from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
37
from ...test_pipelines_common import PipelineTesterMixin
38

39

40
# Turn off TF32 for CUDA matmuls so they run in full float32; presumably this keeps
# the hard-coded expected slices in this file reproducible across GPUs — TODO confirm.
torch.backends.cuda.matmul.allow_tf32 = False
41

42

43
@skip_mps
class StableDiffusionPix2PixZeroPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    """Fast CPU tests that run `StableDiffusionPix2PixZeroPipeline` on tiny, seeded dummy components."""

    pipeline_class = StableDiffusionPix2PixZeroPipeline
    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS
    batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS

    @classmethod
    def setUpClass(cls):
        """Load the source and target embeddings once and share them across all tests of the class."""
        cls.source_embeds = load_pt(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/src_emb_0.pt"
        )
        cls.target_embeds = load_pt(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/tgt_emb_0.pt"
        )

    def get_dummy_components(self):
        """Return the keyword arguments needed to construct the pipeline from small dummy models.

        The global torch seed is reset to 0 right before the UNet, the VAE and the text
        encoder are created. Every optional component is left as `None`.
        """
        torch.manual_seed(0)
        dummy_unet = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
        )
        ddim = DDIMScheduler()

        torch.manual_seed(0)
        dummy_vae = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
        )

        torch.manual_seed(0)
        clip_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
        )
        dummy_text_encoder = CLIPTextModel(clip_config)
        clip_tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        return {
            "unet": dummy_unet,
            "scheduler": ddim,
            "vae": dummy_vae,
            "text_encoder": dummy_text_encoder,
            "tokenizer": clip_tokenizer,
            "safety_checker": None,
            "feature_extractor": None,
            "inverse_scheduler": None,
            "caption_generator": None,
            "caption_processor": None,
        }

    def get_dummy_inputs(self, device, seed=0):
        """Return the pipeline call arguments; `device` is accepted but not used by this method."""
        return {
            "prompt": "A painting of a squirrel eating a burger",
            "generator": torch.manual_seed(seed),
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "cross_attention_guidance_amount": 0.15,
            "source_embeds": self.source_embeds,
            "target_embeds": self.target_embeds,
            "output_type": "numpy",
        }

    def _run_dummy_pipeline(self, components, **call_kwargs):
        """Build the pipeline from `components` on CPU, call it with the dummy inputs and return the images.

        Any `call_kwargs` are forwarded to the pipeline call in addition to the dummy inputs.
        """
        cpu = "cpu"  # ensure determinism for the device-dependent torch.Generator
        pipeline = StableDiffusionPix2PixZeroPipeline(**components).to(cpu)
        pipeline.set_progress_bar_config(disable=None)

        call_inputs = self.get_dummy_inputs(cpu)
        return pipeline(**call_inputs, **call_kwargs).images

    def _check_image(self, image, expected_values):
        """Assert the image shape and compare the 3x3 corner slice of the last channel with `expected_values`."""
        corner = image[0, -3:, -3:, -1]
        assert image.shape == (1, 64, 64, 3)

        reference = np.array(expected_values)
        assert np.abs(corner.flatten() - reference).max() < 1e-3

    def test_stable_diffusion_pix2pix_zero_default_case(self):
        # Default DDIM scheduler from the dummy components.
        images = self._run_dummy_pipeline(self.get_dummy_components())
        self._check_image(images, [0.5184, 0.503, 0.4917, 0.4022, 0.3455, 0.464, 0.5324, 0.5323, 0.4894])

    def test_stable_diffusion_pix2pix_zero_negative_prompt(self):
        # Same setup as the default case, with an additional negative prompt.
        images = self._run_dummy_pipeline(self.get_dummy_components(), negative_prompt="french fries")
        self._check_image(images, [0.5464, 0.5072, 0.5012, 0.4124, 0.3624, 0.466, 0.5413, 0.5468, 0.4927])

    def test_stable_diffusion_pix2pix_zero_euler(self):
        # Swap the scheduler for Euler-ancestral before building the pipeline.
        parts = self.get_dummy_components()
        parts["scheduler"] = EulerAncestralDiscreteScheduler(
            beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"
        )
        images = self._run_dummy_pipeline(parts)
        self._check_image(images, [0.5114, 0.5051, 0.5222, 0.5279, 0.5037, 0.5156, 0.4604, 0.4966, 0.504])

    def test_stable_diffusion_pix2pix_zero_ddpm(self):
        # Swap the scheduler for DDPM before building the pipeline.
        parts = self.get_dummy_components()
        parts["scheduler"] = DDPMScheduler()
        images = self._run_dummy_pipeline(parts)
        self._check_image(images, [0.5185, 0.5027, 0.492, 0.401, 0.3445, 0.464, 0.5321, 0.5327, 0.4892])

    # Non-determinism caused by the scheduler optimizing the latent inputs during inference
    @unittest.skip("non-deterministic pipeline")
    def test_inference_batch_single_identical(self):
        return super().test_inference_batch_single_identical()
198

199

200
@slow
@require_torch_gpu
class StableDiffusionPix2PixZeroPipelineSlowTests(unittest.TestCase):
    """Slow GPU tests that run the pipeline with the CompVis/stable-diffusion-v1-4 checkpoint in float16."""

    def tearDown(self):
        """Release Python garbage and cached CUDA memory after every test."""
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    @classmethod
    def setUpClass(cls):
        """Load the source ("cat") and target ("dog") embeddings once for the whole class."""
        # Chain to the base class, mirroring what tearDown already does.
        super().setUpClass()

        cls.source_embeds = load_pt(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat.pt"
        )

        cls.target_embeds = load_pt(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/dog.pt"
        )

    def get_inputs(self, seed=0):
        """Return the pipeline call arguments, with a generator seeded by `seed`."""
        generator = torch.manual_seed(seed)

        inputs = {
            "prompt": "turn him into a cyborg",
            "generator": generator,
            "num_inference_steps": 3,
            "guidance_scale": 7.5,
            "cross_attention_guidance_amount": 0.15,
            "source_embeds": self.source_embeds,
            "target_embeds": self.target_embeds,
            "output_type": "numpy",
        }
        return inputs

    def _load_pipeline(self, scheduler_cls):
        """Load the fp16 checkpoint, swap in `scheduler_cls` and move the pipeline to `torch_device`.

        Attention slicing is left to the caller because the tests enable it with different arguments.
        """
        pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained(
            "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16
        )
        pipe.scheduler = scheduler_cls.from_config(pipe.scheduler.config)
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        return pipe

    def _check_image(self, image, expected_values):
        """Assert the image shape and compare the 3x3 corner slice of the last channel with `expected_values`."""
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array(expected_values)

        assert np.abs(expected_slice - image_slice).max() < 5e-2

    def test_stable_diffusion_pix2pix_zero_default(self):
        pipe = self._load_pipeline(DDIMScheduler)
        pipe.enable_attention_slicing()

        image = pipe(**self.get_inputs()).images
        self._check_image(image, [0.5742, 0.5757, 0.5747, 0.5781, 0.5688, 0.5713, 0.5742, 0.5664, 0.5747])

    def test_stable_diffusion_pix2pix_zero_k_lms(self):
        pipe = self._load_pipeline(LMSDiscreteScheduler)
        pipe.enable_attention_slicing()

        image = pipe(**self.get_inputs()).images
        self._check_image(image, [0.6367, 0.5459, 0.5146, 0.5479, 0.4905, 0.4753, 0.4961, 0.4629, 0.4624])

    def test_stable_diffusion_pix2pix_zero_intermediate_state(self):
        # Expected latent corner slices, keyed by the callback step at which they are checked.
        expected_latents = {
            1: np.array([0.1345, 0.268, 0.1539, 0.0726, 0.0959, 0.2261, -0.2673, 0.0277, -0.2062]),
            2: np.array([0.1393, 0.2637, 0.1617, 0.0724, 0.0987, 0.2271, -0.2666, 0.0299, -0.2104]),
        }
        # Every callback invocation is recorded so the total number of calls can be checked afterwards.
        steps_seen = []

        def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None:
            steps_seen.append(step)
            expected_slice = expected_latents.get(step)
            if expected_slice is None:
                # No reference values for this step; only the call count matters.
                return

            latents = latents.detach().cpu().numpy()
            assert latents.shape == (1, 4, 64, 64)
            latents_slice = latents[0, -3:, -3:, -1]

            assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2

        pipe = self._load_pipeline(DDIMScheduler)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs()
        pipe(**inputs, callback=callback_fn, callback_steps=1)

        # With callback_steps=1 the callback must have run once per inference step.
        assert steps_seen
        assert len(steps_seen) == inputs["num_inference_steps"]

    def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
        torch.cuda.empty_cache()
        # reset_peak_memory_stats() is sufficient on its own: the deprecated
        # reset_max_memory_allocated() merely forwards to it and emits a FutureWarning.
        torch.cuda.reset_peak_memory_stats()

        pipe = self._load_pipeline(DDIMScheduler)
        pipe.enable_attention_slicing(1)
        pipe.enable_sequential_cpu_offload()

        inputs = self.get_inputs()
        _ = pipe(**inputs)

        mem_bytes = torch.cuda.max_memory_allocated()
        # make sure that less than 8.2 GB is allocated
        assert mem_bytes < 8.2 * 10**9
326

327

328
@slow
@require_torch_gpu
class InversionPipelineSlowTests(unittest.TestCase):
    """Slow GPU tests for `invert()` and for inversion followed by a cat-to-dog edit."""

    # Caption used both for the inversion and as prompt / negative prompt of the edit.
    _CAPTION = "a photography of a cat with flowers"
    # Prompt lists passed to `get_embeds` to obtain the source and target embeddings.
    _SOURCE_PROMPTS = 4 * ["a cat sitting on the street", "a cat playing in the field", "a face of a cat"]
    _TARGET_PROMPTS = 4 * ["a dog sitting on the street", "a dog playing in the field", "a face of a dog"]

    def tearDown(self):
        """Release Python garbage and cached CUDA memory after every test."""
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    @classmethod
    def setUpClass(cls):
        """Load the test image once and store it as a 512x512 RGB image on the class."""
        # Chain to the base class, mirroring what tearDown already does.
        super().setUpClass()

        raw_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat_6.png"
        )

        cls.raw_image = raw_image.convert("RGB").resize((512, 512))

    def _load_pipeline(self, model_id):
        """Load `model_id` in float16 with DDIM forward/inverse schedulers and model CPU offload enabled."""
        pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained(
            model_id, safety_checker=None, torch_dtype=torch.float16
        )
        # Both schedulers are derived from the scheduler config in place at that moment;
        # the inverse scheduler is created first, as in the original tests.
        pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config)
        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
        pipe.enable_model_cpu_offload()
        pipe.set_progress_bar_config(disable=None)
        return pipe

    def _invert(self, pipe, **invert_kwargs):
        """Invert the class image with a generator seeded at 0.

        Returns:
            A `(inv_latents, generator)` tuple; the generator is returned so that a
            subsequent pipeline call can keep consuming the same random stream.
        """
        generator = torch.manual_seed(0)
        output = pipe.invert(self._CAPTION, image=self.raw_image, generator=generator, **invert_kwargs)
        return output[0], generator

    def _check_inversion(self, model_id, expected_values):
        """Run a 10-step inversion with `model_id` and compare a corner slice of the latents."""
        pipe = self._load_pipeline(model_id)
        inv_latents, _ = self._invert(pipe, num_inference_steps=10)

        image_slice = inv_latents[0, -3:, -3:, -1].flatten()

        assert inv_latents.shape == (1, 4, 64, 64)
        expected_slice = np.array(expected_values)

        assert np.abs(expected_slice - image_slice.cpu().numpy()).max() < 5e-2

    def _run_full_edit(self, model_id, num_inference_steps, cross_attention_guidance_amount):
        """Invert the class image with `model_id`, then edit it from the cat to the dog prompts.

        Returns:
            The images produced by the pipeline call with `output_type="np"`.
        """
        pipe = self._load_pipeline(model_id)
        inv_latents, generator = self._invert(pipe)

        source_embeds = pipe.get_embeds(self._SOURCE_PROMPTS)
        target_embeds = pipe.get_embeds(self._TARGET_PROMPTS)

        return pipe(
            self._CAPTION,
            source_embeds=source_embeds,
            target_embeds=target_embeds,
            num_inference_steps=num_inference_steps,
            cross_attention_guidance_amount=cross_attention_guidance_amount,
            generator=generator,
            latents=inv_latents,
            negative_prompt=self._CAPTION,
            output_type="np",
        ).images

    def test_stable_diffusion_pix2pix_inversion(self):
        self._check_inversion(
            "CompVis/stable-diffusion-v1-4",
            [0.8447, -0.0730, 0.7588, -1.2070, -0.4678, 0.1511, -0.8555, 1.1816, -0.7666],
        )

    def test_stable_diffusion_2_pix2pix_inversion(self):
        self._check_inversion(
            "stabilityai/stable-diffusion-2-1",
            [0.8970, -0.1611, 0.4766, -1.1162, -0.5923, 0.1050, -0.9678, 1.0537, -0.6050],
        )

    def test_stable_diffusion_pix2pix_full(self):
        # numpy array of https://huggingface.co/datasets/hf-internal-testing/diffusers-images/blob/main/pix2pix/dog.png
        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/dog.npy"
        )

        image = self._run_full_edit(
            "CompVis/stable-diffusion-v1-4", num_inference_steps=50, cross_attention_guidance_amount=0.15
        )

        # The compared quantity is the mean absolute difference, not the maximum.
        mean_diff = np.abs(expected_image - image).mean()
        assert mean_diff < 0.05

    def test_stable_diffusion_2_pix2pix_full(self):
        # numpy array of https://huggingface.co/datasets/hf-internal-testing/diffusers-images/blob/main/pix2pix/dog_2.png
        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/dog_2.npy"
        )

        image = self._run_full_edit(
            "stabilityai/stable-diffusion-2-1", num_inference_steps=125, cross_attention_guidance_amount=0.015
        )

        mean_diff = np.abs(expected_image - image).mean()
        assert mean_diff < 0.25
471

472
Product

Resources

Company