CoCalc -- test_stable_diffusion_instruction

GitHub Repository: shivamshrirao/diffusers
Path: blob/main/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py
¹⁴⁴⁸ views
1
# coding=utf-8
2
# Copyright 2023 HuggingFace Inc.
3
#
4
# Licensed under the Apache License, Version 2.0 (the "License");
5
# you may not use this file except in compliance with the License.
6
# You may obtain a copy of the License at
7
#
8
#     http://www.apache.org/licenses/LICENSE-2.0
9
#
10
# Unless required by applicable law or agreed to in writing, software
11
# distributed under the License is distributed on an "AS IS" BASIS,
12
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
# See the License for the specific language governing permissions and
14
# limitations under the License.
15

16
import gc
17
import random
18
import unittest
19

20
import numpy as np
21
import torch
22
from PIL import Image
23
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
24

25
from diffusers import (
26
    AutoencoderKL,
27
    DDIMScheduler,
28
    EulerAncestralDiscreteScheduler,
29
    LMSDiscreteScheduler,
30
    PNDMScheduler,
31
    StableDiffusionInstructPix2PixPipeline,
32
    UNet2DConditionModel,
33
)
34
from diffusers.utils import floats_tensor, load_image, slow, torch_device
35
from diffusers.utils.testing_utils import require_torch_gpu
36

37
from ...pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
38
from ...test_pipelines_common import PipelineTesterMixin
39

40

41
torch.backends.cuda.matmul.allow_tf32 = False
42

43

44
class StableDiffusionInstructPix2PixPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
45
    pipeline_class = StableDiffusionInstructPix2PixPipeline
46
    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width", "cross_attention_kwargs"}
47
    batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
48

49
    def get_dummy_components(self):
50
        torch.manual_seed(0)
51
        unet = UNet2DConditionModel(
52
            block_out_channels=(32, 64),
53
            layers_per_block=2,
54
            sample_size=32,
55
            in_channels=8,
56
            out_channels=4,
57
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
58
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
59
            cross_attention_dim=32,
60
        )
61
        scheduler = PNDMScheduler(skip_prk_steps=True)
62
        torch.manual_seed(0)
63
        vae = AutoencoderKL(
64
            block_out_channels=[32, 64],
65
            in_channels=3,
66
            out_channels=3,
67
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
68
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
69
            latent_channels=4,
70
        )
71
        torch.manual_seed(0)
72
        text_encoder_config = CLIPTextConfig(
73
            bos_token_id=0,
74
            eos_token_id=2,
75
            hidden_size=32,
76
            intermediate_size=37,
77
            layer_norm_eps=1e-05,
78
            num_attention_heads=4,
79
            num_hidden_layers=5,
80
            pad_token_id=1,
81
            vocab_size=1000,
82
        )
83
        text_encoder = CLIPTextModel(text_encoder_config)
84
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
85

86
        components = {
87
            "unet": unet,
88
            "scheduler": scheduler,
89
            "vae": vae,
90
            "text_encoder": text_encoder,
91
            "tokenizer": tokenizer,
92
            "safety_checker": None,
93
            "feature_extractor": None,
94
        }
95
        return components
96

97
    def get_dummy_inputs(self, device, seed=0):
98
        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
99
        image = image.cpu().permute(0, 2, 3, 1)[0]
100
        image = Image.fromarray(np.uint8(image)).convert("RGB")
101
        if str(device).startswith("mps"):
102
            generator = torch.manual_seed(seed)
103
        else:
104
            generator = torch.Generator(device=device).manual_seed(seed)
105
        inputs = {
106
            "prompt": "A painting of a squirrel eating a burger",
107
            "image": image,
108
            "generator": generator,
109
            "num_inference_steps": 2,
110
            "guidance_scale": 6.0,
111
            "image_guidance_scale": 1,
112
            "output_type": "numpy",
113
        }
114
        return inputs
115

116
    def test_stable_diffusion_pix2pix_default_case(self):
117
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
118
        components = self.get_dummy_components()
119
        sd_pipe = StableDiffusionInstructPix2PixPipeline(**components)
120
        sd_pipe = sd_pipe.to(device)
121
        sd_pipe.set_progress_bar_config(disable=None)
122

123
        inputs = self.get_dummy_inputs(device)
124
        image = sd_pipe(**inputs).images
125
        image_slice = image[0, -3:, -3:, -1]
126
        assert image.shape == (1, 32, 32, 3)
127
        expected_slice = np.array([0.7318, 0.3723, 0.4662, 0.623, 0.5770, 0.5014, 0.4281, 0.5550, 0.4813])
128

129
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
130

131
    def test_stable_diffusion_pix2pix_negative_prompt(self):
132
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
133
        components = self.get_dummy_components()
134
        sd_pipe = StableDiffusionInstructPix2PixPipeline(**components)
135
        sd_pipe = sd_pipe.to(device)
136
        sd_pipe.set_progress_bar_config(disable=None)
137

138
        inputs = self.get_dummy_inputs(device)
139
        negative_prompt = "french fries"
140
        output = sd_pipe(**inputs, negative_prompt=negative_prompt)
141
        image = output.images
142
        image_slice = image[0, -3:, -3:, -1]
143

144
        assert image.shape == (1, 32, 32, 3)
145
        expected_slice = np.array([0.7323, 0.3688, 0.4611, 0.6255, 0.5746, 0.5017, 0.433, 0.5553, 0.4827])
146

147
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
148

149
    def test_stable_diffusion_pix2pix_multiple_init_images(self):
150
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
151
        components = self.get_dummy_components()
152
        sd_pipe = StableDiffusionInstructPix2PixPipeline(**components)
153
        sd_pipe = sd_pipe.to(device)
154
        sd_pipe.set_progress_bar_config(disable=None)
155

156
        inputs = self.get_dummy_inputs(device)
157
        inputs["prompt"] = [inputs["prompt"]] * 2
158

159
        image = np.array(inputs["image"]).astype(np.float32) / 255.0
160
        image = torch.from_numpy(image).unsqueeze(0).to(device)
161
        image = image.permute(0, 3, 1, 2)
162
        inputs["image"] = image.repeat(2, 1, 1, 1)
163

164
        image = sd_pipe(**inputs).images
165
        image_slice = image[-1, -3:, -3:, -1]
166

167
        assert image.shape == (2, 32, 32, 3)
168
        expected_slice = np.array([0.606, 0.5712, 0.5099, 0.598, 0.5805, 0.7205, 0.6793, 0.554, 0.5607])
169

170
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
171

172
    def test_stable_diffusion_pix2pix_euler(self):
173
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
174
        components = self.get_dummy_components()
175
        components["scheduler"] = EulerAncestralDiscreteScheduler(
176
            beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"
177
        )
178
        sd_pipe = StableDiffusionInstructPix2PixPipeline(**components)
179
        sd_pipe = sd_pipe.to(device)
180
        sd_pipe.set_progress_bar_config(disable=None)
181

182
        inputs = self.get_dummy_inputs(device)
183
        image = sd_pipe(**inputs).images
184
        image_slice = image[0, -3:, -3:, -1]
185

186
        slice = [round(x, 4) for x in image_slice.flatten().tolist()]
187
        print(",".join([str(x) for x in slice]))
188

189
        assert image.shape == (1, 32, 32, 3)
190
        expected_slice = np.array([0.726, 0.3902, 0.4868, 0.585, 0.5672, 0.511, 0.3906, 0.551, 0.4846])
191

192
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
193

194

195
@slow
196
@require_torch_gpu
197
class StableDiffusionInstructPix2PixPipelineSlowTests(unittest.TestCase):
198
    def tearDown(self):
199
        super().tearDown()
200
        gc.collect()
201
        torch.cuda.empty_cache()
202

203
    def get_inputs(self, seed=0):
204
        generator = torch.manual_seed(seed)
205
        image = load_image(
206
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_pix2pix/example.jpg"
207
        )
208
        inputs = {
209
            "prompt": "turn him into a cyborg",
210
            "image": image,
211
            "generator": generator,
212
            "num_inference_steps": 3,
213
            "guidance_scale": 7.5,
214
            "image_guidance_scale": 1.0,
215
            "output_type": "numpy",
216
        }
217
        return inputs
218

219
    def test_stable_diffusion_pix2pix_default(self):
220
        pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
221
            "timbrooks/instruct-pix2pix", safety_checker=None
222
        )
223
        pipe.to(torch_device)
224
        pipe.set_progress_bar_config(disable=None)
225
        pipe.enable_attention_slicing()
226

227
        inputs = self.get_inputs()
228
        image = pipe(**inputs).images
229
        image_slice = image[0, -3:, -3:, -1].flatten()
230

231
        assert image.shape == (1, 512, 512, 3)
232
        expected_slice = np.array([0.5902, 0.6015, 0.6027, 0.5983, 0.6092, 0.6061, 0.5765, 0.5785, 0.5555])
233

234
        assert np.abs(expected_slice - image_slice).max() < 1e-3
235

236
    def test_stable_diffusion_pix2pix_k_lms(self):
237
        pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
238
            "timbrooks/instruct-pix2pix", safety_checker=None
239
        )
240
        pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
241
        pipe.to(torch_device)
242
        pipe.set_progress_bar_config(disable=None)
243
        pipe.enable_attention_slicing()
244

245
        inputs = self.get_inputs()
246
        image = pipe(**inputs).images
247
        image_slice = image[0, -3:, -3:, -1].flatten()
248

249
        assert image.shape == (1, 512, 512, 3)
250
        expected_slice = np.array([0.6578, 0.6817, 0.6972, 0.6761, 0.6856, 0.6916, 0.6428, 0.6516, 0.6301])
251

252
        assert np.abs(expected_slice - image_slice).max() < 1e-3
253

254
    def test_stable_diffusion_pix2pix_ddim(self):
255
        pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
256
            "timbrooks/instruct-pix2pix", safety_checker=None
257
        )
258
        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
259
        pipe.to(torch_device)
260
        pipe.set_progress_bar_config(disable=None)
261
        pipe.enable_attention_slicing()
262

263
        inputs = self.get_inputs()
264
        image = pipe(**inputs).images
265
        image_slice = image[0, -3:, -3:, -1].flatten()
266

267
        assert image.shape == (1, 512, 512, 3)
268
        expected_slice = np.array([0.3828, 0.3834, 0.3818, 0.3792, 0.3865, 0.3752, 0.3792, 0.3847, 0.3753])
269

270
        assert np.abs(expected_slice - image_slice).max() < 1e-3
271

272
    def test_stable_diffusion_pix2pix_intermediate_state(self):
273
        number_of_steps = 0
274

275
        def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None:
276
            callback_fn.has_been_called = True
277
            nonlocal number_of_steps
278
            number_of_steps += 1
279
            if step == 1:
280
                latents = latents.detach().cpu().numpy()
281
                assert latents.shape == (1, 4, 64, 64)
282
                latents_slice = latents[0, -3:, -3:, -1]
283
                expected_slice = np.array([-0.2463, -0.4644, -0.9756, 1.5176, 1.4414, 0.7866, 0.9897, 0.8521, 0.7983])
284

285
                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2
286
            elif step == 2:
287
                latents = latents.detach().cpu().numpy()
288
                assert latents.shape == (1, 4, 64, 64)
289
                latents_slice = latents[0, -3:, -3:, -1]
290
                expected_slice = np.array([-0.2644, -0.4626, -0.9653, 1.5176, 1.4551, 0.7686, 0.9805, 0.8452, 0.8115])
291

292
                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2
293

294
        callback_fn.has_been_called = False
295

296
        pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
297
            "timbrooks/instruct-pix2pix", safety_checker=None, torch_dtype=torch.float16
298
        )
299
        pipe = pipe.to(torch_device)
300
        pipe.set_progress_bar_config(disable=None)
301
        pipe.enable_attention_slicing()
302

303
        inputs = self.get_inputs()
304
        pipe(**inputs, callback=callback_fn, callback_steps=1)
305
        assert callback_fn.has_been_called
306
        assert number_of_steps == 3
307

308
    def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
309
        torch.cuda.empty_cache()
310
        torch.cuda.reset_max_memory_allocated()
311
        torch.cuda.reset_peak_memory_stats()
312

313
        pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
314
            "timbrooks/instruct-pix2pix", safety_checker=None, torch_dtype=torch.float16
315
        )
316
        pipe = pipe.to(torch_device)
317
        pipe.set_progress_bar_config(disable=None)
318
        pipe.enable_attention_slicing(1)
319
        pipe.enable_sequential_cpu_offload()
320

321
        inputs = self.get_inputs()
322
        _ = pipe(**inputs)
323

324
        mem_bytes = torch.cuda.max_memory_allocated()
325
        # make sure that less than 2.2 GB is allocated
326
        assert mem_bytes < 2.2 * 10**9
327

328
    def test_stable_diffusion_pix2pix_pipeline_multiple_of_8(self):
329
        inputs = self.get_inputs()
330
        # resize to resolution that is divisible by 8 but not 16 or 32
331
        inputs["image"] = inputs["image"].resize((504, 504))
332

333
        model_id = "timbrooks/instruct-pix2pix"
334
        pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
335
            model_id,
336
            safety_checker=None,
337
        )
338
        pipe.to(torch_device)
339
        pipe.set_progress_bar_config(disable=None)
340
        pipe.enable_attention_slicing()
341

342
        output = pipe(**inputs)
343
        image = output.images[0]
344

345
        image_slice = image[255:258, 383:386, -1]
346

347
        assert image.shape == (504, 504, 3)
348
        expected_slice = np.array([0.2726, 0.2529, 0.2664, 0.2655, 0.2641, 0.2642, 0.2591, 0.2649, 0.2590])
349

350
        assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-3
351

352
Product

Resources

Company