Path: blob/main/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py
# coding=utf-8
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import random
import unittest

import numpy as np
import torch
from PIL import Image
from transformers import CLIPImageProcessor, CLIPVisionConfig, CLIPVisionModelWithProjection

from diffusers import (
    AutoencoderKL,
    DPMSolverMultistepScheduler,
    PNDMScheduler,
    StableDiffusionImageVariationPipeline,
    UNet2DConditionModel,
)
from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device
from diffusers.utils.testing_utils import require_torch_gpu

from ...pipeline_params import IMAGE_VARIATION_BATCH_PARAMS, IMAGE_VARIATION_PARAMS
from ...test_pipelines_common import PipelineTesterMixin


torch.backends.cuda.matmul.allow_tf32 = False


class StableDiffusionImageVariationPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = StableDiffusionImageVariationPipeline
    params = IMAGE_VARIATION_PARAMS
    batch_params = IMAGE_VARIATION_BATCH_PARAMS

    def get_dummy_components(self):
        torch.manual_seed(0)
        unet = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
        )
        scheduler = PNDMScheduler(skip_prk_steps=True)
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
        )
        torch.manual_seed(0)
        image_encoder_config = CLIPVisionConfig(
            hidden_size=32,
            projection_dim=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            image_size=32,
            patch_size=4,
        )
        image_encoder = CLIPVisionModelWithProjection(image_encoder_config)
        feature_extractor = CLIPImageProcessor(crop_size=32, size=32)

        components = {
            "unet": unet,
            "scheduler": scheduler,
            "vae": vae,
            "image_encoder": image_encoder,
            "feature_extractor": feature_extractor,
            "safety_checker": None,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed))
        image = image.cpu().permute(0, 2, 3, 1)[0]
        image = Image.fromarray(np.uint8(image)).convert("RGB").resize((32, 32))
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        inputs = {
            "image": image,
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "output_type": "numpy",
        }
        return inputs
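
    # The fast tests below run the full pipeline end-to-end on CPU with the
    # tiny dummy components above and compare a 3x3 corner slice of the output
    # against hard-coded reference values: a cheap regression guard, since any
    # change to the model graph, scheduler, or RNG handling shifts the values
    # past the 1e-3 tolerance.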
    def test_stable_diffusion_img_variation_default_case(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionImageVariationPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array([0.5167, 0.5746, 0.4835, 0.4914, 0.5605, 0.4691, 0.5201, 0.4898, 0.4958])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3

    def test_stable_diffusion_img_variation_multiple_images(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionImageVariationPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        inputs["image"] = 2 * [inputs["image"]]
        output = sd_pipe(**inputs)

        image = output.images

        image_slice = image[-1, -3:, -3:, -1]

        assert image.shape == (2, 64, 64, 3)
        expected_slice = np.array([0.6568, 0.5470, 0.5684, 0.5444, 0.5945, 0.6221, 0.5508, 0.5531, 0.5263])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3


@slow
@require_torch_gpu
class StableDiffusionImageVariationPipelineSlowTests(unittest.TestCase):
    def tearDown(self):
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
        generator = torch.Generator(device=generator_device).manual_seed(seed)
        init_image = load_image(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_imgvar/input_image_vermeer.png"
        )
        latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64))
        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
        inputs = {
            "image": init_image,
            "latents": latents,
            "generator": generator,
            "num_inference_steps": 3,
            "guidance_scale": 7.5,
            "output_type": "numpy",
        }
        return inputs

    def test_stable_diffusion_img_variation_pipeline_default(self):
        sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained(
            "lambdalabs/sd-image-variations-diffusers", safety_checker=None
        )
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.84491, 0.90789, 0.75708, 0.78734, 0.83485, 0.70099, 0.66938, 0.68727, 0.61379])
        assert np.abs(image_slice - expected_slice).max() < 1e-4
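
    # The `callback` / `callback_steps` arguments expose the intermediate
    # latents during denoising. The test below registers a callback that runs
    # every step, snapshots the (1, 4, 64, 64) latents at steps 1 and 2, and
    # checks them against reference slices, pinning down the scheduler's
    # per-step behavior rather than just the final image.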
    def test_stable_diffusion_img_variation_intermediate_state(self):
        number_of_steps = 0

        def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None:
            callback_fn.has_been_called = True
            nonlocal number_of_steps
            number_of_steps += 1
            if step == 1:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 64)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array(
                    [-0.1621, 0.2837, -0.7979, -0.1221, -1.3057, 0.7681, -2.1191, 0.0464, 1.6309]
                )

                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2
            elif step == 2:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 64)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array([0.6299, 1.7500, 1.1992, -2.1582, -1.8994, 0.7334, -0.7090, 1.0137, 1.5273])

                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2

        callback_fn.has_been_called = False

        pipe = StableDiffusionImageVariationPipeline.from_pretrained(
            "fusing/sd-image-variations-diffusers",
            safety_checker=None,
            torch_dtype=torch.float16,
        )
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        pipe(**inputs, callback=callback_fn, callback_steps=1)
        assert callback_fn.has_been_called
        assert number_of_steps == inputs["num_inference_steps"]

    def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        model_id = "fusing/sd-image-variations-diffusers"
        pipe = StableDiffusionImageVariationPipeline.from_pretrained(
            model_id, safety_checker=None, torch_dtype=torch.float16
        )
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing(1)
        pipe.enable_sequential_cpu_offload()

        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        _ = pipe(**inputs)

        mem_bytes = torch.cuda.max_memory_allocated()
        # make sure that less than 2.6 GB is allocated
        assert mem_bytes < 2.6 * 10**9


@nightly
@require_torch_gpu
class StableDiffusionImageVariationPipelineNightlyTests(unittest.TestCase):
    def tearDown(self):
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
        generator = torch.Generator(device=generator_device).manual_seed(seed)
        init_image = load_image(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_imgvar/input_image_vermeer.png"
        )
        latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64))
        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
        inputs = {
            "image": init_image,
            "latents": latents,
            "generator": generator,
            "num_inference_steps": 50,
            "guidance_scale": 7.5,
            "output_type": "numpy",
        }
        return inputs

    def test_img_variation_pndm(self):
        sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained("fusing/sd-image-variations-diffusers")
        sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_imgvar/lambdalabs_variations_pndm.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3

    def test_img_variation_dpm(self):
        sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained("fusing/sd-image-variations-diffusers")
        sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        inputs["num_inference_steps"] = 25
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_imgvar/lambdalabs_variations_dpm_multi.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3
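

# For reference, typical standalone usage of the pipeline exercised above looks
# roughly like the sketch below. It is not part of the test suite and only runs
# when this module is executed directly; it assumes a CUDA device, network
# access, and reuses the checkpoint and input image from the slow tests. The
# output filename is illustrative.
if __name__ == "__main__":
    pipe = StableDiffusionImageVariationPipeline.from_pretrained(
        "lambdalabs/sd-image-variations-diffusers", safety_checker=None
    )
    pipe = pipe.to("cuda")
    init_image = load_image(
        "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
        "/stable_diffusion_imgvar/input_image_vermeer.png"
    )
    # The pipeline takes an image (no text prompt) and returns PIL images.
    variation = pipe(init_image, num_inference_steps=25, guidance_scale=7.5).images[0]
    variation.save("variation.png")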