Path: blob/main/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py
# coding=utf-8
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import unittest

import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

from diffusers import (
    AutoencoderKL,
    DDIMScheduler,
    EulerAncestralDiscreteScheduler,
    LMSDiscreteScheduler,
    PNDMScheduler,
    StableDiffusionPanoramaPipeline,
    UNet2DConditionModel,
)
from diffusers.utils import slow, torch_device
from diffusers.utils.testing_utils import require_torch_gpu, skip_mps

from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
from ...test_pipelines_common import PipelineTesterMixin


# Disable TF32 matmuls so GPU numerics match the reference slices exactly.
torch.backends.cuda.matmul.allow_tf32 = False


@skip_mps
class StableDiffusionPanoramaPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = StableDiffusionPanoramaPipeline
    params = TEXT_TO_IMAGE_PARAMS
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS

    def get_dummy_components(self):
        # Tiny UNet/VAE/CLIP stack so the fast tests run on CPU in seconds.
        torch.manual_seed(0)
        unet = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
        )
        scheduler = DDIMScheduler()
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
        )
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
        )
        text_encoder = CLIPTextModel(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        components = {
            "unet": unet,
            "scheduler": scheduler,
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "safety_checker": None,
            "feature_extractor": None,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        generator = torch.manual_seed(seed)
        inputs = {
            "prompt": "a photo of the dolomites",
            "generator": generator,
            # Setting height and width to None to prevent OOMs on CPU.
            "height": None,
            "width": None,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "output_type": "numpy",
        }
        return inputs

    def test_stable_diffusion_panorama_default_case(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPanoramaPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)

        expected_slice = np.array([0.5101, 0.5006, 0.4962, 0.3995, 0.3501, 0.4632, 0.5339, 0.525, 0.4878])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_panorama_negative_prompt(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPanoramaPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        negative_prompt = "french fries"
        output = sd_pipe(**inputs, negative_prompt=negative_prompt)
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)

        expected_slice = np.array([0.5326, 0.5009, 0.5074, 0.4133, 0.371, 0.464, 0.5432, 0.5429, 0.4896])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_panorama_euler(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        components["scheduler"] = EulerAncestralDiscreteScheduler(
            beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"
        )
        sd_pipe = StableDiffusionPanoramaPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)

        expected_slice = np.array(
            [0.48235387, 0.5423796, 0.46016198, 0.5377287, 0.5803722, 0.4876525, 0.5515428, 0.5045897, 0.50709957]
        )

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_panorama_pndm(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        components["scheduler"] = PNDMScheduler()
        sd_pipe = StableDiffusionPanoramaPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        # The pipeline does not support PNDMScheduler, so it should raise a ValueError.
        with self.assertRaises(ValueError):
            _ = sd_pipe(**inputs).images
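# Background for the slow tests below: StableDiffusionPanoramaPipeline is based
# on MultiDiffusion (https://multidiffusion.github.io/), which denoises a wide
# latent canvas by running the UNet over overlapping 64x64 latent windows and
# averaging the overlapping predictions at each step. With the pipeline's
# default height=512 / width=2048, the latent canvas has shape (1, 4, 64, 256),
# which is why the shape asserts below differ from the 64x64 fast tests above.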
@slow
@require_torch_gpu
class StableDiffusionPanoramaSlowTests(unittest.TestCase):
    def tearDown(self):
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def get_inputs(self, seed=0):
        generator = torch.manual_seed(seed)
        inputs = {
            "prompt": "a photo of the dolomites",
            "generator": generator,
            "num_inference_steps": 3,
            "guidance_scale": 7.5,
            "output_type": "numpy",
        }
        return inputs

    def test_stable_diffusion_panorama_default(self):
        model_ckpt = "stabilityai/stable-diffusion-2-base"
        scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler")
        pipe = StableDiffusionPanoramaPipeline.from_pretrained(model_ckpt, scheduler=scheduler, safety_checker=None)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs()
        image = pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 2048, 3)

        expected_slice = np.array(
            [
                0.36968392,
                0.27025372,
                0.32446766,
                0.28379387,
                0.36363274,
                0.30733347,
                0.27100027,
                0.27054125,
                0.25536096,
            ]
        )

        assert np.abs(expected_slice - image_slice).max() < 1e-2

    def test_stable_diffusion_panorama_k_lms(self):
        pipe = StableDiffusionPanoramaPipeline.from_pretrained(
            "stabilityai/stable-diffusion-2-base", safety_checker=None
        )
        pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs()
        image = pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 2048, 3)

        expected_slice = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])

        assert np.abs(expected_slice - image_slice).max() < 1e-3
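    # The intermediate-state test below uses the diffusers callback API:
    # passing callback=fn and callback_steps=1 to the pipeline call invokes
    # fn(step, timestep, latents) once per denoising step, exposing the full
    # (1, 4, 64, 256) latent canvas so intermediate values can be checked
    # against reference slices before decoding.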
    def test_stable_diffusion_panorama_intermediate_state(self):
        number_of_steps = 0

        def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None:
            callback_fn.has_been_called = True
            nonlocal number_of_steps
            number_of_steps += 1
            if step == 1:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 256)
                latents_slice = latents[0, -3:, -3:, -1]

                expected_slice = np.array(
                    [
                        0.18681869,
                        0.33907816,
                        0.5361276,
                        0.14432865,
                        -0.02856611,
                        -0.73941123,
                        0.23397987,
                        0.47322682,
                        -0.37823164,
                    ]
                )
                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2
            elif step == 2:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 256)
                latents_slice = latents[0, -3:, -3:, -1]

                expected_slice = np.array(
                    [
                        0.18539645,
                        0.33987248,
                        0.5378559,
                        0.14437142,
                        -0.02455261,
                        -0.7338317,
                        0.23990755,
                        0.47356272,
                        -0.3786505,
                    ]
                )

                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2

        callback_fn.has_been_called = False

        model_ckpt = "stabilityai/stable-diffusion-2-base"
        scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler")
        pipe = StableDiffusionPanoramaPipeline.from_pretrained(model_ckpt, scheduler=scheduler, safety_checker=None)
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs()
        pipe(**inputs, callback=callback_fn, callback_steps=1)
        assert callback_fn.has_been_called
        assert number_of_steps == 3

    def test_stable_diffusion_panorama_pipeline_with_sequential_cpu_offloading(self):
        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        model_ckpt = "stabilityai/stable-diffusion-2-base"
        scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler")
        pipe = StableDiffusionPanoramaPipeline.from_pretrained(model_ckpt, scheduler=scheduler, safety_checker=None)
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing(1)
        pipe.enable_sequential_cpu_offload()

        inputs = self.get_inputs()
        _ = pipe(**inputs)

        mem_bytes = torch.cuda.max_memory_allocated()
        # make sure that less than 5.5 GB is allocated
        assert mem_bytes < 5.5 * 10**9