Path: blob/main/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py
# coding=utf-8
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import unittest

import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

from diffusers import (
    AutoencoderKL,
    DDIMScheduler,
    EulerAncestralDiscreteScheduler,
    PNDMScheduler,
    StableDiffusionModelEditingPipeline,
    UNet2DConditionModel,
)
from diffusers.utils import slow, torch_device
from diffusers.utils.testing_utils import require_torch_gpu, skip_mps

from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
from ...test_pipelines_common import PipelineTesterMixin


# Disable TF32 matmuls so the expected slices stay reproducible across GPU generations.
torch.backends.cuda.matmul.allow_tf32 = False


@skip_mps
class StableDiffusionModelEditingPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = StableDiffusionModelEditingPipeline
    params = TEXT_TO_IMAGE_PARAMS
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS

    def get_dummy_components(self):
        # Tiny UNet/VAE/CLIP components keep the fast tests cheap enough to run on CPU.
        torch.manual_seed(0)
        unet = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
        )
        scheduler = DDIMScheduler()
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
        )
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
        )
        text_encoder = CLIPTextModel(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        components = {
            "unet": unet,
            "scheduler": scheduler,
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "safety_checker": None,
            "feature_extractor": None,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        generator = torch.manual_seed(seed)
        inputs = {
            "prompt": "A field of roses",
            "generator": generator,
            # Setting height and width to None to prevent OOMs on CPU.
            "height": None,
            "width": None,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "output_type": "numpy",
        }
        return inputs

    def test_stable_diffusion_model_editing_default_case(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionModelEditingPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1]
        assert image.shape == (1, 64, 64, 3)

        expected_slice = np.array(
            [0.5217179, 0.50658035, 0.5003239, 0.41109088, 0.3595158, 0.46607107, 0.5323504, 0.5335255, 0.49187922]
        )

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_model_editing_negative_prompt(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionModelEditingPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        negative_prompt = "french fries"
        output = sd_pipe(**inputs, negative_prompt=negative_prompt)
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)

        expected_slice = np.array(
            [0.546259, 0.5108156, 0.50897664, 0.41931948, 0.3748669, 0.4669299, 0.5427151, 0.54561913, 0.49353]
        )

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_model_editing_euler(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        components["scheduler"] = EulerAncestralDiscreteScheduler(
            beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"
        )
        sd_pipe = StableDiffusionModelEditingPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)

        expected_slice = np.array(
            [0.47106352, 0.53579676, 0.45798016, 0.514294, 0.56856745, 0.4788605, 0.54380214, 0.5046455, 0.50404465]
        )

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_model_editing_pndm(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        components["scheduler"] = PNDMScheduler()
        sd_pipe = StableDiffusionModelEditingPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        # The pipeline does not support PNDM schedulers, so check that a ValueError is raised.
        with self.assertRaises(ValueError):
            _ = sd_pipe(**inputs).images


@slow
@require_torch_gpu
class StableDiffusionModelEditingSlowTests(unittest.TestCase):
    def tearDown(self):
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def get_inputs(self, seed=0):
        generator = torch.manual_seed(seed)
        inputs = {
            "prompt": "A field of roses",
            "generator": generator,
            "num_inference_steps": 3,
            "guidance_scale": 7.5,
            "output_type": "numpy",
        }
        return inputs

    def test_stable_diffusion_model_editing_default(self):
        model_ckpt = "CompVis/stable-diffusion-v1-4"
        pipe = StableDiffusionModelEditingPipeline.from_pretrained(model_ckpt, safety_checker=None)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs()
        image = pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)

        expected_slice = np.array(
            [0.6749496, 0.6386453, 0.51443267, 0.66094905, 0.61921215, 0.5491332, 0.5744417, 0.58075106, 0.5174658]
        )

        assert np.abs(expected_slice - image_slice).max() < 1e-2

        # make sure the image changes after editing the model
        pipe.edit_model("A pack of roses", "A pack of blue roses")

        image = pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)

        assert np.abs(expected_slice - image_slice).max() > 1e-1

    def test_stable_diffusion_model_editing_pipeline_with_sequential_cpu_offloading(self):
        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        model_ckpt = "CompVis/stable-diffusion-v1-4"
        scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler")
        pipe = StableDiffusionModelEditingPipeline.from_pretrained(
            model_ckpt, scheduler=scheduler, safety_checker=None
        )
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing(1)
        pipe.enable_sequential_cpu_offload()

        inputs = self.get_inputs()
        _ = pipe(**inputs)

        mem_bytes = torch.cuda.max_memory_allocated()
        # make sure that less than 4.4 GB is allocated
        assert mem_bytes < 4.4 * 10**9
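

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the test suite): a minimal example of
# the editing flow exercised by the slow tests above, kept commented out so the
# module stays import-safe. The prompts, step count, and output filename are
# assumptions chosen for illustration, and a CUDA device is assumed.
#
# from diffusers import StableDiffusionModelEditingPipeline
#
# pipe = StableDiffusionModelEditingPipeline.from_pretrained(
#     "CompVis/stable-diffusion-v1-4", safety_checker=None
# ).to("cuda")
# # Edit the model in place so the source concept renders as the destination
# # concept in subsequent generations.
# pipe.edit_model("A pack of roses", "A pack of blue roses")
# image = pipe("A field of roses", num_inference_steps=50).images[0]
# image.save("blue_roses.png")
# ---------------------------------------------------------------------------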