Path: blob/main/tests/pipelines/stable_unclip/test_stable_unclip.py
import gc
import unittest

import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer

from diffusers import (
    AutoencoderKL,
    DDIMScheduler,
    DDPMScheduler,
    PriorTransformer,
    StableUnCLIPPipeline,
    UNet2DConditionModel,
)
from diffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
from diffusers.utils.testing_utils import load_numpy, require_torch_gpu, slow, torch_device

from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
from ...test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference


class StableUnCLIPPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = StableUnCLIPPipeline
    params = TEXT_TO_IMAGE_PARAMS
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS

    # TODO(will) Expected attn_bias.stride(1) == 0 to be true, but got false
    test_xformers_attention = False

    def get_dummy_components(self):
        embedder_hidden_size = 32
        embedder_projection_dim = embedder_hidden_size

        # prior components

        torch.manual_seed(0)
        prior_tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        torch.manual_seed(0)
        prior_text_encoder = CLIPTextModelWithProjection(
            CLIPTextConfig(
                bos_token_id=0,
                eos_token_id=2,
                hidden_size=embedder_hidden_size,
                projection_dim=embedder_projection_dim,
                intermediate_size=37,
                layer_norm_eps=1e-05,
                num_attention_heads=4,
                num_hidden_layers=5,
                pad_token_id=1,
                vocab_size=1000,
            )
        )

        torch.manual_seed(0)
        prior = PriorTransformer(
            num_attention_heads=2,
            attention_head_dim=12,
            embedding_dim=embedder_projection_dim,
            num_layers=1,
        )

        torch.manual_seed(0)
        prior_scheduler = DDPMScheduler(
            variance_type="fixed_small_log",
            prediction_type="sample",
            num_train_timesteps=1000,
            clip_sample=True,
            clip_sample_range=5.0,
            beta_schedule="squaredcos_cap_v2",
        )

        # regular denoising components

        torch.manual_seed(0)
        image_normalizer = StableUnCLIPImageNormalizer(embedding_dim=embedder_hidden_size)
        image_noising_scheduler = DDPMScheduler(beta_schedule="squaredcos_cap_v2")

        torch.manual_seed(0)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        torch.manual_seed(0)
        text_encoder = CLIPTextModel(
            CLIPTextConfig(
                bos_token_id=0,
                eos_token_id=2,
                hidden_size=embedder_hidden_size,
                projection_dim=32,
                intermediate_size=37,
                layer_norm_eps=1e-05,
                num_attention_heads=4,
                num_hidden_layers=5,
                pad_token_id=1,
                vocab_size=1000,
            )
        )

        torch.manual_seed(0)
        unet = UNet2DConditionModel(
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
            up_block_types=("UpBlock2D", "CrossAttnUpBlock2D"),
            block_out_channels=(32, 64),
            attention_head_dim=(2, 4),
            class_embed_type="projection",
            # The class embeddings are the noise-augmented image embeddings,
            # i.e. the image embeddings concatenated with the noised embeddings of the same dimension.
            projection_class_embeddings_input_dim=embedder_projection_dim * 2,
            cross_attention_dim=embedder_hidden_size,
            layers_per_block=1,
            upcast_attention=True,
            use_linear_projection=True,
        )

        torch.manual_seed(0)
        scheduler = DDIMScheduler(
            beta_schedule="scaled_linear",
            beta_start=0.00085,
            beta_end=0.012,
            prediction_type="v_prediction",
            set_alpha_to_one=False,
            steps_offset=1,
        )

        torch.manual_seed(0)
        vae = AutoencoderKL()

        components = {
            # prior components
            "prior_tokenizer": prior_tokenizer,
            "prior_text_encoder": prior_text_encoder,
            "prior": prior,
            "prior_scheduler": prior_scheduler,
            # image noising components
            "image_normalizer": image_normalizer,
            "image_noising_scheduler": image_noising_scheduler,
            # regular denoising components
            "tokenizer": tokenizer,
            "text_encoder": text_encoder,
            "unet": unet,
            "scheduler": scheduler,
            "vae": vae,
        }

        return components

    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "generator": generator,
            "num_inference_steps": 2,
            "prior_num_inference_steps": 2,
            "output_type": "numpy",
        }
        return inputs

    # Overriding PipelineTesterMixin::test_attention_slicing_forward_pass
    # because UnCLIP GPU non-determinism requires a looser check.
    def test_attention_slicing_forward_pass(self):
        test_max_difference = torch_device == "cpu"

        self._test_attention_slicing_forward_pass(test_max_difference=test_max_difference)

    # Overriding PipelineTesterMixin::test_inference_batch_single_identical
    # because UnCLIP non-determinism requires a looser check.
    def test_inference_batch_single_identical(self):
        test_max_difference = torch_device in ["cpu", "mps"]

        self._test_inference_batch_single_identical(test_max_difference=test_max_difference)


@slow
@require_torch_gpu
class StableUnCLIPPipelineIntegrationTests(unittest.TestCase):
    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def test_stable_unclip(self):
        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_unclip/stable_unclip_2_1_l_anime_turtle_fp16.npy"
        )

        pipe = StableUnCLIPPipeline.from_pretrained("fusing/stable-unclip-2-1-l", torch_dtype=torch.float16)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        # stable unclip will OOM when integration tests are run on a V100,
        # so turn on memory savings
        pipe.enable_attention_slicing()
        pipe.enable_sequential_cpu_offload()

        generator = torch.Generator(device="cpu").manual_seed(0)
        output = pipe("anime turle", generator=generator, output_type="np")

        image = output.images[0]

        assert image.shape == (768, 768, 3)

        assert_mean_pixel_difference(image, expected_image)

    def test_stable_unclip_pipeline_with_sequential_cpu_offloading(self):
        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        pipe = StableUnCLIPPipeline.from_pretrained("fusing/stable-unclip-2-1-l", torch_dtype=torch.float16)
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()
        pipe.enable_sequential_cpu_offload()

        _ = pipe(
            "anime turtle",
            prior_num_inference_steps=2,
            num_inference_steps=2,
            output_type="np",
        )

        mem_bytes = torch.cuda.max_memory_allocated()
        # make sure that less than 7 GB is allocated
        assert mem_bytes < 7 * 10**9