Path: blob/main/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py
import gc
import random
import unittest

import torch
from transformers import (
    CLIPImageProcessor,
    CLIPTextConfig,
    CLIPTextModel,
    CLIPTokenizer,
    CLIPVisionConfig,
    CLIPVisionModelWithProjection,
)

from diffusers import AutoencoderKL, DDIMScheduler, DDPMScheduler, StableUnCLIPImg2ImgPipeline, UNet2DConditionModel
from diffusers.pipelines.pipeline_utils import DiffusionPipeline
from diffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.testing_utils import floats_tensor, load_image, load_numpy, require_torch_gpu, slow, torch_device

from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
from ...test_pipelines_common import (
    PipelineTesterMixin,
    assert_mean_pixel_difference,
)


class StableUnCLIPImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = StableUnCLIPImg2ImgPipeline
    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS
    batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS

    def get_dummy_components(self):
        embedder_hidden_size = 32
        embedder_projection_dim = embedder_hidden_size

        # image encoding components

        feature_extractor = CLIPImageProcessor(crop_size=32, size=32)

        image_encoder = CLIPVisionModelWithProjection(
            CLIPVisionConfig(
                hidden_size=embedder_hidden_size,
                projection_dim=embedder_projection_dim,
                num_hidden_layers=5,
                num_attention_heads=4,
                image_size=32,
                intermediate_size=37,
                patch_size=1,
            )
        )

        # image noising components

        torch.manual_seed(0)
        image_normalizer = StableUnCLIPImageNormalizer(embedding_dim=embedder_hidden_size)
        image_noising_scheduler = DDPMScheduler(beta_schedule="squaredcos_cap_v2")

        # regular denoising components

        torch.manual_seed(0)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        torch.manual_seed(0)
        text_encoder = CLIPTextModel(
            CLIPTextConfig(
                bos_token_id=0,
                eos_token_id=2,
                hidden_size=embedder_hidden_size,
                projection_dim=32,
                intermediate_size=37,
                layer_norm_eps=1e-05,
                num_attention_heads=4,
                num_hidden_layers=5,
                pad_token_id=1,
                vocab_size=1000,
            )
        )

        torch.manual_seed(0)
        unet = UNet2DConditionModel(
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
            up_block_types=("UpBlock2D", "CrossAttnUpBlock2D"),
            block_out_channels=(32, 64),
            attention_head_dim=(2, 4),
            class_embed_type="projection",
            # The class embeddings are the noise-augmented image embeddings,
            # i.e. the image embeddings concatenated with the noised embeddings of the same dimension.
            projection_class_embeddings_input_dim=embedder_projection_dim * 2,
            cross_attention_dim=embedder_hidden_size,
            layers_per_block=1,
            upcast_attention=True,
            use_linear_projection=True,
        )

        torch.manual_seed(0)
        scheduler = DDIMScheduler(
            beta_schedule="scaled_linear",
            beta_start=0.00085,
            beta_end=0.012,
            prediction_type="v_prediction",
            set_alpha_to_one=False,
            steps_offset=1,
        )

        torch.manual_seed(0)
        vae = AutoencoderKL()

        components = {
            # image encoding components
            "feature_extractor": feature_extractor,
            "image_encoder": image_encoder,
            # image noising components
            "image_normalizer": image_normalizer,
            "image_noising_scheduler": image_noising_scheduler,
            # regular denoising components
            "tokenizer": tokenizer,
            "text_encoder": text_encoder,
            "unet": unet,
            "scheduler": scheduler,
            "vae": vae,
        }

        return components

    def get_dummy_inputs(self, device, seed=0, pil_image=True):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)

        input_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)

        if pil_image:
            input_image = input_image * 0.5 + 0.5
            input_image = input_image.clamp(0, 1)
            input_image = input_image.cpu().permute(0, 2, 3, 1).float().numpy()
            input_image = DiffusionPipeline.numpy_to_pil(input_image)[0]

        return {
            "prompt": "An anime raccoon running a marathon",
            "image": input_image,
            "generator": generator,
            "num_inference_steps": 2,
            "output_type": "np",
        }

    # Overriding PipelineTesterMixin::test_attention_slicing_forward_pass
    # because GPU non-determinism requires a looser check.
    def test_attention_slicing_forward_pass(self):
        test_max_difference = torch_device in ["cpu", "mps"]

        self._test_attention_slicing_forward_pass(test_max_difference=test_max_difference)

    # Overriding PipelineTesterMixin::test_inference_batch_single_identical
    # because non-determinism requires a looser check.
    def test_inference_batch_single_identical(self):
        test_max_difference = torch_device in ["cpu", "mps"]

        self._test_inference_batch_single_identical(test_max_difference=test_max_difference)

    @unittest.skipIf(
        torch_device != "cuda" or not is_xformers_available(),
        reason="XFormers attention is only available with CUDA and `xformers` installed",
    )
    def test_xformers_attention_forwardGenerator_pass(self):
        self._test_xformers_attention_forwardGenerator_pass(test_max_difference=False)


@slow
@require_torch_gpu
class StableUnCLIPImg2ImgPipelineIntegrationTests(unittest.TestCase):
    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def test_stable_unclip_l_img2img(self):
        input_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_unclip/turtle.png"
        )

        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_unclip/stable_unclip_2_1_l_img2img_anime_turtle_fp16.npy"
        )

        pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
            "fusing/stable-unclip-2-1-l-img2img", torch_dtype=torch.float16
        )
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        # Stable unCLIP will OOM when the integration tests are run on a V100,
        # so turn on memory savings.
        pipe.enable_attention_slicing()
        pipe.enable_sequential_cpu_offload()

        generator = torch.Generator(device="cpu").manual_seed(0)
        output = pipe("anime turle", image=input_image, generator=generator, output_type="np")

        image = output.images[0]

        assert image.shape == (768, 768, 3)

        assert_mean_pixel_difference(image, expected_image)

    def test_stable_unclip_h_img2img(self):
        input_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_unclip/turtle.png"
        )

        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_unclip/stable_unclip_2_1_h_img2img_anime_turtle_fp16.npy"
        )

        pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
            "fusing/stable-unclip-2-1-h-img2img", torch_dtype=torch.float16
        )
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        # Stable unCLIP will OOM when the integration tests are run on a V100,
        # so turn on memory savings.
        pipe.enable_attention_slicing()
        pipe.enable_sequential_cpu_offload()

        generator = torch.Generator(device="cpu").manual_seed(0)
        output = pipe("anime turle", image=input_image, generator=generator, output_type="np")

        image = output.images[0]

        assert image.shape == (768, 768, 3)

        assert_mean_pixel_difference(image, expected_image)

    def test_stable_unclip_img2img_pipeline_with_sequential_cpu_offloading(self):
        input_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/stable_unclip/turtle.png"
        )

        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
            "fusing/stable-unclip-2-1-h-img2img", torch_dtype=torch.float16
        )
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()
        pipe.enable_sequential_cpu_offload()

        _ = pipe(
            "anime turtle",
            image=input_image,
            num_inference_steps=2,
            output_type="np",
        )

        mem_bytes = torch.cuda.max_memory_allocated()
        # make sure that less than 7 GB is allocated
        assert mem_bytes < 7 * 10**9
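

# A minimal sketch (not part of the upstream test module) of how the dummy components above can be
# exercised by hand when debugging the fast tests. It assumes a CPU-only run; the `__main__` entry
# point, variable names, and the printed shape check below are illustrative additions, not part of
# the test suite.
if __name__ == "__main__":
    fast_tests = StableUnCLIPImg2ImgPipelineFastTests()

    # Build the pipeline from the same tiny CLIP / UNet / VAE components the fast tests use.
    pipe = StableUnCLIPImg2ImgPipeline(**fast_tests.get_dummy_components())
    pipe.set_progress_bar_config(disable=None)

    # Run the two denoising steps defined by get_dummy_inputs on the 32x32 dummy input image.
    inputs = fast_tests.get_dummy_inputs("cpu")
    image = pipe(**inputs).images[0]

    # With these dummy components the output should be a small RGB numpy array (on the order of
    # 32x32x3), which is enough to confirm the pipeline wiring end to end.
    print(image.shape)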