Path: blob/main/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py
# coding=utf-8
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import random
import unittest

import numpy as np
import torch
from transformers import XLMRobertaTokenizer

from diffusers import (
    AltDiffusionImg2ImgPipeline,
    AutoencoderKL,
    PNDMScheduler,
    UNet2DConditionModel,
)
from diffusers.image_processor import VaeImageProcessor
from diffusers.pipelines.alt_diffusion.modeling_roberta_series import (
    RobertaSeriesConfig,
    RobertaSeriesModelWithTransformation,
)
from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device
from diffusers.utils.testing_utils import require_torch_gpu


# disable TF32 matmuls so CUDA results stay numerically reproducible across runs
torch.backends.cuda.matmul.allow_tf32 = False


class AltDiffusionImg2ImgPipelineFastTests(unittest.TestCase):
    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    @property
    def dummy_image(self):
        batch_size = 1
        num_channels = 3
        sizes = (32, 32)

        image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
        return image

    @property
    def dummy_cond_unet(self):
        torch.manual_seed(0)
        model = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
        )
        return model

    @property
    def dummy_vae(self):
        torch.manual_seed(0)
        model = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
        )
        return model

    @property
    def dummy_text_encoder(self):
        torch.manual_seed(0)
        config = RobertaSeriesConfig(
            hidden_size=32,
            project_dim=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=5006,
        )
        return RobertaSeriesModelWithTransformation(config)

    @property
    def dummy_extractor(self):
        # stub feature extractor; the safety checker is disabled in these tests,
        # so an empty pixel_values tensor is sufficient
        def extract(*args, **kwargs):
            class Out:
                def __init__(self):
                    self.pixel_values = torch.ones([0])

                def to(self, device):
                    # tensor.to() is not in-place, so reassign the result
                    self.pixel_values = self.pixel_values.to(device)
                    return self

            return Out()

        return extract

    def test_stable_diffusion_img2img_default_case(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        unet = self.dummy_cond_unet
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = XLMRobertaTokenizer.from_pretrained("hf-internal-testing/tiny-xlm-roberta")
        tokenizer.model_max_length = 77

        init_image = self.dummy_image.to(device)

        # make sure here that pndm scheduler skips prk
        alt_pipe = AltDiffusionImg2ImgPipeline(
            unet=unet,
            scheduler=scheduler,
            vae=vae,
            text_encoder=bert,
            tokenizer=tokenizer,
            safety_checker=None,
            feature_extractor=self.dummy_extractor,
        )
        alt_pipe.image_processor = VaeImageProcessor(vae_scale_factor=alt_pipe.vae_scale_factor, do_normalize=False)
        alt_pipe = alt_pipe.to(device)
        alt_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.Generator(device=device).manual_seed(0)
        output = alt_pipe(
            [prompt],
            generator=generator,
            guidance_scale=6.0,
            num_inference_steps=2,
            output_type="np",
            image=init_image,
        )

        image = output.images

        generator = torch.Generator(device=device).manual_seed(0)
        image_from_tuple = alt_pipe(
            [prompt],
            generator=generator,
            guidance_scale=6.0,
            num_inference_steps=2,
            output_type="np",
            image=init_image,
            return_dict=False,
        )[0]

        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

        assert image.shape == (1, 32, 32, 3)
        expected_slice = np.array([0.4115, 0.3870, 0.4089, 0.4807, 0.4668, 0.4144, 0.4151, 0.4721, 0.4569])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-3
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 5e-3

    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
    def test_stable_diffusion_img2img_fp16(self):
        """Test that stable diffusion img2img works with fp16"""
        unet = self.dummy_cond_unet
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = XLMRobertaTokenizer.from_pretrained("hf-internal-testing/tiny-xlm-roberta")
        tokenizer.model_max_length = 77

        init_image = self.dummy_image.to(torch_device)

        # put models in fp16
        unet = unet.half()
        vae = vae.half()
        bert = bert.half()

        # make sure here that pndm scheduler skips prk
        alt_pipe = AltDiffusionImg2ImgPipeline(
            unet=unet,
            scheduler=scheduler,
            vae=vae,
            text_encoder=bert,
            tokenizer=tokenizer,
            safety_checker=None,
            feature_extractor=self.dummy_extractor,
        )
        alt_pipe.image_processor = VaeImageProcessor(vae_scale_factor=alt_pipe.vae_scale_factor, do_normalize=False)
        alt_pipe = alt_pipe.to(torch_device)
        alt_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.manual_seed(0)
        image = alt_pipe(
            [prompt],
            generator=generator,
            num_inference_steps=2,
            output_type="np",
            image=init_image,
        ).images

        assert image.shape == (1, 32, 32, 3)

    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
    def test_stable_diffusion_img2img_pipeline_multiple_of_8(self):
        init_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/img2img/sketch-mountains-input.jpg"
        )
        # resize to resolution that is divisible by 8 but not 16 or 32
        init_image = init_image.resize((760, 504))

        model_id = "BAAI/AltDiffusion"
        pipe = AltDiffusionImg2ImgPipeline.from_pretrained(
            model_id,
            safety_checker=None,
        )
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        prompt = "A fantasy landscape, trending on artstation"

        generator = torch.manual_seed(0)
        output = pipe(
            prompt=prompt,
            image=init_image,
            strength=0.75,
            guidance_scale=7.5,
            generator=generator,
            output_type="np",
        )
        image = output.images[0]

        image_slice = image[255:258, 383:386, -1]

        assert image.shape == (504, 760, 3)
        expected_slice = np.array([0.9358, 0.9397, 0.9599, 0.9901, 1.0000, 1.0000, 0.9882, 1.0000, 1.0000])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3


@slow
@require_torch_gpu
class AltDiffusionImg2ImgPipelineIntegrationTests(unittest.TestCase):
    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def test_stable_diffusion_img2img_pipeline_default(self):
        init_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/img2img/sketch-mountains-input.jpg"
        )
        init_image = init_image.resize((768, 512))
        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/img2img/fantasy_landscape_alt.npy"
        )

        model_id = "BAAI/AltDiffusion"
        pipe = AltDiffusionImg2ImgPipeline.from_pretrained(
            model_id,
            safety_checker=None,
        )
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        prompt = "A fantasy landscape, trending on artstation"

        generator = torch.manual_seed(0)
        output = pipe(
            prompt=prompt,
            image=init_image,
            strength=0.75,
            guidance_scale=7.5,
            generator=generator,
            output_type="np",
        )
        image = output.images[0]

        assert image.shape == (512, 768, 3)
        # img2img is flaky across GPUs even in fp32, so compare the max absolute difference here
        assert np.abs(expected_image - image).max() < 1e-3
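
if __name__ == "__main__":
    # Optional convenience entry point (an assumption, not part of the original
    # module): lets the file be run directly, e.g.
    # `python test_alt_diffusion_img2img.py`. In the diffusers CI these tests
    # are normally collected by pytest, and the @slow integration tests above
    # are skipped unless the RUN_SLOW environment variable is set to a truthy
    # value.
    unittest.main()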