CoCalc -- test_latent

GitHub Repository: shivamshrirao/diffusers
Path: blob/main/tests/pipelines/latent_diffusion/test_latent_diffusion.py
¹⁴⁵¹ views
1
# coding=utf-8
2
# Copyright 2023 HuggingFace Inc.
3
#
4
# Licensed under the Apache License, Version 2.0 (the "License");
5
# you may not use this file except in compliance with the License.
6
# You may obtain a copy of the License at
7
#
8
#     http://www.apache.org/licenses/LICENSE-2.0
9
#
10
# Unless required by applicable law or agreed to in writing, software
11
# distributed under the License is distributed on an "AS IS" BASIS,
12
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
# See the License for the specific language governing permissions and
14
# limitations under the License.
15

16
import gc
17
import unittest
18

19
import numpy as np
20
import torch
21
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
22

23
from diffusers import AutoencoderKL, DDIMScheduler, LDMTextToImagePipeline, UNet2DConditionModel
24
from diffusers.utils.testing_utils import load_numpy, nightly, require_torch_gpu, slow, torch_device
25

26
from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
27
from ...test_pipelines_common import PipelineTesterMixin
28

29

30
# Turn off TF32 for CUDA matmuls for the whole module.
# NOTE(review): presumably so the hard-coded reference values in the tests
# compare reproducibly at the 1e-3 tolerance — confirm.
torch.backends.cuda.matmul.allow_tf32 = False
31

32

33
class LDMTextToImagePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    """Fast checks for ``LDMTextToImagePipeline`` using tiny, locally built components.

    The UNet, VAE and text encoder are constructed from small configs inside
    ``get_dummy_components``; only the tokenizer is fetched with
    ``from_pretrained``.
    """

    # Class-level configuration; presumably read by PipelineTesterMixin's
    # shared tests (the mixin is not visible here — verify there).
    pipeline_class = LDMTextToImagePipeline
    params = TEXT_TO_IMAGE_PARAMS - {
        "negative_prompt",
        "negative_prompt_embeds",
        "cross_attention_kwargs",
        "prompt_embeds",
    }
    required_optional_params = PipelineTesterMixin.required_optional_params - {
        "num_images_per_prompt",
        "callback",
        "callback_steps",
    }
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
    test_cpu_offload = False

    def get_dummy_components(self):
        """Build the small component set passed to ``LDMTextToImagePipeline``.

        Returns:
            dict with the keys ``unet``, ``scheduler``, ``vqvae``, ``bert`` and
            ``tokenizer``.
        """
        # The global seed is reset before each model is constructed; the
        # order of these statements is kept exactly as-is because the
        # reference slice in test_inference_text2img depends on it.
        torch.manual_seed(0)
        unet = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
        )
        scheduler = DDIMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            clip_sample=False,
            set_alpha_to_one=False,
        )
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=(32, 64),
            in_channels=3,
            out_channels=3,
            down_block_types=("DownEncoderBlock2D", "DownEncoderBlock2D"),
            up_block_types=("UpDecoderBlock2D", "UpDecoderBlock2D"),
            latent_channels=4,
        )
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
        )
        text_encoder = CLIPTextModel(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        # The pipeline takes the VAE as "vqvae" and the text encoder as "bert".
        components = {
            "unet": unet,
            "scheduler": scheduler,
            "vqvae": vae,
            "bert": text_encoder,
            "tokenizer": tokenizer,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        """Return the keyword arguments for one short pipeline call.

        Args:
            device: device the generator is created on (ignored for "mps").
            seed: seed applied to the generator.

        Returns:
            dict of pipeline call arguments (prompt, generator, 2 inference
            steps, guidance scale 6.0, numpy output).
        """
        if str(device).startswith("mps"):
            # NOTE(review): presumably a device-bound torch.Generator is not
            # usable on mps, so the global generator is seeded instead — confirm.
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "output_type": "numpy",
        }
        return inputs

    def test_inference_text2img(self):
        """Run the dummy pipeline on CPU and compare a 3x3 corner slice to reference values."""
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components()
        pipe = LDMTextToImagePipeline(**components)
        pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        image = pipe(**inputs).images
        # Bottom-right 3x3 patch of the last channel of the first image.
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 16, 16, 3), f"unexpected image shape: {image.shape}"
        expected_slice = np.array([0.59450, 0.64078, 0.55509, 0.51229, 0.69640, 0.36960, 0.59296, 0.60801, 0.49332])

        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
        assert max_diff < 1e-3, f"image slice deviates from the reference by {max_diff}"
131

132

133
@slow
@require_torch_gpu
class LDMTextToImagePipelineSlowTests(unittest.TestCase):
    """Slow GPU test running the pretrained ``CompVis/ldm-text2im-large-256`` pipeline for 3 steps."""

    def tearDown(self):
        """Release Python garbage and cached CUDA memory after each test."""
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def get_inputs(self, device, dtype=torch.float32, seed=0):
        """Return the keyword arguments for the pipeline call, with fixed latents.

        Args:
            device: device the latents tensor is moved to.
            dtype: dtype the latents tensor is cast to.
            seed: seed for both the torch generator and the NumPy latents.

        Returns:
            dict of pipeline call arguments (prompt, latents, generator,
            3 inference steps, guidance scale 6.0, numpy output).
        """
        # torch.manual_seed seeds the default generator and returns it.
        generator = torch.manual_seed(seed)
        # Latents come from NumPy's RandomState, independently of torch's RNG
        # and of the target device.
        latents = np.random.RandomState(seed).standard_normal((1, 4, 32, 32))
        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "latents": latents,
            "generator": generator,
            "num_inference_steps": 3,
            "guidance_scale": 6.0,
            "output_type": "numpy",
        }
        return inputs

    def test_ldm_default_ddim(self):
        """Compare a 3x3 corner slice of the generated image against reference values."""
        pipe = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256").to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = pipe(**inputs).images
        # Bottom-right 3x3 patch of the last channel of the first image.
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 256, 256, 3), f"unexpected image shape: {image.shape}"
        expected_slice = np.array([0.51825, 0.52850, 0.52543, 0.54258, 0.52304, 0.52569, 0.54363, 0.55276, 0.56878])
        max_diff = np.abs(expected_slice - image_slice).max()
        assert max_diff < 1e-3, f"image slice deviates from the reference by {max_diff}"
167

168

169
@nightly
@require_torch_gpu
class LDMTextToImagePipelineNightlyTests(unittest.TestCase):
    """Nightly GPU test running the pretrained ``CompVis/ldm-text2im-large-256`` pipeline for 50 steps."""

    def tearDown(self):
        """Release Python garbage and cached CUDA memory after each test."""
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def get_inputs(self, device, dtype=torch.float32, seed=0):
        """Return the keyword arguments for the pipeline call, with fixed latents.

        Args:
            device: device the latents tensor is moved to.
            dtype: dtype the latents tensor is cast to.
            seed: seed for both the torch generator and the NumPy latents.

        Returns:
            dict of pipeline call arguments (prompt, latents, generator,
            50 inference steps, guidance scale 6.0, numpy output).
        """
        # torch.manual_seed seeds the default generator and returns it.
        generator = torch.manual_seed(seed)
        # Latents come from NumPy's RandomState, independently of torch's RNG
        # and of the target device.
        latents = np.random.RandomState(seed).standard_normal((1, 4, 32, 32))
        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "latents": latents,
            "generator": generator,
            "num_inference_steps": 50,
            "guidance_scale": 6.0,
            "output_type": "numpy",
        }
        return inputs

    def test_ldm_default_ddim(self):
        """Compare the full generated image against a stored reference array."""
        pipe = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256").to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/ldm_text2img/ldm_large_256_ddim.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3, f"generated image deviates from the reference by {max_diff}"
203

204
Product

Resources

Company