GitHub Repository: shivamshrirao/diffusers
Path: blob/main/tests/pipelines/latent_diffusion/test_latent_diffusion.py
# coding=utf-8
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import unittest

import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

from diffusers import AutoencoderKL, DDIMScheduler, LDMTextToImagePipeline, UNet2DConditionModel
from diffusers.utils.testing_utils import load_numpy, nightly, require_torch_gpu, slow, torch_device

from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
from ...test_pipelines_common import PipelineTesterMixin

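# Disable TF32 matmuls so GPU results stay in full float32 precision and the
# hard-coded reference slices below remain reproducible on Ampere+ hardware.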
torch.backends.cuda.matmul.allow_tf32 = False

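# Fast tests: exercise the full text-to-image pipeline with tiny, randomly
# initialized components so the checks run on CPU in a few seconds.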
class LDMTextToImagePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = LDMTextToImagePipeline
    params = TEXT_TO_IMAGE_PARAMS - {
        "negative_prompt",
        "negative_prompt_embeds",
        "cross_attention_kwargs",
        "prompt_embeds",
    }
    required_optional_params = PipelineTesterMixin.required_optional_params - {
        "num_images_per_prompt",
        "callback",
        "callback_steps",
    }
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
    test_cpu_offload = False

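    # Build a miniature UNet, DDIM scheduler, VAE, and CLIP text encoder/tokenizer,
    # re-seeding torch before each randomly initialized module for reproducibility.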
    def get_dummy_components(self):
        torch.manual_seed(0)
        unet = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
        )
        scheduler = DDIMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            clip_sample=False,
            set_alpha_to_one=False,
        )
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=(32, 64),
            in_channels=3,
            out_channels=3,
            down_block_types=("DownEncoderBlock2D", "DownEncoderBlock2D"),
            up_block_types=("UpDecoderBlock2D", "UpDecoderBlock2D"),
            latent_channels=4,
        )
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
        )
        text_encoder = CLIPTextModel(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

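        # LDMTextToImagePipeline registers its VAE as "vqvae" and its text encoder as
        # "bert", so the dummy modules are mapped to those argument names.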
        components = {
            "unet": unet,
            "scheduler": scheduler,
            "vqvae": vae,
            "bert": text_encoder,
            "tokenizer": tokenizer,
        }
        return components

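    # Standard prompt/generator inputs; device-bound generators are not supported on
    # MPS, so the global torch seed is used there instead.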
    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "output_type": "numpy",
        }
        return inputs

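    # Smoke test: run two denoising steps on CPU, then compare a 3x3 corner of the
    # output image against a stored reference slice.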
    def test_inference_text2img(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components()
        pipe = LDMTextToImagePipeline(**components)
        pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        image = pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 16, 16, 3)
        expected_slice = np.array([0.59450, 0.64078, 0.55509, 0.51229, 0.69640, 0.36960, 0.59296, 0.60801, 0.49332])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3

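# Slow tests: load the real CompVis/ldm-text2im-large-256 checkpoint on GPU, run a
# short DDIM sampling pass, and compare a corner slice against reference values.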
@slow
@require_torch_gpu
class LDMTextToImagePipelineSlowTests(unittest.TestCase):
    def tearDown(self):
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

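    # Fixed latents (seeded NumPy noise) are passed in so the sampled image is
    # deterministic across runs and machines.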
    def get_inputs(self, device, dtype=torch.float32, seed=0):
        generator = torch.manual_seed(seed)
        latents = np.random.RandomState(seed).standard_normal((1, 4, 32, 32))
        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "latents": latents,
            "generator": generator,
            "num_inference_steps": 3,
            "guidance_scale": 6.0,
            "output_type": "numpy",
        }
        return inputs

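    # 3-step DDIM run on the pretrained 256x256 checkpoint; checks the output shape
    # and a 3x3 corner slice against hard-coded reference values.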
    def test_ldm_default_ddim(self):
        pipe = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256").to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 256, 256, 3)
        expected_slice = np.array([0.51825, 0.52850, 0.52543, 0.54258, 0.52304, 0.52569, 0.54363, 0.55276, 0.56878])
        max_diff = np.abs(expected_slice - image_slice).max()
        assert max_diff < 1e-3

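# Nightly tests: same pretrained checkpoint, but a full 50-step DDIM run whose output
# is compared pixel-wise against a stored reference image.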
@nightly
@require_torch_gpu
class LDMTextToImagePipelineNightlyTests(unittest.TestCase):
    def tearDown(self):
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def get_inputs(self, device, dtype=torch.float32, seed=0):
        generator = torch.manual_seed(seed)
        latents = np.random.RandomState(seed).standard_normal((1, 4, 32, 32))
        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "latents": latents,
            "generator": generator,
            "num_inference_steps": 50,
            "guidance_scale": 6.0,
            "output_type": "numpy",
        }
        return inputs

    def test_ldm_default_ddim(self):
        pipe = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256").to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/ldm_text2img/ldm_large_256_ddim.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3