GitHub Repository: shivamshrirao/diffusers
Path: blob/main/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py
# coding=utf-8
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import random
import unittest

import numpy as np
import torch
from PIL import Image
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

from diffusers import (
    AutoencoderKL,
    DDIMScheduler,
    EulerAncestralDiscreteScheduler,
    LMSDiscreteScheduler,
    PNDMScheduler,
    StableDiffusionInstructPix2PixPipeline,
    UNet2DConditionModel,
)
from diffusers.utils import floats_tensor, load_image, slow, torch_device
from diffusers.utils.testing_utils import require_torch_gpu

from ...pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
from ...test_pipelines_common import PipelineTesterMixin


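# Disable TF32 matmuls so float32 results on Ampere+ GPUs stay bit-stable;
# the hard-coded expected slices in these tests assume full-precision kernels.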
torch.backends.cuda.matmul.allow_tf32 = False


class StableDiffusionInstructPix2PixPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = StableDiffusionInstructPix2PixPipeline
    # height/width are excluded because the pipeline derives the output resolution
    # from the input image (see test_stable_diffusion_pix2pix_pipeline_multiple_of_8)
    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width", "cross_attention_kwargs"}
    batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
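
    # All components are deliberately tiny (32/64-channel blocks, a 5-layer text
    # encoder, the "hf-internal-testing/tiny-random-clip" tokenizer) so the fast
    # tests run in seconds on CPU.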
    def get_dummy_components(self):
        torch.manual_seed(0)
        unet = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            # 8 input channels: 4 noise latents concatenated with 4 image-conditioning latents
            in_channels=8,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
        )
        scheduler = PNDMScheduler(skip_prk_steps=True)
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
        )
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
        )
        text_encoder = CLIPTextModel(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        components = {
            "unet": unet,
            "scheduler": scheduler,
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "safety_checker": None,
            "feature_extractor": None,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
        image = image.cpu().permute(0, 2, 3, 1)[0]
        image = Image.fromarray(np.uint8(image)).convert("RGB")
        # MPS does not support device-scoped generators here, so fall back to
        # seeding the global RNG
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "image": image,
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "image_guidance_scale": 1,
            "output_type": "numpy",
        }
        return inputs
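
    # Fast-test convention: run the tiny pipeline on CPU with a seeded generator
    # and 2 inference steps, then compare a 3x3 corner slice of the output
    # against hard-coded reference values with a 1e-3 tolerance.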
    def test_stable_diffusion_pix2pix_default_case(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionInstructPix2PixPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1]
        assert image.shape == (1, 32, 32, 3)
        expected_slice = np.array([0.7318, 0.3723, 0.4662, 0.6230, 0.5770, 0.5014, 0.4281, 0.5550, 0.4813])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3

    def test_stable_diffusion_pix2pix_negative_prompt(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionInstructPix2PixPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        negative_prompt = "french fries"
        output = sd_pipe(**inputs, negative_prompt=negative_prompt)
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 32, 32, 3)
        expected_slice = np.array([0.7323, 0.3688, 0.4611, 0.6255, 0.5746, 0.5017, 0.4330, 0.5553, 0.4827])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3

    def test_stable_diffusion_pix2pix_multiple_init_images(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionInstructPix2PixPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        inputs["prompt"] = [inputs["prompt"]] * 2

        # batch the single PIL image into a (2, 3, H, W) float tensor in [0, 1]
        image = np.array(inputs["image"]).astype(np.float32) / 255.0
        image = torch.from_numpy(image).unsqueeze(0).to(device)
        image = image.permute(0, 3, 1, 2)
        inputs["image"] = image.repeat(2, 1, 1, 1)

        image = sd_pipe(**inputs).images
        image_slice = image[-1, -3:, -3:, -1]

        assert image.shape == (2, 32, 32, 3)
        expected_slice = np.array([0.6060, 0.5712, 0.5099, 0.5980, 0.5805, 0.7205, 0.6793, 0.5540, 0.5607])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3

    def test_stable_diffusion_pix2pix_euler(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        components["scheduler"] = EulerAncestralDiscreteScheduler(
            beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"
        )
        sd_pipe = StableDiffusionInstructPix2PixPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 32, 32, 3)
        expected_slice = np.array([0.7260, 0.3902, 0.4868, 0.5850, 0.5672, 0.5110, 0.3906, 0.5510, 0.4846])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3


@slow
@require_torch_gpu
class StableDiffusionInstructPix2PixPipelineSlowTests(unittest.TestCase):
    def tearDown(self):
        # release GPU memory between tests
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def get_inputs(self, seed=0):
        generator = torch.manual_seed(seed)
        image = load_image(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_pix2pix/example.jpg"
        )
        inputs = {
            "prompt": "turn him into a cyborg",
            "image": image,
            "generator": generator,
            "num_inference_steps": 3,
            "guidance_scale": 7.5,
            "image_guidance_scale": 1.0,
            "output_type": "numpy",
        }
        return inputs
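
    # The slow tests below run the real `timbrooks/instruct-pix2pix` checkpoint on
    # GPU, producing 512x512 outputs after only 3 denoising steps and comparing
    # 3x3 slices against reference values.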
    def test_stable_diffusion_pix2pix_default(self):
        pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
            "timbrooks/instruct-pix2pix", safety_checker=None
        )
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs()
        image = pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.5902, 0.6015, 0.6027, 0.5983, 0.6092, 0.6061, 0.5765, 0.5785, 0.5555])

        assert np.abs(expected_slice - image_slice).max() < 1e-3

    def test_stable_diffusion_pix2pix_k_lms(self):
        pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
            "timbrooks/instruct-pix2pix", safety_checker=None
        )
        pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs()
        image = pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.6578, 0.6817, 0.6972, 0.6761, 0.6856, 0.6916, 0.6428, 0.6516, 0.6301])

        assert np.abs(expected_slice - image_slice).max() < 1e-3

    def test_stable_diffusion_pix2pix_ddim(self):
        pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
            "timbrooks/instruct-pix2pix", safety_checker=None
        )
        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs()
        image = pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.3828, 0.3834, 0.3818, 0.3792, 0.3865, 0.3752, 0.3792, 0.3847, 0.3753])

        assert np.abs(expected_slice - image_slice).max() < 1e-3

    def test_stable_diffusion_pix2pix_intermediate_state(self):
        number_of_steps = 0

        # the callback inspects intermediate latents at steps 1 and 2; tolerances
        # are loose because the pipeline runs in fp16
        def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None:
            callback_fn.has_been_called = True
            nonlocal number_of_steps
            number_of_steps += 1
            if step == 1:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 64)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array([-0.2463, -0.4644, -0.9756, 1.5176, 1.4414, 0.7866, 0.9897, 0.8521, 0.7983])

                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2
            elif step == 2:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 64)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array([-0.2644, -0.4626, -0.9653, 1.5176, 1.4551, 0.7686, 0.9805, 0.8452, 0.8115])

                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2

        callback_fn.has_been_called = False

        pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
            "timbrooks/instruct-pix2pix", safety_checker=None, torch_dtype=torch.float16
        )
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs()
        pipe(**inputs, callback=callback_fn, callback_steps=1)
        assert callback_fn.has_been_called
        assert number_of_steps == 3
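
    # enable_sequential_cpu_offload moves each submodule to the GPU only while it
    # executes, trading speed for a smaller peak-memory footprint; the next test
    # bounds peak allocation below 2.2 GB.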
    def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
            "timbrooks/instruct-pix2pix", safety_checker=None, torch_dtype=torch.float16
        )
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing(1)
        pipe.enable_sequential_cpu_offload()

        inputs = self.get_inputs()
        _ = pipe(**inputs)

        mem_bytes = torch.cuda.max_memory_allocated()
        # make sure that less than 2.2 GB is allocated
        assert mem_bytes < 2.2 * 10**9

    def test_stable_diffusion_pix2pix_pipeline_multiple_of_8(self):
        inputs = self.get_inputs()
        # resize to a resolution that is divisible by 8 but not by 16 or 32 (504 = 8 * 63)
        inputs["image"] = inputs["image"].resize((504, 504))

        model_id = "timbrooks/instruct-pix2pix"
        pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
            model_id,
            safety_checker=None,
        )
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        output = pipe(**inputs)
        image = output.images[0]

        image_slice = image[255:258, 383:386, -1]

        assert image.shape == (504, 504, 3)
        expected_slice = np.array([0.2726, 0.2529, 0.2664, 0.2655, 0.2641, 0.2642, 0.2591, 0.2649, 0.2590])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-3