GitHub Repository: shivamshrirao/diffusers
Path: blob/main/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py
# coding=utf-8
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import unittest

import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

from diffusers import (
    AutoencoderKL,
    DDIMInverseScheduler,
    DDIMScheduler,
    DDPMScheduler,
    EulerAncestralDiscreteScheduler,
    LMSDiscreteScheduler,
    StableDiffusionPix2PixZeroPipeline,
    UNet2DConditionModel,
)
from diffusers.utils import load_numpy, slow, torch_device
from diffusers.utils.testing_utils import load_image, load_pt, require_torch_gpu, skip_mps

from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
from ...test_pipelines_common import PipelineTesterMixin

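# Disable TF32 matmuls so the numeric comparisons against the recorded slices below
# behave the same on Ampere+ GPUs (where TF32 is on by default) as on other hardware.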
torch.backends.cuda.matmul.allow_tf32 = False


@skip_mps
class StableDiffusionPix2PixZeroPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = StableDiffusionPix2PixZeroPipeline
    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS
    batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS

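    # Prompt embeddings for the source and target concepts are precomputed and hosted on
    # the Hub, so the fast tests can skip encoding a prompt set at test time.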
    @classmethod
    def setUpClass(cls):
        cls.source_embeds = load_pt(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/src_emb_0.pt"
        )

        cls.target_embeds = load_pt(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/tgt_emb_0.pt"
        )

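    # Assemble tiny randomly initialized components (UNet, VAE, CLIP text encoder) so the
    # pipeline runs on CPU in seconds; every module is seeded for reproducible weights.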
    def get_dummy_components(self):
        torch.manual_seed(0)
        unet = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
        )
        scheduler = DDIMScheduler()
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
        )
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
        )
        text_encoder = CLIPTextModel(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        components = {
            "unet": unet,
            "scheduler": scheduler,
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "safety_checker": None,
            "feature_extractor": None,
            "inverse_scheduler": None,
            "caption_generator": None,
            "caption_processor": None,
        }
        return components

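    # Common call kwargs for the fast tests. `cross_attention_guidance_amount` sets the
    # strength of the cross-attention guidance that preserves the input's structure.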
    def get_dummy_inputs(self, device, seed=0):
        generator = torch.manual_seed(seed)

        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "cross_attention_guidance_amount": 0.15,
            "source_embeds": self.source_embeds,
            "target_embeds": self.target_embeds,
            "output_type": "numpy",
        }
        return inputs

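    # Each `expected_slice` below is a regression reference: the 3x3 bottom-right crop of
    # the last channel from a known-good run, compared within a small tolerance.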
    def test_stable_diffusion_pix2pix_zero_default_case(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPix2PixZeroPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1]
        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array([0.5184, 0.503, 0.4917, 0.4022, 0.3455, 0.464, 0.5324, 0.5323, 0.4894])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3

    def test_stable_diffusion_pix2pix_zero_negative_prompt(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPix2PixZeroPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        negative_prompt = "french fries"
        output = sd_pipe(**inputs, negative_prompt=negative_prompt)
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array([0.5464, 0.5072, 0.5012, 0.4124, 0.3624, 0.466, 0.5413, 0.5468, 0.4927])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3

    def test_stable_diffusion_pix2pix_zero_euler(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        components["scheduler"] = EulerAncestralDiscreteScheduler(
            beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"
        )
        sd_pipe = StableDiffusionPix2PixZeroPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array([0.5114, 0.5051, 0.5222, 0.5279, 0.5037, 0.5156, 0.4604, 0.4966, 0.504])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3

    def test_stable_diffusion_pix2pix_zero_ddpm(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        components["scheduler"] = DDPMScheduler()
        sd_pipe = StableDiffusionPix2PixZeroPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array([0.5185, 0.5027, 0.492, 0.401, 0.3445, 0.464, 0.5321, 0.5327, 0.4892])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3

    # Non-determinism caused by the pipeline optimizing the latent inputs during inference
    @unittest.skip("non-deterministic pipeline")
    def test_inference_batch_single_identical(self):
        return super().test_inference_batch_single_identical()

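
# Slow tests: these download full Stable Diffusion checkpoints and run in fp16; they are
# gated behind @slow and @require_torch_gpu so they are skipped without a CUDA GPU.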
@slow
@require_torch_gpu
class StableDiffusionPix2PixZeroPipelineSlowTests(unittest.TestCase):
    def tearDown(self):
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    @classmethod
    def setUpClass(cls):
        cls.source_embeds = load_pt(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat.pt"
        )

        cls.target_embeds = load_pt(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/dog.pt"
        )

    def get_inputs(self, seed=0):
        generator = torch.manual_seed(seed)

        inputs = {
            "prompt": "turn him into a cyborg",
            "generator": generator,
            "num_inference_steps": 3,
            "guidance_scale": 7.5,
            "cross_attention_guidance_amount": 0.15,
            "source_embeds": self.source_embeds,
            "target_embeds": self.target_embeds,
            "output_type": "numpy",
        }
        return inputs

    def test_stable_diffusion_pix2pix_zero_default(self):
        pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained(
            "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16
        )
        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs()
        image = pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.5742, 0.5757, 0.5747, 0.5781, 0.5688, 0.5713, 0.5742, 0.5664, 0.5747])

        assert np.abs(expected_slice - image_slice).max() < 5e-2

    def test_stable_diffusion_pix2pix_zero_k_lms(self):
        pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained(
            "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16
        )
        pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs()
        image = pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.6367, 0.5459, 0.5146, 0.5479, 0.4905, 0.4753, 0.4961, 0.4629, 0.4624])

        assert np.abs(expected_slice - image_slice).max() < 5e-2

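    # Check that the per-step callback fires on every denoising step and that the
    # intermediate latents match recorded reference slices early in the trajectory.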
    def test_stable_diffusion_pix2pix_zero_intermediate_state(self):
        number_of_steps = 0

        def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None:
            callback_fn.has_been_called = True
            nonlocal number_of_steps
            number_of_steps += 1
            if step == 1:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 64)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array([0.1345, 0.268, 0.1539, 0.0726, 0.0959, 0.2261, -0.2673, 0.0277, -0.2062])

                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2
            elif step == 2:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 64)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array([0.1393, 0.2637, 0.1617, 0.0724, 0.0987, 0.2271, -0.2666, 0.0299, -0.2104])

                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2

        callback_fn.has_been_called = False

        pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained(
            "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16
        )
        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs()
        pipe(**inputs, callback=callback_fn, callback_steps=1)
        assert callback_fn.has_been_called
        assert number_of_steps == 3

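    # `enable_sequential_cpu_offload` moves submodules onto the GPU one at a time, so
    # peak GPU memory for a full edit should stay below the ceiling asserted at the end.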
    def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained(
            "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16
        )
        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing(1)
        pipe.enable_sequential_cpu_offload()

        inputs = self.get_inputs()
        _ = pipe(**inputs)

        mem_bytes = torch.cuda.max_memory_allocated()
        # make sure that less than 8.2 GB is allocated
        assert mem_bytes < 8.2 * 10**9

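
# Inversion tests: `pipe.invert` runs DDIM inversion on a real photo to recover initial
# latents that reconstruct the input and serve as the starting point for editing.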
@slow
@require_torch_gpu
class InversionPipelineSlowTests(unittest.TestCase):
    def tearDown(self):
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    @classmethod
    def setUpClass(cls):
        raw_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat_6.png"
        )

        raw_image = raw_image.convert("RGB").resize((512, 512))

        cls.raw_image = raw_image

    def test_stable_diffusion_pix2pix_inversion(self):
        pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained(
            "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16
        )
        pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config)

        caption = "a photography of a cat with flowers"
        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
        pipe.enable_model_cpu_offload()
        pipe.set_progress_bar_config(disable=None)

        generator = torch.manual_seed(0)
        output = pipe.invert(caption, image=self.raw_image, generator=generator, num_inference_steps=10)
        inv_latents = output[0]

        image_slice = inv_latents[0, -3:, -3:, -1].flatten()

        assert inv_latents.shape == (1, 4, 64, 64)
        expected_slice = np.array([0.8447, -0.0730, 0.7588, -1.2070, -0.4678, 0.1511, -0.8555, 1.1816, -0.7666])

        assert np.abs(expected_slice - image_slice.cpu().numpy()).max() < 5e-2

    def test_stable_diffusion_2_pix2pix_inversion(self):
        pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained(
            "stabilityai/stable-diffusion-2-1", safety_checker=None, torch_dtype=torch.float16
        )
        pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config)

        caption = "a photography of a cat with flowers"
        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
        pipe.enable_model_cpu_offload()
        pipe.set_progress_bar_config(disable=None)

        generator = torch.manual_seed(0)
        output = pipe.invert(caption, image=self.raw_image, generator=generator, num_inference_steps=10)
        inv_latents = output[0]

        image_slice = inv_latents[0, -3:, -3:, -1].flatten()

        assert inv_latents.shape == (1, 4, 64, 64)
        expected_slice = np.array([0.8970, -0.1611, 0.4766, -1.1162, -0.5923, 0.1050, -0.9678, 1.0537, -0.6050])

        assert np.abs(expected_slice - image_slice.cpu().numpy()).max() < 5e-2

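    # End-to-end check: invert the cat photo to latents, let the pipeline derive the
    # cat -> dog edit direction from the source/target prompt embeddings, then denoise
    # from the inverted latents and compare against a reference image.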
    def test_stable_diffusion_pix2pix_full(self):
        # numpy array of https://huggingface.co/datasets/hf-internal-testing/diffusers-images/blob/main/pix2pix/dog.png
        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/dog.npy"
        )

        pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained(
            "CompVis/stable-diffusion-v1-4", safety_checker=None, torch_dtype=torch.float16
        )
        pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config)

        caption = "a photography of a cat with flowers"
        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
        pipe.enable_model_cpu_offload()
        pipe.set_progress_bar_config(disable=None)

        generator = torch.manual_seed(0)
        output = pipe.invert(caption, image=self.raw_image, generator=generator)
        inv_latents = output[0]

        source_prompts = 4 * ["a cat sitting on the street", "a cat playing in the field", "a face of a cat"]
        target_prompts = 4 * ["a dog sitting on the street", "a dog playing in the field", "a face of a dog"]

        source_embeds = pipe.get_embeds(source_prompts)
        target_embeds = pipe.get_embeds(target_prompts)

        image = pipe(
            caption,
            source_embeds=source_embeds,
            target_embeds=target_embeds,
            num_inference_steps=50,
            cross_attention_guidance_amount=0.15,
            generator=generator,
            latents=inv_latents,
            negative_prompt=caption,
            output_type="np",
        ).images

        mean_diff = np.abs(expected_image - image).mean()
        assert mean_diff < 0.05

    def test_stable_diffusion_2_pix2pix_full(self):
        # numpy array of https://huggingface.co/datasets/hf-internal-testing/diffusers-images/blob/main/pix2pix/dog_2.png
        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/dog_2.npy"
        )

        pipe = StableDiffusionPix2PixZeroPipeline.from_pretrained(
            "stabilityai/stable-diffusion-2-1", safety_checker=None, torch_dtype=torch.float16
        )
        pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config)

        caption = "a photography of a cat with flowers"
        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
        pipe.enable_model_cpu_offload()
        pipe.set_progress_bar_config(disable=None)

        generator = torch.manual_seed(0)
        output = pipe.invert(caption, image=self.raw_image, generator=generator)
        inv_latents = output[0]

        source_prompts = 4 * ["a cat sitting on the street", "a cat playing in the field", "a face of a cat"]
        target_prompts = 4 * ["a dog sitting on the street", "a dog playing in the field", "a face of a dog"]

        source_embeds = pipe.get_embeds(source_prompts)
        target_embeds = pipe.get_embeds(target_prompts)

        image = pipe(
            caption,
            source_embeds=source_embeds,
            target_embeds=target_embeds,
            num_inference_steps=125,
            cross_attention_guidance_amount=0.015,
            generator=generator,
            latents=inv_latents,
            negative_prompt=caption,
            output_type="np",
        ).images

        mean_diff = np.abs(expected_image - image).mean()
        assert mean_diff < 0.25