# coding=utf-8
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import tempfile
import unittest

import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

from diffusers import (
    AutoencoderKL,
    ControlNetModel,
    DDIMScheduler,
    StableDiffusionControlNetPipeline,
    UNet2DConditionModel,
)
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet import MultiControlNetModel
from diffusers.utils import load_image, load_numpy, randn_tensor, slow, torch_device
from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.testing_utils import require_torch_gpu

from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
from ...test_pipelines_common import PipelineTesterMixin


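# Tests for StableDiffusionControlNetPipeline: fast tests built from tiny dummy
# components (single- and multi-ControlNet variants), followed by slow GPU
# integration tests against the pretrained lllyasviel/sd-controlnet-* checkpoints.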
class StableDiffusionControlNetPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = StableDiffusionControlNetPipeline
    params = TEXT_TO_IMAGE_PARAMS
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS

    def get_dummy_components(self):
        torch.manual_seed(0)
        unet = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
        )
        torch.manual_seed(0)
        controlnet = ControlNetModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            in_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            cross_attention_dim=32,
            conditioning_embedding_out_channels=(16, 32),
        )
        torch.manual_seed(0)
        scheduler = DDIMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            clip_sample=False,
            set_alpha_to_one=False,
        )
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
        )
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
        )
        text_encoder = CLIPTextModel(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        components = {
            "unet": unet,
            "controlnet": controlnet,
            "scheduler": scheduler,
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "safety_checker": None,
            "feature_extractor": None,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        # MPS does not support device-bound torch.Generator objects here, so fall
        # back to the default generator when running on MPS.
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)

        controlnet_embedder_scale_factor = 2
        image = randn_tensor(
            (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor),
            generator=generator,
            device=torch.device(device),
        )

        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "output_type": "numpy",
            "image": image,
        }

        return inputs

    def test_attention_slicing_forward_pass(self):
        return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3)

    @unittest.skipIf(
        torch_device != "cuda" or not is_xformers_available(),
        reason="XFormers attention is only available with CUDA and `xformers` installed",
    )
    def test_xformers_attention_forwardGenerator_pass(self):
        self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3)

    def test_inference_batch_single_identical(self):
        self._test_inference_batch_single_identical(expected_max_diff=2e-3)


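# Unlike the single-ControlNet tests above, this variant wraps two ControlNetModel
# instances in a MultiControlNetModel and passes a list of conditioning images.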
class StableDiffusionMultiControlNetPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = StableDiffusionControlNetPipeline
    params = TEXT_TO_IMAGE_PARAMS
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS

    def get_dummy_components(self):
        torch.manual_seed(0)
        unet = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
        )
        torch.manual_seed(0)
        controlnet1 = ControlNetModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            in_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            cross_attention_dim=32,
            conditioning_embedding_out_channels=(16, 32),
        )
        torch.manual_seed(0)
        controlnet2 = ControlNetModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            in_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            cross_attention_dim=32,
            conditioning_embedding_out_channels=(16, 32),
        )
        torch.manual_seed(0)
        scheduler = DDIMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            clip_sample=False,
            set_alpha_to_one=False,
        )
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
        )
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
        )
        text_encoder = CLIPTextModel(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        controlnet = MultiControlNetModel([controlnet1, controlnet2])

        components = {
            "unet": unet,
            "controlnet": controlnet,
            "scheduler": scheduler,
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "safety_checker": None,
            "feature_extractor": None,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)

        controlnet_embedder_scale_factor = 2

        images = [
            randn_tensor(
                (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor),
                generator=generator,
                device=torch.device(device),
            ),
            randn_tensor(
                (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor),
                generator=generator,
                device=torch.device(device),
            ),
        ]

        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "output_type": "numpy",
            "image": images,
        }

        return inputs

    def test_attention_slicing_forward_pass(self):
        return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3)

    @unittest.skipIf(
        torch_device != "cuda" or not is_xformers_available(),
        reason="XFormers attention is only available with CUDA and `xformers` installed",
    )
    def test_xformers_attention_forwardGenerator_pass(self):
        self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3)

    def test_inference_batch_single_identical(self):
        self._test_inference_batch_single_identical(expected_max_diff=2e-3)

    def test_save_pretrained_raise_not_implemented_exception(self):
        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        with tempfile.TemporaryDirectory() as tmpdir:
            # save_pretrained is not implemented for Multi-ControlNet, so the call must raise
            with self.assertRaises(NotImplementedError):
                pipe.save_pretrained(tmpdir)

    # override PipelineTesterMixin
    @unittest.skip("save pretrained not implemented")
    def test_save_load_float16(self):
        ...

    # override PipelineTesterMixin
    @unittest.skip("save pretrained not implemented")
    def test_save_load_local(self):
        ...

    # override PipelineTesterMixin
    @unittest.skip("save pretrained not implemented")
    def test_save_load_optional_components(self):
        ...


@slow
@require_torch_gpu
class StableDiffusionControlNetPipelineSlowTests(unittest.TestCase):
    def tearDown(self):
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def test_canny(self):
        controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")

        pipe = StableDiffusionControlNetPipeline.from_pretrained(
            "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
        )
        pipe.enable_model_cpu_offload()
        pipe.set_progress_bar_config(disable=None)

        generator = torch.Generator(device="cpu").manual_seed(0)
        prompt = "bird"
        image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
        )

        output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3)

        image = output.images[0]

        assert image.shape == (768, 512, 3)

        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny_out.npy"
        )

        assert np.abs(expected_image - image).max() < 5e-3

    def test_depth(self):
        controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-depth")

        pipe = StableDiffusionControlNetPipeline.from_pretrained(
            "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
        )
        pipe.enable_model_cpu_offload()
        pipe.set_progress_bar_config(disable=None)

        generator = torch.Generator(device="cpu").manual_seed(0)
        prompt = "Stormtrooper's lecture"
        image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth.png"
        )

        output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3)

        image = output.images[0]

        assert image.shape == (512, 512, 3)

        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth_out.npy"
        )

        assert np.abs(expected_image - image).max() < 5e-3

    def test_hed(self):
        controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-hed")

        pipe = StableDiffusionControlNetPipeline.from_pretrained(
            "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
        )
        pipe.enable_model_cpu_offload()
        pipe.set_progress_bar_config(disable=None)

        generator = torch.Generator(device="cpu").manual_seed(0)
        prompt = "oil painting of handsome old man, masterpiece"
        image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/man_hed.png"
        )

        output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3)

        image = output.images[0]

        assert image.shape == (704, 512, 3)

        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/man_hed_out.npy"
        )

        assert np.abs(expected_image - image).max() < 5e-3

    def test_mlsd(self):
        controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-mlsd")

        pipe = StableDiffusionControlNetPipeline.from_pretrained(
            "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
        )
        pipe.enable_model_cpu_offload()
        pipe.set_progress_bar_config(disable=None)

        generator = torch.Generator(device="cpu").manual_seed(0)
        prompt = "room"
        image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/room_mlsd.png"
        )

        output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3)

        image = output.images[0]

        assert image.shape == (704, 512, 3)

        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/room_mlsd_out.npy"
        )

        assert np.abs(expected_image - image).max() < 5e-3

    def test_normal(self):
        controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-normal")

        pipe = StableDiffusionControlNetPipeline.from_pretrained(
            "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
        )
        pipe.enable_model_cpu_offload()
        pipe.set_progress_bar_config(disable=None)

        generator = torch.Generator(device="cpu").manual_seed(0)
        prompt = "cute toy"
        image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/cute_toy_normal.png"
        )

        output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3)

        image = output.images[0]

        assert image.shape == (512, 512, 3)

        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/cute_toy_normal_out.npy"
        )

        assert np.abs(expected_image - image).max() < 5e-3

    def test_openpose(self):
        controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose")

        pipe = StableDiffusionControlNetPipeline.from_pretrained(
            "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
        )
        pipe.enable_model_cpu_offload()
        pipe.set_progress_bar_config(disable=None)

        generator = torch.Generator(device="cpu").manual_seed(0)
        prompt = "Chef in the kitchen"
        image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/pose.png"
        )

        output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3)

        image = output.images[0]

        assert image.shape == (768, 512, 3)

        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/chef_pose_out.npy"
        )

        assert np.abs(expected_image - image).max() < 5e-3

    def test_scribble(self):
        controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-scribble")

        pipe = StableDiffusionControlNetPipeline.from_pretrained(
            "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
        )
        pipe.enable_model_cpu_offload()
        pipe.set_progress_bar_config(disable=None)

        generator = torch.Generator(device="cpu").manual_seed(5)
        prompt = "bag"
        image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bag_scribble.png"
        )

        output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3)

        image = output.images[0]

        assert image.shape == (640, 512, 3)

        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bag_scribble_out.npy"
        )

        assert np.abs(expected_image - image).max() < 5e-3

    def test_seg(self):
        controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-seg")

        pipe = StableDiffusionControlNetPipeline.from_pretrained(
            "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
        )
        pipe.enable_model_cpu_offload()
        pipe.set_progress_bar_config(disable=None)

        generator = torch.Generator(device="cpu").manual_seed(5)
        prompt = "house"
        image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/house_seg.png"
        )

        output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3)

        image = output.images[0]

        assert image.shape == (512, 512, 3)

        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/house_seg_out.npy"
        )

        assert np.abs(expected_image - image).max() < 5e-3

    def test_sequential_cpu_offloading(self):
        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-seg")

        pipe = StableDiffusionControlNetPipeline.from_pretrained(
            "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
        )
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()
        pipe.enable_sequential_cpu_offload()

        prompt = "house"
        image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/house_seg.png"
        )

        _ = pipe(
            prompt,
            image,
            num_inference_steps=2,
            output_type="np",
        )

        mem_bytes = torch.cuda.max_memory_allocated()
        # make sure that less than 4 GB is allocated
        assert mem_bytes < 4 * 10**9


@slow
@require_torch_gpu
class StableDiffusionMultiControlNetPipelineSlowTests(unittest.TestCase):
    def tearDown(self):
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def test_pose_and_canny(self):
        controlnet_canny = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")
        controlnet_pose = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose")

        pipe = StableDiffusionControlNetPipeline.from_pretrained(
            "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=[controlnet_pose, controlnet_canny]
        )
        pipe.enable_model_cpu_offload()
        pipe.set_progress_bar_config(disable=None)

        generator = torch.Generator(device="cpu").manual_seed(0)
        prompt = "bird and Chef"
        image_canny = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
        )
        image_pose = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/pose.png"
        )

        output = pipe(prompt, [image_pose, image_canny], generator=generator, output_type="np", num_inference_steps=3)

        image = output.images[0]

        assert image.shape == (768, 512, 3)

        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/pose_canny_out.npy"
        )

        assert np.abs(expected_image - image).max() < 5e-2
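
# A minimal usage sketch (assuming pytest and a development install of diffusers):
# the fast tests above use tiny randomly initialized models, while the @slow classes
# only run when slow tests are enabled (RUN_SLOW=1 in the diffusers test setup) and a
# CUDA GPU is available, e.g.:
#
#   pytest tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py -k "FastTests"
#   RUN_SLOW=1 pytest tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py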