GitHub Repository: shivamshrirao/diffusers
Path: blob/main/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py
# coding=utf-8
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import random
import tempfile
import unittest

import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

from diffusers import AutoencoderKL, DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel
from diffusers.pipelines.semantic_stable_diffusion import SemanticStableDiffusionPipeline as StableDiffusionPipeline
from diffusers.utils import floats_tensor, nightly, torch_device
from diffusers.utils.testing_utils import require_torch_gpu


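# Disable TF32 matmuls so GPU runs stay in full float32 precision and the reference
# pixel slices below remain reproducible.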
torch.backends.cuda.matmul.allow_tf32 = False


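# Fast tests: each test assembles a tiny, randomly initialized pipeline (UNet, VAE,
# CLIP text encoder) so the semantic pipeline can be exercised quickly on CPU.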
class SafeDiffusionPipelineFastTests(unittest.TestCase):
    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    @property
    def dummy_image(self):
        batch_size = 1
        num_channels = 3
        sizes = (32, 32)

        image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
        return image

    @property
    def dummy_cond_unet(self):
        torch.manual_seed(0)
        model = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
        )
        return model

    @property
    def dummy_vae(self):
        torch.manual_seed(0)
        model = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
        )
        return model

    @property
    def dummy_text_encoder(self):
        torch.manual_seed(0)
        config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
        )
        return CLIPTextModel(config)

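    # Stub feature extractor: the pipelines below are built with safety_checker=None,
    # so the extractor only needs to expose an (empty) `pixel_values` tensor and `.to()`.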
    @property
    def dummy_extractor(self):
        def extract(*args, **kwargs):
            class Out:
                def __init__(self):
                    self.pixel_values = torch.ones([0])

                def to(self, device):
                    self.pixel_values.to(device)
                    return self

            return Out()

        return extract

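    # Each fast test renders a 64x64 image in two inference steps and compares the
    # bottom-right 3x3 patch of the last channel against reference values with a
    # 1e-2 tolerance, once via the output dataclass and once via the tuple return path.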
    def test_semantic_diffusion_ddim(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        unet = self.dummy_cond_unet
        scheduler = DDIMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            clip_sample=False,
            set_alpha_to_one=False,
        )

        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        # assemble the semantic pipeline around the DDIM scheduler configured above
        sd_pipe = StableDiffusionPipeline(
            unet=unet,
            scheduler=scheduler,
            vae=vae,
            text_encoder=bert,
            tokenizer=tokenizer,
            safety_checker=None,
            feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"

        generator = torch.Generator(device=device).manual_seed(0)
        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
        image = output.images

        generator = torch.Generator(device=device).manual_seed(0)
        image_from_tuple = sd_pipe(
            [prompt],
            generator=generator,
            guidance_scale=6.0,
            num_inference_steps=2,
            output_type="np",
            return_dict=False,
        )[0]

        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array([0.5644, 0.6018, 0.4799, 0.5267, 0.5585, 0.4641, 0.516, 0.4964, 0.4792])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2

    def test_semantic_diffusion_pndm(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        unet = self.dummy_cond_unet
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionPipeline(
            unet=unet,
            scheduler=scheduler,
            vae=vae,
            text_encoder=bert,
            tokenizer=tokenizer,
            safety_checker=None,
            feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.Generator(device=device).manual_seed(0)
        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")

        image = output.images

        generator = torch.Generator(device=device).manual_seed(0)
        image_from_tuple = sd_pipe(
            [prompt],
            generator=generator,
            guidance_scale=6.0,
            num_inference_steps=2,
            output_type="np",
            return_dict=False,
        )[0]

        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array([0.5095, 0.5674, 0.4668, 0.5126, 0.5697, 0.4675, 0.5278, 0.4964, 0.4945])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2

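    # Loading with safety_checker=None must work, survive a save/load round trip, and
    # still produce images.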
    def test_semantic_diffusion_no_safety_checker(self):
        pipe = StableDiffusionPipeline.from_pretrained(
            "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None
        )
        assert isinstance(pipe, StableDiffusionPipeline)
        assert isinstance(pipe.scheduler, LMSDiscreteScheduler)
        assert pipe.safety_checker is None

        image = pipe("example prompt", num_inference_steps=2).images[0]
        assert image is not None

        # check that there's no error when saving a pipeline with one of the models being None
        with tempfile.TemporaryDirectory() as tmpdirname:
            pipe.save_pretrained(tmpdirname)
            pipe = StableDiffusionPipeline.from_pretrained(tmpdirname)

        # sanity check that the pipeline still works
        assert pipe.safety_checker is None
        image = pipe("example prompt", num_inference_steps=2).images[0]
        assert image is not None

    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
    def test_semantic_diffusion_fp16(self):
        """Test that semantic diffusion works with fp16"""
        unet = self.dummy_cond_unet
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        # put models in fp16
        unet = unet.half()
        vae = vae.half()
        bert = bert.half()

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionPipeline(
            unet=unet,
            scheduler=scheduler,
            vae=vae,
            text_encoder=bert,
            tokenizer=tokenizer,
            safety_checker=None,
            feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        image = sd_pipe([prompt], num_inference_steps=2, output_type="np").images

        assert image.shape == (1, 64, 64, 3)


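# Nightly integration tests: run SEGA (semantic guidance) edits on the full
# runwayml/stable-diffusion-v1-5 checkpoint and compare a 3x3 pixel slice of the
# plain and edited generations against stored reference values.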
@nightly
@require_torch_gpu
class SemanticDiffusionPipelineIntegrationTests(unittest.TestCase):
    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def test_positive_guidance(self):
        torch_device = "cuda"
        pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        prompt = "a photo of a cat"
        edit = {
            "editing_prompt": ["sunglasses"],
            "reverse_editing_direction": [False],
            "edit_warmup_steps": 10,
            "edit_guidance_scale": 6,
            "edit_threshold": 0.95,
            "edit_momentum_scale": 0.5,
            "edit_mom_beta": 0.6,
        }

        seed = 3
        guidance_scale = 7

        # no sega enabled
        generator = torch.Generator(torch_device)
        generator.manual_seed(seed)
        output = pipe(
            [prompt],
            generator=generator,
            guidance_scale=guidance_scale,
            num_inference_steps=50,
            output_type="np",
            width=512,
            height=512,
        )

        image = output.images
        image_slice = image[0, -3:, -3:, -1]
        expected_slice = [
            0.34673113,
            0.38492733,
            0.37597352,
            0.34086335,
            0.35650748,
            0.35579205,
            0.3384763,
            0.34340236,
            0.3573271,
        ]

        assert image.shape == (1, 512, 512, 3)

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

        # with sega enabled
        # generator = torch.manual_seed(seed)
        generator.manual_seed(seed)
        output = pipe(
            [prompt],
            generator=generator,
            guidance_scale=guidance_scale,
            num_inference_steps=50,
            output_type="np",
            width=512,
            height=512,
            **edit,
        )

        image = output.images
        image_slice = image[0, -3:, -3:, -1]
        expected_slice = [
            0.41887826,
            0.37728766,
            0.30138272,
            0.41416335,
            0.41664985,
            0.36283392,
            0.36191246,
            0.43364465,
            0.43001732,
        ]

        assert image.shape == (1, 512, 512, 3)

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

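    # reverse_editing_direction=True steers the generation *away* from the editing
    # prompt, i.e. the "crowd, crowded, people" concept is suppressed here.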
    def test_negative_guidance(self):
        torch_device = "cuda"
        pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        prompt = "an image of a crowded boulevard, realistic, 4k"
        edit = {
            "editing_prompt": "crowd, crowded, people",
            "reverse_editing_direction": True,
            "edit_warmup_steps": 10,
            "edit_guidance_scale": 8.3,
            "edit_threshold": 0.9,
            "edit_momentum_scale": 0.5,
            "edit_mom_beta": 0.6,
        }

        seed = 9
        guidance_scale = 7

        # no sega enabled
        generator = torch.Generator(torch_device)
        generator.manual_seed(seed)
        output = pipe(
            [prompt],
            generator=generator,
            guidance_scale=guidance_scale,
            num_inference_steps=50,
            output_type="np",
            width=512,
            height=512,
        )

        image = output.images
        image_slice = image[0, -3:, -3:, -1]
        expected_slice = [
            0.43497998,
            0.91814065,
            0.7540739,
            0.55580205,
            0.8467265,
            0.5389691,
            0.62574506,
            0.58897763,
            0.50926757,
        ]

        assert image.shape == (1, 512, 512, 3)

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

        # with sega enabled
        # generator = torch.manual_seed(seed)
        generator.manual_seed(seed)
        output = pipe(
            [prompt],
            generator=generator,
            guidance_scale=guidance_scale,
            num_inference_steps=50,
            output_type="np",
            width=512,
            height=512,
            **edit,
        )

        image = output.images
        image_slice = image[0, -3:, -3:, -1]
        expected_slice = [
            0.3089719,
            0.30500144,
            0.29016042,
            0.30630964,
            0.325687,
            0.29419225,
            0.2908091,
            0.28723598,
            0.27696294,
        ]

        assert image.shape == (1, 512, 512, 3)

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

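    # Several editing prompts can be combined in one call; per-concept settings such as
    # edit_warmup_steps and edit_threshold are then passed as lists.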
    def test_multi_cond_guidance(self):
        torch_device = "cuda"
        pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        prompt = "a castle next to a river"
        edit = {
            "editing_prompt": ["boat on a river, boat", "monet, impression, sunrise"],
            "reverse_editing_direction": False,
            "edit_warmup_steps": [15, 18],
            "edit_guidance_scale": 6,
            "edit_threshold": [0.9, 0.8],
            "edit_momentum_scale": 0.5,
            "edit_mom_beta": 0.6,
        }

        seed = 48
        guidance_scale = 7

        # no sega enabled
        generator = torch.Generator(torch_device)
        generator.manual_seed(seed)
        output = pipe(
            [prompt],
            generator=generator,
            guidance_scale=guidance_scale,
            num_inference_steps=50,
            output_type="np",
            width=512,
            height=512,
        )

        image = output.images
        image_slice = image[0, -3:, -3:, -1]
        expected_slice = [
            0.75163555,
            0.76037145,
            0.61785,
            0.9189673,
            0.8627701,
            0.85189694,
            0.8512813,
            0.87012076,
            0.8312857,
        ]

        assert image.shape == (1, 512, 512, 3)

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

        # with sega enabled
        # generator = torch.manual_seed(seed)
        generator.manual_seed(seed)
        output = pipe(
            [prompt],
            generator=generator,
            guidance_scale=guidance_scale,
            num_inference_steps=50,
            output_type="np",
            width=512,
            height=512,
            **edit,
        )

        image = output.images
        image_slice = image[0, -3:, -3:, -1]
        expected_slice = [
            0.73553365,
            0.7537271,
            0.74341905,
            0.66480356,
            0.6472925,
            0.63039416,
            0.64812905,
            0.6749717,
            0.6517102,
        ]

        assert image.shape == (1, 512, 512, 3)

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_guidance_fp16(self):
        torch_device = "cuda"
        pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        prompt = "a photo of a cat"
        edit = {
            "editing_prompt": ["sunglasses"],
            "reverse_editing_direction": [False],
            "edit_warmup_steps": 10,
            "edit_guidance_scale": 6,
            "edit_threshold": 0.95,
            "edit_momentum_scale": 0.5,
            "edit_mom_beta": 0.6,
        }

        seed = 3
        guidance_scale = 7

        # no sega enabled
        generator = torch.Generator(torch_device)
        generator.manual_seed(seed)
        output = pipe(
            [prompt],
            generator=generator,
            guidance_scale=guidance_scale,
            num_inference_steps=50,
            output_type="np",
            width=512,
            height=512,
        )

        image = output.images
        image_slice = image[0, -3:, -3:, -1]
        expected_slice = [
            0.34887695,
            0.3876953,
            0.375,
            0.34423828,
            0.3581543,
            0.35717773,
            0.3383789,
            0.34570312,
            0.359375,
        ]

        assert image.shape == (1, 512, 512, 3)

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

        # with sega enabled
        # generator = torch.manual_seed(seed)
        generator.manual_seed(seed)
        output = pipe(
            [prompt],
            generator=generator,
            guidance_scale=guidance_scale,
            num_inference_steps=50,
            output_type="np",
            width=512,
            height=512,
            **edit,
        )

        image = output.images
        image_slice = image[0, -3:, -3:, -1]
        expected_slice = [
            0.42285156,
            0.36914062,
            0.29077148,
            0.42041016,
            0.41918945,
            0.35498047,
            0.3618164,
            0.4423828,
            0.43115234,
        ]

        assert image.shape == (1, 512, 512, 3)

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2