GitHub Repository: shivamshrirao/diffusers
Path: blob/main/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py

# coding=utf-8
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import random
import unittest

import numpy as np
import torch
from transformers import XLMRobertaTokenizer

from diffusers import (
    AltDiffusionImg2ImgPipeline,
    AutoencoderKL,
    PNDMScheduler,
    UNet2DConditionModel,
)
from diffusers.image_processor import VaeImageProcessor
from diffusers.pipelines.alt_diffusion.modeling_roberta_series import (
    RobertaSeriesConfig,
    RobertaSeriesModelWithTransformation,
)
from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device
from diffusers.utils.testing_utils import require_torch_gpu


# disable TF32 so matmul results stay bit-stable on Ampere+ GPUs and the
# hard-coded expected slices below remain reproducible
torch.backends.cuda.matmul.allow_tf32 = False
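

# The fast tests below assemble the img2img pipeline from tiny dummy components
# so they run on CPU in seconds; the @slow integration tests at the bottom of
# the file load the full BAAI/AltDiffusion checkpoint and need a GPU.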
class AltDiffusionImg2ImgPipelineFastTests(unittest.TestCase):
    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    @property
    def dummy_image(self):
        batch_size = 1
        num_channels = 3
        sizes = (32, 32)

        image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
        return image

    @property
    def dummy_cond_unet(self):
        torch.manual_seed(0)
        model = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
        )
        return model

    @property
    def dummy_vae(self):
        torch.manual_seed(0)
        model = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
        )
        return model

    @property
    def dummy_text_encoder(self):
        torch.manual_seed(0)
        config = RobertaSeriesConfig(
            hidden_size=32,
            project_dim=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=5006,
        )
        return RobertaSeriesModelWithTransformation(config)

    @property
    def dummy_extractor(self):
        # minimal stand-in for the feature extractor: exposes empty
        # pixel_values plus a chainable .to(), all the pipeline might touch
        def extract(*args, **kwargs):
            class Out:
                def __init__(self):
                    self.pixel_values = torch.ones([0])

                def to(self, device):
                    # Tensor.to is not in-place, so reassign the result
                    # (the original dropped it, a silent no-op)
                    self.pixel_values = self.pixel_values.to(device)
                    return self

            return Out()

        return extract
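
    # CPU smoke test: two PNDM inference steps with fixed seeds, checking the
    # output shape, a hard-coded 3x3 pixel slice, and that the return_dict and
    # tuple code paths agree.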
    def test_stable_diffusion_img2img_default_case(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        unet = self.dummy_cond_unet
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = XLMRobertaTokenizer.from_pretrained("hf-internal-testing/tiny-xlm-roberta")
        tokenizer.model_max_length = 77

        init_image = self.dummy_image.to(device)

        # make sure here that pndm scheduler skips prk
        alt_pipe = AltDiffusionImg2ImgPipeline(
            unet=unet,
            scheduler=scheduler,
            vae=vae,
            text_encoder=bert,
            tokenizer=tokenizer,
            safety_checker=None,
            feature_extractor=self.dummy_extractor,
        )
        alt_pipe.image_processor = VaeImageProcessor(vae_scale_factor=alt_pipe.vae_scale_factor, do_normalize=False)
        alt_pipe = alt_pipe.to(device)
        alt_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.Generator(device=device).manual_seed(0)
        output = alt_pipe(
            [prompt],
            generator=generator,
            guidance_scale=6.0,
            num_inference_steps=2,
            output_type="np",
            image=init_image,
        )

        image = output.images

        generator = torch.Generator(device=device).manual_seed(0)
        image_from_tuple = alt_pipe(
            [prompt],
            generator=generator,
            guidance_scale=6.0,
            num_inference_steps=2,
            output_type="np",
            image=init_image,
            return_dict=False,
        )[0]

        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

        assert image.shape == (1, 32, 32, 3)
        expected_slice = np.array([0.4115, 0.3870, 0.4089, 0.4807, 0.4668, 0.4144, 0.4151, 0.4721, 0.4569])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-3
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 5e-3
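
    # With return_dict=False the pipeline returns a plain tuple instead of an
    # output dataclass, so element [0] above corresponds to output.images.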

    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
    def test_stable_diffusion_img2img_fp16(self):
        """Test that AltDiffusion img2img works with fp16"""
        unet = self.dummy_cond_unet
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = XLMRobertaTokenizer.from_pretrained("hf-internal-testing/tiny-xlm-roberta")
        tokenizer.model_max_length = 77

        init_image = self.dummy_image.to(torch_device)

        # put models in fp16
        unet = unet.half()
        vae = vae.half()
        bert = bert.half()

        # make sure here that pndm scheduler skips prk
        alt_pipe = AltDiffusionImg2ImgPipeline(
            unet=unet,
            scheduler=scheduler,
            vae=vae,
            text_encoder=bert,
            tokenizer=tokenizer,
            safety_checker=None,
            feature_extractor=self.dummy_extractor,
        )
        alt_pipe.image_processor = VaeImageProcessor(vae_scale_factor=alt_pipe.vae_scale_factor, do_normalize=False)
        alt_pipe = alt_pipe.to(torch_device)
        alt_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.manual_seed(0)
        image = alt_pipe(
            [prompt],
            generator=generator,
            num_inference_steps=2,
            output_type="np",
            image=init_image,
        ).images

        assert image.shape == (1, 32, 32, 3)

    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
    def test_stable_diffusion_img2img_pipeline_multiple_of_8(self):
        init_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/img2img/sketch-mountains-input.jpg"
        )
        # resize to resolution that is divisible by 8 but not 16 or 32
        init_image = init_image.resize((760, 504))

        model_id = "BAAI/AltDiffusion"
        pipe = AltDiffusionImg2ImgPipeline.from_pretrained(
            model_id,
            safety_checker=None,
        )
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        prompt = "A fantasy landscape, trending on artstation"

        generator = torch.manual_seed(0)
        output = pipe(
            prompt=prompt,
            image=init_image,
            strength=0.75,
            guidance_scale=7.5,
            generator=generator,
            output_type="np",
        )
        image = output.images[0]

        image_slice = image[255:258, 383:386, -1]

        assert image.shape == (504, 760, 3)
        expected_slice = np.array([0.9358, 0.9397, 0.9599, 0.9901, 1.0000, 1.0000, 0.9882, 1.0000, 1.0000])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
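

# Note on running this module, assuming the standard diffusers test setup: a
# plain `pytest tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py`
# runs the fast tests above, while the integration tests below are skipped
# unless the RUN_SLOW=1 environment flag is set and a CUDA device is available.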
@slow
@require_torch_gpu
class AltDiffusionImg2ImgPipelineIntegrationTests(unittest.TestCase):
    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def test_stable_diffusion_img2img_pipeline_default(self):
        init_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/img2img/sketch-mountains-input.jpg"
        )
        init_image = init_image.resize((768, 512))
        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/img2img/fantasy_landscape_alt.npy"
        )

        model_id = "BAAI/AltDiffusion"
        pipe = AltDiffusionImg2ImgPipeline.from_pretrained(
            model_id,
            safety_checker=None,
        )
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        prompt = "A fantasy landscape, trending on artstation"

        generator = torch.manual_seed(0)
        output = pipe(
            prompt=prompt,
            image=init_image,
            strength=0.75,
            guidance_scale=7.5,
            generator=generator,
            output_type="np",
        )
        image = output.images[0]

        assert image.shape == (512, 768, 3)
        # img2img is flaky across GPUs even in fp32, so compare against the
        # reference image by max absolute pixel error rather than exact equality
        assert np.abs(expected_image - image).max() < 1e-3
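

# For reference, a minimal sketch of non-test usage mirroring the integration
# test above (same checkpoint and inputs); kept as a comment so importing this
# test module stays side-effect free:
#
#   import torch
#   from diffusers import AltDiffusionImg2ImgPipeline
#   from diffusers.utils import load_image
#
#   pipe = AltDiffusionImg2ImgPipeline.from_pretrained("BAAI/AltDiffusion").to("cuda")
#   init_image = load_image(
#       "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
#       "/img2img/sketch-mountains-input.jpg"
#   ).resize((768, 512))
#   result = pipe(
#       prompt="A fantasy landscape, trending on artstation",
#       image=init_image,
#       strength=0.75,
#       guidance_scale=7.5,
#       generator=torch.manual_seed(0),
#   )
#   result.images[0].save("fantasy_landscape.png")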