Path: blob/main/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py
# coding=utf-8
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import random
import unittest

import numpy as np
import torch
from PIL import Image
from transformers import CLIPImageProcessor, CLIPVisionConfig, CLIPVisionModelWithProjection

from diffusers import (
    AutoencoderKL,
    DPMSolverMultistepScheduler,
    PNDMScheduler,
    StableDiffusionImageVariationPipeline,
    UNet2DConditionModel,
)
from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device
from diffusers.utils.testing_utils import require_torch_gpu

from ...pipeline_params import IMAGE_VARIATION_BATCH_PARAMS, IMAGE_VARIATION_PARAMS
from ...test_pipelines_common import PipelineTesterMixin


torch.backends.cuda.matmul.allow_tf32 = False


class StableDiffusionImageVariationPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = StableDiffusionImageVariationPipeline
    params = IMAGE_VARIATION_PARAMS
    batch_params = IMAGE_VARIATION_BATCH_PARAMS

    def get_dummy_components(self):
        torch.manual_seed(0)
        unet = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
        )
        scheduler = PNDMScheduler(skip_prk_steps=True)
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
        )
        torch.manual_seed(0)
        image_encoder_config = CLIPVisionConfig(
            hidden_size=32,
            projection_dim=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            image_size=32,
            patch_size=4,
        )
        image_encoder = CLIPVisionModelWithProjection(image_encoder_config)
        feature_extractor = CLIPImageProcessor(crop_size=32, size=32)

        components = {
            "unet": unet,
            "scheduler": scheduler,
            "vae": vae,
            "image_encoder": image_encoder,
            "feature_extractor": feature_extractor,
            "safety_checker": None,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed))
        image = image.cpu().permute(0, 2, 3, 1)[0]
        image = Image.fromarray(np.uint8(image)).convert("RGB").resize((32, 32))
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        inputs = {
            "image": image,
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "output_type": "numpy",
        }
        return inputs
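
    # The fast tests below run the full pipeline end-to-end on CPU with the
    # tiny dummy components above and compare a 3x3 corner slice of the output
    # against hard-coded reference values: a cheap regression guard, since any
    # change to the model graph, scheduler, or RNG handling shifts the values
    # past the 1e-3 tolerance.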
    def test_stable_diffusion_img_variation_default_case(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionImageVariationPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array([0.5167, 0.5746, 0.4835, 0.4914, 0.5605, 0.4691, 0.5201, 0.4898, 0.4958])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3

    def test_stable_diffusion_img_variation_multiple_images(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionImageVariationPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        inputs["image"] = 2 * [inputs["image"]]
        output = sd_pipe(**inputs)

        image = output.images

        image_slice = image[-1, -3:, -3:, -1]

        assert image.shape == (2, 64, 64, 3)
        expected_slice = np.array([0.6568, 0.5470, 0.5684, 0.5444, 0.5945, 0.6221, 0.5508, 0.5531, 0.5263])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3


@slow
@require_torch_gpu
class StableDiffusionImageVariationPipelineSlowTests(unittest.TestCase):
    def tearDown(self):
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
        generator = torch.Generator(device=generator_device).manual_seed(seed)
        init_image = load_image(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_imgvar/input_image_vermeer.png"
        )
        latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64))
        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
        inputs = {
            "image": init_image,
            "latents": latents,
            "generator": generator,
            "num_inference_steps": 3,
            "guidance_scale": 7.5,
            "output_type": "numpy",
        }
        return inputs

    def test_stable_diffusion_img_variation_pipeline_default(self):
        sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained(
            "lambdalabs/sd-image-variations-diffusers", safety_checker=None
        )
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.84491, 0.90789, 0.75708, 0.78734, 0.83485, 0.70099, 0.66938, 0.68727, 0.61379])
        assert np.abs(image_slice - expected_slice).max() < 1e-4
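
    # The `callback` / `callback_steps` arguments expose the intermediate
    # latents during denoising. The test below registers a callback that runs
    # every step, snapshots the (1, 4, 64, 64) latents at steps 1 and 2, and
    # checks them against reference slices, pinning down the scheduler's
    # per-step behavior rather than just the final image.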
    def test_stable_diffusion_img_variation_intermediate_state(self):
        number_of_steps = 0

        def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None:
            callback_fn.has_been_called = True
            nonlocal number_of_steps
            number_of_steps += 1
            if step == 1:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 64)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array(
                    [-0.1621, 0.2837, -0.7979, -0.1221, -1.3057, 0.7681, -2.1191, 0.0464, 1.6309]
                )

                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2
            elif step == 2:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 64)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array([0.6299, 1.7500, 1.1992, -2.1582, -1.8994, 0.7334, -0.7090, 1.0137, 1.5273])

                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2

        callback_fn.has_been_called = False

        pipe = StableDiffusionImageVariationPipeline.from_pretrained(
            "fusing/sd-image-variations-diffusers",
            safety_checker=None,
            torch_dtype=torch.float16,
        )
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        pipe(**inputs, callback=callback_fn, callback_steps=1)
        assert callback_fn.has_been_called
        assert number_of_steps == inputs["num_inference_steps"]

    def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        model_id = "fusing/sd-image-variations-diffusers"
        pipe = StableDiffusionImageVariationPipeline.from_pretrained(
            model_id, safety_checker=None, torch_dtype=torch.float16
        )
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing(1)
        pipe.enable_sequential_cpu_offload()

        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        _ = pipe(**inputs)

        mem_bytes = torch.cuda.max_memory_allocated()
        # make sure that less than 2.6 GB is allocated
        assert mem_bytes < 2.6 * 10**9


@nightly
@require_torch_gpu
class StableDiffusionImageVariationPipelineNightlyTests(unittest.TestCase):
    def tearDown(self):
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
        generator = torch.Generator(device=generator_device).manual_seed(seed)
        init_image = load_image(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_imgvar/input_image_vermeer.png"
        )
        latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64))
        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
        inputs = {
            "image": init_image,
            "latents": latents,
            "generator": generator,
            "num_inference_steps": 50,
            "guidance_scale": 7.5,
            "output_type": "numpy",
        }
        return inputs

    def test_img_variation_pndm(self):
        sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained("fusing/sd-image-variations-diffusers")
        sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_imgvar/lambdalabs_variations_pndm.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3

    def test_img_variation_dpm(self):
        sd_pipe = StableDiffusionImageVariationPipeline.from_pretrained("fusing/sd-image-variations-diffusers")
        sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        inputs["num_inference_steps"] = 25
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_imgvar/lambdalabs_variations_dpm_multi.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3
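

# For reference, typical standalone usage of the pipeline exercised above looks
# roughly like the sketch below. It is not part of the test suite and only runs
# when this module is executed directly; it assumes a CUDA device, network
# access, and reuses the checkpoint and input image from the slow tests. The
# output filename is illustrative.
if __name__ == "__main__":
    pipe = StableDiffusionImageVariationPipeline.from_pretrained(
        "lambdalabs/sd-image-variations-diffusers", safety_checker=None
    )
    pipe = pipe.to("cuda")
    init_image = load_image(
        "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
        "/stable_diffusion_imgvar/input_image_vermeer.png"
    )
    # The pipeline takes an image (no text prompt) and returns PIL images.
    variation = pipe(init_image, num_inference_steps=25, guidance_scale=7.5).images[0]
    variation.save("variation.png")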