Path: blob/main/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py
# coding=utf-8
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import random
import tempfile
import unittest

import numpy as np
import torch
from PIL import Image
from transformers import (
    CLIPTextConfig,
    CLIPTextModel,
    CLIPTokenizer,
    DPTConfig,
    DPTFeatureExtractor,
    DPTForDepthEstimation,
)

from diffusers import (
    AutoencoderKL,
    DDIMScheduler,
    DPMSolverMultistepScheduler,
    LMSDiscreteScheduler,
    PNDMScheduler,
    StableDiffusionDepth2ImgPipeline,
    UNet2DConditionModel,
)
from diffusers.utils import (
    floats_tensor,
    is_accelerate_available,
    is_accelerate_version,
    load_image,
    load_numpy,
    nightly,
    slow,
    torch_device,
)
from diffusers.utils.testing_utils import require_torch_gpu, skip_mps

from ...pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
from ...test_pipelines_common import PipelineTesterMixin


torch.backends.cuda.matmul.allow_tf32 = False


@skip_mps
class StableDiffusionDepth2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = StableDiffusionDepth2ImgPipeline
    test_save_load_optional_components = False
    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"}
    required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
    batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS

    def get_dummy_components(self):
        torch.manual_seed(0)
        unet = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=5,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
            attention_head_dim=(2, 4),
            use_linear_projection=True,
        )
        scheduler = PNDMScheduler(skip_prk_steps=True)
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
        )
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
        )
        text_encoder = CLIPTextModel(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        backbone_config = {
            "global_padding": "same",
            "layer_type": "bottleneck",
            "depths": [3, 4, 9],
            "out_features": ["stage1", "stage2", "stage3"],
            "embedding_dynamic_padding": True,
            "hidden_sizes": [96, 192, 384, 768],
            "num_groups": 2,
        }
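        # A tiny hybrid DPT estimator: ViT layers on top of the small BiT backbone configured above.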
        depth_estimator_config = DPTConfig(
            image_size=32,
            patch_size=16,
            num_channels=3,
            hidden_size=32,
            num_hidden_layers=4,
            backbone_out_indices=(0, 1, 2, 3),
            num_attention_heads=4,
            intermediate_size=37,
            hidden_act="gelu",
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            is_decoder=False,
            initializer_range=0.02,
            is_hybrid=True,
            backbone_config=backbone_config,
            backbone_featmap_shape=[1, 384, 24, 24],
        )
        depth_estimator = DPTForDepthEstimation(depth_estimator_config)
        feature_extractor = DPTFeatureExtractor.from_pretrained(
            "hf-internal-testing/tiny-random-DPTForDepthEstimation"
        )

        components = {
            "unet": unet,
            "scheduler": scheduler,
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "depth_estimator": depth_estimator,
            "feature_extractor": feature_extractor,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed))
        image = image.cpu().permute(0, 2, 3, 1)[0]
        image = Image.fromarray(np.uint8(image)).convert("RGB").resize((32, 32))
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "image": image,
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "output_type": "numpy",
        }
        return inputs

    def test_save_load_local(self):
        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(torch_device)
        output = pipe(**inputs)[0]

        with tempfile.TemporaryDirectory() as tmpdir:
            pipe.save_pretrained(tmpdir)
            pipe_loaded = self.pipeline_class.from_pretrained(tmpdir)
            pipe_loaded.to(torch_device)
            pipe_loaded.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(torch_device)
        output_loaded = pipe_loaded(**inputs)[0]

        max_diff = np.abs(output - output_loaded).max()
        self.assertLess(max_diff, 1e-4)

    @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA")
    def test_save_load_float16(self):
        components = self.get_dummy_components()
        for name, module in components.items():
            if hasattr(module, "half"):
                components[name] = module.to(torch_device).half()
        pipe = self.pipeline_class(**components)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(torch_device)
        output = pipe(**inputs)[0]

        with tempfile.TemporaryDirectory() as tmpdir:
            pipe.save_pretrained(tmpdir)
            pipe_loaded = self.pipeline_class.from_pretrained(tmpdir, torch_dtype=torch.float16)
            pipe_loaded.to(torch_device)
            pipe_loaded.set_progress_bar_config(disable=None)

        for name, component in pipe_loaded.components.items():
            if hasattr(component, "dtype"):
                self.assertTrue(
                    component.dtype == torch.float16,
                    f"`{name}.dtype` switched from `float16` to {component.dtype} after loading.",
                )

        inputs = self.get_dummy_inputs(torch_device)
        output_loaded = pipe_loaded(**inputs)[0]

        max_diff = np.abs(output - output_loaded).max()
        self.assertLess(max_diff, 2e-2, "The output of the fp16 pipeline changed after saving and loading.")
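
    # Run the same components through the pipeline in fp32 and fp16 and check that the outputs stay close.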
    @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA")
    def test_float16_inference(self):
        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        for name, module in components.items():
            if hasattr(module, "half"):
                components[name] = module.half()
        pipe_fp16 = self.pipeline_class(**components)
        pipe_fp16.to(torch_device)
        pipe_fp16.set_progress_bar_config(disable=None)

        output = pipe(**self.get_dummy_inputs(torch_device))[0]
        output_fp16 = pipe_fp16(**self.get_dummy_inputs(torch_device))[0]

        max_diff = np.abs(output - output_fp16).max()
        self.assertLess(max_diff, 1.3e-2, "The outputs of the fp16 and fp32 pipelines are too different.")

    @unittest.skipIf(
        torch_device != "cuda" or not is_accelerate_available() or is_accelerate_version("<", "0.14.0"),
        reason="CPU offload is only available with CUDA and `accelerate v0.14.0` or higher",
    )
    def test_cpu_offload_forward_pass(self):
        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(torch_device)
        output_without_offload = pipe(**inputs)[0]

        pipe.enable_sequential_cpu_offload()
        inputs = self.get_dummy_inputs(torch_device)
        output_with_offload = pipe(**inputs)[0]

        max_diff = np.abs(output_with_offload - output_without_offload).max()
        self.assertLess(max_diff, 1e-4, "CPU offloading should not affect the inference results")

    def test_dict_tuple_outputs_equivalent(self):
        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        output = pipe(**self.get_dummy_inputs(torch_device))[0]
        output_tuple = pipe(**self.get_dummy_inputs(torch_device), return_dict=False)[0]

        max_diff = np.abs(output - output_tuple).max()
        self.assertLess(max_diff, 1e-4)

    def test_progress_bar(self):
        super().test_progress_bar()

    def test_stable_diffusion_depth2img_default_case(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        pipe = StableDiffusionDepth2ImgPipeline(**components)
        pipe = pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        image = pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 32, 32, 3)
        if torch_device == "mps":
            expected_slice = np.array([0.6071, 0.5035, 0.4378, 0.5776, 0.5753, 0.4316, 0.4513, 0.5263, 0.4546])
        else:
            expected_slice = np.array([0.6312, 0.4984, 0.4154, 0.4788, 0.5535, 0.4599, 0.4017, 0.5359, 0.4716])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3

    def test_stable_diffusion_depth2img_negative_prompt(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        pipe = StableDiffusionDepth2ImgPipeline(**components)
        pipe = pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        negative_prompt = "french fries"
        output = pipe(**inputs, negative_prompt=negative_prompt)
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 32, 32, 3)
        if torch_device == "mps":
            expected_slice = np.array([0.5825, 0.5135, 0.4095, 0.5452, 0.6059, 0.4211, 0.3994, 0.5177, 0.4335])
        else:
            expected_slice = np.array([0.6296, 0.5125, 0.3890, 0.4456, 0.5955, 0.4621, 0.3810, 0.5310, 0.4626])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
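
    # Passing lists for both `prompt` and `image` should yield a batch of two images.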
    def test_stable_diffusion_depth2img_multiple_init_images(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        pipe = StableDiffusionDepth2ImgPipeline(**components)
        pipe = pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        inputs["prompt"] = [inputs["prompt"]] * 2
        inputs["image"] = 2 * [inputs["image"]]
        image = pipe(**inputs).images
        image_slice = image[-1, -3:, -3:, -1]

        assert image.shape == (2, 32, 32, 3)

        if torch_device == "mps":
            expected_slice = np.array([0.6501, 0.5150, 0.4939, 0.6688, 0.5437, 0.5758, 0.5115, 0.4406, 0.4551])
        else:
            expected_slice = np.array([0.6267, 0.5232, 0.6001, 0.6738, 0.5029, 0.6429, 0.5364, 0.4159, 0.4674])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3

    def test_stable_diffusion_depth2img_pil(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        pipe = StableDiffusionDepth2ImgPipeline(**components)
        pipe = pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)

        image = pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1]

        if torch_device == "mps":
            expected_slice = np.array([0.53232, 0.47015, 0.40868, 0.45651, 0.4891, 0.4668, 0.4287, 0.48822, 0.47439])
        else:
            expected_slice = np.array([0.6312, 0.4984, 0.4154, 0.4788, 0.5535, 0.4599, 0.4017, 0.5359, 0.4716])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3

    @skip_mps
    def test_attention_slicing_forward_pass(self):
        return super().test_attention_slicing_forward_pass()


@slow
@require_torch_gpu
class StableDiffusionDepth2ImgPipelineSlowTests(unittest.TestCase):
    def tearDown(self):
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def get_inputs(self, device="cpu", dtype=torch.float32, seed=0):
        generator = torch.Generator(device=device).manual_seed(seed)
        init_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/depth2img/two_cats.png"
        )
        inputs = {
            "prompt": "two tigers",
            "image": init_image,
            "generator": generator,
            "num_inference_steps": 3,
            "strength": 0.75,
            "guidance_scale": 7.5,
            "output_type": "numpy",
        }
        return inputs

    def test_stable_diffusion_depth2img_pipeline_default(self):
        pipe = StableDiffusionDepth2ImgPipeline.from_pretrained(
            "stabilityai/stable-diffusion-2-depth", safety_checker=None
        )
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs()
        image = pipe(**inputs).images
        image_slice = image[0, 253:256, 253:256, -1].flatten()

        assert image.shape == (1, 480, 640, 3)
        expected_slice = np.array([0.9057, 0.9365, 0.9258, 0.8937, 0.8555, 0.8541, 0.8260, 0.7747, 0.7421])

        assert np.abs(expected_slice - image_slice).max() < 1e-4

    def test_stable_diffusion_depth2img_pipeline_k_lms(self):
        pipe = StableDiffusionDepth2ImgPipeline.from_pretrained(
            "stabilityai/stable-diffusion-2-depth", safety_checker=None
        )
        pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs()
        image = pipe(**inputs).images
        image_slice = image[0, 253:256, 253:256, -1].flatten()

        assert image.shape == (1, 480, 640, 3)
        expected_slice = np.array([0.6363, 0.6274, 0.6309, 0.6370, 0.6226, 0.6286, 0.6213, 0.6453, 0.6306])

        assert np.abs(expected_slice - image_slice).max() < 1e-4
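
    # Same checkpoint with the DDIM scheduler swapped in via `from_config`.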
    def test_stable_diffusion_depth2img_pipeline_ddim(self):
        pipe = StableDiffusionDepth2ImgPipeline.from_pretrained(
            "stabilityai/stable-diffusion-2-depth", safety_checker=None
        )
        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs()
        image = pipe(**inputs).images
        image_slice = image[0, 253:256, 253:256, -1].flatten()

        assert image.shape == (1, 480, 640, 3)
        expected_slice = np.array([0.6424, 0.6524, 0.6249, 0.6041, 0.6634, 0.6420, 0.6522, 0.6555, 0.6436])

        assert np.abs(expected_slice - image_slice).max() < 1e-4

    def test_stable_diffusion_depth2img_intermediate_state(self):
        number_of_steps = 0

        def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None:
            callback_fn.has_been_called = True
            nonlocal number_of_steps
            number_of_steps += 1
            if step == 1:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 60, 80)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array(
                    [-0.7168, -1.5137, -0.1418, -2.9219, -2.7266, -2.4414, -2.1035, -3.0078, -1.7051]
                )

                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2
            elif step == 2:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 60, 80)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array(
                    [-0.7109, -1.5068, -0.1403, -2.9160, -2.7207, -2.4414, -2.1035, -3.0059, -1.7090]
                )

                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2

        callback_fn.has_been_called = False

        pipe = StableDiffusionDepth2ImgPipeline.from_pretrained(
            "stabilityai/stable-diffusion-2-depth", safety_checker=None, torch_dtype=torch.float16
        )
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs(dtype=torch.float16)
        pipe(**inputs, callback=callback_fn, callback_steps=1)
        assert callback_fn.has_been_called
        assert number_of_steps == 2

    def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        pipe = StableDiffusionDepth2ImgPipeline.from_pretrained(
            "stabilityai/stable-diffusion-2-depth", safety_checker=None, torch_dtype=torch.float16
        )
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing(1)
        pipe.enable_sequential_cpu_offload()

        inputs = self.get_inputs(dtype=torch.float16)
        _ = pipe(**inputs)

        mem_bytes = torch.cuda.max_memory_allocated()
        # make sure that less than 2.9 GB is allocated
        assert mem_bytes < 2.9 * 10**9


@nightly
@require_torch_gpu
class StableDiffusionDepth2ImgPipelineNightlyTests(unittest.TestCase):
    def tearDown(self):
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def get_inputs(self, device="cpu", dtype=torch.float32, seed=0):
        generator = torch.Generator(device=device).manual_seed(seed)
        init_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/depth2img/two_cats.png"
        )
        inputs = {
            "prompt": "two tigers",
            "image": init_image,
            "generator": generator,
            "num_inference_steps": 3,
            "strength": 0.75,
            "guidance_scale": 7.5,
            "output_type": "numpy",
        }
        return inputs
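
    # The nightly tests compare the full decoded image against reference arrays stored on the Hub.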
    def test_depth2img_pndm(self):
        pipe = StableDiffusionDepth2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-2-depth")
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs()
        image = pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_depth2img/stable_diffusion_2_0_pndm.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3

    def test_depth2img_ddim(self):
        pipe = StableDiffusionDepth2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-2-depth")
        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs()
        image = pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_depth2img/stable_diffusion_2_0_ddim.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3

    def test_depth2img_lms(self):
        pipe = StableDiffusionDepth2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-2-depth")
        pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs()
        image = pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_depth2img/stable_diffusion_2_0_lms.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3

    def test_depth2img_dpm(self):
        pipe = StableDiffusionDepth2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-2-depth")
        pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs()
        inputs["num_inference_steps"] = 30
        image = pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_depth2img/stable_diffusion_2_0_dpm_multi.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3
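

# A minimal sketch of how to run this module locally, assuming a diffusers dev
# checkout; the `@slow` and `@nightly` suites are gated behind the RUN_SLOW and
# RUN_NIGHTLY environment variables used by the diffusers test runner:
#
#   python -m pytest tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py -k FastTests
#   RUN_SLOW=1 python -m pytest tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py -k SlowTests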