Path: tests/pipelines/stable_diffusion/test_stable_diffusion_controlnet.py
# coding=utf-8
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import tempfile
import unittest

import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

from diffusers import (
    AutoencoderKL,
    ControlNetModel,
    DDIMScheduler,
    StableDiffusionControlNetPipeline,
    UNet2DConditionModel,
)
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet import MultiControlNetModel
from diffusers.utils import load_image, load_numpy, randn_tensor, slow, torch_device
from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.testing_utils import require_torch_gpu

from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
from ...test_pipelines_common import PipelineTesterMixin


class StableDiffusionControlNetPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = StableDiffusionControlNetPipeline
    params = TEXT_TO_IMAGE_PARAMS
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS

    def get_dummy_components(self):
        torch.manual_seed(0)
        unet = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
        )
        torch.manual_seed(0)
        controlnet = ControlNetModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            in_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            cross_attention_dim=32,
            conditioning_embedding_out_channels=(16, 32),
        )
        torch.manual_seed(0)
        scheduler = DDIMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            clip_sample=False,
            set_alpha_to_one=False,
        )
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
        )
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
        )
        text_encoder = CLIPTextModel(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        components = {
            "unet": unet,
            "controlnet": controlnet,
            "scheduler": scheduler,
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "safety_checker": None,
            "feature_extractor": None,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)

        controlnet_embedder_scale_factor = 2
        image = randn_tensor(
            (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor),
            generator=generator,
            device=torch.device(device),
        )

        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "output_type": "numpy",
            "image": image,
        }

        return inputs

    def test_attention_slicing_forward_pass(self):
        return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3)

    @unittest.skipIf(
        torch_device != "cuda" or not is_xformers_available(),
        reason="XFormers attention is only available with CUDA and `xformers` installed",
    )
    def test_xformers_attention_forwardGenerator_pass(self):
        self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3)

    def test_inference_batch_single_identical(self):
        self._test_inference_batch_single_identical(expected_max_diff=2e-3)
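

# The multi-ControlNet fast tests below mirror the single-ControlNet suite:
# they build two tiny ControlNets, wrap them in a MultiControlNetModel, and
# pass a list of two conditioning images instead of a single tensor.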
class StableDiffusionMultiControlNetPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = StableDiffusionControlNetPipeline
    params = TEXT_TO_IMAGE_PARAMS
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS

    def get_dummy_components(self):
        torch.manual_seed(0)
        unet = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
        )
        torch.manual_seed(0)
        controlnet1 = ControlNetModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            in_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            cross_attention_dim=32,
            conditioning_embedding_out_channels=(16, 32),
        )
        torch.manual_seed(0)
        controlnet2 = ControlNetModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            in_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            cross_attention_dim=32,
            conditioning_embedding_out_channels=(16, 32),
        )
        torch.manual_seed(0)
        scheduler = DDIMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            clip_sample=False,
            set_alpha_to_one=False,
        )
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
        )
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
        )
        text_encoder = CLIPTextModel(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        controlnet = MultiControlNetModel([controlnet1, controlnet2])

        components = {
            "unet": unet,
            "controlnet": controlnet,
            "scheduler": scheduler,
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "safety_checker": None,
            "feature_extractor": None,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)

        controlnet_embedder_scale_factor = 2

        images = [
            randn_tensor(
                (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor),
                generator=generator,
                device=torch.device(device),
            ),
            randn_tensor(
                (1, 3, 32 * controlnet_embedder_scale_factor, 32 * controlnet_embedder_scale_factor),
                generator=generator,
                device=torch.device(device),
            ),
        ]

        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "output_type": "numpy",
            "image": images,
        }

        return inputs

    def test_attention_slicing_forward_pass(self):
        return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3)

    @unittest.skipIf(
        torch_device != "cuda" or not is_xformers_available(),
        reason="XFormers attention is only available with CUDA and `xformers` installed",
    )
    def test_xformers_attention_forwardGenerator_pass(self):
        self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=2e-3)

    def test_inference_batch_single_identical(self):
        self._test_inference_batch_single_identical(expected_max_diff=2e-3)

    def test_save_pretrained_raise_not_implemented_exception(self):
        components = self.get_dummy_components()
        pipe = self.pipeline_class(**components)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        with tempfile.TemporaryDirectory() as tmpdir:
            # save_pretrained is not implemented for Multi-ControlNet, so the
            # call must raise instead of silently writing a broken checkpoint
            with self.assertRaises(NotImplementedError):
                pipe.save_pretrained(tmpdir)

    # override PipelineTesterMixin
    @unittest.skip("save pretrained not implemented")
    def test_save_load_float16(self):
        ...

    # override PipelineTesterMixin
    @unittest.skip("save pretrained not implemented")
    def test_save_load_local(self):
        ...

    # override PipelineTesterMixin
    @unittest.skip("save pretrained not implemented")
    def test_save_load_optional_components(self):
        ...
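

# Slow integration tests: each test pairs a pretrained ControlNet checkpoint
# from the lllyasviel/sd-controlnet-* family with runwayml/stable-diffusion-v1-5
# and compares the generated image against a reference .npy stored on the Hub.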
@slow
@require_torch_gpu
class StableDiffusionControlNetPipelineSlowTests(unittest.TestCase):
    def tearDown(self):
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def test_canny(self):
        controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")

        pipe = StableDiffusionControlNetPipeline.from_pretrained(
            "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
        )
        pipe.enable_model_cpu_offload()
        pipe.set_progress_bar_config(disable=None)

        generator = torch.Generator(device="cpu").manual_seed(0)
        prompt = "bird"
        image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
        )

        output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3)

        image = output.images[0]

        assert image.shape == (768, 512, 3)

        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny_out.npy"
        )

        assert np.abs(expected_image - image).max() < 5e-3

    def test_depth(self):
        controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-depth")

        pipe = StableDiffusionControlNetPipeline.from_pretrained(
            "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
        )
        pipe.enable_model_cpu_offload()
        pipe.set_progress_bar_config(disable=None)

        generator = torch.Generator(device="cpu").manual_seed(0)
        prompt = "Stormtrooper's lecture"
        image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth.png"
        )

        output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3)

        image = output.images[0]

        assert image.shape == (512, 512, 3)

        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/stormtrooper_depth_out.npy"
        )

        assert np.abs(expected_image - image).max() < 5e-3
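
    # The remaining single-ControlNet modalities (HED soft edges, M-LSD line
    # segments, normal maps, OpenPose keypoints) follow the same recipe; the
    # expected output shape tracks the resolution of the conditioning image.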
    def test_hed(self):
        controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-hed")

        pipe = StableDiffusionControlNetPipeline.from_pretrained(
            "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
        )
        pipe.enable_model_cpu_offload()
        pipe.set_progress_bar_config(disable=None)

        generator = torch.Generator(device="cpu").manual_seed(0)
        prompt = "oil painting of handsome old man, masterpiece"
        image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/man_hed.png"
        )

        output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3)

        image = output.images[0]

        assert image.shape == (704, 512, 3)

        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/man_hed_out.npy"
        )

        assert np.abs(expected_image - image).max() < 5e-3

    def test_mlsd(self):
        controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-mlsd")

        pipe = StableDiffusionControlNetPipeline.from_pretrained(
            "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
        )
        pipe.enable_model_cpu_offload()
        pipe.set_progress_bar_config(disable=None)

        generator = torch.Generator(device="cpu").manual_seed(0)
        prompt = "room"
        image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/room_mlsd.png"
        )

        output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3)

        image = output.images[0]

        assert image.shape == (704, 512, 3)

        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/room_mlsd_out.npy"
        )

        assert np.abs(expected_image - image).max() < 5e-3

    def test_normal(self):
        controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-normal")

        pipe = StableDiffusionControlNetPipeline.from_pretrained(
            "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
        )
        pipe.enable_model_cpu_offload()
        pipe.set_progress_bar_config(disable=None)

        generator = torch.Generator(device="cpu").manual_seed(0)
        prompt = "cute toy"
        image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/cute_toy_normal.png"
        )

        output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3)

        image = output.images[0]

        assert image.shape == (512, 512, 3)

        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/cute_toy_normal_out.npy"
        )

        assert np.abs(expected_image - image).max() < 5e-3

    def test_openpose(self):
        controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose")

        pipe = StableDiffusionControlNetPipeline.from_pretrained(
            "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
        )
        pipe.enable_model_cpu_offload()
        pipe.set_progress_bar_config(disable=None)

        generator = torch.Generator(device="cpu").manual_seed(0)
        prompt = "Chef in the kitchen"
        image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/pose.png"
        )

        output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3)

        image = output.images[0]

        assert image.shape == (768, 512, 3)

        expected_image = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/chef_pose_out.npy"
        )

        assert np.abs(expected_image - image).max() < 5e-3
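
    # Note: the scribble and seg tests seed the generator with 5 instead of 0,
    # presumably matching the seed used when the reference outputs were created.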
kitchen"455image = load_image(456"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/pose.png"457)458459output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3)460461image = output.images[0]462463assert image.shape == (768, 512, 3)464465expected_image = load_numpy(466"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/chef_pose_out.npy"467)468469assert np.abs(expected_image - image).max() < 5e-3470471def test_scribble(self):472controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-scribble")473474pipe = StableDiffusionControlNetPipeline.from_pretrained(475"runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet476)477pipe.enable_model_cpu_offload()478pipe.set_progress_bar_config(disable=None)479480generator = torch.Generator(device="cpu").manual_seed(5)481prompt = "bag"482image = load_image(483"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bag_scribble.png"484)485486output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3)487488image = output.images[0]489490assert image.shape == (640, 512, 3)491492expected_image = load_numpy(493"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bag_scribble_out.npy"494)495496assert np.abs(expected_image - image).max() < 5e-3497498def test_seg(self):499controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-seg")500501pipe = StableDiffusionControlNetPipeline.from_pretrained(502"runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet503)504pipe.enable_model_cpu_offload()505pipe.set_progress_bar_config(disable=None)506507generator = torch.Generator(device="cpu").manual_seed(5)508prompt = "house"509image = load_image(510"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/house_seg.png"511)512513output = pipe(prompt, image, generator=generator, output_type="np", num_inference_steps=3)514515image = output.images[0]516517assert image.shape == (512, 512, 3)518519expected_image = load_numpy(520"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/house_seg_out.npy"521)522523assert np.abs(expected_image - image).max() < 5e-3524525def test_sequential_cpu_offloading(self):526torch.cuda.empty_cache()527torch.cuda.reset_max_memory_allocated()528torch.cuda.reset_peak_memory_stats()529530controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-seg")531532pipe = StableDiffusionControlNetPipeline.from_pretrained(533"runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet534)535pipe.set_progress_bar_config(disable=None)536pipe.enable_attention_slicing()537pipe.enable_sequential_cpu_offload()538539prompt = "house"540image = load_image(541"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/house_seg.png"542)543544_ = pipe(545prompt,546image,547num_inference_steps=2,548output_type="np",549)550551mem_bytes = torch.cuda.max_memory_allocated()552# make sure that less than 7 GB is allocated553assert mem_bytes < 4 * 10**9554555556@slow557@require_torch_gpu558class StableDiffusionMultiControlNetPipelineSlowTests(unittest.TestCase):559def tearDown(self):560super().tearDown()561gc.collect()562torch.cuda.empty_cache()563564def test_pose_and_canny(self):565controlnet_canny = 
ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny")566controlnet_pose = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose")567568pipe = StableDiffusionControlNetPipeline.from_pretrained(569"runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=[controlnet_pose, controlnet_canny]570)571pipe.enable_model_cpu_offload()572pipe.set_progress_bar_config(disable=None)573574generator = torch.Generator(device="cpu").manual_seed(0)575prompt = "bird and Chef"576image_canny = load_image(577"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"578)579image_pose = load_image(580"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/pose.png"581)582583output = pipe(prompt, [image_pose, image_canny], generator=generator, output_type="np", num_inference_steps=3)584585image = output.images[0]586587assert image.shape == (768, 512, 3)588589expected_image = load_numpy(590"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/pose_canny_out.npy"591)592593assert np.abs(expected_image - image).max() < 5e-2594595596