Path: tests/pipelines/stable_diffusion/test_stable_diffusion.py
# coding=utf-8
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import gc
import tempfile
import time
import unittest

import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

from diffusers import (
    AutoencoderKL,
    DDIMScheduler,
    DPMSolverMultistepScheduler,
    EulerAncestralDiscreteScheduler,
    EulerDiscreteScheduler,
    LMSDiscreteScheduler,
    PNDMScheduler,
    StableDiffusionPipeline,
    UNet2DConditionModel,
    logging,
)
from diffusers.models.attention_processor import AttnProcessor
from diffusers.utils import load_numpy, nightly, slow, torch_device
from diffusers.utils.testing_utils import CaptureLogger, require_torch_gpu

from ...models.test_models_unet_2d_condition import create_lora_layers
from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
from ...test_pipelines_common import PipelineTesterMixin


torch.backends.cuda.matmul.allow_tf32 = False


class StableDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = StableDiffusionPipeline
    params = TEXT_TO_IMAGE_PARAMS
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS

    def get_dummy_components(self):
        torch.manual_seed(0)
        unet = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
        )
        scheduler = DDIMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            clip_sample=False,
            set_alpha_to_one=False,
        )
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
        )
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
        )
        text_encoder = CLIPTextModel(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        components = {
            "unet": unet,
            "scheduler": scheduler,
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "safety_checker": None,
            "feature_extractor": None,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "output_type": "numpy",
        }
        return inputs

    def test_stable_diffusion_ddim(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs)
        image = output.images

        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array([0.5643, 0.6017, 0.4799, 0.5267, 0.5584, 0.4641, 0.5159, 0.4963, 0.4791])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_lora(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        # forward 1
        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs)
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        # set lora layers
        lora_attn_procs = create_lora_layers(sd_pipe.unet)
        sd_pipe.unet.set_attn_processor(lora_attn_procs)
        sd_pipe = sd_pipe.to(torch_device)

        # forward 2
        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs, cross_attention_kwargs={"scale": 0.0})
        image = output.images
        image_slice_1 = image[0, -3:, -3:, -1]

        # forward 3
        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs, cross_attention_kwargs={"scale": 0.5})
        image = output.images
        image_slice_2 = image[0, -3:, -3:, -1]

        assert np.abs(image_slice - image_slice_1).max() < 1e-2
        assert np.abs(image_slice - image_slice_2).max() > 1e-2

    def test_stable_diffusion_prompt_embeds(self):
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(torch_device)
        inputs["prompt"] = 3 * [inputs["prompt"]]

        # forward
        output = sd_pipe(**inputs)
        image_slice_1 = output.images[0, -3:, -3:, -1]

        inputs = self.get_dummy_inputs(torch_device)
        prompt = 3 * [inputs.pop("prompt")]

        text_inputs = sd_pipe.tokenizer(
            prompt,
            padding="max_length",
            max_length=sd_pipe.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        )
        text_inputs = text_inputs["input_ids"].to(torch_device)

        prompt_embeds = sd_pipe.text_encoder(text_inputs)[0]

        inputs["prompt_embeds"] = prompt_embeds

        # forward
        output = sd_pipe(**inputs)
        image_slice_2 = output.images[0, -3:, -3:, -1]

        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4

    def test_stable_diffusion_negative_prompt_embeds(self):
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(torch_device)
        negative_prompt = 3 * ["this is a negative prompt"]
        inputs["negative_prompt"] = negative_prompt
        inputs["prompt"] = 3 * [inputs["prompt"]]

        # forward
        output = sd_pipe(**inputs)
        image_slice_1 = output.images[0, -3:, -3:, -1]

        inputs = self.get_dummy_inputs(torch_device)
        prompt = 3 * [inputs.pop("prompt")]

        embeds = []
        for p in [prompt, negative_prompt]:
            text_inputs = sd_pipe.tokenizer(
                p,
                padding="max_length",
                max_length=sd_pipe.tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
            )
            text_inputs = text_inputs["input_ids"].to(torch_device)

            embeds.append(sd_pipe.text_encoder(text_inputs)[0])

        inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds

        # forward
        output = sd_pipe(**inputs)
        image_slice_2 = output.images[0, -3:, -3:, -1]

        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4

    def test_stable_diffusion_ddim_factor_8(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs, height=136, width=136)
        image = output.images

        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 136, 136, 3)
        expected_slice = np.array([0.5524, 0.5626, 0.6069, 0.4727, 0.386, 0.3995, 0.4613, 0.4328, 0.4269])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_pndm(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe.scheduler = PNDMScheduler(skip_prk_steps=True)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs)
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array([0.5094, 0.5674, 0.4667, 0.5125, 0.5696, 0.4674, 0.5277, 0.4964, 0.4945])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_no_safety_checker(self):
        pipe = StableDiffusionPipeline.from_pretrained(
            "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None
        )
        assert isinstance(pipe, StableDiffusionPipeline)
        assert isinstance(pipe.scheduler, LMSDiscreteScheduler)
        assert pipe.safety_checker is None

        image = pipe("example prompt", num_inference_steps=2).images[0]
        assert image is not None

        # check that there's no error when saving a pipeline with one of the models being None
        with tempfile.TemporaryDirectory() as tmpdirname:
            pipe.save_pretrained(tmpdirname)
            pipe = StableDiffusionPipeline.from_pretrained(tmpdirname)

        # sanity check that the pipeline still works
        assert pipe.safety_checker is None
        image = pipe("example prompt", num_inference_steps=2).images[0]
        assert image is not None

    def test_stable_diffusion_k_lms(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs)
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array(
            [
                0.47082293033599854,
                0.5371589064598083,
                0.4562119245529175,
                0.5220914483070374,
                0.5733777284622192,
                0.4795039892196655,
                0.5465868711471558,
                0.5074326395988464,
                0.5042197108268738,
            ]
        )

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_k_euler_ancestral(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs)
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array(
            [
                0.4707113206386566,
                0.5372191071510315,
                0.4563021957874298,
                0.5220003724098206,
                0.5734264850616455,
                0.4794946610927582,
                0.5463782548904419,
                0.5074145197868347,
                0.504422664642334,
            ]
        )

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_k_euler(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator

        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        output = sd_pipe(**inputs)
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array(
            [
                0.47082313895225525,
                0.5371587872505188,
                0.4562119245529175,
                0.5220913887023926,
                0.5733776688575745,
                0.47950395941734314,
                0.546586811542511,
                0.5074326992034912,
                0.5042197108268738,
            ]
        )

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_vae_slicing(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config)
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        image_count = 4

        inputs = self.get_dummy_inputs(device)
        inputs["prompt"] = [inputs["prompt"]] * image_count
        output_1 = sd_pipe(**inputs)

        # make sure sliced vae decode yields the same result
        sd_pipe.enable_vae_slicing()
        inputs = self.get_dummy_inputs(device)
        inputs["prompt"] = [inputs["prompt"]] * image_count
        output_2 = sd_pipe(**inputs)

        # there is a small discrepancy at image borders vs. full batch decode
        assert np.abs(output_2.images.flatten() - output_1.images.flatten()).max() < 3e-3

    def test_stable_diffusion_vae_tiling(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()

        # make sure the pipeline runs without a safety checker
        components["safety_checker"] = None
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"

        # Test that tiled decode at 512x512 yields the same result as the non-tiled decode
        generator = torch.Generator(device=device).manual_seed(0)
        output_1 = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")

        # make sure tiled vae decode yields the same result
        sd_pipe.enable_vae_tiling()
        generator = torch.Generator(device=device).manual_seed(0)
        output_2 = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")

        assert np.abs(output_2.images.flatten() - output_1.images.flatten()).max() < 5e-1

        # test that tiled decode works with various shapes
        shapes = [(1, 4, 73, 97), (1, 4, 97, 73), (1, 4, 49, 65), (1, 4, 65, 49)]
        for shape in shapes:
            zeros = torch.zeros(shape).to(device)
            sd_pipe.vae.decode(zeros)

    def test_stable_diffusion_negative_prompt(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        components["scheduler"] = PNDMScheduler(skip_prk_steps=True)
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        negative_prompt = "french fries"
        output = sd_pipe(**inputs, negative_prompt=negative_prompt)

        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
        expected_slice = np.array(
            [
                0.5108221173286438,
                0.5688379406929016,
                0.4685141146183014,
                0.5098261833190918,
                0.5657756328582764,
                0.4631010890007019,
                0.5226285457611084,
                0.49129390716552734,
                0.4899061322212219,
            ]
        )

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_long_prompt(self):
        components = self.get_dummy_components()
        components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config)
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        do_classifier_free_guidance = True
        negative_prompt = None
        num_images_per_prompt = 1
        logger = logging.get_logger("diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion")

        prompt = 25 * "@"
        with CaptureLogger(logger) as cap_logger_3:
            text_embeddings_3 = sd_pipe._encode_prompt(
                prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
            )

        prompt = 100 * "@"
        with CaptureLogger(logger) as cap_logger:
            text_embeddings = sd_pipe._encode_prompt(
                prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
            )

        negative_prompt = "Hello"
        with CaptureLogger(logger) as cap_logger_2:
            text_embeddings_2 = sd_pipe._encode_prompt(
                prompt, torch_device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt
            )

        assert text_embeddings_3.shape == text_embeddings_2.shape == text_embeddings.shape
        assert text_embeddings.shape[1] == 77

        assert cap_logger.out == cap_logger_2.out
        # 100 - 77 + 1 (BOS token) + 1 (EOS token) = 25
        assert cap_logger.out.count("@") == 25
        assert cap_logger_3.out == ""

    def test_stable_diffusion_height_width_opt(self):
        components = self.get_dummy_components()
        components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config)
        sd_pipe = StableDiffusionPipeline(**components)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "hey"

        output = sd_pipe(prompt, num_inference_steps=1, output_type="np")
        image_shape = output.images[0].shape[:2]
        assert image_shape == (64, 64)

        output = sd_pipe(prompt, num_inference_steps=1, height=96, width=96, output_type="np")
        image_shape = output.images[0].shape[:2]
        assert image_shape == (96, 96)

        config = dict(sd_pipe.unet.config)
        config["sample_size"] = 96
        sd_pipe.unet = UNet2DConditionModel.from_config(config).to(torch_device)
        output = sd_pipe(prompt, num_inference_steps=1, output_type="np")
        image_shape = output.images[0].shape[:2]
        assert image_shape == (192, 192)


@slow
@require_torch_gpu
class StableDiffusionPipelineSlowTests(unittest.TestCase):
    def tearDown(self):
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
        generator = torch.Generator(device=generator_device).manual_seed(seed)
        latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64))
        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
        inputs = {
            "prompt": "a photograph of an astronaut riding a horse",
            "latents": latents,
            "generator": generator,
            "num_inference_steps": 3,
            "guidance_scale": 7.5,
            "output_type": "numpy",
        }
        return inputs

    def test_stable_diffusion_1_1_pndm(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-1")
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.43625, 0.43554, 0.36670, 0.40660, 0.39703, 0.38658, 0.43936, 0.43557, 0.40592])
        assert np.abs(image_slice - expected_slice).max() < 1e-4

    def test_stable_diffusion_1_4_pndm(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.57400, 0.47841, 0.31625, 0.63583, 0.58306, 0.55056, 0.50825, 0.56306, 0.55748])
        assert np.abs(image_slice - expected_slice).max() < 1e-4

    def test_stable_diffusion_ddim(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
        sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.38019, 0.28647, 0.27321, 0.40377, 0.38290, 0.35446, 0.39218, 0.38165, 0.42239])
        assert np.abs(image_slice - expected_slice).max() < 1e-4

    def test_stable_diffusion_lms(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
        sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.10542, 0.09620, 0.07332, 0.09015, 0.09382, 0.07597, 0.08496, 0.07806, 0.06455])
        assert np.abs(image_slice - expected_slice).max() < 1e-4

    def test_stable_diffusion_dpm(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None)
        sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.03503, 0.03494, 0.01087, 0.03128, 0.02552, 0.00803, 0.00742, 0.00372, 0.00000])
        assert np.abs(image_slice - expected_slice).max() < 1e-4

    def test_stable_diffusion_attention_slicing(self):
        torch.cuda.reset_peak_memory_stats()
        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        # enable attention slicing
        pipe.enable_attention_slicing()
        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        image_sliced = pipe(**inputs).images

        mem_bytes = torch.cuda.max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()
        # make sure that less than 3.75 GB is allocated
        assert mem_bytes < 3.75 * 10**9

        # disable slicing
        pipe.disable_attention_slicing()
        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        image = pipe(**inputs).images

        # make sure that more than 3.75 GB is allocated
        mem_bytes = torch.cuda.max_memory_allocated()
        assert mem_bytes > 3.75 * 10**9
        assert np.abs(image_sliced - image).max() < 1e-3

    def test_stable_diffusion_vae_slicing(self):
        torch.cuda.reset_peak_memory_stats()
        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        # enable vae slicing
        pipe.enable_vae_slicing()
        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        inputs["prompt"] = [inputs["prompt"]] * 4
        inputs["latents"] = torch.cat([inputs["latents"]] * 4)
        image_sliced = pipe(**inputs).images

        mem_bytes = torch.cuda.max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()
        # make sure that less than 4 GB is allocated
        assert mem_bytes < 4e9

        # disable vae slicing
        pipe.disable_vae_slicing()
        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        inputs["prompt"] = [inputs["prompt"]] * 4
        inputs["latents"] = torch.cat([inputs["latents"]] * 4)
        image = pipe(**inputs).images

        # make sure that more than 4 GB is allocated
        mem_bytes = torch.cuda.max_memory_allocated()
        assert mem_bytes > 4e9
        # There is a small discrepancy at the image borders vs. a fully batched version.
        assert np.abs(image_sliced - image).max() < 1e-2

    def test_stable_diffusion_vae_tiling(self):
        torch.cuda.reset_peak_memory_stats()
        model_id = "CompVis/stable-diffusion-v1-4"
        pipe = StableDiffusionPipeline.from_pretrained(model_id, revision="fp16", torch_dtype=torch.float16)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()
        pipe.unet = pipe.unet.to(memory_format=torch.channels_last)
        pipe.vae = pipe.vae.to(memory_format=torch.channels_last)

        prompt = "a photograph of an astronaut riding a horse"

        # enable vae tiling
        pipe.enable_vae_tiling()
        pipe.enable_model_cpu_offload()
        generator = torch.Generator(device="cpu").manual_seed(0)
        output_chunked = pipe(
            [prompt],
            width=1024,
            height=1024,
            generator=generator,
            guidance_scale=7.5,
            num_inference_steps=2,
            output_type="numpy",
        )
        image_chunked = output_chunked.images

        mem_bytes = torch.cuda.max_memory_allocated()

        # disable vae tiling
        pipe.disable_vae_tiling()
        generator = torch.Generator(device="cpu").manual_seed(0)
        output = pipe(
            [prompt],
            width=1024,
            height=1024,
            generator=generator,
            guidance_scale=7.5,
            num_inference_steps=2,
            output_type="numpy",
        )
        image = output.images

        assert mem_bytes < 1e10
        assert np.abs(image_chunked.flatten() - image.flatten()).max() < 1e-2

    def test_stable_diffusion_fp16_vs_autocast(self):
        # this test makes sure that the original model with autocast
        # and the new model with fp16 yield the same result
        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        image_fp16 = pipe(**inputs).images

        with torch.autocast(torch_device):
            inputs = self.get_inputs(torch_device)
            image_autocast = pipe(**inputs).images

        # Make sure results are close enough
        diff = np.abs(image_fp16.flatten() - image_autocast.flatten())
        # They ARE different since ops are not run always at the same precision
        # however, they should be extremely close.
        assert diff.mean() < 2e-2

    def test_stable_diffusion_intermediate_state(self):
        number_of_steps = 0

        def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None:
            callback_fn.has_been_called = True
            nonlocal number_of_steps
            number_of_steps += 1
            if step == 1:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 64)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array(
                    [-0.5693, -0.3018, -0.9746, 0.0518, -0.8770, 0.7559, -1.7402, 0.1022, 1.1582]
                )

                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2
            elif step == 2:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 64)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array(
                    [-0.1958, -0.2993, -1.0166, -0.5005, -0.4810, 0.6162, -0.9492, 0.6621, 1.4492]
                )

                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2

        callback_fn.has_been_called = False

        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        pipe(**inputs, callback=callback_fn, callback_steps=1)
        assert callback_fn.has_been_called
        assert number_of_steps == inputs["num_inference_steps"]

    def test_stable_diffusion_low_cpu_mem_usage(self):
        pipeline_id = "CompVis/stable-diffusion-v1-4"

        start_time = time.time()
        pipeline_low_cpu_mem_usage = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16)
        pipeline_low_cpu_mem_usage.to(torch_device)
        low_cpu_mem_usage_time = time.time() - start_time

        start_time = time.time()
        _ = StableDiffusionPipeline.from_pretrained(pipeline_id, torch_dtype=torch.float16, low_cpu_mem_usage=False)
        normal_load_time = time.time() - start_time

        assert 2 * low_cpu_mem_usage_time < normal_load_time

    def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing(1)
        pipe.enable_sequential_cpu_offload()

        inputs = self.get_inputs(torch_device, dtype=torch.float16)
        _ = pipe(**inputs)

        mem_bytes = torch.cuda.max_memory_allocated()
        # make sure that less than 2.8 GB is allocated
        assert mem_bytes < 2.8 * 10**9

    def test_stable_diffusion_pipeline_with_model_offloading(self):
        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        inputs = self.get_inputs(torch_device, dtype=torch.float16)

        # Normal inference

        pipe = StableDiffusionPipeline.from_pretrained(
            "CompVis/stable-diffusion-v1-4",
            torch_dtype=torch.float16,
        )
        pipe.unet.set_attn_processor(AttnProcessor())
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        outputs = pipe(**inputs)
        mem_bytes = torch.cuda.max_memory_allocated()

        # With model offloading

        # Reload but don't move to cuda
        pipe = StableDiffusionPipeline.from_pretrained(
            "CompVis/stable-diffusion-v1-4",
            torch_dtype=torch.float16,
        )
        pipe.unet.set_attn_processor(AttnProcessor())

        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        pipe.enable_model_cpu_offload()
        pipe.set_progress_bar_config(disable=None)
        inputs = self.get_inputs(torch_device, dtype=torch.float16)

        outputs_offloaded = pipe(**inputs)
        mem_bytes_offloaded = torch.cuda.max_memory_allocated()

        assert np.abs(outputs.images - outputs_offloaded.images).max() < 1e-3
        assert mem_bytes_offloaded < mem_bytes
        assert mem_bytes_offloaded < 3.5 * 10**9
        for module in pipe.text_encoder, pipe.unet, pipe.vae, pipe.safety_checker:
            assert module.device == torch.device("cpu")

        # With attention slicing
        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        pipe.enable_attention_slicing()
        _ = pipe(**inputs)
        mem_bytes_slicing = torch.cuda.max_memory_allocated()

        assert mem_bytes_slicing < mem_bytes_offloaded
        assert mem_bytes_slicing < 3 * 10**9


@nightly
@require_torch_gpu
class StableDiffusionPipelineNightlyTests(unittest.TestCase):
    def tearDown(self):
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
        generator = torch.Generator(device=generator_device).manual_seed(seed)
        latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64))
        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
        inputs = {
            "prompt": "a photograph of an astronaut riding a horse",
            "latents": latents,
            "generator": generator,
            "num_inference_steps": 50,
            "guidance_scale": 7.5,
            "output_type": "numpy",
        }
        return inputs

    def test_stable_diffusion_1_4_pndm(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_text2img/stable_diffusion_1_4_pndm.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3

    def test_stable_diffusion_1_5_pndm(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5").to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_text2img/stable_diffusion_1_5_pndm.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3

    def test_stable_diffusion_ddim(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device)
        sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_text2img/stable_diffusion_1_4_ddim.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3

    def test_stable_diffusion_lms(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device)
        sd_pipe.scheduler = LMSDiscreteScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_text2img/stable_diffusion_1_4_lms.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3

    def test_stable_diffusion_euler(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device)
        sd_pipe.scheduler = EulerDiscreteScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_text2img/stable_diffusion_1_4_euler.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3

    def test_stable_diffusion_dpm(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device)
        sd_pipe.scheduler = DPMSolverMultistepScheduler.from_config(sd_pipe.scheduler.config)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_inputs(torch_device)
        inputs["num_inference_steps"] = 25
        image = sd_pipe(**inputs).images[0]

        expected_image = load_numpy(
            "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
            "/stable_diffusion_text2img/stable_diffusion_1_4_dpm_multi.npy"
        )
        max_diff = np.abs(expected_image - image).max()
        assert max_diff < 1e-3