Path: blob/main/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py
# coding=utf-8
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import unittest

import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

from diffusers import (
    AutoencoderKL,
    DDIMScheduler,
    EulerAncestralDiscreteScheduler,
    LMSDiscreteScheduler,
    PNDMScheduler,
    StableDiffusionPanoramaPipeline,
    UNet2DConditionModel,
)
from diffusers.utils import slow, torch_device
from diffusers.utils.testing_utils import require_torch_gpu, skip_mps

from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
from ...test_pipelines_common import PipelineTesterMixin


# Disable TF32 matmuls so GPU numerics match the reference slices exactly.
torch.backends.cuda.matmul.allow_tf32 = False


@skip_mps
class StableDiffusionPanoramaPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = StableDiffusionPanoramaPipeline
    params = TEXT_TO_IMAGE_PARAMS
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS

    def get_dummy_components(self):
        # Tiny UNet/VAE/CLIP stack so the fast tests run on CPU in seconds.
        torch.manual_seed(0)
        unet = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
        )
        scheduler = DDIMScheduler()
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
        )
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
        )
        text_encoder = CLIPTextModel(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        components = {
            "unet": unet,
            "scheduler": scheduler,
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "safety_checker": None,
            "feature_extractor": None,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        generator = torch.manual_seed(seed)
        inputs = {
            "prompt": "a photo of the dolomites",
            "generator": generator,
            # Setting height and width to None to prevent OOMs on CPU.
            "height": None,
            "width": None,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "output_type": "numpy",
        }
        return inputs

    def test_stable_diffusion_panorama_default_case(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPanoramaPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)

        expected_slice = np.array([0.5101, 0.5006, 0.4962, 0.3995, 0.3501, 0.4632, 0.5339, 0.525, 0.4878])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_panorama_negative_prompt(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionPanoramaPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        negative_prompt = "french fries"
        output = sd_pipe(**inputs, negative_prompt=negative_prompt)
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)

        expected_slice = np.array([0.5326, 0.5009, 0.5074, 0.4133, 0.371, 0.464, 0.5432, 0.5429, 0.4896])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_panorama_euler(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        components["scheduler"] = EulerAncestralDiscreteScheduler(
            beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"
        )
        sd_pipe = StableDiffusionPanoramaPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)

        expected_slice = np.array(
            [0.48235387, 0.5423796, 0.46016198, 0.5377287, 0.5803722, 0.4876525, 0.5515428, 0.5045897, 0.50709957]
        )

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_panorama_pndm(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        components["scheduler"] = PNDMScheduler()
        sd_pipe = StableDiffusionPanoramaPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        # The pipeline does not support PNDMScheduler, so it should raise a ValueError.
        with self.assertRaises(ValueError):
            _ = sd_pipe(**inputs).images
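# Background for the slow tests below: StableDiffusionPanoramaPipeline is based
# on MultiDiffusion (https://multidiffusion.github.io/), which denoises a wide
# latent canvas by running the UNet over overlapping 64x64 latent windows and
# averaging the overlapping predictions at each step. With the pipeline's
# default height=512 / width=2048, the latent canvas has shape (1, 4, 64, 256),
# which is why the shape asserts below differ from the 64x64 fast tests above.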
@slow
@require_torch_gpu
class StableDiffusionPanoramaSlowTests(unittest.TestCase):
    def tearDown(self):
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def get_inputs(self, seed=0):
        generator = torch.manual_seed(seed)
        inputs = {
            "prompt": "a photo of the dolomites",
            "generator": generator,
            "num_inference_steps": 3,
            "guidance_scale": 7.5,
            "output_type": "numpy",
        }
        return inputs

    def test_stable_diffusion_panorama_default(self):
        model_ckpt = "stabilityai/stable-diffusion-2-base"
        scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler")
        pipe = StableDiffusionPanoramaPipeline.from_pretrained(model_ckpt, scheduler=scheduler, safety_checker=None)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs()
        image = pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 2048, 3)

        expected_slice = np.array(
            [
                0.36968392,
                0.27025372,
                0.32446766,
                0.28379387,
                0.36363274,
                0.30733347,
                0.27100027,
                0.27054125,
                0.25536096,
            ]
        )

        assert np.abs(expected_slice - image_slice).max() < 1e-2

    def test_stable_diffusion_panorama_k_lms(self):
        pipe = StableDiffusionPanoramaPipeline.from_pretrained(
            "stabilityai/stable-diffusion-2-base", safety_checker=None
        )
        pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs()
        image = pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 2048, 3)

        expected_slice = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])

        assert np.abs(expected_slice - image_slice).max() < 1e-3
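    # The intermediate-state test below uses the diffusers callback API:
    # passing callback=fn and callback_steps=1 to the pipeline call invokes
    # fn(step, timestep, latents) once per denoising step, exposing the full
    # (1, 4, 64, 256) latent canvas so intermediate values can be checked
    # against reference slices before decoding.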
    def test_stable_diffusion_panorama_intermediate_state(self):
        number_of_steps = 0

        def callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None:
            callback_fn.has_been_called = True
            nonlocal number_of_steps
            number_of_steps += 1
            if step == 1:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 256)
                latents_slice = latents[0, -3:, -3:, -1]

                expected_slice = np.array(
                    [
                        0.18681869,
                        0.33907816,
                        0.5361276,
                        0.14432865,
                        -0.02856611,
                        -0.73941123,
                        0.23397987,
                        0.47322682,
                        -0.37823164,
                    ]
                )
                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2
            elif step == 2:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 256)
                latents_slice = latents[0, -3:, -3:, -1]

                expected_slice = np.array(
                    [
                        0.18539645,
                        0.33987248,
                        0.5378559,
                        0.14437142,
                        -0.02455261,
                        -0.7338317,
                        0.23990755,
                        0.47356272,
                        -0.3786505,
                    ]
                )

                assert np.abs(latents_slice.flatten() - expected_slice).max() < 5e-2

        callback_fn.has_been_called = False

        model_ckpt = "stabilityai/stable-diffusion-2-base"
        scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler")
        pipe = StableDiffusionPanoramaPipeline.from_pretrained(model_ckpt, scheduler=scheduler, safety_checker=None)
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs()
        pipe(**inputs, callback=callback_fn, callback_steps=1)
        assert callback_fn.has_been_called
        assert number_of_steps == 3

    def test_stable_diffusion_panorama_pipeline_with_sequential_cpu_offloading(self):
        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        model_ckpt = "stabilityai/stable-diffusion-2-base"
        scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler")
        pipe = StableDiffusionPanoramaPipeline.from_pretrained(model_ckpt, scheduler=scheduler, safety_checker=None)
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing(1)
        pipe.enable_sequential_cpu_offload()

        inputs = self.get_inputs()
        _ = pipe(**inputs)

        mem_bytes = torch.cuda.max_memory_allocated()
        # make sure that less than 5.5 GB is allocated
        assert mem_bytes < 5.5 * 10**9