Path: blob/main/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py
# coding=utf-8
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import unittest

import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

from diffusers import (
    AutoencoderKL,
    DDIMScheduler,
    EulerAncestralDiscreteScheduler,
    PNDMScheduler,
    StableDiffusionModelEditingPipeline,
    UNet2DConditionModel,
)
from diffusers.utils import slow, torch_device
from diffusers.utils.testing_utils import require_torch_gpu, skip_mps

from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
from ...test_pipelines_common import PipelineTesterMixin


# Disable TF32 matmuls so the expected slices stay reproducible across GPU generations.
torch.backends.cuda.matmul.allow_tf32 = False


@skip_mps
class StableDiffusionModelEditingPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = StableDiffusionModelEditingPipeline
    params = TEXT_TO_IMAGE_PARAMS
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS

    def get_dummy_components(self):
        # Tiny UNet/VAE/CLIP components keep the fast tests cheap enough to run on CPU.
        torch.manual_seed(0)
        unet = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
        )
        scheduler = DDIMScheduler()
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
        )
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
        )
        text_encoder = CLIPTextModel(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        components = {
            "unet": unet,
            "scheduler": scheduler,
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "safety_checker": None,
            "feature_extractor": None,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        generator = torch.manual_seed(seed)
        inputs = {
            "prompt": "A field of roses",
            "generator": generator,
            # Setting height and width to None to prevent OOMs on CPU.
            "height": None,
            "width": None,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "output_type": "numpy",
        }
        return inputs

    def test_stable_diffusion_model_editing_default_case(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionModelEditingPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1]
        assert image.shape == (1, 64, 64, 3)

        expected_slice = np.array(
            [0.5217179, 0.50658035, 0.5003239, 0.41109088, 0.3595158, 0.46607107, 0.5323504, 0.5335255, 0.49187922]
        )

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_model_editing_negative_prompt(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = StableDiffusionModelEditingPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        negative_prompt = "french fries"
        output = sd_pipe(**inputs, negative_prompt=negative_prompt)
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)

        expected_slice = np.array(
            [0.546259, 0.5108156, 0.50897664, 0.41931948, 0.3748669, 0.4669299, 0.5427151, 0.54561913, 0.49353]
        )

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_model_editing_euler(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        components["scheduler"] = EulerAncestralDiscreteScheduler(
            beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"
        )
        sd_pipe = StableDiffusionModelEditingPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        image = sd_pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)

        expected_slice = np.array(
            [0.47106352, 0.53579676, 0.45798016, 0.514294, 0.56856745, 0.4788605, 0.54380214, 0.5046455, 0.50404465]
        )

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_model_editing_pndm(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        components["scheduler"] = PNDMScheduler()
        sd_pipe = StableDiffusionModelEditingPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        # The pipeline does not support PNDM schedulers, so check that a ValueError is raised.
        with self.assertRaises(ValueError):
            _ = sd_pipe(**inputs).images


@slow
@require_torch_gpu
class StableDiffusionModelEditingSlowTests(unittest.TestCase):
    def tearDown(self):
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def get_inputs(self, seed=0):
        generator = torch.manual_seed(seed)
        inputs = {
            "prompt": "A field of roses",
            "generator": generator,
            "num_inference_steps": 3,
            "guidance_scale": 7.5,
            "output_type": "numpy",
        }
        return inputs

    def test_stable_diffusion_model_editing_default(self):
        model_ckpt = "CompVis/stable-diffusion-v1-4"
        pipe = StableDiffusionModelEditingPipeline.from_pretrained(model_ckpt, safety_checker=None)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        inputs = self.get_inputs()
        image = pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)

        expected_slice = np.array(
            [0.6749496, 0.6386453, 0.51443267, 0.66094905, 0.61921215, 0.5491332, 0.5744417, 0.58075106, 0.5174658]
        )

        assert np.abs(expected_slice - image_slice).max() < 1e-2

        # make sure the image changes after editing the model
        pipe.edit_model("A pack of roses", "A pack of blue roses")

        image = pipe(**inputs).images
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)

        assert np.abs(expected_slice - image_slice).max() > 1e-1

    def test_stable_diffusion_model_editing_pipeline_with_sequential_cpu_offloading(self):
        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

        model_ckpt = "CompVis/stable-diffusion-v1-4"
        scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler")
        pipe = StableDiffusionModelEditingPipeline.from_pretrained(
            model_ckpt, scheduler=scheduler, safety_checker=None
        )
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing(1)
        pipe.enable_sequential_cpu_offload()

        inputs = self.get_inputs()
        _ = pipe(**inputs)

        mem_bytes = torch.cuda.max_memory_allocated()
        # make sure that less than 4.4 GB is allocated
        assert mem_bytes < 4.4 * 10**9
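

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the test suite): a minimal example of
# the editing flow exercised by the slow tests above, kept commented out so the
# module stays import-safe. The prompts, step count, and output filename are
# assumptions chosen for illustration, and a CUDA device is assumed.
#
# from diffusers import StableDiffusionModelEditingPipeline
#
# pipe = StableDiffusionModelEditingPipeline.from_pretrained(
#     "CompVis/stable-diffusion-v1-4", safety_checker=None
# ).to("cuda")
# # Edit the model in place so the source concept renders as the destination
# # concept in subsequent generations.
# pipe.edit_model("A pack of roses", "A pack of blue roses")
# image = pipe("A field of roses", num_inference_steps=50).images[0]
# image.save("blue_roses.png")
# ---------------------------------------------------------------------------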