
Exploring simple optimizations for Stable Diffusion XL

!nvidia-smi
!pip install git+https://github.com/huggingface/diffusers -q
!pip install transformers accelerate -q

Unoptimized setup

  • FP32 computation

  • Default attention processor

from diffusers import StableDiffusionXLPipeline

pipe = StableDiffusionXLPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
pipe = pipe.to("cuda")
pipe.unet.set_default_attn_processor()
import time

import torch

num_iterations = 3
num_inference_steps = 25
prompt = "a photo of an astronaut riding a horse on mars"
num_images_per_prompt = 4


def bytes_to_giga_bytes(bytes):
    return bytes / 1024 / 1024 / 1024


def timeit(
    pipeline,
    prompt_embeds=None,
    negative_prompt_embeds=None,
    pooled_prompt_embeds=None,
    negative_pooled_prompt_embeds=None,
):
    if prompt_embeds is None:
        call_args = dict(
            prompt=prompt,
            num_images_per_prompt=num_images_per_prompt,
            num_inference_steps=num_inference_steps,
        )
    else:
        call_args = dict(
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            pooled_prompt_embeds=pooled_prompt_embeds,
            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
            num_images_per_prompt=num_images_per_prompt,
            num_inference_steps=num_inference_steps,
        )
    for i in range(num_iterations):
        start = time.time_ns()
        _ = pipeline(**call_args)
        end = time.time_ns()
        if i == num_iterations - 1:
            print(f"Execution time -- {(end - start) / 1e6:.1f} ms\n")
    print(
        f"Max memory allocated: {bytes_to_giga_bytes(torch.cuda.max_memory_allocated())} GB"
    )
timeit(pipe)
import gc

import torch


def flush():
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()


del pipe
flush()

Just FP16

pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
)
pipe = pipe.to("cuda")
pipe.unet.set_default_attn_processor()
timeit(pipe)
del pipe
flush()

FP16 + SDPA

pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
)
pipe = pipe.to("cuda")
timeit(pipe)
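On PyTorch 2.x, recent diffusers releases use scaled dot-product attention (SDPA) automatically, which is why the cell above simply drops the set_default_attn_processor() call from the baseline. If you want to opt into it explicitly, here is a minimal sketch (assuming a loaded pipe and a diffusers version that exposes AttnProcessor2_0, as current releases do):

from diffusers.models.attention_processor import AttnProcessor2_0

# Explicitly select the SDPA-backed attention processor.
# On PyTorch 2.x this is already the default, so the call is effectively a no-op.
pipe.unet.set_attn_processor(AttnProcessor2_0())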
del pipe
flush()

From here on, we refer to "FP16 + SDPA" as the default setting.

Default + torch.compile()

pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
)
pipe = pipe.to("cuda")
pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
timeit(pipe)
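Only the UNet is compiled above, since it dominates the runtime. As an optional, unbenchmarked sketch (using the same pipe object as above), the VAE's decode step can be wrapped the same way. Keep in mind that the first call after torch.compile() pays the compilation cost, which is why timeit() reports the last of its three iterations.

# Optional: also compile the VAE decoder (not part of the benchmark above).
pipe.vae.decode = torch.compile(pipe.vae.decode, mode="reduce-overhead", fullgraph=True)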
del pipe
flush()

Default + Model CPU Offloading

Here the focus is on memory optimization rather than inference speed. Note that the pipeline is not moved to CUDA manually; enable_model_cpu_offload() manages device placement, moving each component to the GPU only while it is needed.

pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()
timeit(pipe)
del pipe
flush()

Default + Sequential CPU Offloading

pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
)
pipe.enable_sequential_cpu_offload()
timeit(pipe)
del pipe
flush()

Default + VAE Slicing

VAE slicing is specifically suited to reducing the memory needed to decode latents into higher-resolution images without compromising inference speed too much: the decoder processes the batch one image at a time instead of all at once.

pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
)
pipe = pipe.to("cuda")
pipe.enable_vae_slicing()
timeit(pipe)
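A related memory lever that this notebook does not benchmark is VAE tiling, which decodes the latents in overlapping tiles and is mainly useful at resolutions well above the default 1024x1024. A minimal sketch, assuming the same pipe as above:

# Not benchmarked here: tile the VAE decode for very large outputs.
pipe.enable_vae_tiling()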
del pipe
flush()

Default + VAE Slicing + Sequential CPU Offloading

pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
)
pipe.enable_sequential_cpu_offload()
pipe.enable_vae_slicing()
timeit(pipe)
del pipe
flush()

Default + Precomputing text embeddings

import torch
from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer

pipe_id = "stabilityai/stable-diffusion-xl-base-1.0"
torch_dtype = torch.float16

# Load the text encoders and tokenizers.
text_encoder = CLIPTextModel.from_pretrained(pipe_id, subfolder="text_encoder", torch_dtype=torch.float16).to("cuda")
tokenizer = CLIPTokenizer.from_pretrained(pipe_id, subfolder="tokenizer")
text_encoder_2 = CLIPTextModelWithProjection.from_pretrained(pipe_id, subfolder="text_encoder_2", torch_dtype=torch.float16).to("cuda")
tokenizer_2 = CLIPTokenizer.from_pretrained(pipe_id, subfolder="tokenizer_2")
def encode_prompt(tokenizers, text_encoders, prompt: str, negative_prompt: str = None):
    device = text_encoders[0].device

    if isinstance(prompt, str):
        prompt = [prompt]
    batch_size = len(prompt)

    prompt_embeds_list = []
    for tokenizer, text_encoder in zip(tokenizers, text_encoders):
        text_inputs = tokenizer(
            prompt,
            padding="max_length",
            max_length=tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        )

        text_input_ids = text_inputs.input_ids
        prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)

        pooled_prompt_embeds = prompt_embeds[0]
        prompt_embeds = prompt_embeds.hidden_states[-2]
        prompt_embeds_list.append(prompt_embeds)

    prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)

    if negative_prompt is None:
        negative_prompt_embeds = torch.zeros_like(prompt_embeds)
        negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
    else:
        negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt

        negative_prompt_embeds_list = []
        for tokenizer, text_encoder in zip(tokenizers, text_encoders):
            uncond_input = tokenizer(
                negative_prompt,
                padding="max_length",
                max_length=tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
            )

            negative_prompt_embeds = text_encoder(uncond_input.input_ids.to(device), output_hidden_states=True)

            negative_pooled_prompt_embeds = negative_prompt_embeds[0]
            negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
            negative_prompt_embeds_list.append(negative_prompt_embeds)

        negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)

    return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
tokenizers = [tokenizer, tokenizer_2]
text_encoders = [text_encoder, text_encoder_2]

(
    prompt_embeds,
    negative_prompt_embeds,
    pooled_prompt_embeds,
    negative_pooled_prompt_embeds,
) = encode_prompt(tokenizers, text_encoders, prompt)
del text_encoder, text_encoder_2, tokenizer, tokenizer_2
flush()
pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    text_encoder=None,
    text_encoder_2=None,
    tokenizer=None,
    tokenizer_2=None,
    torch_dtype=torch.float16,
)
pipe = pipe.to("cuda")

timeit(
    pipe,
    prompt_embeds,
    negative_prompt_embeds,
    pooled_prompt_embeds,
    negative_pooled_prompt_embeds,
)
del pipe
flush()

Default + Tiny Autoencoder

This is better suited for generating (almost) instant previews. How "instant" that is depends on the GPU; on an A10G, for example, near-instant decoding is achievable.

from diffusers import AutoencoderTiny

pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
)
pipe.vae = AutoencoderTiny.from_pretrained("madebyollin/taesdxl", torch_dtype=torch.float16)
pipe = pipe.to("cuda")
timeit(pipe)