Exploring simple optimizations for Stable Diffusion XL
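The setup cells are not preserved in this copy; below is a minimal sketch of what they might contain, assuming recent versions of diffusers and PyTorch on a CUDA GPU. The prompt string and the flush()/benchmark() helpers are illustrative, not taken from the original notebook.

In [ ]:
import gc
import time

import torch

# Illustrative prompt reused throughout the notebook.
PROMPT = "a photo of an astronaut riding a horse on mars"

def flush():
    """Release cached GPU memory between experiments."""
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

def benchmark(pipe, num_runs=3, steps=30):
    """Rough helper: average latency over a few runs plus peak VRAM."""
    _ = pipe(PROMPT, num_inference_steps=steps)  # warmup
    start = time.time()
    for _ in range(num_runs):
        _ = pipe(PROMPT, num_inference_steps=steps)
    latency = (time.time() - start) / num_runs
    peak_gb = torch.cuda.max_memory_allocated() / 1024**3
    print(f"avg latency: {latency:.2f} s | peak memory: {peak_gb:.2f} GB")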
Unoptimized setup
FP32 computation
Default attention processor
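A sketch of what this configuration might look like: SDXL loaded in full FP32 precision with the vanilla attention processor forced on the UNet and VAE. benchmark() and flush() are the hypothetical helpers from the setup sketch above.

In [ ]:
from diffusers import DiffusionPipeline

# Full-precision load: no torch_dtype argument, so weights stay in FP32.
pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")

# Force the vanilla attention processor so no fused SDPA kernels are used.
pipe.unet.set_default_attn_processor()
pipe.vae.set_default_attn_processor()
pipe = pipe.to("cuda")

benchmark(pipe)
flush()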
Just FP16
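A sketch of the FP16-only variant: the same pipeline loaded with half-precision weights, still keeping the vanilla attention processor so the effect of FP16 is isolated.

In [ ]:
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
    variant="fp16",
)
pipe.unet.set_default_attn_processor()
pipe.vae.set_default_attn_processor()
pipe = pipe.to("cuda")

benchmark(pipe)
flush()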
FP16 + SDPA
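A sketch of the FP16 + SDPA variant: with PyTorch 2.x, diffusers uses the scaled-dot-product-attention processor by default, so it is enough to simply not override the attention processor.

In [ ]:
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
    variant="fp16",
).to("cuda")
# No set_default_attn_processor() call here: the default processor already uses
# torch.nn.functional.scaled_dot_product_attention on PyTorch 2.x.

benchmark(pipe)
flush()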
From here on, we refer to "FP16 + SDPA" as the default setting.
Default + torch.compile()
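A sketch, assuming PyTorch 2.x: the UNet (the heaviest component) is compiled with torch.compile() on top of the default FP16 + SDPA setting. The first call pays the compilation cost; later calls are faster.

In [ ]:
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
    variant="fp16",
).to("cuda")

# Compile the UNet for faster repeated inference.
pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)

benchmark(pipe)
flush()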
Default + Model CPU Offloading
Here we focus more on memory optimization than on inference speed.
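A sketch of model-level CPU offloading: whole sub-models (text encoders, UNet, VAE) are moved to the GPU only while they are needed. Note that the pipeline is not moved to CUDA manually; enable_model_cpu_offload() handles device placement.

In [ ]:
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
    variant="fp16",
)
# Do not call pipe.to("cuda") here; offloading manages device placement itself.
pipe.enable_model_cpu_offload()

benchmark(pipe)
flush()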
Default + Sequential CPU Offloading
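A sketch of sequential CPU offloading, which offloads at the submodule level and therefore saves more memory than model offloading, at a substantial cost in speed.

In [ ]:
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
    variant="fp16",
)
pipe.enable_sequential_cpu_offload()

benchmark(pipe)
flush()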
Default + VAE Slicing
This is specifically suited to reducing the memory needed to decode latents into higher-resolution images without compromising too much on inference speed.
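A sketch of VAE slicing on top of the default setting: the VAE decodes the latents one slice at a time, which mainly matters when several images are generated per call.

In [ ]:
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
    variant="fp16",
).to("cuda")
pipe.enable_vae_slicing()

benchmark(pipe)
flush()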
Default + VAE Slicing + Sequential CPU Offloading
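A sketch combining the two previous memory optimizations.

In [ ]:
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
    variant="fp16",
)
pipe.enable_vae_slicing()
pipe.enable_sequential_cpu_offload()

benchmark(pipe)
flush()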
Default + Precomputing text embeddings
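A sketch of precomputing the prompt embeddings once with encode_prompt() and reusing them, so the two SDXL text encoders do not need to stay on the GPU during generation. The memory-freeing step shown here is an assumption, not necessarily how the original notebook handled it.

In [ ]:
import gc

import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
    variant="fp16",
).to("cuda")

# Encode the prompt once with both text encoders.
with torch.no_grad():
    (
        prompt_embeds,
        negative_prompt_embeds,
        pooled_prompt_embeds,
        negative_pooled_prompt_embeds,
    ) = pipe.encode_prompt(prompt=PROMPT, device="cuda", do_classifier_free_guidance=True)

# Assumption: drop the text encoders to free memory, since the embeddings are reused.
pipe.text_encoder = None
pipe.text_encoder_2 = None
gc.collect()
torch.cuda.empty_cache()

# Generate from the precomputed embeddings instead of a raw prompt string.
image = pipe(
    prompt_embeds=prompt_embeds,
    negative_prompt_embeds=negative_prompt_embeds,
    pooled_prompt_embeds=pooled_prompt_embeds,
    negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
    num_inference_steps=30,
).images[0]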
Default + Tiny Autoencoder
This is better suited for generating (almost) instant previews. The "instant" part is, of course, GPU-dependent; on an A10G, for example, it is achievable.
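A sketch using the Tiny Autoencoder for SDXL (taesdxl) as a drop-in replacement for the regular VAE; the checkpoint name is the community-published one on the Hugging Face Hub.

In [ ]:
import torch
from diffusers import AutoencoderTiny, DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
    variant="fp16",
)
# Swap the regular VAE for the much smaller, faster Tiny Autoencoder.
pipe.vae = AutoencoderTiny.from_pretrained("madebyollin/taesdxl", torch_dtype=torch.float16)
pipe = pipe.to("cuda")

benchmark(pipe)
flush()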