# Path: blob/main/examples/community/stable_diffusion_comparison.py
# 1448 views
from typing import Any, Callable, Dict, List, Optional, Union

import torch
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer

from diffusers import (
    AutoencoderKL,
    DDIMScheduler,
    DiffusionPipeline,
    LMSDiscreteScheduler,
    PNDMScheduler,
    StableDiffusionPipeline,
    UNet2DConditionModel,
)
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker


# Hub model ids of the four checkpoints that are compared side by side.
pipe1_model_id = "CompVis/stable-diffusion-v1-1"
pipe2_model_id = "CompVis/stable-diffusion-v1-2"
pipe3_model_id = "CompVis/stable-diffusion-v1-3"
pipe4_model_id = "CompVis/stable-diffusion-v1-4"


class StableDiffusionComparisonPipeline(DiffusionPipeline):
    r"""
    Pipeline for parallel comparison of Stable Diffusion v1-v4

    This pipeline inherits from DiffusionPipeline and depends on the use of an Auth Token for
    downloading pre-trained checkpoints from Hugging Face Hub.
    If using Hugging Face Hub, pass the Model ID for Stable Diffusion v1.4 as the previous 3 checkpoints will be loaded
    automatically.

    The components passed to the constructor are used to build the v1.4 pipeline (`pipe4`); the v1.1-v1.3
    pipelines (`pipe1`-`pipe3`) are loaded with `StableDiffusionPipeline.from_pretrained`.

    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
        text_encoder ([`CLIPTextModel`]):
            Frozen text-encoder. Stable Diffusion uses the text portion of
            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
        tokenizer (`CLIPTokenizer`):
            Tokenizer of class
            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
        safety_checker ([`StableDiffusionSafetyChecker`]):
            Classification module that estimates whether generated images could be considered offensive or harmful.
            Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details.
        feature_extractor ([`CLIPImageProcessor`]):
            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
        requires_safety_checker (`bool`, *optional*, defaults to `True`):
            Forwarded unchanged to the `StableDiffusionPipeline` built for v1.4.
    """

    def __init__(
        self,
        vae: AutoencoderKL,
        text_encoder: CLIPTextModel,
        tokenizer: CLIPTokenizer,
        unet: UNet2DConditionModel,
        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPImageProcessor,
        requires_safety_checker: bool = True,
    ):
        super().__init__()

        # v1.1 - v1.3 are always fetched by model id.
        self.pipe1 = StableDiffusionPipeline.from_pretrained(pipe1_model_id)
        self.pipe2 = StableDiffusionPipeline.from_pretrained(pipe2_model_id)
        self.pipe3 = StableDiffusionPipeline.from_pretrained(pipe3_model_id)
        # v1.4 is assembled from the components handed to this constructor.
        self.pipe4 = StableDiffusionPipeline(
            vae=vae,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            unet=unet,
            scheduler=scheduler,
            safety_checker=safety_checker,
            feature_extractor=feature_extractor,
            requires_safety_checker=requires_safety_checker,
        )

        self.register_modules(pipeline1=self.pipe1, pipeline2=self.pipe2, pipeline3=self.pipe3, pipeline4=self.pipe4)

    @property
    def layers(self) -> Dict[str, Any]:
        """Map every public (non-underscore) config key to the attribute of the same name on this pipeline."""
        return {k: getattr(self, k) for k in self.config.keys() if not k.startswith("_")}

    def _sub_pipelines(self) -> List[StableDiffusionPipeline]:
        """Return the four wrapped pipelines in checkpoint order (v1.1, v1.2, v1.3, v1.4)."""
        return [self.pipe1, self.pipe2, self.pipe3, self.pipe4]

    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
        r"""
        Enable sliced attention computation on all four wrapped pipelines.

        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
        in several steps. This is useful to save some memory in exchange for a small speed decrease.

        Args:
            slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
                a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
                `attention_head_dim` must be a multiple of `slice_size`.
        """
        # This wrapper never registers a `unet` of its own (only `pipeline1`..`pipeline4`), so the setting
        # has to be applied through each wrapped pipeline rather than through `self.unet`.
        for pipe in self._sub_pipelines():
            pipe.enable_attention_slicing(slice_size)

    def disable_attention_slicing(self):
        r"""
        Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
        back to computing attention in one step.
        """
        # set slice_size = `None` to disable `attention slicing`
        self.enable_attention_slicing(None)

    @torch.no_grad()
    def text2img_sd1_1(
        self,
        prompt: Union[str, List[str]],
        height: int = 512,
        width: int = 512,
        num_inference_steps: int = 50,
        guidance_scale: float = 7.5,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[torch.Generator] = None,
        latents: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: int = 1,
        **kwargs,
    ):
        """Run the v1.1 pipeline (`self.pipe1`); every argument is forwarded unchanged and its result is returned."""
        return self.pipe1(
            prompt=prompt,
            height=height,
            width=width,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            negative_prompt=negative_prompt,
            num_images_per_prompt=num_images_per_prompt,
            eta=eta,
            generator=generator,
            latents=latents,
            output_type=output_type,
            return_dict=return_dict,
            callback=callback,
            callback_steps=callback_steps,
            **kwargs,
        )

    @torch.no_grad()
    def text2img_sd1_2(
        self,
        prompt: Union[str, List[str]],
        height: int = 512,
        width: int = 512,
        num_inference_steps: int = 50,
        guidance_scale: float = 7.5,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[torch.Generator] = None,
        latents: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: int = 1,
        **kwargs,
    ):
        """Run the v1.2 pipeline (`self.pipe2`); every argument is forwarded unchanged and its result is returned."""
        return self.pipe2(
            prompt=prompt,
            height=height,
            width=width,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            negative_prompt=negative_prompt,
            num_images_per_prompt=num_images_per_prompt,
            eta=eta,
            generator=generator,
            latents=latents,
            output_type=output_type,
            return_dict=return_dict,
            callback=callback,
            callback_steps=callback_steps,
            **kwargs,
        )

    @torch.no_grad()
    def text2img_sd1_3(
        self,
        prompt: Union[str, List[str]],
        height: int = 512,
        width: int = 512,
        num_inference_steps: int = 50,
        guidance_scale: float = 7.5,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[torch.Generator] = None,
        latents: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: int = 1,
        **kwargs,
    ):
        """Run the v1.3 pipeline (`self.pipe3`); every argument is forwarded unchanged and its result is returned."""
        return self.pipe3(
            prompt=prompt,
            height=height,
            width=width,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            negative_prompt=negative_prompt,
            num_images_per_prompt=num_images_per_prompt,
            eta=eta,
            generator=generator,
            latents=latents,
            output_type=output_type,
            return_dict=return_dict,
            callback=callback,
            callback_steps=callback_steps,
            **kwargs,
        )

    @torch.no_grad()
    def text2img_sd1_4(
        self,
        prompt: Union[str, List[str]],
        height: int = 512,
        width: int = 512,
        num_inference_steps: int = 50,
        guidance_scale: float = 7.5,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[torch.Generator] = None,
        latents: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: int = 1,
        **kwargs,
    ):
        """Run the v1.4 pipeline (`self.pipe4`); every argument is forwarded unchanged and its result is returned."""
        return self.pipe4(
            prompt=prompt,
            height=height,
            width=width,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            negative_prompt=negative_prompt,
            num_images_per_prompt=num_images_per_prompt,
            eta=eta,
            generator=generator,
            latents=latents,
            output_type=output_type,
            return_dict=return_dict,
            callback=callback,
            callback_steps=callback_steps,
            **kwargs,
        )

    @staticmethod
    def _split_result(result):
        """Return `(images, nsfw_flags)` from a sub-pipeline result, whether it is a plain tuple or an output object."""
        if isinstance(result, tuple):
            return result[0], result[1]
        # Attribute access (not indexing) so that a `None` nsfw field is still retrievable.
        return result.images, result.nsfw_content_detected

    @torch.no_grad()
    def __call__(
        self,
        prompt: Union[str, List[str]],
        height: int = 512,
        width: int = 512,
        num_inference_steps: int = 50,
        guidance_scale: float = 7.5,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[torch.Generator] = None,
        latents: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: int = 1,
        **kwargs,
    ):
        r"""
        Function invoked when calling the pipeline for generation. This function will generate 4 results as part
        of running all the 4 pipelines for SD1.1-1.4 together in a serial-processing, parallel-invocation fashion.

        Args:
            prompt (`str` or `List[str]`):
                The prompt or prompts to guide the image generation.
            height (`int`, optional, defaults to 512):
                The height in pixels of the generated image.
            width (`int`, optional, defaults to 512):
                The width in pixels of the generated image.
            num_inference_steps (`int`, optional, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            guidance_scale (`float`, optional, defaults to 7.5):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            negative_prompt (`str` or `List[str]`, optional):
                Forwarded unchanged to each wrapped `StableDiffusionPipeline`.
            num_images_per_prompt (`int`, optional, defaults to 1):
                Forwarded unchanged to each wrapped `StableDiffusionPipeline`.
            eta (`float`, optional, defaults to 0.0):
                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                [`schedulers.DDIMScheduler`], will be ignored for others.
            generator (`torch.Generator`, optional):
                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
                deterministic.
            latents (`torch.FloatTensor`, optional):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            output_type (`str`, optional, defaults to `"pil"`):
                The output format of the generate image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, optional, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                plain tuple.
            callback (`Callable`, optional):
                Forwarded unchanged to each wrapped `StableDiffusionPipeline`.
            callback_steps (`int`, optional, defaults to 1):
                Forwarded unchanged to each wrapped `StableDiffusionPipeline`.

        Returns:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
            In both forms the first element (`images`) is a list with four entries, one per checkpoint in the order
            v1.1, v1.2, v1.3, v1.4, each being the images returned by that checkpoint; the second element
            (`nsfw_content_detected`) is the matching list of four per-checkpoint "not-safe-for-work" (nsfw)
            results as reported by each pipeline's `safety_checker`.

        Raises:
            ValueError: If `height` or `width` is not divisible by 8.
        """
        # Validate first so that bad input fails before any weights are moved between devices.
        if height % 8 != 0 or width % 8 != 0:
            raise ValueError(f"`height` and `width` must be divisible by 8 but are {height} and {width}.")

        device = "cuda" if torch.cuda.is_available() else "cpu"
        self.to(device)

        # The exact same arguments are handed to every checkpoint.
        # NOTE(review): the same `generator` object is passed to each run in turn; presumably its state
        # advances between checkpoints, so pass `latents` if identical starting noise is required - confirm.
        shared_args = {
            "prompt": prompt,
            "height": height,
            "width": width,
            "num_inference_steps": num_inference_steps,
            "guidance_scale": guidance_scale,
            "negative_prompt": negative_prompt,
            "num_images_per_prompt": num_images_per_prompt,
            "eta": eta,
            "generator": generator,
            "latents": latents,
            "output_type": output_type,
            "return_dict": return_dict,
            "callback": callback,
            "callback_steps": callback_steps,
        }

        # Run the checkpoints one after another, v1.1 through v1.4.
        runners = (self.text2img_sd1_1, self.text2img_sd1_2, self.text2img_sd1_3, self.text2img_sd1_4)
        images = []
        nsfw_content_detected = []
        for run_checkpoint in runners:
            checkpoint_images, checkpoint_nsfw = self._split_result(run_checkpoint(**shared_args, **kwargs))
            images.append(checkpoint_images)
            nsfw_content_detected.append(checkpoint_nsfw)

        if not return_dict:
            return images, nsfw_content_detected

        # Both fields are passed by keyword: supplying `images` alone leaves `nsfw_content_detected` unset.
        return StableDiffusionPipelineOutput(images=images, nsfw_content_detected=nsfw_content_detected)