# coding=utf-8
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

from diffusers import (
    AutoencoderKL,
    DDIMScheduler,
    DPMSolverMultistepScheduler,
    TextToVideoSDPipeline,
    UNet3DConditionModel,
)
from diffusers.utils import load_numpy, skip_mps, slow

from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
from ...test_pipelines_common import PipelineTesterMixin
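

# Run matmuls in full float32 precision (no TF32) so test outputs are deterministic
# across GPU generations.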
torch.backends.cuda.matmul.allow_tf32 = False


@skip_mps
class TextToVideoSDPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = TextToVideoSDPipeline
    params = TEXT_TO_IMAGE_PARAMS
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
    # Unlike most pipelines, `output_type` is not part of the required optional params.
    required_optional_params = frozenset(
        [
            "num_inference_steps",
            "generator",
            "latents",
            "return_dict",
            "callback",
            "callback_steps",
        ]
    )
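
    # The dummy components below are deliberately tiny so the fast tests run in seconds on CPU.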
    def get_dummy_components(self):
        torch.manual_seed(0)
        unet = UNet3DConditionModel(
            block_out_channels=(32, 64, 64, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "DownBlock3D"),
            up_block_types=("UpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D"),
            cross_attention_dim=32,
            attention_head_dim=4,
        )
        scheduler = DDIMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            clip_sample=False,
            set_alpha_to_one=False,
        )
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
            sample_size=128,
        )
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
            hidden_act="gelu",
            projection_dim=512,
        )
        text_encoder = CLIPTextModel(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        components = {
            "unet": unet,
            "scheduler": scheduler,
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
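        # Device-local generators are not supported on MPS, so fall back to the global CPU RNG there.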
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "output_type": "pt",
        }
        return inputs

    def test_text_to_video_default_case(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = TextToVideoSDPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        inputs["output_type"] = "np"
        frames = sd_pipe(**inputs).frames
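
        # Compare a 3x3 patch of the last channel of the first frame against stored reference values.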
        image_slice = frames[0][-3:, -3:, -1]

        assert frames[0].shape == (64, 64, 3)
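        # The expected values act as a regression reference for this fixed-seed configuration.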
        expected_slice = np.array([166, 184, 167, 118, 102, 123, 108, 93, 114])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_attention_slicing_forward_pass(self):
        self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False)

    # TODO (sayakpaul)
    @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.")
    def test_inference_batch_consistent(self):
        pass

    # TODO (sayakpaul)
    @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.")
    def test_inference_batch_single_identical(self):
        pass

    @unittest.skip(reason="`num_images_per_prompt` argument is not supported for this pipeline.")
    def test_num_images_per_prompt(self):
        pass

    def test_progress_bar(self):
        return super().test_progress_bar()
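

# The slow tests below load the full damo-vilab/text-to-video-ms-1.7b checkpoint and
# require a CUDA device; the `@slow` marker keeps them out of the default fast test run.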
@slow
@skip_mps
class TextToVideoSDPipelineSlowTests(unittest.TestCase):
    def test_full_model(self):
        expected_video = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text_to_video/video.npy"
        )

        pipe = TextToVideoSDPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b")
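        # Swap in the multistep DPM-Solver scheduler, which typically reaches good quality in ~25 steps.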
        pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
        pipe = pipe.to("cuda")

        prompt = "Spiderman is surfing"
        generator = torch.Generator(device="cpu").manual_seed(0)

        video_frames = pipe(prompt, generator=generator, num_inference_steps=25, output_type="pt").frames
        video = video_frames.cpu().numpy()

        assert np.abs(expected_video - video).mean() < 5e-2
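
    # A two-step generation exercises the full pipeline end to end while keeping GPU time minimal.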
    def test_two_step_model(self):
        expected_video = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text_to_video/video_2step.npy"
        )

        pipe = TextToVideoSDPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b")
        pipe = pipe.to("cuda")

        prompt = "Spiderman is surfing"
        generator = torch.Generator(device="cpu").manual_seed(0)

        video_frames = pipe(prompt, generator=generator, num_inference_steps=2, output_type="pt").frames
        video = video_frames.cpu().numpy()

        assert np.abs(expected_video - video).mean() < 5e-2