Path: tests/pipelines/text_to_video/test_text_to_video.py
# coding=utf-8
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

from diffusers import (
    AutoencoderKL,
    DDIMScheduler,
    DPMSolverMultistepScheduler,
    TextToVideoSDPipeline,
    UNet3DConditionModel,
)
from diffusers.utils import load_numpy, skip_mps, slow

from ...pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
from ...test_pipelines_common import PipelineTesterMixin


torch.backends.cuda.matmul.allow_tf32 = False


@skip_mps
class TextToVideoSDPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = TextToVideoSDPipeline
    params = TEXT_TO_IMAGE_PARAMS
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
    # No `output_type`.
    required_optional_params = frozenset(
        [
            "num_inference_steps",
            "generator",
            "latents",
            "return_dict",
            "callback",
            "callback_steps",
        ]
    )

    def get_dummy_components(self):
        torch.manual_seed(0)
        unet = UNet3DConditionModel(
            block_out_channels=(32, 64, 64, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "DownBlock3D"),
            up_block_types=("UpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D"),
            cross_attention_dim=32,
            attention_head_dim=4,
        )
        scheduler = DDIMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            clip_sample=False,
            set_alpha_to_one=False,
        )
        torch.manual_seed(0)
        vae = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
            sample_size=128,
        )
        torch.manual_seed(0)
        text_encoder_config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
            hidden_act="gelu",
            projection_dim=512,
        )
        text_encoder = CLIPTextModel(text_encoder_config)
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        components = {
            "unet": unet,
            "scheduler": scheduler,
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
        }
        return components

    def get_dummy_inputs(self, device, seed=0):
        if str(device).startswith("mps"):
            generator = torch.manual_seed(seed)
        else:
            generator = torch.Generator(device=device).manual_seed(seed)
        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
            "output_type": "pt",
        }
        return inputs

    def test_text_to_video_default_case(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        components = self.get_dummy_components()
        sd_pipe = TextToVideoSDPipeline(**components)
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        inputs = self.get_dummy_inputs(device)
        inputs["output_type"] = "np"
        frames = sd_pipe(**inputs).frames
        image_slice = frames[0][-3:, -3:, -1]

        assert frames[0].shape == (64, 64, 3)
        expected_slice = np.array([166, 184, 167, 118, 102, 123, 108, 93, 114])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_attention_slicing_forward_pass(self):
        self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False)

    # (todo): sayakpaul
    @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.")
    def test_inference_batch_consistent(self):
        pass

    # (todo): sayakpaul
    @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.")
    def test_inference_batch_single_identical(self):
        pass

    @unittest.skip(reason="`num_images_per_prompt` argument is not supported for this pipeline.")
    def test_num_images_per_prompt(self):
        pass

    def test_progress_bar(self):
        return super().test_progress_bar()


@slow
@skip_mps
class TextToVideoSDPipelineSlowTests(unittest.TestCase):
    def test_full_model(self):
        expected_video = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text_to_video/video.npy"
        )

        pipe = TextToVideoSDPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b")
        pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
        pipe = pipe.to("cuda")

        prompt = "Spiderman is surfing"
        generator = torch.Generator(device="cpu").manual_seed(0)

        video_frames = pipe(prompt, generator=generator, num_inference_steps=25, output_type="pt").frames
        video = video_frames.cpu().numpy()

        assert np.abs(expected_video - video).mean() < 5e-2

    def test_two_step_model(self):
        expected_video = load_numpy(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/text_to_video/video_2step.npy"
        )

        pipe = TextToVideoSDPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b")
        pipe = pipe.to("cuda")

        prompt = "Spiderman is surfing"
        generator = torch.Generator(device="cpu").manual_seed(0)

        video_frames = pipe(prompt, generator=generator, num_inference_steps=2, output_type="pt").frames
        video = video_frames.cpu().numpy()

        assert np.abs(expected_video - video).mean() < 5e-2
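

# A minimal standalone usage sketch mirroring `test_two_step_model` above, so the
# module can be run directly to exercise the pipeline outside the test runner.
# Illustrative only: it assumes a CUDA device and network access to download the
# `damo-vilab/text-to-video-ms-1.7b` checkpoint, and is not collected by unittest.
if __name__ == "__main__":
    pipe = TextToVideoSDPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b").to("cuda")
    generator = torch.Generator(device="cpu").manual_seed(0)
    # `output_type="np"` returns decoded frames as numpy arrays, as in the fast test.
    frames = pipe("Spiderman is surfing", generator=generator, num_inference_steps=2, output_type="np").frames
    print(f"Generated {len(frames)} frames of shape {frames[0].shape}")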