Path: blob/main/tests/pipelines/audio_diffusion/test_audio_diffusion.py
# coding=utf-8
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import unittest

import numpy as np
import torch

from diffusers import (
    AudioDiffusionPipeline,
    AutoencoderKL,
    DDIMScheduler,
    DDPMScheduler,
    DiffusionPipeline,
    Mel,
    UNet2DConditionModel,
    UNet2DModel,
)
from diffusers.utils import slow, torch_device
from diffusers.utils.testing_utils import require_torch_gpu


torch.backends.cuda.matmul.allow_tf32 = False


class PipelineFastTests(unittest.TestCase):
    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    @property
    def dummy_unet(self):
        torch.manual_seed(0)
        model = UNet2DModel(
            sample_size=(32, 64),
            in_channels=1,
            out_channels=1,
            layers_per_block=2,
            block_out_channels=(128, 128),
            down_block_types=("AttnDownBlock2D", "DownBlock2D"),
            up_block_types=("UpBlock2D", "AttnUpBlock2D"),
        )
        return model

    @property
    def dummy_unet_condition(self):
        torch.manual_seed(0)
        model = UNet2DConditionModel(
            sample_size=(64, 32),
            in_channels=1,
            out_channels=1,
            layers_per_block=2,
            block_out_channels=(128, 128),
            down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
            up_block_types=("UpBlock2D", "CrossAttnUpBlock2D"),
            cross_attention_dim=10,
        )
        return model

    @property
    def dummy_vqvae_and_unet(self):
        torch.manual_seed(0)
        vqvae = AutoencoderKL(
            sample_size=(128, 64),
            in_channels=1,
            out_channels=1,
            latent_channels=1,
            layers_per_block=2,
            block_out_channels=(128, 128),
            down_block_types=("DownEncoderBlock2D", "DownEncoderBlock2D"),
            up_block_types=("UpDecoderBlock2D", "UpDecoderBlock2D"),
        )
        unet = UNet2DModel(
            sample_size=(64, 32),
            in_channels=1,
            out_channels=1,
            layers_per_block=2,
            block_out_channels=(128, 128),
            down_block_types=("AttnDownBlock2D", "DownBlock2D"),
            up_block_types=("UpBlock2D", "AttnUpBlock2D"),
        )
        return vqvae, unet

    @slow
    def test_audio_diffusion(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        mel = Mel()

        scheduler = DDPMScheduler()
        pipe = AudioDiffusionPipeline(vqvae=None, unet=self.dummy_unet, mel=mel, scheduler=scheduler)
        pipe = pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        generator = torch.Generator(device=device).manual_seed(42)
        output = pipe(generator=generator, steps=4)
        audio = output.audios[0]
        image = output.images[0]

        generator = torch.Generator(device=device).manual_seed(42)
        output = pipe(generator=generator, steps=4, return_dict=False)
        image_from_tuple = output[0][0]

        assert audio.shape == (1, (self.dummy_unet.sample_size[1] - 1) * mel.hop_length)
        assert image.height == self.dummy_unet.sample_size[0] and image.width == self.dummy_unet.sample_size[1]
        image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10]
        image_from_tuple_slice = np.frombuffer(image_from_tuple.tobytes(), dtype="uint8")[:10]
        expected_slice = np.array([69, 255, 255, 255, 0, 0, 77, 181, 12, 127])

        assert np.abs(image_slice.flatten() - expected_slice).max() == 0
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() == 0

        scheduler = DDIMScheduler()
        dummy_vqvae_and_unet = self.dummy_vqvae_and_unet
        pipe = AudioDiffusionPipeline(
            vqvae=self.dummy_vqvae_and_unet[0], unet=dummy_vqvae_and_unet[1], mel=mel, scheduler=scheduler
        )
        pipe = pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        np.random.seed(0)
        raw_audio = np.random.uniform(-1, 1, ((dummy_vqvae_and_unet[0].sample_size[1] - 1) * mel.hop_length,))
        generator = torch.Generator(device=device).manual_seed(42)
        output = pipe(raw_audio=raw_audio, generator=generator, start_step=5, steps=10)
        image = output.images[0]

        assert (
            image.height == self.dummy_vqvae_and_unet[0].sample_size[0]
            and image.width == self.dummy_vqvae_and_unet[0].sample_size[1]
        )
        image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10]
        expected_slice = np.array([120, 117, 110, 109, 138, 167, 138, 148, 132, 121])

        assert np.abs(image_slice.flatten() - expected_slice).max() == 0

        dummy_unet_condition = self.dummy_unet_condition
        pipe = AudioDiffusionPipeline(
            vqvae=self.dummy_vqvae_and_unet[0], unet=dummy_unet_condition, mel=mel, scheduler=scheduler
        )

        np.random.seed(0)
        encoding = torch.rand((1, 1, 10))
        output = pipe(generator=generator, encoding=encoding)
        image = output.images[0]
        image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10]
        expected_slice = np.array([120, 139, 147, 123, 124, 96, 115, 121, 126, 144])

        assert np.abs(image_slice.flatten() - expected_slice).max() == 0


@slow
@require_torch_gpu
class PipelineIntegrationTests(unittest.TestCase):
    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def test_audio_diffusion(self):
        device = torch_device

        pipe = DiffusionPipeline.from_pretrained("teticio/audio-diffusion-ddim-256")
        pipe = pipe.to(device)
        pipe.set_progress_bar_config(disable=None)

        generator = torch.Generator(device=device).manual_seed(42)
        output = pipe(generator=generator)
        audio = output.audios[0]
        image = output.images[0]

        assert audio.shape == (1, (pipe.unet.sample_size[1] - 1) * pipe.mel.hop_length)
        assert image.height == pipe.unet.sample_size[0] and image.width == pipe.unet.sample_size[1]
        image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10]
        expected_slice = np.array([151, 167, 154, 144, 122, 134, 121, 105, 70, 26])

        assert np.abs(image_slice.flatten() - expected_slice).max() == 0
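Usage note: the @slow-marked tests above are skipped by default and, in diffusers' test setup, are enabled via the RUN_SLOW environment variable (e.g. RUN_SLOW=1 python -m pytest tests/pipelines/audio_diffusion/test_audio_diffusion.py); PipelineIntegrationTests additionally requires a CUDA GPU. The sketch below mirrors the integration test as a standalone example of driving the pretrained pipeline outside the test suite; it assumes network access to the Hugging Face Hub and a diffusers version that still ships AudioDiffusionPipeline.

import torch
from diffusers import DiffusionPipeline

# Load the pretrained audio-diffusion checkpoint used by the integration test.
pipe = DiffusionPipeline.from_pretrained("teticio/audio-diffusion-ddim-256")
pipe = pipe.to("cuda" if torch.cuda.is_available() else "cpu")

# A seeded generator on the pipeline's device makes the output reproducible.
generator = torch.Generator(device=pipe.device).manual_seed(42)
output = pipe(generator=generator)

audio = output.audios[0]  # waveform of shape (1, (sample_size[1] - 1) * hop_length)
image = output.images[0]  # PIL image of the generated mel spectrogram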