GitHub Repository: tensorflow/docs-l10n
Path: blob/master/site/ko/tutorials/audio/music_generation.ipynb
²⁵¹¹⁸ views

Kernel: Python 3

Copyright 2021 The TensorFlow Authors.

In [ ]:

#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

RNN으로 음악 생성

이 튜토리얼은 간단한 순환 신경망(RNN)을 사용하여 음표를 생성하는 방법을 보여줍니다. MAESTRO 데이터세트의 피아노 MIDI 파일 컬렉션을 사용하여 모델을 훈련합니다. 음표 시퀀스가 주어지면 모델은 시퀀스의 다음 음표를 예측하는 방법을 학습합니다. 모델을 반복적으로 호출하여 더 긴 음표 시퀀스를 생성할 수 있습니다.

이 튜토리얼에는 MIDI 파일을 구문 분석하고 생성하는 전체 코드가 포함되어 있습니다. RNN으로 텍스트 생성하기 튜토리얼에서 RNN의 작동 방식에 대해 자세히 알아볼 수 있습니다.

설정

이 튜토리얼에서는 MIDI 파일을 만들고 구문 분석하기 위해 pretty_midi 라이브러리를 사용하고, Colab에서 오디오 재생을 생성하기 위해 pyfluidsynth를 사용합니다.

In [ ]:

!sudo apt install -y fluidsynth

In [ ]:

!pip install --upgrade pyfluidsynth

In [ ]:

!pip install pretty_midi

In [ ]:

import collections
import datetime
import fluidsynth
import glob
import numpy as np
import pathlib
import pandas as pd
import pretty_midi
import seaborn as sns
import tensorflow as tf

from IPython import display
from matplotlib import pyplot as plt
from typing import Optional

In [ ]:

# Seed both TensorFlow's and NumPy's random number generators with one value
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)

# Sampling rate for audio playback
_SAMPLING_RATE = 16000

Maestro 데이터 세트 다운로드

In [ ]:

# Fetch and unpack the MAESTRO MIDI archive only when `data_dir` is missing
data_dir = pathlib.Path('data/maestro-v2.0.0')
if not data_dir.exists():
  tf.keras.utils.get_file(
      'maestro-v2.0.0-midi.zip',
      origin='https://storage.googleapis.com/magentadata/datasets/maestro/v2.0.0/maestro-v2.0.0-midi.zip',
      extract=True,
      # Cache under ./data; presumably the archive unpacks into `data_dir`
      # (verify against the installed Keras version)
      cache_dir='.', cache_subdir='data',
  )

데이터 세트에는 약 1,200개의 MIDI 파일이 포함되어 있습니다.

In [ ]:

# Sort the matches: glob returns paths in a filesystem-dependent order, and the
# notebook later selects files by position (`filenames[1]`,
# `filenames[:num_files]`), so an unsorted list makes runs irreproducible.
filenames = sorted(glob.glob(str(data_dir/'**/*.mid*')))
print('Number of files:', len(filenames))

MIDI 파일 처리

먼저, pretty_midi 를 사용하여 단일 MIDI 파일을 구문 분석하고 음표 형식을 검사합니다. 아래 MIDI 파일을 다운로드하여 컴퓨터에서 재생하려면 colab에서 files.download(sample_file) 을 작성하여 다운로드할 수 있습니다.

In [ ]:

sample_file = filenames[1]
print(sample_file)

샘플 MIDI 파일에 대한 PrettyMIDI 개체를 생성합니다.

In [ ]:

pm = pretty_midi.PrettyMIDI(sample_file)

샘플 파일을 재생합니다. 재생 위젯을 로드하는 데 몇 초가 걸릴 수 있습니다.

In [ ]:

def display_audio(pm: pretty_midi.PrettyMIDI, seconds=30):
  """Synthesize `pm` and return an audio widget for its opening seconds.

  Args:
    pm: The parsed MIDI object to render with `pm.fluidsynth`.
    seconds: How many seconds of audio to keep from the start.

  Returns:
    An `IPython.display.Audio` object playing at `_SAMPLING_RATE`.
  """
  full_waveform = pm.fluidsynth(fs=_SAMPLING_RATE)
  # Keep only a short prefix of the waveform to mitigate kernel resets
  n_samples = seconds * _SAMPLING_RATE
  return display.Audio(full_waveform[:n_samples], rate=_SAMPLING_RATE)

In [ ]:

display_audio(pm)

MIDI 파일을 검사하십시오. 어떤 종류의 악기가 사용됩니까?

In [ ]:

# Inspect the first instrument track of the sample file
print('Number of instruments:', len(pm.instruments))
instrument = pm.instruments[0]
# Convert the instrument's MIDI program number to a readable name
instrument_name = pretty_midi.program_to_instrument_name(instrument.program)
print('Instrument name:', instrument_name)

음표 추출

In [ ]:

# Show pitch, note name and length for the first ten notes of the track
first_notes = instrument.notes[:10]
for idx, n in enumerate(first_notes):
  name = pretty_midi.note_number_to_name(n.pitch)
  length = n.end - n.start
  print(f'{idx}: pitch={n.pitch}, note_name={name}, duration={length:.4f}')

모델을 훈련할 때 음표를 나타내기 위해 세 가지 변수(pitch, step 및 duration)를 사용합니다. 피치는 MIDI 음표 번호로서의 사운드의 지각적 품질입니다. step는 트랙의 이전 음표 또는 시작 부분에서 경과된 시간입니다. duration은 음표가 재생되는 시간(초)이며 음표 종료 시간과 음표 시작 시간의 차이입니다.

샘플 MIDI 파일에서 음표를 추출합니다.

In [ ]:

def midi_to_notes(midi_file: str) -> pd.DataFrame:
  """Parse a MIDI file into a table of notes from its first instrument.

  Args:
    midi_file: Path of the MIDI file to read.

  Returns:
    A DataFrame with one row per note, ordered by start time, and the columns
    `pitch`, `start`, `end`, `step` (start minus the previous note's start,
    0 for the first note) and `duration` (end minus start).

  Raises:
    IndexError: If the file has no instruments or the first one has no notes.
  """
  track = pretty_midi.PrettyMIDI(midi_file).instruments[0]

  # Order notes chronologically before measuring the gaps between them
  ordered = sorted(track.notes, key=lambda n: n.start)
  first_start = ordered[0].start

  pitches = np.array([n.pitch for n in ordered])
  starts = np.array([n.start for n in ordered])
  ends = np.array([n.end for n in ordered])

  # Prepending the first start makes the first note's step equal to zero
  steps = np.diff(starts, prepend=first_start)

  return pd.DataFrame({
      'pitch': pitches,
      'start': starts,
      'end': ends,
      'step': steps,
      'duration': ends - starts,
  })

In [ ]:

raw_notes = midi_to_notes(sample_file)
raw_notes.head()

음높이 숫자보다는 음표 이름이 해석하기 더 쉬울 수 있으므로, 아래 함수를 사용하여 숫자 음높이 값을 음표 이름으로 변환할 수 있습니다. 음표 이름은 음이름, 임시표 및 옥타브 번호(예: C#4)를 나타냅니다.

In [ ]:

get_note_names = np.vectorize(pretty_midi.note_number_to_name)
sample_note_names = get_note_names(raw_notes['pitch'])
sample_note_names[:10]

악곡을 시각화하려면 트랙 길이(즉, 피아노 롤)에 걸쳐 음표의 피치, 시작 및 끝을 플롯합니다. 처음 100개의 음표로 시작합니다.

In [ ]:

def plot_piano_roll(notes: pd.DataFrame, count: Optional[int] = None):
  """Draw a piano roll: one horizontal segment per note at its pitch.

  Args:
    notes: Table with `pitch`, `start` and `end` columns.
    count: Number of leading notes to draw. A falsy value (`None` or 0)
      draws every note.
  """
  if not count:
    title = 'Whole track'
    count = len(notes['pitch'])
  else:
    title = f'First {count} notes'

  # Each note becomes a two-point line: (start, pitch) -> (end, pitch)
  pitch_pairs = np.stack([notes['pitch'], notes['pitch']], axis=0)
  time_pairs = np.stack([notes['start'], notes['end']], axis=0)
  shown_times = time_pairs[:, :count]
  shown_pitches = pitch_pairs[:, :count]

  plt.figure(figsize=(20, 4))
  plt.plot(shown_times, shown_pitches, color="b", marker=".")
  plt.xlabel('Time [s]')
  plt.ylabel('Pitch')
  _ = plt.title(title)

In [ ]:

plot_piano_roll(raw_notes, count=100)

전체 트랙의 음표를 플로팅합니다.

In [ ]:

plot_piano_roll(raw_notes)

각 음표 변수의 분포를 확인하십시오.

In [ ]:

def plot_distributions(notes: pd.DataFrame, drop_percentile=2.5):
  """Plot side-by-side histograms of pitch, step and duration.

  Args:
    notes: Table with `pitch`, `step` and `duration` columns.
    drop_percentile: Upper tail (in percent) excluded from the step and
      duration bin ranges.
  """
  plt.figure(figsize=[15, 5])

  plt.subplot(1, 3, 1)
  sns.histplot(notes, x="pitch", bins=20)

  # Step and duration share the same treatment: 20 equal-width bins from 0
  # up to the (100 - drop_percentile)th percentile of the column.
  for position, column in enumerate(("step", "duration"), start=2):
    plt.subplot(1, 3, position)
    upper = np.percentile(notes[column], 100 - drop_percentile)
    sns.histplot(notes, x=column, bins=np.linspace(0, upper, 21))

In [ ]:

plot_distributions(raw_notes)

MIDI 파일 생성

아래 기능을 사용하여 음표 목록에서 자신만의 MIDI 파일을 생성할 수 있습니다.

In [ ]:

def notes_to_midi(
  notes: pd.DataFrame,
  out_file: str,
  instrument_name: str,
  velocity: int = 100,  # note loudness
) -> pretty_midi.PrettyMIDI:
  """Build a single-instrument MIDI object from a note table and save it.

  Args:
    notes: Table with `pitch`, `step` and `duration` columns, in play order.
    out_file: Path the MIDI file is written to.
    instrument_name: Instrument name, converted to a MIDI program number.
    velocity: Loudness applied to every note.

  Returns:
    The `PrettyMIDI` object that was written to `out_file`.
  """
  midi = pretty_midi.PrettyMIDI()
  program = pretty_midi.instrument_name_to_program(instrument_name)
  track = pretty_midi.Instrument(program=program)

  # Start times are rebuilt by accumulating each note's step from time 0
  onset = 0
  for _, row in notes.iterrows():
    onset = float(onset + row['step'])
    offset = float(onset + row['duration'])
    track.notes.append(
        pretty_midi.Note(
            velocity=velocity,
            pitch=int(row['pitch']),
            start=onset,
            end=offset,
        ))

  midi.instruments.append(track)
  midi.write(out_file)
  return midi

In [ ]:

example_file = 'example.midi'
example_pm = notes_to_midi(
    raw_notes, out_file=example_file, instrument_name=instrument_name)

생성된 MIDI 파일을 재생하여 차이점이 있는지 확인합니다.

In [ ]:

display_audio(example_pm)

이전과 마찬가지로 files.download(example_file) 를 작성하여 이 파일을 다운로드하고 재생할 수 있습니다.

훈련 데이터 세트 생성

MIDI 파일에서 음표를 추출하여 훈련 데이터세트를 만듭니다. 적은 수의 파일로 시작하고 나중에 더 많은 파일로 실험할 수 있습니다. 몇 분 정도 걸릴 수 있습니다.

In [ ]:

num_files = 5

# Parse the first `num_files` MIDI files and join their notes into one table
all_notes = pd.concat(
    [midi_to_notes(path) for path in filenames[:num_files]])

In [ ]:

n_notes = len(all_notes)
print('Number of notes parsed:', n_notes)

다음으로, 구문 분석된 음표에서 tf.data.Dataset을 만듭니다.

In [ ]:

# Feature columns, in the order used for model inputs and label lookup
key_order = ['pitch', 'step', 'duration']
# Stack the selected columns side by side: one row per note, one column per key
train_notes = np.stack([all_notes[key] for key in key_order], axis=1)

In [ ]:

notes_ds = tf.data.Dataset.from_tensor_slices(train_notes)
notes_ds.element_spec

모델에 음표 시퀀스 배치에 대한 내용을 학습시킵니다. 각 예제는 일련의 음표가 입력 특성으로, 다음 음표가 레이블로 구성됩니다. 이런 식으로 모델은 시퀀스의 다음 음표를 예측하도록 훈련됩니다. 이 과정과 자세한 내용을 설명하는 다이어그램은 RNN을 사용한 텍스트 분류에서 확인할 수 있습니다.

크기가 seq_length인 편리한 window 함수를 사용하여 이 형식으로 특성과 레이블을 만들 수 있습니다.

In [ ]:

def create_sequences(
    dataset: tf.data.Dataset,
    seq_length: int,
    vocab_size = 128,
) -> tf.data.Dataset:
  """Turn a dataset of notes into (input sequence, next-note label) examples.

  Args:
    dataset: Dataset whose elements are single notes.
    seq_length: Number of notes in each input sequence.
    vocab_size: Divisor applied to the pitch feature of the inputs.

  Returns:
    A dataset of `(inputs, labels)` pairs, where `inputs` holds `seq_length`
    notes with scaled pitch and `labels` maps each name in the module-level
    `key_order` to the corresponding value of the following note.
  """
  # One extra note per window: the last one becomes the label
  window_size = seq_length + 1
  windows = dataset.window(
      window_size, shift=1, stride=1, drop_remainder=True)

  # `window` yields a dataset of datasets; batching each inner dataset and
  # flat-mapping turns it into a dataset of tensors.
  sequences = windows.flat_map(
      lambda w: w.batch(window_size, drop_remainder=True))

  def to_example(window):
    target = window[-1]
    labels = {name: target[idx] for idx, name in enumerate(key_order)}
    # Only the input pitch is scaled; step and duration are divided by 1.0
    features = window[:-1] / [vocab_size, 1.0, 1.0]
    return features, labels

  return sequences.map(to_example, num_parallel_calls=tf.data.AUTOTUNE)

각 예의 시퀀스 길이를 설정합니다. 다른 길이(예: 50, 100, 150)로 실험하여 어느 것이 데이터에 가장 적합한지 확인하거나 하이퍼파라미터 조정을 사용합니다. 어휘 크기(vocab_size)는 pretty_midi에서 지원하는 모든 피치를 나타내는 128로 설정됩니다.

In [ ]:

seq_length = 25
vocab_size = 128
seq_ds = create_sequences(notes_ds, seq_length, vocab_size)
seq_ds.element_spec

각 입력 시퀀스의 모양은 (seq_length, 3)이며 여기서는 (25, 3)입니다. 즉, 모델은 25개의 음표를 입력으로 사용하고 다음 음표를 출력으로 예측하는 방법을 학습합니다.

In [ ]:

for seq, target in seq_ds.take(1):
  print('sequence shape:', seq.shape)
  print('sequence elements (first 10):', seq[0: 10])
  print()
  print('target:', target)

예제를 일괄 처리하고 성능을 위해 데이터 세트를 구성합니다.

In [ ]:

batch_size = 64
buffer_size = n_notes - seq_length  # the number of items in the dataset
# Cache BEFORE shuffling. A cache placed after `shuffle`/`batch` stores the
# first epoch's order and batch composition and replays it on every later
# epoch, so the data would never be reshuffled during training.
train_ds = (seq_ds
            .cache()
            .shuffle(buffer_size)
            .batch(batch_size, drop_remainder=True)
            .prefetch(tf.data.AUTOTUNE))

In [ ]:

train_ds.element_spec

모델 생성 및 학습

모델에는 각 음표 변수에 대해 하나씩 3개의 출력이 있습니다. step 및 duration의 경우 모델이 음이 아닌 값을 출력하도록 권장하는 평균 제곱 오차를 기반으로 하는 사용자 정의 손실 함수를 사용합니다.

In [ ]:

def mse_with_positive_pressure(y_true: tf.Tensor, y_pred: tf.Tensor):
  """Mean of squared error plus a penalty on negative predictions.

  Args:
    y_true: Target values.
    y_pred: Predicted values.

  Returns:
    The mean over all elements of `(y_true - y_pred)**2 + 10 * max(-y_pred, 0)`.
  """
  error = y_true - y_pred
  # The penalty is zero for non-negative predictions and grows linearly
  # (slope 10) as a prediction falls below zero.
  negative_penalty = 10 * tf.maximum(-y_pred, 0.0)
  per_element = error ** 2 + negative_penalty
  return tf.reduce_mean(per_element)

In [ ]:

# Each example is `seq_length` notes with 3 features each
input_shape = (seq_length, 3)
learning_rate = 0.005

# A single LSTM layer feeds three separate output heads
inputs = tf.keras.Input(input_shape)
x = tf.keras.layers.LSTM(128)(inputs)

# `pitch` emits 128 logits (its loss below uses from_logits=True);
# `step` and `duration` each emit one scalar.
outputs = {
  'pitch': tf.keras.layers.Dense(128, name='pitch')(x),
  'step': tf.keras.layers.Dense(1, name='step')(x),
  'duration': tf.keras.layers.Dense(1, name='duration')(x),
}

model = tf.keras.Model(inputs, outputs)

# One loss per output head, keyed by the head's name
loss = {
      'pitch': tf.keras.losses.SparseCategoricalCrossentropy(
          from_logits=True),
      'step': mse_with_positive_pressure,
      'duration': mse_with_positive_pressure,
}

optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

model.compile(loss=loss, optimizer=optimizer)

model.summary()

model.evaluate 함수를 테스트해 보면 pitch 손실이 step 및 duration 손실보다 훨씬 큰 것을 알 수 있습니다. loss는 다른 모든 손실을 합산하여 계산된 총 손실이며 현재 pitch 손실이 지배적입니다.

In [ ]:

losses = model.evaluate(train_ds, return_dict=True)
losses

이 균형을 유지하는 한 가지 방법은 loss_weights 인수를 사용하여 컴파일하는 것입니다.

In [ ]:

model.compile(
    loss=loss,
    loss_weights={
        'pitch': 0.05,
        'step': 1.0,
        'duration':1.0,
    },
    optimizer=optimizer,
)

그러면 loss는 개별 손실의 가중 합이 됩니다.

In [ ]:

model.evaluate(train_ds, return_dict=True)

모델을 훈련시킵니다.

In [ ]:

callbacks = [
    # Write weights only (not the full model); `{epoch}` in the path gives
    # each epoch its own checkpoint.
    # NOTE(review): some Keras versions may require a specific filename
    # suffix for weights-only checkpoints; confirm against the installed one.
    tf.keras.callbacks.ModelCheckpoint(
        filepath='./training_checkpoints/ckpt_{epoch}',
        save_weights_only=True),
    # Watch the training loss with a patience of 5 epochs, restoring the
    # best weights seen when training stops early.
    tf.keras.callbacks.EarlyStopping(
        monitor='loss',
        patience=5,
        verbose=1,
        restore_best_weights=True),
]

In [ ]:

%%time
epochs = 50

history = model.fit(
    train_ds,
    epochs=epochs,
    callbacks=callbacks,
)

In [ ]:

plt.plot(history.epoch, history.history['loss'], label='total loss')
plt.show()

음표 생성

모델을 사용하여 음표를 생성하려면 먼저 음표의 시작 시퀀스를 제공해야 합니다. 아래 함수는 일련의 음표에서 하나의 음표를 생성합니다.

음표 피치의 경우 모델이 생성한 음표의 소프트맥스 분포에서 샘플을 추출하며, 단순히 가장 높은 확률의 음표를 선택하지 않습니다. 항상 가장 높은 확률의 음표를 선택하면 반복적인 음표 시퀀스가 생성될 수 있습니다.

temperature 매개변수는 생성된 음표의 무작위성을 제어하는 데 사용할 수 있습니다. RNN을 사용한 텍스트 생성 에서 온도에 대한 자세한 내용을 찾을 수 있습니다.

In [ ]:

def predict_next_note(
    notes: np.ndarray,
    keras_model: tf.keras.Model,
    temperature: float = 1.0) -> tuple[int, float, float]:
  """Generates a note as a tuple of (pitch, step, duration), using a trained sequence model.

  Args:
    notes: The input sequence of notes, without a batch dimension.
    keras_model: Model whose `predict` returns a dict with `pitch` logits
      and `step` / `duration` values.
    temperature: Divisor applied to the pitch logits before sampling; must
      be positive.

  Returns:
    `(pitch, step, duration)`, where `pitch` is sampled from the logits and
    `step` and `duration` are clipped to be non-negative.

  Raises:
    AssertionError: If `temperature` is not positive.
  """

  assert temperature > 0

  # Add batch dimension
  inputs = tf.expand_dims(notes, 0)

  # Query the model passed in by the caller. The previous code called the
  # notebook-global `model`, silently ignoring the `keras_model` argument.
  predictions = keras_model.predict(inputs)
  pitch_logits = predictions['pitch']
  step = predictions['step']
  duration = predictions['duration']

  # Sample the pitch from the temperature-scaled logits instead of taking
  # the arg-max.
  pitch_logits /= temperature
  pitch = tf.random.categorical(pitch_logits, num_samples=1)
  pitch = tf.squeeze(pitch, axis=-1)
  duration = tf.squeeze(duration, axis=-1)
  step = tf.squeeze(step, axis=-1)

  # `step` and `duration` values should be non-negative
  step = tf.maximum(0, step)
  duration = tf.maximum(0, duration)

  return int(pitch), float(step), float(duration)

이제 몇 가지 음표를 생성합니다. temperature와 시작 시퀀스(input_notes)를 바꿔 가며 어떤 일이 일어나는지 확인해 볼 수 있습니다.

In [ ]:

temperature = 2.0
num_predictions = 120

sample_notes = np.stack([raw_notes[key] for key in key_order], axis=1)

# Per-feature divisor: only pitch is scaled, matching the training inputs
pitch_scale = np.array([vocab_size, 1, 1])

# The initial sequence of notes; pitch is normalized similar to training
# sequences
input_notes = sample_notes[:seq_length] / pitch_scale

generated_notes = []
prev_start = 0
for _ in range(num_predictions):
  pitch, step, duration = predict_next_note(input_notes, model, temperature)
  start = prev_start + step
  end = start + duration
  input_note = (pitch, step, duration)
  # Store the raw (unscaled) pitch: this row is what gets written to MIDI
  generated_notes.append((*input_note, start, end))
  # Slide the input window forward by one note. The fed-back note must be
  # scaled like the rest of the window; appending the raw pitch (0-127)
  # would mix it with pitches in the [0, 1) range the model was trained on.
  next_input = np.array(input_note) / pitch_scale
  input_notes = np.concatenate(
      [input_notes[1:], next_input[np.newaxis]], axis=0)
  prev_start = start

generated_notes = pd.DataFrame(
    generated_notes, columns=(*key_order, 'start', 'end'))

In [ ]:

generated_notes.head(10)

In [ ]:

out_file = 'output.mid'
out_pm = notes_to_midi(
    generated_notes, out_file=out_file, instrument_name=instrument_name)
display_audio(out_pm)

아래 두 줄을 추가하여 오디오 파일을 다운로드할 수도 있습니다.

from google.colab import files
files.download(out_file)

생성된 음표를 시각화합니다.

In [ ]:

plot_piano_roll(generated_notes)

pitch, step, duration의 분포를 확인하십시오.

In [ ]:

plot_distributions(generated_notes)

위의 플롯에서 음표 변수 분포의 변화를 확인할 수 있습니다. 모델의 출력과 입력 사이에 피드백 루프가 있기 때문에 모델은 손실을 줄이기 위해 유사한 출력 시퀀스를 생성하는 경향이 있습니다. 이는 특히 MSE 손실을 사용하는 step 및 duration과 관련이 있습니다. pitch의 경우 predict_next_note에서 temperature를 높여 임의성을 높일 수 있습니다.

다음 단계

이 튜토리얼은 RNN을 사용하여 MIDI 파일 데이터세트에서 일련의 음표를 생성하는 방법을 보여주었습니다. 자세히 알아보려면 추가 다이어그램과 설명이 포함된, 밀접하게 관련된 RNN으로 텍스트 생성하기 튜토리얼을 방문하세요.

음악 생성에 RNN을 사용하는 대안 중 하나는 GAN을 사용하는 것입니다. GAN 기반 접근 방식은 오디오를 생성하는 대신 전체 시퀀스를 병렬로 생성할 수 있습니다. Magenta 팀은 GANSynth를 사용하여 이 접근 방식에 대한 인상적인 작업을 수행했습니다. Magenta 프로젝트 웹사이트에서 멋진 음악 및 예술 프로젝트와 오픈 소스 코드를 많이 확인할 수도 있습니다.