generative deep learningにて、生成型の機械学習の勉強をしている。その7章で作曲をAIで行う面白いプロジェクトがあったので、多少の説明と共に記載する。




  • pythonライブラリmusic21がインストール済みであること。
  • musescore3がインストール済みであること。
  • .music21が設定済みであること。
  • cuda等GPUの使用設定済みであること。
  • jupyter notebookが使用できる環境であること。


music21自体はpip install music21でインストール可能。 筆者はlinuxのオンプレ機にインストールしているため、 aptでインストールしたmusescore3のパスを以下のように記入した。

<preference name="musescoreDirectPNGPath" value="/usr/bin/mscore3" />
<preference name="musicxmlPath" value="/usr/bin/mscore3" />


import os
import pickle
import numpy
from music21 import note, chord

from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.utils import plot_model

from models.RNNAttention import get_distinct, create_lookups, prepare_sequences, get_music_list, create_network



  • store:以下四つのデータをためておく
    • distincts: 重複を除いた音程・音の長さの一覧と、それぞれの種類数(get_distinctの戻り値をまとめたもの)
    • durations: 音の長さ
    • lookups: 音程・音の長さと整数IDを相互に変換するための辞書(create_lookupsの戻り値をまとめたもの)
    • notes: 音程
  • output: 出力のMIDIファイルを格納する
  • weights: 重みを格納する
  • viz: モデル構造図(plot_modelの出力画像)の保存先。TF2.2ではplot_modelがエラーになるため、本記事では使用していない


# Run parameters: identify the experiment and where its artifacts are stored.
section = 'compose'
run_id = '0006'
music_name = 'cello'

run_folder = 'run/{}/'.format(section)
run_folder += '_'.join([run_id, music_name])

store_folder = os.path.join(run_folder, 'store')
data_folder = os.path.join('data', music_name)

# Create the run folder and every sub-folder in one pass.  makedirs also
# creates the missing parents ('run/<section>/<run_id>_<music_name>') and,
# with exist_ok=True, is safe to re-run and repairs a partially created tree.
for sub_folder in ('store', 'output', 'weights', 'viz'):
    os.makedirs(os.path.join(run_folder, sub_folder), exist_ok=True)

# 'build' parses the MIDI files from scratch; 'load' reuses the pickled data.
mode = 'build'  # 'load'

# Data parameters
intervals = range(1)  # semitone offsets each score is transposed by
seq_len = 32  # the data is split into small chunks of 32 notes

# Model parameters
embed_size = 100
rnn_units = 256
use_attention = True  # whether to use the attention mechanism


if mode == 'build':
    # Get the list of MIDI files in data_folder together with the parser.
    music_list, parser = get_music_list(data_folder)
    print(len(music_list), 'files in total')

    # Parallel lists: notes[i] is a pitch token and durations[i] its length.
    # They must always grow together so the two stay index-aligned.
    notes = []
    durations = []

    # Parse every file.
    for i, file in enumerate(music_list):
        print(i+1, "Parsing %s" % file)
        original_score = parser.parse(file).chordify()

        for interval in intervals:
            score = original_score.transpose(interval)

            # Prefix each piece with seq_len START tokens (duration 0) so a
            # full-length input window exists before the first real note.
            notes.extend(['START'] * seq_len)
            durations.extend([0] * seq_len)

            # NOTE(review): the bodies of the branches below were missing in
            # the extracted text and have been reconstructed (token plus
            # quarterLength for every element) — confirm against the
            # original notebook.
            for element in score.flat:
                if isinstance(element, note.Note):
                    if element.isRest:
                        notes.append(str(element.name))
                    else:
                        notes.append(str(element.nameWithOctave))
                    durations.append(element.duration.quarterLength)

                if isinstance(element, chord.Chord):
                    # A chord becomes one token: its pitches joined by '.'.
                    notes.append('.'.join(n.nameWithOctave for n in element.pitches))
                    durations.append(element.duration.quarterLength)

    # Cache the parsed data so later runs can use mode = 'load'.
    with open(os.path.join(store_folder, 'notes'), 'wb') as f:
        pickle.dump(notes, f)  # tokens such as 'G2', 'D3', 'B3', ...
    with open(os.path.join(store_folder, 'durations'), 'wb') as f:
        pickle.dump(durations, f)
else:
    # Reuse the data cached by a previous 'build' run.
    # NOTE: pickle is only safe on files this script wrote itself; never
    # load pickles obtained from an untrusted source.
    with open(os.path.join(store_folder, 'notes'), 'rb') as f:
        notes = pickle.load(f)  # tokens such as 'G2', 'D3', 'B3', ...
    with open(os.path.join(store_folder, 'durations'), 'rb') as f:
        durations = pickle.load(f)
36 files in total
1 Parsing data/cello/cs5-4sar.mid
2 Parsing data/cello/cs1-5men.mid
3 Parsing data/cello/cs3-1pre.mid
4 Parsing data/cello/cs6-5gav.mid
5 Parsing data/cello/cs5-3cou.mid
6 Parsing data/cello/cs2-5men.mid
7 Parsing data/cello/cs1-1pre.mid
8 Parsing data/cello/cs3-3cou.mid
9 Parsing data/cello/cs5-5gav.mid
10 Parsing data/cello/cs3-6gig.mid
11 Parsing data/cello/cs2-6gig.mid
12 Parsing data/cello/cs3-4sar.mid
13 Parsing data/cello/cs6-4sar.mid
14 Parsing data/cello/cs6-6gig.mid
15 Parsing data/cello/cs4-3cou.mid
16 Parsing data/cello/cs4-5bou.mid
17 Parsing data/cello/cs5-2all.mid
18 Parsing data/cello/cs4-2all.mid
19 Parsing data/cello/cs2-3cou.mid
20 Parsing data/cello/cs1-6gig.mid
21 Parsing data/cello/cs1-4sar.mid
22 Parsing data/cello/cs3-2all.mid
23 Parsing data/cello/cs6-3cou.mid
24 Parsing data/cello/cs4-4sar.mid
25 Parsing data/cello/cs3-5bou.mid
26 Parsing data/cello/cs2-2all.mid
27 Parsing data/cello/cs6-2all.mid
28 Parsing data/cello/cs6-1pre.mid
29 Parsing data/cello/cs4-1pre.mid
30 Parsing data/cello/cs2-1pre.mid
31 Parsing data/cello/cs5-6gig.mid
32 Parsing data/cello/cs5-1pre.mid
33 Parsing data/cello/cs1-2all.mid
34 Parsing data/cello/cs4-6gig.mid
35 Parsing data/cello/cs2-4sar.mid
36 Parsing data/cello/cs1-3cou.mid

Create the lookup tables

# Collect the distinct note and duration values plus the count returned for
# each, and persist them next to the parsed data.
note_names, n_notes = get_distinct(notes)
duration_names, n_durations = get_distinct(durations)
distincts = [note_names, n_notes, duration_names, n_durations]

distincts_path = os.path.join(store_folder, 'distincts')
with open(distincts_path, 'wb') as distincts_file:
    pickle.dump(distincts, distincts_file)

# Build the forward/backward lookup dictionaries for both vocabularies and
# persist them as well.
note_to_int, int_to_note = create_lookups(note_names)
duration_to_int, int_to_duration = create_lookups(duration_names)
lookups = [note_to_int, int_to_note, duration_to_int, int_to_duration]

lookups_path = os.path.join(store_folder, 'lookups')
with open(lookups_path, 'wb') as lookups_file:
    pickle.dump(lookups, lookups_file)

# lookupsのリストのひとつめに格納したnote_to_intを確認する
# たぶん、音程をどの数字に変換したかということ? 
{'A2': 0,
 'A2.A3': 1,
 'G5': 459,
 'START': 460}
# おそらく、音の長さをどの数字に変換したかを確認している?

{0: 0,
 Fraction(1, 12): 1,
 Fraction(1, 6): 2,
 0.25: 3,
 Fraction(1, 3): 4,
 Fraction(5, 12): 5,
 0.5: 6,
 Fraction(2, 3): 7,
 0.75: 8,
 1.0: 9,
 1.25: 10,
 Fraction(4, 3): 11,
 1.5: 12,
 1.75: 13,
 2.0: 14,
 2.25: 15,
 2.5: 16,
 3.0: 17,
 4.0: 18}


network_input, network_output = prepare_sequences(notes, durations, lookups, distincts, seq_len)

# Show the first training example.
# NOTE(review): assumes network_input and network_output are
# [pitch, duration] pairs whose elements are indexed by sample — confirm
# against prepare_sequences.

# Pitch information
print('pitch input')
print(network_input[0][0])

# Duration information
print('duration input')
print(network_input[1][0])

print('pitch output')
print(network_output[0][0])

print('duration output')
print(network_output[1][0])
pitch input
[460 460 460 460 460 460 460 460 460 460 460 460 460 460 460 460 460 460
 460 460 460 460 460 460 460 460 460 460 460 460 460 460]
duration input
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
pitch output
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0.]
duration output
[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


model, att_model = create_network(n_notes, n_durations, embed_size, rnn_units, use_attention)
# Print the layer-by-layer summary of the model.
model.summary()
Model: "model"
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
input_2 (InputLayer)            [(None, None)]       0                                            
embedding (Embedding)           (None, None, 100)    46100       input_1[0][0]                    
embedding_1 (Embedding)         (None, None, 100)    1900        input_2[0][0]                    
concatenate (Concatenate)       (None, None, 200)    0           embedding[0][0]                  
lstm (LSTM)                     (None, None, 256)    467968      concatenate[0][0]                
lstm_1 (LSTM)                   (None, None, 256)    525312      lstm[0][0]                       
dense (Dense)                   (None, None, 1)      257         lstm_1[0][0]                     
reshape (Reshape)               (None, None)         0           dense[0][0]                      
activation (Activation)         (None, None)         0           reshape[0][0]                    
repeat_vector (RepeatVector)    (None, 256, None)    0           activation[0][0]                 
permute (Permute)               (None, None, 256)    0           repeat_vector[0][0]              
multiply (Multiply)             (None, None, 256)    0           lstm_1[0][0]                     
lambda (Lambda)                 (None, 256)          0           multiply[0][0]                   
pitch (Dense)                   (None, 461)          118477      lambda[0][0]                     
duration (Dense)                (None, 19)           4883        lambda[0][0]                     
Total params: 1,164,897
Trainable params: 1,164,897
Non-trainable params: 0
#Currently errors in TF2.2
#plot_model(model, to_file=os.path.join(run_folder ,'viz/model.png'), show_shapes = True, show_layer_names = True)


weights_folder = os.path.join(run_folder, 'weights')
# To continue training from saved weights, uncomment the following line.
# model.load_weights(os.path.join(weights_folder, "weights.h5"))

# NOTE(review): the callback arguments after the file path were missing in
# the extracted text.  monitor='loss' matches the '{loss}' placeholder in the
# checkpoint name; save_best_only=True is a reconstruction suggested by the
# "weights-improvement" file name — confirm against the original notebook.

# Checkpoint written during training, one file per improvement.
checkpoint1 = ModelCheckpoint(
    os.path.join(weights_folder, "weights-improvement-{epoch:02d}-{loss:.4f}-bigger.h5"),
    monitor='loss',
    save_best_only=True,
)

# Checkpoint that always holds the best weights so far under a fixed name.
checkpoint2 = ModelCheckpoint(
    os.path.join(weights_folder, "weights.h5"),
    monitor='loss',
    save_best_only=True,
)

# Stop early once the training loss has not improved for 10 epochs.
# NOTE: this watches the training loss, not val_loss, so it does not guard
# against overfitting.
early_stopping = EarlyStopping(
    monitor='loss',
    restore_best_weights=True,
    patience=10,
)

callbacks_list = [
    checkpoint1,
    checkpoint2,
    early_stopping,
]

# Save the initial weights so that weights.h5 exists before training starts.
model.save_weights(os.path.join(weights_folder, "weights.h5"))

# Train.  The epoch count is deliberately huge: early stopping ends the run.
model.fit(
    network_input,
    network_output,
    epochs=2000000,
    batch_size=32,
    validation_split=0.2,
    callbacks=callbacks_list,
    shuffle=True,
)

Epoch 1/2000000
720/720 [==============================] - 34s 47ms/step - loss: 4.3821 - pitch_loss: 3.5396 - duration_loss: 0.8425 - val_loss: 3.7145 - val_pitch_loss: 3.1518 - val_duration_loss: 0.5626
Epoch 2/2000000
720/720 [==============================] - 35s 48ms/step - loss: 3.8736 - pitch_loss: 3.2409 - duration_loss: 0.6327 - val_loss: 3.6074 - val_pitch_loss: 3.0458 - val_duration_loss: 0.5616
Epoch 152/2000000
720/720 [==============================] - 35s 48ms/step - loss: 0.2035 - pitch_loss: 0.1727 - duration_loss: 0.0308 - val_loss: 7.7287 - val_pitch_loss: 6.2841 - val_duration_loss: 1.4446
Epoch 153/2000000
720/720 [==============================] - 35s 48ms/step - loss: 0.2063 - pitch_loss: 0.1768 - duration_loss: 0.0295 - val_loss: 7.6420 - val_pitch_loss: 6.2567 - val_duration_loss: 1.3853

