RNNAttentionで実装する作曲AI-推論編-【2022】

TIP
generative deep learningにて、生成型の機械学習の勉強をしている。その7章で作曲をAIで行う面白いプロジェクトがあったので、学習・推論を行った。前回学習を行ったので、今回は推論について記載する。なお、オンプレのGPU機にて学習・推論を行なっている。

前提#

pythonライブラリmusic21がインストール済みであること。
musescore3がインストール済みであること。
.music21が設定済みであること。
cuda等GPUが使用できるよう設定済みであること。
JupyterNotebookが使用できる環境であること。

ライブラリのインポート#

ルックアップテーブル参照用のpickleや、推論用のRNNAttention、分析用のmatplotlib等をインポートする。

1
import pickle as pkl
2
import time
3
import os
4
import numpy as np
5
import sys
6
from music21 import instrument, note, stream, chord, duration
7
from models.RNNAttention import create_network, sample_with_temp
8
import matplotlib.pyplot as plt

パラメータとフォルダの設定#

# Identification of the training run whose artifacts are reused for inference.
section = 'compose'
run_id = '0006'
music_name = 'cello'

# Everything belonging to the run lives under run/<section>/<run_id>_<music_name>.
run_folder = 'run/{}/{}_{}'.format(section, run_id, music_name)

# Network hyper-parameters passed to create_network when the model is rebuilt;
# the saved weights are loaded into that network afterwards.
embed_size = 100
rnn_units = 256
use_attention = True

ルックアップテーブルのロード#

学習時に保存していたdistincts,lookupsのパラメータをロードする。

# Load the vocabulary tables that were pickled under <run_folder>/store.
# NOTE: pickle.load executes arbitrary code embedded in the file, so only load
# files produced by your own training run.
store_folder = os.path.join(run_folder, 'store')

# 'distincts' holds the distinct note/duration names and their counts.
with open(os.path.join(store_folder, 'distincts'), 'rb') as store_file:
    distincts = pkl.load(store_file)
note_names, n_notes, duration_names, n_durations = distincts

# 'lookups' holds the name <-> integer index mappings in both directions.
with open(os.path.join(store_folder, 'lookups'), 'rb') as store_file:
    lookups = pkl.load(store_file)
note_to_int, int_to_note, duration_to_int, int_to_duration = lookups

モデルのビルド#

学習した重みをロードして、モデルをビルドする。

# Location of the trained weights: <run_folder>/weights/weights.h5
weights_file = 'weights.h5'
weights_folder = os.path.join(run_folder, 'weights')
weight_source = os.path.join(weights_folder, weights_file)

# Rebuild the network from the vocabulary sizes and the model params;
# create_network returns the prediction model and the attention model.
model, att_model = create_network(n_notes, n_durations, embed_size, rnn_units, use_attention)

# Load the trained weights into the prediction model and print its layout.
model.load_weights(weight_source)
model.summary()

1
Model: "model"
2
__________________________________________________________________________________________________
3
Layer (type)                    Output Shape         Param #     Connected to
4
==================================================================================================
5
input_1 (InputLayer)            [(None, None)]       0
6
__________________________________________________________________________________________________
7
input_2 (InputLayer)            [(None, None)]       0
8
__________________________________________________________________________________________________
9
embedding (Embedding)           (None, None, 100)    46100       input_1[0][0]
10
__________________________________________________________________________________________________
11
embedding_1 (Embedding)         (None, None, 100)    1900        input_2[0][0]
12
__________________________________________________________________________________________________
13
concatenate (Concatenate)       (None, None, 200)    0           embedding[0][0]
14
                                                                 embedding_1[0][0]
15
__________________________________________________________________________________________________
16
lstm (LSTM)                     (None, None, 256)    467968      concatenate[0][0]
17
__________________________________________________________________________________________________
18
lstm_1 (LSTM)                   (None, None, 256)    525312      lstm[0][0]
19
__________________________________________________________________________________________________
20
dense (Dense)                   (None, None, 1)      257         lstm_1[0][0]
21
__________________________________________________________________________________________________
22
reshape (Reshape)               (None, None)         0           dense[0][0]
23
__________________________________________________________________________________________________
24
activation (Activation)         (None, None)         0           reshape[0][0]
25
__________________________________________________________________________________________________
26
repeat_vector (RepeatVector)    (None, 256, None)    0           activation[0][0]
27
__________________________________________________________________________________________________
28
permute (Permute)               (None, None, 256)    0           repeat_vector[0][0]
29
__________________________________________________________________________________________________
30
multiply (Multiply)             (None, None, 256)    0           lstm_1[0][0]
31
                                                                 permute[0][0]
32
__________________________________________________________________________________________________
33
lambda (Lambda)                 (None, 256)          0           multiply[0][0]
34
__________________________________________________________________________________________________
35
pitch (Dense)                   (None, 461)          118477      lambda[0][0]
36
__________________________________________________________________________________________________
37
duration (Dense)                (None, 19)           4883        lambda[0][0]
38
==================================================================================================
39
Total params: 1,164,897
40
Trainable params: 1,164,897
41
Non-trainable params: 0
42
__________________________________________________________________________________________________

推論開始時のフレーズを指定する#

あるフレーズから推論を開始するため、最初のフレーズを指定する。何も指定しないことも可能。

# Prediction parameters.
notes_temp = 0.5         # temperature passed to sample_with_temp for pitches
duration_temp = 0.5      # temperature passed to sample_with_temp for durations
max_extra_notes = 50     # upper bound on the number of generated steps
max_seq_len = 32         # the model input window is trimmed to this length
seq_len = 32             # seed is left-padded to this length; None disables padding

# Alternative seed phrases (uncomment one pair to use it instead).
#notes = ['START', 'D3', 'D3', 'E3', 'D3', 'G3', 'F#3','D3', 'D3', 'E3', 'D3', 'G3', 'F#3','D3', 'D3', 'E3', 'D3', 'G3', 'F#3','D3', 'D3', 'E3', 'D3', 'G3', 'F#3']
#durations = [0, 0.75, 0.25, 1, 1, 1, 2, 0.75, 0.25, 1, 1, 1, 2, 0.75, 0.25, 1, 1, 1, 2, 0.75, 0.25, 1, 1, 1, 2]

#notes = ['START', 'F#3', 'G#3', 'F#3', 'E3', 'F#3', 'G#3', 'F#3', 'E3', 'F#3', 'G#3', 'F#3', 'E3','F#3', 'G#3', 'F#3', 'E3', 'F#3', 'G#3', 'F#3', 'E3', 'F#3', 'G#3', 'F#3', 'E3']
#durations = [0, 0.75, 0.25, 1, 1, 1, 2, 0.75, 0.25, 1, 1, 1, 2, 0.75, 0.25, 1, 1, 1, 2, 0.75, 0.25, 1, 1, 1, 2]

notes = ['START', 'C3', 'C3', 'G3', 'G3', 'A3', 'A3', 'G3']
durations = [0, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 1]

# Each note needs exactly one duration. Without this check the two lists would
# be padded by different amounts below and end up silently misaligned.
if len(notes) != len(durations):
    raise ValueError(
        'notes and durations must have the same length ({} != {})'.format(
            len(notes), len(durations)))

if seq_len is not None:
    # Left-pad with START / 0 up to seq_len; a seed that is already longer is
    # left unchanged because multiplying a list by a negative count gives [].
    pad_count = seq_len - len(notes)
    notes = ['START'] * pad_count + notes
    durations = [0] * pad_count + durations

sequence_length = len(notes)

notesの生成#

一連のnotesに基づいて、ニューラルネットワークから新しいnotesを生成する。

# Generate up to max_extra_notes new (note, duration) pairs, feeding every
# sampled pair back into the model input.
#
# Results left for the following cells:
#   prediction_output - [note_name, duration] pairs: the seed, then the samples
#   overall_preds     - array with 128 rows (MIDI pitch number) and one column
#                       per recorded step
#   att_matrix        - attention values, one column per generation step
prediction_output = []
notes_input_sequence = []
durations_input_sequence = []

overall_preds = []

# Encode the seed phrase and record each seed pitch as a one-hot column.
for n, d in zip(notes, durations):
    notes_input_sequence.append(note_to_int[n])
    durations_input_sequence.append(duration_to_int[d])

    prediction_output.append([n, d])

    if n != 'START':
        midi_note = note.Note(n)

        new_note = np.zeros(128)
        new_note[midi_note.pitch.midi] = 1
        overall_preds.append(new_note)

att_matrix = np.zeros(shape=(max_extra_notes + sequence_length, max_extra_notes))

for note_index in range(max_extra_notes):

    prediction_input = [
        np.array([notes_input_sequence]),
        np.array([durations_input_sequence]),
    ]

    notes_prediction, durations_prediction = model.predict(prediction_input, verbose=0)
    if use_attention:
        att_prediction = att_model.predict(prediction_input, verbose=0)[0]
        # Store this step's attention values in column note_index, in the rows
        # that end just before the position of the note being generated.
        row_stop = note_index + sequence_length
        att_matrix[row_stop - len(att_prediction):row_stop, note_index] = att_prediction

    # Spread the predicted pitch distribution over the 128 MIDI pitch numbers.
    new_note = np.zeros(128)

    for idx, n_i in enumerate(notes_prediction[0]):
        try:
            midi_note = note.Note(int_to_note[idx])
        except Exception:
            # Best effort: vocabulary entries that are not a single pitch
            # (e.g. 'START', or the 'rest' / dotted chord tokens handled by the
            # MIDI cell) cannot be turned into a Note and are left out of the
            # heat map. Unlike the previous bare `except`, this no longer
            # swallows KeyboardInterrupt / SystemExit.
            continue
        new_note[midi_note.pitch.midi] = n_i

    overall_preds.append(new_note)

    # Sample the next pitch and duration indices with their temperatures.
    i1 = sample_with_temp(notes_prediction[0], notes_temp)
    i2 = sample_with_temp(durations_prediction[0], duration_temp)

    note_result = int_to_note[i1]
    duration_result = int_to_duration[i2]

    prediction_output.append([note_result, duration_result])

    notes_input_sequence.append(i1)
    durations_input_sequence.append(i2)

    # Keep the model input at no more than max_seq_len entries by dropping the
    # oldest one.
    if len(notes_input_sequence) > max_seq_len:
        notes_input_sequence = notes_input_sequence[1:]
        durations_input_sequence = durations_input_sequence[1:]

    # A sampled 'START' token ends the generation early.
    if note_result == 'START':
        break

overall_preds = np.transpose(np.array(overall_preds))
print('Generated sequence of {} notes'.format(len(prediction_output)))

1
Generated sequence of 82 notes

確信度をプロット#

生成した各notesの確信度を、ヒートマップでプロットする。

# Heat map of overall_preds restricted to rows 35-69 (rows are MIDI pitch
# numbers); the columns are the recorded steps.
pitch_low, pitch_high = 35, 70

fig, ax = plt.subplots(figsize=(15, 15))
ax.set_yticks(list(range(pitch_low, pitch_high)))

plt.imshow(
    overall_preds[pitch_low:pitch_high, :],
    origin="lower",
    cmap='coolwarm',
    vmin=-0.5,
    vmax=0.5,
    extent=[0, max_extra_notes, pitch_low, pitch_high],
)

png

MIDIファイル生成・再生#

予測からの出力をnotesに変換し、notesからMIDIファイルを作成する。

# Convert the generated (note, duration) pairs into a music21 stream and write
# it out as a time-stamped MIDI file under <run_folder>/output.
output_folder = os.path.join(run_folder, 'output')
# Make sure the target folder exists so the write below cannot fail on a
# missing directory.
os.makedirs(output_folder, exist_ok=True)

midi_stream = stream.Stream()

# create note, chord and rest objects based on the values generated by the model
for note_pattern, duration_pattern in prediction_output:
    if note_pattern == 'START':
        # START is only a padding / stop token; it produces no sound.
        continue

    if '.' in note_pattern:
        # pattern is a chord: pitch names joined by '.'
        chord_notes = []
        for current_note in note_pattern.split('.'):
            new_note = note.Note(current_note)
            new_note.duration = duration.Duration(duration_pattern)
            new_note.storedInstrument = instrument.Violoncello()
            chord_notes.append(new_note)
        midi_stream.append(chord.Chord(chord_notes))
        continue

    # pattern is a rest or a single note; both get the same duration and
    # instrument
    new_note = note.Rest() if note_pattern == 'rest' else note.Note(note_pattern)
    new_note.duration = duration.Duration(duration_pattern)
    new_note.storedInstrument = instrument.Violoncello()
    midi_stream.append(new_note)

midi_stream = midi_stream.chordify()
timestr = time.strftime("%Y%m%d-%H%M%S")
midi_stream.write('midi', fp=os.path.join(output_folder, 'output-' + timestr + '.mid'))

1
'run/compose/0006_cello/output/output-20220426-225539.mid'

1
# Display the generated stream using music21's 'midi' show format.
# NOTE(review): in a notebook this is expected to embed a MIDI player - confirm for your environment.
midi_stream.show('midi')

AI生成メロディリンク(SoundCloud)

生成時のAttentionを確認#

それぞれのnotesを推論する際に、直前のどのnotesへAttention(注意)が向けられていたかをプロットする。

## attention plot
# One column per generated note (labelled along the top); the rows are the
# positions of the sequence, starting a couple of seed notes before the first
# generated one.
if use_attention:
    # att_matrix and prediction_output are both laid out relative to
    # sequence_length (the real length of the seed), so offset by that rather
    # than by seq_len, which may be None or shorter than the seed.
    context_rows = min(2, sequence_length)   # seed rows shown above the samples
    first_row = sequence_length - context_rows
    n_generated = len(prediction_output) - sequence_length

    fig, ax = plt.subplots(figsize=(20, 20))

    im = ax.imshow(att_matrix[first_row:, ], cmap='coolwarm', interpolation='nearest')

    # Minor ticks sit on the cell borders ...
    ax.set_xticks(np.arange(-.5, n_generated, 1), minor=True)
    ax.set_yticks(np.arange(-.5, n_generated, 1), minor=True)

    # ... so that the grid drawn on them outlines each cell.
    ax.grid(which='minor', color='black', linestyle='-', linewidth=1)

    # Major ticks: one per generated note (x) and per displayed row (y),
    # labelled with the corresponding note names.
    ax.set_xticks(np.arange(n_generated))
    ax.set_yticks(np.arange(n_generated + context_rows))
    ax.set_xticklabels([pair[0] for pair in prediction_output[sequence_length:]])
    ax.set_yticklabels([pair[0] for pair in prediction_output[first_row:]])

    ax.xaxis.tick_top()

    plt.setp(ax.get_xticklabels(), rotation=90, ha="left", va="center",
             rotation_mode="anchor")

    plt.show()

png