forked from PedroUria/DL-Music_Generation
training_many_deff_two_voices_dict.py
# ------------------------------------------------------------
# This script implements our approach to training the network
# on many songs: the losses of a given sequence index are
# aggregated across all songs, and the network is updated once
# per sequence index (explained in more detail in the final
# report). The hidden and cell states are tracked separately
# for each song.
# ------------------------------------------------------------
import os
import music21 as ms # python3 -m pip install --user music21 for installing on ubuntu instance
import numpy as np
import torch
import torch.nn as nn
from encoder_decoder import encode, decode
from combine import combine
import matplotlib.pyplot as plt
from time import time
from random import randint
# ----------------------------------------------------
# Getting all the paths for the files under classical
# ----------------------------------------------------
path = os.getcwd()[:-4] + "data/classical"
files_by_author_and_subgenre = {}
# https://stackoverflow.com/questions/973473/getting-a-list-of-all-subdirectories-in-the-current-directory
for dire in [x[0] for x in os.walk(path)][1:]:
    # os.walk yields each author directory before its subgenre directories, so by the time a
    # directory with .mid files is reached, key already holds the current author's name
    if ".mid" in " ".join(os.listdir(dire)):  # Subgenre directory with the actual MIDI files
        files_by_author_and_subgenre[key][dire[dire.find(key) + len(key) + 1:]] = [dire + "/" + i for i in os.listdir(dire)]
    else:  # Author directory: stores the author's name and starts an empty subgenre dict
        key = dire[dire.find("classical/")+10:]
        files_by_author_and_subgenre[key] = {}
files_by_author = {}
for author, files in files_by_author_and_subgenre.items():
files_by_author[author] = []
for subgenre_files in files.values():
files_by_author[author] += subgenre_files
files_by_subgenre = {}
for files in files_by_author_and_subgenre.values():
for key, filess in files.items():
if key in files_by_subgenre:
files_by_subgenre[key] += filess
else:
files_by_subgenre[key] = filess
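# Illustrative sketch of the resulting structures (commented out), assuming a layout like
# data/classical/<author>/<subgenre>/<piece>.mid, e.g. data/classical/mendelssohn/romantic/*.mid:
# files_by_author_and_subgenre["mendelssohn"]["romantic"]  # list of .mid paths for that author and subgenre
# files_by_author["mendelssohn"]                           # every .mid path by that author, across subgenres
# files_by_subgenre["romantic"]                            # every .mid path of that subgenre, across authors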
# ------------------------------
# Defining our Loading Functions
# ------------------------------
def get_both_hands(midi_file, time_step=0.05):
"""
Encodes the two hands of a MIDI file and
Stacks them together on horizontally
Components [0:89] will be left hand
And components [89:] will be right hand
:param midi_file: path to the file
:param time_step: Duration of each vector
:return: Encoded matrix with both hands on it
"""
    # Reads the file and encodes each hand separately
    hands = ms.converter.parse(midi_file)
    voice = False  # If there is more than one voice in
    for idx, nt in enumerate(hands[0]):  # the right hand (first part), just
        if type(nt) == ms.stream.Voice:  # takes the first voice
            voice = True
            break
    if voice:
        right_notes = encode(hands[0][idx], time_step=time_step)
    else:
        right_notes = encode(hands[0], time_step=time_step)
    voice = False  # Resets the flag before checking
    for idx, nt in enumerate(hands[1]):  # the left hand (second part), where
        if type(nt) == ms.stream.Voice:  # again only the first voice is taken
            voice = True
            break
    if voice:
        left_notes = encode(hands[1][idx], time_step=time_step)
    else:
        left_notes = encode(hands[1], time_step=time_step)
# Gets rid of the tempo component
right_notes, left_notes = right_notes[:, :-1], left_notes[:, :-1]
# Stacks both hands together
both = np.empty((max([right_notes.shape[0], left_notes.shape[0]]), 178))
left, right = False, False
rest_shortest = np.zeros(89)
rest_shortest[87] = 1
if left_notes.shape[0] > right_notes.shape[0]:
longest = np.copy(left_notes)
left = True
elif right_notes.shape[0] > left_notes.shape[0]:
longest = np.copy(right_notes)
right = True
for idx in range(both.shape[0]):
try:
both[idx, :] = np.hstack((left_notes[idx, :], right_notes[idx, :]))
except IndexError:
if left:
both[idx, :] = np.hstack((longest[idx, :], rest_shortest))
if right:
both[idx, :] = np.hstack((rest_shortest, longest[idx, :]))
return both
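# Minimal usage sketch for get_both_hands() (kept commented out so nothing runs on import);
# the chosen file is just an assumed example taken from the dictionaries built above:
# both = get_both_hands(files_by_author_and_subgenre["mendelssohn"]["romantic"][0], time_step=0.25)
# both.shape  # -> (n_time_steps, 178): columns [0:89] are the left hand, [89:178] the right hand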
def load(author, subgenre, number, time_step=0.25):
"""
Loads the given musical pieces
:param author: Author's name
    :param subgenre: Subgenre
:param number: Number of pieces to load
:param time_step: Duration of each vector
:return: Dictionary containing the encoded files
"""
start = time()
songs = files_by_author_and_subgenre[author][subgenre][:number]
encoded_notes = {}
j = 0
for i in range(len(songs)):
try:
encoded_notes[j] = get_both_hands(songs[i], time_step=time_step) # Encodes both hands of the piece
encoded_notes[j] = torch.from_numpy(encoded_notes[j].reshape(-1, 1, 178)).float().cuda() # as tensor
print("File number", i, "loaded")
j += 1
        except Exception:  # Skips files that music21/encode cannot process
            print("There was an error encoding this file:", songs[i])
print("The loading process took", round(time() - start), "seconds")
print(songs)
return encoded_notes
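# Example call (commented out), in the same spirit as the "Some Attempts" section at the bottom;
# the training and generation functions below read this global notes_encoded dictionary:
# notes_encoded = load("mendelssohn", "romantic", 10, time_step=0.25)
# notes_encoded[0].shape  # -> torch.Size([n_time_steps, 1, 178]), already on the GPU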
# -----------------------------------------
# Building our Neural Network Architecture
# -----------------------------------------
# Code modified from https://github.com/amir-jafari/Deep-Learning/blob/master/Pytorch_/7-RNN/1_RNN.py
class LSTMMusic(nn.Module):
"""
LSTM network that will try to learn the pattern within a series
    of musical pieces. It consists of a single LSTM layer followed
    by a fully connected output layer with a Sigmoid activation function
"""
def __init__(self, input_size, hidden_size):
super(LSTMMusic, self).__init__()
# Input of shape (seq_len, batch_size, input_size)
self.lstm = nn.LSTM(input_size, hidden_size)
# Fully connected Layer at the end, output_size=input_size because we want to predict
self.out = nn.Linear(hidden_size, input_size) # the next note/sequence of notes
# We use a Sigmoid activation function instead of the usual Softmax
# because we want to predict potentially more than one label per vector,
# like for example, when we have a hold or a chord
# Idea from: https://www.depends-on-the-definition.com/guide-to-multi-label-classification-with-neural-networks/
self.act = nn.Sigmoid()
def forward(self, x, h_c_state):
y_pred, h_c_state = self.lstm(x, h_c_state)
return self.act(self.out(y_pred)), h_c_state
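# Quick shape check for LSTMMusic (commented out; assumes a CUDA device, like the rest of the script):
# net = LSTMMusic(input_size=178, hidden_size=178).cuda()
# x = torch.zeros(50, 1, 178).cuda()           # (seq_len, batch_size, input_size)
# h_0 = torch.zeros(1, 1, 178).cuda()          # (num_layers, batch_size, hidden_size)
# c_0 = torch.zeros(1, 1, 178).cuda()
# y_pred, (h_n, c_n) = net(x, (h_0, c_0))
# y_pred.shape                                 # -> torch.Size([50, 1, 178]), one prediction per input step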
# ------------------------------
# Defining our Training Function
# ------------------------------
def train_lstm_loss_only_last(seq_len, hidden_size=178, lr=0.01,
n_epochs=100, use_all_seq=False, use_n_seq=100):
"""
Training function where we compare only the last predicted note to get the loss,
meaning that we want to focus on predicting the next note even if we input
a sequence of many notes. If input_seq = [1, 2, 3], we get [2, 3, 4] as predicted
but only use [4] and its real value to get the loss
    In this approach, for a given sequence number we add together the losses of that sequence
    across all songs, and only once all songs have been seen do we backpropagate and update the
    network. We repeat this for each sequence number and, of course, for each epoch. The difference
    with training_many_deff_two_voices.py is that the hidden and cell states are kept for each
    song in a dictionary
:param seq_len: Number of time steps to input as a sequence
:param hidden_size: Number of neurons on the LSTM hidden layer
:param lr: Learning rate
:param n_epochs: Number of training iterations
:param use_all_seq: If True, uses all the sequences of the pieces. Default: False
    :param use_n_seq: Used when use_all_seq=False; only this number of sequences (the first ones) is used
:return: class instance with learned parameters, loss per sample and loss per epoch
"""
start = time()
net = LSTMMusic(178, hidden_size).cuda()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
loss_func = nn.BCELoss() # Because we are using sigmoid and not softmax, BCELoss is the right choice
h_state, c_state = {}, {}
for i in range(len(notes_encoded)):
h_state[i] = torch.zeros(1, 1, hidden_size).float().cuda() # Initializes the hidden
c_state[i] = torch.zeros(1, 1, hidden_size).float().cuda() # and cell states for each song
    l = []  # Stores the loss per sequence
    lll = []  # Same, but reset to [] after each epoch so its mean can be stored in ll
    ll = []  # Stores the mean loss per epoch
wait_10 = 0 # We will halve the learning rate if the loss does not decrease in the last 10 epochs
len_piece = []
for nts in notes_encoded.values():
len_piece.append(nts.shape[0])
if use_all_seq:
n_seq = min(len_piece) - seq_len - 1
else:
n_seq = use_n_seq # Uses only the first use_n_seq sequences of each piece
for epoch in range(n_epochs):
print("---------- epoch number:", epoch, "----------")
for step in range(n_seq):
loss = 0
for i in range(len(notes_encoded)):
x = notes_encoded[i][step:seq_len+step, :, :]
x.requires_grad = True
# Uses only the next note after input sequence to get the loss
y = notes_encoded[i][seq_len+step:seq_len+step+1, :, :]
y_pred, h_c_state = net(x, (h_state[i], c_state[i]))
y_pred = y_pred[-1].reshape(1, 1, 178) # Uses only the next note after input sequence to get the loss
                # Repacks the hidden state, breaking the connection with the last iteration
h_state[i], c_state[i] = h_c_state[0].data, h_c_state[1].data
loss += loss_func(y_pred, y)
loss = loss/len(notes_encoded)
l.append(loss.data)
lll.append(loss.data.cpu())
optimizer.zero_grad()
loss.backward()
optimizer.step()
ll.append(np.mean(lll))
print(" loss:", ll[-1])
if ll[-1] > np.mean(ll[::-1][:10]) and wait_10 >= 10: # We decrease the learning rate by half
print("Halving learning rate from", lr, "to", lr / 2) # When the loss stops decreasing
lr = lr / 2
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
wait_10 = 0
lll = []
wait_10 += 1
print("\nThe training process took", round(time() - start, 2), "seconds")
return net, l, ll
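# Commented-out usage sketch: train on the first 100 sequences of each loaded piece, comparing
# only the last predicted note of every sequence (notes_encoded must have been loaded first):
# notes_encoded = load("mendelssohn", "romantic", 10)
# net, l, ll = train_lstm_loss_only_last(seq_len=50, hidden_size=178, lr=0.01, n_epochs=100, use_n_seq=100)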
def train_lstm_loss_whole_seq(seq_len, hidden_size=178, lr=0.01,
n_epochs=100, use_all_seq=False, use_n_seq=100):
"""
Training function where we compare all the notes predicted by the network for a given
input sequence to get the loss. If input_seq = [1, 2, 3], we get [2, 3, 4] as predicted
outputs and use all of them together with their true values to compute the loss.
In this approach, we stack the sequences of each song together, sequentially
And compute the loss and update the weights once for each sequence, on each epoch
:param seq_len: Number of time steps to input as a sequence
:param hidden_size: Number of neurons on the LSTM hidden layer
:param lr: Learning rate
:param n_epochs: Number of training iterations
:param use_all_seq: If True, uses all the sequences of the pieces. Default: False
    :param use_n_seq: Used when use_all_seq=False; only this number of sequences (the first ones) is used
:return: class instance with learned parameters, loss per sample and loss per epoch
"""
start = time()
net = LSTMMusic(178, hidden_size).cuda()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
loss_func = nn.BCELoss() # Because we are using sigmoid and not softmax, BCELoss is the right choice
h_state, c_state = {}, {}
for i in range(len(notes_encoded)):
h_state[i] = torch.zeros(1, 1, hidden_size).float().cuda() # Initializes the hidden
c_state[i] = torch.zeros(1, 1, hidden_size).float().cuda() # and cell states for each song
    l = []  # Stores the loss per sequence
    lll = []  # Same, but reset to [] after each epoch so its mean can be stored in ll
    ll = []  # Stores the mean loss per epoch
wait_10 = 0 # We will halve the learning rate if the loss does not decrease in the last 10 epochs
len_piece = []
for nts in notes_encoded.values():
len_piece.append(nts.shape[0])
if use_all_seq:
n_seq = min(len_piece) - seq_len - 1
else:
n_seq = use_n_seq # Uses only the first use_n_seq sequences of each piece
for epoch in range(n_epochs):
print("---------- epoch number:", epoch, "----------")
for step in range(n_seq):
loss = 0
for i in range(len(notes_encoded)):
x = notes_encoded[i][step:seq_len+step, :, :]
x.requires_grad = True
                # Uses the whole input sequence shifted by one time step as the target
y = notes_encoded[i][step+1:seq_len+step+1, :, :]
y_pred, h_c_state = net(x, (h_state[i], c_state[i]))
                # Repacks the hidden state, breaking the connection with the last iteration
h_state[i], c_state[i] = h_c_state[0].data, h_c_state[1].data
loss += loss_func(y_pred, y)
loss = loss/len(notes_encoded)
l.append(loss.data)
lll.append(loss.data.cpu())
optimizer.zero_grad()
loss.backward()
optimizer.step()
ll.append(np.mean(lll))
print(" loss:", ll[-1])
if ll[-1] > np.mean(ll[::-1][:10]) and wait_10 >= 10: # We decrease the learning rate by half
print("Halving learning rate from", lr, "to", lr / 2) # When the loss stops decreasing
lr = lr / 2
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
wait_10 = 0
lll = []
wait_10 += 1
print("\nThe training process took", round(time() - start, 2), "seconds")
return net, l, ll
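# The only difference with train_lstm_loss_only_last() is the target: for a given input slice,
# the loss here is computed on the whole sequence shifted by one time step rather than on the
# single note that follows it. Illustrative slices (commented out):
# x = notes_encoded[i][step:seq_len + step, :, :]                     # input sequence
# y_whole = notes_encoded[i][step + 1:seq_len + step + 1, :, :]       # target in this function
# y_last = notes_encoded[i][seq_len + step:seq_len + step + 1, :, :]  # target in train_lstm_loss_only_last()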
def plot_loss(l, ll):
""" Plots the loss per sample and per epoch"""
plt.plot(range(len(l)), l)
plt.title("Loss for each sample")
plt.ylabel("Loss")
plt.xlabel("Sample on each epoch")
plt.show()
plt.plot(range(len(ll)), ll)
plt.title("Loss for each epoch")
plt.ylabel("Loss")
plt.xlabel("Epoch")
plt.show()
# ---------------------------------
# Defining our Generative Functions
# ---------------------------------
def get_tempo_dim_back(notes, tempo=74):
"""
Adds an extra dimension for the tempo
:param notes: encoded matrix without the tempo dim
:param tempo: value of the tempo to include
:return: Same matrix with tempo dimension, in order
to decode it successfully
"""
c = np.empty((notes.shape[0], notes.shape[1]+1))
for idx in range(notes.shape[0]):
c[idx] = np.hstack((notes[idx], np.array([tempo])))
return c
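# Shape sketch for get_tempo_dim_back() (commented out): an (n, 89) one-hand matrix becomes
# (n, 90) by appending a constant tempo column, which is what decode() expects:
# one_hand = np.zeros((4, 89))
# get_tempo_dim_back(one_hand, tempo=74).shape  # -> (4, 90)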
def ltsm_gen(net, seq_len, file_name, sampling_idx=0, sequence_start=0, n_steps=100, hidden_size=178,
time_step=0.05, changing_note=False, note_stuck=False, remove_extra_rests=True):
"""
Uses the trained LSTM to generate new notes and saves the output to a MIDI file
This approach uses a whole sequence of notes of one of the pieces we used to train
the network, with length seq_len, which should be the same as the one used when training
:param net: Trained LSTM
:param seq_len: Length of input sequence
:param file_name: Name to be given to the generated MIDI file
:param sampling_idx: File to get the input sequence from, out of the pieces used to train the LSTM
:param sequence_start: Index of the starting sequence, default to 0
:param n_steps: Number of vectors to generate
:param hidden_size: Hidden size of the trained LSTM
    :param time_step: Vector duration. Should be the same as the one used in get_both_hands()
:param changing_note: To sample from different sources at some point of the generation
and add this new note to the sequence. This is done in case the generation gets stuck
repeating a particular sequence over and over.
:param note_stuck: To change the note if the generation gets stuck playing the same
note over and over.
:param remove_extra_rests: If the generation outputs a lot of rests in between, use this
:return: None. Just saves the generated music as a .mid file
"""
notes = [] # Will contain a sequence of the predicted notes
x = notes_encoded[sampling_idx][sequence_start:sequence_start+seq_len] # Uses the input sequence
for nt in x: # To start predicting. This will be later removed from
notes.append(nt.cpu().numpy()) # the final output
h_state = torch.zeros(1, 1, hidden_size).float().cuda()
c_state = torch.zeros(1, 1, hidden_size).float().cuda()
print_first = True # To print out a message if every component of a
# predicted vector is less than 0.9
change_note = False
for _ in range(n_steps):
chosen = False # To account for when no dimension's probability is bigger than 0.9
y_pred, h_c_state = net(x, (h_state, c_state)) # Predicts the next notes for all
h_state, c_state = h_c_state[0].data, h_c_state[1].data # the notes in the input sequence
y_pred = y_pred.data # We only care about the last predicted note
y_pred = y_pred[-1] # (next note after last note of input sequence)
        choose = torch.zeros((1, 1, 178))  # Converts the probabilities into the actual note vector
y_pred_left = y_pred[:, :89]
for idx in range(89):
if y_pred_left[:, idx] > 0.9:
choose[:, :, idx] = 1
chosen = True
if y_pred_left[:, -1] >= 0.7: # We add a hold condition, in case the probability
choose[:, :, 88] = 1 # of having a hold is close to the one of having the pitch
if not chosen:
if print_first:
print("\nPrinting out the maximum prob of all notes for a time step",
"when this maximum prob is less than 0.9")
print_first = False
pred_note_idx = np.argmax(y_pred_left.cpu())
choose[:, :, pred_note_idx] = 1
if pred_note_idx != 87: # No holds for rests
if y_pred_left[:, pred_note_idx] - y_pred_left[:, -1] <= 0.2: # Hold condition
choose[:, :, 88] = 1
print(_, "left", y_pred_left[:, np.argmax(y_pred_left.cpu())]) # Maximum probability out of all components
y_pred_right = y_pred[:, 89:]
for idx in range(89):
if y_pred_right[:, idx] > 0.9:
choose[:, :, idx + 89] = 1
chosen = True
if y_pred_right[:, -1] >= 0.7:
choose[:, :, -1] = 1
if not chosen:
if print_first:
print("\nPrinting out the maximum prob of all notes for a time step",
"when this maximum prob is less than 0.9")
print_first = False
pred_note_idx = np.argmax(y_pred_right.cpu())
choose[:, :, pred_note_idx + 89] = 1
if pred_note_idx != 87: # No holds for rests
if y_pred_right[:, pred_note_idx] - y_pred_right[:, -1] <= 0.2: # Hold condition
choose[:, :, -1] = 1
print(_, "right",
y_pred_right[:, np.argmax(y_pred_right.cpu())]) # Maximum probability out of all components
x_new = torch.empty(x.shape) # Uses the output of the last time_step
for idx, nt in enumerate(x[1:]): # As the input for the next time_step
x_new[idx] = nt # So the new sequence will be the same past sequence minus the first note
x_new[-1] = choose
x = x_new.cuda() # We will use this new sequence to predict in the next iteration the next note
notes.append(choose.cpu().numpy()) # Saves the predicted note
# Condition so that the generation does not
# get stuck on a particular sequence
if changing_note:
if _ % seq_len == 0:
if sampling_idx >= len(notes_encoded):
sampling_idx = 0
change_note = True
st = randint(1, 100)
if change_note:
x_new[-1] = notes_encoded[sampling_idx][st, :, :]
change_note = False
else:
x_new[-1] = notes_encoded[sampling_idx][0, :, :]
sampling_idx += 1
x = x_new.cuda()
# Condition so that the generation does not
# get stuck on a particular note
if _ > 6 and note_stuck:
if (notes[-1][:, :, 89:] == notes[-2][:, :, 89:]).sum(2)[0][0].numpy() in [88, 89]:
if (notes[-1][:, :, 89:] == notes[-3][:, :, 89:]).sum(2)[0][0].numpy() in [88, 89]:
if (notes[-1][:, :, 89:] == notes[-4][:, :, 89:]).sum(2)[0][0].numpy() in [88, 89]:
if (notes[-1][:, :, 89:] == notes[-5][:, :, 89:]).sum(2)[0][0].numpy() in [88, 89]:
if (notes[-1][:, :, 89:] == notes[-6][:, :, 89:]).sum(2)[0][0].numpy() in [88, 89]:
for m in range(5):
notes.pop(-1)
if sampling_idx >= len(notes_encoded):
sampling_idx = 0
x_new[-1] = notes_encoded[sampling_idx][randint(1, 100), :, :]
x = x_new.cuda()
sampling_idx += 1
# Gets the notes into the correct NumPy array shape
gen_notes = np.empty((len(notes) - seq_len + 1, 178)) # Doesn't use the first predicted notes
for idx, nt in enumerate(notes[seq_len - 1:]): # Because these were sampled from the training data
gen_notes[idx] = nt[0]
# Decodes the generated music
gen_midi_left = decode(get_tempo_dim_back(gen_notes[:, :89], 74), time_step=time_step)
# Gets rid of too many rests
if remove_extra_rests:
stream_left = ms.stream.Stream()
for idx, nt in enumerate(gen_midi_left):
if type(nt) == ms.note.Rest and idx < len(gen_midi_left) - 5:
if nt.duration.quarterLength > 4 * time_step:
print("Removing rest")
continue
if type(gen_midi_left[idx + 4]) == ms.note.Rest:
print("Removing rest")
continue
stream_left.append(nt)
else:
stream_left.append(nt)
else:
stream_left = gen_midi_left
# Same thing for right hand
gen_midi_right = decode(get_tempo_dim_back(gen_notes[:, 89:], 74), time_step=time_step)
if remove_extra_rests:
stream_right = ms.stream.Stream()
for idx, nt in enumerate(gen_midi_right):
if type(nt) == ms.note.Rest and idx < len(gen_midi_right) - 5:
if nt.duration.quarterLength > 4 * time_step:
print("Removing rest")
continue
if type(gen_midi_right[idx + 4]) == ms.note.Rest:
print("Removing rest")
continue
stream_right.append(nt)
else:
stream_right.append(nt)
else:
stream_right = gen_midi_right
# Saves both hands combined as a MIDI file
combine(stream_left, stream_right, file_name + ".mid")
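# Commented-out generation sketch (assumes a trained net and a loaded notes_encoded, as in the
# "Some Attempts" section at the bottom); the output file name is just an example:
# ltsm_gen(net, 50, "generated_both_hands", time_step=0.25, n_steps=1000, sampling_idx=2,
#          changing_note=True, note_stuck=True)  # flags to use if the output loops on one sequence/note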
def ltsm_gen_v2(net, seq_len, file_name, sampling_idx=0, note_pos=0, n_steps=100, hidden_size=178,
time_step=0.05, changing_note=False, note_stuck=False, remove_extra_rests=True):
"""
Uses the trained LSTM to generate new notes and saves the output to a MIDI file
The difference between this and the previous one is that we only use one note as input
And then keep generating notes until we have a sequence of notes of length = seq_len
Once we do, we start appending the generated notes to the final output
:param net: Trained LSTM
:param seq_len: Length of input sequence
:param file_name: Name to be given to the generated MIDI file
:param sampling_idx: File to get the input note from, out of the pieces used to train the LSTM
:param note_pos: Position of the sampled input note in the source piece, default to the first note
:param n_steps: Number of vectors to generate
:param hidden_size: Hidden size of the trained LSTM
    :param time_step: Vector duration. Should be the same as the one used in get_both_hands()
:param changing_note: To sample from different sources at some point of the generation
and add this new note to the sequence. This is done in case the generation gets stuck
repeating a particular sequence over and over.
:param note_stuck: To change the note if the generation gets stuck playing the same
note over and over.
:param remove_extra_rests: If the generation outputs a lot of rests in between, use this
:return: None. Just saves the generated music as a .mid file
"""
notes = [] # Will contain a sequence of the predicted notes
    x = notes_encoded[sampling_idx][note_pos:note_pos+1, :, :]  # Starting note of the piece (at note_pos)
notes.append(x.cpu().numpy()) # Saves the first note
h_state = torch.zeros(1, 1, hidden_size).float().cuda()
c_state = torch.zeros(1, 1, hidden_size).float().cuda()
print_first = True
change_note = False
for _ in range(n_steps):
chosen = False # To account for when no dimension's probability is bigger than 0.9
y_pred, h_c_state = net(x, (h_state, c_state))
h_state, c_state = h_c_state[0].data, h_c_state[1].data
y_pred = y_pred.data
y_pred = y_pred[-1] # We only care about the last predicted note (next note after last note of input sequence)
        choose = torch.zeros((1, 1, 178))  # Converts the probabilities into the actual note vector
y_pred_left = y_pred[:, :89]
for idx in range(89):
if y_pred_left[:, idx] > 0.9:
choose[:, :, idx] = 1
chosen = True
if y_pred_left[:, -1] >= 0.7: # We add a hold condition, in case the probability
choose[:, :, 88] = 1 # of having a hold is close to the one of having the pitch
if not chosen:
if print_first:
print("\nPrinting out the maximum prob of all notes for a time step",
"when this maximum prob is less than 0.9")
print_first = False
pred_note_idx = np.argmax(y_pred_left.cpu())
choose[:, :, pred_note_idx] = 1
if pred_note_idx != 87: # No holds for rests TODO: Run this again to see if it changes, but it shouldn't... I changed 88 to 87
if y_pred_left[:, pred_note_idx] - y_pred_left[:, -1] <= 0.2: # Hold condition
choose[:, :, 88] = 1
print(_, "left", y_pred_left[:, np.argmax(y_pred_left.cpu())]) # Maximum probability out of all components
y_pred_right = y_pred[:, 89:]
for idx in range(89):
if y_pred_right[:, idx] > 0.9:
choose[:, :, idx+89] = 1
chosen = True
if y_pred_right[:, -1] >= 0.7:
choose[:, :, -1] = 1
if not chosen:
if print_first:
print("\nPrinting out the maximum prob of all notes for a time step",
"when this maximum prob is less than 0.9")
print_first = False
pred_note_idx = np.argmax(y_pred_right.cpu())
choose[:, :, pred_note_idx+89] = 1
if pred_note_idx != 87: # No holds for rests
if y_pred_right[:, pred_note_idx] - y_pred_right[:, -1] <= 0.2: # Hold condition
choose[:, :, -1] = 1
print(_, "right", y_pred_right[:, np.argmax(y_pred_right.cpu())]) # Maximum probability out of all components
# If the number of input sequences is shorter than the expected one
if x.shape[0] < seq_len: # We keep adding the predicted notes to this input
x_new = torch.empty((x.shape[0] + 1, x.shape[1], x.shape[2]))
for i in range(x_new.shape[0] - 1):
x_new[i, :, :] = x[i, :, :]
x_new[-1, :, :] = y_pred
x = x_new.cuda()
notes.append(choose)
else: # If we already have enough sequences
x_new = torch.empty(x.shape) # Removes the first note
for idx, nt in enumerate(x[1:]): # of the current sequence
x_new[idx] = nt # And appends the predicted note to the
x_new[-1] = choose # input of sequences
x = x_new.cuda()
notes.append(choose)
# Condition so that the generation does not
# get stuck on a particular sequence
if changing_note:
if _ % seq_len == 0:
if sampling_idx >= len(notes_encoded):
sampling_idx = 0
change_note = True
st = randint(1, 100)
if change_note:
x_new[-1] = notes_encoded[sampling_idx][st, :, :]
change_note = False
else:
x_new[-1] = notes_encoded[sampling_idx][0, :, :]
sampling_idx += 1
x = x_new.cuda()
# Condition so that the generation does not
# get stuck on a particular note
if _ > 6 and note_stuck:
if (notes[-1][:, :, 89:] == notes[-2][:, :, 89:]).sum(2)[0][0].numpy() in [88, 89]:
if (notes[-1][:, :, 89:] == notes[-3][:, :, 89:]).sum(2)[0][0].numpy() in [88, 89]:
if (notes[-1][:, :, 89:] == notes[-4][:, :, 89:]).sum(2)[0][0].numpy() in [88, 89]:
if (notes[-1][:, :, 89:] == notes[-5][:, :, 89:]).sum(2)[0][0].numpy() in [88, 89]:
if (notes[-1][:, :, 89:] == notes[-6][:, :, 89:]).sum(2)[0][0].numpy() in [88, 89]:
for m in range(5):
notes.pop(-1)
if sampling_idx >= len(notes_encoded):
sampling_idx = 0
x_new[-1] = notes_encoded[sampling_idx][randint(1, 100), :, :]
x = x_new.cuda()
sampling_idx += 1
# Gets the notes into the correct NumPy array shape
gen_notes = np.empty((len(notes)-seq_len+1, 178)) # Doesn't use the first predicted notes
for idx, nt in enumerate(notes[seq_len-1:]): # Because at first this will be inaccurate
gen_notes[idx] = nt[0]
# Decodes the generated music
gen_midi_left = decode(get_tempo_dim_back(gen_notes[:, :89], 74), time_step=time_step)
# Gets rid of too many rests
if remove_extra_rests:
stream_left = ms.stream.Stream()
for idx, nt in enumerate(gen_midi_left):
if type(nt) == ms.note.Rest and idx < len(gen_midi_left) - 5:
if nt.duration.quarterLength > 4*time_step:
print("Removing rest")
continue
if type(gen_midi_left[idx + 4]) == ms.note.Rest:
print("Removing rest")
continue
stream_left.append(nt)
else:
stream_left.append(nt)
else:
stream_left = gen_midi_left
# Same thing for right hand
gen_midi_right = decode(get_tempo_dim_back(gen_notes[:, 89:], 74), time_step=time_step)
if remove_extra_rests:
stream_right = ms.stream.Stream()
for idx, nt in enumerate(gen_midi_right):
if type(nt) == ms.note.Rest and idx < len(gen_midi_right) - 5:
if nt.duration.quarterLength > 4 * time_step:
print("Removing rest")
continue
if type(gen_midi_right[idx + 4]) == ms.note.Rest:
print("Removing rest")
continue
stream_right.append(nt)
else:
stream_right.append(nt)
else:
stream_right = gen_midi_right
# Saves both hands combined as a MIDI file
combine(stream_left, stream_right, file_name + ".mid")
# -------------
# Some Attempts
# -------------
# notes_encoded = load("mendelssohn", "romantic", 10)
# net, l, ll = train_lstm_loss_whole_seq(50, n_epochs=100, lr=0.01)
# torch.save(net.state_dict(), 'lstm_whole_seq_mendelssohn_both_dict.pkl')
# net = LSTMMusic(178, 178).cuda()
# net.load_state_dict(torch.load("lstm_whole_seq_mendelssohn_both_dict.pkl"))
# net.eval()
# ltsm_gen(net, 50, "mendelssohn_both_dict", time_step=0.25, n_steps=1000, sampling_idx=2)
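# ltsm_gen_v2 starts from a single note (at note_pos) instead of a whole input sequence;
# a commented-out sketch with an example output name:
# ltsm_gen_v2(net, 50, "mendelssohn_both_dict_single_note", time_step=0.25, n_steps=1000, note_pos=0)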