General cleanup in model and server: wrap long lines, reorder imports, remove dead code and the generate endpoint

jacbz · jacbz · commit 8bae5f1dbf94 · 2021-07-16T23:16:53.000+02:00
diff --git a/model/constants.py b/model/constants.py
@@ -16,12 +16,15 @@
 # number of epochs to wait before adding the melody loss
 MELODY_EPOCH_DELAY = 0
 
+
 # inverse sigmoid decay
 def sampling_rate_at_epoch(epoch):
     if epoch < 0:
         return START_SCHEDULED_SAMPLING_RATE
-    return (SCHEDULED_SAMPLING_CONVERGENCE / (SCHEDULED_SAMPLING_CONVERGENCE + math.exp(epoch / SCHEDULED_SAMPLING_CONVERGENCE))) \
-           * (START_SCHEDULED_SAMPLING_RATE - END_SCHEDULED_SAMPLING_RATE) + END_SCHEDULED_SAMPLING_RATE
+    return (SCHEDULED_SAMPLING_CONVERGENCE / (
+                SCHEDULED_SAMPLING_CONVERGENCE + math.exp(epoch / SCHEDULED_SAMPLING_CONVERGENCE))) * (
+                       START_SCHEDULED_SAMPLING_RATE - END_SCHEDULED_SAMPLING_RATE) + END_SCHEDULED_SAMPLING_RATE
+
 
 HIDDEN_SIZE = 100
 HIDDEN_SIZE2 = 32
diff --git a/model/dataset.py b/model/dataset.py
@@ -1,4 +1,5 @@
 import collections
+
 from model.constants import *
 
 
@@ -44,7 +45,7 @@ def process_sample(json_file):
     num_chords = num_measures * CHORD_DISCRETIZATION_LENGTH
 
     chords_list, note_list, num_chords = discretize_sample(json_chords, json_notes, octave_boundary_lower,
-                                                                num_chords, num_measures * beats_per_measure)
+                                                           num_chords, num_measures * beats_per_measure)
 
     # pad chord and melodies to max measure length
     chords_list.append(CHORD_END_TOKEN)
diff --git a/model/embeddings.py b/model/embeddings.py
@@ -1,9 +1,10 @@
+import json
+import os
+
 import numpy as np
 import torch
-from transformers import BertTokenizer, BertModel
 from torch.nn.utils.rnn import pad_sequence
-import json
-import os
+from transformers import BertTokenizer, BertModel
 
 tokenizer = None
 model = None
@@ -53,4 +54,4 @@ def make_embedding(lyrics, custom_device=None):
     output = model(**encoded_input)
     embedding = output.last_hidden_state[0]
     length = output.last_hidden_state.shape[1]
-    return embedding, length
+    return embedding, length
diff --git a/model/lofi2lofi_dataset.py b/model/lofi2lofi_dataset.py
@@ -1,6 +1,8 @@
 import json
+
 import torch
 from torch.utils.data import Dataset
+
 from model.dataset import *
 
 
diff --git a/model/lofi2lofi_model.py b/model/lofi2lofi_model.py
@@ -1,8 +1,9 @@
+from hashlib import md5
+
 import numpy as np
 import torch
 from torch import nn
-from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
-from hashlib import md5
+from torch.nn.utils.rnn import pack_padded_sequence
 
 from model.constants import *
 
@@ -16,7 +17,8 @@ def __init__(self, device="cuda" if torch.cuda.is_available() else "cpu"):
         self.mean_linear = nn.Linear(in_features=HIDDEN_SIZE, out_features=HIDDEN_SIZE)
         self.variance_linear = nn.Linear(in_features=HIDDEN_SIZE, out_features=HIDDEN_SIZE)
 
-    def forward(self, gt_chords, gt_melodies, gt_tempo, gt_key, gt_mode, gt_valence, gt_energy, batch_num_chords, num_chords, sampling_rate_chords=0, sampling_rate_melodies=0):
+    def forward(self, gt_chords, gt_melodies, gt_tempo, gt_key, gt_mode, gt_valence, gt_energy, batch_num_chords,
+                num_chords, sampling_rate_chords=0, sampling_rate_melodies=0):
         # encode
         h = self.encoder(gt_chords, gt_melodies, gt_tempo, gt_key, gt_mode, gt_valence, gt_energy, batch_num_chords)
         # VAE
@@ -52,30 +54,29 @@ def __init__(self, device):
         super(Encoder, self).__init__()
         self.device = device
         self.chord_embeddings = nn.Embedding(num_embeddings=CHORD_PREDICTION_LENGTH, embedding_dim=HIDDEN_SIZE)
-        self.chords_lstm = nn.LSTM(input_size=HIDDEN_SIZE, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS, bidirectional=True, batch_first=True)
+        self.chords_lstm = nn.LSTM(input_size=HIDDEN_SIZE, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS,
+                                   bidirectional=True, batch_first=True)
 
         self.melody_embeddings = nn.Embedding(num_embeddings=MELODY_PREDICTION_LENGTH, embedding_dim=HIDDEN_SIZE)
-        self.melody_lstm = nn.LSTM(input_size=HIDDEN_SIZE, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS, bidirectional=True, batch_first=True)
+        self.melody_lstm = nn.LSTM(input_size=HIDDEN_SIZE, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS,
+                                   bidirectional=True, batch_first=True)
 
         self.tempo_embedding = nn.Linear(in_features=1, out_features=HIDDEN_SIZE2)
         self.key_embedding = nn.Embedding(num_embeddings=NUMBER_OF_KEYS, embedding_dim=HIDDEN_SIZE2)
         self.mode_embedding = nn.Embedding(num_embeddings=NUMBER_OF_MODES, embedding_dim=HIDDEN_SIZE2)
         self.valence_embedding = nn.Linear(in_features=1, out_features=HIDDEN_SIZE2)
         self.energy_embedding = nn.Linear(in_features=1, out_features=HIDDEN_SIZE2)
 
-        self.downsample = nn.Linear(in_features=4*HIDDEN_SIZE + 5*HIDDEN_SIZE2, out_features=HIDDEN_SIZE)
-
+        self.downsample = nn.Linear(in_features=4 * HIDDEN_SIZE + 5 * HIDDEN_SIZE2, out_features=HIDDEN_SIZE)
 
     def forward(self, chords, melodies, tempo, key, mode, valence, energy, batch_num_chords):
         chord_embeddings = self.chord_embeddings(chords)
         chords_input = pack_padded_sequence(chord_embeddings, batch_num_chords, batch_first=True, enforce_sorted=False)
         chords_out, (h_chords, _) = self.chords_lstm(chords_input)
-        # chords_out_repeated = pad_packed_sequence(chords_out, batch_first=True)[0].repeat_interleave( NOTES_PER_CHORD, 1)
-        # chords_out_repeated = chords_out_repeated[:,:,:HIDDEN_SIZE] + chords_out_repeated[:,:,HIDDEN_SIZE:]
 
-        # add two directions together
-        melody_embeddings = self.melody_embeddings(melodies)# + chords_out_repeated
-        melody_input = pack_padded_sequence(melody_embeddings, batch_num_chords * NOTES_PER_CHORD, batch_first=True, enforce_sorted=False)
+        melody_embeddings = self.melody_embeddings(melodies)
+        melody_input = pack_padded_sequence(melody_embeddings, batch_num_chords * NOTES_PER_CHORD, batch_first=True,
+                                            enforce_sorted=False)
         _, (h_melodies, _) = self.melody_lstm(melody_input)
 
         tempo_embedding = self.tempo_embedding(tempo.unsqueeze(1).float())
@@ -85,7 +86,9 @@ def forward(self, chords, melodies, tempo, key, mode, valence, energy, batch_num
         energy_embedding = self.energy_embedding(energy.unsqueeze(1).float())
 
         h_concatenated = torch.cat((h_chords[-1], h_chords[-2], h_melodies[-1], h_melodies[-2]), dim=1)
-        return self.downsample(torch.cat((h_concatenated, tempo_embedding, key_embedding, mode_embedding, valence_embedding, energy_embedding), dim=1))
+        return self.downsample(torch.cat(
+            (h_concatenated, tempo_embedding, key_embedding, mode_embedding, valence_embedding, energy_embedding),
+            dim=1))
 
 
 class Decoder(nn.Module):
@@ -100,7 +103,7 @@ def __init__(self, device):
             nn.ReLU(),
             nn.Linear(in_features=HIDDEN_SIZE, out_features=CHORD_PREDICTION_LENGTH)
         )
-        self.chord_embedding_downsample = nn.Linear(in_features=2*HIDDEN_SIZE, out_features=HIDDEN_SIZE)
+        self.chord_embedding_downsample = nn.Linear(in_features=2 * HIDDEN_SIZE, out_features=HIDDEN_SIZE)
 
         self.melody_embeddings = nn.Embedding(num_embeddings=MELODY_PREDICTION_LENGTH, embedding_dim=HIDDEN_SIZE)
         self.melody_lstm = nn.LSTMCell(input_size=HIDDEN_SIZE * 1, hidden_size=HIDDEN_SIZE * 1)
@@ -109,7 +112,7 @@ def __init__(self, device):
             nn.ReLU(),
             nn.Linear(in_features=HIDDEN_SIZE, out_features=MELODY_PREDICTION_LENGTH)
         )
-        self.melody_embedding_downsample = nn.Linear(in_features=3*HIDDEN_SIZE, out_features=HIDDEN_SIZE)
+        self.melody_embedding_downsample = nn.Linear(in_features=3 * HIDDEN_SIZE, out_features=HIDDEN_SIZE)
 
         self.key_linear = nn.Sequential(
             nn.Linear(in_features=HIDDEN_SIZE, out_features=HIDDEN_SIZE2),
@@ -137,30 +140,27 @@ def __init__(self, device):
             nn.Linear(in_features=HIDDEN_SIZE2, out_features=1),
         )
 
-    def generate(self):
-        mu = torch.randn(1, HIDDEN_SIZE)
-        return self(mu)
-
     def decode(self, mu):
         # create a hash for vector mu
         hash = ""
         # first 20 characters are each sampled from 5 entries
         for i in range(0, 100, 5):
-            hash += str((mu[0][i:i+1].abs().sum() * 587).int().item())[-1]
+            hash += str((mu[0][i:i + 1].abs().sum() * 587).int().item())[-1]
         # last 4 characters are the beginning of the MD5 hash of the whole vector
         hash2 = int(md5(mu.numpy()).hexdigest(), 16)
         hash = f"#{hash}{hash2}"[:25]
         return hash, self(mu, MAX_CHORD_LENGTH)
 
-    def forward(self, z, num_chords=MAX_CHORD_LENGTH, sampling_rate_chords=0, sampling_rate_melodies=0, gt_chords=None, gt_melody=None):
+    def forward(self, z, num_chords=MAX_CHORD_LENGTH, sampling_rate_chords=0, sampling_rate_melodies=0, gt_chords=None,
+                gt_melody=None):
         tempo_output = self.tempo_linear(z)
         key_output = self.key_linear(z)
         mode_output = self.mode_linear(z)
         valence_output = self.valence_linear(z)
         energy_output = self.energy_linear(z)
 
         batch_size = z.shape[0]
-        # initialize hidden states and cell states randomly
+        # initialize hidden states and cell states
         hx_chords = torch.zeros(batch_size, HIDDEN_SIZE, device=self.device)
         cx_chords = torch.zeros(batch_size, HIDDEN_SIZE, device=self.device)
         hx_melody = torch.zeros(batch_size, HIDDEN_SIZE, device=self.device)
@@ -205,7 +205,8 @@ def forward(self, z, num_chords=MAX_CHORD_LENGTH, sampling_rate_chords=0, sampli
                     melody_embeddings = self.melody_embeddings(gt_melody[:, i * NOTES_PER_CHORD + j])
                 else:
                     melody_embeddings = self.melody_embeddings(melody_prediction.argmax(dim=1))
-                melody_embeddings = self.melody_embedding_downsample(torch.cat((melody_embeddings, chord_embeddings, z), dim=1))
+                melody_embeddings = self.melody_embedding_downsample(
+                    torch.cat((melody_embeddings, chord_embeddings, z), dim=1))
 
         chord_outputs = torch.stack(chord_outputs, dim=1)
         melody_outputs = torch.stack(melody_outputs, dim=1)
diff --git a/model/lofi2lofi_train.py b/model/lofi2lofi_train.py
@@ -11,4 +11,4 @@
     dataset = Lofi2LofiDataset(dataset_folder, dataset_files)
     model = Lofi2LofiModel()
 
-    train(dataset, model, "lofi2lofi")
+    train(dataset, model, "lofi2lofi")
diff --git a/model/lyrics2lofi_dataset.py b/model/lyrics2lofi_dataset.py
@@ -1,7 +1,9 @@
 import json
+
 import numpy as np
 import torch
 from torch.utils.data import Dataset
+
 from model.dataset import *
 
 
diff --git a/model/lyrics2lofi_model.py b/model/lyrics2lofi_model.py
@@ -14,7 +14,8 @@ def __init__(self, device="cuda" if torch.cuda.is_available() else "cpu"):
         self.mean_linear = nn.Linear(in_features=HIDDEN_SIZE, out_features=HIDDEN_SIZE)
         self.variance_linear = nn.Linear(in_features=HIDDEN_SIZE, out_features=HIDDEN_SIZE)
 
-    def forward(self, input, num_chords, sampling_rate_chords=0, sampling_rate_melodies=0, gt_chords=None, gt_melody=None):
+    def forward(self, input, num_chords=MAX_CHORD_LENGTH, sampling_rate_chords=0, sampling_rate_melodies=0,
+                gt_chords=None, gt_melody=None):
         # encode
         h = self.encoder(input)
 
@@ -50,8 +51,9 @@ class Encoder(nn.Module):
     def __init__(self, device):
         super(Encoder, self).__init__()
         self.device = device
-        self.encoder_lstm = nn.LSTM(input_size=BERT_EMBEDDING_LENGTH, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS, bidirectional=True, batch_first=True)
-        self.downsample = nn.Linear(in_features=2*HIDDEN_SIZE, out_features=HIDDEN_SIZE)
+        self.encoder_lstm = nn.LSTM(input_size=BERT_EMBEDDING_LENGTH, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS,
+                                    bidirectional=True, batch_first=True)
+        self.downsample = nn.Linear(in_features=2 * HIDDEN_SIZE, out_features=HIDDEN_SIZE)
 
     def forward(self, x):
         _, (h, _) = self.encoder_lstm(x)
@@ -92,7 +94,7 @@ def __init__(self, device):
         self.mode_embedding = nn.Linear(in_features=NUMBER_OF_MODES, out_features=HIDDEN_SIZE)
         self.valence_embedding = nn.Linear(in_features=1, out_features=HIDDEN_SIZE)
         self.energy_embedding = nn.Linear(in_features=1, out_features=HIDDEN_SIZE)
-        self.downsample = nn.Linear(in_features=5*HIDDEN_SIZE, out_features=HIDDEN_SIZE)
+        self.downsample = nn.Linear(in_features=5 * HIDDEN_SIZE, out_features=HIDDEN_SIZE)
 
         self.chords_lstm = nn.LSTMCell(input_size=HIDDEN_SIZE * 1, hidden_size=HIDDEN_SIZE * 1)
         self.chord_embeddings = nn.Embedding(num_embeddings=CHORD_PREDICTION_LENGTH, embedding_dim=HIDDEN_SIZE)
@@ -101,7 +103,7 @@ def __init__(self, device):
             nn.ReLU(),
             nn.Linear(in_features=HIDDEN_SIZE, out_features=CHORD_PREDICTION_LENGTH)
         )
-        self.chord_embedding_downsample = nn.Linear(in_features=2*HIDDEN_SIZE, out_features=HIDDEN_SIZE)
+        self.chord_embedding_downsample = nn.Linear(in_features=2 * HIDDEN_SIZE, out_features=HIDDEN_SIZE)
 
         self.melody_embeddings = nn.Embedding(num_embeddings=MELODY_PREDICTION_LENGTH, embedding_dim=HIDDEN_SIZE)
         self.melody_lstm = nn.LSTMCell(input_size=HIDDEN_SIZE * 1, hidden_size=HIDDEN_SIZE * 1)
@@ -110,7 +112,7 @@ def __init__(self, device):
             nn.ReLU(),
             nn.Linear(in_features=HIDDEN_SIZE, out_features=MELODY_PREDICTION_LENGTH)
         )
-        self.melody_embedding_downsample = nn.Linear(in_features=3*HIDDEN_SIZE, out_features=HIDDEN_SIZE)
+        self.melody_embedding_downsample = nn.Linear(in_features=3 * HIDDEN_SIZE, out_features=HIDDEN_SIZE)
 
     def forward(self, z, num_chords, sampling_rate_chords=0, sampling_rate_melodies=0, gt_chords=None, gt_melody=None):
         tempo_output = self.tempo_linear(z)
@@ -125,7 +127,7 @@ def forward(self, z, num_chords, sampling_rate_chords=0, sampling_rate_melodies=
         z = self.downsample(torch.cat((z, tempo_embedding, mode_embedding, valence_embedding, energy_embedding), dim=1))
 
         batch_size = z.shape[0]
-        # initialize hidden states and cell states randomly
+        # initialize hidden states and cell states
         hx_chords = torch.zeros(batch_size, HIDDEN_SIZE, device=self.device)
         cx_chords = torch.zeros(batch_size, HIDDEN_SIZE, device=self.device)
         hx_melody = torch.zeros(batch_size, HIDDEN_SIZE, device=self.device)
diff --git a/model/lyrics2lofi_train.py b/model/lyrics2lofi_train.py
@@ -13,4 +13,4 @@
     dataset = Lyrics2LofiDataset(dataset_folder, dataset_files, embeddings_file, embedding_lengths_file)
     model = Lyrics2LofiModel()
 
-    train(dataset, model, "lyrics2lofi")
+    train(dataset, model, "lyrics2lofi")
diff --git a/model/train.py b/model/train.py
@@ -53,8 +53,10 @@ def compute_loss(data):
                 model(input, max_num_chords, sampling_rate_chords, sampling_rate_melodies, chords_gt, notes_gt)
         else:
             pred_chords, pred_notes, pred_tempo, pred_key, pred_mode, pred_valence, pred_energy, kl = \
-                model(chords_gt, notes_gt, tempo_gt, key_gt, mode_gt, valence_gt, energy_gt, num_chords, max_num_chords, sampling_rate_chords, sampling_rate_melodies)
+                model(chords_gt, notes_gt, tempo_gt, key_gt, mode_gt, valence_gt, energy_gt, num_chords, max_num_chords,
+                      sampling_rate_chords, sampling_rate_melodies)
 
+        # compute a boolean mask to select entries up to a specific index
         def compute_mask(max_length, curr_length):
             arange = torch.arange(max_length, device=device).repeat((chords_gt.shape[0], 1)).permute(0, 1)
             lengths_stacked = curr_length.repeat((max_length, 1)).permute(1, 0)
@@ -70,7 +72,7 @@ def compute_mask(max_length, curr_length):
         mask_melody = compute_mask(max_num_notes, num_notes)
         loss_melody = torch.masked_select(loss_melody_notes, mask_melody).mean()
 
-        if (epoch < MELODY_EPOCH_DELAY):
+        if epoch < MELODY_EPOCH_DELAY:
             loss_melody = 0
 
         loss_kl = kl
@@ -107,9 +109,9 @@ def compute_mask(max_length, curr_length):
         # TRAINING
         model.train()
         for batch, data in enumerate(train_dataloader):
-            loss, loss_chords, kl_loss, loss_melody,\
-                loss_tempo, loss_key, loss_mode, loss_valence, loss_energy,\
-                batch_tp_chords, batch_tp_melodies = compute_loss(data)
+            loss, loss_chords, kl_loss, loss_melody, \
+            loss_tempo, loss_key, loss_mode, loss_valence, loss_energy, \
+            batch_tp_chords, batch_tp_melodies = compute_loss(data)
 
             ep_train_losses_chords.append(loss_chords)
             ep_train_losses_melodies.append(loss_melody)
@@ -130,9 +132,9 @@ def compute_mask(max_length, curr_length):
         model.eval()
         for batch, data in enumerate(val_dataloader):
             with torch.no_grad():
-                loss, loss_chords, kl_loss, loss_melody,\
-                    loss_tempo, loss_key, loss_mode, loss_valence, loss_energy,\
-                    batch_tp_chords, batch_tp_melodies = compute_loss(data)
+                loss, loss_chords, kl_loss, loss_melody, \
+                loss_tempo, loss_key, loss_mode, loss_valence, loss_energy, \
+                batch_tp_chords, batch_tp_melodies = compute_loss(data)
 
                 ep_val_losses_chords.append(loss_chords)
                 ep_val_losses_melodies.append(loss_melody)
diff --git a/server/Dockerfile b/server/Dockerfile
@@ -8,4 +8,4 @@ RUN pip3 install -r requirements.txt
 
 COPY . .
 
-CMD [ "python3", "-m" , "flask", "run", "--host=0.0.0.0"]
+CMD [ "python3", "-m" , "flask", "run", "--host=0.0.0.0"]
diff --git a/server/app.py b/server/app.py
@@ -1,15 +1,14 @@
 import json
 
-import numpy as np
+import torch
 from flask import Flask, request, jsonify
 from flask_limiter import Limiter
 from flask_limiter.util import get_remote_address
-import torch
 
-from server.lofi2lofi_generate import generate, decode
-from server.lyrics2lofi_predict import predict
 from model.lofi2lofi_model import Decoder as Lofi2LofiDecoder
 from model.lyrics2lofi_model import Lyrics2LofiModel
+from server.lofi2lofi_generate import decode
+from server.lyrics2lofi_predict import predict
 
 device = "cpu"
 app = Flask(__name__)
@@ -41,14 +40,6 @@ def home():
     return 'Server running'
 
 
-@app.route('/generate', methods=['GET'])
-def generate_new_sample():
-    json_output = generate(lofi2lofi_model)
-    response = jsonify(json_output)
-    response.headers.add('Access-Control-Allow-Origin', '*')
-
-    return response
-
 @app.route('/decode', methods=['GET'])
 def decode_input():
     input = request.args.get('input')
@@ -59,6 +50,7 @@ def decode_input():
 
     return response
 
+
 @app.route('/predict', methods=['GET'])
 def lyrics_to_track():
     input = request.args.get('input')
diff --git a/server/lofi2lofi_generate.py b/server/lofi2lofi_generate.py
diff --git a/server/lyrics2lofi_predict.py b/server/lyrics2lofi_predict.py
diff --git a/server/output.py b/server/output.py