From 93dca030923f9a3c668d37801c126f74fef70b90 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Mon, 5 Jun 2023 22:45:14 -0700 Subject: [PATCH 01/38] less iterations for hyperparam search in random forest mixer --- lightwood/mixer/random_forest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightwood/mixer/random_forest.py b/lightwood/mixer/random_forest.py index 10df3c3f9..4d79fb9d1 100644 --- a/lightwood/mixer/random_forest.py +++ b/lightwood/mixer/random_forest.py @@ -57,7 +57,7 @@ def __init__( self.model = None self.positive_domain = False - self.num_trials = 20 + self.num_trials = 5 self.cv = 3 self.map = {} From 9baaf63f70834bbd1c76174cbddc5254805bd012 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Mon, 5 Jun 2023 23:11:36 -0700 Subject: [PATCH 02/38] rename filter_df, add featurize cache top level, slightly improve encoded_ds.get_item readability --- lightwood/analysis/analyze.py | 3 --- lightwood/api/json_ai.py | 17 ++++++++++++++--- lightwood/data/encoded_ds.py | 11 ++++++----- lightwood/helpers/ts.py | 3 +-- 4 files changed, 21 insertions(+), 13 deletions(-) diff --git a/lightwood/analysis/analyze.py b/lightwood/analysis/analyze.py index 408ce317c..87e21b9c9 100644 --- a/lightwood/analysis/analyze.py +++ b/lightwood/analysis/analyze.py @@ -3,7 +3,6 @@ from dataprep_ml import StatisticalAnalysis from lightwood.helpers.log import log -from lightwood.helpers.ts import filter_ds from type_infer.dtype import dtype from lightwood.ensemble import BaseEnsemble from lightwood.analysis.base import BaseAnalysisBlock @@ -60,8 +59,6 @@ def model_analyzer( normal_predictions = None if len(analysis_blocks) > 0: - filtered_df = filter_ds(encoded_val_data, tss) - encoded_val_data = EncodedDs(encoded_val_data.encoders, filtered_df, encoded_val_data.target) normal_predictions = predictor(encoded_val_data, args=PredictionArguments.from_dict(args)) normal_predictions = normal_predictions.set_index(encoded_val_data.data_frame.index) diff --git a/lightwood/api/json_ai.py b/lightwood/api/json_ai.py index 1f21007ed..00b511f04 100644 --- a/lightwood/api/json_ai.py +++ b/lightwood/api/json_ai.py @@ -997,7 +997,16 @@ def code_from_json_ai(json_ai: JsonAI) -> str: feature_body = f""" log.info('Featurizing the data') -feature_data = {{ key: EncodedDs(self.encoders, data, self.target) for key, data in split_data.items() if key != "stratified_on"}} +tss = self.problem_definition.timeseries_settings + +feature_data = dict() +for key, data in split_data.items(): + if key != 'stratified_on': + if key not in self.feature_cache: + featurized_split = EncodedDs(self.encoders, filter_ts(data, tss), self.target) + + self.feature_cache[key] = featurized_split + feature_data[key] = self.feature_cache[key] return feature_data @@ -1019,8 +1028,6 @@ def code_from_json_ai(json_ai: JsonAI) -> str: encoded_train_data = enc_data['train'] encoded_dev_data = enc_data['dev'] encoded_test_data = enc_data['test'] -filtered_df = filter_ds(encoded_test_data, self.problem_definition.timeseries_settings) -encoded_test_data = EncodedDs(encoded_test_data.encoders, filtered_df, encoded_test_data.target) log.info('Training the mixers') @@ -1174,6 +1181,7 @@ def code_from_json_ai(json_ai: JsonAI) -> str: enc_train_test["dev"]]).data_frame, adjust_args={'learn_call': True}) +self.feature_cache = dict() # empty feature cache to avoid large predictor objects """ learn_body = align(learn_body, 2) # ----------------- # @@ -1252,6 +1260,9 @@ def __init__(self): self.runtime_log = dict() 
self.global_insights = dict() + # Feature cache + self.feature_cache = dict() + @timed def analyze_data(self, data: pd.DataFrame) -> None: # Perform a statistical analysis on the unprocessed data diff --git a/lightwood/data/encoded_ds.py b/lightwood/data/encoded_ds.py index b7f90993f..44fb803c4 100644 --- a/lightwood/data/encoded_ds.py +++ b/lightwood/data/encoded_ds.py @@ -44,7 +44,7 @@ def __len__(self): def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]: """ The getter yields a tuple (X, y), where: - - `X `is a concatenation of all encoded representations of the row + - `X `is a concatenation of all encoded representations of the row. Size: (n_features,) - `y` is the encoded target :param idx: index of the row to access. @@ -56,7 +56,7 @@ def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]: if self.cache[idx] is not None: return self.cache[idx] - X = torch.FloatTensor() + X = [] Y = torch.FloatTensor() for col in self.data_frame: if self.encoders.get(col, None): @@ -72,16 +72,17 @@ def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]: cols = [col] data = self.data_frame[cols].iloc[idx].tolist() - encoded_tensor = self.encoders[col].encode(data, **kwargs)[0] + encoded_tensor = self.encoders[col].encode(data, **kwargs) if torch.isnan(encoded_tensor).any() or torch.isinf(encoded_tensor).any(): raise Exception(f'Encoded tensor: {encoded_tensor} contains nan or inf values, this tensor is \ the encoding of column {col} using {self.encoders[col].__class__}') if col != self.target: - X = torch.cat([X, encoded_tensor]) + X.append(encoded_tensor) else: - Y = encoded_tensor + Y = encoded_tensor.squeeze() if self.cache_encoded: + X = torch.cat(X, dim=1).float().squeeze() self.cache[idx] = (X, Y) return X, Y diff --git a/lightwood/helpers/ts.py b/lightwood/helpers/ts.py index 445492cf6..c1306157a 100644 --- a/lightwood/helpers/ts.py +++ b/lightwood/helpers/ts.py @@ -297,13 +297,12 @@ def min_k(top_k, data): return candidate_sps -def filter_ds(ds, tss, n_rows=1): +def filter_ts(df: pd.DataFrame, tss, n_rows=1): """ This method triggers only for timeseries datasets. It returns a dataframe that filters out all but the first ``n_rows`` per group. 
""" # noqa - df = ds.data_frame if tss.is_timeseries: gby = tss.group_by if gby is None: From 60d650b766777537716ba1baed86f0892f3e1d05 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Tue, 6 Jun 2023 18:43:55 -0700 Subject: [PATCH 03/38] early stopping + default frozen pretrained text enc --- lightwood/encoder/text/pretrained.py | 157 +++++++++++++++++---------- 1 file changed, 99 insertions(+), 58 deletions(-) diff --git a/lightwood/encoder/text/pretrained.py b/lightwood/encoder/text/pretrained.py index b9fcd1bae..e94819aab 100644 --- a/lightwood/encoder/text/pretrained.py +++ b/lightwood/encoder/text/pretrained.py @@ -1,15 +1,12 @@ -""" -""" +import os import time +from typing import Iterable +from collections import deque + +import numpy as np import torch from torch.utils.data import DataLoader -import os import pandas as pd -from lightwood.encoder.text.helpers.pretrained_helpers import TextEmbed -from lightwood.helpers.device import get_device_from_name -from lightwood.encoder.base import BaseEncoder -from lightwood.helpers.log import log -from lightwood.helpers.torch import LightwoodAutocast from type_infer.dtype import dtype from transformers import ( DistilBertModel, @@ -18,8 +15,14 @@ AdamW, get_linear_schedule_with_warmup, ) +from sklearn.model_selection import train_test_split + +from lightwood.encoder.text.helpers.pretrained_helpers import TextEmbed +from lightwood.helpers.device import get_device_from_name +from lightwood.encoder.base import BaseEncoder +from lightwood.helpers.log import log +from lightwood.helpers.torch import LightwoodAutocast from lightwood.helpers.general import is_none -from typing import Iterable class PretrainedLangEncoder(BaseEncoder): @@ -38,7 +41,7 @@ def __init__( is_target: bool = False, batch_size: int = 10, max_position_embeddings: int = None, - frozen: bool = False, + frozen: bool = True, epochs: int = 1, output_type: str = None, embed_mode: bool = True, @@ -48,7 +51,6 @@ def __init__( :param is_target: Whether this encoder represents the target. NOT functional for text generation yet. :param batch_size: size of batch while fine-tuning :param max_position_embeddings: max sequence length of input text - :param custom_train: If True, trains model on target procided :param frozen: If True, freezes transformer layers during training. :param epochs: number of epochs to train model with :param output_type: Data dtype of the target; if categorical/binary, the option to return logits is possible. @@ -64,12 +66,14 @@ def __init__( self._frozen = frozen self._batch_size = batch_size self._epochs = epochs + self._patience = 3 # measured in batches rather than epochs + self._val_loss_every = -1 # how many batches to wait before checking val loss. If -1, will check train loss instead of val for early stopping. # noqa + self._tr_loss_every = 2 # same as above, but only applies if `_val_loss_every` is set to -1 # Model setup self._model = None self.model_type = None - # TODO: Other LMs; Distilbert is a good balance of speed/performance self._classifier_model_class = DistilBertForSequenceClassification self._embeddings_model_class = DistilBertModel self._pretrained_model_name = "distilbert-base-uncased" @@ -90,46 +94,45 @@ def __init__( def prepare( self, - train_priming_data: Iterable[str], - dev_priming_data: Iterable[str], + train_priming_data: pd.Series, + dev_priming_data: pd.Series, encoded_target_values: torch.Tensor, ): """ Fine-tunes a transformer on the priming data. 
- CURRENTLY WIP; train + dev are placeholders for a validation-based approach. - - Train + Dev are concatenated together and a transformer is then fine tuned with weight-decay applied on the transformer parameters. The option to freeze the underlying transformer and only train a linear layer exists if `frozen=True`. This trains faster, with the exception that the performance is often lower than fine-tuning on internal benchmarks. + Transformer is fine-tuned with weight-decay on training split. + By default, underlying transformer is frozen and only final linear layer is trained. This trains faster, often as tradeoff for performance. :param train_priming_data: Text data in the train set - :param dev_priming_data: Text data in the dev set (not currently supported; can be empty) + :param dev_priming_data: Text data in the dev set :param encoded_target_values: Encoded target labels in Nrows x N_output_dimension """ # noqa if self.is_prepared: raise Exception("Encoder is already prepared.") os.environ['TOKENIZERS_PARALLELISM'] = 'true' + val_size = (len(dev_priming_data)) / len(train_priming_data) - # TODO -> we shouldn't be concatenating these together - if len(dev_priming_data) > 0: - priming_data = pd.concat([train_priming_data, dev_priming_data]).values - else: - priming_data = train_priming_data.tolist() + # remove empty strings (`None`s for dtype `object`) + priming_data = pd.concat([ + train_priming_data[~train_priming_data.isna()], + dev_priming_data[~dev_priming_data.isna()]] + ).tolist() - # Replaces empty strings with '' - priming_data = [x if x is not None else "" for x in priming_data] + # Label encode the OHE/binary output for classification + labels = encoded_target_values.argmax(dim=1) + + # Split into train and validation sets + train_texts, val_texts, train_labels, val_labels = train_test_split(priming_data, labels, test_size=val_size) # If classification, then fine-tune - if (self.output_type in (dtype.categorical, dtype.binary)): - log.info("Training model.") + if self.output_type in (dtype.categorical, dtype.binary): + log.info("Training model.\n\tOutput trained is categorical") # Prepare priming data into tokenized form + attention masks - text = self._tokenizer(priming_data, truncation=True, padding=True) - - log.info("\tOutput trained is categorical") - - # Label encode the OHE/binary output for classification - labels = encoded_target_values.argmax(dim=1) + training_text = self._tokenizer(train_texts, truncation=True, padding=True) + validation_text = self._tokenizer(val_texts, truncation=True, padding=True) # Construct the model self._model = self._classifier_model_class.from_pretrained( @@ -138,8 +141,12 @@ def prepare( ).to(self.device) # Construct the dataset for training - xinp = TextEmbed(text, labels) - dataset = DataLoader(xinp, batch_size=self._batch_size, shuffle=True) + xinp = TextEmbed(training_text, train_labels) + train_dataset = DataLoader(xinp, batch_size=self._batch_size, shuffle=True) + + # Construct the dataset for validation + xvalinp = TextEmbed(validation_text, val_labels) + val_dataset = DataLoader(xvalinp, batch_size=self._batch_size, shuffle=True) # Set max length of input string; affects input to the model if self._max_len is None: @@ -148,8 +155,7 @@ def prepare( if self._frozen: log.info("\tFrozen Model + Training Classifier Layers") """ - Freeze the base transformer model and train - a linear layer on top + Freeze the base transformer model and train a linear layer on top """ # Freeze all the transformer parameters for param in 
self._model.base_model.parameters(): @@ -189,12 +195,12 @@ def prepare( scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, # default value for GLUE - num_training_steps=len(dataset) * self._epochs, + num_training_steps=len(train_dataset) * self._epochs, ) # Train model; declare optimizer earlier if desired. self._tune_model( - dataset, optim=optimizer, scheduler=scheduler, n_epochs=self._epochs + train_dataset, val_dataset, optim=optimizer, scheduler=scheduler, n_epochs=self._epochs ) else: @@ -206,8 +212,7 @@ def prepare( ).to(self.device) # TODO: Not a great flag - # Currently, if the task is not classification, you must have - # an embedding generator only. + # Currently, if the task is not classification, you must have an embedding generator only if self.embed_mode is False: log.info("Embedding mode must be ON for non-classification targets.") self.embed_mode = True @@ -216,19 +221,15 @@ def prepare( encoded = self.encode(priming_data[0:1]) self.output_size = len(encoded[0]) - def _tune_model(self, dataset, optim, scheduler, n_epochs=1): + def _tune_model(self, train_dataset, val_dataset, optim, scheduler, n_epochs=1): """ - Given a model, train for n_epochs. - Specifically intended for tuning; it does NOT use loss/ - stopping criterion. - - model - torch.nn model; - dataset - torch.DataLoader; dataset to train - device - torch.device; cuda/cpu - log - lightwood.logger.log; log.info output + Given a model, tune for n_epochs. + + train_dataset - torch.DataLoader; dataset to train + val_dataset - torch.DataLoader; dataset used to compute validation loss + early stopping optim - transformers.optimization.AdamW; optimizer scheduler - scheduling params - n_epochs - number of epochs to train + n_epochs - max number of epochs to train for, provided there is no early stopping """ # noqa self._model.train() @@ -244,20 +245,21 @@ def _tune_model(self, dataset, optim, scheduler, n_epochs=1): else: log.info("Scheduler provided.") + best_tr_loss = best_val_loss = float("inf") + tr_loss_queue = deque(maxlen=self._patience) + patience_counter = self._patience + started = time.time() for epoch in range(n_epochs): total_loss = 0 - for batch in dataset: + for bidx, batch in enumerate(train_dataset): optim.zero_grad() with LightwoodAutocast(): - inpids = batch["input_ids"].to(self.device) - attn = batch["attention_mask"].to(self.device) - labels = batch["labels"].to(self.device) - outputs = self._model(inpids, attention_mask=attn, labels=labels) - loss = outputs[0] + loss = self._call(batch) + tr_loss_queue.append(loss.item()) total_loss += loss.item() loss.backward() @@ -267,9 +269,48 @@ def _tune_model(self, dataset, optim, scheduler, n_epochs=1): if time.time() - started > self.stop_after: break + # val-based early stopping + if (self._val_loss_every != -1) and (bidx % self._val_loss_every == 0): + self._model.eval() + val_loss = 0 + + for vbatch in val_dataset: + val_loss += self._call(vbatch).item() + + log.info(f"Epoch {epoch+1} train batch {bidx+1} - Validation loss: {val_loss/len(val_dataset)}") + if val_loss / len(val_dataset) >= best_val_loss: + break + + best_val_loss = val_loss / len(val_dataset) + self._model.train() + + # train-based early stopping + elif (bidx + 1) % self._tr_loss_every == 0: + self._model.eval() + + tr_loss = np.average(tr_loss_queue) + log.info(f"Epoch {epoch} train batch {bidx} - Train loss: {tr_loss}") # noqa + self._model.train() + + if tr_loss >= best_tr_loss and patience_counter == 0: + break + elif patience_counter > 0: + patience_counter 
-= 1 + elif tr_loss < best_tr_loss: + best_tr_loss = tr_loss + patience_counter = self._patience + if time.time() - started > self.stop_after: break - self._train_callback(epoch, total_loss / len(dataset)) + self._train_callback(epoch, total_loss / len(train_dataset)) + + def _call(self, batch): + inpids = batch["input_ids"].to(self.device) + attn = batch["attention_mask"].to(self.device) + labels = batch["labels"].to(self.device) + outputs = self._model(inpids, attention_mask=attn, labels=labels) + loss = outputs[0] + return loss def _train_callback(self, epoch, loss): log.info(f"{self.name} at epoch {epoch+1} and loss {loss}!") From 55320568049c7c00ef83e84ecf67e40fa3a5fd71 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Wed, 7 Jun 2023 03:37:52 +0000 Subject: [PATCH 04/38] progress --- lightwood/encoder/text/pretrained.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lightwood/encoder/text/pretrained.py b/lightwood/encoder/text/pretrained.py index e94819aab..65a2a6509 100644 --- a/lightwood/encoder/text/pretrained.py +++ b/lightwood/encoder/text/pretrained.py @@ -41,7 +41,7 @@ def __init__( is_target: bool = False, batch_size: int = 10, max_position_embeddings: int = None, - frozen: bool = True, + frozen: bool = False, epochs: int = 1, output_type: str = None, embed_mode: bool = True, @@ -67,7 +67,7 @@ def __init__( self._batch_size = batch_size self._epochs = epochs self._patience = 3 # measured in batches rather than epochs - self._val_loss_every = -1 # how many batches to wait before checking val loss. If -1, will check train loss instead of val for early stopping. # noqa + self._val_loss_every = 5 # how many batches to wait before checking val loss. If -1, will check train loss instead of val for early stopping. 
# noqa self._tr_loss_every = 2 # same as above, but only applies if `_val_loss_every` is set to -1 # Model setup @@ -270,7 +270,7 @@ def _tune_model(self, train_dataset, val_dataset, optim, scheduler, n_epochs=1): break # val-based early stopping - if (self._val_loss_every != -1) and (bidx % self._val_loss_every == 0): + if False and (self._val_loss_every != -1) and (bidx % self._val_loss_every == 0): self._model.eval() val_loss = 0 @@ -285,7 +285,7 @@ def _tune_model(self, train_dataset, val_dataset, optim, scheduler, n_epochs=1): self._model.train() # train-based early stopping - elif (bidx + 1) % self._tr_loss_every == 0: + elif False and (bidx + 1) % self._tr_loss_every == 0: self._model.eval() tr_loss = np.average(tr_loss_queue) From 07708ab19c0146b61c02042ed4fc86ca0d5757e1 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Wed, 7 Jun 2023 19:59:54 -0700 Subject: [PATCH 05/38] refactor num enc + tests; improve neural early stopping --- lightwood/data/encoded_ds.py | 4 +- lightwood/encoder/numeric/numeric.py | 127 +++++++----------- lightwood/encoder/numeric/ts_numeric.py | 6 - lightwood/mixer/neural.py | 29 ++-- .../encoder/numeric/test_numeric.py | 70 +++++----- 5 files changed, 103 insertions(+), 133 deletions(-) diff --git a/lightwood/data/encoded_ds.py b/lightwood/data/encoded_ds.py index 44fb803c4..b5bf5e7dd 100644 --- a/lightwood/data/encoded_ds.py +++ b/lightwood/data/encoded_ds.py @@ -67,10 +67,10 @@ def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]: if hasattr(self.encoders[col], 'data_window'): cols = [self.target] + [f'{self.target}_timestep_{i}' for i in range(1, self.encoders[col].data_window)] - data = [self.data_frame[cols].iloc[idx].tolist()] + data = [self.data_frame[cols].iloc[idx].values] else: cols = [col] - data = self.data_frame[cols].iloc[idx].tolist() + data = self.data_frame[cols].iloc[idx].values encoded_tensor = self.encoders[col].encode(data, **kwargs) if torch.isnan(encoded_tensor).any() or torch.isinf(encoded_tensor).any(): diff --git a/lightwood/encoder/numeric/numeric.py b/lightwood/encoder/numeric/numeric.py index 251fd1ae6..aa99f1921 100644 --- a/lightwood/encoder/numeric/numeric.py +++ b/lightwood/encoder/numeric/numeric.py @@ -1,12 +1,15 @@ import math -from typing import Iterable, List, Union +from typing import List, Union + import torch import numpy as np +import pandas as pd from torch.types import Number +from type_infer.dtype import dtype + from lightwood.encoder.base import BaseEncoder from lightwood.helpers.log import log from lightwood.helpers.general import is_none -from type_infer.dtype import dtype class NumericEncoder(BaseEncoder): @@ -28,13 +31,12 @@ def __init__(self, data_type: dtype = None, is_target: bool = False, positive_do :param positive_domain: Forces the encoder to always output positive values """ super().__init__(is_target) - self._type = data_type self._abs_mean = None self.positive_domain = positive_domain self.decode_log = False self.output_size = 4 if not self.is_target else 3 - def prepare(self, priming_data: Iterable): + def prepare(self, priming_data: pd.Series): """ "NumericalEncoder" uses a rule-based form to prepare results on training (priming) data. The averages etc. are taken from this distribution. 
@@ -43,55 +45,36 @@ def prepare(self, priming_data: Iterable): if self.is_prepared: raise Exception('You can only call "prepare" once for a given encoder.') - value_type = 'int' - for number in priming_data: - if not is_none(number): - if int(number) != number: - value_type = 'float' - - self._type = value_type if self._type is None else self._type - non_null_priming_data = [x for x in priming_data if not is_none(x)] - self._abs_mean = np.mean(np.abs(non_null_priming_data)) + self._abs_mean = priming_data.abs().mean() self.is_prepared = True - def encode(self, data: Iterable): + def encode(self, data: pd.Series): """ - :param data: An iterable data structure containing the numbers to be encoded - + :param data: A pandas series containing the numbers to be encoded :returns: A torch tensor with the representations of each number """ if not self.is_prepared: raise Exception('You need to call "prepare" before calling "encode" or "decode".') - ret = [] - for real in data: - try: - real = float(real) - except Exception: - real = None - if self.is_target: - # Will crash if ``real`` is not a float, this is fine, targets should always have a value - vector = [0] * 3 - vector[0] = 1 if real < 0 and not self.positive_domain else 0 - vector[1] = math.log(abs(real)) if abs(real) > 0 else -20 - vector[2] = real / self._abs_mean - - else: - vector = [0] * 4 - try: - if is_none(real): - vector[0] = 0 - else: - vector[0] = 1 - vector[1] = math.log(abs(real)) if abs(real) > 0 else -20 - vector[2] = 1 if real < 0 and not self.positive_domain else 0 - vector[3] = real / self._abs_mean - except Exception as e: - vector = [0] * 4 - log.error(f'Can\'t encode input value: {real}, exception: {e}') - - ret.append(vector) - + # todo: wrap with try/except to cover non-real edge cases + if not self.positive_domain: + sign = np.vectorize(lambda x: 0 if x < 0 else 1)(data) + else: + sign = np.zeros(len(data)) + log_value = np.vectorize(lambda x: math.log(abs(x)) if abs(x) > 0 else -20)(data) + log_value = np.nan_to_num(log_value, nan=0, posinf=20, neginf=-20) + + exp = np.vectorize(lambda x: x / self._abs_mean)(data) + exp = np.nan_to_num(exp, nan=0, posinf=20, neginf=-20) + + if self.is_target: + components = [sign, log_value, exp] + else: + # todo: if can't encode return 0s and log.error(f'Can\'t encode input value: {real}, exception: {e}') + nones = np.vectorize(lambda x: 1 if is_none(x) else 0)(data) + components = [sign, log_value, exp, nones] + + ret = torch.Tensor(np.array(components)).T return torch.Tensor(ret) def decode(self, encoded_values: Union[List[Number], torch.Tensor], decode_log: bool = None) -> list: @@ -112,40 +95,32 @@ def decode(self, encoded_values: Union[List[Number], torch.Tensor], decode_log: encoded_values = encoded_values.tolist() for vector in encoded_values: - if self.is_target: - if np.isnan( - vector[0]) or vector[0] == float('inf') or np.isnan( - vector[1]) or vector[1] == float('inf') or np.isnan( - vector[2]) or vector[2] == float('inf'): - log.error(f'Got weird target value to decode: {vector}') - real_value = pow(10, 63) - else: - if decode_log: - sign = -1 if vector[0] > 0.5 else 1 - try: - real_value = math.exp(vector[1]) * sign - except OverflowError: - real_value = pow(10, 63) * sign - else: - real_value = vector[2] * self._abs_mean - - if self.positive_domain: - real_value = abs(real_value) - - if self._type == 'int': - real_value = int(real_value) + # check for none + if len(vector) == 4 and vector[-1] == 1: + ret.append(None) + continue - else: - if vector[0] < 0.5: - 
ret.append(None) - continue + # edge case: divergence + elif np.isnan(vector[0]) or vector[0] == float('inf') or \ + np.isnan(vector[1]) or vector[1] == float('inf') or \ + np.isnan(vector[2]) or vector[2] == float('inf'): - real_value = vector[3] * self._abs_mean + log.error(f'Got weird target value to decode: {vector}') + real_value = pow(10, 63) + + elif decode_log: + sign = -1 if vector[0] < 0.5 else 1 + try: + real_value = math.exp(vector[1]) * sign + except OverflowError: + real_value = pow(10, 63) * sign + else: + real_value = vector[2] * self._abs_mean - if self._type == 'int': - real_value = round(real_value) + if self.positive_domain: + real_value = abs(real_value) - if isinstance(real_value, torch.Tensor): - real_value = real_value.item() + # if isinstance(real_value, torch.Tensor): + # real_value = real_value.item() ret.append(real_value) return ret diff --git a/lightwood/encoder/numeric/ts_numeric.py b/lightwood/encoder/numeric/ts_numeric.py index 06127c9a3..3203e355a 100644 --- a/lightwood/encoder/numeric/ts_numeric.py +++ b/lightwood/encoder/numeric/ts_numeric.py @@ -107,14 +107,8 @@ def decode(self, encoded_values, decode_log=None, dependency_data=None): if self.positive_domain: real_value = abs(real_value) - if self._type == 'int': - real_value = int(round(real_value, 0)) - else: real_value = vector[0] * self._abs_mean - if self._type == 'int': - real_value = round(real_value) - ret.append(real_value) return ret diff --git a/lightwood/mixer/neural.py b/lightwood/mixer/neural.py index 90040a3aa..ce4e7386d 100644 --- a/lightwood/mixer/neural.py +++ b/lightwood/mixer/neural.py @@ -1,5 +1,6 @@ import time from copy import deepcopy +from collections import deque from typing import Dict, List, Optional import torch @@ -42,7 +43,8 @@ def __init__( net: str, fit_on_dev: bool, search_hyperparameters: bool, - n_epochs: Optional[int] = None + n_epochs: Optional[int] = None, + lr: Optional[float] = None, ): """ The Neural mixer trains a fully connected dense network from concatenated encoded outputs of each of the features in the dataset to predicted the encoded output. @@ -55,6 +57,7 @@ def __init__( :param fit_on_dev: If we should fit on the dev dataset :param search_hyperparameters: If the network should run a more through hyperparameter search (currently disabled) :param n_epochs: amount of epochs that the network will be trained for. Supersedes all other early stopping criteria if specified. + :param lr: learning rate for the network. By default, it is automatically selected based on an initial search process. 
""" # noqa super().__init__(stop_after) self.dtype_dict = dtype_dict @@ -62,6 +65,8 @@ def __init__( self.target_encoder = target_encoder self.epochs_to_best = 0 self.n_epochs = n_epochs + self.lr = lr + self.loss_hist_len = 5 # length of queue to use for early stopping self.fit_on_dev = fit_on_dev self.net_name = net self.supports_proba = dtype_dict[target] in [dtype.binary, dtype.categorical] @@ -106,12 +111,12 @@ def _select_criterion(self) -> torch.nn.Module: return criterion - def _select_optimizer(self) -> Optimizer: - optimizer = ad_optim.Ranger(self.model.parameters(), lr=self.lr, weight_decay=2e-2) + def _select_optimizer(self, lr) -> Optimizer: + optimizer = ad_optim.Ranger(self.model.parameters(), lr=lr, weight_decay=2e-2) return optimizer def _find_lr(self, dl): - optimizer = self._select_optimizer() + optimizer = self._select_optimizer(lr=1e-3) # magic number for ranger optimizer, should be good starting point criterion = self._select_criterion() scaler = GradScaler() @@ -168,7 +173,7 @@ def _find_lr(self, dl): def _max_fit(self, train_dl, dev_dl, criterion, optimizer, scaler, stop_after, return_model_after): epochs_to_best = 0 best_dev_error = pow(2, 32) - running_errors = [] + running_errors = deque(maxlen=self.loss_hist_len) best_model = self.model for epoch in range(1, return_model_after + 1): @@ -215,10 +220,11 @@ def _max_fit(self, train_dl, dev_dl, criterion, optimizer, scaler, stop_after, r # automated early stopping else: - if len(running_errors) >= 5: - delta_mean = np.average([running_errors[-i - 1] - running_errors[-i] for i in range(1, 5)], - weights=[(1 / 2)**i for i in range(1, 5)]) - if delta_mean <= 0: + if len(running_errors) >= self.loss_hist_len: + delta_mean = np.average([ + running_errors[-i - 1] - running_errors[-i] for i in range(len(running_errors)-1)], + weights=[(1 / 2)**i for i in range(len(running_errors)-1)]) + if delta_mean >= 0: break elif (time.time() - self.started) > stop_after: break @@ -274,7 +280,6 @@ def _fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: dev_dl = DataLoader(dev_data, batch_size=self.batch_size, shuffle=False) train_dl = DataLoader(train_data, batch_size=self.batch_size, shuffle=False) - self.lr = 1e-4 self.num_hidden = 1 # Find learning rate @@ -284,7 +289,7 @@ def _fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: self.lr, self.model = self._find_lr(train_dl) # Keep on training - optimizer = self._select_optimizer() + optimizer = self._select_optimizer(lr=self.lr) criterion = self._select_criterion() scaler = GradScaler() @@ -314,7 +319,7 @@ def partial_fit(self, train_data: EncodedDs, dev_data: EncodedDs, args: Optional self.started = time.time() train_dl = DataLoader(train_data, batch_size=self.batch_size, shuffle=True) dev_dl = DataLoader(dev_data, batch_size=self.batch_size, shuffle=True) - optimizer = self._select_optimizer() + optimizer = self._select_optimizer(lr=self.lr) criterion = self._select_criterion() scaler = GradScaler() diff --git a/tests/unit_tests/encoder/numeric/test_numeric.py b/tests/unit_tests/encoder/numeric/test_numeric.py index 93590c071..57b8815d9 100644 --- a/tests/unit_tests/encoder/numeric/test_numeric.py +++ b/tests/unit_tests/encoder/numeric/test_numeric.py @@ -1,5 +1,6 @@ import unittest import numpy as np +import pandas as pd import torch from lightwood.encoder.numeric import NumericEncoder from lightwood.encoder.numeric import TsNumericEncoder @@ -16,31 +17,38 @@ def _pollute(array): class TestNumericEncoder(unittest.TestCase): def test_encode_and_decode(self): 
- data = [1, 1.1, 2, -8.6, None, 0] + data = pd.Series([1, 1.1, 2, -8.6, None, 0]) encoder = NumericEncoder() - encoder.prepare(data) encoded_vals = encoder.encode(data) - self.assertTrue(encoded_vals[1][1] > 0) - self.assertTrue(encoded_vals[2][1] > 0) - self.assertTrue(encoded_vals[3][1] > 0) - for i in range(0, 3): - self.assertTrue(encoded_vals[i][2] == 0) - self.assertTrue(encoded_vals[3][2] == 1) - self.assertTrue(encoded_vals[4][3] == 0) + # sign component check + self.assertTrue(encoded_vals[0][0] > 0) + self.assertTrue(encoded_vals[1][0] > 0) + self.assertTrue(encoded_vals[2][0] > 0) + self.assertTrue(encoded_vals[3][0] == 0) - decoded_vals = encoder.decode(encoded_vals) + # none component check + for i in range(0, len(encoded_vals)): + if i != 4: + self.assertTrue(encoded_vals[i][-1] == 0) + else: + self.assertTrue(encoded_vals[i][-1] == 1) - for i in range(len(encoded_vals)): - if decoded_vals[i] is None: - self.assertTrue(decoded_vals[i] == data[i]) + # exp component nan edge case check + self.assertTrue(encoded_vals[4][2] == 0) + + # compare decoded v/s real + decoded_vals = encoder.decode(encoded_vals) + for decoded, real in zip(decoded_vals, data.tolist()): + if decoded is None: + self.assertTrue((real is None) or (real != real)) else: - np.testing.assert_almost_equal(round(decoded_vals[i], 10), round(data[i], 10)) + np.testing.assert_almost_equal(round(decoded, 10), round(real, 10)) def test_positive_domain(self): - data = [-1, -2, -100, 5, 10, 15] + data = pd.Series([-1, -2, -100, 5, 10, 15]) for encoder in [NumericEncoder(), TsNumericEncoder()]: encoder.is_target = True # only affects target values encoder.positive_domain = True @@ -51,7 +59,7 @@ def test_positive_domain(self): self.assertTrue(val >= 0) def test_log_overflow_and_none(self): - data = list(range(-2000, 2000, 66)) + data = pd.Series(list(range(-2000, 2000, 66))) encoder = NumericEncoder() encoder.is_target = True @@ -72,10 +80,10 @@ def test_nan_encoding(self): # Prepare with the correct data and decode invalid data encoder = NumericEncoder() - encoder.prepare(data) + encoder.prepare(pd.Series(data)) for array in invalid_data: # Make sure the encoding has no nans or infs - encoded_repr = encoder.encode(array) + encoded_repr = encoder.encode(pd.Series(array)) assert not torch.isnan(encoded_repr).any() assert not torch.isinf(encoded_repr).any() @@ -88,29 +96,17 @@ def test_nan_encoding(self): # Prepare with the invalid data and decode the valid data for array in invalid_data: encoder = NumericEncoder() - encoder.prepare(array) + encoder.prepare(pd.Series(array)) # Make sure the encoding has no nans or infs - encoded_repr = encoder.encode(data) + encoded_repr = encoder.encode(pd.Series(array)) assert not torch.isnan(encoded_repr).any() assert not torch.isinf(encoded_repr).any() # Make sure the invalid value is decoded as `None` and the rest as numbers decoded_repr = encoder.decode(encoded_repr) - for x in decoded_repr: - assert not is_none(x) - - # Prepare with the invalid data and decode invalid data - for array in invalid_data: - encoder = NumericEncoder() - encoder.prepare(array) - # Make sure the encoding has no nans or infs - encoded_repr = encoder.encode(array) - assert not torch.isnan(encoded_repr).any() - assert not torch.isinf(encoded_repr).any() - - # Make sure the invalid value is decoded as `None` and the rest as numbers - decoded_repr = encoder.decode(encoded_repr) - for x in decoded_repr[:-1]: - assert not is_none(x) - assert decoded_repr[-1] is None + for dec, real in zip(decoded_repr, array): + 
if is_none(real): + assert is_none(dec) + else: + assert not is_none(x) or x != 0.0 From b6551149e0871dfa1043d081409179184a9ca40a Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Wed, 7 Jun 2023 21:29:34 -0700 Subject: [PATCH 06/38] better lr search --- lightwood/mixer/neural.py | 65 ++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/lightwood/mixer/neural.py b/lightwood/mixer/neural.py index ce4e7386d..d6612b54c 100644 --- a/lightwood/mixer/neural.py +++ b/lightwood/mixer/neural.py @@ -63,10 +63,11 @@ def __init__( self.dtype_dict = dtype_dict self.target = target self.target_encoder = target_encoder + self.num_hidden = 1 self.epochs_to_best = 0 self.n_epochs = n_epochs self.lr = lr - self.loss_hist_len = 5 # length of queue to use for early stopping + self.loss_hist_len = 7 # length of queue to use for early stopping self.fit_on_dev = fit_on_dev self.net_name = net self.supports_proba = dtype_dict[target] in [dtype.binary, dtype.categorical] @@ -116,27 +117,31 @@ def _select_optimizer(self, lr) -> Optimizer: return optimizer def _find_lr(self, dl): - optimizer = self._select_optimizer(lr=1e-3) # magic number for ranger optimizer, should be good starting point + lr = 1e-5 # good starting point as search escalates + lrs = deque([5e-5, 1e-4, 5e-4, 1e-3, 2e-3, 3e-3, 5e-3, 1e-2, 5e-2, 1e-1]) + starting_model = deepcopy(self.model) criterion = self._select_criterion() scaler = GradScaler() - running_losses: List[float] = [] - cum_loss = 0 - lr_log = [] + running_losses = deque(maxlen=self.loss_hist_len) + lr_log = deque(maxlen=self.loss_hist_len) best_model = self.model stop = False - batches = 0 - for epoch in range(1, 101): - if stop: - break - for i, (X, Y) in enumerate(dl): - if stop: - break + _, test_batch = next(enumerate(dl)) + X, Y = test_batch + n_steps = 10 + cum_loss = 0 + + while stop is False: + # overfit learning on first sample (yes, biased, but we only really want an intuition on what LR is decent) + optimizer = self._select_optimizer(lr=lr) + self.model = starting_model - batches += len(X) + for i in range(n_steps): X = X.to(self.model.device) Y = Y.to(self.model.device) + with LightwoodAutocast(): optimizer.zero_grad() Yh = self._net_call(X) @@ -150,20 +155,16 @@ def _find_lr(self, dl): optimizer.step() cum_loss += loss.item() - # Account for ranger lookahead update - if (i + 1) * epoch % 6: - batches = 0 - lr = optimizer.param_groups[0]['lr'] - log.info(f'Loss of {cum_loss} with learning rate {lr}') - running_losses.append(cum_loss) - lr_log.append(lr) - cum_loss = 0 - if len(running_losses) < 2 or np.mean(running_losses[:-1]) > np.mean(running_losses): - optimizer.param_groups[0]['lr'] = lr * 1.4 - # Time saving since we don't have to start training fresh - best_model = deepcopy(self.model) - else: - stop = True + log.info(f'Loss of {cum_loss} with learning rate {lr}') + running_losses.append(cum_loss) + lr_log.append(lr) + cum_loss = 0 + + if len(running_losses) < 2 or np.mean(list(running_losses)[:-1]) > np.mean(running_losses) and len(lrs) > 0: + lr = lrs.popleft() + best_model = deepcopy(self.model) # store model for slight time savings + else: + stop = True best_loss_lr = lr_log[np.argmin(running_losses)] lr = best_loss_lr @@ -280,13 +281,13 @@ def _fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: dev_dl = DataLoader(dev_data, batch_size=self.batch_size, shuffle=False) train_dl = DataLoader(train_data, batch_size=self.batch_size, shuffle=False) - self.num_hidden = 1 - - # Find learning rate - 
# keep the weights + # Find learning rate & keep initial weights self._init_net(train_data) if not self.lr: - self.lr, self.model = self._find_lr(train_dl) + sample_dl = DataLoader(train_data, + batch_size=min(len(train_data.data_frame), 32, self.batch_size), + shuffle=True) + self.lr, self.model = self._find_lr(sample_dl) # Keep on training optimizer = self._select_optimizer(lr=self.lr) From 3578cbd86c6f1b6d32ded744114cefd401696d4a Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Wed, 7 Jun 2023 21:50:50 -0700 Subject: [PATCH 07/38] move EncodedDS cache building to offline within init(). Preliminary results suggest a 10x runtime improvement across the board, with no accuracy loss --- lightwood/data/encoded_ds.py | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/lightwood/data/encoded_ds.py b/lightwood/data/encoded_ds.py index b5bf5e7dd..9b654df6f 100644 --- a/lightwood/data/encoded_ds.py +++ b/lightwood/data/encoded_ds.py @@ -33,6 +33,9 @@ def __init__(self, encoders: List[BaseEncoder], data_frame: pd.DataFrame, target self.input_length + self.encoders[col].output_size) self.input_length += self.encoders[col].output_size + # if cache enabled, we immediately build it + self.build_cache() # TODO: ensure we remove these instances from predictor object before serializing + def __len__(self): """ The length of an `EncodedDs` datasource equals the amount of rows of the original dataframe. @@ -56,21 +59,28 @@ def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]: if self.cache[idx] is not None: return self.cache[idx] + X, Y = self._encode_idxs(idx) + + if self.cache_encoded: + X = torch.cat(X, dim=1).float().squeeze() + self.cache[idx] = (X, Y) + + def _encode_idxs(self, idxs): X = [] Y = torch.FloatTensor() for col in self.data_frame: if self.encoders.get(col, None): kwargs = {} if 'dependency_data' in inspect.signature(self.encoders[col].encode).parameters: - kwargs['dependency_data'] = {dep: [self.data_frame.iloc[idx][dep]] + kwargs['dependency_data'] = {dep: [self.data_frame.iloc[idxs][dep]] for dep in self.encoders[col].dependencies} if hasattr(self.encoders[col], 'data_window'): cols = [self.target] + [f'{self.target}_timestep_{i}' for i in range(1, self.encoders[col].data_window)] - data = [self.data_frame[cols].iloc[idx].values] + data = [self.data_frame[cols].iloc[idxs].values] # TODO: this is likely to fail as is else: cols = [col] - data = self.data_frame[cols].iloc[idx].values + data = self.data_frame[cols].iloc[idxs].values.flatten() encoded_tensor = self.encoders[col].encode(data, **kwargs) if torch.isnan(encoded_tensor).any() or torch.isinf(encoded_tensor).any(): @@ -81,12 +91,18 @@ def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]: else: Y = encoded_tensor.squeeze() - if self.cache_encoded: - X = torch.cat(X, dim=1).float().squeeze() - self.cache[idx] = (X, Y) - + # concatenate features into single tensor + X = torch.concat(X, dim=1) return X, Y + def build_cache(self): + assert self.cache_encoded + idxs = list(range(len(self.data_frame))) + X, Y = self._encode_idxs(idxs) + + for i, (x, y) in enumerate(zip(X, Y)): + self.cache[i] = (x, y) + def get_column_original_data(self, column_name: str) -> pd.Series: """ Gets the original data for any given column of the `EncodedDs`. 
From 776cbdd55f9848fd1991a7f9ba0d282453c208b9 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Fri, 9 Jun 2023 12:26:43 -0400 Subject: [PATCH 08/38] fix indentation bug --- lightwood/api/json_ai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightwood/api/json_ai.py b/lightwood/api/json_ai.py index 00b511f04..0f22604ed 100644 --- a/lightwood/api/json_ai.py +++ b/lightwood/api/json_ai.py @@ -1004,8 +1004,8 @@ def code_from_json_ai(json_ai: JsonAI) -> str: if key != 'stratified_on': if key not in self.feature_cache: featurized_split = EncodedDs(self.encoders, filter_ts(data, tss), self.target) + self.feature_cache[key] = featurized_split - self.feature_cache[key] = featurized_split feature_data[key] = self.feature_cache[key] return feature_data From 5ab843643a61d88eca482fbad8ed8479a60fdad4 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Fri, 9 Jun 2023 12:28:38 -0400 Subject: [PATCH 09/38] fix indentation bug in json_ai --- lightwood/api/json_ai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightwood/api/json_ai.py b/lightwood/api/json_ai.py index 00b511f04..0f22604ed 100644 --- a/lightwood/api/json_ai.py +++ b/lightwood/api/json_ai.py @@ -1004,8 +1004,8 @@ def code_from_json_ai(json_ai: JsonAI) -> str: if key != 'stratified_on': if key not in self.feature_cache: featurized_split = EncodedDs(self.encoders, filter_ts(data, tss), self.target) + self.feature_cache[key] = featurized_split - self.feature_cache[key] = featurized_split feature_data[key] = self.feature_cache[key] return feature_data From f08bd9f87a4e469955d0814c57992f8cd320342a Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Fri, 9 Jun 2023 12:37:20 -0400 Subject: [PATCH 10/38] lint: flake8 --- lightwood/mixer/neural.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lightwood/mixer/neural.py b/lightwood/mixer/neural.py index d6612b54c..57ce00bfc 100644 --- a/lightwood/mixer/neural.py +++ b/lightwood/mixer/neural.py @@ -223,8 +223,8 @@ def _max_fit(self, train_dl, dev_dl, criterion, optimizer, scaler, stop_after, r else: if len(running_errors) >= self.loss_hist_len: delta_mean = np.average([ - running_errors[-i - 1] - running_errors[-i] for i in range(len(running_errors)-1)], - weights=[(1 / 2)**i for i in range(len(running_errors)-1)]) + running_errors[-i - 1] - running_errors[-i] for i in range(len(running_errors) - 1)], + weights=[(1 / 2)**i for i in range(len(running_errors) - 1)]) if delta_mean >= 0: break elif (time.time() - self.started) > stop_after: From 835353c7c4a93710ea89261579f89f5363732c64 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Fri, 9 Jun 2023 20:01:58 -0400 Subject: [PATCH 11/38] fix jsonai: reset feature_cache after predict calls --- lightwood/api/json_ai.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/lightwood/api/json_ai.py b/lightwood/api/json_ai.py index 0f22604ed..ba521243c 100644 --- a/lightwood/api/json_ai.py +++ b/lightwood/api/json_ai.py @@ -1216,13 +1216,14 @@ def code_from_json_ai(json_ai: JsonAI) -> str: log.info(f'[Predict phase 3/{{n_phases}}] - Calling ensemble') df = self.ensemble(encoded_ds, args=self.pred_args) -if self.pred_args.all_mixers: - return df -else: +if not self.pred_args.all_mixers: log.info(f'[Predict phase 4/{{n_phases}}] - Analyzing output') - insights, global_insights = {call(json_ai.explainer)} + df, global_insights = {call(json_ai.explainer)} self.global_insights = {{**self.global_insights, **global_insights}} - return insights 
+ +self.feature_cache = dict() # empty feature cache to avoid large predictor objects + +return df """ predict_body = align(predict_body, 2) From abba8bd3b978d9445e6c1911edeafbbe25c7a287 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Fri, 9 Jun 2023 20:23:19 -0400 Subject: [PATCH 12/38] fix edge case empty dev split --- lightwood/encoder/text/pretrained.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/lightwood/encoder/text/pretrained.py b/lightwood/encoder/text/pretrained.py index 65a2a6509..a74c0b1e5 100644 --- a/lightwood/encoder/text/pretrained.py +++ b/lightwood/encoder/text/pretrained.py @@ -112,13 +112,17 @@ def prepare( raise Exception("Encoder is already prepared.") os.environ['TOKENIZERS_PARALLELISM'] = 'true' - val_size = (len(dev_priming_data)) / len(train_priming_data) # remove empty strings (`None`s for dtype `object`) - priming_data = pd.concat([ - train_priming_data[~train_priming_data.isna()], - dev_priming_data[~dev_priming_data.isna()]] - ).tolist() + filtered_tr = train_priming_data[~train_priming_data.isna()] + filtered_dev = dev_priming_data[~dev_priming_data.isna()] + + if filtered_dev.shape[0] > 0: + priming_data = pd.concat([filtered_tr, filtered_dev]).tolist() + val_size = (len(dev_priming_data)) / len(train_priming_data) + else: + priming_data = filtered_tr.tolist() + val_size = 0.1 # leave out 0.1 for validation # Label encode the OHE/binary output for classification labels = encoded_target_values.argmax(dim=1) From 542dc7855a22c2cdb00f989b3383ab8abfe9f88c Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Fri, 9 Jun 2023 21:49:13 -0400 Subject: [PATCH 13/38] fix: create ts filtered splits, use ravel() --- lightwood/api/json_ai.py | 14 ++++++++++---- lightwood/data/encoded_ds.py | 2 +- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/lightwood/api/json_ai.py b/lightwood/api/json_ai.py index ba521243c..2240a016b 100644 --- a/lightwood/api/json_ai.py +++ b/lightwood/api/json_ai.py @@ -1002,11 +1002,17 @@ def code_from_json_ai(json_ai: JsonAI) -> str: feature_data = dict() for key, data in split_data.items(): if key != 'stratified_on': + + # compute and store two splits - full and filtered (useful for time series post-train analysis) if key not in self.feature_cache: - featurized_split = EncodedDs(self.encoders, filter_ts(data, tss), self.target) - self.feature_cache[key] = featurized_split + featurized_split = EncodedDs(self.encoders, data, self.target) + filtered_subset = EncodedDs(self.encoders, filter_ts(data, tss), self.target) + + for k, s in zip((key, f'{{key}}_filtered'), (featurized_split, filtered_subset)): + self.feature_cache[k] = s - feature_data[key] = self.feature_cache[key] + for k in (key, f'{{key}}_filtered'): + feature_data[k] = self.feature_cache[k] return feature_data @@ -1027,7 +1033,7 @@ def code_from_json_ai(json_ai: JsonAI) -> str: # Extract the featurized data into train/dev/test encoded_train_data = enc_data['train'] encoded_dev_data = enc_data['dev'] -encoded_test_data = enc_data['test'] +encoded_test_data = enc_data['test_filtered'] log.info('Training the mixers') diff --git a/lightwood/data/encoded_ds.py b/lightwood/data/encoded_ds.py index 44fb803c4..196c7f0e9 100644 --- a/lightwood/data/encoded_ds.py +++ b/lightwood/data/encoded_ds.py @@ -79,7 +79,7 @@ def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]: if col != self.target: X.append(encoded_tensor) else: - Y = encoded_tensor.squeeze() + Y = encoded_tensor.ravel() if self.cache_encoded: X = 
torch.cat(X, dim=1).float().squeeze() From 47237b7506a963d9411f18349954cfc8244f6deb Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Fri, 9 Jun 2023 22:23:18 -0400 Subject: [PATCH 14/38] fix comment --- lightwood/encoder/text/pretrained.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lightwood/encoder/text/pretrained.py b/lightwood/encoder/text/pretrained.py index a74c0b1e5..4532192bb 100644 --- a/lightwood/encoder/text/pretrained.py +++ b/lightwood/encoder/text/pretrained.py @@ -102,8 +102,9 @@ def prepare( Fine-tunes a transformer on the priming data. Transformer is fine-tuned with weight-decay on training split. - By default, underlying transformer is frozen and only final linear layer is trained. This trains faster, often as tradeoff for performance. - + + Train + Dev are concatenated together and a transformer is then fine tuned with weight-decay applied on the transformer parameters. The option to freeze the underlying transformer and only train a linear layer exists if `frozen=True`. This trains faster, with the exception that the performance is often lower than fine-tuning on internal benchmarks. + :param train_priming_data: Text data in the train set :param dev_priming_data: Text data in the dev set :param encoded_target_values: Encoded target labels in Nrows x N_output_dimension From 5be101b770c01a31d931b407095ee4c76186b5a7 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Sat, 10 Jun 2023 00:37:59 -0400 Subject: [PATCH 15/38] partially address feedback --- lightwood/encoder/numeric/numeric.py | 2 +- lightwood/mixer/neural.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lightwood/encoder/numeric/numeric.py b/lightwood/encoder/numeric/numeric.py index aa99f1921..a040d0581 100644 --- a/lightwood/encoder/numeric/numeric.py +++ b/lightwood/encoder/numeric/numeric.py @@ -74,7 +74,7 @@ def encode(self, data: pd.Series): nones = np.vectorize(lambda x: 1 if is_none(x) else 0)(data) components = [sign, log_value, exp, nones] - ret = torch.Tensor(np.array(components)).T + ret = torch.Tensor(np.asarray(components)).T return torch.Tensor(ret) def decode(self, encoded_values: Union[List[Number], torch.Tensor], decode_log: bool = None) -> list: diff --git a/lightwood/mixer/neural.py b/lightwood/mixer/neural.py index 57ce00bfc..4a66c604f 100644 --- a/lightwood/mixer/neural.py +++ b/lightwood/mixer/neural.py @@ -128,8 +128,8 @@ def _find_lr(self, dl): best_model = self.model stop = False - _, test_batch = next(enumerate(dl)) - X, Y = test_batch + dl_iter = iter(dl) + X, Y = next(dl_iter) n_steps = 10 cum_loss = 0 From 57bf20c7ceb186c7214aca80e185dc90a6598c79 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Sat, 10 Jun 2023 01:01:45 -0400 Subject: [PATCH 16/38] vectorized operations as functions, type hints --- lightwood/encoder/numeric/numeric.py | 39 +++++++++++++++++++++------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/lightwood/encoder/numeric/numeric.py b/lightwood/encoder/numeric/numeric.py index a040d0581..815e08794 100644 --- a/lightwood/encoder/numeric/numeric.py +++ b/lightwood/encoder/numeric/numeric.py @@ -48,35 +48,54 @@ def prepare(self, priming_data: pd.Series): self._abs_mean = priming_data.abs().mean() self.is_prepared = True - def encode(self, data: pd.Series): + def encode(self, data: Union[np.ndarray, pd.Series]): """ - :param data: A pandas series containing the numbers to be encoded + :param data: A pandas series or numpy array containing the numbers to be encoded 
:returns: A torch tensor with the representations of each number """ if not self.is_prepared: raise Exception('You need to call "prepare" before calling "encode" or "decode".') - # todo: wrap with try/except to cover non-real edge cases + if isinstance(data, pd.Series): + data = data.values + + data = np.nan_to_num(data, nan=0).astype(float) + if not self.positive_domain: - sign = np.vectorize(lambda x: 0 if x < 0 else 1)(data) + sign = np.vectorize(self._sign_fn, otypes=[float])(data) else: sign = np.zeros(len(data)) - log_value = np.vectorize(lambda x: math.log(abs(x)) if abs(x) > 0 else -20)(data) + log_value = np.vectorize(self._log_fn, otypes=[float])(data) log_value = np.nan_to_num(log_value, nan=0, posinf=20, neginf=-20) - exp = np.vectorize(lambda x: x / self._abs_mean)(data) - exp = np.nan_to_num(exp, nan=0, posinf=20, neginf=-20) + norm = np.vectorize(self._norm_fn, otypes=[float])(data) + norm = np.nan_to_num(norm, nan=0, posinf=20, neginf=-20) if self.is_target: - components = [sign, log_value, exp] + components = [sign, log_value, norm] else: # todo: if can't encode return 0s and log.error(f'Can\'t encode input value: {real}, exception: {e}') - nones = np.vectorize(lambda x: 1 if is_none(x) else 0)(data) - components = [sign, log_value, exp, nones] + nones = np.vectorize(self._none_fn, otypes=[float])(data) + components = [sign, log_value, norm, nones] ret = torch.Tensor(np.asarray(components)).T return torch.Tensor(ret) + @staticmethod + def _sign_fn(x: float) -> float: + return 0 if x < 0 else 1 + + @staticmethod + def _log_fn(x: float) -> float: + return math.log(abs(x)) if abs(x) > 0 else -20 + + def _norm_fn(self, x: float) -> float: + return x / self._abs_mean + + @staticmethod + def _none_fn(x: float) -> float: + return 1 if is_none(x) else 0 + def decode(self, encoded_values: Union[List[Number], torch.Tensor], decode_log: bool = None) -> list: """ :param encoded_values: The encoded values to decode into single numbers From 73737b5a45b9f086c1d270ffc9015c10e90729d8 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Sat, 10 Jun 2023 01:47:31 -0400 Subject: [PATCH 17/38] address feedback by vectorizing numerical_encoder.decode method --- lightwood/encoder/numeric/numeric.py | 74 +++++++++++++--------------- lightwood/mixer/regression.py | 5 +- 2 files changed, 39 insertions(+), 40 deletions(-) diff --git a/lightwood/encoder/numeric/numeric.py b/lightwood/encoder/numeric/numeric.py index 815e08794..52f12d0c1 100644 --- a/lightwood/encoder/numeric/numeric.py +++ b/lightwood/encoder/numeric/numeric.py @@ -1,14 +1,12 @@ import math -from typing import List, Union +from typing import Union import torch import numpy as np import pandas as pd -from torch.types import Number from type_infer.dtype import dtype from lightwood.encoder.base import BaseEncoder -from lightwood.helpers.log import log from lightwood.helpers.general import is_none @@ -96,50 +94,48 @@ def _norm_fn(self, x: float) -> float: def _none_fn(x: float) -> float: return 1 if is_none(x) else 0 - def decode(self, encoded_values: Union[List[Number], torch.Tensor], decode_log: bool = None) -> list: + def decode(self, encoded_values: torch.Tensor, decode_log: bool = None) -> list: """ :param encoded_values: The encoded values to decode into single numbers :param decode_log: Whether to decode the ``log`` or ``linear`` part of the representation, since the encoded vector contains both a log and a linear part - :returns: The decoded number + :returns: The decoded array """ # noqa + if not self.is_prepared: raise 
Exception('You need to call "prepare" before calling "encode" or "decode".') if decode_log is None: decode_log = self.decode_log - ret = [] - if isinstance(encoded_values, torch.Tensor): - encoded_values = encoded_values.tolist() - - for vector in encoded_values: - # check for none - if len(vector) == 4 and vector[-1] == 1: - ret.append(None) - continue - - # edge case: divergence - elif np.isnan(vector[0]) or vector[0] == float('inf') or \ - np.isnan(vector[1]) or vector[1] == float('inf') or \ - np.isnan(vector[2]) or vector[2] == float('inf'): - - log.error(f'Got weird target value to decode: {vector}') - real_value = pow(10, 63) - - elif decode_log: - sign = -1 if vector[0] < 0.5 else 1 - try: - real_value = math.exp(vector[1]) * sign - except OverflowError: - real_value = pow(10, 63) * sign - else: - real_value = vector[2] * self._abs_mean - - if self.positive_domain: - real_value = abs(real_value) - - # if isinstance(real_value, torch.Tensor): - # real_value = real_value.item() - ret.append(real_value) - return ret + # force = True prevents side effects on the original encoded_values + ev = encoded_values.numpy(force=True) + + # set "divergent" value as default (note: finfo.max() instead of pow(10, 63)) + ret = np.full((ev.shape[0],), dtype=float, fill_value=np.finfo(np.float64).max) + + # sign component + sign = np.ones(ev.shape[0], dtype=float) + mask_sign = ev[:, 0] < 0.5 + sign[mask_sign] = -1 + + # real component + if decode_log: + real_value = np.exp(ev[:, 1]) * sign + overflow_mask = ev[:, 1] >= 63 + real_value[overflow_mask] = 10 ** 63 + valid_mask = ~overflow_mask + else: + real_value = ev[:, 2] * self._abs_mean + valid_mask = np.ones_like(real_value, dtype=bool) + + # final filters + if self.positive_domain: + real_value = abs(real_value) + + ret[valid_mask] = real_value[valid_mask] + + nan_mask = ret[:, ] == np.nan + ret[nan_mask] = None + + return ret.tolist() # TODO: update signature on BaseEncoder and replace all encs to return ndarrays diff --git a/lightwood/mixer/regression.py b/lightwood/mixer/regression.py index 99c2a9905..fc63183cc 100644 --- a/lightwood/mixer/regression.py +++ b/lightwood/mixer/regression.py @@ -89,7 +89,10 @@ def __call__(self, ds: EncodedDs, """ # noqa X = [] for x, _ in ds: - X.append(x.tolist()) + entry = x.numpy() + if len(entry.shape) > 1: + entry = entry[0] + X.append(entry) Yh = self.model.predict(X) From 2a82d61c4a634a88bc1682f6a573eb253794eed3 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Sat, 10 Jun 2023 01:47:52 -0400 Subject: [PATCH 18/38] partially address feedback for encodedDs class --- lightwood/data/encoded_ds.py | 47 +++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/lightwood/data/encoded_ds.py b/lightwood/data/encoded_ds.py index 396b6f4cd..7af492017 100644 --- a/lightwood/data/encoded_ds.py +++ b/lightwood/data/encoded_ds.py @@ -21,10 +21,10 @@ def __init__(self, encoders: List[BaseEncoder], data_frame: pd.DataFrame, target self.data_frame = data_frame self.encoders = encoders self.target = target - self.cache_encoded = True + self.use_cache = True self.cache = [None] * len(self.data_frame) self.encoder_spans = {} - self.input_length = 0 + self.input_length = 0 # feature tensor dim # save encoder span, has to use same iterator as in __getitem__ for correct indeces for col in self.data_frame: @@ -34,7 +34,7 @@ def __init__(self, encoders: List[BaseEncoder], data_frame: pd.DataFrame, target self.input_length += self.encoders[col].output_size # if cache enabled, we 
immediately build it - self.build_cache() # TODO: ensure we remove these instances from predictor object before serializing + self.build_cache() def __len__(self): """ @@ -55,18 +55,19 @@ def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]: :return: tuple (X, y) with encoded data. """ # noqa - if self.cache_encoded: - if self.cache[idx] is not None: - return self.cache[idx] - - X, Y = self._encode_idxs(idx) + if self.use_cache and self.cache[idx] is not None: + X, Y = self.cache[idx] + else: + X, Y = self._encode_idxs([idx, ]) + if self.use_cache: + self.cache[idx] = [X, Y] + return X, Y - if self.cache_encoded: - X = torch.cat(X, dim=1).float().squeeze() - self.cache[idx] = (X, Y) + def _encode_idxs(self, idxs: list): + if not isinstance(idxs, list): + raise Exception(f"Passed indexes is not an iterable. Check the type! Index: {idxs}") - def _encode_idxs(self, idxs): - X = [] + X = torch.zeros((len(idxs), self.input_length)) Y = torch.FloatTensor() for col in self.data_frame: if self.encoders.get(col, None): @@ -87,21 +88,28 @@ def _encode_idxs(self, idxs): raise Exception(f'Encoded tensor: {encoded_tensor} contains nan or inf values, this tensor is \ the encoding of column {col} using {self.encoders[col].__class__}') if col != self.target: - X.append(encoded_tensor) + a, b = self.encoder_spans[col] + X[:, a:b] = torch.squeeze(encoded_tensor, dim=list(range(2, len(encoded_tensor.shape)))) + + # target post-processing else: - Y = encoded_tensor.ravel() + if len(encoded_tensor.shape) > 1: + Y = encoded_tensor.squeeze() + else: + Y = encoded_tensor.ravel() - # concatenate features into single tensor - X = torch.concat(X, dim=1) return X, Y def build_cache(self): - assert self.cache_encoded + """ This method builds a cache for the entire dataframe provided at initialization. """ + if not self.use_cache: + raise RuntimeError("Cannot build a cache for EncodedDS with `use_cache` set to False.") + idxs = list(range(len(self.data_frame))) X, Y = self._encode_idxs(idxs) for i, (x, y) in enumerate(zip(X, Y)): - self.cache[i] = (x, y) + self.cache[i] = [x, y] def get_column_original_data(self, column_name: str) -> pd.Series: """ @@ -157,6 +165,7 @@ class ConcatedEncodedDs(EncodedDs): """ `ConcatedEncodedDs` abstracts over multiple encoded datasources (`EncodedDs`) as if they were a single entity. """ # noqa + # TODO: We should probably delete this abstraction, it's not really useful and it adds complexity/overhead def __init__(self, encoded_ds_arr: List[EncodedDs]) -> None: # @TODO: missing super() call here? 
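A quick round trip exercises the vectorized encode/decode pair introduced in this patch. The sketch below is illustrative only; it assumes the prepare/encode/decode signatures shown above (pandas Series in, torch tensor out, Python list back) and uses made-up values.

import pandas as pd
from lightwood.encoder.numeric import NumericEncoder

enc = NumericEncoder()                              # non-target column: 4-component representation
enc.prepare(pd.Series([1.5, -2.0, 100.0]))          # fits the absolute mean used for normalization
encoded = enc.encode(pd.Series([3.0, 0.5, -7.5]))   # torch.Tensor of shape (3, 4)
decoded = enc.decode(encoded)                       # ~[3.0, 0.5, -7.5], up to float precision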
From 0273dedf55b562bb05e56cc8a30ddb5c547a0b74 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Sat, 10 Jun 2023 01:48:29 -0400 Subject: [PATCH 19/38] add support for new type_infer version (backwards compatible change) --- lightwood/encoder/text/short.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lightwood/encoder/text/short.py b/lightwood/encoder/text/short.py index e7f68186e..e4bb320c7 100644 --- a/lightwood/encoder/text/short.py +++ b/lightwood/encoder/text/short.py @@ -55,7 +55,7 @@ def prepare(self, priming_data): unique_tokens = set() max_words_per_sent = 0 for sent in no_null_sentences: - tokens = tokenize_text(sent) + tokens = list(tokenize_text(sent)) max_words_per_sent = max(max_words_per_sent, len(tokens)) for tok in tokens: unique_tokens.add(tok) @@ -78,7 +78,7 @@ def encode(self, column_data: List[str]) -> torch.Tensor: no_null_sentences = (x if x is not None else '' for x in column_data) output = [] for sent in no_null_sentences: - tokens = tokenize_text(sent) + tokens = list(tokenize_text(sent)) encoded_words = self.cae.encode(tokens) encoded_sent = self._combine_fn(encoded_words) output.append(torch.Tensor(encoded_sent)) From 74bd95c71711cc5c1a2e9375365f0f0d754ae801 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Sat, 10 Jun 2023 02:03:54 -0400 Subject: [PATCH 20/38] fix tests --- lightwood/encoder/numeric/numeric.py | 16 +++++++++++----- tests/unit_tests/encoder/numeric/test_numeric.py | 4 ++-- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/lightwood/encoder/numeric/numeric.py b/lightwood/encoder/numeric/numeric.py index 52f12d0c1..fe037708f 100644 --- a/lightwood/encoder/numeric/numeric.py +++ b/lightwood/encoder/numeric/numeric.py @@ -57,8 +57,6 @@ def encode(self, data: Union[np.ndarray, pd.Series]): if isinstance(data, pd.Series): data = data.values - data = np.nan_to_num(data, nan=0).astype(float) - if not self.positive_domain: sign = np.vectorize(self._sign_fn, otypes=[float])(data) else: @@ -72,7 +70,6 @@ def encode(self, data: Union[np.ndarray, pd.Series]): if self.is_target: components = [sign, log_value, norm] else: - # todo: if can't encode return 0s and log.error(f'Can\'t encode input value: {real}, exception: {e}') nones = np.vectorize(self._none_fn, otypes=[float])(data) components = [sign, log_value, norm, nones] @@ -114,6 +111,13 @@ def decode(self, encoded_values: torch.Tensor, decode_log: bool = None) -> list: # set "divergent" value as default (note: finfo.max() instead of pow(10, 63)) ret = np.full((ev.shape[0],), dtype=float, fill_value=np.finfo(np.float64).max) + # `none` filter (if not a target column) + if not self.is_target: + mask_none = ev[:, -1] == 1 + ret[mask_none] = np.nan + else: + mask_none = np.zeros_like(ret) + # sign component sign = np.ones(ev.shape[0], dtype=float) mask_sign = ev[:, 0] < 0.5 @@ -135,7 +139,9 @@ def decode(self, encoded_values: torch.Tensor, decode_log: bool = None) -> list: ret[valid_mask] = real_value[valid_mask] - nan_mask = ret[:, ] == np.nan - ret[nan_mask] = None + # set nan back to None + if mask_none.sum() > 0: + ret = ret.astype(object) + ret[mask_none] = None return ret.tolist() # TODO: update signature on BaseEncoder and replace all encs to return ndarrays diff --git a/tests/unit_tests/encoder/numeric/test_numeric.py b/tests/unit_tests/encoder/numeric/test_numeric.py index 57b8815d9..b81ef7a06 100644 --- a/tests/unit_tests/encoder/numeric/test_numeric.py +++ b/tests/unit_tests/encoder/numeric/test_numeric.py @@ -45,7 +45,7 @@ def 
test_encode_and_decode(self): if decoded is None: self.assertTrue((real is None) or (real != real)) else: - np.testing.assert_almost_equal(round(decoded, 10), round(real, 10)) + np.testing.assert_almost_equal(round(decoded, 6), round(real, 6)) def test_positive_domain(self): data = pd.Series([-1, -2, -100, 5, 10, 15]) @@ -69,7 +69,7 @@ def test_log_overflow_and_none(self): encoder.decode(encoder.encode(data)) for i in range(0, 70, 10): - encoder.decode([[0, pow(2, i), 0]]) + encoder.decode(torch.Tensor([[0, pow(2, i), 0]])) def test_nan_encoding(self): # Generate some numbers From 32f7a1740f3dc23fdaa79aa1c9e58dd8e5d70e66 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Sat, 10 Jun 2023 03:47:15 -0400 Subject: [PATCH 21/38] uncap torchvision to follow torch --- requirements_image.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements_image.txt b/requirements_image.txt index c35ae2276..a66506e04 100644 --- a/requirements_image.txt +++ b/requirements_image.txt @@ -1,2 +1,2 @@ -torchvision >=0.10.0,<0.11.0 +torchvision pillow >8.3.1 From 8f77c4330858d1d6a82ad610c7a72d5adc70b6db Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Sat, 10 Jun 2023 17:38:53 -0400 Subject: [PATCH 22/38] separate into two caches --- .../analysis/helpers/feature_importance.py | 1 + lightwood/data/encoded_ds.py | 86 ++++++++++++------- lightwood/encoder/numeric/numeric.py | 3 +- lightwood/mixer/neural.py | 5 +- lightwood/mixer/random_forest.py | 4 +- lightwood/mixer/regression.py | 8 +- 6 files changed, 64 insertions(+), 43 deletions(-) diff --git a/lightwood/analysis/helpers/feature_importance.py b/lightwood/analysis/helpers/feature_importance.py index ce205f388..de01e6888 100644 --- a/lightwood/analysis/helpers/feature_importance.py +++ b/lightwood/analysis/helpers/feature_importance.py @@ -81,6 +81,7 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]: shuffle_data = deepcopy(ref_data) shuffle_data.clear_cache() shuffle_data.data_frame[col] = shuffle(shuffle_data.data_frame[col].values) + shuffle_data.build_cache() # TODO: bottleneck, add a method to build a single column instead! shuffled_preds = ns.predictor(shuffle_data, args=PredictionArguments.from_dict(args)) shuffled_col_accuracy[col] = np.mean(list(evaluate_accuracies( diff --git a/lightwood/data/encoded_ds.py b/lightwood/data/encoded_ds.py index 7af492017..63484b04f 100644 --- a/lightwood/data/encoded_ds.py +++ b/lightwood/data/encoded_ds.py @@ -1,5 +1,5 @@ import inspect -from typing import List, Tuple +from typing import List, Tuple, Dict import torch import numpy as np import pandas as pd @@ -8,7 +8,7 @@ class EncodedDs(Dataset): - def __init__(self, encoders: List[BaseEncoder], data_frame: pd.DataFrame, target: str) -> None: + def __init__(self, encoders: Dict[str, BaseEncoder], data_frame: pd.DataFrame, target: str) -> None: """ Create a Lightwood datasource from a data frame and some encoders. This class inherits from `torch.utils.data.Dataset`. 
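With the constructor typed against a mapping of column name to encoder, building an EncodedDs by hand looks roughly like the sketch below. Column names, values and encoder choices are made up for illustration; in the real pipeline the encoders are prepared during featurization.

import pandas as pd
from lightwood.encoder.numeric import NumericEncoder
from lightwood.data.encoded_ds import EncodedDs

df = pd.DataFrame({'x': [1.0, 2.0, 3.0], 'y': [2.0, 4.0, 6.0]})
encoders = {'x': NumericEncoder(), 'y': NumericEncoder(is_target=True)}
for col, enc in encoders.items():
    enc.prepare(df[col])                  # encoders must be prepared before the dataset is built
ds = EncodedDs(encoders, df, target='y')  # feature/target caches are built eagerly at init
X, Y = ds[0]                              # cached encoded features and target for row 0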
@@ -21,8 +21,6 @@ def __init__(self, encoders: List[BaseEncoder], data_frame: pd.DataFrame, target self.data_frame = data_frame self.encoders = encoders self.target = target - self.use_cache = True - self.cache = [None] * len(self.data_frame) self.encoder_spans = {} self.input_length = 0 # feature tensor dim @@ -34,6 +32,10 @@ def __init__(self, encoders: List[BaseEncoder], data_frame: pd.DataFrame, target self.input_length += self.encoders[col].output_size # if cache enabled, we immediately build it + self.use_cache = True + self.cache_built = False + self.X_cache: torch.Tensor = torch.full((len(self.data_frame),), fill_value=torch.nan) + self.Y_cache: torch.Tensor = torch.full((len(self.data_frame),), fill_value=torch.nan) self.build_cache() def __len__(self): @@ -47,20 +49,23 @@ def __len__(self): def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]: """ The getter yields a tuple (X, y), where: - - `X `is a concatenation of all encoded representations of the row. Size: (n_features,) - - `y` is the encoded target + - `X `is a concatenation of all encoded representations of the row. Size: (B, n_features) + - `y` is the encoded target. Size: (B, n_features) :param idx: index of the row to access. :return: tuple (X, y) with encoded data. """ # noqa - if self.use_cache and self.cache[idx] is not None: - X, Y = self.cache[idx] + if self.use_cache and self.X_cache[idx] is not torch.nan: + X = self.X_cache[idx, :] + Y = self.Y_cache[idx] else: X, Y = self._encode_idxs([idx, ]) if self.use_cache: - self.cache[idx] = [X, Y] + self.X_cache[idx, :] = X + self.Y_cache[idx, :] = Y + return X, Y def _encode_idxs(self, idxs: list): @@ -68,7 +73,7 @@ def _encode_idxs(self, idxs: list): raise Exception(f"Passed indexes is not an iterable. Check the type! Index: {idxs}") X = torch.zeros((len(idxs), self.input_length)) - Y = torch.FloatTensor() + Y = torch.zeros((len(idxs),)) for col in self.data_frame: if self.encoders.get(col, None): kwargs = {} @@ -93,23 +98,18 @@ def _encode_idxs(self, idxs: list): # target post-processing else: - if len(encoded_tensor.shape) > 1: - Y = encoded_tensor.squeeze() - else: - Y = encoded_tensor.ravel() + Y = encoded_tensor - return X, Y + if len(encoded_tensor.shape) > 2: + Y = encoded_tensor.squeeze() - def build_cache(self): - """ This method builds a cache for the entire dataframe provided at initialization. """ - if not self.use_cache: - raise RuntimeError("Cannot build a cache for EncodedDS with `use_cache` set to False.") + if len(encoded_tensor.shape) < 2: + Y = encoded_tensor.unsqueeze(1) - idxs = list(range(len(self.data_frame))) - X, Y = self._encode_idxs(idxs) + # else: + # Y = encoded_tensor.ravel() - for i, (x, y) in enumerate(zip(X, Y)): - self.cache[i] = [x, y] + return X, Y def get_column_original_data(self, column_name: str) -> pd.Series: """ @@ -127,20 +127,35 @@ def get_encoded_column_data(self, column_name: str) -> torch.Tensor: :param column_name: name of the column. :return: A `torch.Tensor` with the encoded data of the `column_name` column. 
""" + if self.use_cache and self.cache_built: + if column_name == self.target and self.Y_cache is not None: + return self.Y_cache + elif self.X_cache is not torch.nan: + a, b = self.encoder_spans[column_name] + return self.X_cache[:, a:b] + kwargs = {} if 'dependency_data' in inspect.signature(self.encoders[column_name].encode).parameters: deps = [dep for dep in self.encoders[column_name].dependencies if dep in self.data_frame.columns] - kwargs['dependency_data'] = {dep: self.data_frame[dep].tolist() for dep in deps} + kwargs['dependency_data'] = {dep: self.data_frame[dep] for dep in deps} encoded_data = self.encoders[column_name].encode(self.data_frame[column_name], **kwargs) if torch.isnan(encoded_data).any() or torch.isinf(encoded_data).any(): raise Exception(f'Encoded tensor: {encoded_data} contains nan or inf values') if not isinstance(encoded_data, torch.Tensor): raise Exception( - f'The encoder: {self.encoders[column_name]} for column: {column_name} does not return a Tensor !') + f'The encoder: {self.encoders[column_name]} for column: {column_name} does not return a Tensor!') + + if self.use_cache and not self.cache_built: + if column_name == self.target: + self.Y_cache = encoded_data + else: + a, b = self.encoder_spans[column_name] + self.X_cache = self.X_cache[:, a:b] + return encoded_data - def get_encoded_data(self, include_target=True) -> torch.Tensor: + def get_encoded_data(self, include_target: bool = True) -> torch.Tensor: """ Gets all encoded data. @@ -154,11 +169,22 @@ def get_encoded_data(self, include_target=True) -> torch.Tensor: return torch.cat(encoded_dfs, 1) + def build_cache(self): + """ This method builds a cache for the entire dataframe provided at initialization. """ + if not self.use_cache: + raise RuntimeError("Cannot build a cache for EncodedDS with `use_cache` set to False.") + + idxs = list(range(len(self.data_frame))) + X, Y = self._encode_idxs(idxs) + self.X_cache = X + self.Y_cache = Y + self.cache_built = True + def clear_cache(self): - """ - Clears the `EncodedDs` cache. - """ - self.cache = [None] * len(self.data_frame) + """ Clears the `EncodedDs` cache. 
""" + self.X_cache = torch.full((len(self.data_frame),), fill_value=torch.nan) + self.Y_cache = torch.full((len(self.data_frame),), fill_value=torch.nan) + self.cache_built = False class ConcatedEncodedDs(EncodedDs): diff --git a/lightwood/encoder/numeric/numeric.py b/lightwood/encoder/numeric/numeric.py index fe037708f..79edc4425 100644 --- a/lightwood/encoder/numeric/numeric.py +++ b/lightwood/encoder/numeric/numeric.py @@ -73,8 +73,7 @@ def encode(self, data: Union[np.ndarray, pd.Series]): nones = np.vectorize(self._none_fn, otypes=[float])(data) components = [sign, log_value, norm, nones] - ret = torch.Tensor(np.asarray(components)).T - return torch.Tensor(ret) + return torch.Tensor(np.asarray(components)).T @staticmethod def _sign_fn(x: float) -> float: diff --git a/lightwood/mixer/neural.py b/lightwood/mixer/neural.py index 4a66c604f..d3e59ec77 100644 --- a/lightwood/mixer/neural.py +++ b/lightwood/mixer/neural.py @@ -250,8 +250,9 @@ def _error(self, dev_dl, criterion) -> float: def _init_net(self, ds: EncodedDs): self.net_class = DefaultNet if self.net_name == 'DefaultNet' else ArNet - net_kwargs = {'input_size': len(ds[0][0]), - 'output_size': len(ds[0][1]), + X, Y = ds[0] + net_kwargs = {'input_size': len(X), + 'output_size': len(Y), 'num_hidden': self.num_hidden, 'dropout': 0} diff --git a/lightwood/mixer/random_forest.py b/lightwood/mixer/random_forest.py index 4d79fb9d1..e4ac48106 100644 --- a/lightwood/mixer/random_forest.py +++ b/lightwood/mixer/random_forest.py @@ -14,7 +14,7 @@ from type_infer.dtype import dtype from lightwood.helpers.log import log from lightwood.encoder.base import BaseEncoder -from lightwood.data.encoded_ds import ConcatedEncodedDs, EncodedDs +from lightwood.data.encoded_ds import EncodedDs, ConcatedEncodedDs from lightwood.mixer.base import BaseMixer from lightwood.api.types import PredictionArguments @@ -203,7 +203,7 @@ def __call__(self, ds: EncodedDs, :return: dataframe with predictions. 
""" - data = ds.get_encoded_data(include_target=False) + data = ds.get_encoded_data(include_target=False).numpy() if self.is_classifier: predictions = self.model.predict_proba(data) diff --git a/lightwood/mixer/regression.py b/lightwood/mixer/regression.py index fc63183cc..88b2ab709 100644 --- a/lightwood/mixer/regression.py +++ b/lightwood/mixer/regression.py @@ -87,13 +87,7 @@ def __call__(self, ds: EncodedDs, :returns: A dataframe cotaining the decoded predictions and (depending on the args) additional information such as the probabilites for each target class """ # noqa - X = [] - for x, _ in ds: - entry = x.numpy() - if len(entry.shape) > 1: - entry = entry[0] - X.append(entry) - + X = ds.get_encoded_data(include_target=False) Yh = self.model.predict(X) decoded_predictions = self.target_encoder.decode(torch.Tensor(Yh)) From 89fcfa88d18de07bcc2b95c027f79cdcb9641098 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Sat, 10 Jun 2023 18:41:17 -0400 Subject: [PATCH 23/38] grouped ts support --- lightwood/data/encoded_ds.py | 2 +- lightwood/encoder/array/ts_num_array.py | 12 +- lightwood/encoder/numeric/ts_numeric.py | 143 ++++++++++++------------ lightwood/mixer/neural_ts.py | 8 +- 4 files changed, 80 insertions(+), 85 deletions(-) diff --git a/lightwood/data/encoded_ds.py b/lightwood/data/encoded_ds.py index 63484b04f..d9ba4e498 100644 --- a/lightwood/data/encoded_ds.py +++ b/lightwood/data/encoded_ds.py @@ -83,7 +83,7 @@ def _encode_idxs(self, idxs: list): if hasattr(self.encoders[col], 'data_window'): cols = [self.target] + [f'{self.target}_timestep_{i}' for i in range(1, self.encoders[col].data_window)] - data = [self.data_frame[cols].iloc[idxs].values] # TODO: this is likely to fail as is + data = self.data_frame[cols].iloc[idxs].values else: cols = [col] data = self.data_frame[cols].iloc[idxs].values.flatten() diff --git a/lightwood/encoder/array/ts_num_array.py b/lightwood/encoder/array/ts_num_array.py index b61dae507..852f7c3c4 100644 --- a/lightwood/encoder/array/ts_num_array.py +++ b/lightwood/encoder/array/ts_num_array.py @@ -2,13 +2,14 @@ import torch import torch.nn.functional as F +import numpy as np from lightwood.encoder import BaseEncoder from lightwood.encoder.numeric import TsNumericEncoder class TsArrayNumericEncoder(BaseEncoder): - def __init__(self, timesteps: int, is_target: bool = False, positive_domain: bool = False, grouped_by=None): + def __init__(self, timesteps: int, is_target: bool = False, positive_domain: bool = False, grouped_by=None, nan=0): """ This encoder handles arrays of numerical time series data by wrapping the numerical encoder with behavior specific to time series tasks. 
@@ -23,6 +24,7 @@ def __init__(self, timesteps: int, is_target: bool = False, positive_domain: boo self.dependencies = grouped_by self.data_window = timesteps self.positive_domain = positive_domain + self.nan_value = nan self.sub_encoder = TsNumericEncoder(is_target=is_target, positive_domain=positive_domain, grouped_by=grouped_by) self.output_size = self.data_window * self.sub_encoder.output_size @@ -56,9 +58,9 @@ def encode(self, data: Iterable[Iterable], dependency_data: Optional[Dict[str, s for series in data: ret.append(self.encode_one(series, dependency_data=dependency_data)) - return torch.vstack(ret) + return torch.vstack(ret).nan_to_num(self.nan_value) - def encode_one(self, data: Iterable, dependency_data: Optional[Dict[str, str]] = {}) -> torch.Tensor: + def encode_one(self, data: np.ndarray, dependency_data: Optional[Dict[str, str]] = {}) -> torch.Tensor: """ Encodes a single windowed slice of any given time series. @@ -70,8 +72,8 @@ def encode_one(self, data: Iterable, dependency_data: Optional[Dict[str, str]] = """ # noqa ret = [] - for data_point in data: - ret.append(self.sub_encoder.encode([data_point], dependency_data=dependency_data)) + for data_point in data.reshape(-1, 1): + ret.append(self.sub_encoder.encode(data_point, dependency_data=dependency_data)) ret = torch.hstack(ret) padding_size = self.output_size - ret.shape[-1] diff --git a/lightwood/encoder/numeric/ts_numeric.py b/lightwood/encoder/numeric/ts_numeric.py index 3203e355a..8773373e3 100644 --- a/lightwood/encoder/numeric/ts_numeric.py +++ b/lightwood/encoder/numeric/ts_numeric.py @@ -1,9 +1,10 @@ -import math +from typing import Union, List, Dict + import torch import numpy as np +import pandas as pd + from lightwood.encoder.numeric import NumericEncoder -from lightwood.helpers.general import is_none -from lightwood.helpers.log import log class TsNumericEncoder(NumericEncoder): @@ -20,95 +21,93 @@ def __init__(self, is_target: bool = False, positive_domain: bool = False, group self.dependencies = grouped_by self.output_size = 1 - def encode(self, data, dependency_data={}): + def encode(self, data: Union[np.ndarray, pd.Series], dependency_data: Dict[str, List[pd.Series]] = {}): """ + :param data: A pandas series containing the numbers to be encoded :param dependency_data: dict with grouped_by column info, to retrieve the correct normalizer for each datum + + :returns: A torch tensor with the representations of each number """ # noqa if not self.is_prepared: raise Exception('You need to call "prepare" before calling "encode" or "decode".') + if not dependency_data: dependency_data = {'__default': [None] * len(data)} - ret = [] - for real, group in zip(data, list(zip(*dependency_data.values()))): + if isinstance(data, pd.Series): + data = data.values + + # get array of series-wise observed means + if self.normalizers is None: + means = np.full((len(data)), fill_value=self._abs_mean) + else: + # use global mean as default for novel series try: - real = float(real) + means = np.full((len(data)), fill_value=self.normalizers['__default'].abs_mean) except Exception: - try: - real = float(real.replace(',', '.')) - except Exception: - real = None - if self.is_target: - vector = [0] - if group is not None and self.normalizers is not None: - try: - mean = self.normalizers[tuple(group)].abs_mean - except KeyError: - # novel group-by, we use default normalizer mean - mean = self.normalizers['__default'].abs_mean - else: - mean = self._abs_mean + print('!') - if not is_none(real): - vector[0] = real / mean if mean != 0 
else real + def _get_group_mean(group) -> float: + if (group, ) in self.normalizers: + return self.normalizers[(group, )].abs_mean else: - pass - # This should raise an exception *once* we fix the TsEncoder such that this doesn't get feed `nan` - # raise Exception(f'Can\'t encode target value: {real}') - else: - vector = [0] - try: - if not is_none(real): - vector[0] = real / self._abs_mean - except Exception as e: - log.error(f'Can\'t encode input value: {real}, exception: {e}') - - ret.append(vector) - - return torch.Tensor(ret) - - def decode(self, encoded_values, decode_log=None, dependency_data=None): + return self.normalizers['__default'].abs_mean + + for i, group in enumerate(list(zip(*dependency_data.values()))): # TODO: support multigroup + if group is not None: + means = np.vectorize(_get_group_mean, otypes=[float])(group[0].values) + + def _norm_fn(x: float, mean: float) -> float: + return x / mean + + # nones = np.vectorize(self._none_fn, otypes=[float])(data) # TODO + encoded = np.vectorize(_norm_fn, otypes=[float])(data, means) + # encoded[nones] = 0 # if measurement is None, it is zeroed out # TODO + + # TODO: mask for where mean is 0, then pass real as-is + + return torch.Tensor(encoded).unsqueeze(1) + + def decode(self, encoded_values: torch.Tensor, decode_log: bool = None, dependency_data=None): if not self.is_prepared: raise Exception('You need to call "prepare" before calling "encode" or "decode".') - if decode_log is None: - decode_log = self.decode_log + assert isinstance(encoded_values, torch.Tensor), 'It is not a tensor!' # TODO: debug purposes + assert not decode_log # TODO: debug purposes - ret = [] if not dependency_data: dependency_data = {'__default': [None] * len(encoded_values)} - if isinstance(encoded_values, torch.Tensor): - encoded_values = encoded_values.tolist() - - for vector, group in zip(encoded_values, list(zip(*dependency_data.values()))): - if self.is_target: - if np.isnan(vector[0]) or vector[0] == float('inf'): - log.error(f'Got weird target value to decode: {vector}') - real_value = pow(10, 63) - else: - if decode_log: - sign = -1 if vector[0] < 0 else 1 - try: - real_value = math.exp(vector[0]) * sign - except OverflowError: - real_value = pow(10, 63) * sign + + # force = True prevents side effects on the original encoded_values + ev = encoded_values.numpy(force=True) + + # set global mean as default + ret = np.full((ev.shape[0],), dtype=float, fill_value=self._abs_mean) + + # TODO: perhaps capture nan, infs, etc and set to pow(10,63)? 
+ + # set means array + if self.normalizers is None: + means = np.full((ev.shape[0],), fill_value=self._abs_mean) + else: + means = np.full((len(encoded_values)), fill_value=self.normalizers['__default'].abs_mean) + for i, group in enumerate(list(zip(*dependency_data.values()))): + if group is not None: + if tuple(group) in self.normalizers: + means[i] = self.normalizers[tuple(group)].abs_mean else: - if group is not None and self.normalizers is not None: - try: - mean = self.normalizers[tuple(group)].abs_mean - except KeyError: - # decode new group with default normalizer - mean = self.normalizers['__default'].abs_mean - else: - mean = self._abs_mean + means[i] = self.normalizers['__default'].abs_mean + else: + means[i] = self._abs_mean - real_value = vector[0] * mean + # set real value + real_value = np.multiply(ev[:].reshape(-1,), means) + valid_mask = np.ones_like(real_value, dtype=bool) - if self.positive_domain: - real_value = abs(real_value) + # final filters + if self.positive_domain: + real_value = abs(real_value) - else: - real_value = vector[0] * self._abs_mean + ret[valid_mask] = real_value[valid_mask] # TODO probably not needed - ret.append(real_value) - return ret + return ret.tolist() diff --git a/lightwood/mixer/neural_ts.py b/lightwood/mixer/neural_ts.py index ef34b53f7..f40f880a1 100644 --- a/lightwood/mixer/neural_ts.py +++ b/lightwood/mixer/neural_ts.py @@ -7,10 +7,8 @@ import torch from torch import nn -import torch_optimizer as ad_optim from torch.cuda.amp import GradScaler from torch.utils.data import DataLoader -from torch.optim.optimizer import Optimizer from type_infer.dtype import dtype from lightwood.api.types import PredictionArguments @@ -76,10 +74,6 @@ def _select_criterion(self) -> torch.nn.Module: return criterion - def _select_optimizer(self) -> Optimizer: - optimizer = ad_optim.Ranger(self.model.parameters(), lr=self.lr) - return optimizer - def _fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: """ :param train_data: The network is fit/trained on this @@ -109,7 +103,7 @@ def _fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: self.lr, self.model = self._find_lr(train_dl) # Keep on training - optimizer = self._select_optimizer() + optimizer = self._select_optimizer(lr=self.lr) criterion = self._select_criterion() scaler = GradScaler() From 85922fc6e6e9a76e4f2dbc016867dae60c610437 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Sat, 10 Jun 2023 18:51:57 -0400 Subject: [PATCH 24/38] ts tests pass --- lightwood/encoder/numeric/ts_numeric.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lightwood/encoder/numeric/ts_numeric.py b/lightwood/encoder/numeric/ts_numeric.py index 8773373e3..55590d5aa 100644 --- a/lightwood/encoder/numeric/ts_numeric.py +++ b/lightwood/encoder/numeric/ts_numeric.py @@ -54,8 +54,11 @@ def _get_group_mean(group) -> float: return self.normalizers['__default'].abs_mean for i, group in enumerate(list(zip(*dependency_data.values()))): # TODO: support multigroup - if group is not None: - means = np.vectorize(_get_group_mean, otypes=[float])(group[0].values) + if group[0] is not None: + try: + means = np.vectorize(_get_group_mean, otypes=[float])(group[0].values) + except Exception: + print("!") def _norm_fn(x: float, mean: float) -> float: return x / mean From 4519f81d77a10c53187749b3ef2083849ce6e07c Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Sun, 11 Jun 2023 21:33:16 -0400 Subject: [PATCH 25/38] disable optuna by default on RF mixer --- 
lightwood/api/json_ai.py | 1 - lightwood/encoder/numeric/ts_numeric.py | 10 ++-------- lightwood/mixer/neural.py | 2 +- lightwood/mixer/random_forest.py | 12 +++--------- 4 files changed, 6 insertions(+), 19 deletions(-) diff --git a/lightwood/api/json_ai.py b/lightwood/api/json_ai.py index 2240a016b..87b73a5e1 100644 --- a/lightwood/api/json_ai.py +++ b/lightwood/api/json_ai.py @@ -617,7 +617,6 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI: mixers[i]["args"]["target_encoder"] = mixers[i]["args"].get( "target_encoder", "$encoders[self.target]" ) - mixers[i]["args"]["use_optuna"] = True elif mixers[i]["module"] == "LightGBMArray": mixers[i]["args"]["input_cols"] = mixers[i]["args"].get( diff --git a/lightwood/encoder/numeric/ts_numeric.py b/lightwood/encoder/numeric/ts_numeric.py index 55590d5aa..937413208 100644 --- a/lightwood/encoder/numeric/ts_numeric.py +++ b/lightwood/encoder/numeric/ts_numeric.py @@ -42,10 +42,7 @@ def encode(self, data: Union[np.ndarray, pd.Series], dependency_data: Dict[str, means = np.full((len(data)), fill_value=self._abs_mean) else: # use global mean as default for novel series - try: - means = np.full((len(data)), fill_value=self.normalizers['__default'].abs_mean) - except Exception: - print('!') + means = np.full((len(data)), fill_value=self.normalizers['__default'].abs_mean) def _get_group_mean(group) -> float: if (group, ) in self.normalizers: @@ -55,10 +52,7 @@ def _get_group_mean(group) -> float: for i, group in enumerate(list(zip(*dependency_data.values()))): # TODO: support multigroup if group[0] is not None: - try: - means = np.vectorize(_get_group_mean, otypes=[float])(group[0].values) - except Exception: - print("!") + means = np.vectorize(_get_group_mean, otypes=[float])(group[0].values) def _norm_fn(x: float, mean: float) -> float: return x / mean diff --git a/lightwood/mixer/neural.py b/lightwood/mixer/neural.py index d3e59ec77..3b552db89 100644 --- a/lightwood/mixer/neural.py +++ b/lightwood/mixer/neural.py @@ -166,7 +166,7 @@ def _find_lr(self, dl): else: stop = True - best_loss_lr = lr_log[np.argmin(running_losses)] + best_loss_lr = lr_log[np.nanargmin(running_losses)] # nanargmin ignores nans that may arise lr = best_loss_lr log.info(f'Found learning rate of: {lr}') return lr, best_model diff --git a/lightwood/mixer/random_forest.py b/lightwood/mixer/random_forest.py index e4ac48106..89f6ca682 100644 --- a/lightwood/mixer/random_forest.py +++ b/lightwood/mixer/random_forest.py @@ -33,8 +33,8 @@ def __init__( target: str, dtype_dict: Dict[str, str], fit_on_dev: bool, - use_optuna: bool, - target_encoder: BaseEncoder + target_encoder: BaseEncoder, + use_optuna: bool = False, ): """ The `RandomForest` mixer supports both regression and classification tasks. 
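With the implicit use_optuna override removed from json_ai.py, the hyperparameter search becomes opt-in. A hedged sketch of turning it back on through a JsonAI override follows; the import locations and the json_ai.model['args']['submodels'] accessor are assumed from the standard lightwood API and may differ by version, and 'data.csv'/'price' are placeholders.

import pandas as pd
from lightwood.api.high_level import ProblemDefinition, json_ai_from_problem

df = pd.read_csv('data.csv')                              # placeholder dataset
pdef = ProblemDefinition.from_dict({'target': 'price'})
json_ai = json_ai_from_problem(df, problem_definition=pdef)
for submodel in json_ai.model['args']['submodels']:       # assumed location of the mixer list
    if submodel['module'] == 'RandomForest':
        submodel['args']['use_optuna'] = True             # opt back in to the optuna search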
@@ -100,7 +100,6 @@ def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: init_params = { 'n_estimators': 50, 'max_depth': 5, - 'max_features': 1., 'bootstrap': True, 'n_jobs': -1, 'random_state': 0 @@ -128,15 +127,10 @@ def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: else (mean_squared_error, 'predict') def objective(trial: trial_module.Trial): - criterion = trial.suggest_categorical("criterion", - ["gini", "entropy"]) if self.is_classifier else 'squared_error' + criterion = trial.suggest_categorical("criterion", "gini") if self.is_classifier else 'squared_error' params = { 'n_estimators': trial.suggest_int('n_estimators', 2, 512), - 'max_depth': trial.suggest_int('max_depth', 2, 15), - 'min_samples_split': trial.suggest_int("min_samples_split", 2, 20), - 'min_samples_leaf': trial.suggest_int("min_samples_leaf", 1, 20), - 'max_features': trial.suggest_float("max_features", 0.1, 1), 'criterion': criterion, } From 0ab1f33bdbecc8caea5de50400ba4ec08322c3f1 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Mon, 12 Jun 2023 16:26:57 -0400 Subject: [PATCH 26/38] fix find_lr --- lightwood/mixer/neural.py | 46 +++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/lightwood/mixer/neural.py b/lightwood/mixer/neural.py index 3b552db89..15153fc73 100644 --- a/lightwood/mixer/neural.py +++ b/lightwood/mixer/neural.py @@ -112,13 +112,13 @@ def _select_criterion(self) -> torch.nn.Module: return criterion - def _select_optimizer(self, lr) -> Optimizer: - optimizer = ad_optim.Ranger(self.model.parameters(), lr=lr, weight_decay=2e-2) + def _select_optimizer(self, model, lr) -> Optimizer: + optimizer = ad_optim.Ranger(model.parameters(), lr=lr, weight_decay=2e-2) return optimizer - def _find_lr(self, dl): - lr = 1e-5 # good starting point as search escalates - lrs = deque([5e-5, 1e-4, 5e-4, 1e-3, 2e-3, 3e-3, 5e-3, 1e-2, 5e-2, 1e-1]) + def _find_lr(self, train_data): + lr = 1e-4 # good starting point as search escalates + lrs = deque([5e-4, 1e-3, 2e-3, 3e-3, 5e-3, 1e-2, 5e-2, 1e-1]) starting_model = deepcopy(self.model) criterion = self._select_criterion() scaler = GradScaler() @@ -128,17 +128,21 @@ def _find_lr(self, dl): best_model = self.model stop = False - dl_iter = iter(dl) - X, Y = next(dl_iter) n_steps = 10 cum_loss = 0 while stop is False: - # overfit learning on first sample (yes, biased, but we only really want an intuition on what LR is decent) - optimizer = self._select_optimizer(lr=lr) - self.model = starting_model + # overfit learning on first n_steps samples (biased, but we only want an intuition on what LR is decent) + dl = DataLoader(train_data, + batch_size=min(len(train_data.data_frame), 32, self.batch_size), + shuffle=False) + dl_iter = iter(dl) + self.model = deepcopy(starting_model) + self.model.train() + optimizer = self._select_optimizer(self.model, lr=lr) for i in range(n_steps): + X, Y = next(dl_iter) X = X.to(self.model.device) Y = Y.to(self.model.device) @@ -159,14 +163,17 @@ def _find_lr(self, dl): running_losses.append(cum_loss) lr_log.append(lr) cum_loss = 0 + lr = lrs.popleft() + if len(lrs) == 0: + stop = True - if len(running_losses) < 2 or np.mean(list(running_losses)[:-1]) > np.mean(running_losses) and len(lrs) > 0: - lr = lrs.popleft() + # store model if best so far + inv_running_losses = list(running_losses)[::-1] # invert so when tied we pick the most aggresive LR + best_loss_idx = np.nanargmin(inv_running_losses) # nanargmin ignores nans that may arise + if best_loss_idx == 0: 
best_model = deepcopy(self.model) # store model for slight time savings - else: - stop = True + best_loss_lr = lr_log[-1] - best_loss_lr = lr_log[np.nanargmin(running_losses)] # nanargmin ignores nans that may arise lr = best_loss_lr log.info(f'Found learning rate of: {lr}') return lr, best_model @@ -285,13 +292,10 @@ def _fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: # Find learning rate & keep initial weights self._init_net(train_data) if not self.lr: - sample_dl = DataLoader(train_data, - batch_size=min(len(train_data.data_frame), 32, self.batch_size), - shuffle=True) - self.lr, self.model = self._find_lr(sample_dl) + self.lr, self.model = self._find_lr(train_data) # Keep on training - optimizer = self._select_optimizer(lr=self.lr) + optimizer = self._select_optimizer(self.model, lr=self.lr) criterion = self._select_criterion() scaler = GradScaler() @@ -321,7 +325,7 @@ def partial_fit(self, train_data: EncodedDs, dev_data: EncodedDs, args: Optional self.started = time.time() train_dl = DataLoader(train_data, batch_size=self.batch_size, shuffle=True) dev_dl = DataLoader(dev_data, batch_size=self.batch_size, shuffle=True) - optimizer = self._select_optimizer(lr=self.lr) + optimizer = self._select_optimizer(self.model, lr=self.lr) criterion = self._select_criterion() scaler = GradScaler() From 44d6b0873dfb5d1fd7e965c864b14a5f3a2c9c50 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Mon, 12 Jun 2023 17:08:02 -0400 Subject: [PATCH 27/38] strict criteria to deploy CAE when OHE vector would be larger than 16" --- lightwood/api/json_ai.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lightwood/api/json_ai.py b/lightwood/api/json_ai.py index 87b73a5e1..3a7072130 100644 --- a/lightwood/api/json_ai.py +++ b/lightwood/api/json_ai.py @@ -94,7 +94,7 @@ def lookup_encoder( dtype.binary: "BinaryEncoder", dtype.categorical: "CategoricalAutoEncoder" if statistical_analysis is None - or len(statistical_analysis.histograms[col_name]) > 100 + or len(statistical_analysis.histograms[col_name]['x']) > 16 else "OneHotEncoder", dtype.tags: "MultiHotEncoder", dtype.date: "DatetimeEncoder", @@ -943,14 +943,17 @@ def code_from_json_ai(json_ai: JsonAI) -> str: parallel_encoding = parallel_encoding_check(data['train'], self.encoders) if parallel_encoding: + log.debug('Preparing in parallel...') for col_name, encoder in self.encoders.items(): if col_name != self.target and not encoder.is_trainable_encoder: prepped_encoders[col_name] = (encoder, concatenated_train_dev[col_name], 'prepare') prepped_encoders = mut_method_call(prepped_encoders) else: + log.debug('Preparing sequentially...') for col_name, encoder in self.encoders.items(): if col_name != self.target and not encoder.is_trainable_encoder: + log.debug(f'Preparing encoder for {{col_name}}...') encoder.prepare(concatenated_train_dev[col_name]) prepped_encoders[col_name] = encoder From 53c89afac8dd133a528f4c3fd056e7cfb88c16fb Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Mon, 12 Jun 2023 17:11:27 -0400 Subject: [PATCH 28/38] fix neural_ts --- lightwood/mixer/neural.py | 7 ++++++- lightwood/mixer/neural_ts.py | 4 ++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/lightwood/mixer/neural.py b/lightwood/mixer/neural.py index 15153fc73..697ef288f 100644 --- a/lightwood/mixer/neural.py +++ b/lightwood/mixer/neural.py @@ -142,7 +142,12 @@ def _find_lr(self, train_data): optimizer = self._select_optimizer(self.model, lr=lr) for i in range(n_steps): - X, Y = next(dl_iter) + try: + X, Y = 
next(dl_iter) + except StopIteration: + dl_iter = iter(dl) + X, Y = next(dl_iter) + X = X.to(self.model.device) Y = Y.to(self.model.device) diff --git a/lightwood/mixer/neural_ts.py b/lightwood/mixer/neural_ts.py index f40f880a1..813266cff 100644 --- a/lightwood/mixer/neural_ts.py +++ b/lightwood/mixer/neural_ts.py @@ -100,10 +100,10 @@ def _fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: # Find learning rate # keep the weights self._init_net(train_data) - self.lr, self.model = self._find_lr(train_dl) + self.lr, self.model = self._find_lr(train_data) # Keep on training - optimizer = self._select_optimizer(lr=self.lr) + optimizer = self._select_optimizer(self.model, lr=self.lr) criterion = self._select_criterion() scaler = GradScaler() From b16bd1a6adfe4eee9e1749f3e491c55f8063bf42 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Mon, 12 Jun 2023 17:11:46 -0400 Subject: [PATCH 29/38] vectorized tsNumArray.encode() --- lightwood/encoder/array/ts_num_array.py | 31 ++----------------------- lightwood/encoder/categorical/onehot.py | 4 ++-- lightwood/encoder/numeric/ts_numeric.py | 5 ++++ 3 files changed, 9 insertions(+), 31 deletions(-) diff --git a/lightwood/encoder/array/ts_num_array.py b/lightwood/encoder/array/ts_num_array.py index 852f7c3c4..5a20a8fac 100644 --- a/lightwood/encoder/array/ts_num_array.py +++ b/lightwood/encoder/array/ts_num_array.py @@ -1,8 +1,6 @@ from typing import List, Dict, Iterable, Optional import torch -import torch.nn.functional as F -import numpy as np from lightwood.encoder import BaseEncoder from lightwood.encoder.numeric import TsNumericEncoder @@ -54,34 +52,9 @@ def encode(self, data: Iterable[Iterable], dependency_data: Optional[Dict[str, s if not dependency_data: dependency_data = {'__default': [None] * len(data)} - ret = [] - for series in data: - ret.append(self.encode_one(series, dependency_data=dependency_data)) - - return torch.vstack(ret).nan_to_num(self.nan_value) - - def encode_one(self, data: np.ndarray, dependency_data: Optional[Dict[str, str]] = {}) -> torch.Tensor: - """ - Encodes a single windowed slice of any given time series. + ret = self.sub_encoder.encode(data, dependency_data=dependency_data) - :param data: windowed slice of a numerical time series. - :param dependency_data: used to determine the correct normalizer for the input. - - :return: an encoded time series array, as per the underlying `TsNumericEncoder` object. - The output of this encoder for all time steps is concatenated, so the final shape of the tensor is (1, NxK), where N: self.data_window and K: sub-encoder # of output features. 
- """ # noqa - ret = [] - - for data_point in data.reshape(-1, 1): - ret.append(self.sub_encoder.encode(data_point, dependency_data=dependency_data)) - - ret = torch.hstack(ret) - padding_size = self.output_size - ret.shape[-1] - - if padding_size > 0: - ret = F.pad(ret, (0, padding_size)) - - return ret + return torch.Tensor(ret).nan_to_num(self.nan_value) def decode(self, encoded_values, dependency_data=None) -> List[List]: """ diff --git a/lightwood/encoder/categorical/onehot.py b/lightwood/encoder/categorical/onehot.py index c25c09879..e72a1f59c 100644 --- a/lightwood/encoder/categorical/onehot.py +++ b/lightwood/encoder/categorical/onehot.py @@ -68,12 +68,12 @@ def prepare(self, priming_data: Iterable[str]): unq_cats = np.unique([i for i in priming_data if i is not None]).tolist() if self.use_unknown: - log.info("Encoding UNKNOWN categories as index 0") + log.debug("Encoding UNKNOWN categories as index 0") self.map = {cat: indx + 1 for indx, cat in enumerate(unq_cats)} self.map.update({_UNCOMMON_WORD: 0}) self.rev_map = {indx: cat for cat, indx in self.map.items()} else: - log.info("Encoding UNKNOWN categories as vector of all 0s") + log.debug("Encoding UNKNOWN categories as vector of all 0s") self.map = {cat: indx for indx, cat in enumerate(unq_cats)} self.rev_map = {indx: cat for cat, indx in self.map.items()} diff --git a/lightwood/encoder/numeric/ts_numeric.py b/lightwood/encoder/numeric/ts_numeric.py index 937413208..d790f5cb5 100644 --- a/lightwood/encoder/numeric/ts_numeric.py +++ b/lightwood/encoder/numeric/ts_numeric.py @@ -54,6 +54,11 @@ def _get_group_mean(group) -> float: if group[0] is not None: means = np.vectorize(_get_group_mean, otypes=[float])(group[0].values) + if len(data.shape) > 1 and data.shape[1] > 1: + if len(means.shape) == 1: + means = np.expand_dims(means, 1) + means = np.repeat(means, data.shape[1], axis=1) + def _norm_fn(x: float, mean: float) -> float: return x / mean From 5734e0c5c9fc85bb60d5c2cee046c101676d753d Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Mon, 12 Jun 2023 18:10:23 -0400 Subject: [PATCH 30/38] handle unknown param --- lightwood/encoder/categorical/binary.py | 33 +++++++++++++++++++------ 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/lightwood/encoder/categorical/binary.py b/lightwood/encoder/categorical/binary.py index 77c019a42..051cdef8c 100644 --- a/lightwood/encoder/categorical/binary.py +++ b/lightwood/encoder/categorical/binary.py @@ -7,6 +7,7 @@ from lightwood.encoder.base import BaseEncoder from lightwood.helpers.constants import _UNCOMMON_WORD +from lightwood.helpers.log import log class BinaryEncoder(BaseEncoder): @@ -34,17 +35,21 @@ def __init__( self, is_target: bool = False, target_weights: Dict[str, float] = None, + handle_unknown: str = 'use_encoded_value' ): super().__init__(is_target) """ :param is_target: Whether encoder featurizes target column :param target_weights: Percentage of total population represented by each category (from [0, 1]), as a dictionary. + :param handle_unknown: if set to `use_encoded_value`, will encode all classes with index greater than 1 to a special UNKNOWN index that decodes back to `None`. `error` will raise an error when preparing the encoder. 
""" # noqa self.map = {} # category name -> index self.rev_map = {} # index -> category name - self.output_size = 2 + self.output_size = 3 self.encoder_class_type = str + self.handle_unknown = handle_unknown + self.UNK_IDX = 2 # Weight-balance info if encoder represents target self.target_weights = None @@ -67,13 +72,17 @@ def prepare(self, priming_data: Iterable[str]): self.rev_map = {indx: cat for cat, indx in self.map.items()} # Enforce only binary; map must have exactly 2 classes. - if len(self.map) > 2: - raise ValueError(f'Issue with dtype; data has > 2 classes. All classes are: {self.map}') + if len(self.map) > 2 and self.handle_unknown == 'use_encoded_value': + log.warning('Warning: dtype for binary encoder has > 2 classes. Extra classes will be encoded to an invalid token and performance will not be optimal. Try overriding this encoder with a multi-class categorical encoder.') # noqa + log.warning(f'Observed classes are: {self.map}.') + elif self.handle_unknown == 'error': + raise Exception(f'Issue with dtype; data has > 2 classes. All classes are: {self.map}. Aborting.') # For target-only, report on relative weights of classes if self.is_target: - self.index_weights = torch.Tensor([1, 1]) # Equally wt. both classes + self.index_weights = torch.ones(self.output_size) # Equally wt. both classes + self.index_weights[self.UNK_IDX] = 0 # set unknown index to have no effect # If target weights provided, weight by inverse if self.target_weights is not None: @@ -102,13 +111,17 @@ def encode(self, column_data: Iterable[str]) -> torch.Tensor: 'You need to call "prepare" before calling "encode" or "decode".' ) - ret = torch.zeros(size=(len(column_data), 2)) + ret = torch.zeros(size=(len(column_data), self.output_size)) for idx, word in enumerate(column_data): index = self.map.get(word, None) - if index is not None: - ret[idx, index] = 1 + if index is None: + index = self.UNK_IDX # any unknown value maps to UNK_IDX + else: + index = min(index, self.UNK_IDX) # any known value beyond first two also maps to index UNK_IDX + + ret[idx, index] = 1 return torch.Tensor(ret) @@ -130,7 +143,11 @@ def decode(self, encoded_data: torch.Tensor): if not np.any(vector): # Vector of all 0s -> unknown category ret.append(_UNCOMMON_WORD) else: - ret.append(self.rev_map[np.argmax(vector)]) + idx = np.argmax(vector) + if idx == self.UNK_IDX: + ret.append(None) # known, but not either of the supported categories + else: + ret.append(self.rev_map[idx]) return ret From 399715a99c06f0416402cbf24ea14d0562d26a40 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Mon, 12 Jun 2023 18:41:27 -0400 Subject: [PATCH 31/38] tests pass --- lightwood/__about__.py | 2 +- lightwood/encoder/categorical/binary.py | 19 ++++++++----------- requirements.txt | 6 +++--- .../encoder/categorical/test_binary.py | 2 +- 4 files changed, 13 insertions(+), 16 deletions(-) diff --git a/lightwood/__about__.py b/lightwood/__about__.py index 42dc89079..7605812b6 100644 --- a/lightwood/__about__.py +++ b/lightwood/__about__.py @@ -1,6 +1,6 @@ __title__ = 'lightwood' __package_name__ = 'lightwood' -__version__ = '23.5.1.1' +__version__ = '23.6.2.0' __description__ = "Lightwood is a toolkit for automatic machine learning model building" __email__ = "community@mindsdb.com" __author__ = 'MindsDB Inc' diff --git a/lightwood/encoder/categorical/binary.py b/lightwood/encoder/categorical/binary.py index 051cdef8c..8c29f8f0d 100644 --- a/lightwood/encoder/categorical/binary.py +++ b/lightwood/encoder/categorical/binary.py @@ -41,12 +41,12 @@ def 
__init__( """ :param is_target: Whether encoder featurizes target column :param target_weights: Percentage of total population represented by each category (from [0, 1]), as a dictionary. - :param handle_unknown: if set to `use_encoded_value`, will encode all classes with index greater than 1 to a special UNKNOWN index that decodes back to `None`. `error` will raise an error when preparing the encoder. + :param handle_unknown: if set to `use_encoded_value`, will assign all classes with index greater than 1 to a special UNKNOWN index. This doesn't affect the encoded representation of shape (B, 2). During decoding, any unknown or otherwise known but "out-of-bounds" word will be decoded back to the lightwood unknown category token. If this argument is set to `error`, the encoder will raise an error while preparing if there are more than two observed classes. """ # noqa self.map = {} # category name -> index self.rev_map = {} # index -> category name - self.output_size = 3 + self.output_size = 2 self.encoder_class_type = str self.handle_unknown = handle_unknown self.UNK_IDX = 2 @@ -73,16 +73,15 @@ def prepare(self, priming_data: Iterable[str]): # Enforce only binary; map must have exactly 2 classes. if len(self.map) > 2 and self.handle_unknown == 'use_encoded_value': - log.warning('Warning: dtype for binary encoder has > 2 classes. Extra classes will be encoded to an invalid token and performance will not be optimal. Try overriding this encoder with a multi-class categorical encoder.') # noqa + log.warning('Warning: dtype for binary encoder has > 2 classes. Extra classes will be pointed to an invalid token. Try overriding this encoder with a multi-class categorical encoder, otherwise performance may not be optimal.') # noqa log.warning(f'Observed classes are: {self.map}.') elif self.handle_unknown == 'error': - raise Exception(f'Issue with dtype; data has > 2 classes. All classes are: {self.map}. Aborting.') + raise ValueError(f'Data has > 2 classes and encoder is in strict mode. Aborting. All classes are: {self.map}.') # noqa # For target-only, report on relative weights of classes if self.is_target: self.index_weights = torch.ones(self.output_size) # Equally wt. 
both classes - self.index_weights[self.UNK_IDX] = 0 # set unknown index to have no effect # If target weights provided, weight by inverse if self.target_weights is not None: @@ -116,12 +115,10 @@ def encode(self, column_data: Iterable[str]) -> torch.Tensor: for idx, word in enumerate(column_data): index = self.map.get(word, None) - if index is None: - index = self.UNK_IDX # any unknown value maps to UNK_IDX + if index is None or index == self.UNK_IDX: + pass # any unknown value is ignored else: - index = min(index, self.UNK_IDX) # any known value beyond first two also maps to index UNK_IDX - - ret[idx, index] = 1 + ret[idx, index] = 1 return torch.Tensor(ret) @@ -145,7 +142,7 @@ def decode(self, encoded_data: torch.Tensor): else: idx = np.argmax(vector) if idx == self.UNK_IDX: - ret.append(None) # known, but not either of the supported categories + ret.append(_UNCOMMON_WORD) # known, but not either of the supported categories else: ret.append(self.rev_map[idx]) diff --git a/requirements.txt b/requirements.txt index a07b5a275..335c78f41 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ -type_infer ==0.0.9 -dataprep_ml ==0.0.8 -mindsdb-evaluator >=0.0.7 +type_infer >=0.0.10 +dataprep_ml >=0.0.9 +mindsdb-evaluator >=0.0.9 numpy nltk >=3,<3.6 python-dateutil >=2.8.1 diff --git a/tests/unit_tests/encoder/categorical/test_binary.py b/tests/unit_tests/encoder/categorical/test_binary.py index ad2aff72a..4eb7a8837 100644 --- a/tests/unit_tests/encoder/categorical/test_binary.py +++ b/tests/unit_tests/encoder/categorical/test_binary.py @@ -72,7 +72,7 @@ def test_check_only_binary(self): """ Ensure binary strictly enforces binary typing """ data = ["apple", "apple", "orange", "banana", "apple", "orange"] - enc = BinaryEncoder() + enc = BinaryEncoder(handle_unknown='error') self.assertRaises(ValueError, enc.prepare, data) def test_check_probabilities(self): From 860b69787cb7048b725e4285049f4d263fc74118 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Mon, 12 Jun 2023 18:43:39 -0400 Subject: [PATCH 32/38] reformat for clearer src --- lightwood/encoder/categorical/binary.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lightwood/encoder/categorical/binary.py b/lightwood/encoder/categorical/binary.py index 8c29f8f0d..8dad1241f 100644 --- a/lightwood/encoder/categorical/binary.py +++ b/lightwood/encoder/categorical/binary.py @@ -49,7 +49,6 @@ def __init__( self.output_size = 2 self.encoder_class_type = str self.handle_unknown = handle_unknown - self.UNK_IDX = 2 # Weight-balance info if encoder represents target self.target_weights = None @@ -115,7 +114,7 @@ def encode(self, column_data: Iterable[str]) -> torch.Tensor: for idx, word in enumerate(column_data): index = self.map.get(word, None) - if index is None or index == self.UNK_IDX: + if index is None or index >= self.output_size: pass # any unknown value is ignored else: ret[idx, index] = 1 @@ -141,7 +140,7 @@ def decode(self, encoded_data: torch.Tensor): ret.append(_UNCOMMON_WORD) else: idx = np.argmax(vector) - if idx == self.UNK_IDX: + if idx >= self.output_size: ret.append(_UNCOMMON_WORD) # known, but not either of the supported categories else: ret.append(self.rev_map[idx]) From 0f9372d230da9170f53fa511837ac16bb8941a91 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Mon, 12 Jun 2023 19:45:43 -0400 Subject: [PATCH 33/38] fix #1134 --- lightwood/api/json_ai.py | 3 ++- lightwood/encoder/numeric/numeric.py | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git 
a/lightwood/api/json_ai.py b/lightwood/api/json_ai.py index 3a7072130..58081d7b3 100644 --- a/lightwood/api/json_ai.py +++ b/lightwood/api/json_ai.py @@ -173,7 +173,8 @@ def lookup_encoder( if encoder_dict["module"] == "PretrainedLangEncoder" and not is_target: encoder_dict["args"]["output_type"] = "$dtype_dict[$target]" - if eval(encoder_dict["module"]).is_trainable_encoder: + enc_cls = eval(encoder_dict["module"]) + if enc_cls.is_trainable_encoder and hasattr(enc_cls, 'stop_after'): encoder_dict["args"]["stop_after"] = "$problem_definition.seconds_per_encoder" if is_target_predicting_encoder: diff --git a/lightwood/encoder/numeric/numeric.py b/lightwood/encoder/numeric/numeric.py index 79edc4425..c62a4ba31 100644 --- a/lightwood/encoder/numeric/numeric.py +++ b/lightwood/encoder/numeric/numeric.py @@ -57,15 +57,15 @@ def encode(self, data: Union[np.ndarray, pd.Series]): if isinstance(data, pd.Series): data = data.values + data = np.nan_to_num(data.astype(float), nan=0, posinf=20, neginf=-20) + if not self.positive_domain: sign = np.vectorize(self._sign_fn, otypes=[float])(data) else: sign = np.zeros(len(data)) - log_value = np.vectorize(self._log_fn, otypes=[float])(data) - log_value = np.nan_to_num(log_value, nan=0, posinf=20, neginf=-20) + log_value = np.vectorize(self._log_fn, otypes=[float])(data) norm = np.vectorize(self._norm_fn, otypes=[float])(data) - norm = np.nan_to_num(norm, nan=0, posinf=20, neginf=-20) if self.is_target: components = [sign, log_value, norm] From 9636253ea5b8b721da824c8fc72a0b53d828828b Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Mon, 12 Jun 2023 20:08:18 -0400 Subject: [PATCH 34/38] update example notebook --- .../custom_encoder_rulebased.ipynb | 446 ++---------------- 1 file changed, 28 insertions(+), 418 deletions(-) diff --git a/docssrc/source/tutorials/custom_encoder_rulebased/custom_encoder_rulebased.ipynb b/docssrc/source/tutorials/custom_encoder_rulebased/custom_encoder_rulebased.ipynb index 10f9a14e6..56be888cf 100644 --- a/docssrc/source/tutorials/custom_encoder_rulebased/custom_encoder_rulebased.ipynb +++ b/docssrc/source/tutorials/custom_encoder_rulebased/custom_encoder_rulebased.ipynb @@ -39,7 +39,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "raising-adventure", "metadata": { "execution": { @@ -74,7 +74,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "technical-government", "metadata": { "execution": { @@ -84,118 +84,7 @@ "shell.execute_reply": "2022-02-03T21:30:38.234810Z" } }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
modelyearpricetransmissionmileagefuelTypetaxmpgengineSize
0A1201712500Manual15735Petrol15055.41.4
1A6201616500Automatic36203Diesel2064.22.0
2A1201611000Manual29946Petrol3055.41.4
3A4201716800Automatic25952Diesel14567.32.0
4A3201917300Manual1998Petrol14549.61.0
\n", - "
" - ], - "text/plain": [ - " model year price transmission mileage fuelType tax mpg engineSize\n", - "0 A1 2017 12500 Manual 15735 Petrol 150 55.4 1.4\n", - "1 A6 2016 16500 Automatic 36203 Diesel 20 64.2 2.0\n", - "2 A1 2016 11000 Manual 29946 Petrol 30 55.4 1.4\n", - "3 A4 2017 16800 Automatic 25952 Diesel 145 67.3 2.0\n", - "4 A3 2019 17300 Manual 1998 Petrol 145 49.6 1.0" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "filename = 'https://raw.githubusercontent.com/mindsdb/benchmarks/main/benchmarks/datasets/used_car_price/data.csv'\n", "df = pd.read_csv(filename)\n", @@ -224,7 +113,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "absent-maker", "metadata": { "execution": { @@ -234,38 +123,7 @@ "shell.execute_reply": "2022-02-03T21:30:38.968531Z" } }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001B[32mINFO:lightwood-1462817:Dropping features: []\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Analyzing a sample of 6920\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:from a total population of 10668, this is equivalent to 64.9% of your data.\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Using 7 processes to deduct types.\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Infering type for: model\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Infering type for: year\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Infering type for: price\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Infering type for: transmission\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Infering type for: fuelType\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Infering type for: mileage\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Infering type for: tax\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Column year has data type integer\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Column price has data type integer\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Infering type for: mpg\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Infering type for: engineSize\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Column tax has data type integer\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Column mileage has data type integer\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Column engineSize has data type float\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Column mpg has data type float\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Column transmission has data type categorical\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Column fuelType has data type categorical\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Column model has data type categorical\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Starting statistical analysis\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Finished statistical analysis\u001B[0m\n" - ] - } - ], + "outputs": [], "source": [ "# Create the Problem Definition\n", "pdef = ProblemDefinition.from_dict({\n", @@ -287,7 +145,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "coastal-paragraph", "metadata": { "execution": { @@ -297,134 +155,7 @@ "shell.execute_reply": "2022-02-03T21:30:38.973749Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"encoders\": {\n", - " \"price\": {\n", - " \"module\": \"NumericEncoder\",\n", - " \"args\": {\n", - " \"is_target\": \"True\",\n", - " \"positive_domain\": 
\"$statistical_analysis.positive_domain\"\n", - " }\n", - " },\n", - " \"model\": {\n", - " \"module\": \"OneHotEncoder\",\n", - " \"args\": {}\n", - " },\n", - " \"year\": {\n", - " \"module\": \"NumericEncoder\",\n", - " \"args\": {}\n", - " },\n", - " \"transmission\": {\n", - " \"module\": \"OneHotEncoder\",\n", - " \"args\": {}\n", - " },\n", - " \"mileage\": {\n", - " \"module\": \"NumericEncoder\",\n", - " \"args\": {}\n", - " },\n", - " \"fuelType\": {\n", - " \"module\": \"OneHotEncoder\",\n", - " \"args\": {}\n", - " },\n", - " \"tax\": {\n", - " \"module\": \"NumericEncoder\",\n", - " \"args\": {}\n", - " },\n", - " \"mpg\": {\n", - " \"module\": \"NumericEncoder\",\n", - " \"args\": {}\n", - " },\n", - " \"engineSize\": {\n", - " \"module\": \"NumericEncoder\",\n", - " \"args\": {}\n", - " }\n", - " },\n", - " \"dtype_dict\": {\n", - " \"model\": \"categorical\",\n", - " \"year\": \"integer\",\n", - " \"price\": \"integer\",\n", - " \"transmission\": \"categorical\",\n", - " \"mileage\": \"integer\",\n", - " \"fuelType\": \"categorical\",\n", - " \"tax\": \"integer\",\n", - " \"mpg\": \"float\",\n", - " \"engineSize\": \"float\"\n", - " },\n", - " \"dependency_dict\": {},\n", - " \"model\": {\n", - " \"module\": \"BestOf\",\n", - " \"args\": {\n", - " \"submodels\": [\n", - " {\n", - " \"module\": \"Neural\",\n", - " \"args\": {\n", - " \"fit_on_dev\": true,\n", - " \"stop_after\": \"$problem_definition.seconds_per_mixer\",\n", - " \"search_hyperparameters\": true\n", - " }\n", - " },\n", - " {\n", - " \"module\": \"LightGBM\",\n", - " \"args\": {\n", - " \"stop_after\": \"$problem_definition.seconds_per_mixer\",\n", - " \"fit_on_dev\": true\n", - " }\n", - " },\n", - " {\n", - " \"module\": \"Regression\",\n", - " \"args\": {\n", - " \"stop_after\": \"$problem_definition.seconds_per_mixer\"\n", - " }\n", - " }\n", - " ],\n", - " \"args\": \"$pred_args\",\n", - " \"accuracy_functions\": \"$accuracy_functions\",\n", - " \"ts_analysis\": null\n", - " }\n", - " },\n", - " \"problem_definition\": {\n", - " \"target\": \"price\",\n", - " \"pct_invalid\": 2,\n", - " \"unbias_target\": true,\n", - " \"seconds_per_mixer\": 57024.0,\n", - " \"seconds_per_encoder\": null,\n", - " \"expected_additional_time\": 0.5703437328338623,\n", - " \"time_aim\": 259200,\n", - " \"target_weights\": null,\n", - " \"positive_domain\": false,\n", - " \"timeseries_settings\": {\n", - " \"is_timeseries\": false,\n", - " \"order_by\": null,\n", - " \"window\": null,\n", - " \"group_by\": null,\n", - " \"use_previous_target\": true,\n", - " \"horizon\": null,\n", - " \"historical_columns\": null,\n", - " \"target_type\": \"\",\n", - " \"allow_incomplete_history\": true,\n", - " \"eval_cold_start\": true,\n", - " \"interval_periods\": []\n", - " },\n", - " \"anomaly_detection\": false,\n", - " \"use_default_analysis\": true,\n", - " \"ignore_features\": [],\n", - " \"fit_on_all\": true,\n", - " \"strict_mode\": true,\n", - " \"seed_nr\": 420\n", - " },\n", - " \"identifiers\": {},\n", - " \"accuracy_functions\": [\n", - " \"r2_score\"\n", - " ]\n", - "}\n" - ] - } - ], + "outputs": [], "source": [ "print(json_ai.to_json())" ] @@ -484,7 +215,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "e03db1b0", "metadata": { "execution": { @@ -494,15 +225,7 @@ "shell.execute_reply": "2022-02-03T21:30:38.978491Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Overwriting LabelEncoder.py\n" - ] - } - ], + "outputs": [], "source": [ "%%writefile 
LabelEncoder.py\n", "\n", @@ -533,9 +256,9 @@ " is_prepared: bool\n", "\n", " is_timeseries_encoder: bool = False\n", - " is_trainable_encoder: bool = False\n", + " is_trainable_encoder: bool = True\n", "\n", - " def __init__(self, is_target: bool = False) -> None:\n", + " def __init__(self, is_target: bool = False, stop_after = 10) -> None:\n", " \"\"\"\n", " Initialize the Label Encoder\n", "\n", @@ -548,8 +271,7 @@ " # For LabelEncoder, this is always 1 (1 label per category)\n", " self.output_size = 1\n", "\n", - " # Not all encoders need to be prepared\n", - " def prepare(self, priming_data: pd.Series) -> None:\n", + " def prepare(self, train_data: pd.Series, dev_data: pd.Series) -> None:\n", " \"\"\"\n", " Create a LabelEncoder for categorical data.\n", "\n", @@ -561,7 +283,7 @@ " \"\"\"\n", "\n", " # Find all unique categories in the dataset\n", - " categories = priming_data.unique()\n", + " categories = train_data.unique()\n", "\n", " log.info(\"Categories Detected = \" + str(self.output_size))\n", "\n", @@ -608,7 +330,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "e30866c1", "metadata": { "execution": { @@ -670,7 +392,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "elementary-fusion", "metadata": { "execution": { @@ -699,7 +421,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "inappropriate-james", "metadata": { "execution": { @@ -733,7 +455,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "palestinian-harvey", "metadata": { "execution": { @@ -743,47 +465,7 @@ "shell.execute_reply": "2022-02-03T21:30:39.355539Z" } }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001B[32mINFO:lightwood-1462817:Performing statistical analysis on data\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Starting statistical analysis\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Finished statistical analysis\u001B[0m\n", - "\u001B[37mDEBUG:lightwood-1462817: `analyze_data` runtime: 0.14 seconds\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Cleaning the data\u001B[0m\n", - "\u001B[37mDEBUG:lightwood-1462817: `preprocess` runtime: 0.05 seconds\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Splitting the data into train/test\u001B[0m\n", - "\u001B[37mDEBUG:lightwood-1462817: `split` runtime: 0.0 seconds\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Preparing the encoders\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Encoder prepping dict length of: 1\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Encoder prepping dict length of: 2\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Encoder prepping dict length of: 3\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Encoder prepping dict length of: 4\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Encoder prepping dict length of: 5\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Encoder prepping dict length of: 6\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Encoder prepping dict length of: 7\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Encoder prepping dict length of: 8\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Encoder prepping dict length of: 9\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Categories Detected = 1\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Categories Detected = 1\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Categories Detected = 1\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Done running for: 
price\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Done running for: model\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Done running for: year\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Done running for: transmission\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Done running for: mileage\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Done running for: fuelType\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Done running for: tax\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Done running for: mpg\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Done running for: engineSize\u001B[0m\n", - "\u001B[37mDEBUG:lightwood-1462817: `prepare` runtime: 0.16 seconds\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Featurizing the data\u001B[0m\n", - "\u001B[37mDEBUG:lightwood-1462817: `featurize` runtime: 0.0 seconds\u001B[0m\n" - ] - } - ], + "outputs": [], "source": [ "# Perform Stats Analysis\n", "predictor.analyze_data(df)\n", @@ -811,7 +493,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "silent-dealing", "metadata": { "execution": { @@ -821,76 +503,7 @@ "shell.execute_reply": "2022-02-03T21:30:39.392125Z" } }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
fuelTypeEncData
0Diesel1
1Diesel1
2Diesel1
3Petrol2
4Diesel1
\n", - "
" - ], - "text/plain": [ - " fuelType EncData\n", - "0 Diesel 1\n", - "1 Diesel 1\n", - "2 Diesel 1\n", - "3 Petrol 2\n", - "4 Diesel 1" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Pick a categorical column name\n", "col_name = \"fuelType\"\n", @@ -916,7 +529,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "superior-mobility", "metadata": { "execution": { @@ -926,15 +539,7 @@ "shell.execute_reply": "2022-02-03T21:30:39.396663Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'Unknown': 0, 'Diesel': 1, 'Petrol': 2, 'Hybrid': 3}\n" - ] - } - ], + "outputs": [], "source": [ "# Label Name -> Label Number\n", "print(predictor.encoders[col_name].label_dict)" @@ -952,6 +557,11 @@ } ], "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, "language_info": { "codemirror_mode": { "name": "ipython", @@ -967,4 +577,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} From ac3ee693b5e46e828490cd479a814b6d90d207e7 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Mon, 12 Jun 2023 20:08:41 -0400 Subject: [PATCH 35/38] fix shorttext encoder is_trainable --- lightwood/api/json_ai.py | 3 +-- lightwood/encoder/text/short.py | 2 ++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/lightwood/api/json_ai.py b/lightwood/api/json_ai.py index 58081d7b3..3a7072130 100644 --- a/lightwood/api/json_ai.py +++ b/lightwood/api/json_ai.py @@ -173,8 +173,7 @@ def lookup_encoder( if encoder_dict["module"] == "PretrainedLangEncoder" and not is_target: encoder_dict["args"]["output_type"] = "$dtype_dict[$target]" - enc_cls = eval(encoder_dict["module"]) - if enc_cls.is_trainable_encoder and hasattr(enc_cls, 'stop_after'): + if eval(encoder_dict["module"]).is_trainable_encoder: encoder_dict["args"]["stop_after"] = "$problem_definition.seconds_per_encoder" if is_target_predicting_encoder: diff --git a/lightwood/encoder/text/short.py b/lightwood/encoder/text/short.py index e4bb320c7..127bb863f 100644 --- a/lightwood/encoder/text/short.py +++ b/lightwood/encoder/text/short.py @@ -8,6 +8,8 @@ class ShortTextEncoder(BaseEncoder): + is_trainable_encoder = False + def __init__(self, is_target=False, mode=None, device=''): """ :param is_target: From 3babcd958e9e71ad4ca427da33f502e03bfbdcc2 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Mon, 12 Jun 2023 20:27:29 -0400 Subject: [PATCH 36/38] fix numerical encoder sign none handling --- lightwood/encoder/numeric/numeric.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lightwood/encoder/numeric/numeric.py b/lightwood/encoder/numeric/numeric.py index c62a4ba31..1a1cf8b25 100644 --- a/lightwood/encoder/numeric/numeric.py +++ b/lightwood/encoder/numeric/numeric.py @@ -57,15 +57,16 @@ def encode(self, data: Union[np.ndarray, pd.Series]): if isinstance(data, pd.Series): data = data.values - data = np.nan_to_num(data.astype(float), nan=0, posinf=20, neginf=-20) - if not self.positive_domain: - sign = np.vectorize(self._sign_fn, otypes=[float])(data) + sign_data = np.nan_to_num(data, nan=0, posinf=0, neginf=0) + sign = np.vectorize(self._sign_fn, otypes=[float])(sign_data) else: sign = np.zeros(len(data)) - log_value = np.vectorize(self._log_fn, otypes=[float])(data) + log_value = np.nan_to_num(log_value, nan=0, posinf=20, neginf=-20) + norm = np.vectorize(self._norm_fn, otypes=[float])(data) + 
norm = np.nan_to_num(norm, nan=0, posinf=20, neginf=-20) if self.is_target: components = [sign, log_value, norm] From 8c7b78fa04c0bdcbfc094cecb056ebccb198f237 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Mon, 12 Jun 2023 20:49:46 -0400 Subject: [PATCH 37/38] fix numerical encoder sign none handling --- lightwood/encoder/numeric/numeric.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightwood/encoder/numeric/numeric.py b/lightwood/encoder/numeric/numeric.py index 1a1cf8b25..9c040ca43 100644 --- a/lightwood/encoder/numeric/numeric.py +++ b/lightwood/encoder/numeric/numeric.py @@ -58,7 +58,7 @@ def encode(self, data: Union[np.ndarray, pd.Series]): data = data.values if not self.positive_domain: - sign_data = np.nan_to_num(data, nan=0, posinf=0, neginf=0) + sign_data = np.nan_to_num(data.astype(float), nan=0, posinf=0, neginf=0) sign = np.vectorize(self._sign_fn, otypes=[float])(sign_data) else: sign = np.zeros(len(data)) From 7eaf17ea7547e78c8c540c65cf8da32db59b359e Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Mon, 12 Jun 2023 21:14:19 -0400 Subject: [PATCH 38/38] fix numerical encoder sign none handling --- lightwood/encoder/numeric/numeric.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lightwood/encoder/numeric/numeric.py b/lightwood/encoder/numeric/numeric.py index 9c040ca43..a2b261e3b 100644 --- a/lightwood/encoder/numeric/numeric.py +++ b/lightwood/encoder/numeric/numeric.py @@ -57,15 +57,15 @@ def encode(self, data: Union[np.ndarray, pd.Series]): if isinstance(data, pd.Series): data = data.values + inp_data = np.nan_to_num(data.astype(float), nan=0, posinf=np.finfo(np.float32).max, neginf=np.finfo(np.float32).min) # noqa if not self.positive_domain: - sign_data = np.nan_to_num(data.astype(float), nan=0, posinf=0, neginf=0) - sign = np.vectorize(self._sign_fn, otypes=[float])(sign_data) + sign = np.vectorize(self._sign_fn, otypes=[float])(inp_data) else: sign = np.zeros(len(data)) - log_value = np.vectorize(self._log_fn, otypes=[float])(data) + log_value = np.vectorize(self._log_fn, otypes=[float])(inp_data) log_value = np.nan_to_num(log_value, nan=0, posinf=20, neginf=-20) - norm = np.vectorize(self._norm_fn, otypes=[float])(data) + norm = np.vectorize(self._norm_fn, otypes=[float])(inp_data) norm = np.nan_to_num(norm, nan=0, posinf=20, neginf=-20) if self.is_target:
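
A minimal, self-contained sketch of what PATCHES 36-38 leave `NumericEncoder.encode` doing. This is an illustrative approximation, not part of the patch series: `encode_sketch`, `_sign_fn`, `_log_fn` and `_norm_fn` are hypothetical stand-ins for the encoder's real transforms, used only to show how sanitizing the input once with `np.nan_to_num` keeps `None`/`NaN`/`inf` values from propagating into the sign, log and norm components.

import numpy as np
import pandas as pd

# Simplified stand-ins for illustration only; the real encoder fits its
# normalization on priming data.
def _sign_fn(x):
    return -1.0 if x < 0 else 1.0

def _log_fn(x):
    return float(np.log(abs(x) + 1))

def _norm_fn(x, mean=10.0):
    return float(x / mean)

def encode_sketch(data, positive_domain=False):
    if isinstance(data, pd.Series):
        data = data.values
    # As in PATCH 38: NaN -> 0, +/-inf clamped to float32 bounds before any transform runs.
    inp_data = np.nan_to_num(data.astype(float), nan=0,
                             posinf=np.finfo(np.float32).max,
                             neginf=np.finfo(np.float32).min)
    if not positive_domain:
        sign = np.vectorize(_sign_fn, otypes=[float])(inp_data)
    else:
        sign = np.zeros(len(inp_data))
    log_value = np.vectorize(_log_fn, otypes=[float])(inp_data)
    log_value = np.nan_to_num(log_value, nan=0, posinf=20, neginf=-20)
    norm = np.vectorize(_norm_fn, otypes=[float])(inp_data)
    norm = np.nan_to_num(norm, nan=0, posinf=20, neginf=-20)
    return np.stack([sign, log_value, norm], axis=1)

# A column containing None and inf now encodes to finite values instead of
# raising or emitting NaN components:
print(encode_sketch(pd.Series([3.0, None, np.inf, -7.5])))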