From 93dca030923f9a3c668d37801c126f74fef70b90 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Mon, 5 Jun 2023 22:45:14 -0700 Subject: [PATCH 01/38] less iterations for hyperparam search in random forest mixer --- lightwood/mixer/random_forest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightwood/mixer/random_forest.py b/lightwood/mixer/random_forest.py index 10df3c3f9..4d79fb9d1 100644 --- a/lightwood/mixer/random_forest.py +++ b/lightwood/mixer/random_forest.py @@ -57,7 +57,7 @@ def __init__( self.model = None self.positive_domain = False - self.num_trials = 20 + self.num_trials = 5 self.cv = 3 self.map = {} From 9baaf63f70834bbd1c76174cbddc5254805bd012 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Mon, 5 Jun 2023 23:11:36 -0700 Subject: [PATCH 02/38] rename filter_df, add featurize cache top level, slightly improve encoded_ds.get_item readability --- lightwood/analysis/analyze.py | 3 --- lightwood/api/json_ai.py | 17 ++++++++++++++--- lightwood/data/encoded_ds.py | 11 ++++++----- lightwood/helpers/ts.py | 3 +-- 4 files changed, 21 insertions(+), 13 deletions(-) diff --git a/lightwood/analysis/analyze.py b/lightwood/analysis/analyze.py index 408ce317c..87e21b9c9 100644 --- a/lightwood/analysis/analyze.py +++ b/lightwood/analysis/analyze.py @@ -3,7 +3,6 @@ from dataprep_ml import StatisticalAnalysis from lightwood.helpers.log import log -from lightwood.helpers.ts import filter_ds from type_infer.dtype import dtype from lightwood.ensemble import BaseEnsemble from lightwood.analysis.base import BaseAnalysisBlock @@ -60,8 +59,6 @@ def model_analyzer( normal_predictions = None if len(analysis_blocks) > 0: - filtered_df = filter_ds(encoded_val_data, tss) - encoded_val_data = EncodedDs(encoded_val_data.encoders, filtered_df, encoded_val_data.target) normal_predictions = predictor(encoded_val_data, args=PredictionArguments.from_dict(args)) normal_predictions = normal_predictions.set_index(encoded_val_data.data_frame.index) diff --git a/lightwood/api/json_ai.py b/lightwood/api/json_ai.py index 1f21007ed..00b511f04 100644 --- a/lightwood/api/json_ai.py +++ b/lightwood/api/json_ai.py @@ -997,7 +997,16 @@ def code_from_json_ai(json_ai: JsonAI) -> str: feature_body = f""" log.info('Featurizing the data') -feature_data = {{ key: EncodedDs(self.encoders, data, self.target) for key, data in split_data.items() if key != "stratified_on"}} +tss = self.problem_definition.timeseries_settings + +feature_data = dict() +for key, data in split_data.items(): + if key != 'stratified_on': + if key not in self.feature_cache: + featurized_split = EncodedDs(self.encoders, filter_ts(data, tss), self.target) + + self.feature_cache[key] = featurized_split + feature_data[key] = self.feature_cache[key] return feature_data @@ -1019,8 +1028,6 @@ def code_from_json_ai(json_ai: JsonAI) -> str: encoded_train_data = enc_data['train'] encoded_dev_data = enc_data['dev'] encoded_test_data = enc_data['test'] -filtered_df = filter_ds(encoded_test_data, self.problem_definition.timeseries_settings) -encoded_test_data = EncodedDs(encoded_test_data.encoders, filtered_df, encoded_test_data.target) log.info('Training the mixers') @@ -1174,6 +1181,7 @@ def code_from_json_ai(json_ai: JsonAI) -> str: enc_train_test["dev"]]).data_frame, adjust_args={'learn_call': True}) +self.feature_cache = dict() # empty feature cache to avoid large predictor objects """ learn_body = align(learn_body, 2) # ----------------- # @@ -1252,6 +1260,9 @@ def __init__(self): self.runtime_log = dict() 
self.global_insights = dict() + # Feature cache + self.feature_cache = dict() + @timed def analyze_data(self, data: pd.DataFrame) -> None: # Perform a statistical analysis on the unprocessed data diff --git a/lightwood/data/encoded_ds.py b/lightwood/data/encoded_ds.py index b7f90993f..44fb803c4 100644 --- a/lightwood/data/encoded_ds.py +++ b/lightwood/data/encoded_ds.py @@ -44,7 +44,7 @@ def __len__(self): def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]: """ The getter yields a tuple (X, y), where: - - `X `is a concatenation of all encoded representations of the row + - `X `is a concatenation of all encoded representations of the row. Size: (n_features,) - `y` is the encoded target :param idx: index of the row to access. @@ -56,7 +56,7 @@ def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]: if self.cache[idx] is not None: return self.cache[idx] - X = torch.FloatTensor() + X = [] Y = torch.FloatTensor() for col in self.data_frame: if self.encoders.get(col, None): @@ -72,16 +72,17 @@ def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]: cols = [col] data = self.data_frame[cols].iloc[idx].tolist() - encoded_tensor = self.encoders[col].encode(data, **kwargs)[0] + encoded_tensor = self.encoders[col].encode(data, **kwargs) if torch.isnan(encoded_tensor).any() or torch.isinf(encoded_tensor).any(): raise Exception(f'Encoded tensor: {encoded_tensor} contains nan or inf values, this tensor is \ the encoding of column {col} using {self.encoders[col].__class__}') if col != self.target: - X = torch.cat([X, encoded_tensor]) + X.append(encoded_tensor) else: - Y = encoded_tensor + Y = encoded_tensor.squeeze() if self.cache_encoded: + X = torch.cat(X, dim=1).float().squeeze() self.cache[idx] = (X, Y) return X, Y diff --git a/lightwood/helpers/ts.py b/lightwood/helpers/ts.py index 445492cf6..c1306157a 100644 --- a/lightwood/helpers/ts.py +++ b/lightwood/helpers/ts.py @@ -297,13 +297,12 @@ def min_k(top_k, data): return candidate_sps -def filter_ds(ds, tss, n_rows=1): +def filter_ts(df: pd.DataFrame, tss, n_rows=1): """ This method triggers only for timeseries datasets. It returns a dataframe that filters out all but the first ``n_rows`` per group. 
""" # noqa - df = ds.data_frame if tss.is_timeseries: gby = tss.group_by if gby is None: From 60d650b766777537716ba1baed86f0892f3e1d05 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Tue, 6 Jun 2023 18:43:55 -0700 Subject: [PATCH 03/38] early stopping + default frozen pretrained text enc --- lightwood/encoder/text/pretrained.py | 157 +++++++++++++++++---------- 1 file changed, 99 insertions(+), 58 deletions(-) diff --git a/lightwood/encoder/text/pretrained.py b/lightwood/encoder/text/pretrained.py index b9fcd1bae..e94819aab 100644 --- a/lightwood/encoder/text/pretrained.py +++ b/lightwood/encoder/text/pretrained.py @@ -1,15 +1,12 @@ -""" -""" +import os import time +from typing import Iterable +from collections import deque + +import numpy as np import torch from torch.utils.data import DataLoader -import os import pandas as pd -from lightwood.encoder.text.helpers.pretrained_helpers import TextEmbed -from lightwood.helpers.device import get_device_from_name -from lightwood.encoder.base import BaseEncoder -from lightwood.helpers.log import log -from lightwood.helpers.torch import LightwoodAutocast from type_infer.dtype import dtype from transformers import ( DistilBertModel, @@ -18,8 +15,14 @@ AdamW, get_linear_schedule_with_warmup, ) +from sklearn.model_selection import train_test_split + +from lightwood.encoder.text.helpers.pretrained_helpers import TextEmbed +from lightwood.helpers.device import get_device_from_name +from lightwood.encoder.base import BaseEncoder +from lightwood.helpers.log import log +from lightwood.helpers.torch import LightwoodAutocast from lightwood.helpers.general import is_none -from typing import Iterable class PretrainedLangEncoder(BaseEncoder): @@ -38,7 +41,7 @@ def __init__( is_target: bool = False, batch_size: int = 10, max_position_embeddings: int = None, - frozen: bool = False, + frozen: bool = True, epochs: int = 1, output_type: str = None, embed_mode: bool = True, @@ -48,7 +51,6 @@ def __init__( :param is_target: Whether this encoder represents the target. NOT functional for text generation yet. :param batch_size: size of batch while fine-tuning :param max_position_embeddings: max sequence length of input text - :param custom_train: If True, trains model on target procided :param frozen: If True, freezes transformer layers during training. :param epochs: number of epochs to train model with :param output_type: Data dtype of the target; if categorical/binary, the option to return logits is possible. @@ -64,12 +66,14 @@ def __init__( self._frozen = frozen self._batch_size = batch_size self._epochs = epochs + self._patience = 3 # measured in batches rather than epochs + self._val_loss_every = -1 # how many batches to wait before checking val loss. If -1, will check train loss instead of val for early stopping. # noqa + self._tr_loss_every = 2 # same as above, but only applies if `_val_loss_every` is set to -1 # Model setup self._model = None self.model_type = None - # TODO: Other LMs; Distilbert is a good balance of speed/performance self._classifier_model_class = DistilBertForSequenceClassification self._embeddings_model_class = DistilBertModel self._pretrained_model_name = "distilbert-base-uncased" @@ -90,46 +94,45 @@ def __init__( def prepare( self, - train_priming_data: Iterable[str], - dev_priming_data: Iterable[str], + train_priming_data: pd.Series, + dev_priming_data: pd.Series, encoded_target_values: torch.Tensor, ): """ Fine-tunes a transformer on the priming data. 
- CURRENTLY WIP; train + dev are placeholders for a validation-based approach. - - Train + Dev are concatenated together and a transformer is then fine tuned with weight-decay applied on the transformer parameters. The option to freeze the underlying transformer and only train a linear layer exists if `frozen=True`. This trains faster, with the exception that the performance is often lower than fine-tuning on internal benchmarks. + Transformer is fine-tuned with weight-decay on training split. + By default, underlying transformer is frozen and only final linear layer is trained. This trains faster, often as tradeoff for performance. :param train_priming_data: Text data in the train set - :param dev_priming_data: Text data in the dev set (not currently supported; can be empty) + :param dev_priming_data: Text data in the dev set :param encoded_target_values: Encoded target labels in Nrows x N_output_dimension """ # noqa if self.is_prepared: raise Exception("Encoder is already prepared.") os.environ['TOKENIZERS_PARALLELISM'] = 'true' + val_size = (len(dev_priming_data)) / len(train_priming_data) - # TODO -> we shouldn't be concatenating these together - if len(dev_priming_data) > 0: - priming_data = pd.concat([train_priming_data, dev_priming_data]).values - else: - priming_data = train_priming_data.tolist() + # remove empty strings (`None`s for dtype `object`) + priming_data = pd.concat([ + train_priming_data[~train_priming_data.isna()], + dev_priming_data[~dev_priming_data.isna()]] + ).tolist() - # Replaces empty strings with '' - priming_data = [x if x is not None else "" for x in priming_data] + # Label encode the OHE/binary output for classification + labels = encoded_target_values.argmax(dim=1) + + # Split into train and validation sets + train_texts, val_texts, train_labels, val_labels = train_test_split(priming_data, labels, test_size=val_size) # If classification, then fine-tune - if (self.output_type in (dtype.categorical, dtype.binary)): - log.info("Training model.") + if self.output_type in (dtype.categorical, dtype.binary): + log.info("Training model.\n\tOutput trained is categorical") # Prepare priming data into tokenized form + attention masks - text = self._tokenizer(priming_data, truncation=True, padding=True) - - log.info("\tOutput trained is categorical") - - # Label encode the OHE/binary output for classification - labels = encoded_target_values.argmax(dim=1) + training_text = self._tokenizer(train_texts, truncation=True, padding=True) + validation_text = self._tokenizer(val_texts, truncation=True, padding=True) # Construct the model self._model = self._classifier_model_class.from_pretrained( @@ -138,8 +141,12 @@ def prepare( ).to(self.device) # Construct the dataset for training - xinp = TextEmbed(text, labels) - dataset = DataLoader(xinp, batch_size=self._batch_size, shuffle=True) + xinp = TextEmbed(training_text, train_labels) + train_dataset = DataLoader(xinp, batch_size=self._batch_size, shuffle=True) + + # Construct the dataset for validation + xvalinp = TextEmbed(validation_text, val_labels) + val_dataset = DataLoader(xvalinp, batch_size=self._batch_size, shuffle=True) # Set max length of input string; affects input to the model if self._max_len is None: @@ -148,8 +155,7 @@ def prepare( if self._frozen: log.info("\tFrozen Model + Training Classifier Layers") """ - Freeze the base transformer model and train - a linear layer on top + Freeze the base transformer model and train a linear layer on top """ # Freeze all the transformer parameters for param in 
self._model.base_model.parameters(): @@ -189,12 +195,12 @@ def prepare( scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, # default value for GLUE - num_training_steps=len(dataset) * self._epochs, + num_training_steps=len(train_dataset) * self._epochs, ) # Train model; declare optimizer earlier if desired. self._tune_model( - dataset, optim=optimizer, scheduler=scheduler, n_epochs=self._epochs + train_dataset, val_dataset, optim=optimizer, scheduler=scheduler, n_epochs=self._epochs ) else: @@ -206,8 +212,7 @@ def prepare( ).to(self.device) # TODO: Not a great flag - # Currently, if the task is not classification, you must have - # an embedding generator only. + # Currently, if the task is not classification, you must have an embedding generator only if self.embed_mode is False: log.info("Embedding mode must be ON for non-classification targets.") self.embed_mode = True @@ -216,19 +221,15 @@ def prepare( encoded = self.encode(priming_data[0:1]) self.output_size = len(encoded[0]) - def _tune_model(self, dataset, optim, scheduler, n_epochs=1): + def _tune_model(self, train_dataset, val_dataset, optim, scheduler, n_epochs=1): """ - Given a model, train for n_epochs. - Specifically intended for tuning; it does NOT use loss/ - stopping criterion. - - model - torch.nn model; - dataset - torch.DataLoader; dataset to train - device - torch.device; cuda/cpu - log - lightwood.logger.log; log.info output + Given a model, tune for n_epochs. + + train_dataset - torch.DataLoader; dataset to train + val_dataset - torch.DataLoader; dataset used to compute validation loss + early stopping optim - transformers.optimization.AdamW; optimizer scheduler - scheduling params - n_epochs - number of epochs to train + n_epochs - max number of epochs to train for, provided there is no early stopping """ # noqa self._model.train() @@ -244,20 +245,21 @@ def _tune_model(self, dataset, optim, scheduler, n_epochs=1): else: log.info("Scheduler provided.") + best_tr_loss = best_val_loss = float("inf") + tr_loss_queue = deque(maxlen=self._patience) + patience_counter = self._patience + started = time.time() for epoch in range(n_epochs): total_loss = 0 - for batch in dataset: + for bidx, batch in enumerate(train_dataset): optim.zero_grad() with LightwoodAutocast(): - inpids = batch["input_ids"].to(self.device) - attn = batch["attention_mask"].to(self.device) - labels = batch["labels"].to(self.device) - outputs = self._model(inpids, attention_mask=attn, labels=labels) - loss = outputs[0] + loss = self._call(batch) + tr_loss_queue.append(loss.item()) total_loss += loss.item() loss.backward() @@ -267,9 +269,48 @@ def _tune_model(self, dataset, optim, scheduler, n_epochs=1): if time.time() - started > self.stop_after: break + # val-based early stopping + if (self._val_loss_every != -1) and (bidx % self._val_loss_every == 0): + self._model.eval() + val_loss = 0 + + for vbatch in val_dataset: + val_loss += self._call(vbatch).item() + + log.info(f"Epoch {epoch+1} train batch {bidx+1} - Validation loss: {val_loss/len(val_dataset)}") + if val_loss / len(val_dataset) >= best_val_loss: + break + + best_val_loss = val_loss / len(val_dataset) + self._model.train() + + # train-based early stopping + elif (bidx + 1) % self._tr_loss_every == 0: + self._model.eval() + + tr_loss = np.average(tr_loss_queue) + log.info(f"Epoch {epoch} train batch {bidx} - Train loss: {tr_loss}") # noqa + self._model.train() + + if tr_loss >= best_tr_loss and patience_counter == 0: + break + elif patience_counter > 0: + patience_counter 
-= 1 + elif tr_loss < best_tr_loss: + best_tr_loss = tr_loss + patience_counter = self._patience + if time.time() - started > self.stop_after: break - self._train_callback(epoch, total_loss / len(dataset)) + self._train_callback(epoch, total_loss / len(train_dataset)) + + def _call(self, batch): + inpids = batch["input_ids"].to(self.device) + attn = batch["attention_mask"].to(self.device) + labels = batch["labels"].to(self.device) + outputs = self._model(inpids, attention_mask=attn, labels=labels) + loss = outputs[0] + return loss def _train_callback(self, epoch, loss): log.info(f"{self.name} at epoch {epoch+1} and loss {loss}!") From 55320568049c7c00ef83e84ecf67e40fa3a5fd71 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Wed, 7 Jun 2023 03:37:52 +0000 Subject: [PATCH 04/38] progress --- lightwood/encoder/text/pretrained.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lightwood/encoder/text/pretrained.py b/lightwood/encoder/text/pretrained.py index e94819aab..65a2a6509 100644 --- a/lightwood/encoder/text/pretrained.py +++ b/lightwood/encoder/text/pretrained.py @@ -41,7 +41,7 @@ def __init__( is_target: bool = False, batch_size: int = 10, max_position_embeddings: int = None, - frozen: bool = True, + frozen: bool = False, epochs: int = 1, output_type: str = None, embed_mode: bool = True, @@ -67,7 +67,7 @@ def __init__( self._batch_size = batch_size self._epochs = epochs self._patience = 3 # measured in batches rather than epochs - self._val_loss_every = -1 # how many batches to wait before checking val loss. If -1, will check train loss instead of val for early stopping. # noqa + self._val_loss_every = 5 # how many batches to wait before checking val loss. If -1, will check train loss instead of val for early stopping. 
# noqa self._tr_loss_every = 2 # same as above, but only applies if `_val_loss_every` is set to -1 # Model setup @@ -270,7 +270,7 @@ def _tune_model(self, train_dataset, val_dataset, optim, scheduler, n_epochs=1): break # val-based early stopping - if (self._val_loss_every != -1) and (bidx % self._val_loss_every == 0): + if False and (self._val_loss_every != -1) and (bidx % self._val_loss_every == 0): self._model.eval() val_loss = 0 @@ -285,7 +285,7 @@ def _tune_model(self, train_dataset, val_dataset, optim, scheduler, n_epochs=1): self._model.train() # train-based early stopping - elif (bidx + 1) % self._tr_loss_every == 0: + elif False and (bidx + 1) % self._tr_loss_every == 0: self._model.eval() tr_loss = np.average(tr_loss_queue) From 07708ab19c0146b61c02042ed4fc86ca0d5757e1 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Wed, 7 Jun 2023 19:59:54 -0700 Subject: [PATCH 05/38] refactor num enc + tests; improve neural early stopping --- lightwood/data/encoded_ds.py | 4 +- lightwood/encoder/numeric/numeric.py | 127 +++++++----------- lightwood/encoder/numeric/ts_numeric.py | 6 - lightwood/mixer/neural.py | 29 ++-- .../encoder/numeric/test_numeric.py | 70 +++++----- 5 files changed, 103 insertions(+), 133 deletions(-) diff --git a/lightwood/data/encoded_ds.py b/lightwood/data/encoded_ds.py index 44fb803c4..b5bf5e7dd 100644 --- a/lightwood/data/encoded_ds.py +++ b/lightwood/data/encoded_ds.py @@ -67,10 +67,10 @@ def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]: if hasattr(self.encoders[col], 'data_window'): cols = [self.target] + [f'{self.target}_timestep_{i}' for i in range(1, self.encoders[col].data_window)] - data = [self.data_frame[cols].iloc[idx].tolist()] + data = [self.data_frame[cols].iloc[idx].values] else: cols = [col] - data = self.data_frame[cols].iloc[idx].tolist() + data = self.data_frame[cols].iloc[idx].values encoded_tensor = self.encoders[col].encode(data, **kwargs) if torch.isnan(encoded_tensor).any() or torch.isinf(encoded_tensor).any(): diff --git a/lightwood/encoder/numeric/numeric.py b/lightwood/encoder/numeric/numeric.py index 251fd1ae6..aa99f1921 100644 --- a/lightwood/encoder/numeric/numeric.py +++ b/lightwood/encoder/numeric/numeric.py @@ -1,12 +1,15 @@ import math -from typing import Iterable, List, Union +from typing import List, Union + import torch import numpy as np +import pandas as pd from torch.types import Number +from type_infer.dtype import dtype + from lightwood.encoder.base import BaseEncoder from lightwood.helpers.log import log from lightwood.helpers.general import is_none -from type_infer.dtype import dtype class NumericEncoder(BaseEncoder): @@ -28,13 +31,12 @@ def __init__(self, data_type: dtype = None, is_target: bool = False, positive_do :param positive_domain: Forces the encoder to always output positive values """ super().__init__(is_target) - self._type = data_type self._abs_mean = None self.positive_domain = positive_domain self.decode_log = False self.output_size = 4 if not self.is_target else 3 - def prepare(self, priming_data: Iterable): + def prepare(self, priming_data: pd.Series): """ "NumericalEncoder" uses a rule-based form to prepare results on training (priming) data. The averages etc. are taken from this distribution. 
@@ -43,55 +45,36 @@ def prepare(self, priming_data: Iterable): if self.is_prepared: raise Exception('You can only call "prepare" once for a given encoder.') - value_type = 'int' - for number in priming_data: - if not is_none(number): - if int(number) != number: - value_type = 'float' - - self._type = value_type if self._type is None else self._type - non_null_priming_data = [x for x in priming_data if not is_none(x)] - self._abs_mean = np.mean(np.abs(non_null_priming_data)) + self._abs_mean = priming_data.abs().mean() self.is_prepared = True - def encode(self, data: Iterable): + def encode(self, data: pd.Series): """ - :param data: An iterable data structure containing the numbers to be encoded - + :param data: A pandas series containing the numbers to be encoded :returns: A torch tensor with the representations of each number """ if not self.is_prepared: raise Exception('You need to call "prepare" before calling "encode" or "decode".') - ret = [] - for real in data: - try: - real = float(real) - except Exception: - real = None - if self.is_target: - # Will crash if ``real`` is not a float, this is fine, targets should always have a value - vector = [0] * 3 - vector[0] = 1 if real < 0 and not self.positive_domain else 0 - vector[1] = math.log(abs(real)) if abs(real) > 0 else -20 - vector[2] = real / self._abs_mean - - else: - vector = [0] * 4 - try: - if is_none(real): - vector[0] = 0 - else: - vector[0] = 1 - vector[1] = math.log(abs(real)) if abs(real) > 0 else -20 - vector[2] = 1 if real < 0 and not self.positive_domain else 0 - vector[3] = real / self._abs_mean - except Exception as e: - vector = [0] * 4 - log.error(f'Can\'t encode input value: {real}, exception: {e}') - - ret.append(vector) - + # todo: wrap with try/except to cover non-real edge cases + if not self.positive_domain: + sign = np.vectorize(lambda x: 0 if x < 0 else 1)(data) + else: + sign = np.zeros(len(data)) + log_value = np.vectorize(lambda x: math.log(abs(x)) if abs(x) > 0 else -20)(data) + log_value = np.nan_to_num(log_value, nan=0, posinf=20, neginf=-20) + + exp = np.vectorize(lambda x: x / self._abs_mean)(data) + exp = np.nan_to_num(exp, nan=0, posinf=20, neginf=-20) + + if self.is_target: + components = [sign, log_value, exp] + else: + # todo: if can't encode return 0s and log.error(f'Can\'t encode input value: {real}, exception: {e}') + nones = np.vectorize(lambda x: 1 if is_none(x) else 0)(data) + components = [sign, log_value, exp, nones] + + ret = torch.Tensor(np.array(components)).T return torch.Tensor(ret) def decode(self, encoded_values: Union[List[Number], torch.Tensor], decode_log: bool = None) -> list: @@ -112,40 +95,32 @@ def decode(self, encoded_values: Union[List[Number], torch.Tensor], decode_log: encoded_values = encoded_values.tolist() for vector in encoded_values: - if self.is_target: - if np.isnan( - vector[0]) or vector[0] == float('inf') or np.isnan( - vector[1]) or vector[1] == float('inf') or np.isnan( - vector[2]) or vector[2] == float('inf'): - log.error(f'Got weird target value to decode: {vector}') - real_value = pow(10, 63) - else: - if decode_log: - sign = -1 if vector[0] > 0.5 else 1 - try: - real_value = math.exp(vector[1]) * sign - except OverflowError: - real_value = pow(10, 63) * sign - else: - real_value = vector[2] * self._abs_mean - - if self.positive_domain: - real_value = abs(real_value) - - if self._type == 'int': - real_value = int(real_value) + # check for none + if len(vector) == 4 and vector[-1] == 1: + ret.append(None) + continue - else: - if vector[0] < 0.5: - 
ret.append(None) - continue + # edge case: divergence + elif np.isnan(vector[0]) or vector[0] == float('inf') or \ + np.isnan(vector[1]) or vector[1] == float('inf') or \ + np.isnan(vector[2]) or vector[2] == float('inf'): - real_value = vector[3] * self._abs_mean + log.error(f'Got weird target value to decode: {vector}') + real_value = pow(10, 63) + + elif decode_log: + sign = -1 if vector[0] < 0.5 else 1 + try: + real_value = math.exp(vector[1]) * sign + except OverflowError: + real_value = pow(10, 63) * sign + else: + real_value = vector[2] * self._abs_mean - if self._type == 'int': - real_value = round(real_value) + if self.positive_domain: + real_value = abs(real_value) - if isinstance(real_value, torch.Tensor): - real_value = real_value.item() + # if isinstance(real_value, torch.Tensor): + # real_value = real_value.item() ret.append(real_value) return ret diff --git a/lightwood/encoder/numeric/ts_numeric.py b/lightwood/encoder/numeric/ts_numeric.py index 06127c9a3..3203e355a 100644 --- a/lightwood/encoder/numeric/ts_numeric.py +++ b/lightwood/encoder/numeric/ts_numeric.py @@ -107,14 +107,8 @@ def decode(self, encoded_values, decode_log=None, dependency_data=None): if self.positive_domain: real_value = abs(real_value) - if self._type == 'int': - real_value = int(round(real_value, 0)) - else: real_value = vector[0] * self._abs_mean - if self._type == 'int': - real_value = round(real_value) - ret.append(real_value) return ret diff --git a/lightwood/mixer/neural.py b/lightwood/mixer/neural.py index 90040a3aa..ce4e7386d 100644 --- a/lightwood/mixer/neural.py +++ b/lightwood/mixer/neural.py @@ -1,5 +1,6 @@ import time from copy import deepcopy +from collections import deque from typing import Dict, List, Optional import torch @@ -42,7 +43,8 @@ def __init__( net: str, fit_on_dev: bool, search_hyperparameters: bool, - n_epochs: Optional[int] = None + n_epochs: Optional[int] = None, + lr: Optional[float] = None, ): """ The Neural mixer trains a fully connected dense network from concatenated encoded outputs of each of the features in the dataset to predicted the encoded output. @@ -55,6 +57,7 @@ def __init__( :param fit_on_dev: If we should fit on the dev dataset :param search_hyperparameters: If the network should run a more through hyperparameter search (currently disabled) :param n_epochs: amount of epochs that the network will be trained for. Supersedes all other early stopping criteria if specified. + :param lr: learning rate for the network. By default, it is automatically selected based on an initial search process. 
""" # noqa super().__init__(stop_after) self.dtype_dict = dtype_dict @@ -62,6 +65,8 @@ def __init__( self.target_encoder = target_encoder self.epochs_to_best = 0 self.n_epochs = n_epochs + self.lr = lr + self.loss_hist_len = 5 # length of queue to use for early stopping self.fit_on_dev = fit_on_dev self.net_name = net self.supports_proba = dtype_dict[target] in [dtype.binary, dtype.categorical] @@ -106,12 +111,12 @@ def _select_criterion(self) -> torch.nn.Module: return criterion - def _select_optimizer(self) -> Optimizer: - optimizer = ad_optim.Ranger(self.model.parameters(), lr=self.lr, weight_decay=2e-2) + def _select_optimizer(self, lr) -> Optimizer: + optimizer = ad_optim.Ranger(self.model.parameters(), lr=lr, weight_decay=2e-2) return optimizer def _find_lr(self, dl): - optimizer = self._select_optimizer() + optimizer = self._select_optimizer(lr=1e-3) # magic number for ranger optimizer, should be good starting point criterion = self._select_criterion() scaler = GradScaler() @@ -168,7 +173,7 @@ def _find_lr(self, dl): def _max_fit(self, train_dl, dev_dl, criterion, optimizer, scaler, stop_after, return_model_after): epochs_to_best = 0 best_dev_error = pow(2, 32) - running_errors = [] + running_errors = deque(maxlen=self.loss_hist_len) best_model = self.model for epoch in range(1, return_model_after + 1): @@ -215,10 +220,11 @@ def _max_fit(self, train_dl, dev_dl, criterion, optimizer, scaler, stop_after, r # automated early stopping else: - if len(running_errors) >= 5: - delta_mean = np.average([running_errors[-i - 1] - running_errors[-i] for i in range(1, 5)], - weights=[(1 / 2)**i for i in range(1, 5)]) - if delta_mean <= 0: + if len(running_errors) >= self.loss_hist_len: + delta_mean = np.average([ + running_errors[-i - 1] - running_errors[-i] for i in range(len(running_errors)-1)], + weights=[(1 / 2)**i for i in range(len(running_errors)-1)]) + if delta_mean >= 0: break elif (time.time() - self.started) > stop_after: break @@ -274,7 +280,6 @@ def _fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: dev_dl = DataLoader(dev_data, batch_size=self.batch_size, shuffle=False) train_dl = DataLoader(train_data, batch_size=self.batch_size, shuffle=False) - self.lr = 1e-4 self.num_hidden = 1 # Find learning rate @@ -284,7 +289,7 @@ def _fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: self.lr, self.model = self._find_lr(train_dl) # Keep on training - optimizer = self._select_optimizer() + optimizer = self._select_optimizer(lr=self.lr) criterion = self._select_criterion() scaler = GradScaler() @@ -314,7 +319,7 @@ def partial_fit(self, train_data: EncodedDs, dev_data: EncodedDs, args: Optional self.started = time.time() train_dl = DataLoader(train_data, batch_size=self.batch_size, shuffle=True) dev_dl = DataLoader(dev_data, batch_size=self.batch_size, shuffle=True) - optimizer = self._select_optimizer() + optimizer = self._select_optimizer(lr=self.lr) criterion = self._select_criterion() scaler = GradScaler() diff --git a/tests/unit_tests/encoder/numeric/test_numeric.py b/tests/unit_tests/encoder/numeric/test_numeric.py index 93590c071..57b8815d9 100644 --- a/tests/unit_tests/encoder/numeric/test_numeric.py +++ b/tests/unit_tests/encoder/numeric/test_numeric.py @@ -1,5 +1,6 @@ import unittest import numpy as np +import pandas as pd import torch from lightwood.encoder.numeric import NumericEncoder from lightwood.encoder.numeric import TsNumericEncoder @@ -16,31 +17,38 @@ def _pollute(array): class TestNumericEncoder(unittest.TestCase): def test_encode_and_decode(self): 
- data = [1, 1.1, 2, -8.6, None, 0] + data = pd.Series([1, 1.1, 2, -8.6, None, 0]) encoder = NumericEncoder() - encoder.prepare(data) encoded_vals = encoder.encode(data) - self.assertTrue(encoded_vals[1][1] > 0) - self.assertTrue(encoded_vals[2][1] > 0) - self.assertTrue(encoded_vals[3][1] > 0) - for i in range(0, 3): - self.assertTrue(encoded_vals[i][2] == 0) - self.assertTrue(encoded_vals[3][2] == 1) - self.assertTrue(encoded_vals[4][3] == 0) + # sign component check + self.assertTrue(encoded_vals[0][0] > 0) + self.assertTrue(encoded_vals[1][0] > 0) + self.assertTrue(encoded_vals[2][0] > 0) + self.assertTrue(encoded_vals[3][0] == 0) - decoded_vals = encoder.decode(encoded_vals) + # none component check + for i in range(0, len(encoded_vals)): + if i != 4: + self.assertTrue(encoded_vals[i][-1] == 0) + else: + self.assertTrue(encoded_vals[i][-1] == 1) - for i in range(len(encoded_vals)): - if decoded_vals[i] is None: - self.assertTrue(decoded_vals[i] == data[i]) + # exp component nan edge case check + self.assertTrue(encoded_vals[4][2] == 0) + + # compare decoded v/s real + decoded_vals = encoder.decode(encoded_vals) + for decoded, real in zip(decoded_vals, data.tolist()): + if decoded is None: + self.assertTrue((real is None) or (real != real)) else: - np.testing.assert_almost_equal(round(decoded_vals[i], 10), round(data[i], 10)) + np.testing.assert_almost_equal(round(decoded, 10), round(real, 10)) def test_positive_domain(self): - data = [-1, -2, -100, 5, 10, 15] + data = pd.Series([-1, -2, -100, 5, 10, 15]) for encoder in [NumericEncoder(), TsNumericEncoder()]: encoder.is_target = True # only affects target values encoder.positive_domain = True @@ -51,7 +59,7 @@ def test_positive_domain(self): self.assertTrue(val >= 0) def test_log_overflow_and_none(self): - data = list(range(-2000, 2000, 66)) + data = pd.Series(list(range(-2000, 2000, 66))) encoder = NumericEncoder() encoder.is_target = True @@ -72,10 +80,10 @@ def test_nan_encoding(self): # Prepare with the correct data and decode invalid data encoder = NumericEncoder() - encoder.prepare(data) + encoder.prepare(pd.Series(data)) for array in invalid_data: # Make sure the encoding has no nans or infs - encoded_repr = encoder.encode(array) + encoded_repr = encoder.encode(pd.Series(array)) assert not torch.isnan(encoded_repr).any() assert not torch.isinf(encoded_repr).any() @@ -88,29 +96,17 @@ def test_nan_encoding(self): # Prepare with the invalid data and decode the valid data for array in invalid_data: encoder = NumericEncoder() - encoder.prepare(array) + encoder.prepare(pd.Series(array)) # Make sure the encoding has no nans or infs - encoded_repr = encoder.encode(data) + encoded_repr = encoder.encode(pd.Series(array)) assert not torch.isnan(encoded_repr).any() assert not torch.isinf(encoded_repr).any() # Make sure the invalid value is decoded as `None` and the rest as numbers decoded_repr = encoder.decode(encoded_repr) - for x in decoded_repr: - assert not is_none(x) - - # Prepare with the invalid data and decode invalid data - for array in invalid_data: - encoder = NumericEncoder() - encoder.prepare(array) - # Make sure the encoding has no nans or infs - encoded_repr = encoder.encode(array) - assert not torch.isnan(encoded_repr).any() - assert not torch.isinf(encoded_repr).any() - - # Make sure the invalid value is decoded as `None` and the rest as numbers - decoded_repr = encoder.decode(encoded_repr) - for x in decoded_repr[:-1]: - assert not is_none(x) - assert decoded_repr[-1] is None + for dec, real in zip(decoded_repr, array): + 
if is_none(real): + assert is_none(dec) + else: + assert not is_none(x) or x != 0.0 From b6551149e0871dfa1043d081409179184a9ca40a Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Wed, 7 Jun 2023 21:29:34 -0700 Subject: [PATCH 06/38] better lr search --- lightwood/mixer/neural.py | 65 ++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/lightwood/mixer/neural.py b/lightwood/mixer/neural.py index ce4e7386d..d6612b54c 100644 --- a/lightwood/mixer/neural.py +++ b/lightwood/mixer/neural.py @@ -63,10 +63,11 @@ def __init__( self.dtype_dict = dtype_dict self.target = target self.target_encoder = target_encoder + self.num_hidden = 1 self.epochs_to_best = 0 self.n_epochs = n_epochs self.lr = lr - self.loss_hist_len = 5 # length of queue to use for early stopping + self.loss_hist_len = 7 # length of queue to use for early stopping self.fit_on_dev = fit_on_dev self.net_name = net self.supports_proba = dtype_dict[target] in [dtype.binary, dtype.categorical] @@ -116,27 +117,31 @@ def _select_optimizer(self, lr) -> Optimizer: return optimizer def _find_lr(self, dl): - optimizer = self._select_optimizer(lr=1e-3) # magic number for ranger optimizer, should be good starting point + lr = 1e-5 # good starting point as search escalates + lrs = deque([5e-5, 1e-4, 5e-4, 1e-3, 2e-3, 3e-3, 5e-3, 1e-2, 5e-2, 1e-1]) + starting_model = deepcopy(self.model) criterion = self._select_criterion() scaler = GradScaler() - running_losses: List[float] = [] - cum_loss = 0 - lr_log = [] + running_losses = deque(maxlen=self.loss_hist_len) + lr_log = deque(maxlen=self.loss_hist_len) best_model = self.model stop = False - batches = 0 - for epoch in range(1, 101): - if stop: - break - for i, (X, Y) in enumerate(dl): - if stop: - break + _, test_batch = next(enumerate(dl)) + X, Y = test_batch + n_steps = 10 + cum_loss = 0 + + while stop is False: + # overfit learning on first sample (yes, biased, but we only really want an intuition on what LR is decent) + optimizer = self._select_optimizer(lr=lr) + self.model = starting_model - batches += len(X) + for i in range(n_steps): X = X.to(self.model.device) Y = Y.to(self.model.device) + with LightwoodAutocast(): optimizer.zero_grad() Yh = self._net_call(X) @@ -150,20 +155,16 @@ def _find_lr(self, dl): optimizer.step() cum_loss += loss.item() - # Account for ranger lookahead update - if (i + 1) * epoch % 6: - batches = 0 - lr = optimizer.param_groups[0]['lr'] - log.info(f'Loss of {cum_loss} with learning rate {lr}') - running_losses.append(cum_loss) - lr_log.append(lr) - cum_loss = 0 - if len(running_losses) < 2 or np.mean(running_losses[:-1]) > np.mean(running_losses): - optimizer.param_groups[0]['lr'] = lr * 1.4 - # Time saving since we don't have to start training fresh - best_model = deepcopy(self.model) - else: - stop = True + log.info(f'Loss of {cum_loss} with learning rate {lr}') + running_losses.append(cum_loss) + lr_log.append(lr) + cum_loss = 0 + + if len(running_losses) < 2 or np.mean(list(running_losses)[:-1]) > np.mean(running_losses) and len(lrs) > 0: + lr = lrs.popleft() + best_model = deepcopy(self.model) # store model for slight time savings + else: + stop = True best_loss_lr = lr_log[np.argmin(running_losses)] lr = best_loss_lr @@ -280,13 +281,13 @@ def _fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: dev_dl = DataLoader(dev_data, batch_size=self.batch_size, shuffle=False) train_dl = DataLoader(train_data, batch_size=self.batch_size, shuffle=False) - self.num_hidden = 1 - - # Find learning rate - 
# keep the weights + # Find learning rate & keep initial weights self._init_net(train_data) if not self.lr: - self.lr, self.model = self._find_lr(train_dl) + sample_dl = DataLoader(train_data, + batch_size=min(len(train_data.data_frame), 32, self.batch_size), + shuffle=True) + self.lr, self.model = self._find_lr(sample_dl) # Keep on training optimizer = self._select_optimizer(lr=self.lr) From 3578cbd86c6f1b6d32ded744114cefd401696d4a Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Wed, 7 Jun 2023 21:50:50 -0700 Subject: [PATCH 07/38] move EncodedDS cache building to offline within init(). Preliminary results suggest a 10x runtime improvement across the board, with no accuracy loss --- lightwood/data/encoded_ds.py | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/lightwood/data/encoded_ds.py b/lightwood/data/encoded_ds.py index b5bf5e7dd..9b654df6f 100644 --- a/lightwood/data/encoded_ds.py +++ b/lightwood/data/encoded_ds.py @@ -33,6 +33,9 @@ def __init__(self, encoders: List[BaseEncoder], data_frame: pd.DataFrame, target self.input_length + self.encoders[col].output_size) self.input_length += self.encoders[col].output_size + # if cache enabled, we immediately build it + self.build_cache() # TODO: ensure we remove these instances from predictor object before serializing + def __len__(self): """ The length of an `EncodedDs` datasource equals the amount of rows of the original dataframe. @@ -56,21 +59,28 @@ def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]: if self.cache[idx] is not None: return self.cache[idx] + X, Y = self._encode_idxs(idx) + + if self.cache_encoded: + X = torch.cat(X, dim=1).float().squeeze() + self.cache[idx] = (X, Y) + + def _encode_idxs(self, idxs): X = [] Y = torch.FloatTensor() for col in self.data_frame: if self.encoders.get(col, None): kwargs = {} if 'dependency_data' in inspect.signature(self.encoders[col].encode).parameters: - kwargs['dependency_data'] = {dep: [self.data_frame.iloc[idx][dep]] + kwargs['dependency_data'] = {dep: [self.data_frame.iloc[idxs][dep]] for dep in self.encoders[col].dependencies} if hasattr(self.encoders[col], 'data_window'): cols = [self.target] + [f'{self.target}_timestep_{i}' for i in range(1, self.encoders[col].data_window)] - data = [self.data_frame[cols].iloc[idx].values] + data = [self.data_frame[cols].iloc[idxs].values] # TODO: this is likely to fail as is else: cols = [col] - data = self.data_frame[cols].iloc[idx].values + data = self.data_frame[cols].iloc[idxs].values.flatten() encoded_tensor = self.encoders[col].encode(data, **kwargs) if torch.isnan(encoded_tensor).any() or torch.isinf(encoded_tensor).any(): @@ -81,12 +91,18 @@ def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]: else: Y = encoded_tensor.squeeze() - if self.cache_encoded: - X = torch.cat(X, dim=1).float().squeeze() - self.cache[idx] = (X, Y) - + # concatenate features into single tensor + X = torch.concat(X, dim=1) return X, Y + def build_cache(self): + assert self.cache_encoded + idxs = list(range(len(self.data_frame))) + X, Y = self._encode_idxs(idxs) + + for i, (x, y) in enumerate(zip(X, Y)): + self.cache[i] = (x, y) + def get_column_original_data(self, column_name: str) -> pd.Series: """ Gets the original data for any given column of the `EncodedDs`. 
From 776cbdd55f9848fd1991a7f9ba0d282453c208b9 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Fri, 9 Jun 2023 12:26:43 -0400 Subject: [PATCH 08/38] fix indentation bug --- lightwood/api/json_ai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightwood/api/json_ai.py b/lightwood/api/json_ai.py index 00b511f04..0f22604ed 100644 --- a/lightwood/api/json_ai.py +++ b/lightwood/api/json_ai.py @@ -1004,8 +1004,8 @@ def code_from_json_ai(json_ai: JsonAI) -> str: if key != 'stratified_on': if key not in self.feature_cache: featurized_split = EncodedDs(self.encoders, filter_ts(data, tss), self.target) + self.feature_cache[key] = featurized_split - self.feature_cache[key] = featurized_split feature_data[key] = self.feature_cache[key] return feature_data From 5ab843643a61d88eca482fbad8ed8479a60fdad4 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Fri, 9 Jun 2023 12:28:38 -0400 Subject: [PATCH 09/38] fix indentation bug in json_ai --- lightwood/api/json_ai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightwood/api/json_ai.py b/lightwood/api/json_ai.py index 00b511f04..0f22604ed 100644 --- a/lightwood/api/json_ai.py +++ b/lightwood/api/json_ai.py @@ -1004,8 +1004,8 @@ def code_from_json_ai(json_ai: JsonAI) -> str: if key != 'stratified_on': if key not in self.feature_cache: featurized_split = EncodedDs(self.encoders, filter_ts(data, tss), self.target) + self.feature_cache[key] = featurized_split - self.feature_cache[key] = featurized_split feature_data[key] = self.feature_cache[key] return feature_data From f08bd9f87a4e469955d0814c57992f8cd320342a Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Fri, 9 Jun 2023 12:37:20 -0400 Subject: [PATCH 10/38] lint: flake8 --- lightwood/mixer/neural.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lightwood/mixer/neural.py b/lightwood/mixer/neural.py index d6612b54c..57ce00bfc 100644 --- a/lightwood/mixer/neural.py +++ b/lightwood/mixer/neural.py @@ -223,8 +223,8 @@ def _max_fit(self, train_dl, dev_dl, criterion, optimizer, scaler, stop_after, r else: if len(running_errors) >= self.loss_hist_len: delta_mean = np.average([ - running_errors[-i - 1] - running_errors[-i] for i in range(len(running_errors)-1)], - weights=[(1 / 2)**i for i in range(len(running_errors)-1)]) + running_errors[-i - 1] - running_errors[-i] for i in range(len(running_errors) - 1)], + weights=[(1 / 2)**i for i in range(len(running_errors) - 1)]) if delta_mean >= 0: break elif (time.time() - self.started) > stop_after: From 835353c7c4a93710ea89261579f89f5363732c64 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Fri, 9 Jun 2023 20:01:58 -0400 Subject: [PATCH 11/38] fix jsonai: reset feature_cache after predict calls --- lightwood/api/json_ai.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/lightwood/api/json_ai.py b/lightwood/api/json_ai.py index 0f22604ed..ba521243c 100644 --- a/lightwood/api/json_ai.py +++ b/lightwood/api/json_ai.py @@ -1216,13 +1216,14 @@ def code_from_json_ai(json_ai: JsonAI) -> str: log.info(f'[Predict phase 3/{{n_phases}}] - Calling ensemble') df = self.ensemble(encoded_ds, args=self.pred_args) -if self.pred_args.all_mixers: - return df -else: +if not self.pred_args.all_mixers: log.info(f'[Predict phase 4/{{n_phases}}] - Analyzing output') - insights, global_insights = {call(json_ai.explainer)} + df, global_insights = {call(json_ai.explainer)} self.global_insights = {{**self.global_insights, **global_insights}} - return insights 
+ +self.feature_cache = dict() # empty feature cache to avoid large predictor objects + +return df """ predict_body = align(predict_body, 2) From abba8bd3b978d9445e6c1911edeafbbe25c7a287 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Fri, 9 Jun 2023 20:23:19 -0400 Subject: [PATCH 12/38] fix edge case empty dev split --- lightwood/encoder/text/pretrained.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/lightwood/encoder/text/pretrained.py b/lightwood/encoder/text/pretrained.py index 65a2a6509..a74c0b1e5 100644 --- a/lightwood/encoder/text/pretrained.py +++ b/lightwood/encoder/text/pretrained.py @@ -112,13 +112,17 @@ def prepare( raise Exception("Encoder is already prepared.") os.environ['TOKENIZERS_PARALLELISM'] = 'true' - val_size = (len(dev_priming_data)) / len(train_priming_data) # remove empty strings (`None`s for dtype `object`) - priming_data = pd.concat([ - train_priming_data[~train_priming_data.isna()], - dev_priming_data[~dev_priming_data.isna()]] - ).tolist() + filtered_tr = train_priming_data[~train_priming_data.isna()] + filtered_dev = dev_priming_data[~dev_priming_data.isna()] + + if filtered_dev.shape[0] > 0: + priming_data = pd.concat([filtered_tr, filtered_dev]).tolist() + val_size = (len(dev_priming_data)) / len(train_priming_data) + else: + priming_data = filtered_tr.tolist() + val_size = 0.1 # leave out 0.1 for validation # Label encode the OHE/binary output for classification labels = encoded_target_values.argmax(dim=1) From 542dc7855a22c2cdb00f989b3383ab8abfe9f88c Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Fri, 9 Jun 2023 21:49:13 -0400 Subject: [PATCH 13/38] fix: create ts filtered splits, use ravel() --- lightwood/api/json_ai.py | 14 ++++++++++---- lightwood/data/encoded_ds.py | 2 +- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/lightwood/api/json_ai.py b/lightwood/api/json_ai.py index ba521243c..2240a016b 100644 --- a/lightwood/api/json_ai.py +++ b/lightwood/api/json_ai.py @@ -1002,11 +1002,17 @@ def code_from_json_ai(json_ai: JsonAI) -> str: feature_data = dict() for key, data in split_data.items(): if key != 'stratified_on': + + # compute and store two splits - full and filtered (useful for time series post-train analysis) if key not in self.feature_cache: - featurized_split = EncodedDs(self.encoders, filter_ts(data, tss), self.target) - self.feature_cache[key] = featurized_split + featurized_split = EncodedDs(self.encoders, data, self.target) + filtered_subset = EncodedDs(self.encoders, filter_ts(data, tss), self.target) + + for k, s in zip((key, f'{{key}}_filtered'), (featurized_split, filtered_subset)): + self.feature_cache[k] = s - feature_data[key] = self.feature_cache[key] + for k in (key, f'{{key}}_filtered'): + feature_data[k] = self.feature_cache[k] return feature_data @@ -1027,7 +1033,7 @@ def code_from_json_ai(json_ai: JsonAI) -> str: # Extract the featurized data into train/dev/test encoded_train_data = enc_data['train'] encoded_dev_data = enc_data['dev'] -encoded_test_data = enc_data['test'] +encoded_test_data = enc_data['test_filtered'] log.info('Training the mixers') diff --git a/lightwood/data/encoded_ds.py b/lightwood/data/encoded_ds.py index 44fb803c4..196c7f0e9 100644 --- a/lightwood/data/encoded_ds.py +++ b/lightwood/data/encoded_ds.py @@ -79,7 +79,7 @@ def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]: if col != self.target: X.append(encoded_tensor) else: - Y = encoded_tensor.squeeze() + Y = encoded_tensor.ravel() if self.cache_encoded: X = 
torch.cat(X, dim=1).float().squeeze() From 47237b7506a963d9411f18349954cfc8244f6deb Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Fri, 9 Jun 2023 22:23:18 -0400 Subject: [PATCH 14/38] fix comment --- lightwood/encoder/text/pretrained.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lightwood/encoder/text/pretrained.py b/lightwood/encoder/text/pretrained.py index a74c0b1e5..4532192bb 100644 --- a/lightwood/encoder/text/pretrained.py +++ b/lightwood/encoder/text/pretrained.py @@ -102,8 +102,9 @@ def prepare( Fine-tunes a transformer on the priming data. Transformer is fine-tuned with weight-decay on training split. - By default, underlying transformer is frozen and only final linear layer is trained. This trains faster, often as tradeoff for performance. - + + Train + Dev are concatenated together and a transformer is then fine tuned with weight-decay applied on the transformer parameters. The option to freeze the underlying transformer and only train a linear layer exists if `frozen=True`. This trains faster, with the exception that the performance is often lower than fine-tuning on internal benchmarks. + :param train_priming_data: Text data in the train set :param dev_priming_data: Text data in the dev set :param encoded_target_values: Encoded target labels in Nrows x N_output_dimension From 5be101b770c01a31d931b407095ee4c76186b5a7 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Sat, 10 Jun 2023 00:37:59 -0400 Subject: [PATCH 15/38] partially address feedback --- lightwood/encoder/numeric/numeric.py | 2 +- lightwood/mixer/neural.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lightwood/encoder/numeric/numeric.py b/lightwood/encoder/numeric/numeric.py index aa99f1921..a040d0581 100644 --- a/lightwood/encoder/numeric/numeric.py +++ b/lightwood/encoder/numeric/numeric.py @@ -74,7 +74,7 @@ def encode(self, data: pd.Series): nones = np.vectorize(lambda x: 1 if is_none(x) else 0)(data) components = [sign, log_value, exp, nones] - ret = torch.Tensor(np.array(components)).T + ret = torch.Tensor(np.asarray(components)).T return torch.Tensor(ret) def decode(self, encoded_values: Union[List[Number], torch.Tensor], decode_log: bool = None) -> list: diff --git a/lightwood/mixer/neural.py b/lightwood/mixer/neural.py index 57ce00bfc..4a66c604f 100644 --- a/lightwood/mixer/neural.py +++ b/lightwood/mixer/neural.py @@ -128,8 +128,8 @@ def _find_lr(self, dl): best_model = self.model stop = False - _, test_batch = next(enumerate(dl)) - X, Y = test_batch + dl_iter = iter(dl) + X, Y = next(dl_iter) n_steps = 10 cum_loss = 0 From 57bf20c7ceb186c7214aca80e185dc90a6598c79 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Sat, 10 Jun 2023 01:01:45 -0400 Subject: [PATCH 16/38] vectorized operations as functions, type hints --- lightwood/encoder/numeric/numeric.py | 39 +++++++++++++++++++++------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/lightwood/encoder/numeric/numeric.py b/lightwood/encoder/numeric/numeric.py index a040d0581..815e08794 100644 --- a/lightwood/encoder/numeric/numeric.py +++ b/lightwood/encoder/numeric/numeric.py @@ -48,35 +48,54 @@ def prepare(self, priming_data: pd.Series): self._abs_mean = priming_data.abs().mean() self.is_prepared = True - def encode(self, data: pd.Series): + def encode(self, data: Union[np.ndarray, pd.Series]): """ - :param data: A pandas series containing the numbers to be encoded + :param data: A pandas series or numpy array containing the numbers to be encoded 
:returns: A torch tensor with the representations of each number """ if not self.is_prepared: raise Exception('You need to call "prepare" before calling "encode" or "decode".') - # todo: wrap with try/except to cover non-real edge cases + if isinstance(data, pd.Series): + data = data.values + + data = np.nan_to_num(data, nan=0).astype(float) + if not self.positive_domain: - sign = np.vectorize(lambda x: 0 if x < 0 else 1)(data) + sign = np.vectorize(self._sign_fn, otypes=[float])(data) else: sign = np.zeros(len(data)) - log_value = np.vectorize(lambda x: math.log(abs(x)) if abs(x) > 0 else -20)(data) + log_value = np.vectorize(self._log_fn, otypes=[float])(data) log_value = np.nan_to_num(log_value, nan=0, posinf=20, neginf=-20) - exp = np.vectorize(lambda x: x / self._abs_mean)(data) - exp = np.nan_to_num(exp, nan=0, posinf=20, neginf=-20) + norm = np.vectorize(self._norm_fn, otypes=[float])(data) + norm = np.nan_to_num(norm, nan=0, posinf=20, neginf=-20) if self.is_target: - components = [sign, log_value, exp] + components = [sign, log_value, norm] else: # todo: if can't encode return 0s and log.error(f'Can\'t encode input value: {real}, exception: {e}') - nones = np.vectorize(lambda x: 1 if is_none(x) else 0)(data) - components = [sign, log_value, exp, nones] + nones = np.vectorize(self._none_fn, otypes=[float])(data) + components = [sign, log_value, norm, nones] ret = torch.Tensor(np.asarray(components)).T return torch.Tensor(ret) + @staticmethod + def _sign_fn(x: float) -> float: + return 0 if x < 0 else 1 + + @staticmethod + def _log_fn(x: float) -> float: + return math.log(abs(x)) if abs(x) > 0 else -20 + + def _norm_fn(self, x: float) -> float: + return x / self._abs_mean + + @staticmethod + def _none_fn(x: float) -> float: + return 1 if is_none(x) else 0 + def decode(self, encoded_values: Union[List[Number], torch.Tensor], decode_log: bool = None) -> list: """ :param encoded_values: The encoded values to decode into single numbers From 73737b5a45b9f086c1d270ffc9015c10e90729d8 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Sat, 10 Jun 2023 01:47:31 -0400 Subject: [PATCH 17/38] address feedback by vectorizing numerical_encoder.decode method --- lightwood/encoder/numeric/numeric.py | 74 +++++++++++++--------------- lightwood/mixer/regression.py | 5 +- 2 files changed, 39 insertions(+), 40 deletions(-) diff --git a/lightwood/encoder/numeric/numeric.py b/lightwood/encoder/numeric/numeric.py index 815e08794..52f12d0c1 100644 --- a/lightwood/encoder/numeric/numeric.py +++ b/lightwood/encoder/numeric/numeric.py @@ -1,14 +1,12 @@ import math -from typing import List, Union +from typing import Union import torch import numpy as np import pandas as pd -from torch.types import Number from type_infer.dtype import dtype from lightwood.encoder.base import BaseEncoder -from lightwood.helpers.log import log from lightwood.helpers.general import is_none @@ -96,50 +94,48 @@ def _norm_fn(self, x: float) -> float: def _none_fn(x: float) -> float: return 1 if is_none(x) else 0 - def decode(self, encoded_values: Union[List[Number], torch.Tensor], decode_log: bool = None) -> list: + def decode(self, encoded_values: torch.Tensor, decode_log: bool = None) -> list: """ :param encoded_values: The encoded values to decode into single numbers :param decode_log: Whether to decode the ``log`` or ``linear`` part of the representation, since the encoded vector contains both a log and a linear part - :returns: The decoded number + :returns: The decoded array """ # noqa + if not self.is_prepared: raise 
Exception('You need to call "prepare" before calling "encode" or "decode".') if decode_log is None: decode_log = self.decode_log - ret = [] - if isinstance(encoded_values, torch.Tensor): - encoded_values = encoded_values.tolist() - - for vector in encoded_values: - # check for none - if len(vector) == 4 and vector[-1] == 1: - ret.append(None) - continue - - # edge case: divergence - elif np.isnan(vector[0]) or vector[0] == float('inf') or \ - np.isnan(vector[1]) or vector[1] == float('inf') or \ - np.isnan(vector[2]) or vector[2] == float('inf'): - - log.error(f'Got weird target value to decode: {vector}') - real_value = pow(10, 63) - - elif decode_log: - sign = -1 if vector[0] < 0.5 else 1 - try: - real_value = math.exp(vector[1]) * sign - except OverflowError: - real_value = pow(10, 63) * sign - else: - real_value = vector[2] * self._abs_mean - - if self.positive_domain: - real_value = abs(real_value) - - # if isinstance(real_value, torch.Tensor): - # real_value = real_value.item() - ret.append(real_value) - return ret + # force = True prevents side effects on the original encoded_values + ev = encoded_values.numpy(force=True) + + # set "divergent" value as default (note: finfo.max() instead of pow(10, 63)) + ret = np.full((ev.shape[0],), dtype=float, fill_value=np.finfo(np.float64).max) + + # sign component + sign = np.ones(ev.shape[0], dtype=float) + mask_sign = ev[:, 0] < 0.5 + sign[mask_sign] = -1 + + # real component + if decode_log: + real_value = np.exp(ev[:, 1]) * sign + overflow_mask = ev[:, 1] >= 63 + real_value[overflow_mask] = 10 ** 63 + valid_mask = ~overflow_mask + else: + real_value = ev[:, 2] * self._abs_mean + valid_mask = np.ones_like(real_value, dtype=bool) + + # final filters + if self.positive_domain: + real_value = abs(real_value) + + ret[valid_mask] = real_value[valid_mask] + + nan_mask = ret[:, ] == np.nan + ret[nan_mask] = None + + return ret.tolist() # TODO: update signature on BaseEncoder and replace all encs to return ndarrays diff --git a/lightwood/mixer/regression.py b/lightwood/mixer/regression.py index 99c2a9905..fc63183cc 100644 --- a/lightwood/mixer/regression.py +++ b/lightwood/mixer/regression.py @@ -89,7 +89,10 @@ def __call__(self, ds: EncodedDs, """ # noqa X = [] for x, _ in ds: - X.append(x.tolist()) + entry = x.numpy() + if len(entry.shape) > 1: + entry = entry[0] + X.append(entry) Yh = self.model.predict(X) From 2a82d61c4a634a88bc1682f6a573eb253794eed3 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Sat, 10 Jun 2023 01:47:52 -0400 Subject: [PATCH 18/38] partially address feedback for encodedDs class --- lightwood/data/encoded_ds.py | 47 +++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/lightwood/data/encoded_ds.py b/lightwood/data/encoded_ds.py index 396b6f4cd..7af492017 100644 --- a/lightwood/data/encoded_ds.py +++ b/lightwood/data/encoded_ds.py @@ -21,10 +21,10 @@ def __init__(self, encoders: List[BaseEncoder], data_frame: pd.DataFrame, target self.data_frame = data_frame self.encoders = encoders self.target = target - self.cache_encoded = True + self.use_cache = True self.cache = [None] * len(self.data_frame) self.encoder_spans = {} - self.input_length = 0 + self.input_length = 0 # feature tensor dim # save encoder span, has to use same iterator as in __getitem__ for correct indeces for col in self.data_frame: @@ -34,7 +34,7 @@ def __init__(self, encoders: List[BaseEncoder], data_frame: pd.DataFrame, target self.input_length += self.encoders[col].output_size # if cache enabled, we 
immediately build it - self.build_cache() # TODO: ensure we remove these instances from predictor object before serializing + self.build_cache() def __len__(self): """ @@ -55,18 +55,19 @@ def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]: :return: tuple (X, y) with encoded data. """ # noqa - if self.cache_encoded: - if self.cache[idx] is not None: - return self.cache[idx] - - X, Y = self._encode_idxs(idx) + if self.use_cache and self.cache[idx] is not None: + X, Y = self.cache[idx] + else: + X, Y = self._encode_idxs([idx, ]) + if self.use_cache: + self.cache[idx] = [X, Y] + return X, Y - if self.cache_encoded: - X = torch.cat(X, dim=1).float().squeeze() - self.cache[idx] = (X, Y) + def _encode_idxs(self, idxs: list): + if not isinstance(idxs, list): + raise Exception(f"Passed indexes is not an iterable. Check the type! Index: {idxs}") - def _encode_idxs(self, idxs): - X = [] + X = torch.zeros((len(idxs), self.input_length)) Y = torch.FloatTensor() for col in self.data_frame: if self.encoders.get(col, None): @@ -87,21 +88,28 @@ def _encode_idxs(self, idxs): raise Exception(f'Encoded tensor: {encoded_tensor} contains nan or inf values, this tensor is \ the encoding of column {col} using {self.encoders[col].__class__}') if col != self.target: - X.append(encoded_tensor) + a, b = self.encoder_spans[col] + X[:, a:b] = torch.squeeze(encoded_tensor, dim=list(range(2, len(encoded_tensor.shape)))) + + # target post-processing else: - Y = encoded_tensor.ravel() + if len(encoded_tensor.shape) > 1: + Y = encoded_tensor.squeeze() + else: + Y = encoded_tensor.ravel() - # concatenate features into single tensor - X = torch.concat(X, dim=1) return X, Y def build_cache(self): - assert self.cache_encoded + """ This method builds a cache for the entire dataframe provided at initialization. """ + if not self.use_cache: + raise RuntimeError("Cannot build a cache for EncodedDS with `use_cache` set to False.") + idxs = list(range(len(self.data_frame))) X, Y = self._encode_idxs(idxs) for i, (x, y) in enumerate(zip(X, Y)): - self.cache[i] = (x, y) + self.cache[i] = [x, y] def get_column_original_data(self, column_name: str) -> pd.Series: """ @@ -157,6 +165,7 @@ class ConcatedEncodedDs(EncodedDs): """ `ConcatedEncodedDs` abstracts over multiple encoded datasources (`EncodedDs`) as if they were a single entity. """ # noqa + # TODO: We should probably delete this abstraction, it's not really useful and it adds complexity/overhead def __init__(self, encoded_ds_arr: List[EncodedDs]) -> None: # @TODO: missing super() call here? 
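A quick round trip exercises the vectorized encode/decode pair introduced in this patch. The sketch below is illustrative only; it assumes the prepare/encode/decode signatures shown above (pandas Series in, torch tensor out, Python list back) and uses made-up values.

import pandas as pd
from lightwood.encoder.numeric import NumericEncoder

enc = NumericEncoder()                              # non-target column: 4-component representation
enc.prepare(pd.Series([1.5, -2.0, 100.0]))          # fits the absolute mean used for normalization
encoded = enc.encode(pd.Series([3.0, 0.5, -7.5]))   # torch.Tensor of shape (3, 4)
decoded = enc.decode(encoded)                       # ~[3.0, 0.5, -7.5], up to float precision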
From 0273dedf55b562bb05e56cc8a30ddb5c547a0b74 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Sat, 10 Jun 2023 01:48:29 -0400 Subject: [PATCH 19/38] add support for new type_infer version (backwards compatible change) --- lightwood/encoder/text/short.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lightwood/encoder/text/short.py b/lightwood/encoder/text/short.py index e7f68186e..e4bb320c7 100644 --- a/lightwood/encoder/text/short.py +++ b/lightwood/encoder/text/short.py @@ -55,7 +55,7 @@ def prepare(self, priming_data): unique_tokens = set() max_words_per_sent = 0 for sent in no_null_sentences: - tokens = tokenize_text(sent) + tokens = list(tokenize_text(sent)) max_words_per_sent = max(max_words_per_sent, len(tokens)) for tok in tokens: unique_tokens.add(tok) @@ -78,7 +78,7 @@ def encode(self, column_data: List[str]) -> torch.Tensor: no_null_sentences = (x if x is not None else '' for x in column_data) output = [] for sent in no_null_sentences: - tokens = tokenize_text(sent) + tokens = list(tokenize_text(sent)) encoded_words = self.cae.encode(tokens) encoded_sent = self._combine_fn(encoded_words) output.append(torch.Tensor(encoded_sent)) From 74bd95c71711cc5c1a2e9375365f0f0d754ae801 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Sat, 10 Jun 2023 02:03:54 -0400 Subject: [PATCH 20/38] fix tests --- lightwood/encoder/numeric/numeric.py | 16 +++++++++++----- tests/unit_tests/encoder/numeric/test_numeric.py | 4 ++-- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/lightwood/encoder/numeric/numeric.py b/lightwood/encoder/numeric/numeric.py index 52f12d0c1..fe037708f 100644 --- a/lightwood/encoder/numeric/numeric.py +++ b/lightwood/encoder/numeric/numeric.py @@ -57,8 +57,6 @@ def encode(self, data: Union[np.ndarray, pd.Series]): if isinstance(data, pd.Series): data = data.values - data = np.nan_to_num(data, nan=0).astype(float) - if not self.positive_domain: sign = np.vectorize(self._sign_fn, otypes=[float])(data) else: @@ -72,7 +70,6 @@ def encode(self, data: Union[np.ndarray, pd.Series]): if self.is_target: components = [sign, log_value, norm] else: - # todo: if can't encode return 0s and log.error(f'Can\'t encode input value: {real}, exception: {e}') nones = np.vectorize(self._none_fn, otypes=[float])(data) components = [sign, log_value, norm, nones] @@ -114,6 +111,13 @@ def decode(self, encoded_values: torch.Tensor, decode_log: bool = None) -> list: # set "divergent" value as default (note: finfo.max() instead of pow(10, 63)) ret = np.full((ev.shape[0],), dtype=float, fill_value=np.finfo(np.float64).max) + # `none` filter (if not a target column) + if not self.is_target: + mask_none = ev[:, -1] == 1 + ret[mask_none] = np.nan + else: + mask_none = np.zeros_like(ret) + # sign component sign = np.ones(ev.shape[0], dtype=float) mask_sign = ev[:, 0] < 0.5 @@ -135,7 +139,9 @@ def decode(self, encoded_values: torch.Tensor, decode_log: bool = None) -> list: ret[valid_mask] = real_value[valid_mask] - nan_mask = ret[:, ] == np.nan - ret[nan_mask] = None + # set nan back to None + if mask_none.sum() > 0: + ret = ret.astype(object) + ret[mask_none] = None return ret.tolist() # TODO: update signature on BaseEncoder and replace all encs to return ndarrays diff --git a/tests/unit_tests/encoder/numeric/test_numeric.py b/tests/unit_tests/encoder/numeric/test_numeric.py index 57b8815d9..b81ef7a06 100644 --- a/tests/unit_tests/encoder/numeric/test_numeric.py +++ b/tests/unit_tests/encoder/numeric/test_numeric.py @@ -45,7 +45,7 @@ def 
test_encode_and_decode(self): if decoded is None: self.assertTrue((real is None) or (real != real)) else: - np.testing.assert_almost_equal(round(decoded, 10), round(real, 10)) + np.testing.assert_almost_equal(round(decoded, 6), round(real, 6)) def test_positive_domain(self): data = pd.Series([-1, -2, -100, 5, 10, 15]) @@ -69,7 +69,7 @@ def test_log_overflow_and_none(self): encoder.decode(encoder.encode(data)) for i in range(0, 70, 10): - encoder.decode([[0, pow(2, i), 0]]) + encoder.decode(torch.Tensor([[0, pow(2, i), 0]])) def test_nan_encoding(self): # Generate some numbers From 32f7a1740f3dc23fdaa79aa1c9e58dd8e5d70e66 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Sat, 10 Jun 2023 03:47:15 -0400 Subject: [PATCH 21/38] uncap torchvision to follow torch --- requirements_image.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements_image.txt b/requirements_image.txt index c35ae2276..a66506e04 100644 --- a/requirements_image.txt +++ b/requirements_image.txt @@ -1,2 +1,2 @@ -torchvision >=0.10.0,<0.11.0 +torchvision pillow >8.3.1 From 8f77c4330858d1d6a82ad610c7a72d5adc70b6db Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Sat, 10 Jun 2023 17:38:53 -0400 Subject: [PATCH 22/38] separate into two caches --- .../analysis/helpers/feature_importance.py | 1 + lightwood/data/encoded_ds.py | 86 ++++++++++++------- lightwood/encoder/numeric/numeric.py | 3 +- lightwood/mixer/neural.py | 5 +- lightwood/mixer/random_forest.py | 4 +- lightwood/mixer/regression.py | 8 +- 6 files changed, 64 insertions(+), 43 deletions(-) diff --git a/lightwood/analysis/helpers/feature_importance.py b/lightwood/analysis/helpers/feature_importance.py index ce205f388..de01e6888 100644 --- a/lightwood/analysis/helpers/feature_importance.py +++ b/lightwood/analysis/helpers/feature_importance.py @@ -81,6 +81,7 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]: shuffle_data = deepcopy(ref_data) shuffle_data.clear_cache() shuffle_data.data_frame[col] = shuffle(shuffle_data.data_frame[col].values) + shuffle_data.build_cache() # TODO: bottleneck, add a method to build a single column instead! shuffled_preds = ns.predictor(shuffle_data, args=PredictionArguments.from_dict(args)) shuffled_col_accuracy[col] = np.mean(list(evaluate_accuracies( diff --git a/lightwood/data/encoded_ds.py b/lightwood/data/encoded_ds.py index 7af492017..63484b04f 100644 --- a/lightwood/data/encoded_ds.py +++ b/lightwood/data/encoded_ds.py @@ -1,5 +1,5 @@ import inspect -from typing import List, Tuple +from typing import List, Tuple, Dict import torch import numpy as np import pandas as pd @@ -8,7 +8,7 @@ class EncodedDs(Dataset): - def __init__(self, encoders: List[BaseEncoder], data_frame: pd.DataFrame, target: str) -> None: + def __init__(self, encoders: Dict[str, BaseEncoder], data_frame: pd.DataFrame, target: str) -> None: """ Create a Lightwood datasource from a data frame and some encoders. This class inherits from `torch.utils.data.Dataset`. 
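With the constructor typed against a mapping of column name to encoder, building an EncodedDs by hand looks roughly like the sketch below. Column names, values and encoder choices are made up for illustration; in the real pipeline the encoders are prepared during featurization.

import pandas as pd
from lightwood.encoder.numeric import NumericEncoder
from lightwood.data.encoded_ds import EncodedDs

df = pd.DataFrame({'x': [1.0, 2.0, 3.0], 'y': [2.0, 4.0, 6.0]})
encoders = {'x': NumericEncoder(), 'y': NumericEncoder(is_target=True)}
for col, enc in encoders.items():
    enc.prepare(df[col])                  # encoders must be prepared before the dataset is built
ds = EncodedDs(encoders, df, target='y')  # feature/target caches are built eagerly at init
X, Y = ds[0]                              # cached encoded features and target for row 0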
@@ -21,8 +21,6 @@ def __init__(self, encoders: List[BaseEncoder], data_frame: pd.DataFrame, target self.data_frame = data_frame self.encoders = encoders self.target = target - self.use_cache = True - self.cache = [None] * len(self.data_frame) self.encoder_spans = {} self.input_length = 0 # feature tensor dim @@ -34,6 +32,10 @@ def __init__(self, encoders: List[BaseEncoder], data_frame: pd.DataFrame, target self.input_length += self.encoders[col].output_size # if cache enabled, we immediately build it + self.use_cache = True + self.cache_built = False + self.X_cache: torch.Tensor = torch.full((len(self.data_frame),), fill_value=torch.nan) + self.Y_cache: torch.Tensor = torch.full((len(self.data_frame),), fill_value=torch.nan) self.build_cache() def __len__(self): @@ -47,20 +49,23 @@ def __len__(self): def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]: """ The getter yields a tuple (X, y), where: - - `X `is a concatenation of all encoded representations of the row. Size: (n_features,) - - `y` is the encoded target + - `X `is a concatenation of all encoded representations of the row. Size: (B, n_features) + - `y` is the encoded target. Size: (B, n_features) :param idx: index of the row to access. :return: tuple (X, y) with encoded data. """ # noqa - if self.use_cache and self.cache[idx] is not None: - X, Y = self.cache[idx] + if self.use_cache and self.X_cache[idx] is not torch.nan: + X = self.X_cache[idx, :] + Y = self.Y_cache[idx] else: X, Y = self._encode_idxs([idx, ]) if self.use_cache: - self.cache[idx] = [X, Y] + self.X_cache[idx, :] = X + self.Y_cache[idx, :] = Y + return X, Y def _encode_idxs(self, idxs: list): @@ -68,7 +73,7 @@ def _encode_idxs(self, idxs: list): raise Exception(f"Passed indexes is not an iterable. Check the type! Index: {idxs}") X = torch.zeros((len(idxs), self.input_length)) - Y = torch.FloatTensor() + Y = torch.zeros((len(idxs),)) for col in self.data_frame: if self.encoders.get(col, None): kwargs = {} @@ -93,23 +98,18 @@ def _encode_idxs(self, idxs: list): # target post-processing else: - if len(encoded_tensor.shape) > 1: - Y = encoded_tensor.squeeze() - else: - Y = encoded_tensor.ravel() + Y = encoded_tensor - return X, Y + if len(encoded_tensor.shape) > 2: + Y = encoded_tensor.squeeze() - def build_cache(self): - """ This method builds a cache for the entire dataframe provided at initialization. """ - if not self.use_cache: - raise RuntimeError("Cannot build a cache for EncodedDS with `use_cache` set to False.") + if len(encoded_tensor.shape) < 2: + Y = encoded_tensor.unsqueeze(1) - idxs = list(range(len(self.data_frame))) - X, Y = self._encode_idxs(idxs) + # else: + # Y = encoded_tensor.ravel() - for i, (x, y) in enumerate(zip(X, Y)): - self.cache[i] = [x, y] + return X, Y def get_column_original_data(self, column_name: str) -> pd.Series: """ @@ -127,20 +127,35 @@ def get_encoded_column_data(self, column_name: str) -> torch.Tensor: :param column_name: name of the column. :return: A `torch.Tensor` with the encoded data of the `column_name` column. 
""" + if self.use_cache and self.cache_built: + if column_name == self.target and self.Y_cache is not None: + return self.Y_cache + elif self.X_cache is not torch.nan: + a, b = self.encoder_spans[column_name] + return self.X_cache[:, a:b] + kwargs = {} if 'dependency_data' in inspect.signature(self.encoders[column_name].encode).parameters: deps = [dep for dep in self.encoders[column_name].dependencies if dep in self.data_frame.columns] - kwargs['dependency_data'] = {dep: self.data_frame[dep].tolist() for dep in deps} + kwargs['dependency_data'] = {dep: self.data_frame[dep] for dep in deps} encoded_data = self.encoders[column_name].encode(self.data_frame[column_name], **kwargs) if torch.isnan(encoded_data).any() or torch.isinf(encoded_data).any(): raise Exception(f'Encoded tensor: {encoded_data} contains nan or inf values') if not isinstance(encoded_data, torch.Tensor): raise Exception( - f'The encoder: {self.encoders[column_name]} for column: {column_name} does not return a Tensor !') + f'The encoder: {self.encoders[column_name]} for column: {column_name} does not return a Tensor!') + + if self.use_cache and not self.cache_built: + if column_name == self.target: + self.Y_cache = encoded_data + else: + a, b = self.encoder_spans[column_name] + self.X_cache = self.X_cache[:, a:b] + return encoded_data - def get_encoded_data(self, include_target=True) -> torch.Tensor: + def get_encoded_data(self, include_target: bool = True) -> torch.Tensor: """ Gets all encoded data. @@ -154,11 +169,22 @@ def get_encoded_data(self, include_target=True) -> torch.Tensor: return torch.cat(encoded_dfs, 1) + def build_cache(self): + """ This method builds a cache for the entire dataframe provided at initialization. """ + if not self.use_cache: + raise RuntimeError("Cannot build a cache for EncodedDS with `use_cache` set to False.") + + idxs = list(range(len(self.data_frame))) + X, Y = self._encode_idxs(idxs) + self.X_cache = X + self.Y_cache = Y + self.cache_built = True + def clear_cache(self): - """ - Clears the `EncodedDs` cache. - """ - self.cache = [None] * len(self.data_frame) + """ Clears the `EncodedDs` cache. 
""" + self.X_cache = torch.full((len(self.data_frame),), fill_value=torch.nan) + self.Y_cache = torch.full((len(self.data_frame),), fill_value=torch.nan) + self.cache_built = False class ConcatedEncodedDs(EncodedDs): diff --git a/lightwood/encoder/numeric/numeric.py b/lightwood/encoder/numeric/numeric.py index fe037708f..79edc4425 100644 --- a/lightwood/encoder/numeric/numeric.py +++ b/lightwood/encoder/numeric/numeric.py @@ -73,8 +73,7 @@ def encode(self, data: Union[np.ndarray, pd.Series]): nones = np.vectorize(self._none_fn, otypes=[float])(data) components = [sign, log_value, norm, nones] - ret = torch.Tensor(np.asarray(components)).T - return torch.Tensor(ret) + return torch.Tensor(np.asarray(components)).T @staticmethod def _sign_fn(x: float) -> float: diff --git a/lightwood/mixer/neural.py b/lightwood/mixer/neural.py index 4a66c604f..d3e59ec77 100644 --- a/lightwood/mixer/neural.py +++ b/lightwood/mixer/neural.py @@ -250,8 +250,9 @@ def _error(self, dev_dl, criterion) -> float: def _init_net(self, ds: EncodedDs): self.net_class = DefaultNet if self.net_name == 'DefaultNet' else ArNet - net_kwargs = {'input_size': len(ds[0][0]), - 'output_size': len(ds[0][1]), + X, Y = ds[0] + net_kwargs = {'input_size': len(X), + 'output_size': len(Y), 'num_hidden': self.num_hidden, 'dropout': 0} diff --git a/lightwood/mixer/random_forest.py b/lightwood/mixer/random_forest.py index 4d79fb9d1..e4ac48106 100644 --- a/lightwood/mixer/random_forest.py +++ b/lightwood/mixer/random_forest.py @@ -14,7 +14,7 @@ from type_infer.dtype import dtype from lightwood.helpers.log import log from lightwood.encoder.base import BaseEncoder -from lightwood.data.encoded_ds import ConcatedEncodedDs, EncodedDs +from lightwood.data.encoded_ds import EncodedDs, ConcatedEncodedDs from lightwood.mixer.base import BaseMixer from lightwood.api.types import PredictionArguments @@ -203,7 +203,7 @@ def __call__(self, ds: EncodedDs, :return: dataframe with predictions. 
""" - data = ds.get_encoded_data(include_target=False) + data = ds.get_encoded_data(include_target=False).numpy() if self.is_classifier: predictions = self.model.predict_proba(data) diff --git a/lightwood/mixer/regression.py b/lightwood/mixer/regression.py index fc63183cc..88b2ab709 100644 --- a/lightwood/mixer/regression.py +++ b/lightwood/mixer/regression.py @@ -87,13 +87,7 @@ def __call__(self, ds: EncodedDs, :returns: A dataframe cotaining the decoded predictions and (depending on the args) additional information such as the probabilites for each target class """ # noqa - X = [] - for x, _ in ds: - entry = x.numpy() - if len(entry.shape) > 1: - entry = entry[0] - X.append(entry) - + X = ds.get_encoded_data(include_target=False) Yh = self.model.predict(X) decoded_predictions = self.target_encoder.decode(torch.Tensor(Yh)) From 89fcfa88d18de07bcc2b95c027f79cdcb9641098 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Sat, 10 Jun 2023 18:41:17 -0400 Subject: [PATCH 23/38] grouped ts support --- lightwood/data/encoded_ds.py | 2 +- lightwood/encoder/array/ts_num_array.py | 12 +- lightwood/encoder/numeric/ts_numeric.py | 143 ++++++++++++------------ lightwood/mixer/neural_ts.py | 8 +- 4 files changed, 80 insertions(+), 85 deletions(-) diff --git a/lightwood/data/encoded_ds.py b/lightwood/data/encoded_ds.py index 63484b04f..d9ba4e498 100644 --- a/lightwood/data/encoded_ds.py +++ b/lightwood/data/encoded_ds.py @@ -83,7 +83,7 @@ def _encode_idxs(self, idxs: list): if hasattr(self.encoders[col], 'data_window'): cols = [self.target] + [f'{self.target}_timestep_{i}' for i in range(1, self.encoders[col].data_window)] - data = [self.data_frame[cols].iloc[idxs].values] # TODO: this is likely to fail as is + data = self.data_frame[cols].iloc[idxs].values else: cols = [col] data = self.data_frame[cols].iloc[idxs].values.flatten() diff --git a/lightwood/encoder/array/ts_num_array.py b/lightwood/encoder/array/ts_num_array.py index b61dae507..852f7c3c4 100644 --- a/lightwood/encoder/array/ts_num_array.py +++ b/lightwood/encoder/array/ts_num_array.py @@ -2,13 +2,14 @@ import torch import torch.nn.functional as F +import numpy as np from lightwood.encoder import BaseEncoder from lightwood.encoder.numeric import TsNumericEncoder class TsArrayNumericEncoder(BaseEncoder): - def __init__(self, timesteps: int, is_target: bool = False, positive_domain: bool = False, grouped_by=None): + def __init__(self, timesteps: int, is_target: bool = False, positive_domain: bool = False, grouped_by=None, nan=0): """ This encoder handles arrays of numerical time series data by wrapping the numerical encoder with behavior specific to time series tasks. 
@@ -23,6 +24,7 @@ def __init__(self, timesteps: int, is_target: bool = False, positive_domain: boo self.dependencies = grouped_by self.data_window = timesteps self.positive_domain = positive_domain + self.nan_value = nan self.sub_encoder = TsNumericEncoder(is_target=is_target, positive_domain=positive_domain, grouped_by=grouped_by) self.output_size = self.data_window * self.sub_encoder.output_size @@ -56,9 +58,9 @@ def encode(self, data: Iterable[Iterable], dependency_data: Optional[Dict[str, s for series in data: ret.append(self.encode_one(series, dependency_data=dependency_data)) - return torch.vstack(ret) + return torch.vstack(ret).nan_to_num(self.nan_value) - def encode_one(self, data: Iterable, dependency_data: Optional[Dict[str, str]] = {}) -> torch.Tensor: + def encode_one(self, data: np.ndarray, dependency_data: Optional[Dict[str, str]] = {}) -> torch.Tensor: """ Encodes a single windowed slice of any given time series. @@ -70,8 +72,8 @@ def encode_one(self, data: Iterable, dependency_data: Optional[Dict[str, str]] = """ # noqa ret = [] - for data_point in data: - ret.append(self.sub_encoder.encode([data_point], dependency_data=dependency_data)) + for data_point in data.reshape(-1, 1): + ret.append(self.sub_encoder.encode(data_point, dependency_data=dependency_data)) ret = torch.hstack(ret) padding_size = self.output_size - ret.shape[-1] diff --git a/lightwood/encoder/numeric/ts_numeric.py b/lightwood/encoder/numeric/ts_numeric.py index 3203e355a..8773373e3 100644 --- a/lightwood/encoder/numeric/ts_numeric.py +++ b/lightwood/encoder/numeric/ts_numeric.py @@ -1,9 +1,10 @@ -import math +from typing import Union, List, Dict + import torch import numpy as np +import pandas as pd + from lightwood.encoder.numeric import NumericEncoder -from lightwood.helpers.general import is_none -from lightwood.helpers.log import log class TsNumericEncoder(NumericEncoder): @@ -20,95 +21,93 @@ def __init__(self, is_target: bool = False, positive_domain: bool = False, group self.dependencies = grouped_by self.output_size = 1 - def encode(self, data, dependency_data={}): + def encode(self, data: Union[np.ndarray, pd.Series], dependency_data: Dict[str, List[pd.Series]] = {}): """ + :param data: A pandas series containing the numbers to be encoded :param dependency_data: dict with grouped_by column info, to retrieve the correct normalizer for each datum + + :returns: A torch tensor with the representations of each number """ # noqa if not self.is_prepared: raise Exception('You need to call "prepare" before calling "encode" or "decode".') + if not dependency_data: dependency_data = {'__default': [None] * len(data)} - ret = [] - for real, group in zip(data, list(zip(*dependency_data.values()))): + if isinstance(data, pd.Series): + data = data.values + + # get array of series-wise observed means + if self.normalizers is None: + means = np.full((len(data)), fill_value=self._abs_mean) + else: + # use global mean as default for novel series try: - real = float(real) + means = np.full((len(data)), fill_value=self.normalizers['__default'].abs_mean) except Exception: - try: - real = float(real.replace(',', '.')) - except Exception: - real = None - if self.is_target: - vector = [0] - if group is not None and self.normalizers is not None: - try: - mean = self.normalizers[tuple(group)].abs_mean - except KeyError: - # novel group-by, we use default normalizer mean - mean = self.normalizers['__default'].abs_mean - else: - mean = self._abs_mean + print('!') - if not is_none(real): - vector[0] = real / mean if mean != 0 
else real + def _get_group_mean(group) -> float: + if (group, ) in self.normalizers: + return self.normalizers[(group, )].abs_mean else: - pass - # This should raise an exception *once* we fix the TsEncoder such that this doesn't get feed `nan` - # raise Exception(f'Can\'t encode target value: {real}') - else: - vector = [0] - try: - if not is_none(real): - vector[0] = real / self._abs_mean - except Exception as e: - log.error(f'Can\'t encode input value: {real}, exception: {e}') - - ret.append(vector) - - return torch.Tensor(ret) - - def decode(self, encoded_values, decode_log=None, dependency_data=None): + return self.normalizers['__default'].abs_mean + + for i, group in enumerate(list(zip(*dependency_data.values()))): # TODO: support multigroup + if group is not None: + means = np.vectorize(_get_group_mean, otypes=[float])(group[0].values) + + def _norm_fn(x: float, mean: float) -> float: + return x / mean + + # nones = np.vectorize(self._none_fn, otypes=[float])(data) # TODO + encoded = np.vectorize(_norm_fn, otypes=[float])(data, means) + # encoded[nones] = 0 # if measurement is None, it is zeroed out # TODO + + # TODO: mask for where mean is 0, then pass real as-is + + return torch.Tensor(encoded).unsqueeze(1) + + def decode(self, encoded_values: torch.Tensor, decode_log: bool = None, dependency_data=None): if not self.is_prepared: raise Exception('You need to call "prepare" before calling "encode" or "decode".') - if decode_log is None: - decode_log = self.decode_log + assert isinstance(encoded_values, torch.Tensor), 'It is not a tensor!' # TODO: debug purposes + assert not decode_log # TODO: debug purposes - ret = [] if not dependency_data: dependency_data = {'__default': [None] * len(encoded_values)} - if isinstance(encoded_values, torch.Tensor): - encoded_values = encoded_values.tolist() - - for vector, group in zip(encoded_values, list(zip(*dependency_data.values()))): - if self.is_target: - if np.isnan(vector[0]) or vector[0] == float('inf'): - log.error(f'Got weird target value to decode: {vector}') - real_value = pow(10, 63) - else: - if decode_log: - sign = -1 if vector[0] < 0 else 1 - try: - real_value = math.exp(vector[0]) * sign - except OverflowError: - real_value = pow(10, 63) * sign + + # force = True prevents side effects on the original encoded_values + ev = encoded_values.numpy(force=True) + + # set global mean as default + ret = np.full((ev.shape[0],), dtype=float, fill_value=self._abs_mean) + + # TODO: perhaps capture nan, infs, etc and set to pow(10,63)? 
+ + # set means array + if self.normalizers is None: + means = np.full((ev.shape[0],), fill_value=self._abs_mean) + else: + means = np.full((len(encoded_values)), fill_value=self.normalizers['__default'].abs_mean) + for i, group in enumerate(list(zip(*dependency_data.values()))): + if group is not None: + if tuple(group) in self.normalizers: + means[i] = self.normalizers[tuple(group)].abs_mean else: - if group is not None and self.normalizers is not None: - try: - mean = self.normalizers[tuple(group)].abs_mean - except KeyError: - # decode new group with default normalizer - mean = self.normalizers['__default'].abs_mean - else: - mean = self._abs_mean + means[i] = self.normalizers['__default'].abs_mean + else: + means[i] = self._abs_mean - real_value = vector[0] * mean + # set real value + real_value = np.multiply(ev[:].reshape(-1,), means) + valid_mask = np.ones_like(real_value, dtype=bool) - if self.positive_domain: - real_value = abs(real_value) + # final filters + if self.positive_domain: + real_value = abs(real_value) - else: - real_value = vector[0] * self._abs_mean + ret[valid_mask] = real_value[valid_mask] # TODO probably not needed - ret.append(real_value) - return ret + return ret.tolist() diff --git a/lightwood/mixer/neural_ts.py b/lightwood/mixer/neural_ts.py index ef34b53f7..f40f880a1 100644 --- a/lightwood/mixer/neural_ts.py +++ b/lightwood/mixer/neural_ts.py @@ -7,10 +7,8 @@ import torch from torch import nn -import torch_optimizer as ad_optim from torch.cuda.amp import GradScaler from torch.utils.data import DataLoader -from torch.optim.optimizer import Optimizer from type_infer.dtype import dtype from lightwood.api.types import PredictionArguments @@ -76,10 +74,6 @@ def _select_criterion(self) -> torch.nn.Module: return criterion - def _select_optimizer(self) -> Optimizer: - optimizer = ad_optim.Ranger(self.model.parameters(), lr=self.lr) - return optimizer - def _fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: """ :param train_data: The network is fit/trained on this @@ -109,7 +103,7 @@ def _fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: self.lr, self.model = self._find_lr(train_dl) # Keep on training - optimizer = self._select_optimizer() + optimizer = self._select_optimizer(lr=self.lr) criterion = self._select_criterion() scaler = GradScaler() From 85922fc6e6e9a76e4f2dbc016867dae60c610437 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Sat, 10 Jun 2023 18:51:57 -0400 Subject: [PATCH 24/38] ts tests pass --- lightwood/encoder/numeric/ts_numeric.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lightwood/encoder/numeric/ts_numeric.py b/lightwood/encoder/numeric/ts_numeric.py index 8773373e3..55590d5aa 100644 --- a/lightwood/encoder/numeric/ts_numeric.py +++ b/lightwood/encoder/numeric/ts_numeric.py @@ -54,8 +54,11 @@ def _get_group_mean(group) -> float: return self.normalizers['__default'].abs_mean for i, group in enumerate(list(zip(*dependency_data.values()))): # TODO: support multigroup - if group is not None: - means = np.vectorize(_get_group_mean, otypes=[float])(group[0].values) + if group[0] is not None: + try: + means = np.vectorize(_get_group_mean, otypes=[float])(group[0].values) + except Exception: + print("!") def _norm_fn(x: float, mean: float) -> float: return x / mean From 4519f81d77a10c53187749b3ef2083849ce6e07c Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Sun, 11 Jun 2023 21:33:16 -0400 Subject: [PATCH 25/38] disable optuna by default on RF mixer --- 
lightwood/api/json_ai.py | 1 - lightwood/encoder/numeric/ts_numeric.py | 10 ++-------- lightwood/mixer/neural.py | 2 +- lightwood/mixer/random_forest.py | 12 +++--------- 4 files changed, 6 insertions(+), 19 deletions(-) diff --git a/lightwood/api/json_ai.py b/lightwood/api/json_ai.py index 2240a016b..87b73a5e1 100644 --- a/lightwood/api/json_ai.py +++ b/lightwood/api/json_ai.py @@ -617,7 +617,6 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI: mixers[i]["args"]["target_encoder"] = mixers[i]["args"].get( "target_encoder", "$encoders[self.target]" ) - mixers[i]["args"]["use_optuna"] = True elif mixers[i]["module"] == "LightGBMArray": mixers[i]["args"]["input_cols"] = mixers[i]["args"].get( diff --git a/lightwood/encoder/numeric/ts_numeric.py b/lightwood/encoder/numeric/ts_numeric.py index 55590d5aa..937413208 100644 --- a/lightwood/encoder/numeric/ts_numeric.py +++ b/lightwood/encoder/numeric/ts_numeric.py @@ -42,10 +42,7 @@ def encode(self, data: Union[np.ndarray, pd.Series], dependency_data: Dict[str, means = np.full((len(data)), fill_value=self._abs_mean) else: # use global mean as default for novel series - try: - means = np.full((len(data)), fill_value=self.normalizers['__default'].abs_mean) - except Exception: - print('!') + means = np.full((len(data)), fill_value=self.normalizers['__default'].abs_mean) def _get_group_mean(group) -> float: if (group, ) in self.normalizers: @@ -55,10 +52,7 @@ def _get_group_mean(group) -> float: for i, group in enumerate(list(zip(*dependency_data.values()))): # TODO: support multigroup if group[0] is not None: - try: - means = np.vectorize(_get_group_mean, otypes=[float])(group[0].values) - except Exception: - print("!") + means = np.vectorize(_get_group_mean, otypes=[float])(group[0].values) def _norm_fn(x: float, mean: float) -> float: return x / mean diff --git a/lightwood/mixer/neural.py b/lightwood/mixer/neural.py index d3e59ec77..3b552db89 100644 --- a/lightwood/mixer/neural.py +++ b/lightwood/mixer/neural.py @@ -166,7 +166,7 @@ def _find_lr(self, dl): else: stop = True - best_loss_lr = lr_log[np.argmin(running_losses)] + best_loss_lr = lr_log[np.nanargmin(running_losses)] # nanargmin ignores nans that may arise lr = best_loss_lr log.info(f'Found learning rate of: {lr}') return lr, best_model diff --git a/lightwood/mixer/random_forest.py b/lightwood/mixer/random_forest.py index e4ac48106..89f6ca682 100644 --- a/lightwood/mixer/random_forest.py +++ b/lightwood/mixer/random_forest.py @@ -33,8 +33,8 @@ def __init__( target: str, dtype_dict: Dict[str, str], fit_on_dev: bool, - use_optuna: bool, - target_encoder: BaseEncoder + target_encoder: BaseEncoder, + use_optuna: bool = False, ): """ The `RandomForest` mixer supports both regression and classification tasks. 
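With the implicit use_optuna override removed from json_ai.py, the hyperparameter search becomes opt-in. A hedged sketch of turning it back on through a JsonAI override follows; the import locations and the json_ai.model['args']['submodels'] accessor are assumed from the standard lightwood API and may differ by version, and 'data.csv'/'price' are placeholders.

import pandas as pd
from lightwood.api.high_level import ProblemDefinition, json_ai_from_problem

df = pd.read_csv('data.csv')                              # placeholder dataset
pdef = ProblemDefinition.from_dict({'target': 'price'})
json_ai = json_ai_from_problem(df, problem_definition=pdef)
for submodel in json_ai.model['args']['submodels']:       # assumed location of the mixer list
    if submodel['module'] == 'RandomForest':
        submodel['args']['use_optuna'] = True             # opt back in to the optuna search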
@@ -100,7 +100,6 @@ def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: init_params = { 'n_estimators': 50, 'max_depth': 5, - 'max_features': 1., 'bootstrap': True, 'n_jobs': -1, 'random_state': 0 @@ -128,15 +127,10 @@ def fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: else (mean_squared_error, 'predict') def objective(trial: trial_module.Trial): - criterion = trial.suggest_categorical("criterion", - ["gini", "entropy"]) if self.is_classifier else 'squared_error' + criterion = trial.suggest_categorical("criterion", "gini") if self.is_classifier else 'squared_error' params = { 'n_estimators': trial.suggest_int('n_estimators', 2, 512), - 'max_depth': trial.suggest_int('max_depth', 2, 15), - 'min_samples_split': trial.suggest_int("min_samples_split", 2, 20), - 'min_samples_leaf': trial.suggest_int("min_samples_leaf", 1, 20), - 'max_features': trial.suggest_float("max_features", 0.1, 1), 'criterion': criterion, } From 0ab1f33bdbecc8caea5de50400ba4ec08322c3f1 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Mon, 12 Jun 2023 16:26:57 -0400 Subject: [PATCH 26/38] fix find_lr --- lightwood/mixer/neural.py | 46 +++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/lightwood/mixer/neural.py b/lightwood/mixer/neural.py index 3b552db89..15153fc73 100644 --- a/lightwood/mixer/neural.py +++ b/lightwood/mixer/neural.py @@ -112,13 +112,13 @@ def _select_criterion(self) -> torch.nn.Module: return criterion - def _select_optimizer(self, lr) -> Optimizer: - optimizer = ad_optim.Ranger(self.model.parameters(), lr=lr, weight_decay=2e-2) + def _select_optimizer(self, model, lr) -> Optimizer: + optimizer = ad_optim.Ranger(model.parameters(), lr=lr, weight_decay=2e-2) return optimizer - def _find_lr(self, dl): - lr = 1e-5 # good starting point as search escalates - lrs = deque([5e-5, 1e-4, 5e-4, 1e-3, 2e-3, 3e-3, 5e-3, 1e-2, 5e-2, 1e-1]) + def _find_lr(self, train_data): + lr = 1e-4 # good starting point as search escalates + lrs = deque([5e-4, 1e-3, 2e-3, 3e-3, 5e-3, 1e-2, 5e-2, 1e-1]) starting_model = deepcopy(self.model) criterion = self._select_criterion() scaler = GradScaler() @@ -128,17 +128,21 @@ def _find_lr(self, dl): best_model = self.model stop = False - dl_iter = iter(dl) - X, Y = next(dl_iter) n_steps = 10 cum_loss = 0 while stop is False: - # overfit learning on first sample (yes, biased, but we only really want an intuition on what LR is decent) - optimizer = self._select_optimizer(lr=lr) - self.model = starting_model + # overfit learning on first n_steps samples (biased, but we only want an intuition on what LR is decent) + dl = DataLoader(train_data, + batch_size=min(len(train_data.data_frame), 32, self.batch_size), + shuffle=False) + dl_iter = iter(dl) + self.model = deepcopy(starting_model) + self.model.train() + optimizer = self._select_optimizer(self.model, lr=lr) for i in range(n_steps): + X, Y = next(dl_iter) X = X.to(self.model.device) Y = Y.to(self.model.device) @@ -159,14 +163,17 @@ def _find_lr(self, dl): running_losses.append(cum_loss) lr_log.append(lr) cum_loss = 0 + lr = lrs.popleft() + if len(lrs) == 0: + stop = True - if len(running_losses) < 2 or np.mean(list(running_losses)[:-1]) > np.mean(running_losses) and len(lrs) > 0: - lr = lrs.popleft() + # store model if best so far + inv_running_losses = list(running_losses)[::-1] # invert so when tied we pick the most aggresive LR + best_loss_idx = np.nanargmin(inv_running_losses) # nanargmin ignores nans that may arise + if best_loss_idx == 0: 
best_model = deepcopy(self.model) # store model for slight time savings - else: - stop = True + best_loss_lr = lr_log[-1] - best_loss_lr = lr_log[np.nanargmin(running_losses)] # nanargmin ignores nans that may arise lr = best_loss_lr log.info(f'Found learning rate of: {lr}') return lr, best_model @@ -285,13 +292,10 @@ def _fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: # Find learning rate & keep initial weights self._init_net(train_data) if not self.lr: - sample_dl = DataLoader(train_data, - batch_size=min(len(train_data.data_frame), 32, self.batch_size), - shuffle=True) - self.lr, self.model = self._find_lr(sample_dl) + self.lr, self.model = self._find_lr(train_data) # Keep on training - optimizer = self._select_optimizer(lr=self.lr) + optimizer = self._select_optimizer(self.model, lr=self.lr) criterion = self._select_criterion() scaler = GradScaler() @@ -321,7 +325,7 @@ def partial_fit(self, train_data: EncodedDs, dev_data: EncodedDs, args: Optional self.started = time.time() train_dl = DataLoader(train_data, batch_size=self.batch_size, shuffle=True) dev_dl = DataLoader(dev_data, batch_size=self.batch_size, shuffle=True) - optimizer = self._select_optimizer(lr=self.lr) + optimizer = self._select_optimizer(self.model, lr=self.lr) criterion = self._select_criterion() scaler = GradScaler() From 44d6b0873dfb5d1fd7e965c864b14a5f3a2c9c50 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Mon, 12 Jun 2023 17:08:02 -0400 Subject: [PATCH 27/38] strict criteria to deploy CAE when OHE vector would be larger than 16" --- lightwood/api/json_ai.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lightwood/api/json_ai.py b/lightwood/api/json_ai.py index 87b73a5e1..3a7072130 100644 --- a/lightwood/api/json_ai.py +++ b/lightwood/api/json_ai.py @@ -94,7 +94,7 @@ def lookup_encoder( dtype.binary: "BinaryEncoder", dtype.categorical: "CategoricalAutoEncoder" if statistical_analysis is None - or len(statistical_analysis.histograms[col_name]) > 100 + or len(statistical_analysis.histograms[col_name]['x']) > 16 else "OneHotEncoder", dtype.tags: "MultiHotEncoder", dtype.date: "DatetimeEncoder", @@ -943,14 +943,17 @@ def code_from_json_ai(json_ai: JsonAI) -> str: parallel_encoding = parallel_encoding_check(data['train'], self.encoders) if parallel_encoding: + log.debug('Preparing in parallel...') for col_name, encoder in self.encoders.items(): if col_name != self.target and not encoder.is_trainable_encoder: prepped_encoders[col_name] = (encoder, concatenated_train_dev[col_name], 'prepare') prepped_encoders = mut_method_call(prepped_encoders) else: + log.debug('Preparing sequentially...') for col_name, encoder in self.encoders.items(): if col_name != self.target and not encoder.is_trainable_encoder: + log.debug(f'Preparing encoder for {{col_name}}...') encoder.prepare(concatenated_train_dev[col_name]) prepped_encoders[col_name] = encoder From 53c89afac8dd133a528f4c3fd056e7cfb88c16fb Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Mon, 12 Jun 2023 17:11:27 -0400 Subject: [PATCH 28/38] fix neural_ts --- lightwood/mixer/neural.py | 7 ++++++- lightwood/mixer/neural_ts.py | 4 ++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/lightwood/mixer/neural.py b/lightwood/mixer/neural.py index 15153fc73..697ef288f 100644 --- a/lightwood/mixer/neural.py +++ b/lightwood/mixer/neural.py @@ -142,7 +142,12 @@ def _find_lr(self, train_data): optimizer = self._select_optimizer(self.model, lr=lr) for i in range(n_steps): - X, Y = next(dl_iter) + try: + X, Y = 
next(dl_iter) + except StopIteration: + dl_iter = iter(dl) + X, Y = next(dl_iter) + X = X.to(self.model.device) Y = Y.to(self.model.device) diff --git a/lightwood/mixer/neural_ts.py b/lightwood/mixer/neural_ts.py index f40f880a1..813266cff 100644 --- a/lightwood/mixer/neural_ts.py +++ b/lightwood/mixer/neural_ts.py @@ -100,10 +100,10 @@ def _fit(self, train_data: EncodedDs, dev_data: EncodedDs) -> None: # Find learning rate # keep the weights self._init_net(train_data) - self.lr, self.model = self._find_lr(train_dl) + self.lr, self.model = self._find_lr(train_data) # Keep on training - optimizer = self._select_optimizer(lr=self.lr) + optimizer = self._select_optimizer(self.model, lr=self.lr) criterion = self._select_criterion() scaler = GradScaler() From b16bd1a6adfe4eee9e1749f3e491c55f8063bf42 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Mon, 12 Jun 2023 17:11:46 -0400 Subject: [PATCH 29/38] vectorized tsNumArray.encode() --- lightwood/encoder/array/ts_num_array.py | 31 ++----------------------- lightwood/encoder/categorical/onehot.py | 4 ++-- lightwood/encoder/numeric/ts_numeric.py | 5 ++++ 3 files changed, 9 insertions(+), 31 deletions(-) diff --git a/lightwood/encoder/array/ts_num_array.py b/lightwood/encoder/array/ts_num_array.py index 852f7c3c4..5a20a8fac 100644 --- a/lightwood/encoder/array/ts_num_array.py +++ b/lightwood/encoder/array/ts_num_array.py @@ -1,8 +1,6 @@ from typing import List, Dict, Iterable, Optional import torch -import torch.nn.functional as F -import numpy as np from lightwood.encoder import BaseEncoder from lightwood.encoder.numeric import TsNumericEncoder @@ -54,34 +52,9 @@ def encode(self, data: Iterable[Iterable], dependency_data: Optional[Dict[str, s if not dependency_data: dependency_data = {'__default': [None] * len(data)} - ret = [] - for series in data: - ret.append(self.encode_one(series, dependency_data=dependency_data)) - - return torch.vstack(ret).nan_to_num(self.nan_value) - - def encode_one(self, data: np.ndarray, dependency_data: Optional[Dict[str, str]] = {}) -> torch.Tensor: - """ - Encodes a single windowed slice of any given time series. + ret = self.sub_encoder.encode(data, dependency_data=dependency_data) - :param data: windowed slice of a numerical time series. - :param dependency_data: used to determine the correct normalizer for the input. - - :return: an encoded time series array, as per the underlying `TsNumericEncoder` object. - The output of this encoder for all time steps is concatenated, so the final shape of the tensor is (1, NxK), where N: self.data_window and K: sub-encoder # of output features. 
- """ # noqa - ret = [] - - for data_point in data.reshape(-1, 1): - ret.append(self.sub_encoder.encode(data_point, dependency_data=dependency_data)) - - ret = torch.hstack(ret) - padding_size = self.output_size - ret.shape[-1] - - if padding_size > 0: - ret = F.pad(ret, (0, padding_size)) - - return ret + return torch.Tensor(ret).nan_to_num(self.nan_value) def decode(self, encoded_values, dependency_data=None) -> List[List]: """ diff --git a/lightwood/encoder/categorical/onehot.py b/lightwood/encoder/categorical/onehot.py index c25c09879..e72a1f59c 100644 --- a/lightwood/encoder/categorical/onehot.py +++ b/lightwood/encoder/categorical/onehot.py @@ -68,12 +68,12 @@ def prepare(self, priming_data: Iterable[str]): unq_cats = np.unique([i for i in priming_data if i is not None]).tolist() if self.use_unknown: - log.info("Encoding UNKNOWN categories as index 0") + log.debug("Encoding UNKNOWN categories as index 0") self.map = {cat: indx + 1 for indx, cat in enumerate(unq_cats)} self.map.update({_UNCOMMON_WORD: 0}) self.rev_map = {indx: cat for cat, indx in self.map.items()} else: - log.info("Encoding UNKNOWN categories as vector of all 0s") + log.debug("Encoding UNKNOWN categories as vector of all 0s") self.map = {cat: indx for indx, cat in enumerate(unq_cats)} self.rev_map = {indx: cat for cat, indx in self.map.items()} diff --git a/lightwood/encoder/numeric/ts_numeric.py b/lightwood/encoder/numeric/ts_numeric.py index 937413208..d790f5cb5 100644 --- a/lightwood/encoder/numeric/ts_numeric.py +++ b/lightwood/encoder/numeric/ts_numeric.py @@ -54,6 +54,11 @@ def _get_group_mean(group) -> float: if group[0] is not None: means = np.vectorize(_get_group_mean, otypes=[float])(group[0].values) + if len(data.shape) > 1 and data.shape[1] > 1: + if len(means.shape) == 1: + means = np.expand_dims(means, 1) + means = np.repeat(means, data.shape[1], axis=1) + def _norm_fn(x: float, mean: float) -> float: return x / mean From 5734e0c5c9fc85bb60d5c2cee046c101676d753d Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Mon, 12 Jun 2023 18:10:23 -0400 Subject: [PATCH 30/38] handle unknown param --- lightwood/encoder/categorical/binary.py | 33 +++++++++++++++++++------ 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/lightwood/encoder/categorical/binary.py b/lightwood/encoder/categorical/binary.py index 77c019a42..051cdef8c 100644 --- a/lightwood/encoder/categorical/binary.py +++ b/lightwood/encoder/categorical/binary.py @@ -7,6 +7,7 @@ from lightwood.encoder.base import BaseEncoder from lightwood.helpers.constants import _UNCOMMON_WORD +from lightwood.helpers.log import log class BinaryEncoder(BaseEncoder): @@ -34,17 +35,21 @@ def __init__( self, is_target: bool = False, target_weights: Dict[str, float] = None, + handle_unknown: str = 'use_encoded_value' ): super().__init__(is_target) """ :param is_target: Whether encoder featurizes target column :param target_weights: Percentage of total population represented by each category (from [0, 1]), as a dictionary. + :param handle_unknown: if set to `use_encoded_value`, will encode all classes with index greater than 1 to a special UNKNOWN index that decodes back to `None`. `error` will raise an error when preparing the encoder. 
""" # noqa self.map = {} # category name -> index self.rev_map = {} # index -> category name - self.output_size = 2 + self.output_size = 3 self.encoder_class_type = str + self.handle_unknown = handle_unknown + self.UNK_IDX = 2 # Weight-balance info if encoder represents target self.target_weights = None @@ -67,13 +72,17 @@ def prepare(self, priming_data: Iterable[str]): self.rev_map = {indx: cat for cat, indx in self.map.items()} # Enforce only binary; map must have exactly 2 classes. - if len(self.map) > 2: - raise ValueError(f'Issue with dtype; data has > 2 classes. All classes are: {self.map}') + if len(self.map) > 2 and self.handle_unknown == 'use_encoded_value': + log.warning('Warning: dtype for binary encoder has > 2 classes. Extra classes will be encoded to an invalid token and performance will not be optimal. Try overriding this encoder with a multi-class categorical encoder.') # noqa + log.warning(f'Observed classes are: {self.map}.') + elif self.handle_unknown == 'error': + raise Exception(f'Issue with dtype; data has > 2 classes. All classes are: {self.map}. Aborting.') # For target-only, report on relative weights of classes if self.is_target: - self.index_weights = torch.Tensor([1, 1]) # Equally wt. both classes + self.index_weights = torch.ones(self.output_size) # Equally wt. both classes + self.index_weights[self.UNK_IDX] = 0 # set unknown index to have no effect # If target weights provided, weight by inverse if self.target_weights is not None: @@ -102,13 +111,17 @@ def encode(self, column_data: Iterable[str]) -> torch.Tensor: 'You need to call "prepare" before calling "encode" or "decode".' ) - ret = torch.zeros(size=(len(column_data), 2)) + ret = torch.zeros(size=(len(column_data), self.output_size)) for idx, word in enumerate(column_data): index = self.map.get(word, None) - if index is not None: - ret[idx, index] = 1 + if index is None: + index = self.UNK_IDX # any unknown value maps to UNK_IDX + else: + index = min(index, self.UNK_IDX) # any known value beyond first two also maps to index UNK_IDX + + ret[idx, index] = 1 return torch.Tensor(ret) @@ -130,7 +143,11 @@ def decode(self, encoded_data: torch.Tensor): if not np.any(vector): # Vector of all 0s -> unknown category ret.append(_UNCOMMON_WORD) else: - ret.append(self.rev_map[np.argmax(vector)]) + idx = np.argmax(vector) + if idx == self.UNK_IDX: + ret.append(None) # known, but not either of the supported categories + else: + ret.append(self.rev_map[idx]) return ret From 399715a99c06f0416402cbf24ea14d0562d26a40 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Mon, 12 Jun 2023 18:41:27 -0400 Subject: [PATCH 31/38] tests pass --- lightwood/__about__.py | 2 +- lightwood/encoder/categorical/binary.py | 19 ++++++++----------- requirements.txt | 6 +++--- .../encoder/categorical/test_binary.py | 2 +- 4 files changed, 13 insertions(+), 16 deletions(-) diff --git a/lightwood/__about__.py b/lightwood/__about__.py index 42dc89079..7605812b6 100644 --- a/lightwood/__about__.py +++ b/lightwood/__about__.py @@ -1,6 +1,6 @@ __title__ = 'lightwood' __package_name__ = 'lightwood' -__version__ = '23.5.1.1' +__version__ = '23.6.2.0' __description__ = "Lightwood is a toolkit for automatic machine learning model building" __email__ = "community@mindsdb.com" __author__ = 'MindsDB Inc' diff --git a/lightwood/encoder/categorical/binary.py b/lightwood/encoder/categorical/binary.py index 051cdef8c..8c29f8f0d 100644 --- a/lightwood/encoder/categorical/binary.py +++ b/lightwood/encoder/categorical/binary.py @@ -41,12 +41,12 @@ def 
__init__( """ :param is_target: Whether encoder featurizes target column :param target_weights: Percentage of total population represented by each category (from [0, 1]), as a dictionary. - :param handle_unknown: if set to `use_encoded_value`, will encode all classes with index greater than 1 to a special UNKNOWN index that decodes back to `None`. `error` will raise an error when preparing the encoder. + :param handle_unknown: if set to `use_encoded_value`, will assign all classes with index greater than 1 to a special UNKNOWN index. This doesn't affect the encoded representation of shape (B, 2). During decoding, any unknown or otherwise known but "out-of-bounds" word will be decoded back to the lightwood unknown category token. If this argument is set to `error`, the encoder will raise an error while preparing if there are more than two observed classes. """ # noqa self.map = {} # category name -> index self.rev_map = {} # index -> category name - self.output_size = 3 + self.output_size = 2 self.encoder_class_type = str self.handle_unknown = handle_unknown self.UNK_IDX = 2 @@ -73,16 +73,15 @@ def prepare(self, priming_data: Iterable[str]): # Enforce only binary; map must have exactly 2 classes. if len(self.map) > 2 and self.handle_unknown == 'use_encoded_value': - log.warning('Warning: dtype for binary encoder has > 2 classes. Extra classes will be encoded to an invalid token and performance will not be optimal. Try overriding this encoder with a multi-class categorical encoder.') # noqa + log.warning('Warning: dtype for binary encoder has > 2 classes. Extra classes will be pointed to an invalid token. Try overriding this encoder with a multi-class categorical encoder, otherwise performance may not be optimal.') # noqa log.warning(f'Observed classes are: {self.map}.') elif self.handle_unknown == 'error': - raise Exception(f'Issue with dtype; data has > 2 classes. All classes are: {self.map}. Aborting.') + raise ValueError(f'Data has > 2 classes and encoder is in strict mode. Aborting. All classes are: {self.map}.') # noqa # For target-only, report on relative weights of classes if self.is_target: self.index_weights = torch.ones(self.output_size) # Equally wt. 
both classes - self.index_weights[self.UNK_IDX] = 0 # set unknown index to have no effect # If target weights provided, weight by inverse if self.target_weights is not None: @@ -116,12 +115,10 @@ def encode(self, column_data: Iterable[str]) -> torch.Tensor: for idx, word in enumerate(column_data): index = self.map.get(word, None) - if index is None: - index = self.UNK_IDX # any unknown value maps to UNK_IDX + if index is None or index == self.UNK_IDX: + pass # any unknown value is ignored else: - index = min(index, self.UNK_IDX) # any known value beyond first two also maps to index UNK_IDX - - ret[idx, index] = 1 + ret[idx, index] = 1 return torch.Tensor(ret) @@ -145,7 +142,7 @@ def decode(self, encoded_data: torch.Tensor): else: idx = np.argmax(vector) if idx == self.UNK_IDX: - ret.append(None) # known, but not either of the supported categories + ret.append(_UNCOMMON_WORD) # known, but not either of the supported categories else: ret.append(self.rev_map[idx]) diff --git a/requirements.txt b/requirements.txt index a07b5a275..335c78f41 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ -type_infer ==0.0.9 -dataprep_ml ==0.0.8 -mindsdb-evaluator >=0.0.7 +type_infer >=0.0.10 +dataprep_ml >=0.0.9 +mindsdb-evaluator >=0.0.9 numpy nltk >=3,<3.6 python-dateutil >=2.8.1 diff --git a/tests/unit_tests/encoder/categorical/test_binary.py b/tests/unit_tests/encoder/categorical/test_binary.py index ad2aff72a..4eb7a8837 100644 --- a/tests/unit_tests/encoder/categorical/test_binary.py +++ b/tests/unit_tests/encoder/categorical/test_binary.py @@ -72,7 +72,7 @@ def test_check_only_binary(self): """ Ensure binary strictly enforces binary typing """ data = ["apple", "apple", "orange", "banana", "apple", "orange"] - enc = BinaryEncoder() + enc = BinaryEncoder(handle_unknown='error') self.assertRaises(ValueError, enc.prepare, data) def test_check_probabilities(self): From 860b69787cb7048b725e4285049f4d263fc74118 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Mon, 12 Jun 2023 18:43:39 -0400 Subject: [PATCH 32/38] reformat for clearer src --- lightwood/encoder/categorical/binary.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lightwood/encoder/categorical/binary.py b/lightwood/encoder/categorical/binary.py index 8c29f8f0d..8dad1241f 100644 --- a/lightwood/encoder/categorical/binary.py +++ b/lightwood/encoder/categorical/binary.py @@ -49,7 +49,6 @@ def __init__( self.output_size = 2 self.encoder_class_type = str self.handle_unknown = handle_unknown - self.UNK_IDX = 2 # Weight-balance info if encoder represents target self.target_weights = None @@ -115,7 +114,7 @@ def encode(self, column_data: Iterable[str]) -> torch.Tensor: for idx, word in enumerate(column_data): index = self.map.get(word, None) - if index is None or index == self.UNK_IDX: + if index is None or index >= self.output_size: pass # any unknown value is ignored else: ret[idx, index] = 1 @@ -141,7 +140,7 @@ def decode(self, encoded_data: torch.Tensor): ret.append(_UNCOMMON_WORD) else: idx = np.argmax(vector) - if idx == self.UNK_IDX: + if idx >= self.output_size: ret.append(_UNCOMMON_WORD) # known, but not either of the supported categories else: ret.append(self.rev_map[idx]) From 0f9372d230da9170f53fa511837ac16bb8941a91 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Mon, 12 Jun 2023 19:45:43 -0400 Subject: [PATCH 33/38] fix #1134 --- lightwood/api/json_ai.py | 3 ++- lightwood/encoder/numeric/numeric.py | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git 
a/lightwood/api/json_ai.py b/lightwood/api/json_ai.py index 3a7072130..58081d7b3 100644 --- a/lightwood/api/json_ai.py +++ b/lightwood/api/json_ai.py @@ -173,7 +173,8 @@ def lookup_encoder( if encoder_dict["module"] == "PretrainedLangEncoder" and not is_target: encoder_dict["args"]["output_type"] = "$dtype_dict[$target]" - if eval(encoder_dict["module"]).is_trainable_encoder: + enc_cls = eval(encoder_dict["module"]) + if enc_cls.is_trainable_encoder and hasattr(enc_cls, 'stop_after'): encoder_dict["args"]["stop_after"] = "$problem_definition.seconds_per_encoder" if is_target_predicting_encoder: diff --git a/lightwood/encoder/numeric/numeric.py b/lightwood/encoder/numeric/numeric.py index 79edc4425..c62a4ba31 100644 --- a/lightwood/encoder/numeric/numeric.py +++ b/lightwood/encoder/numeric/numeric.py @@ -57,15 +57,15 @@ def encode(self, data: Union[np.ndarray, pd.Series]): if isinstance(data, pd.Series): data = data.values + data = np.nan_to_num(data.astype(float), nan=0, posinf=20, neginf=-20) + if not self.positive_domain: sign = np.vectorize(self._sign_fn, otypes=[float])(data) else: sign = np.zeros(len(data)) - log_value = np.vectorize(self._log_fn, otypes=[float])(data) - log_value = np.nan_to_num(log_value, nan=0, posinf=20, neginf=-20) + log_value = np.vectorize(self._log_fn, otypes=[float])(data) norm = np.vectorize(self._norm_fn, otypes=[float])(data) - norm = np.nan_to_num(norm, nan=0, posinf=20, neginf=-20) if self.is_target: components = [sign, log_value, norm] From 9636253ea5b8b721da824c8fc72a0b53d828828b Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Mon, 12 Jun 2023 20:08:18 -0400 Subject: [PATCH 34/38] update example notebook --- .../custom_encoder_rulebased.ipynb | 446 ++---------------- 1 file changed, 28 insertions(+), 418 deletions(-) diff --git a/docssrc/source/tutorials/custom_encoder_rulebased/custom_encoder_rulebased.ipynb b/docssrc/source/tutorials/custom_encoder_rulebased/custom_encoder_rulebased.ipynb index 10f9a14e6..56be888cf 100644 --- a/docssrc/source/tutorials/custom_encoder_rulebased/custom_encoder_rulebased.ipynb +++ b/docssrc/source/tutorials/custom_encoder_rulebased/custom_encoder_rulebased.ipynb @@ -39,7 +39,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "raising-adventure", "metadata": { "execution": { @@ -74,7 +74,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "technical-government", "metadata": { "execution": { @@ -84,118 +84,7 @@ "shell.execute_reply": "2022-02-03T21:30:38.234810Z" } }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
modelyearpricetransmissionmileagefuelTypetaxmpgengineSize
0A1201712500Manual15735Petrol15055.41.4
1A6201616500Automatic36203Diesel2064.22.0
2A1201611000Manual29946Petrol3055.41.4
3A4201716800Automatic25952Diesel14567.32.0
4A3201917300Manual1998Petrol14549.61.0
\n", - "
" - ], - "text/plain": [ - " model year price transmission mileage fuelType tax mpg engineSize\n", - "0 A1 2017 12500 Manual 15735 Petrol 150 55.4 1.4\n", - "1 A6 2016 16500 Automatic 36203 Diesel 20 64.2 2.0\n", - "2 A1 2016 11000 Manual 29946 Petrol 30 55.4 1.4\n", - "3 A4 2017 16800 Automatic 25952 Diesel 145 67.3 2.0\n", - "4 A3 2019 17300 Manual 1998 Petrol 145 49.6 1.0" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "filename = 'https://raw.githubusercontent.com/mindsdb/benchmarks/main/benchmarks/datasets/used_car_price/data.csv'\n", "df = pd.read_csv(filename)\n", @@ -224,7 +113,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "absent-maker", "metadata": { "execution": { @@ -234,38 +123,7 @@ "shell.execute_reply": "2022-02-03T21:30:38.968531Z" } }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001B[32mINFO:lightwood-1462817:Dropping features: []\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Analyzing a sample of 6920\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:from a total population of 10668, this is equivalent to 64.9% of your data.\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Using 7 processes to deduct types.\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Infering type for: model\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Infering type for: year\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Infering type for: price\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Infering type for: transmission\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Infering type for: fuelType\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Infering type for: mileage\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Infering type for: tax\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Column year has data type integer\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Column price has data type integer\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Infering type for: mpg\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Infering type for: engineSize\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Column tax has data type integer\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Column mileage has data type integer\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Column engineSize has data type float\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Column mpg has data type float\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Column transmission has data type categorical\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Column fuelType has data type categorical\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Column model has data type categorical\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Starting statistical analysis\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Finished statistical analysis\u001B[0m\n" - ] - } - ], + "outputs": [], "source": [ "# Create the Problem Definition\n", "pdef = ProblemDefinition.from_dict({\n", @@ -287,7 +145,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "coastal-paragraph", "metadata": { "execution": { @@ -297,134 +155,7 @@ "shell.execute_reply": "2022-02-03T21:30:38.973749Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"encoders\": {\n", - " \"price\": {\n", - " \"module\": \"NumericEncoder\",\n", - " \"args\": {\n", - " \"is_target\": \"True\",\n", - " \"positive_domain\": 
\"$statistical_analysis.positive_domain\"\n", - " }\n", - " },\n", - " \"model\": {\n", - " \"module\": \"OneHotEncoder\",\n", - " \"args\": {}\n", - " },\n", - " \"year\": {\n", - " \"module\": \"NumericEncoder\",\n", - " \"args\": {}\n", - " },\n", - " \"transmission\": {\n", - " \"module\": \"OneHotEncoder\",\n", - " \"args\": {}\n", - " },\n", - " \"mileage\": {\n", - " \"module\": \"NumericEncoder\",\n", - " \"args\": {}\n", - " },\n", - " \"fuelType\": {\n", - " \"module\": \"OneHotEncoder\",\n", - " \"args\": {}\n", - " },\n", - " \"tax\": {\n", - " \"module\": \"NumericEncoder\",\n", - " \"args\": {}\n", - " },\n", - " \"mpg\": {\n", - " \"module\": \"NumericEncoder\",\n", - " \"args\": {}\n", - " },\n", - " \"engineSize\": {\n", - " \"module\": \"NumericEncoder\",\n", - " \"args\": {}\n", - " }\n", - " },\n", - " \"dtype_dict\": {\n", - " \"model\": \"categorical\",\n", - " \"year\": \"integer\",\n", - " \"price\": \"integer\",\n", - " \"transmission\": \"categorical\",\n", - " \"mileage\": \"integer\",\n", - " \"fuelType\": \"categorical\",\n", - " \"tax\": \"integer\",\n", - " \"mpg\": \"float\",\n", - " \"engineSize\": \"float\"\n", - " },\n", - " \"dependency_dict\": {},\n", - " \"model\": {\n", - " \"module\": \"BestOf\",\n", - " \"args\": {\n", - " \"submodels\": [\n", - " {\n", - " \"module\": \"Neural\",\n", - " \"args\": {\n", - " \"fit_on_dev\": true,\n", - " \"stop_after\": \"$problem_definition.seconds_per_mixer\",\n", - " \"search_hyperparameters\": true\n", - " }\n", - " },\n", - " {\n", - " \"module\": \"LightGBM\",\n", - " \"args\": {\n", - " \"stop_after\": \"$problem_definition.seconds_per_mixer\",\n", - " \"fit_on_dev\": true\n", - " }\n", - " },\n", - " {\n", - " \"module\": \"Regression\",\n", - " \"args\": {\n", - " \"stop_after\": \"$problem_definition.seconds_per_mixer\"\n", - " }\n", - " }\n", - " ],\n", - " \"args\": \"$pred_args\",\n", - " \"accuracy_functions\": \"$accuracy_functions\",\n", - " \"ts_analysis\": null\n", - " }\n", - " },\n", - " \"problem_definition\": {\n", - " \"target\": \"price\",\n", - " \"pct_invalid\": 2,\n", - " \"unbias_target\": true,\n", - " \"seconds_per_mixer\": 57024.0,\n", - " \"seconds_per_encoder\": null,\n", - " \"expected_additional_time\": 0.5703437328338623,\n", - " \"time_aim\": 259200,\n", - " \"target_weights\": null,\n", - " \"positive_domain\": false,\n", - " \"timeseries_settings\": {\n", - " \"is_timeseries\": false,\n", - " \"order_by\": null,\n", - " \"window\": null,\n", - " \"group_by\": null,\n", - " \"use_previous_target\": true,\n", - " \"horizon\": null,\n", - " \"historical_columns\": null,\n", - " \"target_type\": \"\",\n", - " \"allow_incomplete_history\": true,\n", - " \"eval_cold_start\": true,\n", - " \"interval_periods\": []\n", - " },\n", - " \"anomaly_detection\": false,\n", - " \"use_default_analysis\": true,\n", - " \"ignore_features\": [],\n", - " \"fit_on_all\": true,\n", - " \"strict_mode\": true,\n", - " \"seed_nr\": 420\n", - " },\n", - " \"identifiers\": {},\n", - " \"accuracy_functions\": [\n", - " \"r2_score\"\n", - " ]\n", - "}\n" - ] - } - ], + "outputs": [], "source": [ "print(json_ai.to_json())" ] @@ -484,7 +215,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "e03db1b0", "metadata": { "execution": { @@ -494,15 +225,7 @@ "shell.execute_reply": "2022-02-03T21:30:38.978491Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Overwriting LabelEncoder.py\n" - ] - } - ], + "outputs": [], "source": [ "%%writefile 
LabelEncoder.py\n", "\n", @@ -533,9 +256,9 @@ " is_prepared: bool\n", "\n", " is_timeseries_encoder: bool = False\n", - " is_trainable_encoder: bool = False\n", + " is_trainable_encoder: bool = True\n", "\n", - " def __init__(self, is_target: bool = False) -> None:\n", + " def __init__(self, is_target: bool = False, stop_after = 10) -> None:\n", " \"\"\"\n", " Initialize the Label Encoder\n", "\n", @@ -548,8 +271,7 @@ " # For LabelEncoder, this is always 1 (1 label per category)\n", " self.output_size = 1\n", "\n", - " # Not all encoders need to be prepared\n", - " def prepare(self, priming_data: pd.Series) -> None:\n", + " def prepare(self, train_data: pd.Series, dev_data: pd.Series) -> None:\n", " \"\"\"\n", " Create a LabelEncoder for categorical data.\n", "\n", @@ -561,7 +283,7 @@ " \"\"\"\n", "\n", " # Find all unique categories in the dataset\n", - " categories = priming_data.unique()\n", + " categories = train_data.unique()\n", "\n", " log.info(\"Categories Detected = \" + str(self.output_size))\n", "\n", @@ -608,7 +330,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "e30866c1", "metadata": { "execution": { @@ -670,7 +392,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "elementary-fusion", "metadata": { "execution": { @@ -699,7 +421,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "inappropriate-james", "metadata": { "execution": { @@ -733,7 +455,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "palestinian-harvey", "metadata": { "execution": { @@ -743,47 +465,7 @@ "shell.execute_reply": "2022-02-03T21:30:39.355539Z" } }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001B[32mINFO:lightwood-1462817:Performing statistical analysis on data\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Starting statistical analysis\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Finished statistical analysis\u001B[0m\n", - "\u001B[37mDEBUG:lightwood-1462817: `analyze_data` runtime: 0.14 seconds\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Cleaning the data\u001B[0m\n", - "\u001B[37mDEBUG:lightwood-1462817: `preprocess` runtime: 0.05 seconds\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Splitting the data into train/test\u001B[0m\n", - "\u001B[37mDEBUG:lightwood-1462817: `split` runtime: 0.0 seconds\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Preparing the encoders\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Encoder prepping dict length of: 1\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Encoder prepping dict length of: 2\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Encoder prepping dict length of: 3\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Encoder prepping dict length of: 4\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Encoder prepping dict length of: 5\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Encoder prepping dict length of: 6\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Encoder prepping dict length of: 7\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Encoder prepping dict length of: 8\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Encoder prepping dict length of: 9\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Categories Detected = 1\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Categories Detected = 1\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Categories Detected = 1\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Done running for: 
price\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Done running for: model\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Done running for: year\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Done running for: transmission\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Done running for: mileage\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Done running for: fuelType\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Done running for: tax\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Done running for: mpg\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Done running for: engineSize\u001B[0m\n", - "\u001B[37mDEBUG:lightwood-1462817: `prepare` runtime: 0.16 seconds\u001B[0m\n", - "\u001B[32mINFO:lightwood-1462817:Featurizing the data\u001B[0m\n", - "\u001B[37mDEBUG:lightwood-1462817: `featurize` runtime: 0.0 seconds\u001B[0m\n" - ] - } - ], + "outputs": [], "source": [ "# Perform Stats Analysis\n", "predictor.analyze_data(df)\n", @@ -811,7 +493,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "silent-dealing", "metadata": { "execution": { @@ -821,76 +503,7 @@ "shell.execute_reply": "2022-02-03T21:30:39.392125Z" } }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
fuelTypeEncData
0Diesel1
1Diesel1
2Diesel1
3Petrol2
4Diesel1
\n", - "
" - ], - "text/plain": [ - " fuelType EncData\n", - "0 Diesel 1\n", - "1 Diesel 1\n", - "2 Diesel 1\n", - "3 Petrol 2\n", - "4 Diesel 1" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Pick a categorical column name\n", "col_name = \"fuelType\"\n", @@ -916,7 +529,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "superior-mobility", "metadata": { "execution": { @@ -926,15 +539,7 @@ "shell.execute_reply": "2022-02-03T21:30:39.396663Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'Unknown': 0, 'Diesel': 1, 'Petrol': 2, 'Hybrid': 3}\n" - ] - } - ], + "outputs": [], "source": [ "# Label Name -> Label Number\n", "print(predictor.encoders[col_name].label_dict)" @@ -952,6 +557,11 @@ } ], "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, "language_info": { "codemirror_mode": { "name": "ipython", @@ -967,4 +577,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} From ac3ee693b5e46e828490cd479a814b6d90d207e7 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Mon, 12 Jun 2023 20:08:41 -0400 Subject: [PATCH 35/38] fix shorttext encoder is_trainable --- lightwood/api/json_ai.py | 3 +-- lightwood/encoder/text/short.py | 2 ++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/lightwood/api/json_ai.py b/lightwood/api/json_ai.py index 58081d7b3..3a7072130 100644 --- a/lightwood/api/json_ai.py +++ b/lightwood/api/json_ai.py @@ -173,8 +173,7 @@ def lookup_encoder( if encoder_dict["module"] == "PretrainedLangEncoder" and not is_target: encoder_dict["args"]["output_type"] = "$dtype_dict[$target]" - enc_cls = eval(encoder_dict["module"]) - if enc_cls.is_trainable_encoder and hasattr(enc_cls, 'stop_after'): + if eval(encoder_dict["module"]).is_trainable_encoder: encoder_dict["args"]["stop_after"] = "$problem_definition.seconds_per_encoder" if is_target_predicting_encoder: diff --git a/lightwood/encoder/text/short.py b/lightwood/encoder/text/short.py index e4bb320c7..127bb863f 100644 --- a/lightwood/encoder/text/short.py +++ b/lightwood/encoder/text/short.py @@ -8,6 +8,8 @@ class ShortTextEncoder(BaseEncoder): + is_trainable_encoder = False + def __init__(self, is_target=False, mode=None, device=''): """ :param is_target: From 3babcd958e9e71ad4ca427da33f502e03bfbdcc2 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Mon, 12 Jun 2023 20:27:29 -0400 Subject: [PATCH 36/38] fix numerical encoder sign none handling --- lightwood/encoder/numeric/numeric.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lightwood/encoder/numeric/numeric.py b/lightwood/encoder/numeric/numeric.py index c62a4ba31..1a1cf8b25 100644 --- a/lightwood/encoder/numeric/numeric.py +++ b/lightwood/encoder/numeric/numeric.py @@ -57,15 +57,16 @@ def encode(self, data: Union[np.ndarray, pd.Series]): if isinstance(data, pd.Series): data = data.values - data = np.nan_to_num(data.astype(float), nan=0, posinf=20, neginf=-20) - if not self.positive_domain: - sign = np.vectorize(self._sign_fn, otypes=[float])(data) + sign_data = np.nan_to_num(data, nan=0, posinf=0, neginf=0) + sign = np.vectorize(self._sign_fn, otypes=[float])(sign_data) else: sign = np.zeros(len(data)) - log_value = np.vectorize(self._log_fn, otypes=[float])(data) + log_value = np.nan_to_num(log_value, nan=0, posinf=20, neginf=-20) + norm = np.vectorize(self._norm_fn, otypes=[float])(data) + 
norm = np.nan_to_num(norm, nan=0, posinf=20, neginf=-20) if self.is_target: components = [sign, log_value, norm] From 8c7b78fa04c0bdcbfc094cecb056ebccb198f237 Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Mon, 12 Jun 2023 20:49:46 -0400 Subject: [PATCH 37/38] fix numerical encoder sign none handling --- lightwood/encoder/numeric/numeric.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightwood/encoder/numeric/numeric.py b/lightwood/encoder/numeric/numeric.py index 1a1cf8b25..9c040ca43 100644 --- a/lightwood/encoder/numeric/numeric.py +++ b/lightwood/encoder/numeric/numeric.py @@ -58,7 +58,7 @@ def encode(self, data: Union[np.ndarray, pd.Series]): data = data.values if not self.positive_domain: - sign_data = np.nan_to_num(data, nan=0, posinf=0, neginf=0) + sign_data = np.nan_to_num(data.astype(float), nan=0, posinf=0, neginf=0) sign = np.vectorize(self._sign_fn, otypes=[float])(sign_data) else: sign = np.zeros(len(data)) From 7eaf17ea7547e78c8c540c65cf8da32db59b359e Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Mon, 12 Jun 2023 21:14:19 -0400 Subject: [PATCH 38/38] fix numerical encoder sign none handling --- lightwood/encoder/numeric/numeric.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lightwood/encoder/numeric/numeric.py b/lightwood/encoder/numeric/numeric.py index 9c040ca43..a2b261e3b 100644 --- a/lightwood/encoder/numeric/numeric.py +++ b/lightwood/encoder/numeric/numeric.py @@ -57,15 +57,15 @@ def encode(self, data: Union[np.ndarray, pd.Series]): if isinstance(data, pd.Series): data = data.values + inp_data = np.nan_to_num(data.astype(float), nan=0, posinf=np.finfo(np.float32).max, neginf=np.finfo(np.float32).min) # noqa if not self.positive_domain: - sign_data = np.nan_to_num(data.astype(float), nan=0, posinf=0, neginf=0) - sign = np.vectorize(self._sign_fn, otypes=[float])(sign_data) + sign = np.vectorize(self._sign_fn, otypes=[float])(inp_data) else: sign = np.zeros(len(data)) - log_value = np.vectorize(self._log_fn, otypes=[float])(data) + log_value = np.vectorize(self._log_fn, otypes=[float])(inp_data) log_value = np.nan_to_num(log_value, nan=0, posinf=20, neginf=-20) - norm = np.vectorize(self._norm_fn, otypes=[float])(data) + norm = np.vectorize(self._norm_fn, otypes=[float])(inp_data) norm = np.nan_to_num(norm, nan=0, posinf=20, neginf=-20) if self.is_target:
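
A minimal, self-contained sketch of what PATCHES 36-38 leave `NumericEncoder.encode` doing. This is an illustrative approximation, not part of the patch series: `encode_sketch`, `_sign_fn`, `_log_fn` and `_norm_fn` are hypothetical stand-ins for the encoder's real transforms, used only to show how sanitizing the input once with `np.nan_to_num` keeps `None`/`NaN`/`inf` values from propagating into the sign, log and norm components.

import numpy as np
import pandas as pd

# Simplified stand-ins for illustration only; the real encoder fits its
# normalization on priming data.
def _sign_fn(x):
    return -1.0 if x < 0 else 1.0

def _log_fn(x):
    return float(np.log(abs(x) + 1))

def _norm_fn(x, mean=10.0):
    return float(x / mean)

def encode_sketch(data, positive_domain=False):
    if isinstance(data, pd.Series):
        data = data.values
    # As in PATCH 38: NaN -> 0, +/-inf clamped to float32 bounds before any transform runs.
    inp_data = np.nan_to_num(data.astype(float), nan=0,
                             posinf=np.finfo(np.float32).max,
                             neginf=np.finfo(np.float32).min)
    if not positive_domain:
        sign = np.vectorize(_sign_fn, otypes=[float])(inp_data)
    else:
        sign = np.zeros(len(inp_data))
    log_value = np.vectorize(_log_fn, otypes=[float])(inp_data)
    log_value = np.nan_to_num(log_value, nan=0, posinf=20, neginf=-20)
    norm = np.vectorize(_norm_fn, otypes=[float])(inp_data)
    norm = np.nan_to_num(norm, nan=0, posinf=20, neginf=-20)
    return np.stack([sign, log_value, norm], axis=1)

# A column containing None and inf now encodes to finite values instead of
# raising or emitting NaN components:
print(encode_sketch(pd.Series([3.0, None, np.inf, -7.5])))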