Merge pull request #1027 from mindsdb/staging

Release 22.10.4.0

paxcema authored Oct 26, 2022
2 parents ddcf71e + 9e7ab65 · commit 32d20be

Showing 45 changed files with 1,017 additions and 1,467 deletions.

6 changes: 1 addition & 5 deletions .github/workflows/ligthtwood.yml

@@ -13,12 +13,8 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-latest, windows-latest]
+        os: [ubuntu-latest]
         python-version: [3.7,3.8,3.9]
-        exclude:
-          # exclude combination due to #849
-          - os: windows-latest
-            python-version: 3.9
     steps:
       - uses: actions/checkout@v2
       - name: Set up Python ${{ matrix.python-version }}

2 changes: 2 additions & 0 deletions .gitignore

@@ -57,6 +57,7 @@ test.pickle
 AI.json
 AI2.json
 
+# docs
 assert.sh
 docssrc/build
 docssrc/build/*
@@ -67,3 +68,4 @@ docs
 docs/*
 *.zip
 docs/*
+.ipynb_checkpoints

2 changes: 1 addition & 1 deletion README.md

@@ -120,7 +120,7 @@ pip3 install lightwood
 However, we recommend creating a python virtual environment.
 
 #### Setting up a dev environment
-
+- Python version should be in the range >=3.7, < 3.10
 - Clone lightwood
 - `cd lightwood && pip install -r requirements.txt && pip install -r requirements_image.txt`
 - Add it to your python path (e.g. by adding `export PYTHONPATH='/where/you/cloned/lightwood':$PYTHONPATH` as a newline at the end of your `~/.bashrc` file)

1,363 changes: 25 additions & 1,338 deletions docssrc/source/tutorials/custom_explainer/custom_explainer.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion lightwood/__about__.py

@@ -1,6 +1,6 @@
 __title__ = 'lightwood'
 __package_name__ = 'lightwood'
-__version__ = '22.9.1.0'
+__version__ = '22.10.4.0'
 __description__ = "Lightwood is a toolkit for automatic machine learning model building"
 __email__ = "[email protected]"
 __author__ = 'MindsDB Inc'

7 changes: 5 additions & 2 deletions lightwood/analysis/analyze.py

@@ -1,6 +1,7 @@
 from typing import Dict, List, Tuple, Optional
 
 from lightwood.helpers.log import log
+from lightwood.helpers.ts import filter_ds
 from lightwood.api import dtype
 from lightwood.ensemble import BaseEnsemble
 from lightwood.analysis.base import BaseAnalysisBlock
@@ -53,8 +54,10 @@ def model_analyzer(
 
     # raw predictions for validation dataset
     args = {} if not is_classification else {"predict_proba": True}
+    filtered_df = filter_ds(encoded_val_data, tss)
+    encoded_val_data = EncodedDs(encoded_val_data.encoders, filtered_df, encoded_val_data.target)
     normal_predictions = predictor(encoded_val_data, args=PredictionArguments.from_dict(args))
-    normal_predictions = normal_predictions.set_index(data.index)
+    normal_predictions = normal_predictions.set_index(encoded_val_data.data_frame.index)
 
     # ------------------------- #
     # Run analysis blocks, both core and user-defined
@@ -65,7 +68,7 @@ def model_analyzer(
         'input_cols': input_cols,
         'dtype_dict': dtype_dict,
         'normal_predictions': normal_predictions,
-        'data': data,
+        'data': filtered_df,
         'train_data': train_data,
         'encoded_val_data': encoded_val_data,
         'is_classification': is_classification,

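Note: the `filter_ds` helper imported above lives in `lightwood/helpers/ts.py` and its body is not part of this diff. Judging by how it is called, it trims the validation frame (e.g. dropping rows without enough historical context in time series tasks) so that `normal_predictions`, the rebuilt `EncodedDs`, and the `'data'` entry handed to the analysis blocks all share one index. A minimal sketch of such a helper, with the filtering rule and the flag column as stand-in assumptions:

    import pandas as pd

    def filter_ds(ds, tss):
        # Sketch only: the real implementation is in lightwood/helpers/ts.py.
        # Non-timeseries tasks pass through unchanged.
        df = ds.data_frame
        if tss.is_timeseries and not tss.eval_incomplete:
            # Hypothetical flag column marking rows with incomplete history/target.
            if '__mdb_incomplete' in df.columns:
                df = df[~df['__mdb_incomplete']]
        return df
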
21 changes: 11 additions & 10 deletions lightwood/analysis/helpers/conf_stats.py

@@ -27,10 +27,10 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
             possible_labels = ns.stats_info.train_observed_classes
             self.ordenc.fit([[label] for label in possible_labels])
             task_type = 'categorical'
-        elif ns.is_numerical:
-            task_type = 'numerical'
         elif ns.is_multi_ts:
             task_type = 'multi_ts'
+        elif ns.is_numerical:
+            task_type = 'numerical'
         else:
             return info
 
@@ -66,14 +66,15 @@ def _get_stats(self, confs, preds, data, target, task_type='categorical'):
         if task_type == 'categorical':
             sorted_inp['__mdb_prediction'] = sorted_preds['prediction']
         else:
-            sorted_inp['__mdb_lower'] = confs['lower']
-            sorted_inp['__mdb_upper'] = confs['upper']
-            if task_type == 'numerical':
-                sorted_inp['__mdb_hits'] = (sorted_inp['__mdb_lower'] <= sorted_inp[target]) & \
-                                           (sorted_inp[target] <= sorted_inp['__mdb_upper'])
-            elif task_type == 'multi_ts':
-                sorted_inp['__mdb_hits'] = (sorted_inp['__mdb_lower'][0] <= sorted_inp[target]) & \
-                                           (sorted_inp[target] <= sorted_inp['__mdb_upper'][0])
+            if isinstance(confs['lower'][0], list):
+                sorted_inp['__mdb_lower'] = confs['lower'].apply(lambda x: x[0])
+                sorted_inp['__mdb_upper'] = confs['upper'].apply(lambda x: x[0])
+            else:
+                sorted_inp['__mdb_lower'] = confs['lower']
+                sorted_inp['__mdb_upper'] = confs['upper']
+
+            sorted_inp['__mdb_hits'] = (sorted_inp['__mdb_lower'] <= sorted_inp[target]) & \
+                                       (sorted_inp[target] <= sorted_inp['__mdb_upper'])
 
         size = round(len(sorted_inp) / self.ece_bins)
         bins = []

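Note: the rewritten `_get_stats` branch no longer dispatches on `task_type` for the bounds. It checks whether the confidence bounds arrive as per-row lists (the multi-horizon case) and, if so, collapses them to the first forecast step before computing hits. A toy illustration with invented values:

    import pandas as pd

    confs = pd.DataFrame({'lower': [[1.0, 0.8], [2.0, 1.5]],
                          'upper': [[3.0, 3.5], [4.0, 4.2]]})
    target = pd.Series([2.5, 3.9])

    if isinstance(confs['lower'][0], list):
        lower = confs['lower'].apply(lambda x: x[0])  # first-step bounds: 1.0, 2.0
        upper = confs['upper'].apply(lambda x: x[0])  # first-step bounds: 3.0, 4.0
    hits = (lower <= target) & (target <= upper)      # True, True
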
12 changes: 12 additions & 0 deletions lightwood/analysis/nc/calibrate.py

@@ -41,6 +41,11 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
         data_type = ns.dtype_dict[ns.target]
         output = {'icp': {'__mdb_active': False}}
 
+        if 'confidence' in ns.normal_predictions.columns:
+            # bypass calibrator if model already outputs confidence
+            output['result_df'] = ns.normal_predictions[['confidence', 'lower', 'upper']]
+            return {**info, **output}
+
         fit_params = {'horizon': ns.tss.horizon or 0, 'columns_to_ignore': []}
         fit_params['columns_to_ignore'].extend([f'timestep_{i}' for i in range(1, fit_params['horizon'])])
 
@@ -479,6 +484,13 @@ def _formatted(row_insights, global_insights, ns, is_numerical):
         elif ns.target_dtype in (dtype.short_text, dtype.rich_text, dtype.binary, dtype.categorical):
             row_insights['prediction'] = row_insights['prediction'].astype(str)
 
+        # horizon collapse
+        if ns.tss.is_timeseries and is_numerical and ns.tss.horizon > 1:
+            row_insights['prediction_sum'] = row_insights['prediction'].apply(lambda x: sum(x))
+            row_insights['lower_sum'] = row_insights['lower'].apply(lambda x: min(x))
+            row_insights['upper_sum'] = row_insights['upper'].apply(lambda x: max(x))
+            row_insights['confidence_mean'] = row_insights['confidence'].apply(lambda x: np.mean(x))
+
         return row_insights, global_insights
 
     @staticmethod

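Note: two behavioral additions here. The calibrator now short-circuits when a mixer already emits a `confidence` column, and `_formatted` gains a "horizon collapse" that summarizes multi-step numerical forecasts. Despite the `_sum` suffix, the collapsed bounds take the loosest interval (min of all lower bounds, max of all upper bounds) around the summed prediction. A toy run with invented values:

    import numpy as np
    import pandas as pd

    row_insights = pd.DataFrame({
        'prediction': [[10.0, 12.0, 11.0]],
        'lower':      [[8.0, 9.5, 8.5]],
        'upper':      [[12.0, 14.0, 13.0]],
        'confidence': [[0.90, 0.85, 0.80]],
    })
    row_insights['prediction_sum'] = row_insights['prediction'].apply(sum)       # 33.0
    row_insights['lower_sum'] = row_insights['lower'].apply(min)                 # 8.0
    row_insights['upper_sum'] = row_insights['upper'].apply(max)                 # 14.0
    row_insights['confidence_mean'] = row_insights['confidence'].apply(np.mean)  # 0.85
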
29 changes: 25 additions & 4 deletions lightwood/api/json_ai.py

@@ -11,6 +11,7 @@
 )
 import inspect
 from lightwood.helpers.log import log
+from lightwood.__about__ import __version__ as lightwood_version
 
 
 # For custom modules, we create a module loader with necessary imports below
@@ -39,6 +40,7 @@
 from lightwood.ensemble import *
 from lightwood.helpers.device import *
 from lightwood.helpers.general import *
+from lightwood.helpers.ts import *
 from lightwood.helpers.log import *
 from lightwood.helpers.numeric import *
 from lightwood.helpers.imputers import *
@@ -278,6 +280,13 @@ def generate_json_ai(
                     "stop_after": "$problem_definition.seconds_per_mixer",
                 },
             },
+            {
+                "module": "RandomForest",
+                "args": {
+                    "stop_after": "$problem_definition.seconds_per_mixer",
+                    "fit_on_dev": True,
+                },
+            },
         ]
     )
 elif tss.is_timeseries and tss.horizon > 1:
@@ -372,8 +381,10 @@ def generate_json_ai(
         accuracy_functions = ["r2_score"]
     elif output_dtype in [dtype.categorical, dtype.tags, dtype.binary]:
         accuracy_functions = ["balanced_accuracy_score"]
-    elif output_dtype in (dtype.num_array, dtype.num_tsarray):
-        accuracy_functions = ["bounded_ts_accuracy"]
+    elif output_dtype in (dtype.num_tsarray, ):
+        accuracy_functions = ["complementary_smape_array_accuracy"]
+    elif output_dtype in (dtype.num_array, ):
+        accuracy_functions = ["evaluate_num_array_accuracy"]
    elif output_dtype in (dtype.cat_array, dtype.cat_tsarray):
         accuracy_functions = ["evaluate_cat_array_accuracy"]
     else:
@@ -383,7 +394,8 @@ def generate_json_ai(
 
     if is_ts:
         if output_dtype in [dtype.integer, dtype.float]:
-            accuracy_functions = ["bounded_ts_accuracy"]  # forces this acc fn for t+1 time series forecasters  # noqa
+            # forces this acc fn for t+1 time series forecasters
+            accuracy_functions = ["complementary_smape_array_accuracy"]
 
     if output_dtype in (dtype.integer, dtype.float, dtype.num_tsarray):
         imputers.append({"module": "NumericalImputer",
@@ -590,6 +602,12 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI:
                 "target_encoder", "$encoders[self.target]"
             )
 
+        elif mixers[i]["module"] == "RandomForest":
+            mixers[i]["args"]["target_encoder"] = mixers[i]["args"].get(
+                "target_encoder", "$encoders[self.target]"
+            )
+            mixers[i]["args"]["use_optuna"] = True
+
         elif mixers[i]["module"] == "LightGBMArray":
             mixers[i]["args"]["input_cols"] = mixers[i]["args"].get(
                 "input_cols", "$input_cols"
@@ -602,7 +620,7 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI:
             mixers[i]["args"]["fit_on_dev"] = mixers[i]["args"].get("fit_on_dev", "True")
             mixers[i]["args"]["use_stl"] = mixers[i]["args"].get("use_stl", "False")
 
-        elif mixers[i]["module"] == "NHitsMixer":
+        elif mixers[i]["module"] in ("NHitsMixer", "GluonTSMixer"):
             mixers[i]["args"]["horizon"] = "$problem_definition.timeseries_settings.horizon"
             mixers[i]["args"]["window"] = "$problem_definition.timeseries_settings.window"
             mixers[i]["args"]["ts_analysis"] = mixers[i]["args"].get(
@@ -980,6 +998,8 @@ def code_from_json_ai(json_ai: JsonAI) -> str:
 encoded_train_data = enc_data['train']
 encoded_dev_data = enc_data['dev']
 encoded_test_data = enc_data['test']
+filtered_df = filter_ds(encoded_test_data, self.problem_definition.timeseries_settings)
+encoded_test_data = EncodedDs(encoded_test_data.encoders, filtered_df, encoded_test_data.target)
 log.info('Training the mixers')
@@ -1186,6 +1206,7 @@ def __init__(self):
 self.accuracy_functions = {json_ai.accuracy_functions}
 self.identifiers = {json_ai.identifiers}
 self.dtype_dict = {inline_dict(dtype_dict)}
+self.lightwood_version = '{lightwood_version}'
 # Any feature-column dependencies
 self.dependencies = {inline_dict(json_ai.dependency_dict)}

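Note: putting the two `RandomForest` changes together, `generate_json_ai` now proposes the mixer with explicit `stop_after` and `fit_on_dev` arguments, and `_add_implicit_values` fills in the target encoder and forces Optuna-based hyperparameter search. The resulting mixer entry in the generated JSON-AI should therefore look roughly like this (layout inferred from the diff above):

    mixer_entry = {
        "module": "RandomForest",
        "args": {
            "stop_after": "$problem_definition.seconds_per_mixer",
            "fit_on_dev": True,
            # filled in by _add_implicit_values:
            "target_encoder": "$encoders[self.target]",
            "use_optuna": True,
        },
    }
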
8 changes: 5 additions & 3 deletions lightwood/api/types.py

@@ -62,6 +62,7 @@ class StatisticalAnalysis:
     in the information.
     :param nr_rows: Number of rows (samples) in the dataset
+    :param nr_columns: Number of columns (features) in the dataset
     :param df_target_stddev: The standard deviation of the target of the dataset
     :param train_observed_classes:
     :param target_class_distribution:
@@ -77,6 +78,7 @@
     """ # noqa
 
     nr_rows: int
+    nr_columns: int
     df_target_stddev: Optional[float]
     train_observed_classes: object  # Union[None, List[str]]
     target_class_distribution: object  # Dict[str, float]
@@ -123,7 +125,7 @@ class TimeseriesSettings:
     :param target_type: Automatically inferred dtype of the target (e.g. `dtype.integer`, `dtype.float`).
     :param use_previous_target: Use the previous values of the target column to generate predictions. Defaults to True.
     :param allow_incomplete_history: whether predictions can be made for rows with incomplete historical context (i.e. less than `window` rows have been observed for the datetime that has to be forecasted).
-    :param eval_cold_start: whether to include predictions with incomplete history (thus part of the cold start region for certain mixers) when evaluating mixer scores with the validation dataset.
+    :param eval_incomplete: whether to consider predictions with incomplete history or target information when evaluating mixer accuracy with the validation dataset.
     :param interval_periods: tuple of tuples with user-provided period lengths for time intervals. Default values will be added for intervals left unspecified. For interval options, check the `timeseries_analyzer.detect_period()` method documentation. e.g.: (('daily', 7),).
     """ # noqa
 
@@ -139,7 +141,7 @@ class TimeseriesSettings:
         # @TODO: George: No, I don't think it is, we need to pass this some other way
     )
     allow_incomplete_history: bool = True
-    eval_cold_start: bool = True
+    eval_incomplete: bool = False
     interval_periods: tuple = tuple()
 
     @staticmethod
@@ -170,7 +172,7 @@ def from_dict(obj: Dict):
             historical_columns=[],
             horizon=obj.get("horizon", 1),
             allow_incomplete_history=obj.get('allow_incomplete_history', True),
-            eval_cold_start=obj.get('eval_cold_start', True),
+            eval_incomplete=obj.get('eval_incomplete', False),
             interval_periods=obj.get('interval_periods', tuple(tuple()))
         )
         for setting in obj:

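Note: the default flips along with the rename. `eval_cold_start` defaulted to True, while `eval_incomplete` defaults to False, so incomplete-history rows are now excluded from mixer evaluation unless explicitly requested. Based on the `from_dict` shown above, opting back in would look like this sketch (column name invented):

    tss = TimeseriesSettings.from_dict({
        'order_by': 'saledate',     # hypothetical datetime column
        'window': 10,
        'horizon': 3,
        'eval_incomplete': True,    # score cold-start predictions too
    })
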
2 changes: 2 additions & 0 deletions lightwood/data/statistical_analysis.py

@@ -99,6 +99,7 @@ def statistical_analysis(data: pd.DataFrame,
     except Exception:
         order_format = None
 
+    nr_columns = len(data.columns)
     df = cleaner(data, dtypes, problem_definition.pct_invalid,
                  identifiers, problem_definition.target, 'train', tss,
                  problem_definition.anomaly_detection)
@@ -204,6 +205,7 @@ def statistical_analysis(data: pd.DataFrame,
     log.info('Finished statistical analysis')
     return StatisticalAnalysis(
         nr_rows=nr_rows,
+        nr_columns=nr_columns,
         df_target_stddev=df_std,
         train_observed_classes=train_observed_classes,
         target_class_distribution=target_class_distribution,

2 changes: 1 addition & 1 deletion lightwood/data/timeseries_analyzer.py

@@ -150,7 +150,7 @@ def _pick_ST(tr_subset: pd.Series, dev_subset: pd.Series, sp: list):
     """ # noqa
 
     def _ST_objective(trial: optuna.Trial):
-        trend_degree = trial.suggest_categorical("trend_degree", [1, 2])
+        trend_degree = trial.suggest_categorical("trend_degree", [1])
         ds_sp = trial.suggest_categorical("ds_sp", sp)  # seasonality period to use in deseasonalizer
         if min(min(tr_subset), min(dev_subset)) <= 0:
             decomp_type = trial.suggest_categorical("decomp_type", ['additive'])

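Note: shrinking the `trend_degree` search space to a single choice effectively pins the detrender to linear trends while leaving the Optuna plumbing untouched: `suggest_categorical` with one option always returns it, and the parameter still appears in the study results. A self-contained sketch of that behavior:

    import optuna

    def objective(trial):
        # With a single choice, this always returns 1.
        trend_degree = trial.suggest_categorical("trend_degree", [1])
        return float(trend_degree)

    study = optuna.create_study()
    study.optimize(objective, n_trials=3)
    print(study.best_params)  # {'trend_degree': 1}
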
7 changes: 5 additions & 2 deletions lightwood/encoder/categorical/autoencoder.py

@@ -30,13 +30,15 @@ def __init__(
             max_encoded_length: int = 100,
             desired_error: float = 0.01,
             batch_size: int = 200,
+            device: str = '',
     ):
         """
         :param stop_after: Stops training with provided time limit (sec)
         :param is_target: Encoder represents target class (NOT recommended)
         :param max_encoded_length: Maximum length of vector represented
         :param desired_error: Threshold for reconstruction accuracy error
         :param batch_size: Minimum batch size while training
+        :param device: Name of the device that get_device_from_name will attempt to use
         """ # noqa
         super().__init__(is_target)
         self.is_prepared = False
@@ -48,6 +50,7 @@ def __init__(
         self.encoder = None
         self.decoder = None
         self.onehot_encoder = OneHotEncoder(is_target=self.is_target)
+        self.device_type = device
 
         # Training details
         self.batch_size = batch_size
@@ -148,7 +151,7 @@ def _prepare_AE_input(
 
         # Prepare a one-hot encoder for CatAE inputs
        self.onehot_encoder.prepare(priming_data)
-        self.batch_size = min(self.batch_size, int(len(priming_data) / 50))
+        self.batch_size = max(min(self.batch_size, int(len(priming_data) / 50)), 1)
 
         train_loader = DataLoader(
             list(zip(priming_data, priming_data)),
@@ -170,7 +173,7 @@ def _prepare_catae(self, train_loader: DataLoader, dev_loader: DataLoader):
         """ # noqa
         input_len = self.onehot_encoder.output_size
 
-        self.net = DefaultNet(shape=[input_len, self.output_size, input_len])
+        self.net = DefaultNet(shape=[input_len, self.output_size, input_len], device=self.device_type)
 
         criterion = torch.nn.CrossEntropyLoss()
         optimizer = Ranger(self.net.parameters())

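Note: the `max(..., 1)` clamp on `batch_size` guards small priming datasets. With fewer than 50 rows, `int(len(priming_data) / 50)` evaluates to 0, and `torch.utils.data.DataLoader` rejects `batch_size=0` with a ValueError. A worked example of the fix:

    batch_size, n_rows = 200, 30
    old = min(batch_size, int(n_rows / 50))            # min(200, 0) -> 0: DataLoader would raise
    new = max(min(batch_size, int(n_rows / 50)), 1)    # -> 1: smallest valid batch size
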
9 changes: 5 additions & 4 deletions lightwood/encoder/image/helpers/img_to_vec.py

@@ -1,6 +1,6 @@
 import torch
 import torch.nn as nn
-from lightwood.helpers.device import get_devices
+from lightwood.helpers.device import get_device_from_name
 from lightwood.helpers.torch import LightwoodAutocast
 
 from lightwood.helpers.log import log
@@ -33,16 +33,17 @@ class Img2Vec(nn.Module):
     Output is a `self.output_size`-dimensioned vector, generated by taking the output of the Resnext's last convolutional layer and performing an adaptive channel pool average.
     """ # noqa
-    def __init__(self):
+    def __init__(self, device=''):
         super(Img2Vec, self).__init__()
 
-        self.device, _ = get_devices()
+        self.device = get_device_from_name(device)
 
         self.output_size = 512
         self.model = torch.nn.Sequential(*list(models.resnext50_32x4d(pretrained=True).children())[: -1],
                                          ChannelPoolAdaptiveAvg1d(output_size=self.output_size))
         self.model = self.model.to(self.device)
 
-    def to(self, device, available_devices):
+    def to(self, device, available_devices=1):
         self.device = device
         self.model = self.model.to(self.device)
         return self

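Note: `get_device_from_name` replaces the auto-only `get_devices()` so callers can pin a device by name; the empty-string default presumably falls back to auto-detection. The real helper lives in `lightwood/helpers/device.py` and is not part of this diff; a minimal sketch under that assumption:

    import torch

    def get_device_from_name(device_name: str = '') -> torch.device:
        # Sketch only: honor an explicit name ('cpu', 'cuda:0', ...),
        # otherwise pick CUDA when available.
        if device_name != '':
            return torch.device(device_name)
        return torch.device('cuda' if torch.cuda.is_available() else 'cpu')
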
(Diffs for the remaining changed files were not loaded.)