Merge pull request #874 from mindsdb/staging
Release 22.5.1.0
paxcema authored May 6, 2022
2 parents fe83d62 + 3ddf8ab commit 55131e1
Showing 21 changed files with 220 additions and 108 deletions.
58 changes: 29 additions & 29 deletions docssrc/source/tutorials/custom_cleaner/custom_cleaner.ipynb
@@ -195,7 +195,7 @@
"### 2) Create a JSON-AI default object\n",
"Before we create a custom cleaner object, let's first create JSON-AI syntax for our problem based on its specifications. We can do so by setting up a ``ProblemDefinition``. The ``ProblemDefinition`` allows us to specify the target, the column we intend to predict, along with other details. \n",
"\n",
"The end goal of JSON-AI is to provide **a set of instructions on how to compile a machine learning pipeline*.\n",
"The end goal of JSON-AI is to provide *a set of instructions on how to compile a machine learning pipeline*.\n",
"\n",
"In this case, let's specify our target, the aptly named **target** column. We will also tell JSON-AI to throw away features we never intend to use, such as \"url_legal\", \"license\", and \"standard_error\". We can do so in the following lines:"
]
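The code cell that follows is collapsed in this diff view. For reference, a minimal sketch of what such a cell looks like with lightwood's high-level API (a plausible reconstruction, not the literal cell from this commit; the CSV path is hypothetical):

import pandas as pd
from lightwood.api.high_level import ProblemDefinition, json_ai_from_problem

df = pd.read_csv("train.csv")  # hypothetical path to the tutorial dataset

# Name the target and drop the features we never intend to use.
pdef = ProblemDefinition.from_dict({
    "target": "target",
    "ignore_features": ["url_legal", "license", "standard_error"],
})

json_ai = json_ai_from_problem(df, problem_definition=pdef)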
@@ -217,20 +217,20 @@
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32mINFO:lightwood-1462419:Dropping features: ['url_legal', 'license', 'standard_error']\u001b[0m\n",
"\u001b[32mINFO:lightwood-1462419:Analyzing a sample of 2478\u001b[0m\n",
"\u001b[32mINFO:lightwood-1462419:from a total population of 2834, this is equivalent to 87.4% of your data.\u001b[0m\n",
"\u001b[32mINFO:lightwood-1462419:Using 7 processes to deduct types.\u001b[0m\n",
"\u001b[32mINFO:lightwood-1462419:Infering type for: id\u001b[0m\n",
"\u001b[32mINFO:lightwood-1462419:Infering type for: target\u001b[0m\n",
"\u001b[32mINFO:lightwood-1462419:Infering type for: excerpt\u001b[0m\n",
"\u001b[32mINFO:lightwood-1462419:Column target has data type float\u001b[0m\n",
"\u001b[32mINFO:lightwood-1462419:Doing text detection for column: id\u001b[0m\n",
"\u001b[32mINFO:lightwood-1462419:Doing text detection for column: excerpt\u001b[0m\n",
"\u001b[32mINFO:lightwood-1462419:Column id has data type categorical\u001b[0m\n",
"\u001b[33mWARNING:lightwood-1462419:Column id is an identifier of type \"Hash-like identifier\"\u001b[0m\n",
"\u001b[32mINFO:lightwood-1462419:Starting statistical analysis\u001b[0m\n",
"\u001b[32mINFO:lightwood-1462419:Finished statistical analysis\u001b[0m\n"
"\u001B[32mINFO:lightwood-1462419:Dropping features: ['url_legal', 'license', 'standard_error']\u001B[0m\n",
"\u001B[32mINFO:lightwood-1462419:Analyzing a sample of 2478\u001B[0m\n",
"\u001B[32mINFO:lightwood-1462419:from a total population of 2834, this is equivalent to 87.4% of your data.\u001B[0m\n",
"\u001B[32mINFO:lightwood-1462419:Using 7 processes to deduct types.\u001B[0m\n",
"\u001B[32mINFO:lightwood-1462419:Infering type for: id\u001B[0m\n",
"\u001B[32mINFO:lightwood-1462419:Infering type for: target\u001B[0m\n",
"\u001B[32mINFO:lightwood-1462419:Infering type for: excerpt\u001B[0m\n",
"\u001B[32mINFO:lightwood-1462419:Column target has data type float\u001B[0m\n",
"\u001B[32mINFO:lightwood-1462419:Doing text detection for column: id\u001B[0m\n",
"\u001B[32mINFO:lightwood-1462419:Doing text detection for column: excerpt\u001B[0m\n",
"\u001B[32mINFO:lightwood-1462419:Column id has data type categorical\u001B[0m\n",
"\u001B[33mWARNING:lightwood-1462419:Column id is an identifier of type \"Hash-like identifier\"\u001B[0m\n",
"\u001B[32mINFO:lightwood-1462419:Starting statistical analysis\u001B[0m\n",
"\u001B[32mINFO:lightwood-1462419:Finished statistical analysis\u001B[0m\n"
]
}
],
@@ -1069,15 +1069,15 @@
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32mINFO:lightwood-1462419:Performing statistical analysis on data\u001b[0m\n",
"\u001b[32mINFO:lightwood-1462419:Starting statistical analysis\u001b[0m\n",
"\u001b[32mINFO:lightwood-1462419:Finished statistical analysis\u001b[0m\n",
"\u001b[37mDEBUG:lightwood-1462419: `analyze_data` runtime: 0.03 seconds\u001b[0m\n",
"\u001b[32mINFO:lightwood-1462419:Cleaning the data\u001b[0m\n",
"\u001b[32mINFO:lightwood-1462419:Cleaning column =excerpt\u001b[0m\n",
"\u001b[32mINFO:lightwood-1462419:Cleaning column =target\u001b[0m\n",
"\u001b[32mINFO:lightwood-1462419:Converted target into strictly non-negative\u001b[0m\n",
"\u001b[37mDEBUG:lightwood-1462419: `preprocess` runtime: 0.06 seconds\u001b[0m\n"
"\u001B[32mINFO:lightwood-1462419:Performing statistical analysis on data\u001B[0m\n",
"\u001B[32mINFO:lightwood-1462419:Starting statistical analysis\u001B[0m\n",
"\u001B[32mINFO:lightwood-1462419:Finished statistical analysis\u001B[0m\n",
"\u001B[37mDEBUG:lightwood-1462419: `analyze_data` runtime: 0.03 seconds\u001B[0m\n",
"\u001B[32mINFO:lightwood-1462419:Cleaning the data\u001B[0m\n",
"\u001B[32mINFO:lightwood-1462419:Cleaning column =excerpt\u001B[0m\n",
"\u001B[32mINFO:lightwood-1462419:Cleaning column =target\u001B[0m\n",
"\u001B[32mINFO:lightwood-1462419:Converted target into strictly non-negative\u001B[0m\n",
"\u001B[37mDEBUG:lightwood-1462419: `preprocess` runtime: 0.06 seconds\u001B[0m\n"
]
},
{
@@ -1178,8 +1178,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1mOriginal Data\n",
"\u001b[0m\n",
"\u001B[1mOriginal Data\n",
"\u001B[0m\n",
"Excerpt:\n",
" When the young people returned to the ballroom, it presented a decidedly changed appearance. Instead of an interior scene, it was a winter landscape.\n",
"The floor was covered with snow-white canvas, not laid on smoothly, but rumpled over bumps and hillocks, like a real snow field. The numerous palms and evergreens that had decorated the room, were powdered with flour and strewn with tufts of cotton, like snow. Also diamond dust had been lightly sprinkled on them, and glittering crystal icicles hung from the branches.\n",
@@ -1190,10 +1190,10 @@
"\n",
"Target:\n",
" -0.340259125\n",
"\u001b[1m\n",
"\u001B[1m\n",
"\n",
"Cleaned Data\n",
"\u001b[0m\n",
"\u001B[0m\n",
"Excerpt:\n",
" When young people returned ballroom, presented decidedly changed appearance. Instead interior scene, winter landscape. The floor covered snow-white canvas, laid smoothly, rumpled bumps hillocks, like real snow field. The numerous palms evergreens decorated room, powdered flour strewn tufts cotton, like snow. Also diamond dust lightly sprinkled them, glittering crystal icicles hung branches. At end room, wall, hung beautiful bear-skin rug. These rugs prizes, one girls one boys. And game. The girls gathered one end room boys other, one end called North Pole, South Pole. Each player given small flag plant reaching Pole. This would easy matter, traveller obliged wear snowshoes.\n",
"\n",
@@ -1248,4 +1248,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
}
}
2 changes: 1 addition & 1 deletion lightwood/__about__.py
@@ -1,6 +1,6 @@
__title__ = 'lightwood'
__package_name__ = 'lightwood'
__version__ = '22.4.4.0'
__version__ = '22.5.1.0'
__description__ = "Lightwood is a toolkit for automatic machine learning model building"
__email__ = "[email protected]"
__author__ = 'MindsDB Inc'
26 changes: 13 additions & 13 deletions lightwood/analysis/nc/calibrate.py
@@ -124,7 +124,7 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
output['icp']['__mdb_group_keys'] = [x for x in group_info.keys()]

for combination in all_group_combinations:
output['icp'][frozenset(combination)] = deepcopy(icp)
output['icp'][tuple(combination)] = deepcopy(icp)

# calibrate ICP
icp_df = deepcopy(ns.data)
@@ -165,37 +165,37 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
for key, val in zip(group_keys, group):
icp_df = icp_df[icp_df[key] == val]

if icps[frozenset(group)].nc_function.normalizer is not None:
group_normalizer = icps[frozenset(group)].nc_function.normalizer
if icps[tuple(group)].nc_function.normalizer is not None:
group_normalizer = icps[tuple(group)].nc_function.normalizer
norm_input_df = ns.encoded_val_data.data_frame.iloc[icp_df.pop('__mdb_norm_index')]
norm_input = EncodedDs(ns.encoded_val_data.encoders, norm_input_df, ns.target)
norm_cache = group_normalizer(norm_input, args=PredictionArguments())
icp_df[f'__norm_{ns.target}'] = norm_cache

# save relevant predictions in the caches, then calibrate the ICP
pred_cache = icp_df.pop(f'__predicted_{ns.target}').values
icps[frozenset(group)].nc_function.model.prediction_cache = pred_cache
icps[tuple(group)].nc_function.model.prediction_cache = pred_cache
icp_df, y = clean_df(icp_df, ns.target, ns.is_classification, output.get('label_encoders', None))
if icps[frozenset(group)].nc_function.normalizer is not None:
icps[frozenset(group)].nc_function.normalizer.prediction_cache = icp_df.pop(
if icps[tuple(group)].nc_function.normalizer is not None:
icps[tuple(group)].nc_function.normalizer.prediction_cache = icp_df.pop(
f'__norm_{ns.target}').values

icps[frozenset(group)].index = icp_df.columns # important at inference time
icps[frozenset(group)].calibrate(icp_df.values, y)
icps[tuple(group)].index = icp_df.columns # important at inference time
icps[tuple(group)].calibrate(icp_df.values, y)

# save training std() for bounds width selection
if not ns.is_classification:
icp_train_df = ns.data
for key, val in zip(group_keys, group):
icp_train_df = icp_train_df[icp_train_df[key] == val]
y_train = icp_train_df[ns.target].values
output['df_target_stddev'][frozenset(group)] = y_train.std()
output['df_target_stddev'][tuple(group)] = y_train.std()

# get bounds for relevant rows in validation dataset
conf, group_ranges = set_conf_range(
icp_df, icps[frozenset(group)],
icp_df, icps[tuple(group)],
ns.dtype_dict[ns.target],
output, group=frozenset(group),
output, group=tuple(group),
positive_domain=self.positive_domain, significance=self.fixed_significance)
# save group bounds
if not ns.is_classification:
@@ -304,7 +304,7 @@ def explain(self, row_insights: pd.DataFrame, global_insights: Dict[str, object]
group_keys = icps['__mdb_group_keys']

for group in icps['__mdb_groups']:
icp = icps[frozenset(group)]
icp = icps[tuple(group)]

# check ICP has calibration scores
if icp.cal_scores[0].shape[0] > 0:
@@ -328,7 +328,7 @@ def explain(self, row_insights: pd.DataFrame, global_insights: Dict[str, object]
all_confs,
df_target_stddev=ns.analysis['df_target_stddev'],
positive_domain=self.positive_domain,
group=frozenset(group),
group=tuple(group),
fixed_conf=fixed_conf
)

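Why the frozenset-to-tuple switch throughout this file matters: frozenset keys ignore element order and collapse duplicate values, so two distinct group-by combinations could silently share one ICP cache entry. A quick illustration in plain Python (not from the repo):

>>> frozenset(("A", "B")) == frozenset(("B", "A"))
True   # differently ordered combinations collide on one key
>>> frozenset(("A", "A")) == frozenset(("A",))
True   # repeated group values collapse into one element
>>> ("A", "B") == ("B", "A")
False  # tuples keep order and duplicates, so each combination gets its own ICP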
8 changes: 6 additions & 2 deletions lightwood/analysis/nc/util.py
@@ -163,8 +163,12 @@ def get_categorical_conf(raw_confs: np.ndarray):
"""
if len(raw_confs.shape) == 1:
raw_confs = np.expand_dims(raw_confs, axis=0)
second_p = np.sort(raw_confs, axis=1)[:, -2]
confs = np.clip(np.subtract(1, second_p), 0.0001, 0.9999)
if raw_confs.shape[-1] == 1:
# single-class edge case (only happens if predictor sees just one known label at calibration)
confs = np.clip(raw_confs[:, 0], 0.0001, 0.9999)
else:
second_p = np.sort(raw_confs, axis=1)[:, -2]
confs = np.clip(np.subtract(1, second_p), 0.0001, 0.9999)
return confs


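The new branch in get_categorical_conf covers calibration data that contained only one known label: confidence is normally 1 minus the second-highest class probability, and with a single class there is no second probability to subtract. A worked sketch of both branches, with illustrative numbers:

import numpy as np

raw = np.array([[0.7, 0.2, 0.1]])              # usual case: several classes
second_p = np.sort(raw, axis=1)[:, -2]         # second-highest probability: 0.2
confs = np.clip(1 - second_p, 0.0001, 0.9999)  # -> 0.8

raw_single = np.array([[1.0]])                 # edge case: one known class only
confs_single = np.clip(raw_single[:, 0], 0.0001, 0.9999)  # -> 0.9999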
36 changes: 31 additions & 5 deletions lightwood/api/json_ai.py
@@ -210,6 +210,7 @@ def generate_json_ai(

is_target_predicting_encoder = False
is_ts = problem_definition.timeseries_settings.is_timeseries
imputers = []

# Single text column classification
if (
@@ -267,7 +268,7 @@
"args": {
"fit_on_dev": True,
"stop_after": "$problem_definition.seconds_per_mixer",
"n_ts_predictions": "$problem_definition.timeseries_settings.horizon",
"horizon": "$problem_definition.timeseries_settings.horizon",
},
}
]
@@ -280,7 +281,7 @@
"module": "SkTime",
"args": {
"stop_after": "$problem_definition.seconds_per_mixer",
"n_ts_predictions": "$problem_definition.timeseries_settings.horizon",
"horizon": "$problem_definition.timeseries_settings.horizon",
},
}
]
@@ -344,10 +345,22 @@
f"Please specify a custom accuracy function for output type {output_dtype}"
)

# special dispatch for t+1 time series forecasters
if is_ts:
if output_dtype in [dtype.integer, dtype.float]:
accuracy_functions = ["evaluate_num_array_accuracy"]
accuracy_functions = ["evaluate_num_array_accuracy"] # forces this acc fn for t+1 time series forecasters

if output_dtype in (dtype.integer, dtype.float, dtype.num_tsarray):
imputers.append({"module": "NumericalImputer",
"args": {
"value": "'zero'",
"target": f"'{target}'"}}
)
elif output_dtype in [dtype.categorical, dtype.tags, dtype.binary, dtype.cat_tsarray]:
imputers.append({"module": "CategoricalImputer",
"args": {
"value": "'mode'",
"target": f"'{target}'"}}
)

if problem_definition.time_aim is None:
# 5 days
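With these additions, a numeric target named "target" produces a JSON-AI imputer entry roughly like:

{
    "module": "NumericalImputer",
    "args": {"value": "'zero'", "target": "'target'"}
}

while a categorical, binary, or tags target gets a CategoricalImputer with "value": "'mode'". The doubled quoting ("'zero'") appears deliberate: JSON-AI argument values are spliced into generated Python code, so string literals must carry their own quotes.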
@@ -379,6 +392,7 @@
analyzer=None,
explainer=None,
encoders=encoders,
imputers=imputers,
dtype_dict=dtype_dict,
dependency_dict=dependency_dict,
model=model,
@@ -481,6 +495,7 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI:
for i in range(len(mixers)):
if mixers[i]["module"] == "Unit":
pass

elif mixers[i]["module"] == "Neural":
mixers[i]["args"]["target_encoder"] = mixers[i]["args"].get(
"target_encoder", "$encoders[self.target]"
@@ -511,6 +526,7 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI:
"target_encoder", "$encoders[self.target]"
)
mixers[i]["args"]["use_optuna"] = True

elif mixers[i]["module"] == "Regression":
mixers[i]["args"]["target"] = mixers[i]["args"].get("target", "$target")
mixers[i]["args"]["dtype_dict"] = mixers[i]["args"].get(
@@ -519,6 +535,7 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI:
mixers[i]["args"]["target_encoder"] = mixers[i]["args"].get(
"target_encoder", "$encoders[self.target]"
)

elif mixers[i]["module"] == "LightGBMArray":
mixers[i]["args"]["target"] = mixers[i]["args"].get("target", "$target")
mixers[i]["args"]["dtype_dict"] = mixers[i]["args"].get(
@@ -530,17 +547,26 @@
mixers[i]["args"]["target_encoder"] = mixers[i]["args"].get(
"target_encoder", "$encoders[self.target]"
)
elif mixers[i]["module"] == "SkTime":
if "horizon" not in mixers[i]["args"]:
mixers[i]["args"]["horizon"] = "$problem_definition.timeseries_settings.horizon"

elif mixers[i]["module"] in ("SkTime", "ProphetMixer"):
mixers[i]["args"]["target"] = mixers[i]["args"].get("target", "$target")
mixers[i]["args"]["dtype_dict"] = mixers[i]["args"].get(
"dtype_dict", "$dtype_dict"
)
mixers[i]["args"]["ts_analysis"] = mixers[i]["args"].get(
"ts_analysis", "$ts_analysis"
)
if "horizon" not in mixers[i]["args"]:
mixers[i]["args"]["horizon"] = "$problem_definition.timeseries_settings.horizon"

# enforce fit_on_all if this mixer is specified
problem_definition.fit_on_all = True

if "stop_after" not in mixers[i]["args"]:
mixers[i]["args"]["stop_after"] = "$problem_definition.seconds_per_mixer"

json_ai.model["args"]["target"] = json_ai.model["args"].get("target", "$target")
json_ai.model["args"]["data"] = json_ai.model["args"].get("data", "encoded_test_data")
json_ai.model["args"]["mixers"] = json_ai.model["args"].get("mixers", "$mixers")
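Taken together, the _add_implicit_values changes mean a minimal user-supplied forecasting mixer such as {"module": "SkTime"} is expanded to roughly:

{
    "module": "SkTime",
    "args": {
        "target": "$target",
        "dtype_dict": "$dtype_dict",
        "ts_analysis": "$ts_analysis",
        "horizon": "$problem_definition.timeseries_settings.horizon",
        "stop_after": "$problem_definition.seconds_per_mixer"
    }
}

ProphetMixer now receives the same defaults, and requesting either mixer also forces problem_definition.fit_on_all = True so the final model is refit on all available data before forecasting.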
5 changes: 3 additions & 2 deletions lightwood/data/cleaner.py
@@ -58,8 +58,9 @@ def cleaner(
data = clean_timeseries(data, timeseries_settings)

for col, imputer in imputers.items():
cols = [col] + [col for col in imputer.dependencies]
data[col] = imputer.impute(data[cols])
if col in data.columns:
cols = [col] + [col for col in imputer.dependencies]
data[col] = imputer.impute(data[cols])

return data

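The new guard makes the cleaner tolerant of imputers whose target column is absent from the frame, for example one configured for a feature that was dropped earlier via ignore_features. A sketch of the behavioral change (hypothetical setup):

imputers = {"standard_error": imputer}  # column was dropped before cleaning
# old: data[["standard_error"]] -> KeyError
# new: "standard_error" not in data.columns, so the imputer is skipped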
7 changes: 6 additions & 1 deletion lightwood/data/infer_types.py
@@ -204,8 +204,13 @@ def get_column_data_type(arg_tup):
)

actual_pct_invalid = 100 * (len(data) - max_known_dtype_count) / len(data)
if max_known_dtype is None or max_known_dtype == dtype.invalid or actual_pct_invalid > pct_invalid:
if max_known_dtype is None or max_known_dtype == dtype.invalid:
curr_dtype = None
elif actual_pct_invalid > pct_invalid:
if max_known_dtype in (dtype.integer, dtype.float) and actual_pct_invalid <= 5 * pct_invalid:
curr_dtype = max_known_dtype
else:
curr_dtype = None
else:
curr_dtype = max_known_dtype

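The rewritten branch relaxes type inference for numeric columns only: a column whose best guess is integer or float keeps that type as long as the observed invalid share stays within five times the allowed pct_invalid; any other over-budget column is still left untyped. A worked example with illustrative numbers:

pct_invalid = 2.0          # caller-allowed percentage of invalid cells
actual_pct_invalid = 7.0   # observed for a column best-guessed as float

# before: 7.0 > 2.0                        -> curr_dtype = None
# after:  numeric guess and 7.0 <= 5 * 2.0 -> curr_dtype = dtype.float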