Merge pull request #874 from mindsdb/staging
Release 22.5.1.0
paxcema authored May 6, 2022
2 parents fe83d62 + 3ddf8ab commit 55131e1
Showing 21 changed files with 220 additions and 108 deletions.
58 changes: 29 additions & 29 deletions docssrc/source/tutorials/custom_cleaner/custom_cleaner.ipynb
@@ -195,7 +195,7 @@
"### 2) Create a JSON-AI default object\n",
"Before we create a custom cleaner object, let's first create JSON-AI syntax for our problem based on its specifications. We can do so by setting up a ``ProblemDefinition``. The ``ProblemDefinition`` allows us to specify the target, the column we intend to predict, along with other details. \n",
"\n",
"The end goal of JSON-AI is to provide **a set of instructions on how to compile a machine learning pipeline*.\n",
"The end goal of JSON-AI is to provide *a set of instructions on how to compile a machine learning pipeline*.\n",
"\n",
"In this case, let's specify our target, the aptly named **target** column. We will also tell JSON-AI to throw away features we never intend to use, such as \"url_legal\", \"license\", and \"standard_error\". We can do so in the following lines:"
]
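The code cell that follows is collapsed in this diff view. For reference, a minimal sketch of what such a cell looks like with lightwood's high-level API (a plausible reconstruction, not the literal cell from this commit; the CSV path is hypothetical):

import pandas as pd
from lightwood.api.high_level import ProblemDefinition, json_ai_from_problem

df = pd.read_csv("train.csv")  # hypothetical path to the tutorial dataset

# Name the target and drop the features we never intend to use.
pdef = ProblemDefinition.from_dict({
    "target": "target",
    "ignore_features": ["url_legal", "license", "standard_error"],
})

json_ai = json_ai_from_problem(df, problem_definition=pdef)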
@@ -217,20 +217,20 @@
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32mINFO:lightwood-1462419:Dropping features: ['url_legal', 'license', 'standard_error']\u001b[0m\n",
"\u001b[32mINFO:lightwood-1462419:Analyzing a sample of 2478\u001b[0m\n",
"\u001b[32mINFO:lightwood-1462419:from a total population of 2834, this is equivalent to 87.4% of your data.\u001b[0m\n",
"\u001b[32mINFO:lightwood-1462419:Using 7 processes to deduct types.\u001b[0m\n",
"\u001b[32mINFO:lightwood-1462419:Infering type for: id\u001b[0m\n",
"\u001b[32mINFO:lightwood-1462419:Infering type for: target\u001b[0m\n",
"\u001b[32mINFO:lightwood-1462419:Infering type for: excerpt\u001b[0m\n",
"\u001b[32mINFO:lightwood-1462419:Column target has data type float\u001b[0m\n",
"\u001b[32mINFO:lightwood-1462419:Doing text detection for column: id\u001b[0m\n",
"\u001b[32mINFO:lightwood-1462419:Doing text detection for column: excerpt\u001b[0m\n",
"\u001b[32mINFO:lightwood-1462419:Column id has data type categorical\u001b[0m\n",
"\u001b[33mWARNING:lightwood-1462419:Column id is an identifier of type \"Hash-like identifier\"\u001b[0m\n",
"\u001b[32mINFO:lightwood-1462419:Starting statistical analysis\u001b[0m\n",
"\u001b[32mINFO:lightwood-1462419:Finished statistical analysis\u001b[0m\n"
"\u001B[32mINFO:lightwood-1462419:Dropping features: ['url_legal', 'license', 'standard_error']\u001B[0m\n",
"\u001B[32mINFO:lightwood-1462419:Analyzing a sample of 2478\u001B[0m\n",
"\u001B[32mINFO:lightwood-1462419:from a total population of 2834, this is equivalent to 87.4% of your data.\u001B[0m\n",
"\u001B[32mINFO:lightwood-1462419:Using 7 processes to deduct types.\u001B[0m\n",
"\u001B[32mINFO:lightwood-1462419:Infering type for: id\u001B[0m\n",
"\u001B[32mINFO:lightwood-1462419:Infering type for: target\u001B[0m\n",
"\u001B[32mINFO:lightwood-1462419:Infering type for: excerpt\u001B[0m\n",
"\u001B[32mINFO:lightwood-1462419:Column target has data type float\u001B[0m\n",
"\u001B[32mINFO:lightwood-1462419:Doing text detection for column: id\u001B[0m\n",
"\u001B[32mINFO:lightwood-1462419:Doing text detection for column: excerpt\u001B[0m\n",
"\u001B[32mINFO:lightwood-1462419:Column id has data type categorical\u001B[0m\n",
"\u001B[33mWARNING:lightwood-1462419:Column id is an identifier of type \"Hash-like identifier\"\u001B[0m\n",
"\u001B[32mINFO:lightwood-1462419:Starting statistical analysis\u001B[0m\n",
"\u001B[32mINFO:lightwood-1462419:Finished statistical analysis\u001B[0m\n"
]
}
],
@@ -1069,15 +1069,15 @@
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32mINFO:lightwood-1462419:Performing statistical analysis on data\u001b[0m\n",
"\u001b[32mINFO:lightwood-1462419:Starting statistical analysis\u001b[0m\n",
"\u001b[32mINFO:lightwood-1462419:Finished statistical analysis\u001b[0m\n",
"\u001b[37mDEBUG:lightwood-1462419: `analyze_data` runtime: 0.03 seconds\u001b[0m\n",
"\u001b[32mINFO:lightwood-1462419:Cleaning the data\u001b[0m\n",
"\u001b[32mINFO:lightwood-1462419:Cleaning column =excerpt\u001b[0m\n",
"\u001b[32mINFO:lightwood-1462419:Cleaning column =target\u001b[0m\n",
"\u001b[32mINFO:lightwood-1462419:Converted target into strictly non-negative\u001b[0m\n",
"\u001b[37mDEBUG:lightwood-1462419: `preprocess` runtime: 0.06 seconds\u001b[0m\n"
"\u001B[32mINFO:lightwood-1462419:Performing statistical analysis on data\u001B[0m\n",
"\u001B[32mINFO:lightwood-1462419:Starting statistical analysis\u001B[0m\n",
"\u001B[32mINFO:lightwood-1462419:Finished statistical analysis\u001B[0m\n",
"\u001B[37mDEBUG:lightwood-1462419: `analyze_data` runtime: 0.03 seconds\u001B[0m\n",
"\u001B[32mINFO:lightwood-1462419:Cleaning the data\u001B[0m\n",
"\u001B[32mINFO:lightwood-1462419:Cleaning column =excerpt\u001B[0m\n",
"\u001B[32mINFO:lightwood-1462419:Cleaning column =target\u001B[0m\n",
"\u001B[32mINFO:lightwood-1462419:Converted target into strictly non-negative\u001B[0m\n",
"\u001B[37mDEBUG:lightwood-1462419: `preprocess` runtime: 0.06 seconds\u001B[0m\n"
]
},
{
@@ -1178,8 +1178,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1mOriginal Data\n",
"\u001b[0m\n",
"\u001B[1mOriginal Data\n",
"\u001B[0m\n",
"Excerpt:\n",
" When the young people returned to the ballroom, it presented a decidedly changed appearance. Instead of an interior scene, it was a winter landscape.\n",
"The floor was covered with snow-white canvas, not laid on smoothly, but rumpled over bumps and hillocks, like a real snow field. The numerous palms and evergreens that had decorated the room, were powdered with flour and strewn with tufts of cotton, like snow. Also diamond dust had been lightly sprinkled on them, and glittering crystal icicles hung from the branches.\n",
@@ -1190,10 +1190,10 @@
"\n",
"Target:\n",
" -0.340259125\n",
"\u001b[1m\n",
"\u001B[1m\n",
"\n",
"Cleaned Data\n",
"\u001b[0m\n",
"\u001B[0m\n",
"Excerpt:\n",
" When young people returned ballroom, presented decidedly changed appearance. Instead interior scene, winter landscape. The floor covered snow-white canvas, laid smoothly, rumpled bumps hillocks, like real snow field. The numerous palms evergreens decorated room, powdered flour strewn tufts cotton, like snow. Also diamond dust lightly sprinkled them, glittering crystal icicles hung branches. At end room, wall, hung beautiful bear-skin rug. These rugs prizes, one girls one boys. And game. The girls gathered one end room boys other, one end called North Pole, South Pole. Each player given small flag plant reaching Pole. This would easy matter, traveller obliged wear snowshoes.\n",
"\n",
@@ -1248,4 +1248,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
}
}
2 changes: 1 addition & 1 deletion lightwood/__about__.py
@@ -1,6 +1,6 @@
__title__ = 'lightwood'
__package_name__ = 'lightwood'
__version__ = '22.4.4.0'
__version__ = '22.5.1.0'
__description__ = "Lightwood is a toolkit for automatic machine learning model building"
__email__ = "[email protected]"
__author__ = 'MindsDB Inc'
26 changes: 13 additions & 13 deletions lightwood/analysis/nc/calibrate.py
@@ -124,7 +124,7 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
output['icp']['__mdb_group_keys'] = [x for x in group_info.keys()]

for combination in all_group_combinations:
output['icp'][frozenset(combination)] = deepcopy(icp)
output['icp'][tuple(combination)] = deepcopy(icp)

# calibrate ICP
icp_df = deepcopy(ns.data)
@@ -165,37 +165,37 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
for key, val in zip(group_keys, group):
icp_df = icp_df[icp_df[key] == val]

if icps[frozenset(group)].nc_function.normalizer is not None:
group_normalizer = icps[frozenset(group)].nc_function.normalizer
if icps[tuple(group)].nc_function.normalizer is not None:
group_normalizer = icps[tuple(group)].nc_function.normalizer
norm_input_df = ns.encoded_val_data.data_frame.iloc[icp_df.pop('__mdb_norm_index')]
norm_input = EncodedDs(ns.encoded_val_data.encoders, norm_input_df, ns.target)
norm_cache = group_normalizer(norm_input, args=PredictionArguments())
icp_df[f'__norm_{ns.target}'] = norm_cache

# save relevant predictions in the caches, then calibrate the ICP
pred_cache = icp_df.pop(f'__predicted_{ns.target}').values
icps[frozenset(group)].nc_function.model.prediction_cache = pred_cache
icps[tuple(group)].nc_function.model.prediction_cache = pred_cache
icp_df, y = clean_df(icp_df, ns.target, ns.is_classification, output.get('label_encoders', None))
if icps[frozenset(group)].nc_function.normalizer is not None:
icps[frozenset(group)].nc_function.normalizer.prediction_cache = icp_df.pop(
if icps[tuple(group)].nc_function.normalizer is not None:
icps[tuple(group)].nc_function.normalizer.prediction_cache = icp_df.pop(
f'__norm_{ns.target}').values

icps[frozenset(group)].index = icp_df.columns # important at inference time
icps[frozenset(group)].calibrate(icp_df.values, y)
icps[tuple(group)].index = icp_df.columns # important at inference time
icps[tuple(group)].calibrate(icp_df.values, y)

# save training std() for bounds width selection
if not ns.is_classification:
icp_train_df = ns.data
for key, val in zip(group_keys, group):
icp_train_df = icp_train_df[icp_train_df[key] == val]
y_train = icp_train_df[ns.target].values
output['df_target_stddev'][frozenset(group)] = y_train.std()
output['df_target_stddev'][tuple(group)] = y_train.std()

# get bounds for relevant rows in validation dataset
conf, group_ranges = set_conf_range(
icp_df, icps[frozenset(group)],
icp_df, icps[tuple(group)],
ns.dtype_dict[ns.target],
output, group=frozenset(group),
output, group=tuple(group),
positive_domain=self.positive_domain, significance=self.fixed_significance)
# save group bounds
if not ns.is_classification:
@@ -304,7 +304,7 @@ def explain(self, row_insights: pd.DataFrame, global_insights: Dict[str, object]
group_keys = icps['__mdb_group_keys']

for group in icps['__mdb_groups']:
icp = icps[frozenset(group)]
icp = icps[tuple(group)]

# check ICP has calibration scores
if icp.cal_scores[0].shape[0] > 0:
@@ -328,7 +328,7 @@ def explain(self, row_insights: pd.DataFrame, global_insights: Dict[str, object]
all_confs,
df_target_stddev=ns.analysis['df_target_stddev'],
positive_domain=self.positive_domain,
group=frozenset(group),
group=tuple(group),
fixed_conf=fixed_conf
)

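Why the frozenset-to-tuple switch throughout this file matters: frozenset keys ignore element order and collapse duplicate values, so two distinct group-by combinations could silently share one ICP cache entry. A quick illustration in plain Python (not from the repo):

>>> frozenset(("A", "B")) == frozenset(("B", "A"))
True   # differently ordered combinations collide on one key
>>> frozenset(("A", "A")) == frozenset(("A",))
True   # repeated group values collapse into one element
>>> ("A", "B") == ("B", "A")
False  # tuples keep order and duplicates, so each combination gets its own ICP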
8 changes: 6 additions & 2 deletions lightwood/analysis/nc/util.py
@@ -163,8 +163,12 @@ def get_categorical_conf(raw_confs: np.ndarray):
"""
if len(raw_confs.shape) == 1:
raw_confs = np.expand_dims(raw_confs, axis=0)
second_p = np.sort(raw_confs, axis=1)[:, -2]
confs = np.clip(np.subtract(1, second_p), 0.0001, 0.9999)
if raw_confs.shape[-1] == 1:
# single-class edge case (only happens if predictor sees just one known label at calibration)
confs = np.clip(raw_confs[:, 0], 0.0001, 0.9999)
else:
second_p = np.sort(raw_confs, axis=1)[:, -2]
confs = np.clip(np.subtract(1, second_p), 0.0001, 0.9999)
return confs


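The new branch in get_categorical_conf covers calibration data that contained only one known label: confidence is normally 1 minus the second-highest class probability, and with a single class there is no second probability to subtract. A worked sketch of both branches, with illustrative numbers:

import numpy as np

raw = np.array([[0.7, 0.2, 0.1]])              # usual case: several classes
second_p = np.sort(raw, axis=1)[:, -2]         # second-highest probability: 0.2
confs = np.clip(1 - second_p, 0.0001, 0.9999)  # -> 0.8

raw_single = np.array([[1.0]])                 # edge case: one known class only
confs_single = np.clip(raw_single[:, 0], 0.0001, 0.9999)  # -> 0.9999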
36 changes: 31 additions & 5 deletions lightwood/api/json_ai.py
@@ -210,6 +210,7 @@ def generate_json_ai(

is_target_predicting_encoder = False
is_ts = problem_definition.timeseries_settings.is_timeseries
imputers = []

# Single text column classification
if (
@@ -267,7 +268,7 @@
"args": {
"fit_on_dev": True,
"stop_after": "$problem_definition.seconds_per_mixer",
"n_ts_predictions": "$problem_definition.timeseries_settings.horizon",
"horizon": "$problem_definition.timeseries_settings.horizon",
},
}
]
@@ -280,7 +281,7 @@
"module": "SkTime",
"args": {
"stop_after": "$problem_definition.seconds_per_mixer",
"n_ts_predictions": "$problem_definition.timeseries_settings.horizon",
"horizon": "$problem_definition.timeseries_settings.horizon",
},
}
]
@@ -344,10 +345,22 @@
f"Please specify a custom accuracy function for output type {output_dtype}"
)

# special dispatch for t+1 time series forecasters
if is_ts:
if output_dtype in [dtype.integer, dtype.float]:
accuracy_functions = ["evaluate_num_array_accuracy"]
accuracy_functions = ["evaluate_num_array_accuracy"] # forces this acc fn for t+1 time series forecasters

if output_dtype in (dtype.integer, dtype.float, dtype.num_tsarray):
imputers.append({"module": "NumericalImputer",
"args": {
"value": "'zero'",
"target": f"'{target}'"}}
)
elif output_dtype in [dtype.categorical, dtype.tags, dtype.binary, dtype.cat_tsarray]:
imputers.append({"module": "CategoricalImputer",
"args": {
"value": "'mode'",
"target": f"'{target}'"}}
)

if problem_definition.time_aim is None:
# 5 days
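With these additions, a numeric target named "target" produces a JSON-AI imputer entry roughly like:

{
    "module": "NumericalImputer",
    "args": {"value": "'zero'", "target": "'target'"}
}

while a categorical, binary, or tags target gets a CategoricalImputer with "value": "'mode'". The doubled quoting ("'zero'") appears deliberate: JSON-AI argument values are spliced into generated Python code, so string literals must carry their own quotes.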
@@ -379,6 +392,7 @@
analyzer=None,
explainer=None,
encoders=encoders,
imputers=imputers,
dtype_dict=dtype_dict,
dependency_dict=dependency_dict,
model=model,
@@ -481,6 +495,7 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI:
for i in range(len(mixers)):
if mixers[i]["module"] == "Unit":
pass

elif mixers[i]["module"] == "Neural":
mixers[i]["args"]["target_encoder"] = mixers[i]["args"].get(
"target_encoder", "$encoders[self.target]"
@@ -511,6 +526,7 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI:
"target_encoder", "$encoders[self.target]"
)
mixers[i]["args"]["use_optuna"] = True

elif mixers[i]["module"] == "Regression":
mixers[i]["args"]["target"] = mixers[i]["args"].get("target", "$target")
mixers[i]["args"]["dtype_dict"] = mixers[i]["args"].get(
@@ -519,6 +535,7 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI:
mixers[i]["args"]["target_encoder"] = mixers[i]["args"].get(
"target_encoder", "$encoders[self.target]"
)

elif mixers[i]["module"] == "LightGBMArray":
mixers[i]["args"]["target"] = mixers[i]["args"].get("target", "$target")
mixers[i]["args"]["dtype_dict"] = mixers[i]["args"].get(
@@ -530,17 +547,26 @@
mixers[i]["args"]["target_encoder"] = mixers[i]["args"].get(
"target_encoder", "$encoders[self.target]"
)
elif mixers[i]["module"] == "SkTime":
if "horizon" not in mixers[i]["args"]:
mixers[i]["args"]["horizon"] = "$problem_definition.timeseries_settings.horizon"

elif mixers[i]["module"] in ("SkTime", "ProphetMixer"):
mixers[i]["args"]["target"] = mixers[i]["args"].get("target", "$target")
mixers[i]["args"]["dtype_dict"] = mixers[i]["args"].get(
"dtype_dict", "$dtype_dict"
)
mixers[i]["args"]["ts_analysis"] = mixers[i]["args"].get(
"ts_analysis", "$ts_analysis"
)
if "horizon" not in mixers[i]["args"]:
mixers[i]["args"]["horizon"] = "$problem_definition.timeseries_settings.horizon"

# enforce fit_on_all if this mixer is specified
problem_definition.fit_on_all = True

if "stop_after" not in mixers[i]["args"]:
mixers[i]["args"]["stop_after"] = "$problem_definition.seconds_per_mixer"

json_ai.model["args"]["target"] = json_ai.model["args"].get("target", "$target")
json_ai.model["args"]["data"] = json_ai.model["args"].get("data", "encoded_test_data")
json_ai.model["args"]["mixers"] = json_ai.model["args"].get("mixers", "$mixers")
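Taken together, the _add_implicit_values changes mean a minimal user-supplied forecasting mixer such as {"module": "SkTime"} is expanded to roughly:

{
    "module": "SkTime",
    "args": {
        "target": "$target",
        "dtype_dict": "$dtype_dict",
        "ts_analysis": "$ts_analysis",
        "horizon": "$problem_definition.timeseries_settings.horizon",
        "stop_after": "$problem_definition.seconds_per_mixer"
    }
}

ProphetMixer now receives the same defaults, and requesting either mixer also forces problem_definition.fit_on_all = True so the final model is refit on all available data before forecasting.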
5 changes: 3 additions & 2 deletions lightwood/data/cleaner.py
@@ -58,8 +58,9 @@ def cleaner(
data = clean_timeseries(data, timeseries_settings)

for col, imputer in imputers.items():
cols = [col] + [col for col in imputer.dependencies]
data[col] = imputer.impute(data[cols])
if col in data.columns:
cols = [col] + [col for col in imputer.dependencies]
data[col] = imputer.impute(data[cols])

return data

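The new guard makes the cleaner tolerant of imputers whose target column is absent from the frame, for example one configured for a feature that was dropped earlier via ignore_features. A sketch of the behavioral change (hypothetical setup):

imputers = {"standard_error": imputer}  # column was dropped before cleaning
# old: data[["standard_error"]] -> KeyError
# new: "standard_error" not in data.columns, so the imputer is skipped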
7 changes: 6 additions & 1 deletion lightwood/data/infer_types.py
@@ -204,8 +204,13 @@ def get_column_data_type(arg_tup):
)

actual_pct_invalid = 100 * (len(data) - max_known_dtype_count) / len(data)
if max_known_dtype is None or max_known_dtype == dtype.invalid or actual_pct_invalid > pct_invalid:
if max_known_dtype is None or max_known_dtype == dtype.invalid:
curr_dtype = None
elif actual_pct_invalid > pct_invalid:
if max_known_dtype in (dtype.integer, dtype.float) and actual_pct_invalid <= 5 * pct_invalid:
curr_dtype = max_known_dtype
else:
curr_dtype = None
else:
curr_dtype = max_known_dtype

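The rewritten branch relaxes type inference for numeric columns only: a column whose best guess is integer or float keeps that type as long as the observed invalid share stays within five times the allowed pct_invalid; any other over-budget column is still left untyped. A worked example with illustrative numbers:

pct_invalid = 2.0          # caller-allowed percentage of invalid cells
actual_pct_invalid = 7.0   # observed for a column best-guessed as float

# before: 7.0 > 2.0                        -> curr_dtype = None
# after:  numeric guess and 7.0 <= 5 * 2.0 -> curr_dtype = dtype.float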