Merge pull request #1027 from mindsdb/staging

Release 22.10.4.0

paxcema authored Oct 26, 2022
2 parents ddcf71e + 9e7ab65 · commit 32d20be

Showing 45 changed files with 1,017 additions and 1,467 deletions.

6 changes: 1 addition & 5 deletions .github/workflows/ligthtwood.yml

@@ -13,12 +13,8 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-latest, windows-latest]
+        os: [ubuntu-latest]
         python-version: [3.7,3.8,3.9]
-        exclude:
-          # exclude combination due to #849
-          - os: windows-latest
-            python-version: 3.9
     steps:
       - uses: actions/checkout@v2
       - name: Set up Python ${{ matrix.python-version }}

2 changes: 2 additions & 0 deletions .gitignore

@@ -57,6 +57,7 @@ test.pickle
 AI.json
 AI2.json
 
+# docs
 assert.sh
 docssrc/build
 docssrc/build/*
@@ -67,3 +68,4 @@ docs
 docs/*
 *.zip
 docs/*
+.ipynb_checkpoints

2 changes: 1 addition & 1 deletion README.md

@@ -120,7 +120,7 @@ pip3 install lightwood
 However, we recommend creating a python virtual environment.
 
 #### Setting up a dev environment
-
+- Python version should be in the range >=3.7, < 3.10
 - Clone lightwood
 - `cd lightwood && pip install -r requirements.txt && pip install -r requirements_image.txt`
 - Add it to your python path (e.g. by adding `export PYTHONPATH='/where/you/cloned/lightwood':$PYTHONPATH` as a newline at the end of your `~/.bashrc` file)

1,363 changes: 25 additions & 1,338 deletions docssrc/source/tutorials/custom_explainer/custom_explainer.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion lightwood/__about__.py

@@ -1,6 +1,6 @@
 __title__ = 'lightwood'
 __package_name__ = 'lightwood'
-__version__ = '22.9.1.0'
+__version__ = '22.10.4.0'
 __description__ = "Lightwood is a toolkit for automatic machine learning model building"
 __email__ = "[email protected]"
 __author__ = 'MindsDB Inc'

7 changes: 5 additions & 2 deletions lightwood/analysis/analyze.py

@@ -1,6 +1,7 @@
 from typing import Dict, List, Tuple, Optional
 
 from lightwood.helpers.log import log
+from lightwood.helpers.ts import filter_ds
 from lightwood.api import dtype
 from lightwood.ensemble import BaseEnsemble
 from lightwood.analysis.base import BaseAnalysisBlock
@@ -53,8 +54,10 @@ def model_analyzer(
 
     # raw predictions for validation dataset
     args = {} if not is_classification else {"predict_proba": True}
+    filtered_df = filter_ds(encoded_val_data, tss)
+    encoded_val_data = EncodedDs(encoded_val_data.encoders, filtered_df, encoded_val_data.target)
     normal_predictions = predictor(encoded_val_data, args=PredictionArguments.from_dict(args))
-    normal_predictions = normal_predictions.set_index(data.index)
+    normal_predictions = normal_predictions.set_index(encoded_val_data.data_frame.index)
 
     # ------------------------- #
     # Run analysis blocks, both core and user-defined
@@ -65,7 +68,7 @@ def model_analyzer(
         'input_cols': input_cols,
         'dtype_dict': dtype_dict,
         'normal_predictions': normal_predictions,
-        'data': data,
+        'data': filtered_df,
         'train_data': train_data,
         'encoded_val_data': encoded_val_data,
         'is_classification': is_classification,

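Note: the `filter_ds` helper imported above lives in `lightwood/helpers/ts.py` and its body is not part of this diff. Judging by how it is called, it trims the validation frame (e.g. dropping rows without enough historical context in time series tasks) so that `normal_predictions`, the rebuilt `EncodedDs`, and the `'data'` entry handed to the analysis blocks all share one index. A minimal sketch of such a helper, with the filtering rule and the flag column as stand-in assumptions:

    import pandas as pd

    def filter_ds(ds, tss):
        # Sketch only: the real implementation is in lightwood/helpers/ts.py.
        # Non-timeseries tasks pass through unchanged.
        df = ds.data_frame
        if tss.is_timeseries and not tss.eval_incomplete:
            # Hypothetical flag column marking rows with incomplete history/target.
            if '__mdb_incomplete' in df.columns:
                df = df[~df['__mdb_incomplete']]
        return df
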
21 changes: 11 additions & 10 deletions lightwood/analysis/helpers/conf_stats.py

@@ -27,10 +27,10 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
             possible_labels = ns.stats_info.train_observed_classes
             self.ordenc.fit([[label] for label in possible_labels])
             task_type = 'categorical'
-        elif ns.is_numerical:
-            task_type = 'numerical'
         elif ns.is_multi_ts:
             task_type = 'multi_ts'
+        elif ns.is_numerical:
+            task_type = 'numerical'
         else:
             return info
 
@@ -66,14 +66,15 @@ def _get_stats(self, confs, preds, data, target, task_type='categorical'):
         if task_type == 'categorical':
             sorted_inp['__mdb_prediction'] = sorted_preds['prediction']
         else:
-            sorted_inp['__mdb_lower'] = confs['lower']
-            sorted_inp['__mdb_upper'] = confs['upper']
-            if task_type == 'numerical':
-                sorted_inp['__mdb_hits'] = (sorted_inp['__mdb_lower'] <= sorted_inp[target]) & \
-                                           (sorted_inp[target] <= sorted_inp['__mdb_upper'])
-            elif task_type == 'multi_ts':
-                sorted_inp['__mdb_hits'] = (sorted_inp['__mdb_lower'][0] <= sorted_inp[target]) & \
-                                           (sorted_inp[target] <= sorted_inp['__mdb_upper'][0])
+            if isinstance(confs['lower'][0], list):
+                sorted_inp['__mdb_lower'] = confs['lower'].apply(lambda x: x[0])
+                sorted_inp['__mdb_upper'] = confs['upper'].apply(lambda x: x[0])
+            else:
+                sorted_inp['__mdb_lower'] = confs['lower']
+                sorted_inp['__mdb_upper'] = confs['upper']
+
+            sorted_inp['__mdb_hits'] = (sorted_inp['__mdb_lower'] <= sorted_inp[target]) & \
+                                       (sorted_inp[target] <= sorted_inp['__mdb_upper'])
 
         size = round(len(sorted_inp) / self.ece_bins)
         bins = []

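Note: the rewritten `_get_stats` branch no longer dispatches on `task_type` for the bounds. It checks whether the confidence bounds arrive as per-row lists (the multi-horizon case) and, if so, collapses them to the first forecast step before computing hits. A toy illustration with invented values:

    import pandas as pd

    confs = pd.DataFrame({'lower': [[1.0, 0.8], [2.0, 1.5]],
                          'upper': [[3.0, 3.5], [4.0, 4.2]]})
    target = pd.Series([2.5, 3.9])

    if isinstance(confs['lower'][0], list):
        lower = confs['lower'].apply(lambda x: x[0])  # first-step bounds: 1.0, 2.0
        upper = confs['upper'].apply(lambda x: x[0])  # first-step bounds: 3.0, 4.0
    hits = (lower <= target) & (target <= upper)      # True, True
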
12 changes: 12 additions & 0 deletions lightwood/analysis/nc/calibrate.py

@@ -41,6 +41,11 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
         data_type = ns.dtype_dict[ns.target]
         output = {'icp': {'__mdb_active': False}}
 
+        if 'confidence' in ns.normal_predictions.columns:
+            # bypass calibrator if model already outputs confidence
+            output['result_df'] = ns.normal_predictions[['confidence', 'lower', 'upper']]
+            return {**info, **output}
+
         fit_params = {'horizon': ns.tss.horizon or 0, 'columns_to_ignore': []}
         fit_params['columns_to_ignore'].extend([f'timestep_{i}' for i in range(1, fit_params['horizon'])])
 
@@ -479,6 +484,13 @@ def _formatted(row_insights, global_insights, ns, is_numerical):
         elif ns.target_dtype in (dtype.short_text, dtype.rich_text, dtype.binary, dtype.categorical):
             row_insights['prediction'] = row_insights['prediction'].astype(str)
 
+        # horizon collapse
+        if ns.tss.is_timeseries and is_numerical and ns.tss.horizon > 1:
+            row_insights['prediction_sum'] = row_insights['prediction'].apply(lambda x: sum(x))
+            row_insights['lower_sum'] = row_insights['lower'].apply(lambda x: min(x))
+            row_insights['upper_sum'] = row_insights['upper'].apply(lambda x: max(x))
+            row_insights['confidence_mean'] = row_insights['confidence'].apply(lambda x: np.mean(x))
+
         return row_insights, global_insights
 
     @staticmethod

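Note: two behavioral additions here. The calibrator now short-circuits when a mixer already emits a `confidence` column, and `_formatted` gains a "horizon collapse" that summarizes multi-step numerical forecasts. Despite the `_sum` suffix, the collapsed bounds take the loosest interval (min of all lower bounds, max of all upper bounds) around the summed prediction. A toy run with invented values:

    import numpy as np
    import pandas as pd

    row_insights = pd.DataFrame({
        'prediction': [[10.0, 12.0, 11.0]],
        'lower':      [[8.0, 9.5, 8.5]],
        'upper':      [[12.0, 14.0, 13.0]],
        'confidence': [[0.90, 0.85, 0.80]],
    })
    row_insights['prediction_sum'] = row_insights['prediction'].apply(sum)       # 33.0
    row_insights['lower_sum'] = row_insights['lower'].apply(min)                 # 8.0
    row_insights['upper_sum'] = row_insights['upper'].apply(max)                 # 14.0
    row_insights['confidence_mean'] = row_insights['confidence'].apply(np.mean)  # 0.85
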
29 changes: 25 additions & 4 deletions lightwood/api/json_ai.py

@@ -11,6 +11,7 @@
 )
 import inspect
 from lightwood.helpers.log import log
+from lightwood.__about__ import __version__ as lightwood_version
 
 
 # For custom modules, we create a module loader with necessary imports below
@@ -39,6 +40,7 @@
 from lightwood.ensemble import *
 from lightwood.helpers.device import *
 from lightwood.helpers.general import *
+from lightwood.helpers.ts import *
 from lightwood.helpers.log import *
 from lightwood.helpers.numeric import *
 from lightwood.helpers.imputers import *
@@ -278,6 +280,13 @@ def generate_json_ai(
                     "stop_after": "$problem_definition.seconds_per_mixer",
                 },
             },
+            {
+                "module": "RandomForest",
+                "args": {
+                    "stop_after": "$problem_definition.seconds_per_mixer",
+                    "fit_on_dev": True,
+                },
+            },
         ]
     )
 elif tss.is_timeseries and tss.horizon > 1:
@@ -372,8 +381,10 @@ def generate_json_ai(
         accuracy_functions = ["r2_score"]
     elif output_dtype in [dtype.categorical, dtype.tags, dtype.binary]:
         accuracy_functions = ["balanced_accuracy_score"]
-    elif output_dtype in (dtype.num_array, dtype.num_tsarray):
-        accuracy_functions = ["bounded_ts_accuracy"]
+    elif output_dtype in (dtype.num_tsarray, ):
+        accuracy_functions = ["complementary_smape_array_accuracy"]
+    elif output_dtype in (dtype.num_array, ):
+        accuracy_functions = ["evaluate_num_array_accuracy"]
    elif output_dtype in (dtype.cat_array, dtype.cat_tsarray):
         accuracy_functions = ["evaluate_cat_array_accuracy"]
     else:
@@ -383,7 +394,8 @@ def generate_json_ai(
 
     if is_ts:
         if output_dtype in [dtype.integer, dtype.float]:
-            accuracy_functions = ["bounded_ts_accuracy"]  # forces this acc fn for t+1 time series forecasters  # noqa
+            # forces this acc fn for t+1 time series forecasters
+            accuracy_functions = ["complementary_smape_array_accuracy"]
 
     if output_dtype in (dtype.integer, dtype.float, dtype.num_tsarray):
         imputers.append({"module": "NumericalImputer",
@@ -590,6 +602,12 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI:
                 "target_encoder", "$encoders[self.target]"
             )
 
+        elif mixers[i]["module"] == "RandomForest":
+            mixers[i]["args"]["target_encoder"] = mixers[i]["args"].get(
+                "target_encoder", "$encoders[self.target]"
+            )
+            mixers[i]["args"]["use_optuna"] = True
+
         elif mixers[i]["module"] == "LightGBMArray":
             mixers[i]["args"]["input_cols"] = mixers[i]["args"].get(
                 "input_cols", "$input_cols"
@@ -602,7 +620,7 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI:
             mixers[i]["args"]["fit_on_dev"] = mixers[i]["args"].get("fit_on_dev", "True")
             mixers[i]["args"]["use_stl"] = mixers[i]["args"].get("use_stl", "False")
 
-        elif mixers[i]["module"] == "NHitsMixer":
+        elif mixers[i]["module"] in ("NHitsMixer", "GluonTSMixer"):
             mixers[i]["args"]["horizon"] = "$problem_definition.timeseries_settings.horizon"
             mixers[i]["args"]["window"] = "$problem_definition.timeseries_settings.window"
             mixers[i]["args"]["ts_analysis"] = mixers[i]["args"].get(
@@ -980,6 +998,8 @@ def code_from_json_ai(json_ai: JsonAI) -> str:
 encoded_train_data = enc_data['train']
 encoded_dev_data = enc_data['dev']
 encoded_test_data = enc_data['test']
+filtered_df = filter_ds(encoded_test_data, self.problem_definition.timeseries_settings)
+encoded_test_data = EncodedDs(encoded_test_data.encoders, filtered_df, encoded_test_data.target)
 log.info('Training the mixers')
@@ -1186,6 +1206,7 @@ def __init__(self):
 self.accuracy_functions = {json_ai.accuracy_functions}
 self.identifiers = {json_ai.identifiers}
 self.dtype_dict = {inline_dict(dtype_dict)}
+self.lightwood_version = '{lightwood_version}'
 # Any feature-column dependencies
 self.dependencies = {inline_dict(json_ai.dependency_dict)}

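Note: putting the two `RandomForest` changes together, `generate_json_ai` now proposes the mixer with explicit `stop_after` and `fit_on_dev` arguments, and `_add_implicit_values` fills in the target encoder and forces Optuna-based hyperparameter search. The resulting mixer entry in the generated JSON-AI should therefore look roughly like this (layout inferred from the diff above):

    mixer_entry = {
        "module": "RandomForest",
        "args": {
            "stop_after": "$problem_definition.seconds_per_mixer",
            "fit_on_dev": True,
            # filled in by _add_implicit_values:
            "target_encoder": "$encoders[self.target]",
            "use_optuna": True,
        },
    }
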
8 changes: 5 additions & 3 deletions lightwood/api/types.py

@@ -62,6 +62,7 @@ class StatisticalAnalysis:
     in the information.
     :param nr_rows: Number of rows (samples) in the dataset
+    :param nr_columns: Number of columns (features) in the dataset
     :param df_target_stddev: The standard deviation of the target of the dataset
     :param train_observed_classes:
     :param target_class_distribution:
@@ -77,6 +78,7 @@
     """ # noqa
 
     nr_rows: int
+    nr_columns: int
     df_target_stddev: Optional[float]
     train_observed_classes: object  # Union[None, List[str]]
     target_class_distribution: object  # Dict[str, float]
@@ -123,7 +125,7 @@ class TimeseriesSettings:
     :param target_type: Automatically inferred dtype of the target (e.g. `dtype.integer`, `dtype.float`).
     :param use_previous_target: Use the previous values of the target column to generate predictions. Defaults to True.
     :param allow_incomplete_history: whether predictions can be made for rows with incomplete historical context (i.e. less than `window` rows have been observed for the datetime that has to be forecasted).
-    :param eval_cold_start: whether to include predictions with incomplete history (thus part of the cold start region for certain mixers) when evaluating mixer scores with the validation dataset.
+    :param eval_incomplete: whether to consider predictions with incomplete history or target information when evaluating mixer accuracy with the validation dataset.
     :param interval_periods: tuple of tuples with user-provided period lengths for time intervals. Default values will be added for intervals left unspecified. For interval options, check the `timeseries_analyzer.detect_period()` method documentation. e.g.: (('daily', 7),).
     """ # noqa
 
@@ -139,7 +141,7 @@ class TimeseriesSettings:
         # @TODO: George: No, I don't think it is, we need to pass this some other way
     )
     allow_incomplete_history: bool = True
-    eval_cold_start: bool = True
+    eval_incomplete: bool = False
     interval_periods: tuple = tuple()
 
     @staticmethod
@@ -170,7 +172,7 @@ def from_dict(obj: Dict):
             historical_columns=[],
             horizon=obj.get("horizon", 1),
             allow_incomplete_history=obj.get('allow_incomplete_history', True),
-            eval_cold_start=obj.get('eval_cold_start', True),
+            eval_incomplete=obj.get('eval_incomplete', False),
             interval_periods=obj.get('interval_periods', tuple(tuple()))
         )
         for setting in obj:

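Note: the default flips along with the rename. `eval_cold_start` defaulted to True, while `eval_incomplete` defaults to False, so incomplete-history rows are now excluded from mixer evaluation unless explicitly requested. Based on the `from_dict` shown above, opting back in would look like this sketch (column name invented):

    tss = TimeseriesSettings.from_dict({
        'order_by': 'saledate',     # hypothetical datetime column
        'window': 10,
        'horizon': 3,
        'eval_incomplete': True,    # score cold-start predictions too
    })
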
2 changes: 2 additions & 0 deletions lightwood/data/statistical_analysis.py

@@ -99,6 +99,7 @@ def statistical_analysis(data: pd.DataFrame,
     except Exception:
         order_format = None
 
+    nr_columns = len(data.columns)
     df = cleaner(data, dtypes, problem_definition.pct_invalid,
                  identifiers, problem_definition.target, 'train', tss,
                  problem_definition.anomaly_detection)
@@ -204,6 +205,7 @@ def statistical_analysis(data: pd.DataFrame,
     log.info('Finished statistical analysis')
     return StatisticalAnalysis(
         nr_rows=nr_rows,
+        nr_columns=nr_columns,
         df_target_stddev=df_std,
         train_observed_classes=train_observed_classes,
         target_class_distribution=target_class_distribution,

2 changes: 1 addition & 1 deletion lightwood/data/timeseries_analyzer.py

@@ -150,7 +150,7 @@ def _pick_ST(tr_subset: pd.Series, dev_subset: pd.Series, sp: list):
     """ # noqa
 
     def _ST_objective(trial: optuna.Trial):
-        trend_degree = trial.suggest_categorical("trend_degree", [1, 2])
+        trend_degree = trial.suggest_categorical("trend_degree", [1])
         ds_sp = trial.suggest_categorical("ds_sp", sp)  # seasonality period to use in deseasonalizer
         if min(min(tr_subset), min(dev_subset)) <= 0:
             decomp_type = trial.suggest_categorical("decomp_type", ['additive'])

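Note: shrinking the `trend_degree` search space to a single choice effectively pins the detrender to linear trends while leaving the Optuna plumbing untouched: `suggest_categorical` with one option always returns it, and the parameter still appears in the study results. A self-contained sketch of that behavior:

    import optuna

    def objective(trial):
        # With a single choice, this always returns 1.
        trend_degree = trial.suggest_categorical("trend_degree", [1])
        return float(trend_degree)

    study = optuna.create_study()
    study.optimize(objective, n_trials=3)
    print(study.best_params)  # {'trend_degree': 1}
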
7 changes: 5 additions & 2 deletions lightwood/encoder/categorical/autoencoder.py

@@ -30,13 +30,15 @@ def __init__(
             max_encoded_length: int = 100,
             desired_error: float = 0.01,
             batch_size: int = 200,
+            device: str = '',
     ):
         """
         :param stop_after: Stops training with provided time limit (sec)
         :param is_target: Encoder represents target class (NOT recommended)
         :param max_encoded_length: Maximum length of vector represented
         :param desired_error: Threshold for reconstruction accuracy error
         :param batch_size: Minimum batch size while training
+        :param device: Name of the device that get_device_from_name will attempt to use
         """ # noqa
         super().__init__(is_target)
         self.is_prepared = False
@@ -48,6 +50,7 @@ def __init__(
         self.encoder = None
         self.decoder = None
         self.onehot_encoder = OneHotEncoder(is_target=self.is_target)
+        self.device_type = device
 
         # Training details
         self.batch_size = batch_size
@@ -148,7 +151,7 @@ def _prepare_AE_input(
 
         # Prepare a one-hot encoder for CatAE inputs
        self.onehot_encoder.prepare(priming_data)
-        self.batch_size = min(self.batch_size, int(len(priming_data) / 50))
+        self.batch_size = max(min(self.batch_size, int(len(priming_data) / 50)), 1)
 
         train_loader = DataLoader(
             list(zip(priming_data, priming_data)),
@@ -170,7 +173,7 @@ def _prepare_catae(self, train_loader: DataLoader, dev_loader: DataLoader):
         """ # noqa
         input_len = self.onehot_encoder.output_size
 
-        self.net = DefaultNet(shape=[input_len, self.output_size, input_len])
+        self.net = DefaultNet(shape=[input_len, self.output_size, input_len], device=self.device_type)
 
         criterion = torch.nn.CrossEntropyLoss()
         optimizer = Ranger(self.net.parameters())

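Note: the `max(..., 1)` clamp on `batch_size` guards small priming datasets. With fewer than 50 rows, `int(len(priming_data) / 50)` evaluates to 0, and `torch.utils.data.DataLoader` rejects `batch_size=0` with a ValueError. A worked example of the fix:

    batch_size, n_rows = 200, 30
    old = min(batch_size, int(n_rows / 50))            # min(200, 0) -> 0: DataLoader would raise
    new = max(min(batch_size, int(n_rows / 50)), 1)    # -> 1: smallest valid batch size
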
9 changes: 5 additions & 4 deletions lightwood/encoder/image/helpers/img_to_vec.py

@@ -1,6 +1,6 @@
 import torch
 import torch.nn as nn
-from lightwood.helpers.device import get_devices
+from lightwood.helpers.device import get_device_from_name
 from lightwood.helpers.torch import LightwoodAutocast
 
 from lightwood.helpers.log import log
@@ -33,16 +33,17 @@ class Img2Vec(nn.Module):
     Output is a `self.output_size`-dimensioned vector, generated by taking the output of the Resnext's last convolutional layer and performing an adaptive channel pool average.
     """ # noqa
-    def __init__(self):
+    def __init__(self, device=''):
         super(Img2Vec, self).__init__()
 
-        self.device, _ = get_devices()
+        self.device = get_device_from_name(device)
 
         self.output_size = 512
         self.model = torch.nn.Sequential(*list(models.resnext50_32x4d(pretrained=True).children())[: -1],
                                          ChannelPoolAdaptiveAvg1d(output_size=self.output_size))
         self.model = self.model.to(self.device)
 
-    def to(self, device, available_devices):
+    def to(self, device, available_devices=1):
         self.device = device
         self.model = self.model.to(self.device)
         return self

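Note: `get_device_from_name` replaces the auto-only `get_devices()` so callers can pin a device by name; the empty-string default presumably falls back to auto-detection. The real helper lives in `lightwood/helpers/device.py` and is not part of this diff; a minimal sketch under that assumption:

    import torch

    def get_device_from_name(device_name: str = '') -> torch.device:
        # Sketch only: honor an explicit name ('cpu', 'cuda:0', ...),
        # otherwise pick CUDA when available.
        if device_name != '':
            return torch.device(device_name)
        return torch.device('cuda' if torch.cuda.is_available() else 'cpu')
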
(Diffs for the remaining changed files were not loaded.)