Skip to content

Commit

Permalink
refactor: organized into inference engine folders. Still WIP, but thi…
Browse files Browse the repository at this point in the history
…s runs (is close to the ideal end state) and does not break existing code.
  • Loading branch information
paxcema committed Dec 21, 2023
1 parent 42419a8 commit 84bbefa
Show file tree
Hide file tree
Showing 14 changed files with 410 additions and 390 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "type_infer"
version = "0.0.17"
version = "0.0.18"
description = "Automated type inference for Machine Learning pipelines."
authors = ["MindsDB Inc. <[email protected]>"]
license = "GPL-3.0"
Expand Down
2 changes: 1 addition & 1 deletion tests/integration_tests/test_type_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from datetime import datetime, timedelta

from type_infer.dtype import dtype
from type_infer.infer import infer_types
from type_infer.api import infer_types


class TestTypeInference(unittest.TestCase):
Expand Down
2 changes: 1 addition & 1 deletion tests/unit_tests/test_dates.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import unittest

from type_infer.dtype import dtype
from type_infer.infer import type_check_date
from type_infer.rule_based.infer import type_check_date


class TestDates(unittest.TestCase):
Expand Down
2 changes: 1 addition & 1 deletion tests/unit_tests/test_infer_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import random

import pandas as pd
from type_infer.infer import get_column_data_type
from type_infer.api import get_column_data_type
from type_infer.dtype import dtype


Expand Down
2 changes: 1 addition & 1 deletion tests/unit_tests/test_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from pathlib import Path

import type_infer
from type_infer.helpers import tokenize_text
from type_infer.rule_based.helpers import tokenize_text


class TestDates(unittest.TestCase):
Expand Down
6 changes: 3 additions & 3 deletions type_infer/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from type_infer import base
from type_infer import dtype
from type_infer import infer
from type_infer import api
from type_infer import helpers


__version__ = '0.0.17'
__version__ = '0.0.18'


__all__ = ['base', 'dtype', 'infer', 'helpers', '__version__']
__all__ = ['base', 'dtype', 'api.py', 'helpers', '__version__']
175 changes: 175 additions & 0 deletions type_infer/api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
import random
import multiprocessing as mp

from scipy.stats import norm
import pandas as pd

from type_infer.base import TypeInformation
from type_infer.dtype import dtype
from type_infer.helpers import seed, log, get_nr_procs

# inference engine specific imports
from type_infer.rule_based.infer import get_column_data_type
from type_infer.rule_based.helpers import get_identifier_description_mp


def _calculate_sample_size(
population_size,
margin_error=.01,
confidence_level=.995,
sigma=1 / 2
):
"""
Calculate the minimal sample size to use to achieve a certain
margin of error and confidence level for a sample estimate
of the population mean.
Inputs
-------
population_size: integer
Total size of the population that the sample is to be drawn from.
margin_error: number
Maximum expected difference between the true population parameter,
such as the mean, and the sample estimate.
confidence_level: number in the interval (0, 1)
If we were to draw a large number of equal-size samples
from the population, the true population parameter
should lie within this percentage
of the intervals (sample_parameter - e, sample_parameter + e)
where e is the margin_error.
sigma: number
The standard deviation of the population. For the case
of estimating a parameter in the interval [0, 1], sigma=1/2
should be sufficient.
"""
alpha = 1 - confidence_level
# dictionary of confidence levels and corresponding z-scores
# computed via norm.ppf(1 - (alpha/2)), where norm is
# a normal distribution object in scipy.stats.
# Here, ppf is the percentile point function.
zdict = {
.90: 1.645,
.91: 1.695,
.99: 2.576,
.97: 2.17,
.94: 1.881,
.93: 1.812,
.95: 1.96,
.98: 2.326,
.96: 2.054,
.92: 1.751
}
if confidence_level in zdict:
z = zdict[confidence_level]
else:
# Inf fix
if alpha == 0.0:
alpha += 0.001
z = norm.ppf(1 - (alpha / 2))
N = population_size
M = margin_error
numerator = z**2 * sigma**2 * (N / (N - 1))
denom = M**2 + ((z**2 * sigma**2) / (N - 1))
return numerator / denom


def _sample_data(df: pd.DataFrame) -> pd.DataFrame:
population_size = len(df)
if population_size <= 50:
sample_size = population_size
else:
sample_size = int(round(_calculate_sample_size(population_size)))

population_size = len(df)
input_data_sample_indexes = random.sample(range(population_size), sample_size)
return df.iloc[input_data_sample_indexes]


def infer_types(
data: pd.DataFrame,
#TODO: method: InferenceEngine = Union[InferenceEngine.RuleBased, InferenceEngine.BERT],
pct_invalid: float,
seed_nr: int = 420,
mp_cutoff: int = 1e4,
) -> TypeInformation:
"""
Infers the data types of each column of the dataset by analyzing a small sample of
each column's items.
Inputs
----------
data : pd.DataFrame
The input dataset for which we want to infer data type information.
pct_invalid : float
The percentage, i.e. a float between 0.0 and 100.0, of invalid values that are
accepted before failing the type inference for a column.
seed_nr : int, optional
Seed for the random number generator, by default 420
mp_cutoff : int, optional
How many elements in the dataframe before switching to parallel processing, by
default 1e4.
"""
seed(seed_nr)
type_information = TypeInformation()
sample_df = _sample_data(data)
sample_size = len(sample_df)
population_size = len(data)
log.info(f'Analyzing a sample of {sample_size}')
log.info(
f'from a total population of {population_size}, this is equivalent to {round(sample_size*100/population_size, 1)}% of your data.') # noqa

nr_procs = get_nr_procs(df=sample_df)
pool_size = min(nr_procs, len(sample_df.columns.values))
if data.size > mp_cutoff and pool_size > 1:
log.info(f'Using {pool_size} processes to deduct types.')
pool = mp.Pool(processes=pool_size)
# column-wise parallelization # TODO: evaluate switching to row-wise split instead
# TODO: this would be the call to the inference engine -> column in, type out
answer_arr = pool.starmap(get_column_data_type, [
(sample_df[x].dropna(), data[x], x, pct_invalid) for x in sample_df.columns.values
])
pool.close()
pool.join()
else:
answer_arr = []
for x in sample_df.columns:
answer_arr.append(get_column_data_type(sample_df[x].dropna(), data, x, pct_invalid))

for i, col_name in enumerate(sample_df.columns):
(data_dtype, data_dtype_dist, additional_info, warn, info) = answer_arr[i]

for msg in warn:
log.warning(msg)
for msg in info:
log.info(msg)

if data_dtype is None:
data_dtype = dtype.invalid

type_information.dtypes[col_name] = data_dtype
type_information.additional_info[col_name] = {
'dtype_dist': data_dtype_dist
}

if data.size > mp_cutoff and pool_size > 1:
pool = mp.Pool(processes=pool_size)
answer_arr = pool.map(get_identifier_description_mp, [
(data[x], x, type_information.dtypes[x])
for x in sample_df.columns
])
pool.close()
pool.join()
else:
answer_arr = []
for x in sample_df.columns:
answer = get_identifier_description_mp([data[x], x, type_information.dtypes[x]])
answer_arr.append(answer)

for i, col_name in enumerate(sample_df.columns):
# work with the full data
if answer_arr[i] is not None:
log.warning(f'Column {col_name} is an identifier of type "{answer_arr[i]}"')
type_information.identifiers[col_name] = answer_arr[i]

# @TODO Column removal logic was here, if the column was an identifier, move it elsewhere

return type_information
Empty file added type_infer/bert/__init__.py
Empty file.
1 change: 1 addition & 0 deletions type_infer/bert/infer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
STABLE = False
2 changes: 2 additions & 0 deletions type_infer/dtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,5 @@ class dtype:
# Misc (Unk/NaNs)
empty = "empty"
invalid = "invalid"

# TODO: introduce "modifiers"?
Loading

0 comments on commit 84bbefa

Please sign in to comment.