refactor: organized into inference engine folders. Still WIP, but this runs (is close to the ideal end state) and does not break existing code.
mindsdb · Dec 21, 2023 · 84bbefa · 84bbefa
1 parent 42419a8
commit 84bbefa
Show file tree

Hide file tree

Showing 14 changed files with 410 additions and 390 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "type_infer"
-version = "0.0.17"
+version = "0.0.18"
 description = "Automated type inference for Machine Learning pipelines."
 authors = ["MindsDB Inc. <[email protected]>"]
 license = "GPL-3.0"

diff --git a/tests/integration_tests/test_type_infer.py b/tests/integration_tests/test_type_infer.py
@@ -5,7 +5,7 @@
 from datetime import datetime, timedelta
 
 from type_infer.dtype import dtype
-from type_infer.infer import infer_types
+from type_infer.api import infer_types
 
 
 class TestTypeInference(unittest.TestCase):

diff --git a/tests/unit_tests/test_dates.py b/tests/unit_tests/test_dates.py
@@ -1,7 +1,7 @@
 import unittest
 
 from type_infer.dtype import dtype
-from type_infer.infer import type_check_date
+from type_infer.rule_based.infer import type_check_date
 
 
 class TestDates(unittest.TestCase):

diff --git a/tests/unit_tests/test_infer_dtypes.py b/tests/unit_tests/test_infer_dtypes.py
@@ -2,7 +2,7 @@
 import random
 
 import pandas as pd
-from type_infer.infer import get_column_data_type
+from type_infer.api import get_column_data_type
 from type_infer.dtype import dtype
 
 

diff --git a/tests/unit_tests/test_misc.py b/tests/unit_tests/test_misc.py
@@ -3,7 +3,7 @@
 from pathlib import Path
 
 import type_infer
-from type_infer.helpers import tokenize_text
+from type_infer.rule_based.helpers import tokenize_text
 
 
 class TestDates(unittest.TestCase):

diff --git a/type_infer/__init__.py b/type_infer/__init__.py
@@ -1,10 +1,10 @@
 from type_infer import base
 from type_infer import dtype
-from type_infer import infer
+from type_infer import api
 from type_infer import helpers
 
 
-__version__ = '0.0.17'
+__version__ = '0.0.18'
 
 
-__all__ = ['base', 'dtype', 'infer', 'helpers', '__version__']
+__all__ = ['base', 'dtype', 'api', 'helpers', '__version__']  # entries must be attribute names of the package, not file names
diff --git a/type_infer/api.py b/type_infer/api.py
@@ -0,0 +1,175 @@
+import random
+import multiprocessing as mp
+
+from scipy.stats import norm
+import pandas as pd
+
+from type_infer.base import TypeInformation
+from type_infer.dtype import dtype
+from type_infer.helpers import seed, log, get_nr_procs
+
+# inference engine specific imports
+from type_infer.rule_based.infer import get_column_data_type
+from type_infer.rule_based.helpers import get_identifier_description_mp
+
+
+def _calculate_sample_size(
+    population_size,
+    margin_error=.01,
+    confidence_level=.995,
+    sigma=1 / 2
+):
+    """
+    Calculate the minimal sample size to use to achieve a certain
+    margin of error and confidence level for a sample estimate
+    of the population mean.
+    Inputs
+    -------
+    population_size: integer
+        Total size of the population that the sample is to be drawn from.
+    margin_error: number
+        Maximum expected difference between the true population parameter,
+        such as the mean, and the sample estimate.
+    confidence_level: number in the interval (0, 1)
+        If we were to draw a large number of equal-size samples
+        from the population, the true population parameter
+        should lie within this percentage
+        of the intervals (sample_parameter - e, sample_parameter + e)
+        where e is the margin_error.
+    sigma: number
+        The standard deviation of the population.  For the case
+        of estimating a parameter in the interval [0, 1], sigma=1/2
+        should be sufficient.
+    Returns
+    -------
+    number
+        The minimal sample size as a floating-point value; it is not
+        rounded here.
+    """
+    # A population of at most one element cannot be sub-sampled, and the
+    # finite-population correction below divides by (N - 1).
+    if population_size <= 1:
+        return float(max(population_size, 0))
+
+    alpha = 1 - confidence_level
+    # dictionary of confidence levels and corresponding z-scores
+    # computed via norm.ppf(1 - (alpha/2)), where norm is
+    # a normal distribution object in scipy.stats.
+    # Here, ppf is the percent point function.
+    zdict = {
+        .90: 1.645,
+        .91: 1.695,
+        .99: 2.576,
+        .97: 2.17,
+        .94: 1.881,
+        .93: 1.812,
+        .95: 1.96,
+        .98: 2.326,
+        .96: 2.054,
+        .92: 1.751
+    }
+    if confidence_level in zdict:
+        z = zdict[confidence_level]
+    else:
+        # Inf fix: norm.ppf(1) is infinite, so nudge alpha away from zero
+        if alpha == 0.0:
+            alpha += 0.001
+        z = norm.ppf(1 - (alpha / 2))
+    N = population_size
+    M = margin_error
+    # the (N - 1) terms apply the finite-population correction
+    numerator = z**2 * sigma**2 * (N / (N - 1))
+    denom = M**2 + ((z**2 * sigma**2) / (N - 1))
+    return numerator / denom
+
+
+def _sample_data(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Draw a random subset of the rows of `df`.
+
+    Frames with at most 50 rows are returned in full (rows in shuffled
+    order); larger frames are reduced to the rounded result of
+    `_calculate_sample_size`. Row positions are picked with `random.sample`,
+    so the outcome depends on the state of the global `random` generator.
+    """
+    population_size = len(df)
+    if population_size <= 50:
+        sample_size = population_size
+    else:
+        # never ask for more rows than exist, `random.sample` would raise
+        sample_size = min(population_size, int(round(_calculate_sample_size(population_size))))
+
+    input_data_sample_indexes = random.sample(range(population_size), sample_size)
+    return df.iloc[input_data_sample_indexes]
+
+
+def infer_types(
+        data: pd.DataFrame,
+        # TODO: method: InferenceEngine = Union[InferenceEngine.RuleBased, InferenceEngine.BERT],
+        pct_invalid: float,
+        seed_nr: int = 420,
+        mp_cutoff: float = 1e4,
+) -> TypeInformation:
+    """
+    Infers the data types of each column of the dataset by analyzing a small sample of
+    each column's items.
+
+    Inputs
+    ----------
+    data : pd.DataFrame
+        The input dataset for which we want to infer data type information.
+    pct_invalid : float
+        The percentage, i.e. a float between 0.0 and 100.0, of invalid values that are
+        accepted before failing the type inference for a column.
+    seed_nr : int, optional
+        Seed for the random number generator, by default 420
+    mp_cutoff : number, optional
+        How many elements in the dataframe before switching to parallel processing, by
+        default 1e4.
+
+    Returns
+    -------
+    TypeInformation
+        Object whose `dtypes`, `additional_info` (key 'dtype_dist') and `identifiers`
+        mappings are filled per column name.
+    """
+    seed(seed_nr)
+    type_information = TypeInformation()
+    sample_df = _sample_data(data)
+    sample_size = len(sample_df)
+    population_size = len(data)
+    # an empty dataset would otherwise raise ZeroDivisionError in the log line
+    sample_pct = round(sample_size * 100 / population_size, 1) if population_size else 0.0
+    log.info(f'Analyzing a sample of {sample_size}')
+    log.info(
+        f'from a total population of {population_size}, this is equivalent to {sample_pct}% of your data.')  # noqa
+
+    nr_procs = get_nr_procs(df=sample_df)
+    pool_size = min(nr_procs, len(sample_df.columns.values))
+    # both passes below take the same sequential/parallel decision
+    use_mp = data.size > mp_cutoff and pool_size > 1
+
+    if use_mp:
+        log.info(f'Using {pool_size} processes to deduct types.')
+        pool = mp.Pool(processes=pool_size)
+        try:
+            # column-wise parallelization  # TODO: evaluate switching to row-wise split instead
+            # TODO: this would be the call to the inference engine -> column in, type out
+            answer_arr = pool.starmap(get_column_data_type, [
+                (sample_df[x].dropna(), data[x], x, pct_invalid) for x in sample_df.columns.values
+            ])
+        finally:
+            # release the worker processes even if a column fails
+            pool.close()
+            pool.join()
+    else:
+        # NOTE(review): this branch passes the whole frame (`data`) as second argument while
+        # the parallel branch passes the single column (`data[x]`) - confirm which one
+        # `get_column_data_type` expects.
+        answer_arr = []
+        for x in sample_df.columns:
+            answer_arr.append(get_column_data_type(sample_df[x].dropna(), data, x, pct_invalid))
+
+    for i, col_name in enumerate(sample_df.columns):
+        (data_dtype, data_dtype_dist, additional_info, warn, info) = answer_arr[i]
+
+        for msg in warn:
+            log.warning(msg)
+        for msg in info:
+            log.info(msg)
+
+        # columns whose type could not be determined are marked as invalid
+        if data_dtype is None:
+            data_dtype = dtype.invalid
+
+        type_information.dtypes[col_name] = data_dtype
+        type_information.additional_info[col_name] = {
+            'dtype_dist': data_dtype_dist
+        }
+
+    # second pass: identifier detection, run on the full columns
+    if use_mp:
+        pool = mp.Pool(processes=pool_size)
+        try:
+            answer_arr = pool.map(get_identifier_description_mp, [
+                (data[x], x, type_information.dtypes[x])
+                for x in sample_df.columns
+            ])
+        finally:
+            pool.close()
+            pool.join()
+    else:
+        answer_arr = []
+        for x in sample_df.columns:
+            answer = get_identifier_description_mp([data[x], x, type_information.dtypes[x]])
+            answer_arr.append(answer)
+
+    for i, col_name in enumerate(sample_df.columns):
+        # work with the full data
+        if answer_arr[i] is not None:
+            log.warning(f'Column {col_name} is an identifier of type "{answer_arr[i]}"')
+            type_information.identifiers[col_name] = answer_arr[i]
+
+        # @TODO Column removal logic was here, if the column was an identifier, move it elsewhere
+
+    return type_information
diff --git a/type_infer/bert/__init__.py b/type_infer/bert/__init__.py
diff --git a/type_infer/bert/infer.py b/type_infer/bert/infer.py
@@ -0,0 +1 @@
+STABLE = False  # NOTE(review): presumably flags the BERT inference engine as not ready for use yet - confirm
diff --git a/type_infer/dtype.py b/type_infer/dtype.py
@@ -45,3 +45,5 @@ class dtype:
     # Misc (Unk/NaNs)
     empty = "empty"
     invalid = "invalid"
+
+# TODO: introduce "modifiers"?