Commit

add classes

paxcema committed Dec 22, 2023
1 parent 2466b8e commit 4b495c4
Showing 10 changed files with 530 additions and 492 deletions.
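
In practice, the change moves callers of infer_types from loose keyword arguments to a single config dict and routes inference through pluggable engine classes. A minimal before/after sketch of caller code, based on the test changes below (the CSV path is a placeholder):

import pandas as pd
from type_infer.api import infer_types

df = pd.read_csv('my_data.csv')  # placeholder dataset

# before this commit:
#   inferred = infer_types(df, pct_invalid=0)
# after this commit:
config = {'engine': 'rule_based', 'pct_invalid': 0, 'seed': 420, 'mp_cutoff': 1e4}
inferred = infer_types(df, config=config)
print(inferred.dtypes)  # TypeInformation.dtypes maps column name -> inferred type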
@@ -8,10 +8,11 @@
 from type_infer.api import infer_types


-class TestTypeInference(unittest.TestCase):
+class TestRuleBasedTypeInference(unittest.TestCase):
     def test_0_airline_sentiment(self):
         df = pd.read_csv("tests/data/airline_sentiment_sample.csv")
-        inferred_types = infer_types(df, pct_invalid=0)
+        config = {'engine': 'rule_based', 'pct_invalid': 0, 'seed': 420, 'mp_cutoff': 1e4}
+        inferred_types = infer_types(df, config=config)

         expected_types = {
             'airline_sentiment': 'categorical',
@@ -44,6 +45,8 @@ def test_0_airline_sentiment(self):

     def test_1_stack_overflow_survey(self):
         df = pd.read_csv("tests/data/stack_overflow_survey_sample.csv")
+        config = {'engine': 'rule_based', 'pct_invalid': 0, 'seed': 420, 'mp_cutoff': 1e4}
+

         expected_types = {
             'Respondent': 'integer',
@@ -68,7 +71,7 @@ def test_1_stack_overflow_survey(self):
             'Professional': 'No Information'
         }

-        inferred_types = infer_types(df, pct_invalid=0)
+        inferred_types = infer_types(df, config=config)

         for col in expected_types:
             self.assertTrue(expected_types[col], inferred_types.dtypes[col])
@@ -90,7 +93,10 @@ def test_2_simple(self):
         # manual tinkering
         df['float'].iloc[-n_corrupted:] = 'random string'

-        inferred_types = infer_types(df, pct_invalid=100 * (n_corrupted) / n_points)
+        pct_invalid = 100 * (n_corrupted) / n_points
+        config = {'engine': 'rule_based', 'pct_invalid': pct_invalid, 'seed': 420, 'mp_cutoff': 1e4}
+
+        inferred_types = infer_types(df, config=config)
         expected_types = {
             'date': dtype.date,
             'datetime': dtype.datetime,
Empty file.
@@ -1,7 +1,9 @@
 import unittest

 from type_infer.dtype import dtype
-from type_infer.rule_based.infer import type_check_date
+from type_infer.rule_based.core import RuleBasedEngine
+
+type_check_date = RuleBasedEngine.type_check_date


 class TestDates(unittest.TestCase):
@@ -2,17 +2,20 @@
 import random

 import pandas as pd
-from type_infer.api import get_column_data_type
+from type_infer.rule_based.core import RuleBasedEngine
 from type_infer.dtype import dtype

+get_column_data_type = RuleBasedEngine.get_column_data_type
+
 class TestInferDtypes(unittest.TestCase):
     def test_negative_integers(self):
         data = pd.DataFrame([-random.randint(-10, 10) for _ in range(100)], columns=['test_col'])
-        dtyp, dist, ainfo, warn, info = get_column_data_type(data['test_col'], data, 'test_col', 0.0)
+        engine = RuleBasedEngine()
+        dtyp, dist, ainfo, warn, info = engine.get_column_data_type(data['test_col'], data, 'test_col', 0.0)
         self.assertEqual(dtyp, dtype.integer)

     def test_negative_floats(self):
         data = pd.DataFrame([-random.randint(-10, 10) for _ in range(100)] + [0.1], columns=['test_col'])
-        dtyp, dist, ainfo, warn, info = get_column_data_type(data['test_col'], data, 'test_col', 0.0)
+        engine = RuleBasedEngine()
+        dtyp, dist, ainfo, warn, info = engine.get_column_data_type(data['test_col'], data, 'test_col', 0.0)
         self.assertEqual(dtyp, dtype.float)
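
Both tests rely on get_column_data_type returning a five-element tuple; judging by the unpacking in the old api.py further down, the elements are the inferred dtype, the distribution of dtype votes, additional info, and lists of warning and info messages. A standalone sketch of the new call pattern (the column values are illustrative):

import pandas as pd
from type_infer.dtype import dtype
from type_infer.rule_based.core import RuleBasedEngine

data = pd.DataFrame({'test_col': [0.5, 1.5, 2.5]})
engine = RuleBasedEngine()  # no config needed for a one-off column check, as in the tests above
dtyp, dist, ainfo, warn, info = engine.get_column_data_type(data['test_col'], data, 'test_col', 0.0)
assert dtyp == dtype.float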
7 changes: 6 additions & 1 deletion type_infer/__init__.py
@@ -3,8 +3,13 @@
 from type_infer import api
 from type_infer import helpers

+from type_infer.api import ENGINES

 __version__ = '0.0.18'


-__all__ = ['base', 'dtype', 'api', 'helpers', '__version__']
+__all__ = [
+    '__version__',
+    'base', 'dtype', 'api', 'helpers',
+    'ENGINES'
+]
165 changes: 11 additions & 154 deletions type_infer/api.py
@@ -1,95 +1,17 @@
-import random
-import multiprocessing as mp
-
-from scipy.stats import norm
 from typing import Dict, Optional
 import pandas as pd

 from type_infer.base import TypeInformation
-from type_infer.dtype import dtype
-from type_infer.helpers import seed, log, get_nr_procs
-
-# inference engine specific imports
-from type_infer.rule_based.infer import get_column_data_type
-from type_infer.rule_based.helpers import get_identifier_description_mp
-
-
-def _calculate_sample_size(
-        population_size,
-        margin_error=.01,
-        confidence_level=.995,
-        sigma=1 / 2
-):
-    """
-    Calculate the minimal sample size to use to achieve a certain
-    margin of error and confidence level for a sample estimate
-    of the population mean.
-    Inputs
-    -------
-    population_size: integer
-        Total size of the population that the sample is to be drawn from.
-    margin_error: number
-        Maximum expected difference between the true population parameter,
-        such as the mean, and the sample estimate.
-    confidence_level: number in the interval (0, 1)
-        If we were to draw a large number of equal-size samples
-        from the population, the true population parameter
-        should lie within this percentage
-        of the intervals (sample_parameter - e, sample_parameter + e)
-        where e is the margin_error.
-    sigma: number
-        The standard deviation of the population. For the case
-        of estimating a parameter in the interval [0, 1], sigma=1/2
-        should be sufficient.
-    """
-    alpha = 1 - confidence_level
-    # dictionary of confidence levels and corresponding z-scores
-    # computed via norm.ppf(1 - (alpha/2)), where norm is
-    # a normal distribution object in scipy.stats.
-    # Here, ppf is the percentile point function.
-    zdict = {
-        .90: 1.645,
-        .91: 1.695,
-        .99: 2.576,
-        .97: 2.17,
-        .94: 1.881,
-        .93: 1.812,
-        .95: 1.96,
-        .98: 2.326,
-        .96: 2.054,
-        .92: 1.751
-    }
-    if confidence_level in zdict:
-        z = zdict[confidence_level]
-    else:
-        # Inf fix
-        if alpha == 0.0:
-            alpha += 0.001
-        z = norm.ppf(1 - (alpha / 2))
-    N = population_size
-    M = margin_error
-    numerator = z**2 * sigma**2 * (N / (N - 1))
-    denom = M**2 + ((z**2 * sigma**2) / (N - 1))
-    return numerator / denom
+from type_infer.rule_based.core import RuleBasedEngine


-def _sample_data(df: pd.DataFrame) -> pd.DataFrame:
-    population_size = len(df)
-    if population_size <= 50:
-        sample_size = population_size
-    else:
-        sample_size = int(round(_calculate_sample_size(population_size)))
-
-    population_size = len(df)
-    input_data_sample_indexes = random.sample(range(population_size), sample_size)
-    return df.iloc[input_data_sample_indexes]
+class ENGINES:
+    RULE_BASED = 'rule_based'


 def infer_types(
     data: pd.DataFrame,
-    # TODO: method: InferenceEngine = Union[InferenceEngine.RuleBased, InferenceEngine.BERT],
-    pct_invalid: float,
-    seed_nr: int = 420,
-    mp_cutoff: int = 1e4,
     config: Optional[Dict] = None
 ) -> TypeInformation:
     """
     Infers the data types of each column of the dataset by analyzing a small sample of
@@ -99,77 +21,12 @@ def infer_types(
     ----------
     data : pd.DataFrame
         The input dataset for which we want to infer data type information.
-    pct_invalid : float
-        The percentage, i.e. a float between 0.0 and 100.0, of invalid values that are
-        accepted before failing the type inference for a column.
-    seed_nr : int, optional
-        Seed for the random number generator, by default 420
-    mp_cutoff : int, optional
-        How many elements in the dataframe before switching to parallel processing, by
-        default 1e4.
     """
-    seed(seed_nr)
-    type_information = TypeInformation()
-    sample_df = _sample_data(data)
-    sample_size = len(sample_df)
-    population_size = len(data)
-    log.info(f'Analyzing a sample of {sample_size}')
-    log.info(
-        f'from a total population of {population_size}, this is equivalent to {round(sample_size*100/population_size, 1)}% of your data.')  # noqa
+    if config is None or 'engine' not in config:
+        config = {'engine': 'rule_based', 'pct_invalid': 2, 'seed': 420, 'mp_cutoff': 1e4}

-    nr_procs = get_nr_procs(df=sample_df)
-    pool_size = min(nr_procs, len(sample_df.columns.values))
-    if data.size > mp_cutoff and pool_size > 1:
-        log.info(f'Using {pool_size} processes to deduct types.')
-        pool = mp.Pool(processes=pool_size)
-        # column-wise parallelization  # TODO: evaluate switching to row-wise split instead
-        # TODO: this would be the call to the inference engine -> column in, type out
-        answer_arr = pool.starmap(get_column_data_type, [
-            (sample_df[x].dropna(), data[x], x, pct_invalid) for x in sample_df.columns.values
-        ])
-        pool.close()
-        pool.join()
+    if config['engine'] == ENGINES.RULE_BASED:
+        engine = RuleBasedEngine(config)
+        return engine.infer(data)
     else:
-        answer_arr = []
-        for x in sample_df.columns:
-            answer_arr.append(get_column_data_type(sample_df[x].dropna(), data, x, pct_invalid))
-
-    for i, col_name in enumerate(sample_df.columns):
-        (data_dtype, data_dtype_dist, additional_info, warn, info) = answer_arr[i]
-
-        for msg in warn:
-            log.warning(msg)
-        for msg in info:
-            log.info(msg)
-
-        if data_dtype is None:
-            data_dtype = dtype.invalid
-
-        type_information.dtypes[col_name] = data_dtype
-        type_information.additional_info[col_name] = {
-            'dtype_dist': data_dtype_dist
-        }
-
-    if data.size > mp_cutoff and pool_size > 1:
-        pool = mp.Pool(processes=pool_size)
-        answer_arr = pool.map(get_identifier_description_mp, [
-            (data[x], x, type_information.dtypes[x])
-            for x in sample_df.columns
-        ])
-        pool.close()
-        pool.join()
-    else:
-        answer_arr = []
-        for x in sample_df.columns:
-            answer = get_identifier_description_mp([data[x], x, type_information.dtypes[x]])
-            answer_arr.append(answer)
-
-    for i, col_name in enumerate(sample_df.columns):
-        # work with the full data
-        if answer_arr[i] is not None:
-            log.warning(f'Column {col_name} is an identifier of type "{answer_arr[i]}"')
-            type_information.identifiers[col_name] = answer_arr[i]
-
-    # @TODO Column removal logic was here, if the column was an identifier, move it elsewhere
-
-    return type_information
+        raise Exception(f'Unknown engine {config["engine"]}')
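
infer_types is now a thin dispatcher: a missing config, or one without an 'engine' key, falls back to the rule-based defaults hard-coded above (note the default pct_invalid of 2, not 0), and any unrecognized engine name raises. A short sketch of the three paths (the dataframe is illustrative):

import pandas as pd
from type_infer.api import infer_types, ENGINES

df = pd.DataFrame({'a': [1, 2, 3]})

# 1) explicit engine selection via the ENGINES constants
info = infer_types(df, config={'engine': ENGINES.RULE_BASED, 'pct_invalid': 0, 'seed': 420, 'mp_cutoff': 1e4})

# 2) no config at all: silently uses the rule-based defaults
info_default = infer_types(df)

# 3) unknown engine: raises
try:
    infer_types(df, config={'engine': 'bert'})
except Exception as err:
    print(err)  # Unknown engine bert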
9 changes: 9 additions & 0 deletions type_infer/base.py
@@ -24,3 +24,12 @@ def __init__(self):
         self.dtypes = dict()
         self.additional_info = dict()
         self.identifiers = dict()
+
+
+class BaseEngine:
+    def __init__(self, stable=True):
+        self.stable = stable  # whether the engine is stable or not (i.e. experimental)
+
+    def infer(self, df) -> TypeInformation:
+        """Given a dataframe, infer the types of each column and return a TypeInformation object."""
+        raise NotImplementedError
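
BaseEngine fixes the contract that future engines (such as the BERT engine hinted at in the TODO removed from api.py) would implement: construct, then infer(df) -> TypeInformation. A hypothetical toy engine, assuming only the interface shown in this diff:

import pandas as pd
from type_infer.base import BaseEngine, TypeInformation

class ConstantEngine(BaseEngine):
    """Toy engine that labels every column 'categorical'."""

    def __init__(self, config=None):
        super().__init__(stable=False)  # experimental, per the stable flag above
        self.config = config or {}

    def infer(self, df: pd.DataFrame) -> TypeInformation:
        info = TypeInformation()
        for col in df.columns:
            info.dtypes[col] = 'categorical'
            info.additional_info[col] = {'dtype_dist': {}}
        return info

print(ConstantEngine().infer(pd.DataFrame({'x': [1, 2]})).dtypes)  # {'x': 'categorical'}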
74 changes: 74 additions & 0 deletions type_infer/helpers.py
@@ -1,4 +1,6 @@
 import os
+
+import pandas as pd
 import psutil
 import random
 import logging
@@ -7,6 +9,7 @@
 from typing import Iterable

 import numpy as np
+from scipy.stats import norm


 def initialize_log():
@@ -109,3 +112,74 @@ def clean_float(val):
         return float(val)
     except Exception:
         return None
+
+
+def sample_data(df: pd.DataFrame) -> pd.DataFrame:
+    population_size = len(df)
+    if population_size <= 50:
+        sample_size = population_size
+    else:
+        sample_size = int(round(_calculate_sample_size(population_size)))
+
+    population_size = len(df)
+    input_data_sample_indexes = random.sample(range(population_size), sample_size)
+    return df.iloc[input_data_sample_indexes]
+
+
+def _calculate_sample_size(
+        population_size,
+        margin_error=.01,
+        confidence_level=.995,
+        sigma=1 / 2
+):
+    """
+    Calculate the minimal sample size to use to achieve a certain
+    margin of error and confidence level for a sample estimate
+    of the population mean.
+    Inputs
+    -------
+    population_size: integer
+        Total size of the population that the sample is to be drawn from.
+    margin_error: number
+        Maximum expected difference between the true population parameter,
+        such as the mean, and the sample estimate.
+    confidence_level: number in the interval (0, 1)
+        If we were to draw a large number of equal-size samples
+        from the population, the true population parameter
+        should lie within this percentage
+        of the intervals (sample_parameter - e, sample_parameter + e)
+        where e is the margin_error.
+    sigma: number
+        The standard deviation of the population. For the case
+        of estimating a parameter in the interval [0, 1], sigma=1/2
+        should be sufficient.
+    """
+    alpha = 1 - confidence_level
+    # dictionary of confidence levels and corresponding z-scores
+    # computed via norm.ppf(1 - (alpha/2)), where norm is
+    # a normal distribution object in scipy.stats.
+    # Here, ppf is the percentile point function.
+    zdict = {
+        .90: 1.645,
+        .91: 1.695,
+        .99: 2.576,
+        .97: 2.17,
+        .94: 1.881,
+        .93: 1.812,
+        .95: 1.96,
+        .98: 2.326,
+        .96: 2.054,
+        .92: 1.751
+    }
+    if confidence_level in zdict:
+        z = zdict[confidence_level]
+    else:
+        # Inf fix
+        if alpha == 0.0:
+            alpha += 0.001
+        z = norm.ppf(1 - (alpha / 2))
+    N = population_size
+    M = margin_error
+    numerator = z**2 * sigma**2 * (N / (N - 1))
+    denom = M**2 + ((z**2 * sigma**2) / (N - 1))
+    return numerator / denom
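
For reference, the closed form that _calculate_sample_size implements is the usual normal-approximation sample size with a finite-population correction:

n = \frac{z^2 \sigma^2 \cdot N/(N-1)}{M^2 + z^2 \sigma^2/(N-1)}, \qquad z = \Phi^{-1}\!\left(1 - \tfrac{\alpha}{2}\right), \quad \alpha = 1 - \text{confidence level}

With the defaults (M = 0.01, confidence level 0.995, sigma = 1/2), z is about 2.81, so a population of 10,000 rows yields a sample of roughly 6,600, while sample_data short-circuits entirely for populations of 50 rows or fewer.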