Merge branch 'refactor/modules' of github.com:mindsdb/type_infer into refactor/modules
Pedro Fluxa committed Dec 22, 2023
2 parents ae665ed + a523ffa commit b0e626e
Showing 17 changed files with 569 additions and 525 deletions.
@@ -8,10 +8,11 @@
 from type_infer.api import infer_types
 
 
-class TestTypeInference(unittest.TestCase):
+class TestRuleBasedTypeInference(unittest.TestCase):
     def test_0_airline_sentiment(self):
         df = pd.read_csv("tests/data/airline_sentiment_sample.csv")
-        inferred_types = infer_types(df, pct_invalid=0)
+        config = {'engine': 'rule_based', 'pct_invalid': 0, 'seed': 420, 'mp_cutoff': 1e4}
+        inferred_types = infer_types(df, config=config)
 
         expected_types = {
             'airline_sentiment': 'categorical',
@@ -44,6 +45,7 @@ def test_0_airline_sentiment(self):

     def test_1_stack_overflow_survey(self):
         df = pd.read_csv("tests/data/stack_overflow_survey_sample.csv")
+        config = {'engine': 'rule_based', 'pct_invalid': 0, 'seed': 420, 'mp_cutoff': 1e4}
 
         expected_types = {
             'Respondent': 'integer',
@@ -68,7 +70,7 @@ def test_1_stack_overflow_survey(self):
             'Professional': 'No Information'
         }
 
-        inferred_types = infer_types(df, pct_invalid=0)
+        inferred_types = infer_types(df, config=config)
 
         for col in expected_types:
             self.assertTrue(expected_types[col], inferred_types.dtypes[col])
@@ -90,7 +92,10 @@ def test_2_simple(self):
         # manual tinkering
         df['float'].iloc[-n_corrupted:] = 'random string'
 
-        inferred_types = infer_types(df, pct_invalid=100 * (n_corrupted) / n_points)
+        pct_invalid = 100 * (n_corrupted) / n_points
+        config = {'engine': 'rule_based', 'pct_invalid': pct_invalid, 'seed': 420, 'mp_cutoff': 1e4}
+
+        inferred_types = infer_types(df, config=config)
         expected_types = {
             'date': dtype.date,
             'datetime': dtype.datetime,
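The test changes above capture the new calling convention: the old `pct_invalid` keyword argument is folded into a single `config` dict. As a minimal standalone sketch (config keys as exercised in these tests; the toy DataFrame is made up for illustration):

```python
import pandas as pd

from type_infer.api import infer_types

# Hypothetical toy frame, just to exercise the API.
df = pd.DataFrame({'age': [23, 41, 37], 'city': ['Lisbon', 'Utrecht', 'Quito']})

config = {'engine': 'rule_based', 'pct_invalid': 0, 'seed': 420, 'mp_cutoff': 1e4}
type_information = infer_types(df, config=config)

print(type_information.dtypes)  # e.g. {'age': 'integer', 'city': 'categorical'}
```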
Empty file.
@@ -1,7 +1,9 @@
 import unittest
 
 from type_infer.dtype import dtype
-from type_infer.rule_based.infer import type_check_date
+from type_infer.rule_based.core import RuleBasedEngine
 
+type_check_date = RuleBasedEngine.type_check_date
+
 
 class TestDates(unittest.TestCase):
22 changes: 22 additions & 0 deletions tests/unit_tests/rule_based/test_infer_dtypes.py
@@ -0,0 +1,22 @@
+import unittest
+import random
+
+import pandas as pd
+from type_infer.rule_based.core import RuleBasedEngine
+from type_infer.dtype import dtype
+
+get_column_data_type = RuleBasedEngine.get_column_data_type
+
+
+class TestInferDtypes(unittest.TestCase):
+    def test_negative_integers(self):
+        data = pd.DataFrame([-random.randint(-10, 10) for _ in range(100)], columns=['test_col'])
+        engine = RuleBasedEngine()
+        dtyp, dist, ainfo, warn, info = engine.get_column_data_type(data['test_col'], data, 'test_col', 0.0)
+        self.assertEqual(dtyp, dtype.integer)
+
+    def test_negative_floats(self):
+        data = pd.DataFrame([float(-random.randint(-10, 10)) for _ in range(100)] + [0.1], columns=['test_col'])
+        engine = RuleBasedEngine()
+        dtyp, dist, ainfo, warn, info = engine.get_column_data_type(data['test_col'], data, 'test_col', 0.0)
+        self.assertEqual(dtyp, dtype.float)
12 changes: 12 additions & 0 deletions tests/unit_tests/rule_based/test_misc.py
@@ -0,0 +1,12 @@
+import unittest
+
+from type_infer.rule_based.helpers import tokenize_text
+
+
+class TestDates(unittest.TestCase):
+    def test_get_tokens(self):
+        sentences = ['hello, world!', ' !hello! world!!,..#', '#hello!world']
+        for sent in sentences:
+            assert list(tokenize_text(sent)) == ['hello', 'world']
+
+        assert list(tokenize_text("don't wouldn't")) == ['do', 'not', 'would', 'not']
18 changes: 0 additions & 18 deletions tests/unit_tests/test_infer_dtypes.py

This file was deleted.

8 changes: 0 additions & 8 deletions tests/unit_tests/test_misc.py
@@ -3,7 +3,6 @@
 from pathlib import Path
 
 import type_infer
-from type_infer.rule_based.helpers import tokenize_text
 
 
 class TestDates(unittest.TestCase):
@@ -19,10 +18,3 @@ def test_versions_are_in_sync(self):
         package_init_version = type_infer.__version__
 
         self.assertEqual(package_init_version, pyproject_version)
-
-    def test_get_tokens(self):
-        sentences = ['hello, world!', ' !hello! world!!,..#', '#hello!world']
-        for sent in sentences:
-            assert list(tokenize_text(sent)) == ['hello', 'world']
-
-        assert list(tokenize_text("don't wouldn't")) == ['do', 'not', 'would', 'not']
6 changes: 4 additions & 2 deletions type_infer/__init__.py
@@ -3,8 +3,10 @@
 from type_infer import api
 from type_infer import helpers
 
 
 __version__ = '0.0.18'
 
 
-__all__ = ['base', 'dtype', 'api', 'helpers', '__version__']
+__all__ = [
+    '__version__',
+    'base', 'dtype', 'api', 'helpers',
+]
167 changes: 10 additions & 157 deletions type_infer/api.py
@@ -1,95 +1,13 @@
-import random
-import multiprocessing as mp
-
-from scipy.stats import norm
 from typing import Dict, Optional
 import pandas as pd
 
-from type_infer.base import TypeInformation
-from type_infer.dtype import dtype
-from type_infer.helpers import seed, log, get_nr_procs
-
-# inference engine specific imports
-from type_infer.rule_based.infer import get_column_data_type
-from type_infer.rule_based.helpers import get_identifier_description_mp
-
-
-def _calculate_sample_size(
-    population_size,
-    margin_error=.01,
-    confidence_level=.995,
-    sigma=1 / 2
-):
-    """
-    Calculate the minimal sample size to use to achieve a certain
-    margin of error and confidence level for a sample estimate
-    of the population mean.
-    Inputs
-    -------
-    population_size: integer
-        Total size of the population that the sample is to be drawn from.
-    margin_error: number
-        Maximum expected difference between the true population parameter,
-        such as the mean, and the sample estimate.
-    confidence_level: number in the interval (0, 1)
-        If we were to draw a large number of equal-size samples
-        from the population, the true population parameter
-        should lie within this percentage
-        of the intervals (sample_parameter - e, sample_parameter + e)
-        where e is the margin_error.
-    sigma: number
-        The standard deviation of the population. For the case
-        of estimating a parameter in the interval [0, 1], sigma=1/2
-        should be sufficient.
-    """
-    alpha = 1 - confidence_level
-    # dictionary of confidence levels and corresponding z-scores
-    # computed via norm.ppf(1 - (alpha/2)), where norm is
-    # a normal distribution object in scipy.stats.
-    # Here, ppf is the percentile point function.
-    zdict = {
-        .90: 1.645,
-        .91: 1.695,
-        .99: 2.576,
-        .97: 2.17,
-        .94: 1.881,
-        .93: 1.812,
-        .95: 1.96,
-        .98: 2.326,
-        .96: 2.054,
-        .92: 1.751
-    }
-    if confidence_level in zdict:
-        z = zdict[confidence_level]
-    else:
-        # Inf fix
-        if alpha == 0.0:
-            alpha += 0.001
-        z = norm.ppf(1 - (alpha / 2))
-    N = population_size
-    M = margin_error
-    numerator = z**2 * sigma**2 * (N / (N - 1))
-    denom = M**2 + ((z**2 * sigma**2) / (N - 1))
-    return numerator / denom
-
-
-def _sample_data(df: pd.DataFrame) -> pd.DataFrame:
-    population_size = len(df)
-    if population_size <= 50:
-        sample_size = population_size
-    else:
-        sample_size = int(round(_calculate_sample_size(population_size)))
-
-    population_size = len(df)
-    input_data_sample_indexes = random.sample(range(population_size), sample_size)
-    return df.iloc[input_data_sample_indexes]
+from type_infer.base import TypeInformation, ENGINES
+from type_infer.rule_based.core import RuleBasedEngine
 
 
 def infer_types(
     data: pd.DataFrame,
-    # TODO: method: InferenceEngine = Union[InferenceEngine.RuleBased, InferenceEngine.BERT],
-    pct_invalid: float,
-    seed_nr: int = 420,
-    mp_cutoff: int = 1e4,
     config: Optional[Dict] = None
 ) -> TypeInformation:
     """
     Infers the data types of each column of the dataset by analyzing a small sample of
@@ -99,77 +17,12 @@ def infer_types(
     ----------
     data : pd.DataFrame
         The input dataset for which we want to infer data type information.
-    pct_invalid : float
-        The percentage, i.e. a float between 0.0 and 100.0, of invalid values that are
-        accepted before failing the type inference for a column.
-    seed_nr : int, optional
-        Seed for the random number generator, by default 420
-    mp_cutoff : int, optional
-        How many elements in the dataframe before switching to parallel processing, by
-        default 1e4.
     """
-    seed(seed_nr)
-    type_information = TypeInformation()
-    sample_df = _sample_data(data)
-    sample_size = len(sample_df)
-    population_size = len(data)
-    log.info(f'Analyzing a sample of {sample_size}')
-    log.info(
-        f'from a total population of {population_size}, this is equivalent to {round(sample_size*100/population_size, 1)}% of your data.')  # noqa
-
-    nr_procs = get_nr_procs(df=sample_df)
-    pool_size = min(nr_procs, len(sample_df.columns.values))
-    if data.size > mp_cutoff and pool_size > 1:
-        log.info(f'Using {pool_size} processes to deduct types.')
-        pool = mp.Pool(processes=pool_size)
-        # column-wise parallelization  # TODO: evaluate switching to row-wise split instead
-        # TODO: this would be the call to the inference engine -> column in, type out
-        answer_arr = pool.starmap(get_column_data_type, [
-            (sample_df[x].dropna(), data[x], x, pct_invalid) for x in sample_df.columns.values
-        ])
-        pool.close()
-        pool.join()
-    else:
-        answer_arr = []
-        for x in sample_df.columns:
-            answer_arr.append(get_column_data_type(sample_df[x].dropna(), data, x, pct_invalid))
-
-    for i, col_name in enumerate(sample_df.columns):
-        (data_dtype, data_dtype_dist, additional_info, warn, info) = answer_arr[i]
-
-        for msg in warn:
-            log.warning(msg)
-        for msg in info:
-            log.info(msg)
+    if config is None or 'engine' not in config:
+        config = {'engine': 'rule_based', 'pct_invalid': 2, 'seed': 420, 'mp_cutoff': 1e4}
 
-        if data_dtype is None:
-            data_dtype = dtype.invalid
-
-        type_information.dtypes[col_name] = data_dtype
-        type_information.additional_info[col_name] = {
-            'dtype_dist': data_dtype_dist
-        }
-
-    if data.size > mp_cutoff and pool_size > 1:
-        pool = mp.Pool(processes=pool_size)
-        answer_arr = pool.map(get_identifier_description_mp, [
-            (data[x], x, type_information.dtypes[x])
-            for x in sample_df.columns
-        ])
-        pool.close()
-        pool.join()
+    if config['engine'] == ENGINES.RULE_BASED:
+        engine = RuleBasedEngine(config)
+        return engine.infer(data)
     else:
-        answer_arr = []
-        for x in sample_df.columns:
-            answer = get_identifier_description_mp([data[x], x, type_information.dtypes[x]])
-            answer_arr.append(answer)
-
-    for i, col_name in enumerate(sample_df.columns):
-        # work with the full data
-        if answer_arr[i] is not None:
-            log.warning(f'Column {col_name} is an identifier of type "{answer_arr[i]}"')
-            type_information.identifiers[col_name] = answer_arr[i]
-
-    # @TODO Column removal logic was here, if the column was an identifier, move it elsewhere
-
-    return type_information
+        raise Exception(f'Unknown engine {config["engine"]}')
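With this rewrite, `api.py` becomes a thin dispatcher and the statistical sampling helpers are deleted (presumably the engine implementations own sampling now). For reference, the removed `_calculate_sample_size` computed the textbook finite-population sample size; in the code's notation, with $N$ = `population_size`, $M$ = `margin_error`, $\sigma$ the population standard deviation, and $z$ the z-score of the chosen confidence level:

$$
n \;=\; \frac{z^{2}\sigma^{2}\,\frac{N}{N-1}}{M^{2} + \frac{z^{2}\sigma^{2}}{N-1}}
$$

`_sample_data` then drew that many rows uniformly at random, falling back to the full population when it had 50 rows or fewer.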
13 changes: 13 additions & 0 deletions type_infer/base.py
@@ -24,3 +24,16 @@
         self.dtypes = dict()
         self.additional_info = dict()
         self.identifiers = dict()
+
+
+class BaseEngine:
+    def __init__(self, stable=True):
+        self.stable = stable  # whether the engine is stable or not (i.e. experimental)
+
+    def infer(self, df) -> TypeInformation:
+        """Given a dataframe, infer the types of each column and return a TypeInformation object."""
+        raise NotImplementedError
+
+
+class ENGINES:
+    RULE_BASED = 'rule_based'
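`BaseEngine` plus the `ENGINES` registry sketch a pluggable engine interface. As an illustration only (the class below is hypothetical; this commit wires up nothing beyond the rule-based branch in `api.py`), a new engine would subclass `BaseEngine` and implement `infer`:

```python
import pandas as pd

from type_infer.base import BaseEngine, TypeInformation


class ConstantEngine(BaseEngine):
    """Hypothetical toy engine that labels every column 'categorical'."""

    def __init__(self, config=None):
        super().__init__(stable=False)  # experimental, mirroring the BERT stub below
        self.config = config or {}

    def infer(self, df: pd.DataFrame) -> TypeInformation:
        info = TypeInformation()
        for col in df.columns:
            info.dtypes[col] = 'categorical'
            info.additional_info[col] = {'dtype_dist': {'categorical': len(df)}}
        return info
```

Dispatching to such an engine would then come down to adding a matching constant to `ENGINES` and a branch in `infer_types`.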
9 changes: 9 additions & 0 deletions type_infer/bert/core.py
@@ -0,0 +1,9 @@
+from type_infer.base import BaseEngine
+
+
+class BERType(BaseEngine):
+    def __init__(self, stable=False):
+        super().__init__(stable=stable)
+
+    def infer(self, df):
+        raise NotImplementedError
1 change: 0 additions & 1 deletion type_infer/bert/infer.py

This file was deleted.

3 changes: 2 additions & 1 deletion type_infer/dtype.py
@@ -46,4 +46,5 @@
     empty = "empty"
     invalid = "invalid"
 
-    # TODO: introduce "modifiers"?
+
+    # TODO: modifier class + system