Commit

add classes

paxcema committed Dec 22, 2023
1 parent 2466b8e commit 4b495c4
Showing 10 changed files with 530 additions and 492 deletions.
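
In practice, the change moves callers of infer_types from loose keyword arguments to a single config dict and routes inference through pluggable engine classes. A minimal before/after sketch of caller code, based on the test changes below (the CSV path is a placeholder):

import pandas as pd
from type_infer.api import infer_types

df = pd.read_csv('my_data.csv')  # placeholder dataset

# before this commit:
#   inferred = infer_types(df, pct_invalid=0)
# after this commit:
config = {'engine': 'rule_based', 'pct_invalid': 0, 'seed': 420, 'mp_cutoff': 1e4}
inferred = infer_types(df, config=config)
print(inferred.dtypes)  # TypeInformation.dtypes maps column name -> inferred type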
@@ -8,10 +8,11 @@
 from type_infer.api import infer_types


-class TestTypeInference(unittest.TestCase):
+class TestRuleBasedTypeInference(unittest.TestCase):
     def test_0_airline_sentiment(self):
         df = pd.read_csv("tests/data/airline_sentiment_sample.csv")
-        inferred_types = infer_types(df, pct_invalid=0)
+        config = {'engine': 'rule_based', 'pct_invalid': 0, 'seed': 420, 'mp_cutoff': 1e4}
+        inferred_types = infer_types(df, config=config)

         expected_types = {
             'airline_sentiment': 'categorical',
@@ -44,6 +45,8 @@ def test_0_airline_sentiment(self):

     def test_1_stack_overflow_survey(self):
         df = pd.read_csv("tests/data/stack_overflow_survey_sample.csv")
+        config = {'engine': 'rule_based', 'pct_invalid': 0, 'seed': 420, 'mp_cutoff': 1e4}
+

         expected_types = {
             'Respondent': 'integer',
@@ -68,7 +71,7 @@ def test_1_stack_overflow_survey(self):
             'Professional': 'No Information'
         }

-        inferred_types = infer_types(df, pct_invalid=0)
+        inferred_types = infer_types(df, config=config)

         for col in expected_types:
             self.assertTrue(expected_types[col], inferred_types.dtypes[col])
@@ -90,7 +93,10 @@ def test_2_simple(self):
         # manual tinkering
         df['float'].iloc[-n_corrupted:] = 'random string'

-        inferred_types = infer_types(df, pct_invalid=100 * (n_corrupted) / n_points)
+        pct_invalid = 100 * (n_corrupted) / n_points
+        config = {'engine': 'rule_based', 'pct_invalid': pct_invalid, 'seed': 420, 'mp_cutoff': 1e4}
+
+        inferred_types = infer_types(df, config=config)
         expected_types = {
             'date': dtype.date,
             'datetime': dtype.datetime,
Empty file.
@@ -1,7 +1,9 @@
 import unittest

 from type_infer.dtype import dtype
-from type_infer.rule_based.infer import type_check_date
+from type_infer.rule_based.core import RuleBasedEngine
+
+type_check_date = RuleBasedEngine.type_check_date


 class TestDates(unittest.TestCase):
@@ -2,17 +2,20 @@
 import random

 import pandas as pd
-from type_infer.api import get_column_data_type
+from type_infer.rule_based.core import RuleBasedEngine
 from type_infer.dtype import dtype

+get_column_data_type = RuleBasedEngine.get_column_data_type
+
 class TestInferDtypes(unittest.TestCase):
     def test_negative_integers(self):
         data = pd.DataFrame([-random.randint(-10, 10) for _ in range(100)], columns=['test_col'])
-        dtyp, dist, ainfo, warn, info = get_column_data_type(data['test_col'], data, 'test_col', 0.0)
+        engine = RuleBasedEngine()
+        dtyp, dist, ainfo, warn, info = engine.get_column_data_type(data['test_col'], data, 'test_col', 0.0)
         self.assertEqual(dtyp, dtype.integer)

     def test_negative_floats(self):
         data = pd.DataFrame([-random.randint(-10, 10) for _ in range(100)] + [0.1], columns=['test_col'])
-        dtyp, dist, ainfo, warn, info = get_column_data_type(data['test_col'], data, 'test_col', 0.0)
+        engine = RuleBasedEngine()
+        dtyp, dist, ainfo, warn, info = engine.get_column_data_type(data['test_col'], data, 'test_col', 0.0)
         self.assertEqual(dtyp, dtype.float)
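
Both tests rely on get_column_data_type returning a five-element tuple; judging by the unpacking in the old api.py further down, the elements are the inferred dtype, the distribution of dtype votes, additional info, and lists of warning and info messages. A standalone sketch of the new call pattern (the column values are illustrative):

import pandas as pd
from type_infer.dtype import dtype
from type_infer.rule_based.core import RuleBasedEngine

data = pd.DataFrame({'test_col': [0.5, 1.5, 2.5]})
engine = RuleBasedEngine()  # no config needed for a one-off column check, as in the tests above
dtyp, dist, ainfo, warn, info = engine.get_column_data_type(data['test_col'], data, 'test_col', 0.0)
assert dtyp == dtype.float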
7 changes: 6 additions & 1 deletion type_infer/__init__.py
@@ -3,8 +3,13 @@
 from type_infer import api
 from type_infer import helpers

+from type_infer.api import ENGINES

 __version__ = '0.0.18'


-__all__ = ['base', 'dtype', 'api', 'helpers', '__version__']
+__all__ = [
+    '__version__',
+    'base', 'dtype', 'api', 'helpers',
+    'ENGINES'
+]
165 changes: 11 additions & 154 deletions type_infer/api.py
@@ -1,95 +1,17 @@
-import random
-import multiprocessing as mp
-
-from scipy.stats import norm
 from typing import Dict, Optional
 import pandas as pd

 from type_infer.base import TypeInformation
-from type_infer.dtype import dtype
-from type_infer.helpers import seed, log, get_nr_procs
-
-# inference engine specific imports
-from type_infer.rule_based.infer import get_column_data_type
-from type_infer.rule_based.helpers import get_identifier_description_mp
-
-
-def _calculate_sample_size(
-        population_size,
-        margin_error=.01,
-        confidence_level=.995,
-        sigma=1 / 2
-):
-    """
-    Calculate the minimal sample size to use to achieve a certain
-    margin of error and confidence level for a sample estimate
-    of the population mean.
-    Inputs
-    -------
-    population_size: integer
-        Total size of the population that the sample is to be drawn from.
-    margin_error: number
-        Maximum expected difference between the true population parameter,
-        such as the mean, and the sample estimate.
-    confidence_level: number in the interval (0, 1)
-        If we were to draw a large number of equal-size samples
-        from the population, the true population parameter
-        should lie within this percentage
-        of the intervals (sample_parameter - e, sample_parameter + e)
-        where e is the margin_error.
-    sigma: number
-        The standard deviation of the population. For the case
-        of estimating a parameter in the interval [0, 1], sigma=1/2
-        should be sufficient.
-    """
-    alpha = 1 - confidence_level
-    # dictionary of confidence levels and corresponding z-scores
-    # computed via norm.ppf(1 - (alpha/2)), where norm is
-    # a normal distribution object in scipy.stats.
-    # Here, ppf is the percentile point function.
-    zdict = {
-        .90: 1.645,
-        .91: 1.695,
-        .99: 2.576,
-        .97: 2.17,
-        .94: 1.881,
-        .93: 1.812,
-        .95: 1.96,
-        .98: 2.326,
-        .96: 2.054,
-        .92: 1.751
-    }
-    if confidence_level in zdict:
-        z = zdict[confidence_level]
-    else:
-        # Inf fix
-        if alpha == 0.0:
-            alpha += 0.001
-        z = norm.ppf(1 - (alpha / 2))
-    N = population_size
-    M = margin_error
-    numerator = z**2 * sigma**2 * (N / (N - 1))
-    denom = M**2 + ((z**2 * sigma**2) / (N - 1))
-    return numerator / denom
+from type_infer.rule_based.core import RuleBasedEngine


-def _sample_data(df: pd.DataFrame) -> pd.DataFrame:
-    population_size = len(df)
-    if population_size <= 50:
-        sample_size = population_size
-    else:
-        sample_size = int(round(_calculate_sample_size(population_size)))
-
-    population_size = len(df)
-    input_data_sample_indexes = random.sample(range(population_size), sample_size)
-    return df.iloc[input_data_sample_indexes]
+class ENGINES:
+    RULE_BASED = 'rule_based'


 def infer_types(
     data: pd.DataFrame,
-    # TODO: method: InferenceEngine = Union[InferenceEngine.RuleBased, InferenceEngine.BERT],
-    pct_invalid: float,
-    seed_nr: int = 420,
-    mp_cutoff: int = 1e4,
     config: Optional[Dict] = None
 ) -> TypeInformation:
     """
     Infers the data types of each column of the dataset by analyzing a small sample of
@@ -99,77 +21,12 @@ def infer_types(
     ----------
     data : pd.DataFrame
         The input dataset for which we want to infer data type information.
-    pct_invalid : float
-        The percentage, i.e. a float between 0.0 and 100.0, of invalid values that are
-        accepted before failing the type inference for a column.
-    seed_nr : int, optional
-        Seed for the random number generator, by default 420
-    mp_cutoff : int, optional
-        How many elements in the dataframe before switching to parallel processing, by
-        default 1e4.
     """
-    seed(seed_nr)
-    type_information = TypeInformation()
-    sample_df = _sample_data(data)
-    sample_size = len(sample_df)
-    population_size = len(data)
-    log.info(f'Analyzing a sample of {sample_size}')
-    log.info(
-        f'from a total population of {population_size}, this is equivalent to {round(sample_size*100/population_size, 1)}% of your data.')  # noqa
+    if config is None or 'engine' not in config:
+        config = {'engine': 'rule_based', 'pct_invalid': 2, 'seed': 420, 'mp_cutoff': 1e4}

-    nr_procs = get_nr_procs(df=sample_df)
-    pool_size = min(nr_procs, len(sample_df.columns.values))
-    if data.size > mp_cutoff and pool_size > 1:
-        log.info(f'Using {pool_size} processes to deduct types.')
-        pool = mp.Pool(processes=pool_size)
-        # column-wise parallelization  # TODO: evaluate switching to row-wise split instead
-        # TODO: this would be the call to the inference engine -> column in, type out
-        answer_arr = pool.starmap(get_column_data_type, [
-            (sample_df[x].dropna(), data[x], x, pct_invalid) for x in sample_df.columns.values
-        ])
-        pool.close()
-        pool.join()
+    if config['engine'] == ENGINES.RULE_BASED:
+        engine = RuleBasedEngine(config)
+        return engine.infer(data)
     else:
-        answer_arr = []
-        for x in sample_df.columns:
-            answer_arr.append(get_column_data_type(sample_df[x].dropna(), data, x, pct_invalid))
-
-    for i, col_name in enumerate(sample_df.columns):
-        (data_dtype, data_dtype_dist, additional_info, warn, info) = answer_arr[i]
-
-        for msg in warn:
-            log.warning(msg)
-        for msg in info:
-            log.info(msg)
-
-        if data_dtype is None:
-            data_dtype = dtype.invalid
-
-        type_information.dtypes[col_name] = data_dtype
-        type_information.additional_info[col_name] = {
-            'dtype_dist': data_dtype_dist
-        }
-
-    if data.size > mp_cutoff and pool_size > 1:
-        pool = mp.Pool(processes=pool_size)
-        answer_arr = pool.map(get_identifier_description_mp, [
-            (data[x], x, type_information.dtypes[x])
-            for x in sample_df.columns
-        ])
-        pool.close()
-        pool.join()
-    else:
-        answer_arr = []
-        for x in sample_df.columns:
-            answer = get_identifier_description_mp([data[x], x, type_information.dtypes[x]])
-            answer_arr.append(answer)
-
-    for i, col_name in enumerate(sample_df.columns):
-        # work with the full data
-        if answer_arr[i] is not None:
-            log.warning(f'Column {col_name} is an identifier of type "{answer_arr[i]}"')
-            type_information.identifiers[col_name] = answer_arr[i]
-
-    # @TODO Column removal logic was here, if the column was an identifier, move it elsewhere
-
-    return type_information
+        raise Exception(f'Unknown engine {config["engine"]}')
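
infer_types is now a thin dispatcher: a missing config, or one without an 'engine' key, falls back to the rule-based defaults hard-coded above (note the default pct_invalid of 2, not 0), and any unrecognized engine name raises. A short sketch of the three paths (the dataframe is illustrative):

import pandas as pd
from type_infer.api import infer_types, ENGINES

df = pd.DataFrame({'a': [1, 2, 3]})

# 1) explicit engine selection via the ENGINES constants
info = infer_types(df, config={'engine': ENGINES.RULE_BASED, 'pct_invalid': 0, 'seed': 420, 'mp_cutoff': 1e4})

# 2) no config at all: silently uses the rule-based defaults
info_default = infer_types(df)

# 3) unknown engine: raises
try:
    infer_types(df, config={'engine': 'bert'})
except Exception as err:
    print(err)  # Unknown engine bert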
9 changes: 9 additions & 0 deletions type_infer/base.py
@@ -24,3 +24,12 @@ def __init__(self):
         self.dtypes = dict()
         self.additional_info = dict()
         self.identifiers = dict()
+
+
+class BaseEngine:
+    def __init__(self, stable=True):
+        self.stable = stable  # whether the engine is stable or not (i.e. experimental)
+
+    def infer(self, df) -> TypeInformation:
+        """Given a dataframe, infer the types of each column and return a TypeInformation object."""
+        raise NotImplementedError
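
BaseEngine fixes the contract that future engines (such as the BERT engine hinted at in the TODO removed from api.py) would implement: construct, then infer(df) -> TypeInformation. A hypothetical toy engine, assuming only the interface shown in this diff:

import pandas as pd
from type_infer.base import BaseEngine, TypeInformation

class ConstantEngine(BaseEngine):
    """Toy engine that labels every column 'categorical'."""

    def __init__(self, config=None):
        super().__init__(stable=False)  # experimental, per the stable flag above
        self.config = config or {}

    def infer(self, df: pd.DataFrame) -> TypeInformation:
        info = TypeInformation()
        for col in df.columns:
            info.dtypes[col] = 'categorical'
            info.additional_info[col] = {'dtype_dist': {}}
        return info

print(ConstantEngine().infer(pd.DataFrame({'x': [1, 2]})).dtypes)  # {'x': 'categorical'}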
74 changes: 74 additions & 0 deletions type_infer/helpers.py
@@ -1,4 +1,6 @@
 import os
+
+import pandas as pd
 import psutil
 import random
 import logging
@@ -7,6 +9,7 @@
 from typing import Iterable

 import numpy as np
+from scipy.stats import norm


 def initialize_log():
@@ -109,3 +112,74 @@ def clean_float(val):
         return float(val)
     except Exception:
         return None
+
+
+def sample_data(df: pd.DataFrame) -> pd.DataFrame:
+    population_size = len(df)
+    if population_size <= 50:
+        sample_size = population_size
+    else:
+        sample_size = int(round(_calculate_sample_size(population_size)))
+
+    population_size = len(df)
+    input_data_sample_indexes = random.sample(range(population_size), sample_size)
+    return df.iloc[input_data_sample_indexes]
+
+
+def _calculate_sample_size(
+        population_size,
+        margin_error=.01,
+        confidence_level=.995,
+        sigma=1 / 2
+):
+    """
+    Calculate the minimal sample size to use to achieve a certain
+    margin of error and confidence level for a sample estimate
+    of the population mean.
+    Inputs
+    -------
+    population_size: integer
+        Total size of the population that the sample is to be drawn from.
+    margin_error: number
+        Maximum expected difference between the true population parameter,
+        such as the mean, and the sample estimate.
+    confidence_level: number in the interval (0, 1)
+        If we were to draw a large number of equal-size samples
+        from the population, the true population parameter
+        should lie within this percentage
+        of the intervals (sample_parameter - e, sample_parameter + e)
+        where e is the margin_error.
+    sigma: number
+        The standard deviation of the population. For the case
+        of estimating a parameter in the interval [0, 1], sigma=1/2
+        should be sufficient.
+    """
+    alpha = 1 - confidence_level
+    # dictionary of confidence levels and corresponding z-scores
+    # computed via norm.ppf(1 - (alpha/2)), where norm is
+    # a normal distribution object in scipy.stats.
+    # Here, ppf is the percentile point function.
+    zdict = {
+        .90: 1.645,
+        .91: 1.695,
+        .99: 2.576,
+        .97: 2.17,
+        .94: 1.881,
+        .93: 1.812,
+        .95: 1.96,
+        .98: 2.326,
+        .96: 2.054,
+        .92: 1.751
+    }
+    if confidence_level in zdict:
+        z = zdict[confidence_level]
+    else:
+        # Inf fix
+        if alpha == 0.0:
+            alpha += 0.001
+        z = norm.ppf(1 - (alpha / 2))
+    N = population_size
+    M = margin_error
+    numerator = z**2 * sigma**2 * (N / (N - 1))
+    denom = M**2 + ((z**2 * sigma**2) / (N - 1))
+    return numerator / denom
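
For reference, the closed form that _calculate_sample_size implements is the usual normal-approximation sample size with a finite-population correction:

n = \frac{z^2 \sigma^2 \cdot N/(N-1)}{M^2 + z^2 \sigma^2/(N-1)}, \qquad z = \Phi^{-1}\!\left(1 - \tfrac{\alpha}{2}\right), \quad \alpha = 1 - \text{confidence level}

With the defaults (M = 0.01, confidence level 0.995, sigma = 1/2), z is about 2.81, so a population of 10,000 rows yields a sample of roughly 6,600, while sample_data short-circuits entirely for populations of 50 rows or fewer.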