diff --git a/pyproject.toml b/pyproject.toml index e7c758e..12775ef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "type_infer" -version = "0.0.17" +version = "0.0.18" description = "Automated type inference for Machine Learning pipelines." authors = ["MindsDB Inc. "] license = "GPL-3.0" @@ -15,12 +15,17 @@ numpy = "^1.15" pandas = "^2" dataclasses-json = "^0.6.3" colorlog = "^6.5.0" -langid = "^1.1.6" -nltk = "^3" -toml = "^0.10.2" psutil = "^5.9.0" +toml = "^0.10.2" +# rule based deps, part of core +langid = "^1.1.6" +nltk = "^3" [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" + +# TODO: update once this engine is introduced +[tool.poetry.extras] +# bert = ["torch"] diff --git a/tests/integration_tests/test_type_infer.py b/tests/integration_tests/test_rule_based.py similarity index 86% rename from tests/integration_tests/test_type_infer.py rename to tests/integration_tests/test_rule_based.py index 9ea74e1..56d5579 100644 --- a/tests/integration_tests/test_type_infer.py +++ b/tests/integration_tests/test_rule_based.py @@ -5,13 +5,14 @@ from datetime import datetime, timedelta from type_infer.dtype import dtype -from type_infer.infer import infer_types +from type_infer.api import infer_types -class TestTypeInference(unittest.TestCase): +class TestRuleBasedTypeInference(unittest.TestCase): def test_0_airline_sentiment(self): df = pd.read_csv("tests/data/airline_sentiment_sample.csv") - inferred_types = infer_types(df, pct_invalid=0) + config = {'engine': 'rule_based', 'pct_invalid': 0, 'seed': 420, 'mp_cutoff': 1e4} + inferred_types = infer_types(df, config=config) expected_types = { 'airline_sentiment': 'categorical', @@ -44,6 +45,7 @@ def test_0_airline_sentiment(self): def test_1_stack_overflow_survey(self): df = pd.read_csv("tests/data/stack_overflow_survey_sample.csv") + config = {'engine': 'rule_based', 'pct_invalid': 0, 'seed': 420, 'mp_cutoff': 1e4} expected_types = { 'Respondent': 'integer', @@ -68,7 +70,7 @@ def test_1_stack_overflow_survey(self): 'Professional': 'No Information' } - inferred_types = infer_types(df, pct_invalid=0) + inferred_types = infer_types(df, config=config) for col in expected_types: self.assertTrue(expected_types[col], inferred_types.dtypes[col]) @@ -90,7 +92,10 @@ def test_2_simple(self): # manual tinkering df['float'].iloc[-n_corrupted:] = 'random string' - inferred_types = infer_types(df, pct_invalid=100 * (n_corrupted) / n_points) + pct_invalid = 100 * (n_corrupted) / n_points + config = {'engine': 'rule_based', 'pct_invalid': pct_invalid, 'seed': 420, 'mp_cutoff': 1e4} + + inferred_types = infer_types(df, config=config) expected_types = { 'date': dtype.date, 'datetime': dtype.datetime, diff --git a/tests/unit_tests/rule_based/__init__.py b/tests/unit_tests/rule_based/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit_tests/test_dates.py b/tests/unit_tests/rule_based/test_dates.py similarity index 94% rename from tests/unit_tests/test_dates.py rename to tests/unit_tests/rule_based/test_dates.py index 6e8bfe7..35ea0e9 100644 --- a/tests/unit_tests/test_dates.py +++ b/tests/unit_tests/rule_based/test_dates.py @@ -1,7 +1,9 @@ import unittest from type_infer.dtype import dtype -from type_infer.infer import type_check_date +from type_infer.rule_based.core import RuleBasedEngine + +type_check_date = RuleBasedEngine.type_check_date class TestDates(unittest.TestCase): diff --git a/tests/unit_tests/rule_based/test_infer_dtypes.py 
b/tests/unit_tests/rule_based/test_infer_dtypes.py new file mode 100644 index 0000000..ef3f920 --- /dev/null +++ b/tests/unit_tests/rule_based/test_infer_dtypes.py @@ -0,0 +1,22 @@ +import unittest +import random + +import pandas as pd +from type_infer.rule_based.core import RuleBasedEngine +from type_infer.dtype import dtype + +get_column_data_type = RuleBasedEngine.get_column_data_type + + +class TestInferDtypes(unittest.TestCase): + def test_negative_integers(self): + data = pd.DataFrame([-random.randint(-10, 10) for _ in range(100)], columns=['test_col']) + engine = RuleBasedEngine() + dtyp, dist, ainfo, warn, info = engine.get_column_data_type(data['test_col'], data, 'test_col', 0.0) + self.assertEqual(dtyp, dtype.integer) + + def test_negative_floats(self): + data = pd.DataFrame([float(-random.randint(-10, 10)) for _ in range(100)] + [0.1], columns=['test_col']) + engine = RuleBasedEngine() + dtyp, dist, ainfo, warn, info = engine.get_column_data_type(data['test_col'], data, 'test_col', 0.0) + self.assertEqual(dtyp, dtype.float) diff --git a/tests/unit_tests/rule_based/test_misc.py b/tests/unit_tests/rule_based/test_misc.py new file mode 100644 index 0000000..1685645 --- /dev/null +++ b/tests/unit_tests/rule_based/test_misc.py @@ -0,0 +1,12 @@ +import unittest + +from type_infer.rule_based.helpers import tokenize_text + + +class TestDates(unittest.TestCase): + def test_get_tokens(self): + sentences = ['hello, world!', ' !hello! world!!,..#', '#hello!world'] + for sent in sentences: + assert list(tokenize_text(sent)) == ['hello', 'world'] + + assert list(tokenize_text("don't wouldn't")) == ['do', 'not', 'would', 'not'] diff --git a/tests/unit_tests/test_infer_dtypes.py b/tests/unit_tests/test_infer_dtypes.py deleted file mode 100644 index c93c28e..0000000 --- a/tests/unit_tests/test_infer_dtypes.py +++ /dev/null @@ -1,18 +0,0 @@ -import unittest -import random - -import pandas as pd -from type_infer.infer import get_column_data_type -from type_infer.dtype import dtype - - -class TestInferDtypes(unittest.TestCase): - def test_negative_integers(self): - data = pd.DataFrame([-random.randint(-10, 10) for _ in range(100)], columns=['test_col']) - dtyp, dist, ainfo, warn, info = get_column_data_type(data['test_col'], data, 'test_col', 0.0) - self.assertEqual(dtyp, dtype.integer) - - def test_negative_floats(self): - data = pd.DataFrame([-random.randint(-10, 10) for _ in range(100)] + [0.1], columns=['test_col']) - dtyp, dist, ainfo, warn, info = get_column_data_type(data['test_col'], data, 'test_col', 0.0) - self.assertEqual(dtyp, dtype.float) diff --git a/tests/unit_tests/test_misc.py b/tests/unit_tests/test_misc.py index 5c88338..4c597b7 100644 --- a/tests/unit_tests/test_misc.py +++ b/tests/unit_tests/test_misc.py @@ -3,7 +3,6 @@ from pathlib import Path import type_infer -from type_infer.helpers import tokenize_text class TestDates(unittest.TestCase): @@ -19,10 +18,3 @@ def test_versions_are_in_sync(self): package_init_version = type_infer.__version__ self.assertEqual(package_init_version, pyproject_version) - - def test_get_tokens(self): - sentences = ['hello, world!', ' !hello! 
world!!,..#', '#hello!world'] - for sent in sentences: - assert list(tokenize_text(sent)) == ['hello', 'world'] - - assert list(tokenize_text("don't wouldn't")) == ['do', 'not', 'would', 'not'] diff --git a/type_infer/__init__.py b/type_infer/__init__.py index fa649d7..c52ba24 100644 --- a/type_infer/__init__.py +++ b/type_infer/__init__.py @@ -1,10 +1,12 @@ from type_infer import base from type_infer import dtype -from type_infer import infer +from type_infer import api from type_infer import helpers +__version__ = '0.0.18' -__version__ = '0.0.17' - -__all__ = ['base', 'dtype', 'infer', 'helpers', '__version__'] +__all__ = [ + '__version__', + 'base', 'dtype', 'api', 'helpers', +] diff --git a/type_infer/api.py b/type_infer/api.py new file mode 100644 index 0000000..846541a --- /dev/null +++ b/type_infer/api.py @@ -0,0 +1,40 @@ +from typing import Dict, Optional +import pandas as pd + +from type_infer.base import TypeInformation, ENGINES +from type_infer.rule_based.core import RuleBasedEngine + + +def infer_types( + data: pd.DataFrame, + config: Optional[Dict] = None +) -> TypeInformation: + """ + Infers the data types of each column of the dataset by analyzing a small sample of + each column's items. + + Inputs + ---------- + data : pd.DataFrame + The input dataset for which we want to infer data type information. + """ + # Set global defaults if missing + if config is None: + config = {'engine': 'rule_based', 'pct_invalid': 2, 'seed': 420, 'mp_cutoff': 1e4} + elif 'engine' not in config: + config['engine'] = 'rule_based' + + if 'pct_invalid' not in config: + config['pct_invalid'] = 2 + + if 'seed' not in config: + config['seed'] = 420 + + if config['engine'] == ENGINES.RULE_BASED: + if 'mp_cutoff' not in config: + config['mp_cutoff'] = 1e4 + + engine = RuleBasedEngine(config) + return engine.infer(data) + else: + raise Exception(f'Unknown engine {config["engine"]}') diff --git a/type_infer/base.py b/type_infer/base.py index 94fc108..4bba17d 100644 --- a/type_infer/base.py +++ b/type_infer/base.py @@ -24,3 +24,16 @@ def __init__(self): self.dtypes = dict() self.additional_info = dict() self.identifiers = dict() + + +class BaseEngine: + def __init__(self, stable=True): + self.stable = stable # whether the engine is stable or not (i.e. 
experimental) + + def infer(self, df) -> TypeInformation: + """Given a dataframe, infer the types of each column and return a TypeInformation object.""" + raise NotImplementedError + + +class ENGINES: + RULE_BASED = 'rule_based' diff --git a/type_infer/bert/__init__.py b/type_infer/bert/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/type_infer/bert/core.py b/type_infer/bert/core.py new file mode 100644 index 0000000..2e99ab0 --- /dev/null +++ b/type_infer/bert/core.py @@ -0,0 +1,9 @@ +from type_infer.base import BaseEngine + + +class BERType(BaseEngine): + def __init__(self, stable=False): + super().__init__(stable=stable) + + def infer(self, df): + raise NotImplementedError diff --git a/type_infer/dtype.py b/type_infer/dtype.py index 9f05a2e..178d925 100644 --- a/type_infer/dtype.py +++ b/type_infer/dtype.py @@ -45,3 +45,6 @@ class dtype: # Misc (Unk/NaNs) empty = "empty" invalid = "invalid" + + +# TODO: modifier class + system diff --git a/type_infer/helpers.py b/type_infer/helpers.py index 3ae6bc1..4e2752b 100644 --- a/type_infer/helpers.py +++ b/type_infer/helpers.py @@ -1,34 +1,50 @@ import os -import re -import nltk import psutil import random -import string import logging import colorlog import multiprocessing as mp +from typing import Iterable import numpy as np -import scipy.stats as st -from langid.langid import LanguageIdentifier -from langid.langid import model as langid_model +import pandas as pd +from scipy.stats import norm -from typing import Iterable -from collections import Counter, defaultdict -from type_infer.dtype import dtype +def initialize_log(): + pid = os.getpid() + handler = colorlog.StreamHandler() + handler.setFormatter(colorlog.ColoredFormatter()) + + logging.basicConfig(handlers=[handler]) + log = logging.getLogger(f'type_infer-{pid}') + log_level = os.environ.get('TYPE_INFER_LOG', 'DEBUG') + log.setLevel(log_level) + return log + +log = initialize_log() + + +def get_nr_procs(df=None): + if 'MINDSDB_N_WORKERS' in os.environ: + try: + n = int(os.environ['MINDSDB_N_WORKERS']) + except ValueError: + n = 1 + return n + elif os.name == 'nt': + return 1 + else: + available_mem = psutil.virtual_memory().available + if df is not None: + max_per_proc_usage = df.size + else: + max_per_proc_usage = 0.2 * pow(10, 9) # multiplier * 1GB -try: - nltk.data.find('tokenizers/punkt') -except LookupError: - nltk.download('punkt') + proc_count = int(min(mp.cpu_count() - 1, available_mem // max_per_proc_usage)) -try: - from nltk.corpus import stopwords - stopwords.words('english') -except LookupError: - nltk.download('stopwords', quiet=True) + return max(proc_count, 1) def seed(seed_nr: int) -> None: @@ -58,103 +74,6 @@ def is_nan_numeric(value: object) -> bool: return isnan -def initialize_log(): - pid = os.getpid() - - handler = colorlog.StreamHandler() - handler.setFormatter(colorlog.ColoredFormatter()) - - logging.basicConfig(handlers=[handler]) - log = logging.getLogger(f'type_infer-{pid}') - log_level = os.environ.get('TYPE_INFER_LOG', 'DEBUG') - log.setLevel(log_level) - return log - - -log = initialize_log() - - -def get_identifier_description_mp(arg_tup): - data, column_name, data_dtype = arg_tup - return get_identifier_description(data, column_name, data_dtype) - - -def get_identifier_description(data: Iterable, column_name: str, data_dtype: dtype): - data = list(data) - if isinstance(data[0], list): - nr_unique = len(set(tuple(x) for x in data)) - elif isinstance(data[0], dict): - nr_unique = len(set(str(x) for x in data)) - else: - nr_unique = 
len(set(data)) - - if nr_unique == 1: - return 'No Information' - - unique_pct = nr_unique / len(data) - - spaces = [len(str(x).split(' ')) - 1 for x in data] - mean_spaces = np.mean(spaces) if len(spaces) > 0 else 0.0 - - # Detect hash - all_same_length = all(len(str(data[0])) == len(str(x)) for x in data) - uuid_charset = set('0123456789abcdefABCDEF-') - all_uuid_charset = all(set(str(x)).issubset(uuid_charset) for x in data) - is_uuid = all_uuid_charset and all_same_length - - if all_same_length and len(data) == nr_unique and data_dtype not in (dtype.integer, dtype.float): - str_data = [str(x) for x in data] - randomness_per_index = [] - for i, _ in enumerate(str_data[0]): - N = len(set(x[i] for x in str_data)) - S = st.entropy([*Counter(x[i] for x in str_data).values()]) - if S == 0: - randomness_per_index.append(0.0) - else: - randomness_per_index.append(S / np.log(N)) - - mean_randomness = np.mean(randomness_per_index) if len(randomness_per_index) > 0 else 0 - if mean_randomness > 0.95: - return 'Hash-like identifier' - - # Detect foreign key - if data_dtype == dtype.integer: - if _is_foreign_key_name(column_name): - return 'Foreign key' - - if _is_identifier_name(column_name) or data_dtype in (dtype.categorical, dtype.binary): - if unique_pct > 0.98: - if is_uuid: - return 'UUID' - else: - return 'Unknown identifier' - - # Everything is unique and it's too short to be rich text - if data_dtype in (dtype.categorical, dtype.binary, dtype.short_text, dtype.rich_text) and \ - unique_pct > 0.99999 and mean_spaces < 1: - return 'Unknown identifier' - - return None - - -def _is_foreign_key_name(name): - for endings in ['id', 'ID', 'Id']: - for add in ['-', '_', ' ']: - if name.endswith(add + endings): - return True - for endings in ['ID', 'Id']: - if name.endswith(endings): - return True - return False - - -def _is_identifier_name(name): - for keyword in ['account', 'uuid', 'identifier', 'user']: - if keyword in name: - return True - return False - - def cast_string_to_python_type(string): """ Returns None, an integer, float or a string from a string""" if string is None or string == '': @@ -173,7 +92,6 @@ def cast_string_to_python_type(string): return string -# TODO: Should this be here? 
def clean_float(val): if isinstance(val, (int, float)): return float(val) @@ -194,100 +112,72 @@ def clean_float(val): return None -def get_language_dist(data): - lang_dist = defaultdict(lambda: 0) - lang_dist['Unknown'] = 0 - lang_probs_cache = dict() - identifier = LanguageIdentifier.from_modelstring(langid_model, norm_probs=True) - for text in data: - text = str(text) - text = text.translate(str.maketrans('', '', string.punctuation)) - if text not in lang_probs_cache: - try: - lang_probs = identifier.classify(text) - except Exception: - lang_probs = [] - lang_probs_cache[text] = lang_probs - - lang_probs = lang_probs_cache[text] - if len(lang_probs) > 0 and lang_probs[1] > 10 * (1 / len(identifier.nb_classes)): - lang_dist[lang_probs[0]] += 1 - else: - lang_dist['Unknown'] += 1 - - return dict(lang_dist) - - -def analyze_sentences(data): - nr_words = 0 - word_dist = defaultdict(int) - nr_words_dist = defaultdict(int) - stop_words = set(stopwords.words('english')) - for text in map(str, data): - text = text.lower() - text_dist = defaultdict(int) - tokens = tokenize_text(text) - tokens_no_stop = (x for x in tokens if x not in stop_words) - for tok in tokens_no_stop: - text_dist[tok] += 1 - - n_tokens = len(text_dist) - nr_words_dist[n_tokens] += 1 - nr_words += n_tokens - - # merge text_dist into word_dist - for k, v in text_dist.items(): - word_dist[k] += v - - return nr_words, dict(word_dist), dict(nr_words_dist) - - -# @TODO: eventually move these into .helpers.text -def tokenize_text(text): - """ Generator instead of list comprehension for optimal memory usage & runtime """ - return (t.lower() for t in nltk.word_tokenize(decontracted(text)) if contains_alnum(t)) - - -def decontracted(phrase): - # specific - phrase = re.sub(r"won\'t", "will not", phrase) - phrase = re.sub(r"can\'t", "can not", phrase) - - # general - phrase = re.sub(r"n\'t", " not", phrase) - phrase = re.sub(r"\'re", " are", phrase) - phrase = re.sub(r"\'s", " is", phrase) - phrase = re.sub(r"\'d", " would", phrase) - phrase = re.sub(r"\'ll", " will", phrase) - phrase = re.sub(r"\'t", " not", phrase) - phrase = re.sub(r"\'ve", " have", phrase) - phrase = re.sub(r"\'m", " am", phrase) - return phrase - +def sample_data(df: pd.DataFrame) -> pd.DataFrame: + population_size = len(df) + if population_size <= 50: + sample_size = population_size + else: + sample_size = int(round(_calculate_sample_size(population_size))) -def contains_alnum(text): - for c in text: - if c.isalnum(): - return True - return False + population_size = len(df) + input_data_sample_indexes = random.sample(range(population_size), sample_size) + return df.iloc[input_data_sample_indexes] -def get_nr_procs(df=None): - if 'MINDSDB_N_WORKERS' in os.environ: - try: - n = int(os.environ['MINDSDB_N_WORKERS']) - except ValueError: - n = 1 - return n - elif os.name == 'nt': - return 1 +def _calculate_sample_size( + population_size, + margin_error=.01, + confidence_level=.995, + sigma=1 / 2 +): + """ + Calculate the minimal sample size to use to achieve a certain + margin of error and confidence level for a sample estimate + of the population mean. + Inputs + ------- + population_size: integer + Total size of the population that the sample is to be drawn from. + margin_error: number + Maximum expected difference between the true population parameter, + such as the mean, and the sample estimate. 
+ confidence_level: number in the interval (0, 1) + If we were to draw a large number of equal-size samples + from the population, the true population parameter + should lie within this percentage + of the intervals (sample_parameter - e, sample_parameter + e) + where e is the margin_error. + sigma: number + The standard deviation of the population. For the case + of estimating a parameter in the interval [0, 1], sigma=1/2 + should be sufficient. + """ + alpha = 1 - confidence_level + # dictionary of confidence levels and corresponding z-scores + # computed via norm.ppf(1 - (alpha/2)), where norm is + # a normal distribution object in scipy.stats. + # Here, ppf is the percentile point function. + zdict = { + .90: 1.645, + .91: 1.695, + .99: 2.576, + .97: 2.17, + .94: 1.881, + .93: 1.812, + .95: 1.96, + .98: 2.326, + .96: 2.054, + .92: 1.751 + } + if confidence_level in zdict: + z = zdict[confidence_level] else: - available_mem = psutil.virtual_memory().available - if df is not None: - max_per_proc_usage = df.size - else: - max_per_proc_usage = 0.2 * pow(10, 9) # multiplier * 1GB - - proc_count = int(min(mp.cpu_count() - 1, available_mem // max_per_proc_usage)) - - return max(proc_count, 1) + # Inf fix + if alpha == 0.0: + alpha += 0.001 + z = norm.ppf(1 - (alpha / 2)) + N = population_size + M = margin_error + numerator = z**2 * sigma**2 * (N / (N - 1)) + denom = M**2 + ((z**2 * sigma**2) / (N - 1)) + return numerator / denom diff --git a/type_infer/infer.py b/type_infer/infer.py deleted file mode 100644 index dd329f3..0000000 --- a/type_infer/infer.py +++ /dev/null @@ -1,490 +0,0 @@ -import re -import random -import imghdr -import sndhdr -import multiprocessing as mp -from collections import Counter -from typing import List, Union - -from scipy.stats import norm -import pandas as pd -import numpy as np - -from type_infer.base import TypeInformation -from type_infer.dtype import dtype -from type_infer.helpers import seed, log # TODO: move somewhere else? -from type_infer.helpers import get_nr_procs -from type_infer.helpers import is_nan_numeric, get_identifier_description_mp, cast_string_to_python_type, \ - get_language_dist, analyze_sentences - - -# @TODO: hardcode for distance, time, subunits of currency (e.g. cents) and other common units -# @TODO: Add tests with plenty of examples -def get_quantity_col_info(col_data: pd.Series) -> str: - assert isinstance(col_data, pd.Series) - char_const = None - nr_map = set() - for val in col_data: - val = str(val) - char_part = re.sub("[0-9.,]", '', val) - numeric_bit = re.sub("[^0-9.,]", '', val).replace(',', '.') - - if len(char_part) == 0: - char_part = None - - if len(re.sub("[^0-9]", '', numeric_bit)) == 0 or numeric_bit.count('.') > 1: - numeric_bit = None - else: - numeric_bit = float(numeric_bit) - - if numeric_bit is None: - return False, None - else: - nr_map.add(numeric_bit) - - if char_const is None: - char_const = char_part - - if char_part is None or char_part == '-' or char_part != char_const: - return False, None - - if len(nr_map) > 20 and len(nr_map) > len(col_data) / 200: - return True, {char_const: { - 'multiplier': 1 - }} - else: - return False, None - - -def get_binary_type(element: object) -> str: - try: - is_img = imghdr.what(element) - if is_img is not None: - return dtype.image - - # @TODO: currently we don differentiate between audio and video - is_audio = sndhdr.what(element) - # apparently `sndhdr` is really bad.. 
- for audio_ext in ['.wav', '.mp3']: - if element.endswith(audio_ext): - is_audio = True - if is_audio is not None: - return dtype.audio - except Exception: - # Not a file or file doesn't exist - return None - - -def get_numeric_type(element: object) -> str: - """ Returns the subtype inferred from a number string, or False if its not a number""" - string_as_nr = cast_string_to_python_type(str(element)) - - try: - if string_as_nr == int(string_as_nr): - string_as_nr = int(string_as_nr) - except Exception: - pass - - if isinstance(string_as_nr, float): - return dtype.float - elif isinstance(string_as_nr, int): - return dtype.integer - else: - try: - if is_nan_numeric(element): - return dtype.integer - else: - return None - except Exception: - return None - - -def type_check_sequence(element: object) -> str: - dtype_guess = None - - if isinstance(element, List): - all_nr = all([get_numeric_type(ele) for ele in element]) - if all_nr: - dtype_guess = dtype.num_array - else: - dtype_guess = dtype.cat_array - else: - for sep_char in [',', '\t', '|', ' ']: # @TODO: potential bottleneck, cutoff after a while - all_nr = True - if '[' in element: - ele_arr = element.rstrip(']').lstrip('[').split(sep_char) - else: - ele_arr = element.rstrip(')').lstrip('(').split(sep_char) - - for ele in ele_arr: - if not get_numeric_type(ele): - all_nr = False - break - - if len(ele_arr) > 1 and all_nr: - dtype_guess = dtype.num_array - - return dtype_guess - - -def type_check_date(element: object) -> str: - """ - Check if element corresponds to a date-like object. - """ - # check if element represents a date (no hour/minute/seconds) - is_date = False - # check if element represents a datetime (has hour/minute/seconds) - is_datetime = False - # check if it makes sense to convert element to unix time-stamp by - # evaluating if, when converted, the element represents a number that - # is compatible with a Unix timestamp (number of seconds since 1970-01-01T:00:00:00) - # note that we also check the number is not larger than the "epochalypse time", - # which is when the unix timestamp becomes larger than 2^32 - 1 seconds. We do - # this because timestamps outside this range are likely to be unreliable and hence - # rather treated as every-day numbers. - min_dt = pd.to_datetime('1970-01-01 00:00:00', utc=True) - max_dt = pd.to_datetime('2038-01-19 03:14:08', utc=True) - valid_units = {'ns': 'unix', 'us': 'unix', 'ms': 'unix', 's': 'unix', - 'D': 'julian'} - for unit, origin in valid_units.items(): - try: - as_dt = pd.to_datetime(element, unit=unit, origin=origin, - errors='raise') - if min_dt < as_dt < max_dt: - is_datetime = True - break - except Exception: - pass - # check if element represents a date-like object. - # here we don't check for a validity range like with unix-timestamps - # because dates as string usually represent something more general than - # just the number of seconds since an epoch. - try: - as_dt = pd.to_datetime(element, errors='raise') - is_datetime = True - except Exception: - pass - # finally, if element is represents a datetime object, check if only - # date part is contained (no time information) - if is_datetime: - # round element day (drop hour/minute/second) - dt_d = as_dt.to_period('D').to_timestamp() - # if rounded datetime equals the datetime itself, it means there was not - # hour/minute/second information to begin with. Mind the 'localize' to - # avoid time-zone BS to kick in. 
- is_date = dt_d == as_dt.tz_localize(None) - if is_date: - return dtype.date - if is_datetime: - return dtype.datetime - - return None - - -def count_data_types_in_column(data): - dtype_counts = Counter() - - type_checkers = [get_numeric_type, - type_check_sequence, - get_binary_type, - type_check_date] - - for element in data: - for type_checker in type_checkers: - try: - dtype_guess = type_checker(element) - except Exception: - dtype_guess = None - if dtype_guess is not None: - dtype_counts[dtype_guess] += 1 - break - else: - dtype_counts[dtype.invalid] += 1 - - return dtype_counts - - -def get_column_data_type(data: Union[np.ndarray, list], full_data: pd.DataFrame, col_name: str, pct_invalid: float): - """ - Provided the column data, define its data type and data subtype. - - :param data: an iterable containing a sample of the data frame - :param full_data: an iterable containing the whole column of a data frame - - :return: type and type distribution, we can later use type_distribution to determine data quality - NOTE: type distribution is the count that this column has for belonging cells to each DATA_TYPE - """ - log.info(f'Infering type for: {col_name}') - additional_info = {'other_potential_dtypes': []} - - warn = [] - info = [] - if len(data) == 0: - warn.append(f'Column {col_name} has no data in it. ') - warn.append(f'Please remove {col_name} from the training file or fill in some of the values !') - return None, None, additional_info, warn, info - - dtype_counts = count_data_types_in_column(data) - - known_dtype_dist = {k: v for k, v in dtype_counts.items()} - if dtype.float in known_dtype_dist and dtype.integer in known_dtype_dist: - known_dtype_dist[dtype.float] += known_dtype_dist[dtype.integer] - del known_dtype_dist[dtype.integer] - - if dtype.datetime in known_dtype_dist and dtype.date in known_dtype_dist: - known_dtype_dist[dtype.datetime] += known_dtype_dist[dtype.date] - del known_dtype_dist[dtype.date] - - max_known_dtype, max_known_dtype_count = max( - known_dtype_dist.items(), - key=lambda kv: kv[1] - ) - - actual_pct_invalid = 100 * (len(data) - max_known_dtype_count) / len(data) - if max_known_dtype is None or max_known_dtype == dtype.invalid: - curr_dtype = None - elif actual_pct_invalid > pct_invalid: - if max_known_dtype in (dtype.integer, dtype.float) and actual_pct_invalid <= 5 * pct_invalid: - curr_dtype = max_known_dtype - else: - curr_dtype = None - else: - curr_dtype = max_known_dtype - - nr_vals = len(data) - nr_distinct_vals = len(set([str(x) for x in data])) - - # Is it a quantity? 
- if curr_dtype not in (dtype.datetime, dtype.date): - is_quantity, quantitiy_info = get_quantity_col_info(data) - if is_quantity: - additional_info['quantitiy_info'] = quantitiy_info - curr_dtype = dtype.quantity - known_dtype_dist = { - dtype.quantity: nr_vals - } - - # Check for Tags subtype - if curr_dtype not in (dtype.quantity, dtype.num_array): - lengths = [] - unique_tokens = set() - - can_be_tags = False - if all(isinstance(x, str) for x in data): - can_be_tags = True - - mean_lenghts = np.mean(lengths) if len(lengths) > 0 else 0 - - # If more than 30% of the samples contain more than 1 category and there's more than 6 and less than 30 of them and they are shared between the various cells # noqa - if (can_be_tags and mean_lenghts > 1.3 and - 6 <= len(unique_tokens) <= 30 and - len(unique_tokens) / mean_lenghts < (len(data) / 4)): - curr_dtype = dtype.tags - - # Categorical based on unique values - if curr_dtype not in (dtype.date, dtype.datetime, dtype.tags, dtype.cat_array): - if curr_dtype in (dtype.integer, dtype.float): - is_categorical = nr_distinct_vals < 10 - else: - is_categorical = nr_distinct_vals < min(max((nr_vals / 100), 10), 3000) - - if is_categorical: - if curr_dtype is not None: - additional_info['other_potential_dtypes'].append(curr_dtype) - curr_dtype = dtype.categorical - - # If curr_data_type is still None, then it's text or category - if curr_dtype is None: - log.info(f'Doing text detection for column: {col_name}') - lang_dist = get_language_dist(data) # TODO: bottleneck - - # Normalize lang probabilities - for lang in lang_dist: - lang_dist[lang] /= len(data) - - # If most cells are unknown language then it's categorical - if lang_dist['Unknown'] > 0.5: - curr_dtype = dtype.categorical - else: - nr_words, word_dist, nr_words_dist = analyze_sentences(data) # TODO: maybe pass entire corpus at once - - if 1 in nr_words_dist and nr_words_dist[1] == nr_words: - curr_dtype = dtype.categorical - else: - if len(word_dist) > 500 and nr_words / len(data) > 5: - curr_dtype = dtype.rich_text - else: - curr_dtype = dtype.short_text - - return curr_dtype, {curr_dtype: len(data)}, additional_info, warn, info - - if curr_dtype in [dtype.categorical, dtype.rich_text, dtype.short_text, dtype.cat_array]: - known_dtype_dist = {curr_dtype: len(data)} - - if nr_distinct_vals < 3 and curr_dtype == dtype.categorical: - curr_dtype = dtype.binary - known_dtype_dist[dtype.binary] = known_dtype_dist[dtype.categorical] - del known_dtype_dist[dtype.categorical] - - log.info(f'Column {col_name} has data type {curr_dtype}') - return curr_dtype, known_dtype_dist, additional_info, warn, info - - -def calculate_sample_size( - population_size, - margin_error=.01, - confidence_level=.995, - sigma=1 / 2 -): - """ - Calculate the minimal sample size to use to achieve a certain - margin of error and confidence level for a sample estimate - of the population mean. - Inputs - ------- - population_size: integer - Total size of the population that the sample is to be drawn from. - margin_error: number - Maximum expected difference between the true population parameter, - such as the mean, and the sample estimate. - confidence_level: number in the interval (0, 1) - If we were to draw a large number of equal-size samples - from the population, the true population parameter - should lie within this percentage - of the intervals (sample_parameter - e, sample_parameter + e) - where e is the margin_error. - sigma: number - The standard deviation of the population. 
For the case - of estimating a parameter in the interval [0, 1], sigma=1/2 - should be sufficient. - """ - alpha = 1 - (confidence_level) - # dictionary of confidence levels and corresponding z-scores - # computed via norm.ppf(1 - (alpha/2)), where norm is - # a normal distribution object in scipy.stats. - # Here, ppf is the percentile point function. - zdict = { - .90: 1.645, - .91: 1.695, - .99: 2.576, - .97: 2.17, - .94: 1.881, - .93: 1.812, - .95: 1.96, - .98: 2.326, - .96: 2.054, - .92: 1.751 - } - if confidence_level in zdict: - z = zdict[confidence_level] - else: - # Inf fix - if alpha == 0.0: - alpha += 0.001 - z = norm.ppf(1 - (alpha / 2)) - N = population_size - M = margin_error - numerator = z**2 * sigma**2 * (N / (N - 1)) - denom = M**2 + ((z**2 * sigma**2) / (N - 1)) - return numerator / denom - - -def sample_data(df: pd.DataFrame) -> pd.DataFrame: - population_size = len(df) - if population_size <= 50: - sample_size = population_size - else: - sample_size = int(round(calculate_sample_size(population_size))) - - population_size = len(df) - input_data_sample_indexes = random.sample(range(population_size), sample_size) - return df.iloc[input_data_sample_indexes] - - -def infer_types( - data: pd.DataFrame, - pct_invalid: float, - seed_nr: int = 420, - mp_cutoff: int = 1e4, -) -> TypeInformation: - """ - Infers the data types of each column of the dataset by analyzing a small sample of - each column's items. - - Inputs - ---------- - data : pd.DataFrame - The input dataset for which we want to infer data type information. - pct_invalid : float - The percentage, i.e. a float between 0.0 and 100.0, of invalid values that are - accepted before failing the type inference for a column. - seed_nr : int, optional - Seed for the random number generator, by default 420 - mp_cutoff : int, optional - How many elements in the dataframe before switching to parallel processing, by - default 1e4. 
- """ - seed(seed_nr) - type_information = TypeInformation() - sample_df = sample_data(data) - sample_size = len(sample_df) - population_size = len(data) - log.info(f'Analyzing a sample of {sample_size}') - log.info( - f'from a total population of {population_size}, this is equivalent to {round(sample_size*100/population_size, 1)}% of your data.') # noqa - - nr_procs = get_nr_procs(df=sample_df) - pool_size = min(nr_procs, len(sample_df.columns.values)) - if data.size > mp_cutoff and pool_size > 1: - log.info(f'Using {pool_size} processes to deduct types.') - pool = mp.Pool(processes=pool_size) - # column-wise parallelization # TODO: evaluate switching to row-wise split instead - answer_arr = pool.starmap(get_column_data_type, [ - (sample_df[x].dropna(), data[x], x, pct_invalid) for x in sample_df.columns.values - ]) - pool.close() - pool.join() - else: - answer_arr = [] - for x in sample_df.columns: - answer_arr.append(get_column_data_type(sample_df[x].dropna(), data, x, pct_invalid)) - - for i, col_name in enumerate(sample_df.columns): - (data_dtype, data_dtype_dist, additional_info, warn, info) = answer_arr[i] - - for msg in warn: - log.warning(msg) - for msg in info: - log.info(msg) - - if data_dtype is None: - data_dtype = dtype.invalid - - type_information.dtypes[col_name] = data_dtype - type_information.additional_info[col_name] = { - 'dtype_dist': data_dtype_dist - } - - if data.size > mp_cutoff and pool_size > 1: - pool = mp.Pool(processes=pool_size) - answer_arr = pool.map(get_identifier_description_mp, [ - (data[x], x, type_information.dtypes[x]) - for x in sample_df.columns - ]) - pool.close() - pool.join() - else: - answer_arr = [] - for x in sample_df.columns: - answer = get_identifier_description_mp([data[x], x, type_information.dtypes[x]]) - answer_arr.append(answer) - - for i, col_name in enumerate(sample_df.columns): - # work with the full data - if answer_arr[i] is not None: - log.warning(f'Column {col_name} is an identifier of type "{answer_arr[i]}"') - type_information.identifiers[col_name] = answer_arr[i] - - # @TODO Column removal logic was here, if the column was an identifier, move it elsewhere - - return type_information diff --git a/type_infer/rule_based/__init__.py b/type_infer/rule_based/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/type_infer/rule_based/core.py b/type_infer/rule_based/core.py new file mode 100644 index 0000000..43a35a1 --- /dev/null +++ b/type_infer/rule_based/core.py @@ -0,0 +1,409 @@ +import re +import imghdr +import sndhdr +import multiprocessing as mp +from typing import List, Union +from collections import Counter + +import numpy as np +import pandas as pd + +from type_infer.dtype import dtype +from type_infer.base import BaseEngine, TypeInformation +from type_infer.helpers import log, seed, sample_data, get_nr_procs, is_nan_numeric, cast_string_to_python_type +from type_infer.rule_based.helpers import get_language_dist, analyze_sentences, get_identifier_description_mp + + +class RuleBasedEngine(BaseEngine): + def __init__(self, config=None): + """ + :param config: a dictionary containing the configuration for the engine + pct_invalid : float + The percentage, i.e. a float between 0.0 and 100.0, of invalid values that are + accepted before failing the type inference for a column. + seed : int, optional + Seed for the random number generator, by default 420 + mp_cutoff : int, optional + How many elements in the dataframe before switching to parallel processing, by + default 1e4. 
+ """ + super().__init__(stable=True) + self.config = config if config else {'pct_invalid': 2, 'seed': 420, 'mp_cutoff': 1e4} + + def infer(self, data: pd.DataFrame) -> TypeInformation: + seed(self.config['seed']) + type_information = TypeInformation() + sample_df = sample_data(data) + sample_size = len(sample_df) + population_size = len(data) + log.info(f'Analyzing a sample of {sample_size}') + log.info( + f'from a total population of {population_size}, this is equivalent to {round(sample_size * 100 / population_size, 1)}% of your data.') # noqa + + nr_procs = get_nr_procs(df=sample_df) + pool_size = min(nr_procs, len(sample_df.columns.values)) + if data.size > self.config['mp_cutoff'] and pool_size > 1: + log.info(f'Using {pool_size} processes to deduct types.') + pool = mp.Pool(processes=pool_size) + # column-wise parallelization # TODO: evaluate switching to row-wise split instead + answer_arr = pool.starmap(self.get_column_data_type, [ + (sample_df[x].dropna(), data[x], x, self.config['pct_invalid']) for x in sample_df.columns.values + ]) + pool.close() + pool.join() + else: + answer_arr = [] + for x in sample_df.columns: + answer_arr.append(self.get_column_data_type(sample_df[x].dropna(), data, x, self.config['pct_invalid'])) + + for i, col_name in enumerate(sample_df.columns): + (data_dtype, data_dtype_dist, additional_info, warn, info) = answer_arr[i] + + for msg in warn: + log.warning(msg) + for msg in info: + log.info(msg) + + if data_dtype is None: + data_dtype = dtype.invalid + + type_information.dtypes[col_name] = data_dtype + type_information.additional_info[col_name] = { + 'dtype_dist': data_dtype_dist + } + + if data.size > self.config['mp_cutoff'] and pool_size > 1: + pool = mp.Pool(processes=pool_size) + answer_arr = pool.map(get_identifier_description_mp, [ + (data[x], x, type_information.dtypes[x]) + for x in sample_df.columns + ]) + pool.close() + pool.join() + else: + answer_arr = [] + for x in sample_df.columns: + answer = get_identifier_description_mp([data[x], x, type_information.dtypes[x]]) + answer_arr.append(answer) + + for i, col_name in enumerate(sample_df.columns): + # work with the full data + if answer_arr[i] is not None: + log.warning(f'Column {col_name} is an identifier of type "{answer_arr[i]}"') + type_information.identifiers[col_name] = answer_arr[i] + + # @TODO Column removal logic was here, if the column was an identifier, move it elsewhere + return type_information + + # @TODO: hardcode for distance, time, subunits of currency (e.g. 
cents) and other common units + # @TODO: Add tests with plenty of examples + + def get_quantity_col_info(self, col_data: pd.Series) -> str: + assert isinstance(col_data, pd.Series) + char_const = None + nr_map = set() + for val in col_data: + val = str(val) + char_part = re.sub("[0-9.,]", '', val) + numeric_bit = re.sub("[^0-9.,]", '', val).replace(',', '.') + + if len(char_part) == 0: + char_part = None + + if len(re.sub("[^0-9]", '', numeric_bit)) == 0 or numeric_bit.count('.') > 1: + numeric_bit = None + else: + numeric_bit = float(numeric_bit) + + if numeric_bit is None: + return False, None + else: + nr_map.add(numeric_bit) + + if char_const is None: + char_const = char_part + + if char_part is None or char_part == '-' or char_part != char_const: + return False, None + + if len(nr_map) > 20 and len(nr_map) > len(col_data) / 200: + return True, {char_const: { + 'multiplier': 1 + }} + else: + return False, None + + def get_binary_type(self, element: object) -> str: + try: + is_img = imghdr.what(element) + if is_img is not None: + return dtype.image + + # @TODO: currently we don differentiate between audio and video + is_audio = sndhdr.what(element) + # apparently `sndhdr` is really bad.. + for audio_ext in ['.wav', '.mp3']: + if element.endswith(audio_ext): + is_audio = True + if is_audio is not None: + return dtype.audio + except Exception: + # Not a file or file doesn't exist + return None + + def get_numeric_type(self, element: object) -> str: + """ Returns the subtype inferred from a number string, or False if its not a number""" + string_as_nr = cast_string_to_python_type(str(element)) + + try: + if string_as_nr == int(string_as_nr): + string_as_nr = int(string_as_nr) + except Exception: + pass + + if isinstance(string_as_nr, float): + return dtype.float + elif isinstance(string_as_nr, int): + return dtype.integer + else: + try: + if is_nan_numeric(element): + return dtype.integer + else: + return None + except Exception: + return None + + def type_check_sequence(self, element: object) -> str: + dtype_guess = None + + if isinstance(element, List): + all_nr = all([self.get_numeric_type(ele) for ele in element]) + if all_nr: + dtype_guess = dtype.num_array + else: + dtype_guess = dtype.cat_array + else: + for sep_char in [',', '\t', '|', ' ']: # @TODO: potential bottleneck, cutoff after a while + all_nr = True + if '[' in element: + ele_arr = element.rstrip(']').lstrip('[').split(sep_char) + else: + ele_arr = element.rstrip(')').lstrip('(').split(sep_char) + + for ele in ele_arr: + if not self.get_numeric_type(ele): + all_nr = False + break + + if len(ele_arr) > 1 and all_nr: + dtype_guess = dtype.num_array + + return dtype_guess + + @staticmethod + def type_check_date(element: object) -> str: + """ + Check if element corresponds to a date-like object. + """ + # check if element represents a date (no hour/minute/seconds) + is_date = False + # check if element represents a datetime (has hour/minute/seconds) + is_datetime = False + # check if it makes sense to convert element to unix time-stamp by + # evaluating if, when converted, the element represents a number that + # is compatible with a Unix timestamp (number of seconds since 1970-01-01T:00:00:00) + # note that we also check the number is not larger than the "epochalypse time", + # which is when the unix timestamp becomes larger than 2^32 - 1 seconds. We do + # this because timestamps outside this range are likely to be unreliable and hence + # rather treated as every-day numbers. 
+ min_dt = pd.to_datetime('1970-01-01 00:00:00', utc=True) + max_dt = pd.to_datetime('2038-01-19 03:14:08', utc=True) + valid_units = {'ns': 'unix', 'us': 'unix', 'ms': 'unix', 's': 'unix', + 'D': 'julian'} + for unit, origin in valid_units.items(): + try: + as_dt = pd.to_datetime(element, unit=unit, origin=origin, + errors='raise') + if min_dt < as_dt < max_dt: + is_datetime = True + break + except Exception: + pass + # check if element represents a date-like object. + # here we don't check for a validity range like with unix-timestamps + # because dates as string usually represent something more general than + # just the number of seconds since an epoch. + try: + as_dt = pd.to_datetime(element, errors='raise') + is_datetime = True + except Exception: + pass + # finally, if element is represents a datetime object, check if only + # date part is contained (no time information) + if is_datetime: + # round element day (drop hour/minute/second) + dt_d = as_dt.to_period('D').to_timestamp() + # if rounded datetime equals the datetime itself, it means there was not + # hour/minute/second information to begin with. Mind the 'localize' to + # avoid time-zone BS to kick in. + is_date = dt_d == as_dt.tz_localize(None) + if is_date: + return dtype.date + if is_datetime: + return dtype.datetime + + return None + + def count_data_types_in_column(self, data): + dtype_counts = Counter() + + type_checkers = [self.get_numeric_type, + self.type_check_sequence, + self.get_binary_type, + self.type_check_date] + + for element in data: + for type_checker in type_checkers: + try: + dtype_guess = type_checker(element) + except Exception: + dtype_guess = None + if dtype_guess is not None: + dtype_counts[dtype_guess] += 1 + break + else: + dtype_counts[dtype.invalid] += 1 + + return dtype_counts + + def get_column_data_type(self, + data: Union[pd.Series, np.ndarray, list], + full_data: pd.DataFrame, + col_name: str, + pct_invalid: float + ): + """ + Provided the column data, define its data type and data subtype. + + :param data: an iterable containing a sample of the data frame + :param full_data: an iterable containing the whole column of a data frame + + :return: type and type distribution, we can later use type_distribution to determine data quality + NOTE: type distribution is the count that this column has for belonging cells to each DATA_TYPE + """ + log.info(f'Infering type for: {col_name}') + additional_info = {'other_potential_dtypes': []} + + warn = [] + info = [] + if len(data) == 0: + warn.append(f'Column {col_name} has no data in it. 
') + warn.append(f'Please remove {col_name} from the training file or fill in some of the values !') + return None, None, additional_info, warn, info + + dtype_counts = self.count_data_types_in_column(data) + + known_dtype_dist = {k: v for k, v in dtype_counts.items()} + if dtype.float in known_dtype_dist and dtype.integer in known_dtype_dist: + known_dtype_dist[dtype.float] += known_dtype_dist[dtype.integer] + del known_dtype_dist[dtype.integer] + + if dtype.datetime in known_dtype_dist and dtype.date in known_dtype_dist: + known_dtype_dist[dtype.datetime] += known_dtype_dist[dtype.date] + del known_dtype_dist[dtype.date] + + max_known_dtype, max_known_dtype_count = max( + known_dtype_dist.items(), + key=lambda kv: kv[1] + ) + + actual_pct_invalid = 100 * (len(data) - max_known_dtype_count) / len(data) + if max_known_dtype is None or max_known_dtype == dtype.invalid: + curr_dtype = None + elif actual_pct_invalid > self.config['pct_invalid']: + if max_known_dtype in (dtype.integer, dtype.float) and actual_pct_invalid <= 5 * self.config['pct_invalid']: + curr_dtype = max_known_dtype + else: + curr_dtype = None + else: + curr_dtype = max_known_dtype + + nr_vals = len(data) + nr_distinct_vals = len(set([str(x) for x in data])) + + # Is it a quantity? + if curr_dtype not in (dtype.datetime, dtype.date): + is_quantity, quantitiy_info = self.get_quantity_col_info(data) + if is_quantity: + additional_info['quantitiy_info'] = quantitiy_info + curr_dtype = dtype.quantity + known_dtype_dist = { + dtype.quantity: nr_vals + } + + # Check for Tags subtype + if curr_dtype not in (dtype.quantity, dtype.num_array): + lengths = [] + unique_tokens = set() + + can_be_tags = False + if all(isinstance(x, str) for x in data): + can_be_tags = True + + mean_lenghts = np.mean(lengths) if len(lengths) > 0 else 0 + + # If more than 30% of the samples contain more than 1 category and there's more than 6 and less than 30 of them and they are shared between the various cells # noqa + if (can_be_tags and mean_lenghts > 1.3 and + 6 <= len(unique_tokens) <= 30 and + len(unique_tokens) / mean_lenghts < (len(data) / 4)): + curr_dtype = dtype.tags + + # Categorical based on unique values + if curr_dtype not in (dtype.date, dtype.datetime, dtype.tags, dtype.cat_array): + if curr_dtype in (dtype.integer, dtype.float): + is_categorical = nr_distinct_vals < 10 + else: + is_categorical = nr_distinct_vals < min(max((nr_vals / 100), 10), 3000) + + if is_categorical: + if curr_dtype is not None: + additional_info['other_potential_dtypes'].append(curr_dtype) + curr_dtype = dtype.categorical + + # If curr_data_type is still None, then it's text or category + if curr_dtype is None: + log.info(f'Doing text detection for column: {col_name}') + lang_dist = get_language_dist(data) # TODO: bottleneck + + # Normalize lang probabilities + for lang in lang_dist: + lang_dist[lang] /= len(data) + + # If most cells are unknown language then it's categorical + if lang_dist['Unknown'] > 0.5: + curr_dtype = dtype.categorical + else: + nr_words, word_dist, nr_words_dist = analyze_sentences(data) # TODO: maybe pass entire corpus at once + + if 1 in nr_words_dist and nr_words_dist[1] == nr_words: + curr_dtype = dtype.categorical + else: + if len(word_dist) > 500 and nr_words / len(data) > 5: + curr_dtype = dtype.rich_text + else: + curr_dtype = dtype.short_text + + return curr_dtype, {curr_dtype: len(data)}, additional_info, warn, info + + if curr_dtype in [dtype.categorical, dtype.rich_text, dtype.short_text, dtype.cat_array]: + known_dtype_dist = 
{curr_dtype: len(data)} + + if nr_distinct_vals < 3 and curr_dtype == dtype.categorical: + curr_dtype = dtype.binary + known_dtype_dist[dtype.binary] = known_dtype_dist[dtype.categorical] + del known_dtype_dist[dtype.categorical] + + log.info(f'Column {col_name} has data type {curr_dtype}') + return curr_dtype, known_dtype_dist, additional_info, warn, info + diff --git a/type_infer/rule_based/helpers.py b/type_infer/rule_based/helpers.py new file mode 100644 index 0000000..8e3049b --- /dev/null +++ b/type_infer/rule_based/helpers.py @@ -0,0 +1,182 @@ +import re +import nltk +import string +from typing import Iterable +from collections import Counter, defaultdict + +import numpy as np +import scipy.stats as st +from langid.langid import LanguageIdentifier +from langid.langid import model as langid_model + +from type_infer.dtype import dtype + + +try: + nltk.data.find('tokenizers/punkt') +except LookupError: + nltk.download('punkt') + +try: + from nltk.corpus import stopwords + stopwords.words('english') +except LookupError: + nltk.download('stopwords', quiet=True) + + +def get_identifier_description_mp(arg_tup): + data, column_name, data_dtype = arg_tup + return get_identifier_description(data, column_name, data_dtype) + + +def get_identifier_description(data: Iterable, column_name: str, data_dtype: dtype): + data = list(data) + if isinstance(data[0], list): + nr_unique = len(set(tuple(x) for x in data)) + elif isinstance(data[0], dict): + nr_unique = len(set(str(x) for x in data)) + else: + nr_unique = len(set(data)) + + if nr_unique == 1: + return 'No Information' + + unique_pct = nr_unique / len(data) + + spaces = [len(str(x).split(' ')) - 1 for x in data] + mean_spaces = np.mean(spaces) if len(spaces) > 0 else 0.0 + + # Detect hash + all_same_length = all(len(str(data[0])) == len(str(x)) for x in data) + uuid_charset = set('0123456789abcdefABCDEF-') + all_uuid_charset = all(set(str(x)).issubset(uuid_charset) for x in data) + is_uuid = all_uuid_charset and all_same_length + + if all_same_length and len(data) == nr_unique and data_dtype not in (dtype.integer, dtype.float): + str_data = [str(x) for x in data] + randomness_per_index = [] + for i, _ in enumerate(str_data[0]): + N = len(set(x[i] for x in str_data)) + S = st.entropy([*Counter(x[i] for x in str_data).values()]) + if S == 0: + randomness_per_index.append(0.0) + else: + randomness_per_index.append(S / np.log(N)) + + mean_randomness = np.mean(randomness_per_index) if len(randomness_per_index) > 0 else 0 + if mean_randomness > 0.95: + return 'Hash-like identifier' + + # Detect foreign key + if data_dtype == dtype.integer: + if _is_foreign_key_name(column_name): + return 'Foreign key' + + if _is_identifier_name(column_name) or data_dtype in (dtype.categorical, dtype.binary): + if unique_pct > 0.98: + if is_uuid: + return 'UUID' + else: + return 'Unknown identifier' + + # Everything is unique and it's too short to be rich text + if data_dtype in (dtype.categorical, dtype.binary, dtype.short_text, dtype.rich_text) and \ + unique_pct > 0.99999 and mean_spaces < 1: + return 'Unknown identifier' + + return None + + +def _is_foreign_key_name(name): + for endings in ['id', 'ID', 'Id']: + for add in ['-', '_', ' ']: + if name.endswith(add + endings): + return True + for endings in ['ID', 'Id']: + if name.endswith(endings): + return True + return False + + +def _is_identifier_name(name): + for keyword in ['account', 'uuid', 'identifier', 'user']: + if keyword in name: + return True + return False + + +def get_language_dist(data): + lang_dist 
= defaultdict(lambda: 0) + lang_dist['Unknown'] = 0 + lang_probs_cache = dict() + identifier = LanguageIdentifier.from_modelstring(langid_model, norm_probs=True) + for text in data: + text = str(text) + text = text.translate(str.maketrans('', '', string.punctuation)) + if text not in lang_probs_cache: + try: + lang_probs = identifier.classify(text) + except Exception: + lang_probs = [] + lang_probs_cache[text] = lang_probs + + lang_probs = lang_probs_cache[text] + if len(lang_probs) > 0 and lang_probs[1] > 10 * (1 / len(identifier.nb_classes)): + lang_dist[lang_probs[0]] += 1 + else: + lang_dist['Unknown'] += 1 + + return dict(lang_dist) + + +def analyze_sentences(data): + nr_words = 0 + word_dist = defaultdict(int) + nr_words_dist = defaultdict(int) + stop_words = set(stopwords.words('english')) + for text in map(str, data): + text = text.lower() + text_dist = defaultdict(int) + tokens = tokenize_text(text) + tokens_no_stop = (x for x in tokens if x not in stop_words) + for tok in tokens_no_stop: + text_dist[tok] += 1 + + n_tokens = len(text_dist) + nr_words_dist[n_tokens] += 1 + nr_words += n_tokens + + # merge text_dist into word_dist + for k, v in text_dist.items(): + word_dist[k] += v + + return nr_words, dict(word_dist), dict(nr_words_dist) + + +def contains_alnum(text): + for c in text: + if c.isalnum(): + return True + return False + + +def tokenize_text(text): + """ Generator instead of list comprehension for optimal memory usage & runtime """ + return (t.lower() for t in nltk.word_tokenize(decontracted(text)) if contains_alnum(t)) + + +def decontracted(phrase): + # specific + phrase = re.sub(r"won\'t", "will not", phrase) + phrase = re.sub(r"can\'t", "can not", phrase) + + # general + phrase = re.sub(r"n\'t", " not", phrase) + phrase = re.sub(r"\'re", " are", phrase) + phrase = re.sub(r"\'s", " is", phrase) + phrase = re.sub(r"\'d", " would", phrase) + phrase = re.sub(r"\'ll", " will", phrase) + phrase = re.sub(r"\'t", " not", phrase) + phrase = re.sub(r"\'ve", " have", phrase) + phrase = re.sub(r"\'m", " am", phrase) + return phrase
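For reference, two minimal sketches of the API surface introduced by this patch. They are illustrative only: the CSV path is the sample file used by the integration tests, and ConstantEngine is a hypothetical engine invented here purely to show the BaseEngine contract, not something shipped in the package.

# Sketch 1: the new config-driven entry point in type_infer.api.
# Keys omitted from the config fall back to the defaults set in api.py:
# engine='rule_based', pct_invalid=2, seed=420, mp_cutoff=1e4.
import pandas as pd
from type_infer.api import infer_types

df = pd.read_csv('tests/data/airline_sentiment_sample.csv')  # sample data from the test suite
config = {'engine': 'rule_based', 'pct_invalid': 0}
type_info = infer_types(df, config=config)

print(type_info.dtypes)       # column name -> inferred dtype
print(type_info.identifiers)  # columns flagged as identifiers, if any

# Sketch 2: the BaseEngine contract that future engines (e.g. the BERType stub in
# type_infer/bert/core.py) are expected to follow. api.infer_types currently dispatches
# only ENGINES.RULE_BASED, so a custom engine would have to be invoked directly.
from type_infer.base import BaseEngine, TypeInformation
from type_infer.dtype import dtype


class ConstantEngine(BaseEngine):
    """Toy engine that labels every column as categorical (illustration only)."""

    def __init__(self):
        super().__init__(stable=False)  # mark as experimental

    def infer(self, df: pd.DataFrame) -> TypeInformation:
        info = TypeInformation()
        for col in df.columns:
            info.dtypes[col] = dtype.categorical
            info.additional_info[col] = {'dtype_dist': {dtype.categorical: len(df)}}
        return info


print(ConstantEngine().infer(pd.DataFrame({'a': [1, 2, 3]})).dtypes)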