From c415465a9057e46071bb5e836395eb15c75cab2f Mon Sep 17 00:00:00 2001 From: Patricio Cerda Mardini Date: Fri, 22 Dec 2023 20:36:22 +0900 Subject: [PATCH] cleanup --- tests/unit_tests/rule_based/test_infer_dtypes.py | 2 +- tests/unit_tests/rule_based/test_misc.py | 12 ++++++++++++ tests/unit_tests/test_misc.py | 8 -------- type_infer/__init__.py | 3 --- type_infer/api.py | 6 +----- type_infer/base.py | 4 ++++ type_infer/bert/core.py | 9 +++++++++ type_infer/bert/infer.py | 1 - type_infer/dtype.py | 3 ++- type_infer/helpers.py | 4 +--- type_infer/rule_based/core.py | 11 +++++------ type_infer/rule_based/helpers.py | 5 ++--- 12 files changed, 37 insertions(+), 31 deletions(-) create mode 100644 tests/unit_tests/rule_based/test_misc.py create mode 100644 type_infer/bert/core.py delete mode 100644 type_infer/bert/infer.py diff --git a/tests/unit_tests/rule_based/test_infer_dtypes.py b/tests/unit_tests/rule_based/test_infer_dtypes.py index 5b601dc..1441d3e 100644 --- a/tests/unit_tests/rule_based/test_infer_dtypes.py +++ b/tests/unit_tests/rule_based/test_infer_dtypes.py @@ -15,7 +15,7 @@ def test_negative_integers(self): self.assertEqual(dtyp, dtype.integer) def test_negative_floats(self): - data = pd.DataFrame([-random.randint(-10, 10) for _ in range(100)] + [0.1], columns=['test_col']) + data = pd.DataFrame([float(-random.randint(-10, 10)) for _ in range(100)] + [0.1], columns=['test_col']) engine = RuleBasedEngine() dtyp, dist, ainfo, warn, info = engine.get_column_data_type(data['test_col'], data, 'test_col', 0.0) self.assertEqual(dtyp, dtype.float) diff --git a/tests/unit_tests/rule_based/test_misc.py b/tests/unit_tests/rule_based/test_misc.py new file mode 100644 index 0000000..1685645 --- /dev/null +++ b/tests/unit_tests/rule_based/test_misc.py @@ -0,0 +1,12 @@ +import unittest + +from type_infer.rule_based.helpers import tokenize_text + + +class TestDates(unittest.TestCase): + def test_get_tokens(self): + sentences = ['hello, world!', ' !hello! world!!,..#', '#hello!world'] + for sent in sentences: + assert list(tokenize_text(sent)) == ['hello', 'world'] + + assert list(tokenize_text("don't wouldn't")) == ['do', 'not', 'would', 'not'] diff --git a/tests/unit_tests/test_misc.py b/tests/unit_tests/test_misc.py index afa579b..4c597b7 100644 --- a/tests/unit_tests/test_misc.py +++ b/tests/unit_tests/test_misc.py @@ -3,7 +3,6 @@ from pathlib import Path import type_infer -from type_infer.rule_based.helpers import tokenize_text class TestDates(unittest.TestCase): @@ -19,10 +18,3 @@ def test_versions_are_in_sync(self): package_init_version = type_infer.__version__ self.assertEqual(package_init_version, pyproject_version) - - def test_get_tokens(self): - sentences = ['hello, world!', ' !hello! world!!,..#', '#hello!world'] - for sent in sentences: - assert list(tokenize_text(sent)) == ['hello', 'world'] - - assert list(tokenize_text("don't wouldn't")) == ['do', 'not', 'would', 'not'] diff --git a/type_infer/__init__.py b/type_infer/__init__.py index 9ec6732..c52ba24 100644 --- a/type_infer/__init__.py +++ b/type_infer/__init__.py @@ -3,13 +3,10 @@ from type_infer import api from type_infer import helpers -from type_infer.api import ENGINES - __version__ = '0.0.18' __all__ = [ '__version__', 'base', 'dtype', 'api', 'helpers', - 'ENGINES' ] diff --git a/type_infer/api.py b/type_infer/api.py index b0bffff..cffcc4d 100644 --- a/type_infer/api.py +++ b/type_infer/api.py @@ -1,14 +1,10 @@ from typing import Dict, Optional import pandas as pd -from type_infer.base import TypeInformation +from type_infer.base import TypeInformation, ENGINES from type_infer.rule_based.core import RuleBasedEngine -class ENGINES: - RULE_BASED = 'rule_based' - - def infer_types( data: pd.DataFrame, config: Optional[Dict] = None diff --git a/type_infer/base.py b/type_infer/base.py index ae4b890..8d3c6e5 100644 --- a/type_infer/base.py +++ b/type_infer/base.py @@ -33,3 +33,7 @@ def __init__(self, stable = True): def infer(self, df) -> TypeInformation: """Given a dataframe, infer the types of each column and return a TypeInformation object.""" raise NotImplementedError + + +class ENGINES: + RULE_BASED = 'rule_based' \ No newline at end of file diff --git a/type_infer/bert/core.py b/type_infer/bert/core.py new file mode 100644 index 0000000..2e99ab0 --- /dev/null +++ b/type_infer/bert/core.py @@ -0,0 +1,9 @@ +from type_infer.base import BaseEngine + + +class BERType(BaseEngine): + def __init__(self, stable=False): + super().__init__(stable=stable) + + def infer(self, df): + raise NotImplementedError diff --git a/type_infer/bert/infer.py b/type_infer/bert/infer.py deleted file mode 100644 index 7896162..0000000 --- a/type_infer/bert/infer.py +++ /dev/null @@ -1 +0,0 @@ -STABLE = False diff --git a/type_infer/dtype.py b/type_infer/dtype.py index e6e5819..178d925 100644 --- a/type_infer/dtype.py +++ b/type_infer/dtype.py @@ -46,4 +46,5 @@ class dtype: empty = "empty" invalid = "invalid" -# TODO: introduce "modifiers"? + +# TODO: modifier class + system diff --git a/type_infer/helpers.py b/type_infer/helpers.py index 641cd23..4e2752b 100644 --- a/type_infer/helpers.py +++ b/type_infer/helpers.py @@ -1,6 +1,4 @@ import os - -import pandas as pd import psutil import random import logging @@ -9,12 +7,12 @@ from typing import Iterable import numpy as np +import pandas as pd from scipy.stats import norm def initialize_log(): pid = os.getpid() - handler = colorlog.StreamHandler() handler.setFormatter(colorlog.ColoredFormatter()) diff --git a/type_infer/rule_based/core.py b/type_infer/rule_based/core.py index c749ffa..73554f8 100644 --- a/type_infer/rule_based/core.py +++ b/type_infer/rule_based/core.py @@ -1,18 +1,17 @@ import re import imghdr import sndhdr -from collections import Counter -from typing import List, Union import multiprocessing as mp +from typing import List, Union +from collections import Counter -import pandas as pd import numpy as np +import pandas as pd from type_infer.dtype import dtype from type_infer.base import BaseEngine, TypeInformation -from type_infer.helpers import log, seed, sample_data, get_nr_procs +from type_infer.helpers import log, seed, sample_data, get_nr_procs, is_nan_numeric, cast_string_to_python_type from type_infer.rule_based.helpers import get_language_dist, analyze_sentences, get_identifier_description_mp -from type_infer.helpers import is_nan_numeric, cast_string_to_python_type class RuleBasedEngine(BaseEngine): @@ -284,7 +283,7 @@ def count_data_types_in_column(self, data): return dtype_counts - def get_column_data_type(self, data: Union[np.ndarray, list], full_data: pd.DataFrame, col_name: str, pct_invalid: float): + def get_column_data_type(self, data: Union[pd.Series, np.ndarray, list], full_data: pd.DataFrame, col_name: str, pct_invalid: float): """ Provided the column data, define its data type and data subtype. diff --git a/type_infer/rule_based/helpers.py b/type_infer/rule_based/helpers.py index dd5c0c2..8e3049b 100644 --- a/type_infer/rule_based/helpers.py +++ b/type_infer/rule_based/helpers.py @@ -1,15 +1,14 @@ import re import nltk import string +from typing import Iterable +from collections import Counter, defaultdict import numpy as np import scipy.stats as st from langid.langid import LanguageIdentifier from langid.langid import model as langid_model -from typing import Iterable -from collections import Counter, defaultdict - from type_infer.dtype import dtype