cleanup

paxcema · paxcema · commit c415465a9057 · 2023-12-22T20:36:22.000+09:00
diff --git a/tests/unit_tests/rule_based/test_infer_dtypes.py b/tests/unit_tests/rule_based/test_infer_dtypes.py
@@ -15,7 +15,7 @@ def test_negative_integers(self):
         self.assertEqual(dtyp, dtype.integer)
 
     def test_negative_floats(self):
-        data = pd.DataFrame([-random.randint(-10, 10) for _ in range(100)] + [0.1], columns=['test_col'])
+        data = pd.DataFrame([float(-random.randint(-10, 10)) for _ in range(100)] + [0.1], columns=['test_col'])
         engine = RuleBasedEngine()
         dtyp, dist, ainfo, warn, info = engine.get_column_data_type(data['test_col'], data, 'test_col', 0.0)
         self.assertEqual(dtyp, dtype.float)
diff --git a/tests/unit_tests/rule_based/test_misc.py b/tests/unit_tests/rule_based/test_misc.py
@@ -0,0 +1,12 @@
+import unittest
+
+from type_infer.rule_based.helpers import tokenize_text
+
+
+class TestDates(unittest.TestCase):  # NOTE(review): exercises tokenization, not dates - consider renaming
+    def test_get_tokens(self):
+        """tokenize_text yields only the words of a sentence and expands contractions."""
+        for sent in ('hello, world!', ' !hello! world!!,..#', '#hello!world'):
+            self.assertEqual(list(tokenize_text(sent)), ['hello', 'world'])
+
+        self.assertEqual(list(tokenize_text("don't wouldn't")), ['do', 'not', 'would', 'not'])
diff --git a/tests/unit_tests/test_misc.py b/tests/unit_tests/test_misc.py
@@ -3,7 +3,6 @@
 from pathlib import Path
 
 import type_infer
-from type_infer.rule_based.helpers import tokenize_text
 
 
 class TestDates(unittest.TestCase):
@@ -19,10 +18,3 @@ def test_versions_are_in_sync(self):
         package_init_version = type_infer.__version__
 
         self.assertEqual(package_init_version, pyproject_version)
-
-    def test_get_tokens(self):
-        sentences = ['hello, world!', ' !hello! world!!,..#', '#hello!world']
-        for sent in sentences:
-            assert list(tokenize_text(sent)) == ['hello', 'world']
-
-        assert list(tokenize_text("don't wouldn't")) == ['do', 'not', 'would', 'not']
diff --git a/type_infer/__init__.py b/type_infer/__init__.py
@@ -3,13 +3,10 @@
 from type_infer import api
 from type_infer import helpers
 
-from type_infer.api import ENGINES
-
 __version__ = '0.0.18'
 
 
 __all__ = [
     '__version__',
     'base', 'dtype', 'api', 'helpers',
-    'ENGINES'
 ]
diff --git a/type_infer/api.py b/type_infer/api.py
@@ -1,14 +1,10 @@
 from typing import Dict, Optional
 import pandas as pd
 
-from type_infer.base import TypeInformation
+from type_infer.base import TypeInformation, ENGINES
 from type_infer.rule_based.core import RuleBasedEngine
 
 
-class ENGINES:
-    RULE_BASED = 'rule_based'
-
-
 def infer_types(
         data: pd.DataFrame,
         config: Optional[Dict] = None
diff --git a/type_infer/base.py b/type_infer/base.py
@@ -33,3 +33,7 @@ def __init__(self, stable = True):
     def infer(self, df) -> TypeInformation:
         """Given a dataframe, infer the types of each column and return a TypeInformation object."""
         raise NotImplementedError
+
+
+class ENGINES:  # namespace of engine-name string constants (imported by type_infer/api.py)
+    RULE_BASED = 'rule_based'  # presumably selects RuleBasedEngine - TODO confirm against api.infer_types
diff --git a/type_infer/bert/core.py b/type_infer/bert/core.py
@@ -0,0 +1,9 @@
+from type_infer.base import BaseEngine
+
+
+class BERType(BaseEngine):  # placeholder engine: infer() is not implemented yet
+    def __init__(self, stable=False):  # defaults to stable=False; forwarded unchanged to the parent constructor
+        super().__init__(stable=stable)
+
+    def infer(self, df):  # takes the same `df` argument as the base-class infer shown in base.py
+        raise NotImplementedError
diff --git a/type_infer/bert/infer.py b/type_infer/bert/infer.py
diff --git a/type_infer/dtype.py b/type_infer/dtype.py
@@ -46,4 +46,5 @@ class dtype:
     empty = "empty"
     invalid = "invalid"
 
-# TODO: introduce "modifiers"?
+
+# TODO: modifier class + system
diff --git a/type_infer/helpers.py b/type_infer/helpers.py
@@ -1,6 +1,4 @@
 import os
-
-import pandas as pd
 import psutil
 import random
 import logging
@@ -9,12 +7,12 @@
 from typing import Iterable
 
 import numpy as np
+import pandas as pd
 from scipy.stats import norm
 
 
 def initialize_log():
     pid = os.getpid()
-
     handler = colorlog.StreamHandler()
     handler.setFormatter(colorlog.ColoredFormatter())
 
diff --git a/type_infer/rule_based/core.py b/type_infer/rule_based/core.py
@@ -1,18 +1,17 @@
 import re
 import imghdr
 import sndhdr
-from collections import Counter
-from typing import List, Union
 import multiprocessing as mp
+from typing import List, Union
+from collections import Counter
 
-import pandas as pd
 import numpy as np
+import pandas as pd
 
 from type_infer.dtype import dtype
 from type_infer.base import BaseEngine, TypeInformation
-from type_infer.helpers import log, seed, sample_data, get_nr_procs
+from type_infer.helpers import log, seed, sample_data, get_nr_procs, is_nan_numeric, cast_string_to_python_type
 from type_infer.rule_based.helpers import get_language_dist, analyze_sentences, get_identifier_description_mp
-from type_infer.helpers import is_nan_numeric, cast_string_to_python_type
 
 
 class RuleBasedEngine(BaseEngine):
@@ -284,7 +283,7 @@ def count_data_types_in_column(self, data):
         return dtype_counts
 
 
-    def get_column_data_type(self, data: Union[np.ndarray, list], full_data: pd.DataFrame, col_name: str, pct_invalid: float):
+    def get_column_data_type(self, data: Union[pd.Series, np.ndarray, list], full_data: pd.DataFrame, col_name: str, pct_invalid: float):
         """
         Provided the column data, define its data type and data subtype.
 
diff --git a/type_infer/rule_based/helpers.py b/type_infer/rule_based/helpers.py
@@ -1,15 +1,14 @@
 import re
 import nltk
 import string
+from typing import Iterable
+from collections import Counter, defaultdict
 
 import numpy as np
 import scipy.stats as st
 from langid.langid import LanguageIdentifier
 from langid.langid import model as langid_model
 
-from typing import Iterable
-from collections import Counter, defaultdict
-
 from type_infer.dtype import dtype