diff --git a/pyproject.toml b/pyproject.toml index e7c758e..12775ef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "type_infer" -version = "0.0.17" +version = "0.0.18" description = "Automated type inference for Machine Learning pipelines." authors = ["MindsDB Inc. "] license = "GPL-3.0" @@ -15,12 +15,17 @@ numpy = "^1.15" pandas = "^2" dataclasses-json = "^0.6.3" colorlog = "^6.5.0" -langid = "^1.1.6" -nltk = "^3" -toml = "^0.10.2" psutil = "^5.9.0" +toml = "^0.10.2" +# rule based deps, part of core +langid = "^1.1.6" +nltk = "^3" [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" + +# TODO: update once this engine is introduced +[tool.poetry.extras] +# bert = ["torch"] diff --git a/tests/integration_tests/test_type_infer.py b/tests/integration_tests/test_rule_based.py similarity index 86% rename from tests/integration_tests/test_type_infer.py rename to tests/integration_tests/test_rule_based.py index 9ea74e1..56d5579 100644 --- a/tests/integration_tests/test_type_infer.py +++ b/tests/integration_tests/test_rule_based.py @@ -5,13 +5,14 @@ from datetime import datetime, timedelta from type_infer.dtype import dtype -from type_infer.infer import infer_types +from type_infer.api import infer_types -class TestTypeInference(unittest.TestCase): +class TestRuleBasedTypeInference(unittest.TestCase): def test_0_airline_sentiment(self): df = pd.read_csv("tests/data/airline_sentiment_sample.csv") - inferred_types = infer_types(df, pct_invalid=0) + config = {'engine': 'rule_based', 'pct_invalid': 0, 'seed': 420, 'mp_cutoff': 1e4} + inferred_types = infer_types(df, config=config) expected_types = { 'airline_sentiment': 'categorical', @@ -44,6 +45,7 @@ def test_0_airline_sentiment(self): def test_1_stack_overflow_survey(self): df = pd.read_csv("tests/data/stack_overflow_survey_sample.csv") + config = {'engine': 'rule_based', 'pct_invalid': 0, 'seed': 420, 'mp_cutoff': 1e4} expected_types = { 'Respondent': 'integer', @@ -68,7 +70,7 @@ def test_1_stack_overflow_survey(self): 'Professional': 'No Information' } - inferred_types = infer_types(df, pct_invalid=0) + inferred_types = infer_types(df, config=config) for col in expected_types: self.assertTrue(expected_types[col], inferred_types.dtypes[col]) @@ -90,7 +92,10 @@ def test_2_simple(self): # manual tinkering df['float'].iloc[-n_corrupted:] = 'random string' - inferred_types = infer_types(df, pct_invalid=100 * (n_corrupted) / n_points) + pct_invalid = 100 * (n_corrupted) / n_points + config = {'engine': 'rule_based', 'pct_invalid': pct_invalid, 'seed': 420, 'mp_cutoff': 1e4} + + inferred_types = infer_types(df, config=config) expected_types = { 'date': dtype.date, 'datetime': dtype.datetime, diff --git a/tests/unit_tests/rule_based/__init__.py b/tests/unit_tests/rule_based/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit_tests/test_dates.py b/tests/unit_tests/rule_based/test_dates.py similarity index 94% rename from tests/unit_tests/test_dates.py rename to tests/unit_tests/rule_based/test_dates.py index 6e8bfe7..35ea0e9 100644 --- a/tests/unit_tests/test_dates.py +++ b/tests/unit_tests/rule_based/test_dates.py @@ -1,7 +1,9 @@ import unittest from type_infer.dtype import dtype -from type_infer.infer import type_check_date +from type_infer.rule_based.core import RuleBasedEngine + +type_check_date = RuleBasedEngine.type_check_date class TestDates(unittest.TestCase): diff --git a/tests/unit_tests/rule_based/test_infer_dtypes.py 
b/tests/unit_tests/rule_based/test_infer_dtypes.py new file mode 100644 index 0000000..ef3f920 --- /dev/null +++ b/tests/unit_tests/rule_based/test_infer_dtypes.py @@ -0,0 +1,22 @@ +import unittest +import random + +import pandas as pd +from type_infer.rule_based.core import RuleBasedEngine +from type_infer.dtype import dtype + +get_column_data_type = RuleBasedEngine.get_column_data_type + + +class TestInferDtypes(unittest.TestCase): + def test_negative_integers(self): + data = pd.DataFrame([-random.randint(-10, 10) for _ in range(100)], columns=['test_col']) + engine = RuleBasedEngine() + dtyp, dist, ainfo, warn, info = engine.get_column_data_type(data['test_col'], data, 'test_col', 0.0) + self.assertEqual(dtyp, dtype.integer) + + def test_negative_floats(self): + data = pd.DataFrame([float(-random.randint(-10, 10)) for _ in range(100)] + [0.1], columns=['test_col']) + engine = RuleBasedEngine() + dtyp, dist, ainfo, warn, info = engine.get_column_data_type(data['test_col'], data, 'test_col', 0.0) + self.assertEqual(dtyp, dtype.float) diff --git a/tests/unit_tests/rule_based/test_misc.py b/tests/unit_tests/rule_based/test_misc.py new file mode 100644 index 0000000..1685645 --- /dev/null +++ b/tests/unit_tests/rule_based/test_misc.py @@ -0,0 +1,12 @@ +import unittest + +from type_infer.rule_based.helpers import tokenize_text + + +class TestDates(unittest.TestCase): + def test_get_tokens(self): + sentences = ['hello, world!', ' !hello! world!!,..#', '#hello!world'] + for sent in sentences: + assert list(tokenize_text(sent)) == ['hello', 'world'] + + assert list(tokenize_text("don't wouldn't")) == ['do', 'not', 'would', 'not'] diff --git a/tests/unit_tests/test_infer_dtypes.py b/tests/unit_tests/test_infer_dtypes.py deleted file mode 100644 index c93c28e..0000000 --- a/tests/unit_tests/test_infer_dtypes.py +++ /dev/null @@ -1,18 +0,0 @@ -import unittest -import random - -import pandas as pd -from type_infer.infer import get_column_data_type -from type_infer.dtype import dtype - - -class TestInferDtypes(unittest.TestCase): - def test_negative_integers(self): - data = pd.DataFrame([-random.randint(-10, 10) for _ in range(100)], columns=['test_col']) - dtyp, dist, ainfo, warn, info = get_column_data_type(data['test_col'], data, 'test_col', 0.0) - self.assertEqual(dtyp, dtype.integer) - - def test_negative_floats(self): - data = pd.DataFrame([-random.randint(-10, 10) for _ in range(100)] + [0.1], columns=['test_col']) - dtyp, dist, ainfo, warn, info = get_column_data_type(data['test_col'], data, 'test_col', 0.0) - self.assertEqual(dtyp, dtype.float) diff --git a/tests/unit_tests/test_misc.py b/tests/unit_tests/test_misc.py index 5c88338..4c597b7 100644 --- a/tests/unit_tests/test_misc.py +++ b/tests/unit_tests/test_misc.py @@ -3,7 +3,6 @@ from pathlib import Path import type_infer -from type_infer.helpers import tokenize_text class TestDates(unittest.TestCase): @@ -19,10 +18,3 @@ def test_versions_are_in_sync(self): package_init_version = type_infer.__version__ self.assertEqual(package_init_version, pyproject_version) - - def test_get_tokens(self): - sentences = ['hello, world!', ' !hello! 
world!!,..#', '#hello!world'] - for sent in sentences: - assert list(tokenize_text(sent)) == ['hello', 'world'] - - assert list(tokenize_text("don't wouldn't")) == ['do', 'not', 'would', 'not'] diff --git a/type_infer/__init__.py b/type_infer/__init__.py index fa649d7..c52ba24 100644 --- a/type_infer/__init__.py +++ b/type_infer/__init__.py @@ -1,10 +1,12 @@ from type_infer import base from type_infer import dtype -from type_infer import infer +from type_infer import api from type_infer import helpers +__version__ = '0.0.18' -__version__ = '0.0.17' - -__all__ = ['base', 'dtype', 'infer', 'helpers', '__version__'] +__all__ = [ + '__version__', + 'base', 'dtype', 'api', 'helpers', +] diff --git a/type_infer/api.py b/type_infer/api.py new file mode 100644 index 0000000..846541a --- /dev/null +++ b/type_infer/api.py @@ -0,0 +1,40 @@ +from typing import Dict, Optional +import pandas as pd + +from type_infer.base import TypeInformation, ENGINES +from type_infer.rule_based.core import RuleBasedEngine + + +def infer_types( + data: pd.DataFrame, + config: Optional[Dict] = None +) -> TypeInformation: + """ + Infers the data types of each column of the dataset by analyzing a small sample of + each column's items. + + Inputs + ---------- + data : pd.DataFrame + The input dataset for which we want to infer data type information. + """ + # Set global defaults if missing + if config is None: + config = {'engine': 'rule_based', 'pct_invalid': 2, 'seed': 420, 'mp_cutoff': 1e4} + elif 'engine' not in config: + config['engine'] = 'rule_based' + + if 'pct_invalid' not in config: + config['pct_invalid'] = 2 + + if 'seed' not in config: + config['seed'] = 420 + + if config['engine'] == ENGINES.RULE_BASED: + if 'mp_cutoff' not in config: + config['mp_cutoff'] = 1e4 + + engine = RuleBasedEngine(config) + return engine.infer(data) + else: + raise Exception(f'Unknown engine {config["engine"]}') diff --git a/type_infer/base.py b/type_infer/base.py index 94fc108..4bba17d 100644 --- a/type_infer/base.py +++ b/type_infer/base.py @@ -24,3 +24,16 @@ def __init__(self): self.dtypes = dict() self.additional_info = dict() self.identifiers = dict() + + +class BaseEngine: + def __init__(self, stable=True): + self.stable = stable # whether the engine is stable or not (i.e. 
experimental) + + def infer(self, df) -> TypeInformation: + """Given a dataframe, infer the types of each column and return a TypeInformation object.""" + raise NotImplementedError + + +class ENGINES: + RULE_BASED = 'rule_based' diff --git a/type_infer/bert/__init__.py b/type_infer/bert/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/type_infer/bert/core.py b/type_infer/bert/core.py new file mode 100644 index 0000000..2e99ab0 --- /dev/null +++ b/type_infer/bert/core.py @@ -0,0 +1,9 @@ +from type_infer.base import BaseEngine + + +class BERType(BaseEngine): + def __init__(self, stable=False): + super().__init__(stable=stable) + + def infer(self, df): + raise NotImplementedError diff --git a/type_infer/dtype.py b/type_infer/dtype.py index 9f05a2e..178d925 100644 --- a/type_infer/dtype.py +++ b/type_infer/dtype.py @@ -45,3 +45,6 @@ class dtype: # Misc (Unk/NaNs) empty = "empty" invalid = "invalid" + + +# TODO: modifier class + system diff --git a/type_infer/helpers.py b/type_infer/helpers.py index 3ae6bc1..4e2752b 100644 --- a/type_infer/helpers.py +++ b/type_infer/helpers.py @@ -1,34 +1,50 @@ import os -import re -import nltk import psutil import random -import string import logging import colorlog import multiprocessing as mp +from typing import Iterable import numpy as np -import scipy.stats as st -from langid.langid import LanguageIdentifier -from langid.langid import model as langid_model +import pandas as pd +from scipy.stats import norm -from typing import Iterable -from collections import Counter, defaultdict -from type_infer.dtype import dtype +def initialize_log(): + pid = os.getpid() + handler = colorlog.StreamHandler() + handler.setFormatter(colorlog.ColoredFormatter()) + + logging.basicConfig(handlers=[handler]) + log = logging.getLogger(f'type_infer-{pid}') + log_level = os.environ.get('TYPE_INFER_LOG', 'DEBUG') + log.setLevel(log_level) + return log + +log = initialize_log() + + +def get_nr_procs(df=None): + if 'MINDSDB_N_WORKERS' in os.environ: + try: + n = int(os.environ['MINDSDB_N_WORKERS']) + except ValueError: + n = 1 + return n + elif os.name == 'nt': + return 1 + else: + available_mem = psutil.virtual_memory().available + if df is not None: + max_per_proc_usage = df.size + else: + max_per_proc_usage = 0.2 * pow(10, 9) # multiplier * 1GB -try: - nltk.data.find('tokenizers/punkt') -except LookupError: - nltk.download('punkt') + proc_count = int(min(mp.cpu_count() - 1, available_mem // max_per_proc_usage)) -try: - from nltk.corpus import stopwords - stopwords.words('english') -except LookupError: - nltk.download('stopwords', quiet=True) + return max(proc_count, 1) def seed(seed_nr: int) -> None: @@ -58,103 +74,6 @@ def is_nan_numeric(value: object) -> bool: return isnan -def initialize_log(): - pid = os.getpid() - - handler = colorlog.StreamHandler() - handler.setFormatter(colorlog.ColoredFormatter()) - - logging.basicConfig(handlers=[handler]) - log = logging.getLogger(f'type_infer-{pid}') - log_level = os.environ.get('TYPE_INFER_LOG', 'DEBUG') - log.setLevel(log_level) - return log - - -log = initialize_log() - - -def get_identifier_description_mp(arg_tup): - data, column_name, data_dtype = arg_tup - return get_identifier_description(data, column_name, data_dtype) - - -def get_identifier_description(data: Iterable, column_name: str, data_dtype: dtype): - data = list(data) - if isinstance(data[0], list): - nr_unique = len(set(tuple(x) for x in data)) - elif isinstance(data[0], dict): - nr_unique = len(set(str(x) for x in data)) - else: - nr_unique = 
len(set(data)) - - if nr_unique == 1: - return 'No Information' - - unique_pct = nr_unique / len(data) - - spaces = [len(str(x).split(' ')) - 1 for x in data] - mean_spaces = np.mean(spaces) if len(spaces) > 0 else 0.0 - - # Detect hash - all_same_length = all(len(str(data[0])) == len(str(x)) for x in data) - uuid_charset = set('0123456789abcdefABCDEF-') - all_uuid_charset = all(set(str(x)).issubset(uuid_charset) for x in data) - is_uuid = all_uuid_charset and all_same_length - - if all_same_length and len(data) == nr_unique and data_dtype not in (dtype.integer, dtype.float): - str_data = [str(x) for x in data] - randomness_per_index = [] - for i, _ in enumerate(str_data[0]): - N = len(set(x[i] for x in str_data)) - S = st.entropy([*Counter(x[i] for x in str_data).values()]) - if S == 0: - randomness_per_index.append(0.0) - else: - randomness_per_index.append(S / np.log(N)) - - mean_randomness = np.mean(randomness_per_index) if len(randomness_per_index) > 0 else 0 - if mean_randomness > 0.95: - return 'Hash-like identifier' - - # Detect foreign key - if data_dtype == dtype.integer: - if _is_foreign_key_name(column_name): - return 'Foreign key' - - if _is_identifier_name(column_name) or data_dtype in (dtype.categorical, dtype.binary): - if unique_pct > 0.98: - if is_uuid: - return 'UUID' - else: - return 'Unknown identifier' - - # Everything is unique and it's too short to be rich text - if data_dtype in (dtype.categorical, dtype.binary, dtype.short_text, dtype.rich_text) and \ - unique_pct > 0.99999 and mean_spaces < 1: - return 'Unknown identifier' - - return None - - -def _is_foreign_key_name(name): - for endings in ['id', 'ID', 'Id']: - for add in ['-', '_', ' ']: - if name.endswith(add + endings): - return True - for endings in ['ID', 'Id']: - if name.endswith(endings): - return True - return False - - -def _is_identifier_name(name): - for keyword in ['account', 'uuid', 'identifier', 'user']: - if keyword in name: - return True - return False - - def cast_string_to_python_type(string): """ Returns None, an integer, float or a string from a string""" if string is None or string == '': @@ -173,7 +92,6 @@ def cast_string_to_python_type(string): return string -# TODO: Should this be here? 
def clean_float(val): if isinstance(val, (int, float)): return float(val) @@ -194,100 +112,72 @@ def clean_float(val): return None -def get_language_dist(data): - lang_dist = defaultdict(lambda: 0) - lang_dist['Unknown'] = 0 - lang_probs_cache = dict() - identifier = LanguageIdentifier.from_modelstring(langid_model, norm_probs=True) - for text in data: - text = str(text) - text = text.translate(str.maketrans('', '', string.punctuation)) - if text not in lang_probs_cache: - try: - lang_probs = identifier.classify(text) - except Exception: - lang_probs = [] - lang_probs_cache[text] = lang_probs - - lang_probs = lang_probs_cache[text] - if len(lang_probs) > 0 and lang_probs[1] > 10 * (1 / len(identifier.nb_classes)): - lang_dist[lang_probs[0]] += 1 - else: - lang_dist['Unknown'] += 1 - - return dict(lang_dist) - - -def analyze_sentences(data): - nr_words = 0 - word_dist = defaultdict(int) - nr_words_dist = defaultdict(int) - stop_words = set(stopwords.words('english')) - for text in map(str, data): - text = text.lower() - text_dist = defaultdict(int) - tokens = tokenize_text(text) - tokens_no_stop = (x for x in tokens if x not in stop_words) - for tok in tokens_no_stop: - text_dist[tok] += 1 - - n_tokens = len(text_dist) - nr_words_dist[n_tokens] += 1 - nr_words += n_tokens - - # merge text_dist into word_dist - for k, v in text_dist.items(): - word_dist[k] += v - - return nr_words, dict(word_dist), dict(nr_words_dist) - - -# @TODO: eventually move these into .helpers.text -def tokenize_text(text): - """ Generator instead of list comprehension for optimal memory usage & runtime """ - return (t.lower() for t in nltk.word_tokenize(decontracted(text)) if contains_alnum(t)) - - -def decontracted(phrase): - # specific - phrase = re.sub(r"won\'t", "will not", phrase) - phrase = re.sub(r"can\'t", "can not", phrase) - - # general - phrase = re.sub(r"n\'t", " not", phrase) - phrase = re.sub(r"\'re", " are", phrase) - phrase = re.sub(r"\'s", " is", phrase) - phrase = re.sub(r"\'d", " would", phrase) - phrase = re.sub(r"\'ll", " will", phrase) - phrase = re.sub(r"\'t", " not", phrase) - phrase = re.sub(r"\'ve", " have", phrase) - phrase = re.sub(r"\'m", " am", phrase) - return phrase - +def sample_data(df: pd.DataFrame) -> pd.DataFrame: + population_size = len(df) + if population_size <= 50: + sample_size = population_size + else: + sample_size = int(round(_calculate_sample_size(population_size))) -def contains_alnum(text): - for c in text: - if c.isalnum(): - return True - return False + population_size = len(df) + input_data_sample_indexes = random.sample(range(population_size), sample_size) + return df.iloc[input_data_sample_indexes] -def get_nr_procs(df=None): - if 'MINDSDB_N_WORKERS' in os.environ: - try: - n = int(os.environ['MINDSDB_N_WORKERS']) - except ValueError: - n = 1 - return n - elif os.name == 'nt': - return 1 +def _calculate_sample_size( + population_size, + margin_error=.01, + confidence_level=.995, + sigma=1 / 2 +): + """ + Calculate the minimal sample size to use to achieve a certain + margin of error and confidence level for a sample estimate + of the population mean. + Inputs + ------- + population_size: integer + Total size of the population that the sample is to be drawn from. + margin_error: number + Maximum expected difference between the true population parameter, + such as the mean, and the sample estimate. 
+ confidence_level: number in the interval (0, 1) + If we were to draw a large number of equal-size samples + from the population, the true population parameter + should lie within this percentage + of the intervals (sample_parameter - e, sample_parameter + e) + where e is the margin_error. + sigma: number + The standard deviation of the population. For the case + of estimating a parameter in the interval [0, 1], sigma=1/2 + should be sufficient. + """ + alpha = 1 - confidence_level + # dictionary of confidence levels and corresponding z-scores + # computed via norm.ppf(1 - (alpha/2)), where norm is + # a normal distribution object in scipy.stats. + # Here, ppf is the percentile point function. + zdict = { + .90: 1.645, + .91: 1.695, + .99: 2.576, + .97: 2.17, + .94: 1.881, + .93: 1.812, + .95: 1.96, + .98: 2.326, + .96: 2.054, + .92: 1.751 + } + if confidence_level in zdict: + z = zdict[confidence_level] else: - available_mem = psutil.virtual_memory().available - if df is not None: - max_per_proc_usage = df.size - else: - max_per_proc_usage = 0.2 * pow(10, 9) # multiplier * 1GB - - proc_count = int(min(mp.cpu_count() - 1, available_mem // max_per_proc_usage)) - - return max(proc_count, 1) + # Inf fix + if alpha == 0.0: + alpha += 0.001 + z = norm.ppf(1 - (alpha / 2)) + N = population_size + M = margin_error + numerator = z**2 * sigma**2 * (N / (N - 1)) + denom = M**2 + ((z**2 * sigma**2) / (N - 1)) + return numerator / denom diff --git a/type_infer/infer.py b/type_infer/infer.py deleted file mode 100644 index dd329f3..0000000 --- a/type_infer/infer.py +++ /dev/null @@ -1,490 +0,0 @@ -import re -import random -import imghdr -import sndhdr -import multiprocessing as mp -from collections import Counter -from typing import List, Union - -from scipy.stats import norm -import pandas as pd -import numpy as np - -from type_infer.base import TypeInformation -from type_infer.dtype import dtype -from type_infer.helpers import seed, log # TODO: move somewhere else? -from type_infer.helpers import get_nr_procs -from type_infer.helpers import is_nan_numeric, get_identifier_description_mp, cast_string_to_python_type, \ - get_language_dist, analyze_sentences - - -# @TODO: hardcode for distance, time, subunits of currency (e.g. cents) and other common units -# @TODO: Add tests with plenty of examples -def get_quantity_col_info(col_data: pd.Series) -> str: - assert isinstance(col_data, pd.Series) - char_const = None - nr_map = set() - for val in col_data: - val = str(val) - char_part = re.sub("[0-9.,]", '', val) - numeric_bit = re.sub("[^0-9.,]", '', val).replace(',', '.') - - if len(char_part) == 0: - char_part = None - - if len(re.sub("[^0-9]", '', numeric_bit)) == 0 or numeric_bit.count('.') > 1: - numeric_bit = None - else: - numeric_bit = float(numeric_bit) - - if numeric_bit is None: - return False, None - else: - nr_map.add(numeric_bit) - - if char_const is None: - char_const = char_part - - if char_part is None or char_part == '-' or char_part != char_const: - return False, None - - if len(nr_map) > 20 and len(nr_map) > len(col_data) / 200: - return True, {char_const: { - 'multiplier': 1 - }} - else: - return False, None - - -def get_binary_type(element: object) -> str: - try: - is_img = imghdr.what(element) - if is_img is not None: - return dtype.image - - # @TODO: currently we don differentiate between audio and video - is_audio = sndhdr.what(element) - # apparently `sndhdr` is really bad.. 
- for audio_ext in ['.wav', '.mp3']: - if element.endswith(audio_ext): - is_audio = True - if is_audio is not None: - return dtype.audio - except Exception: - # Not a file or file doesn't exist - return None - - -def get_numeric_type(element: object) -> str: - """ Returns the subtype inferred from a number string, or False if its not a number""" - string_as_nr = cast_string_to_python_type(str(element)) - - try: - if string_as_nr == int(string_as_nr): - string_as_nr = int(string_as_nr) - except Exception: - pass - - if isinstance(string_as_nr, float): - return dtype.float - elif isinstance(string_as_nr, int): - return dtype.integer - else: - try: - if is_nan_numeric(element): - return dtype.integer - else: - return None - except Exception: - return None - - -def type_check_sequence(element: object) -> str: - dtype_guess = None - - if isinstance(element, List): - all_nr = all([get_numeric_type(ele) for ele in element]) - if all_nr: - dtype_guess = dtype.num_array - else: - dtype_guess = dtype.cat_array - else: - for sep_char in [',', '\t', '|', ' ']: # @TODO: potential bottleneck, cutoff after a while - all_nr = True - if '[' in element: - ele_arr = element.rstrip(']').lstrip('[').split(sep_char) - else: - ele_arr = element.rstrip(')').lstrip('(').split(sep_char) - - for ele in ele_arr: - if not get_numeric_type(ele): - all_nr = False - break - - if len(ele_arr) > 1 and all_nr: - dtype_guess = dtype.num_array - - return dtype_guess - - -def type_check_date(element: object) -> str: - """ - Check if element corresponds to a date-like object. - """ - # check if element represents a date (no hour/minute/seconds) - is_date = False - # check if element represents a datetime (has hour/minute/seconds) - is_datetime = False - # check if it makes sense to convert element to unix time-stamp by - # evaluating if, when converted, the element represents a number that - # is compatible with a Unix timestamp (number of seconds since 1970-01-01T:00:00:00) - # note that we also check the number is not larger than the "epochalypse time", - # which is when the unix timestamp becomes larger than 2^32 - 1 seconds. We do - # this because timestamps outside this range are likely to be unreliable and hence - # rather treated as every-day numbers. - min_dt = pd.to_datetime('1970-01-01 00:00:00', utc=True) - max_dt = pd.to_datetime('2038-01-19 03:14:08', utc=True) - valid_units = {'ns': 'unix', 'us': 'unix', 'ms': 'unix', 's': 'unix', - 'D': 'julian'} - for unit, origin in valid_units.items(): - try: - as_dt = pd.to_datetime(element, unit=unit, origin=origin, - errors='raise') - if min_dt < as_dt < max_dt: - is_datetime = True - break - except Exception: - pass - # check if element represents a date-like object. - # here we don't check for a validity range like with unix-timestamps - # because dates as string usually represent something more general than - # just the number of seconds since an epoch. - try: - as_dt = pd.to_datetime(element, errors='raise') - is_datetime = True - except Exception: - pass - # finally, if element is represents a datetime object, check if only - # date part is contained (no time information) - if is_datetime: - # round element day (drop hour/minute/second) - dt_d = as_dt.to_period('D').to_timestamp() - # if rounded datetime equals the datetime itself, it means there was not - # hour/minute/second information to begin with. Mind the 'localize' to - # avoid time-zone BS to kick in. 
- is_date = dt_d == as_dt.tz_localize(None) - if is_date: - return dtype.date - if is_datetime: - return dtype.datetime - - return None - - -def count_data_types_in_column(data): - dtype_counts = Counter() - - type_checkers = [get_numeric_type, - type_check_sequence, - get_binary_type, - type_check_date] - - for element in data: - for type_checker in type_checkers: - try: - dtype_guess = type_checker(element) - except Exception: - dtype_guess = None - if dtype_guess is not None: - dtype_counts[dtype_guess] += 1 - break - else: - dtype_counts[dtype.invalid] += 1 - - return dtype_counts - - -def get_column_data_type(data: Union[np.ndarray, list], full_data: pd.DataFrame, col_name: str, pct_invalid: float): - """ - Provided the column data, define its data type and data subtype. - - :param data: an iterable containing a sample of the data frame - :param full_data: an iterable containing the whole column of a data frame - - :return: type and type distribution, we can later use type_distribution to determine data quality - NOTE: type distribution is the count that this column has for belonging cells to each DATA_TYPE - """ - log.info(f'Infering type for: {col_name}') - additional_info = {'other_potential_dtypes': []} - - warn = [] - info = [] - if len(data) == 0: - warn.append(f'Column {col_name} has no data in it. ') - warn.append(f'Please remove {col_name} from the training file or fill in some of the values !') - return None, None, additional_info, warn, info - - dtype_counts = count_data_types_in_column(data) - - known_dtype_dist = {k: v for k, v in dtype_counts.items()} - if dtype.float in known_dtype_dist and dtype.integer in known_dtype_dist: - known_dtype_dist[dtype.float] += known_dtype_dist[dtype.integer] - del known_dtype_dist[dtype.integer] - - if dtype.datetime in known_dtype_dist and dtype.date in known_dtype_dist: - known_dtype_dist[dtype.datetime] += known_dtype_dist[dtype.date] - del known_dtype_dist[dtype.date] - - max_known_dtype, max_known_dtype_count = max( - known_dtype_dist.items(), - key=lambda kv: kv[1] - ) - - actual_pct_invalid = 100 * (len(data) - max_known_dtype_count) / len(data) - if max_known_dtype is None or max_known_dtype == dtype.invalid: - curr_dtype = None - elif actual_pct_invalid > pct_invalid: - if max_known_dtype in (dtype.integer, dtype.float) and actual_pct_invalid <= 5 * pct_invalid: - curr_dtype = max_known_dtype - else: - curr_dtype = None - else: - curr_dtype = max_known_dtype - - nr_vals = len(data) - nr_distinct_vals = len(set([str(x) for x in data])) - - # Is it a quantity? 
- if curr_dtype not in (dtype.datetime, dtype.date): - is_quantity, quantitiy_info = get_quantity_col_info(data) - if is_quantity: - additional_info['quantitiy_info'] = quantitiy_info - curr_dtype = dtype.quantity - known_dtype_dist = { - dtype.quantity: nr_vals - } - - # Check for Tags subtype - if curr_dtype not in (dtype.quantity, dtype.num_array): - lengths = [] - unique_tokens = set() - - can_be_tags = False - if all(isinstance(x, str) for x in data): - can_be_tags = True - - mean_lenghts = np.mean(lengths) if len(lengths) > 0 else 0 - - # If more than 30% of the samples contain more than 1 category and there's more than 6 and less than 30 of them and they are shared between the various cells # noqa - if (can_be_tags and mean_lenghts > 1.3 and - 6 <= len(unique_tokens) <= 30 and - len(unique_tokens) / mean_lenghts < (len(data) / 4)): - curr_dtype = dtype.tags - - # Categorical based on unique values - if curr_dtype not in (dtype.date, dtype.datetime, dtype.tags, dtype.cat_array): - if curr_dtype in (dtype.integer, dtype.float): - is_categorical = nr_distinct_vals < 10 - else: - is_categorical = nr_distinct_vals < min(max((nr_vals / 100), 10), 3000) - - if is_categorical: - if curr_dtype is not None: - additional_info['other_potential_dtypes'].append(curr_dtype) - curr_dtype = dtype.categorical - - # If curr_data_type is still None, then it's text or category - if curr_dtype is None: - log.info(f'Doing text detection for column: {col_name}') - lang_dist = get_language_dist(data) # TODO: bottleneck - - # Normalize lang probabilities - for lang in lang_dist: - lang_dist[lang] /= len(data) - - # If most cells are unknown language then it's categorical - if lang_dist['Unknown'] > 0.5: - curr_dtype = dtype.categorical - else: - nr_words, word_dist, nr_words_dist = analyze_sentences(data) # TODO: maybe pass entire corpus at once - - if 1 in nr_words_dist and nr_words_dist[1] == nr_words: - curr_dtype = dtype.categorical - else: - if len(word_dist) > 500 and nr_words / len(data) > 5: - curr_dtype = dtype.rich_text - else: - curr_dtype = dtype.short_text - - return curr_dtype, {curr_dtype: len(data)}, additional_info, warn, info - - if curr_dtype in [dtype.categorical, dtype.rich_text, dtype.short_text, dtype.cat_array]: - known_dtype_dist = {curr_dtype: len(data)} - - if nr_distinct_vals < 3 and curr_dtype == dtype.categorical: - curr_dtype = dtype.binary - known_dtype_dist[dtype.binary] = known_dtype_dist[dtype.categorical] - del known_dtype_dist[dtype.categorical] - - log.info(f'Column {col_name} has data type {curr_dtype}') - return curr_dtype, known_dtype_dist, additional_info, warn, info - - -def calculate_sample_size( - population_size, - margin_error=.01, - confidence_level=.995, - sigma=1 / 2 -): - """ - Calculate the minimal sample size to use to achieve a certain - margin of error and confidence level for a sample estimate - of the population mean. - Inputs - ------- - population_size: integer - Total size of the population that the sample is to be drawn from. - margin_error: number - Maximum expected difference between the true population parameter, - such as the mean, and the sample estimate. - confidence_level: number in the interval (0, 1) - If we were to draw a large number of equal-size samples - from the population, the true population parameter - should lie within this percentage - of the intervals (sample_parameter - e, sample_parameter + e) - where e is the margin_error. - sigma: number - The standard deviation of the population. 
For the case - of estimating a parameter in the interval [0, 1], sigma=1/2 - should be sufficient. - """ - alpha = 1 - (confidence_level) - # dictionary of confidence levels and corresponding z-scores - # computed via norm.ppf(1 - (alpha/2)), where norm is - # a normal distribution object in scipy.stats. - # Here, ppf is the percentile point function. - zdict = { - .90: 1.645, - .91: 1.695, - .99: 2.576, - .97: 2.17, - .94: 1.881, - .93: 1.812, - .95: 1.96, - .98: 2.326, - .96: 2.054, - .92: 1.751 - } - if confidence_level in zdict: - z = zdict[confidence_level] - else: - # Inf fix - if alpha == 0.0: - alpha += 0.001 - z = norm.ppf(1 - (alpha / 2)) - N = population_size - M = margin_error - numerator = z**2 * sigma**2 * (N / (N - 1)) - denom = M**2 + ((z**2 * sigma**2) / (N - 1)) - return numerator / denom - - -def sample_data(df: pd.DataFrame) -> pd.DataFrame: - population_size = len(df) - if population_size <= 50: - sample_size = population_size - else: - sample_size = int(round(calculate_sample_size(population_size))) - - population_size = len(df) - input_data_sample_indexes = random.sample(range(population_size), sample_size) - return df.iloc[input_data_sample_indexes] - - -def infer_types( - data: pd.DataFrame, - pct_invalid: float, - seed_nr: int = 420, - mp_cutoff: int = 1e4, -) -> TypeInformation: - """ - Infers the data types of each column of the dataset by analyzing a small sample of - each column's items. - - Inputs - ---------- - data : pd.DataFrame - The input dataset for which we want to infer data type information. - pct_invalid : float - The percentage, i.e. a float between 0.0 and 100.0, of invalid values that are - accepted before failing the type inference for a column. - seed_nr : int, optional - Seed for the random number generator, by default 420 - mp_cutoff : int, optional - How many elements in the dataframe before switching to parallel processing, by - default 1e4. 
- """ - seed(seed_nr) - type_information = TypeInformation() - sample_df = sample_data(data) - sample_size = len(sample_df) - population_size = len(data) - log.info(f'Analyzing a sample of {sample_size}') - log.info( - f'from a total population of {population_size}, this is equivalent to {round(sample_size*100/population_size, 1)}% of your data.') # noqa - - nr_procs = get_nr_procs(df=sample_df) - pool_size = min(nr_procs, len(sample_df.columns.values)) - if data.size > mp_cutoff and pool_size > 1: - log.info(f'Using {pool_size} processes to deduct types.') - pool = mp.Pool(processes=pool_size) - # column-wise parallelization # TODO: evaluate switching to row-wise split instead - answer_arr = pool.starmap(get_column_data_type, [ - (sample_df[x].dropna(), data[x], x, pct_invalid) for x in sample_df.columns.values - ]) - pool.close() - pool.join() - else: - answer_arr = [] - for x in sample_df.columns: - answer_arr.append(get_column_data_type(sample_df[x].dropna(), data, x, pct_invalid)) - - for i, col_name in enumerate(sample_df.columns): - (data_dtype, data_dtype_dist, additional_info, warn, info) = answer_arr[i] - - for msg in warn: - log.warning(msg) - for msg in info: - log.info(msg) - - if data_dtype is None: - data_dtype = dtype.invalid - - type_information.dtypes[col_name] = data_dtype - type_information.additional_info[col_name] = { - 'dtype_dist': data_dtype_dist - } - - if data.size > mp_cutoff and pool_size > 1: - pool = mp.Pool(processes=pool_size) - answer_arr = pool.map(get_identifier_description_mp, [ - (data[x], x, type_information.dtypes[x]) - for x in sample_df.columns - ]) - pool.close() - pool.join() - else: - answer_arr = [] - for x in sample_df.columns: - answer = get_identifier_description_mp([data[x], x, type_information.dtypes[x]]) - answer_arr.append(answer) - - for i, col_name in enumerate(sample_df.columns): - # work with the full data - if answer_arr[i] is not None: - log.warning(f'Column {col_name} is an identifier of type "{answer_arr[i]}"') - type_information.identifiers[col_name] = answer_arr[i] - - # @TODO Column removal logic was here, if the column was an identifier, move it elsewhere - - return type_information diff --git a/type_infer/rule_based/__init__.py b/type_infer/rule_based/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/type_infer/rule_based/core.py b/type_infer/rule_based/core.py new file mode 100644 index 0000000..43a35a1 --- /dev/null +++ b/type_infer/rule_based/core.py @@ -0,0 +1,409 @@ +import re +import imghdr +import sndhdr +import multiprocessing as mp +from typing import List, Union +from collections import Counter + +import numpy as np +import pandas as pd + +from type_infer.dtype import dtype +from type_infer.base import BaseEngine, TypeInformation +from type_infer.helpers import log, seed, sample_data, get_nr_procs, is_nan_numeric, cast_string_to_python_type +from type_infer.rule_based.helpers import get_language_dist, analyze_sentences, get_identifier_description_mp + + +class RuleBasedEngine(BaseEngine): + def __init__(self, config=None): + """ + :param config: a dictionary containing the configuration for the engine + pct_invalid : float + The percentage, i.e. a float between 0.0 and 100.0, of invalid values that are + accepted before failing the type inference for a column. + seed : int, optional + Seed for the random number generator, by default 420 + mp_cutoff : int, optional + How many elements in the dataframe before switching to parallel processing, by + default 1e4. 
+ """ + super().__init__(stable=True) + self.config = config if config else {'pct_invalid': 2, 'seed': 420, 'mp_cutoff': 1e4} + + def infer(self, data: pd.DataFrame) -> TypeInformation: + seed(self.config['seed']) + type_information = TypeInformation() + sample_df = sample_data(data) + sample_size = len(sample_df) + population_size = len(data) + log.info(f'Analyzing a sample of {sample_size}') + log.info( + f'from a total population of {population_size}, this is equivalent to {round(sample_size * 100 / population_size, 1)}% of your data.') # noqa + + nr_procs = get_nr_procs(df=sample_df) + pool_size = min(nr_procs, len(sample_df.columns.values)) + if data.size > self.config['mp_cutoff'] and pool_size > 1: + log.info(f'Using {pool_size} processes to deduct types.') + pool = mp.Pool(processes=pool_size) + # column-wise parallelization # TODO: evaluate switching to row-wise split instead + answer_arr = pool.starmap(self.get_column_data_type, [ + (sample_df[x].dropna(), data[x], x, self.config['pct_invalid']) for x in sample_df.columns.values + ]) + pool.close() + pool.join() + else: + answer_arr = [] + for x in sample_df.columns: + answer_arr.append(self.get_column_data_type(sample_df[x].dropna(), data, x, self.config['pct_invalid'])) + + for i, col_name in enumerate(sample_df.columns): + (data_dtype, data_dtype_dist, additional_info, warn, info) = answer_arr[i] + + for msg in warn: + log.warning(msg) + for msg in info: + log.info(msg) + + if data_dtype is None: + data_dtype = dtype.invalid + + type_information.dtypes[col_name] = data_dtype + type_information.additional_info[col_name] = { + 'dtype_dist': data_dtype_dist + } + + if data.size > self.config['mp_cutoff'] and pool_size > 1: + pool = mp.Pool(processes=pool_size) + answer_arr = pool.map(get_identifier_description_mp, [ + (data[x], x, type_information.dtypes[x]) + for x in sample_df.columns + ]) + pool.close() + pool.join() + else: + answer_arr = [] + for x in sample_df.columns: + answer = get_identifier_description_mp([data[x], x, type_information.dtypes[x]]) + answer_arr.append(answer) + + for i, col_name in enumerate(sample_df.columns): + # work with the full data + if answer_arr[i] is not None: + log.warning(f'Column {col_name} is an identifier of type "{answer_arr[i]}"') + type_information.identifiers[col_name] = answer_arr[i] + + # @TODO Column removal logic was here, if the column was an identifier, move it elsewhere + return type_information + + # @TODO: hardcode for distance, time, subunits of currency (e.g. 
cents) and other common units + # @TODO: Add tests with plenty of examples + + def get_quantity_col_info(self, col_data: pd.Series) -> str: + assert isinstance(col_data, pd.Series) + char_const = None + nr_map = set() + for val in col_data: + val = str(val) + char_part = re.sub("[0-9.,]", '', val) + numeric_bit = re.sub("[^0-9.,]", '', val).replace(',', '.') + + if len(char_part) == 0: + char_part = None + + if len(re.sub("[^0-9]", '', numeric_bit)) == 0 or numeric_bit.count('.') > 1: + numeric_bit = None + else: + numeric_bit = float(numeric_bit) + + if numeric_bit is None: + return False, None + else: + nr_map.add(numeric_bit) + + if char_const is None: + char_const = char_part + + if char_part is None or char_part == '-' or char_part != char_const: + return False, None + + if len(nr_map) > 20 and len(nr_map) > len(col_data) / 200: + return True, {char_const: { + 'multiplier': 1 + }} + else: + return False, None + + def get_binary_type(self, element: object) -> str: + try: + is_img = imghdr.what(element) + if is_img is not None: + return dtype.image + + # @TODO: currently we don differentiate between audio and video + is_audio = sndhdr.what(element) + # apparently `sndhdr` is really bad.. + for audio_ext in ['.wav', '.mp3']: + if element.endswith(audio_ext): + is_audio = True + if is_audio is not None: + return dtype.audio + except Exception: + # Not a file or file doesn't exist + return None + + def get_numeric_type(self, element: object) -> str: + """ Returns the subtype inferred from a number string, or False if its not a number""" + string_as_nr = cast_string_to_python_type(str(element)) + + try: + if string_as_nr == int(string_as_nr): + string_as_nr = int(string_as_nr) + except Exception: + pass + + if isinstance(string_as_nr, float): + return dtype.float + elif isinstance(string_as_nr, int): + return dtype.integer + else: + try: + if is_nan_numeric(element): + return dtype.integer + else: + return None + except Exception: + return None + + def type_check_sequence(self, element: object) -> str: + dtype_guess = None + + if isinstance(element, List): + all_nr = all([self.get_numeric_type(ele) for ele in element]) + if all_nr: + dtype_guess = dtype.num_array + else: + dtype_guess = dtype.cat_array + else: + for sep_char in [',', '\t', '|', ' ']: # @TODO: potential bottleneck, cutoff after a while + all_nr = True + if '[' in element: + ele_arr = element.rstrip(']').lstrip('[').split(sep_char) + else: + ele_arr = element.rstrip(')').lstrip('(').split(sep_char) + + for ele in ele_arr: + if not self.get_numeric_type(ele): + all_nr = False + break + + if len(ele_arr) > 1 and all_nr: + dtype_guess = dtype.num_array + + return dtype_guess + + @staticmethod + def type_check_date(element: object) -> str: + """ + Check if element corresponds to a date-like object. + """ + # check if element represents a date (no hour/minute/seconds) + is_date = False + # check if element represents a datetime (has hour/minute/seconds) + is_datetime = False + # check if it makes sense to convert element to unix time-stamp by + # evaluating if, when converted, the element represents a number that + # is compatible with a Unix timestamp (number of seconds since 1970-01-01T:00:00:00) + # note that we also check the number is not larger than the "epochalypse time", + # which is when the unix timestamp becomes larger than 2^32 - 1 seconds. We do + # this because timestamps outside this range are likely to be unreliable and hence + # rather treated as every-day numbers. 
+ min_dt = pd.to_datetime('1970-01-01 00:00:00', utc=True) + max_dt = pd.to_datetime('2038-01-19 03:14:08', utc=True) + valid_units = {'ns': 'unix', 'us': 'unix', 'ms': 'unix', 's': 'unix', + 'D': 'julian'} + for unit, origin in valid_units.items(): + try: + as_dt = pd.to_datetime(element, unit=unit, origin=origin, + errors='raise') + if min_dt < as_dt < max_dt: + is_datetime = True + break + except Exception: + pass + # check if element represents a date-like object. + # here we don't check for a validity range like with unix-timestamps + # because dates as string usually represent something more general than + # just the number of seconds since an epoch. + try: + as_dt = pd.to_datetime(element, errors='raise') + is_datetime = True + except Exception: + pass + # finally, if element is represents a datetime object, check if only + # date part is contained (no time information) + if is_datetime: + # round element day (drop hour/minute/second) + dt_d = as_dt.to_period('D').to_timestamp() + # if rounded datetime equals the datetime itself, it means there was not + # hour/minute/second information to begin with. Mind the 'localize' to + # avoid time-zone BS to kick in. + is_date = dt_d == as_dt.tz_localize(None) + if is_date: + return dtype.date + if is_datetime: + return dtype.datetime + + return None + + def count_data_types_in_column(self, data): + dtype_counts = Counter() + + type_checkers = [self.get_numeric_type, + self.type_check_sequence, + self.get_binary_type, + self.type_check_date] + + for element in data: + for type_checker in type_checkers: + try: + dtype_guess = type_checker(element) + except Exception: + dtype_guess = None + if dtype_guess is not None: + dtype_counts[dtype_guess] += 1 + break + else: + dtype_counts[dtype.invalid] += 1 + + return dtype_counts + + def get_column_data_type(self, + data: Union[pd.Series, np.ndarray, list], + full_data: pd.DataFrame, + col_name: str, + pct_invalid: float + ): + """ + Provided the column data, define its data type and data subtype. + + :param data: an iterable containing a sample of the data frame + :param full_data: an iterable containing the whole column of a data frame + + :return: type and type distribution, we can later use type_distribution to determine data quality + NOTE: type distribution is the count that this column has for belonging cells to each DATA_TYPE + """ + log.info(f'Infering type for: {col_name}') + additional_info = {'other_potential_dtypes': []} + + warn = [] + info = [] + if len(data) == 0: + warn.append(f'Column {col_name} has no data in it. 
') + warn.append(f'Please remove {col_name} from the training file or fill in some of the values !') + return None, None, additional_info, warn, info + + dtype_counts = self.count_data_types_in_column(data) + + known_dtype_dist = {k: v for k, v in dtype_counts.items()} + if dtype.float in known_dtype_dist and dtype.integer in known_dtype_dist: + known_dtype_dist[dtype.float] += known_dtype_dist[dtype.integer] + del known_dtype_dist[dtype.integer] + + if dtype.datetime in known_dtype_dist and dtype.date in known_dtype_dist: + known_dtype_dist[dtype.datetime] += known_dtype_dist[dtype.date] + del known_dtype_dist[dtype.date] + + max_known_dtype, max_known_dtype_count = max( + known_dtype_dist.items(), + key=lambda kv: kv[1] + ) + + actual_pct_invalid = 100 * (len(data) - max_known_dtype_count) / len(data) + if max_known_dtype is None or max_known_dtype == dtype.invalid: + curr_dtype = None + elif actual_pct_invalid > self.config['pct_invalid']: + if max_known_dtype in (dtype.integer, dtype.float) and actual_pct_invalid <= 5 * self.config['pct_invalid']: + curr_dtype = max_known_dtype + else: + curr_dtype = None + else: + curr_dtype = max_known_dtype + + nr_vals = len(data) + nr_distinct_vals = len(set([str(x) for x in data])) + + # Is it a quantity? + if curr_dtype not in (dtype.datetime, dtype.date): + is_quantity, quantitiy_info = self.get_quantity_col_info(data) + if is_quantity: + additional_info['quantitiy_info'] = quantitiy_info + curr_dtype = dtype.quantity + known_dtype_dist = { + dtype.quantity: nr_vals + } + + # Check for Tags subtype + if curr_dtype not in (dtype.quantity, dtype.num_array): + lengths = [] + unique_tokens = set() + + can_be_tags = False + if all(isinstance(x, str) for x in data): + can_be_tags = True + + mean_lenghts = np.mean(lengths) if len(lengths) > 0 else 0 + + # If more than 30% of the samples contain more than 1 category and there's more than 6 and less than 30 of them and they are shared between the various cells # noqa + if (can_be_tags and mean_lenghts > 1.3 and + 6 <= len(unique_tokens) <= 30 and + len(unique_tokens) / mean_lenghts < (len(data) / 4)): + curr_dtype = dtype.tags + + # Categorical based on unique values + if curr_dtype not in (dtype.date, dtype.datetime, dtype.tags, dtype.cat_array): + if curr_dtype in (dtype.integer, dtype.float): + is_categorical = nr_distinct_vals < 10 + else: + is_categorical = nr_distinct_vals < min(max((nr_vals / 100), 10), 3000) + + if is_categorical: + if curr_dtype is not None: + additional_info['other_potential_dtypes'].append(curr_dtype) + curr_dtype = dtype.categorical + + # If curr_data_type is still None, then it's text or category + if curr_dtype is None: + log.info(f'Doing text detection for column: {col_name}') + lang_dist = get_language_dist(data) # TODO: bottleneck + + # Normalize lang probabilities + for lang in lang_dist: + lang_dist[lang] /= len(data) + + # If most cells are unknown language then it's categorical + if lang_dist['Unknown'] > 0.5: + curr_dtype = dtype.categorical + else: + nr_words, word_dist, nr_words_dist = analyze_sentences(data) # TODO: maybe pass entire corpus at once + + if 1 in nr_words_dist and nr_words_dist[1] == nr_words: + curr_dtype = dtype.categorical + else: + if len(word_dist) > 500 and nr_words / len(data) > 5: + curr_dtype = dtype.rich_text + else: + curr_dtype = dtype.short_text + + return curr_dtype, {curr_dtype: len(data)}, additional_info, warn, info + + if curr_dtype in [dtype.categorical, dtype.rich_text, dtype.short_text, dtype.cat_array]: + known_dtype_dist = 
{curr_dtype: len(data)} + + if nr_distinct_vals < 3 and curr_dtype == dtype.categorical: + curr_dtype = dtype.binary + known_dtype_dist[dtype.binary] = known_dtype_dist[dtype.categorical] + del known_dtype_dist[dtype.categorical] + + log.info(f'Column {col_name} has data type {curr_dtype}') + return curr_dtype, known_dtype_dist, additional_info, warn, info + diff --git a/type_infer/rule_based/helpers.py b/type_infer/rule_based/helpers.py new file mode 100644 index 0000000..8e3049b --- /dev/null +++ b/type_infer/rule_based/helpers.py @@ -0,0 +1,182 @@ +import re +import nltk +import string +from typing import Iterable +from collections import Counter, defaultdict + +import numpy as np +import scipy.stats as st +from langid.langid import LanguageIdentifier +from langid.langid import model as langid_model + +from type_infer.dtype import dtype + + +try: + nltk.data.find('tokenizers/punkt') +except LookupError: + nltk.download('punkt') + +try: + from nltk.corpus import stopwords + stopwords.words('english') +except LookupError: + nltk.download('stopwords', quiet=True) + + +def get_identifier_description_mp(arg_tup): + data, column_name, data_dtype = arg_tup + return get_identifier_description(data, column_name, data_dtype) + + +def get_identifier_description(data: Iterable, column_name: str, data_dtype: dtype): + data = list(data) + if isinstance(data[0], list): + nr_unique = len(set(tuple(x) for x in data)) + elif isinstance(data[0], dict): + nr_unique = len(set(str(x) for x in data)) + else: + nr_unique = len(set(data)) + + if nr_unique == 1: + return 'No Information' + + unique_pct = nr_unique / len(data) + + spaces = [len(str(x).split(' ')) - 1 for x in data] + mean_spaces = np.mean(spaces) if len(spaces) > 0 else 0.0 + + # Detect hash + all_same_length = all(len(str(data[0])) == len(str(x)) for x in data) + uuid_charset = set('0123456789abcdefABCDEF-') + all_uuid_charset = all(set(str(x)).issubset(uuid_charset) for x in data) + is_uuid = all_uuid_charset and all_same_length + + if all_same_length and len(data) == nr_unique and data_dtype not in (dtype.integer, dtype.float): + str_data = [str(x) for x in data] + randomness_per_index = [] + for i, _ in enumerate(str_data[0]): + N = len(set(x[i] for x in str_data)) + S = st.entropy([*Counter(x[i] for x in str_data).values()]) + if S == 0: + randomness_per_index.append(0.0) + else: + randomness_per_index.append(S / np.log(N)) + + mean_randomness = np.mean(randomness_per_index) if len(randomness_per_index) > 0 else 0 + if mean_randomness > 0.95: + return 'Hash-like identifier' + + # Detect foreign key + if data_dtype == dtype.integer: + if _is_foreign_key_name(column_name): + return 'Foreign key' + + if _is_identifier_name(column_name) or data_dtype in (dtype.categorical, dtype.binary): + if unique_pct > 0.98: + if is_uuid: + return 'UUID' + else: + return 'Unknown identifier' + + # Everything is unique and it's too short to be rich text + if data_dtype in (dtype.categorical, dtype.binary, dtype.short_text, dtype.rich_text) and \ + unique_pct > 0.99999 and mean_spaces < 1: + return 'Unknown identifier' + + return None + + +def _is_foreign_key_name(name): + for endings in ['id', 'ID', 'Id']: + for add in ['-', '_', ' ']: + if name.endswith(add + endings): + return True + for endings in ['ID', 'Id']: + if name.endswith(endings): + return True + return False + + +def _is_identifier_name(name): + for keyword in ['account', 'uuid', 'identifier', 'user']: + if keyword in name: + return True + return False + + +def get_language_dist(data): + lang_dist 
= defaultdict(lambda: 0) + lang_dist['Unknown'] = 0 + lang_probs_cache = dict() + identifier = LanguageIdentifier.from_modelstring(langid_model, norm_probs=True) + for text in data: + text = str(text) + text = text.translate(str.maketrans('', '', string.punctuation)) + if text not in lang_probs_cache: + try: + lang_probs = identifier.classify(text) + except Exception: + lang_probs = [] + lang_probs_cache[text] = lang_probs + + lang_probs = lang_probs_cache[text] + if len(lang_probs) > 0 and lang_probs[1] > 10 * (1 / len(identifier.nb_classes)): + lang_dist[lang_probs[0]] += 1 + else: + lang_dist['Unknown'] += 1 + + return dict(lang_dist) + + +def analyze_sentences(data): + nr_words = 0 + word_dist = defaultdict(int) + nr_words_dist = defaultdict(int) + stop_words = set(stopwords.words('english')) + for text in map(str, data): + text = text.lower() + text_dist = defaultdict(int) + tokens = tokenize_text(text) + tokens_no_stop = (x for x in tokens if x not in stop_words) + for tok in tokens_no_stop: + text_dist[tok] += 1 + + n_tokens = len(text_dist) + nr_words_dist[n_tokens] += 1 + nr_words += n_tokens + + # merge text_dist into word_dist + for k, v in text_dist.items(): + word_dist[k] += v + + return nr_words, dict(word_dist), dict(nr_words_dist) + + +def contains_alnum(text): + for c in text: + if c.isalnum(): + return True + return False + + +def tokenize_text(text): + """ Generator instead of list comprehension for optimal memory usage & runtime """ + return (t.lower() for t in nltk.word_tokenize(decontracted(text)) if contains_alnum(t)) + + +def decontracted(phrase): + # specific + phrase = re.sub(r"won\'t", "will not", phrase) + phrase = re.sub(r"can\'t", "can not", phrase) + + # general + phrase = re.sub(r"n\'t", " not", phrase) + phrase = re.sub(r"\'re", " are", phrase) + phrase = re.sub(r"\'s", " is", phrase) + phrase = re.sub(r"\'d", " would", phrase) + phrase = re.sub(r"\'ll", " will", phrase) + phrase = re.sub(r"\'t", " not", phrase) + phrase = re.sub(r"\'ve", " have", phrase) + phrase = re.sub(r"\'m", " am", phrase) + return phrase
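For reference, two minimal sketches of the API surface introduced by this patch. They are illustrative only: the CSV path is the sample file used by the integration tests, and ConstantEngine is a hypothetical engine invented here purely to show the BaseEngine contract, not something shipped in the package.

# Sketch 1: the new config-driven entry point in type_infer.api.
# Keys omitted from the config fall back to the defaults set in api.py:
# engine='rule_based', pct_invalid=2, seed=420, mp_cutoff=1e4.
import pandas as pd
from type_infer.api import infer_types

df = pd.read_csv('tests/data/airline_sentiment_sample.csv')  # sample data from the test suite
config = {'engine': 'rule_based', 'pct_invalid': 0}
type_info = infer_types(df, config=config)

print(type_info.dtypes)       # column name -> inferred dtype
print(type_info.identifiers)  # columns flagged as identifiers, if any

# Sketch 2: the BaseEngine contract that future engines (e.g. the BERType stub in
# type_infer/bert/core.py) are expected to follow. api.infer_types currently dispatches
# only ENGINES.RULE_BASED, so a custom engine would have to be invoked directly.
from type_infer.base import BaseEngine, TypeInformation
from type_infer.dtype import dtype


class ConstantEngine(BaseEngine):
    """Toy engine that labels every column as categorical (illustration only)."""

    def __init__(self):
        super().__init__(stable=False)  # mark as experimental

    def infer(self, df: pd.DataFrame) -> TypeInformation:
        info = TypeInformation()
        for col in df.columns:
            info.dtypes[col] = dtype.categorical
            info.additional_info[col] = {'dtype_dist': {dtype.categorical: len(df)}}
        return info


print(ConstantEngine().infer(pd.DataFrame({'a': [1, 2, 3]})).dtypes)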