From 425eb203db51d105e9e831e119600e1a7634343d Mon Sep 17 00:00:00 2001
From: Forest Gregg <fgregg@datamade.us>
Date: Thu, 27 Jun 2024 06:59:06 -0400
Subject: [PATCH] Revert "more direct set up data model (#1193)"

This reverts commit 88558420c15f79b6153ec3e123a7662c09edbbf4.
---
 .flake8                                      |   2 +-
 .pre-commit-config.yaml                      |   5 -
 CHANGELOG.md                                 |   5 -
 benchmarks/benchmarks/canonical.py           |  12 +-
 benchmarks/benchmarks/canonical_gazetteer.py |   9 +-
 benchmarks/benchmarks/canonical_matching.py  |   9 +-
 benchmarks/benchmarks/common.py              |   6 +-
 dedupe/__init__.py                           |  15 --
 dedupe/_typing.py                            |  44 ++-
 dedupe/api.py                                |   7 +-
 dedupe/convenience.py                        |   5 +-
 dedupe/core.py                               |   2 +-
 dedupe/datamodel.py                          | 170 +++++++-----
 dedupe/predicates.py                         |   4 +-
 dedupe/training.py                           |   3 +-
 dedupe/variables/__init__.py                 |  26 +-
 dedupe/variables/base.py                     |  90 ++++---
 dedupe/variables/categorical_type.py         |  22 +-
 dedupe/variables/exists.py                   |  10 +-
 dedupe/variables/interaction.py              |  19 +-
 dedupe/variables/set.py                      |  15 +-
 dedupe/variables/string.py                   |  26 +-
 docs/Variable-definition.rst                 | 270 +++++++++++--------
 pyproject.toml                               |   5 +-
 tests/test_api.py                            |  17 +-
 tests/test_core.py                           |  19 +-
 tests/test_dedupe.py                         |  29 +-
 tests/test_labeler.py                        |   3 +-
 tests/test_serializer.py                     |   2 +-
 tests/test_training.py                       |   2 +-
 30 files changed, 467 insertions(+), 386 deletions(-)

diff --git a/.flake8 b/.flake8
index 7350ce301..0e85dce10 100644
--- a/.flake8
+++ b/.flake8
@@ -1,3 +1,3 @@
 [flake8]
 max-line-length=160
-extend-ignore = E203
\ No newline at end of file
+extend-ignore = E203
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 4896e74f6..320205765 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -8,8 +8,3 @@ repos:
     hooks:
       - id: isort
         name: isort (python)
-  - repo: https://github.com/pycqa/flake8
-    rev: "7.1.0"
-    hooks:
-      - id: flake8
-        args: [--config=.flake8]
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8a9ee9c76..ef3a19bfc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,8 +1,3 @@
-# 3.0.0
-- Development in python packaging made supporting the previous namespace approach for
-  variable plugins untenable. Since we had to redo the way we defined the data model, 
-  we took the opportunity to explicity instantiate variable objects. 
-
 # 2.0.6
 - fixed bug that was preventing learning of index predicates in Dedupe mode
 
diff --git a/benchmarks/benchmarks/canonical.py b/benchmarks/benchmarks/canonical.py
index 6f23bb08d..da075b9ba 100644
--- a/benchmarks/benchmarks/canonical.py
+++ b/benchmarks/benchmarks/canonical.py
@@ -32,19 +32,17 @@ def make_report(self, clustering):
         return make_report(self.data, clustering)
 
     def run(self, use_settings=False):
-        deduper: dedupe.StaticDedupe | dedupe.Dedupe
-
         if use_settings and os.path.exists(self.settings_file):
             with open(self.settings_file, "rb") as f:
                 deduper = dedupe.StaticDedupe(f)
 
         else:
             variables = [
-                dedupe.variables.String("name"),
-                dedupe.variables.Exact("name"),
-                dedupe.variables.String("address"),
-                dedupe.variables.ShortString("cuisine", has_missing=True),
-                dedupe.variables.ShortString("city"),
+                {"field": "name", "type": "String"},
+                {"field": "name", "type": "Exact"},
+                {"field": "address", "type": "String"},
+                {"field": "cuisine", "type": "ShortString", "has missing": True},
+                {"field": "city", "type": "ShortString"},
             ]
 
             deduper = dedupe.Dedupe(variables, num_cores=5)
diff --git a/benchmarks/benchmarks/canonical_gazetteer.py b/benchmarks/benchmarks/canonical_gazetteer.py
index 1f73b8e20..bdbc51ba1 100644
--- a/benchmarks/benchmarks/canonical_gazetteer.py
+++ b/benchmarks/benchmarks/canonical_gazetteer.py
@@ -25,17 +25,16 @@ def make_report(self, clustering):
 
     def run(self, kwargs, use_settings=False):
         data_1, data_2 = self.data
-        gazetteer: dedupe.StaticGazetteer | dedupe.Gazetteer
 
         if use_settings and os.path.exists(self.settings_file):
             with open(self.settings_file, "rb") as f:
                 gazetteer = dedupe.StaticGazetteer(f)
         else:
             variables = [
-                dedupe.variables.String("name"),
-                dedupe.variables.String("address"),
-                dedupe.variables.String("cuisine"),
-                dedupe.variables.String("city"),
+                {"field": "name", "type": "String"},
+                {"field": "address", "type": "String"},
+                {"field": "cuisine", "type": "String"},
+                {"field": "city", "type": "String"},
             ]
 
             gazetteer = dedupe.Gazetteer(variables)
diff --git a/benchmarks/benchmarks/canonical_matching.py b/benchmarks/benchmarks/canonical_matching.py
index b1c3c823b..471cd4988 100644
--- a/benchmarks/benchmarks/canonical_matching.py
+++ b/benchmarks/benchmarks/canonical_matching.py
@@ -42,17 +42,16 @@ def setup(self, kwargs):
 
     def run(self, kwargs, use_settings=False):
         data_1, data_2 = self.data
-        deduper: dedupe.StaticRecordLink | dedupe.RecordLink
 
         if use_settings and os.path.exists(self.settings_file):
             with open(self.settings_file, "rb") as f:
                 deduper = dedupe.StaticRecordLink(f)
         else:
             variables = [
-                dedupe.variables.String("name"),
-                dedupe.variables.String("address"),
-                dedupe.variables.String("cuisine"),
-                dedupe.variables.String("city"),
+                {"field": "name", "type": "String"},
+                {"field": "address", "type": "String"},
+                {"field": "cuisine", "type": "String"},
+                {"field": "city", "type": "String"},
             ]
             deduper = dedupe.RecordLink(variables)
             deduper.prepare_training(
diff --git a/benchmarks/benchmarks/common.py b/benchmarks/benchmarks/common.py
index 17e129f99..afe993274 100644
--- a/benchmarks/benchmarks/common.py
+++ b/benchmarks/benchmarks/common.py
@@ -54,9 +54,9 @@ def get_true_dupes(data: dict) -> set:
         sorted(data.items(), key=lambda x: x[1]["unique_id"]),
         key=lambda x: x[1]["unique_id"],
     ):
-        pair_l = list(pair)
-        if len(pair_l) == 2:
-            a, b = pair_l
+        pair = list(pair)
+        if len(pair) == 2:
+            a, b = pair
             duplicates.add(frozenset((a[0], b[0])))
     return duplicates
 
diff --git a/dedupe/__init__.py b/dedupe/__init__.py
index 7ef7d4c77..726836a72 100644
--- a/dedupe/__init__.py
+++ b/dedupe/__init__.py
@@ -13,18 +13,3 @@
     training_data_link,
 )
 from dedupe.serializer import read_training, write_training  # noqa: F401
-
-__all__ = [
-    "Dedupe",
-    "Gazetteer",
-    "RecordLink",
-    "StaticDedupe",
-    "StaticGazetteer",
-    "StaticRecordLink",
-    "canonicalize",
-    "console_label",
-    "training_data_dedupe",
-    "training_data_link",
-    "read_training",
-    "write_training",
-]
diff --git a/dedupe/_typing.py b/dedupe/_typing.py
index 9de9eb5f7..65922f7e5 100644
--- a/dedupe/_typing.py
+++ b/dedupe/_typing.py
@@ -4,18 +4,17 @@
     TYPE_CHECKING,
     Any,
     Callable,
+    Collection,
     Dict,
     FrozenSet,
     Iterable,
     Iterator,
     List,
     Mapping,
-    MutableSequence,
     Sequence,
     Tuple,
     Type,
     Union,
-    runtime_checkable,
 )
 
 import numpy
@@ -73,7 +72,6 @@
 LookupResults = Union[LookupResultsInt, LookupResultsStr]
 JoinConstraint = Literal["one-to-one", "many-to-one", "many-to-many"]
 Comparator = Callable[[Any, Any], Union[Union[int, float], Sequence[Union[int, float]]]]
-CustomComparator = Callable[[Any, Any], Union[int, float]]
 Scores = Union[numpy.memmap, numpy.ndarray]
 Labels = List[Literal[0, 1]]
 LabelsLike = Iterable[Literal[0, 1]]
@@ -83,10 +81,28 @@
 ComparisonCover = Union[ComparisonCoverInt, ComparisonCoverStr]
 PredicateFunction = Callable[[Any], FrozenSet[str]]
 
+VariableDefinition = TypedDict(
+    "VariableDefinition",
+    {
+        "type": str,
+        "field": str,
+        "variable name": str,
+        "corpus": Iterable[Union[str, Collection[str]]],
+        "comparator": Callable[
+            [Any, Any], Union[int, float]
+        ],  # a custom comparator can only return a single float or int, not a sequence of numbers
+        "categories": List[str],
+        "interaction variables": List[str],
+        "has missing": bool,
+        "name": str,
+    },
+    total=False,
+)
+
 
 class TrainingData(TypedDict):
-    match: MutableSequence[RecordDictPair]
-    distinct: MutableSequence[RecordDictPair]
+    match: List[RecordDictPair]
+    distinct: List[RecordDictPair]
 
 
 # Takes pairs of records and generates a (n_samples X n_features) array
@@ -111,24 +127,6 @@ def close(self) -> None: ...
     def join(self) -> None: ...
 
 
-class Variable(Protocol):
-    name: str
-    predicates: List["Predicate"]
-    has_missing: bool
-
-    def __len__(self) -> int: ...
-
-
-@runtime_checkable
-class FieldVariable(Variable, Protocol):
-    field: str
-    comparator: Comparator
-
-
-class InteractionVariable(Variable, Protocol):
-    interaction_fields: List[str]
-
-
 MapLike = Callable[[Callable[[Any], Any], Iterable], Iterable]
 
 PathLike = Union[str, os.PathLike]
diff --git a/dedupe/api.py b/dedupe/api.py
index 9d17bc0b0..1e2e1f438 100644
--- a/dedupe/api.py
+++ b/dedupe/api.py
@@ -14,7 +14,7 @@
 import sqlite3
 import tempfile
 import warnings
-from typing import TYPE_CHECKING, Literal, cast, overload
+from typing import TYPE_CHECKING, cast, overload
 
 import numpy
 import sklearn.linear_model
@@ -27,6 +27,7 @@
 import dedupe.labeler as labeler
 import dedupe.predicates
 import dedupe.serializer as serializer
+from dedupe._typing import Literal
 
 if TYPE_CHECKING:
     from typing import (
@@ -69,7 +70,7 @@
         Scores,
         TrainingData,
         TupleLinks,
-        Variable,
+        VariableDefinition,
     )
 
 logger = logging.getLogger(__name__)
@@ -1116,7 +1117,7 @@ class ActiveMatching(Matching):
 
     def __init__(
         self,
-        variable_definition: Collection[Variable],
+        variable_definition: Collection[VariableDefinition],
         num_cores: int | None = None,
         in_memory: bool = False,
         **kwargs,
diff --git a/dedupe/convenience.py b/dedupe/convenience.py
index 26b886a4a..fb24de259 100644
--- a/dedupe/convenience.py
+++ b/dedupe/convenience.py
@@ -7,7 +7,7 @@
 import random
 import sys
 import warnings
-from typing import Iterator, Literal, Tuple, overload
+from typing import Iterator, Tuple, overload
 
 import numpy
 
@@ -15,6 +15,7 @@
 from dedupe._typing import (
     DataInt,
     DataStr,
+    Literal,
     RecordDict,
     RecordDictPair,
     RecordID,
@@ -134,7 +135,7 @@ def console_label(deduper: dedupe.api.ActiveMatching) -> None:  # pragma: no cov
 
     finished = False
     use_previous = False
-    fields = unique(var.field for var in deduper.data_model.field_variables)
+    fields = unique(var.field for var in deduper.data_model.primary_variables)
 
     buffer_len = 1  # Max number of previous operations
     unlabeled: list[RecordDictPair] = []
diff --git a/dedupe/core.py b/dedupe/core.py
index 975c08469..a9ffd55b5 100644
--- a/dedupe/core.py
+++ b/dedupe/core.py
@@ -22,7 +22,6 @@
         Generator,
         Iterable,
         Iterator,
-        Literal,
         Optional,
         Sequence,
         Type,
@@ -36,6 +35,7 @@
         ClosableJoinable,
         Data,
         FeaturizerFunction,
+        Literal,
         MapLike,
         RecordID,
         RecordIDDType,
diff --git a/dedupe/datamodel.py b/dedupe/datamodel.py
index 1b84b8aff..49956bc6b 100644
--- a/dedupe/datamodel.py
+++ b/dedupe/datamodel.py
@@ -1,80 +1,54 @@
 from __future__ import annotations
 
 import copyreg
+import pkgutil
 import types
-from collections.abc import Mapping
 from typing import TYPE_CHECKING, cast
 
 import numpy
 
-from dedupe._typing import FieldVariable
+import dedupe.variables
+from dedupe.variables.base import FieldType as FieldVariable
+from dedupe.variables.base import MissingDataType, Variable
 from dedupe.variables.interaction import InteractionType
 
+for _, module, _ in pkgutil.iter_modules(  # type: ignore
+    dedupe.variables.__path__, "dedupe.variables."
+):
+    __import__(module)
+
 if TYPE_CHECKING:
-    from typing import Collection, Generator, Iterable, Sequence
+    from typing import Generator, Iterable, Sequence
 
     from dedupe._typing import (
         Comparator,
-        InteractionVariable,
         RecordDict,
         RecordDictPair,
-        Variable,
+        VariableDefinition,
     )
     from dedupe.predicates import Predicate
 
+VARIABLE_CLASSES = {k: v for k, v in FieldVariable.all_subclasses() if k}
 
-class DataModel:
-    version = 2
 
-    def __init__(self, variable_definitions: Collection[Variable]):
-        for item in variable_definitions:
-            if isinstance(item, Mapping):
-                raise ValueError(
-                    "It looks like you are trying to use a variable definition "
-                    "composed of dictionaries. dedupe 3.0 uses variable objects "
-                    'directly. So instead of [{"field": "name", "type": "String"}] '
-                    'we now do [dedupe.variables.String("name")].'
-                )
+class DataModel(object):
+    version = 1
 
+    def __init__(self, variable_definitions: Iterable[VariableDefinition]):
         variable_definitions = list(variable_definitions)
         if not variable_definitions:
             raise ValueError("The variable definitions cannot be empty")
-        if not any(variable.predicates for variable in variable_definitions):
-            raise ValueError(
-                "At least one of the variable types needs to be a type"
-                "other than 'Custom'. 'Custom' types have no associated"
-                "blocking rules"
-            )
-
-        # This is a protocol check, not a class inheritance check
-        self.field_variables: list[FieldVariable] = [
-            variable
-            for variable in variable_definitions
-            if isinstance(variable, FieldVariable)
-        ]
-
-        # we need to keep track of ordering of variables because in
-        # order to calculate derived fields like interaction and missing
-        # data fields.
-        columns: list[Variable] = []
-        for variable in self.field_variables:
-            if len(variable) == 1:
-                columns.append(variable)
-            elif len(variable) > 1:
-                assert hasattr(variable, "higher_vars")
-                columns.extend(variable.higher_vars)
+        all_variables: list[Variable]
+        self.primary_variables, all_variables = typify_variables(variable_definitions)
+        self._derived_start = len(all_variables)
 
-        self._derived_start = len(columns)
+        all_variables += interactions(variable_definitions, self.primary_variables)
+        all_variables += missing(all_variables)
 
-        # i'm not really satisfied with how we are dealing with interactions
-        # here. seems like there should be a cleaner path, but i don't see it
-        # today
-        columns += interactions(variable_definitions, self.field_variables)
+        self._missing_field_indices = missing_field_indices(all_variables)
+        self._interaction_indices = interaction_indices(all_variables)
 
-        self._missing_field_indices = missing_field_indices(columns)
-        self._interaction_indices = interaction_indices(columns)
-
-        self._len = len(columns) + len(self._missing_field_indices)
+        self._len = len(all_variables)
 
     def __len__(self) -> int:
         return self._len
@@ -89,7 +63,7 @@ def _field_comparators(
     ) -> Generator[tuple[str, Comparator, int, int], None, None]:
         start = 0
         stop = 0
-        for var in self.field_variables:
+        for var in self.primary_variables:
             stop = start + len(var)
             comparator = cast("Comparator", var.comparator)
             yield (var.field, comparator, start, stop)
@@ -98,7 +72,7 @@ def _field_comparators(
     @property
     def predicates(self) -> set[Predicate]:
         predicates = set()
-        for var in self.field_variables:
+        for var in self.primary_variables:
             for predicate in var.predicates:
                 predicates.add(predicate)
         return predicates
@@ -158,26 +132,100 @@ def __getstate__(self):
         return d
 
     def __setstate__(self, d):
-        version = d.pop("object_version", None)
+        version = d.pop("version", None)
         if version is None and "_variables" in d:
             d["_len"] = len(d.pop("_variables"))
             d["primary_variables"] = d.pop("primary_fields")
-        elif version == 1:
-            d["field_variables"] = d.pop("primary_variables")
 
         self.__dict__ = d
 
 
+def typify_variables(
+    variable_definitions: Iterable[VariableDefinition],
+) -> tuple[list[FieldVariable], list[Variable]]:
+    primary_variables: list[FieldVariable] = []
+    all_variables: list[Variable] = []
+    only_custom = True
+
+    for definition in variable_definitions:
+        try:
+            variable_type = definition["type"]
+        except TypeError:
+            raise TypeError(
+                "Incorrect variable specification: variable "
+                "specifications are dictionaries that must "
+                "include a type definition, ex. "
+                "{'field' : 'Phone', type: 'String'}"
+            )
+        except KeyError:
+            raise KeyError(
+                "Missing variable type: variable "
+                "specifications are dictionaries that must "
+                "include a type definition, ex. "
+                "{'field' : 'Phone', type: 'String'}"
+            )
+
+        if variable_type != "Custom":
+            only_custom = False
+
+        if variable_type == "Interaction":
+            continue
+
+        if variable_type == "FuzzyCategorical" and "other fields" not in definition:
+            definition["other fields"] = [  # type: ignore
+                d["field"]
+                for d in variable_definitions
+                if ("field" in d and d["field"] != definition["field"])
+            ]
+
+        try:
+            variable_class = VARIABLE_CLASSES[variable_type]
+        except KeyError:
+            raise KeyError(
+                "Field type %s not valid. Valid types include %s"
+                % (definition["type"], ", ".join(VARIABLE_CLASSES))
+            )
+
+        variable_object = variable_class(definition)
+        assert isinstance(variable_object, FieldVariable)
+
+        primary_variables.append(variable_object)
+
+        if hasattr(variable_object, "higher_vars"):
+            all_variables.extend(variable_object.higher_vars)
+        else:
+            variable_object = cast(Variable, variable_object)
+            all_variables.append(variable_object)
+
+    if only_custom:
+        raise ValueError(
+            "At least one of the variable types needs to be a type"
+            "other than 'Custom'. 'Custom' types have no associated"
+            "blocking rules"
+        )
+
+    return primary_variables, all_variables
+
+
+def missing(variables: list[Variable]) -> list[MissingDataType]:
+    missing_variables = []
+    for var in variables:
+        if var.has_missing:
+            missing_variables.append(MissingDataType(var.name))
+    return missing_variables
+
+
 def interactions(
-    variables: Iterable[Variable], primary_variables: Iterable[FieldVariable]
-) -> list[InteractionVariable]:
+    definitions: Iterable[VariableDefinition], primary_variables: list[FieldVariable]
+) -> list[InteractionType]:
     field_d = {field.name: field for field in primary_variables}
 
-    interactions: list[InteractionVariable] = []
-    for variable in variables:
-        if isinstance(variable, InteractionType):
-            variable.expandInteractions(field_d)
-            interactions.extend(variable.higher_vars)
+    interactions = []
+    for definition in definitions:
+        if definition["type"] == "Interaction":
+            var = InteractionType(definition)
+            var.expandInteractions(field_d)
+            interactions.extend(var.higher_vars)
     return interactions
 
 
diff --git a/dedupe/predicates.py b/dedupe/predicates.py
index 1e07c672e..2d180ee8d 100644
--- a/dedupe/predicates.py
+++ b/dedupe/predicates.py
@@ -17,9 +17,9 @@
 from dedupe.predicate_functions import *  # noqa: F401, F403
 
 if TYPE_CHECKING:
-    from typing import AbstractSet, Any, FrozenSet, Iterable, Literal, Mapping, Sequence
+    from typing import AbstractSet, Any, FrozenSet, Iterable, Mapping, Sequence
 
-    from dedupe._typing import PredicateFunction, RecordDict
+    from dedupe._typing import Literal, PredicateFunction, RecordDict
     from dedupe.index import Index
 
 
diff --git a/dedupe/training.py b/dedupe/training.py
index 98c9f28df..eb6fe0a04 100644
--- a/dedupe/training.py
+++ b/dedupe/training.py
@@ -13,7 +13,7 @@
 from . import blocking, branch_and_bound
 
 if TYPE_CHECKING:
-    from typing import Iterable, Literal, Sequence
+    from typing import Iterable, Sequence
 
     from ._typing import (
         ComparisonCover,
@@ -23,6 +23,7 @@
         Data,
         DataInt,
         DataStr,
+        Literal,
     )
     from ._typing import RecordDictPairs as TrainingExamples
     from ._typing import RecordID, RecordIDPair
diff --git a/dedupe/variables/__init__.py b/dedupe/variables/__init__.py
index 39c339c68..b36383a61 100644
--- a/dedupe/variables/__init__.py
+++ b/dedupe/variables/__init__.py
@@ -1,25 +1,3 @@
-from .base import CustomType as Custom
-from .categorical_type import CategoricalType as Categorical
-from .exact import ExactType as Exact
-from .exists import ExistsType as Exists
-from .interaction import InteractionType as Interaction
-from .latlong import LatLongType as LatLong
-from .price import PriceType as Price
-from .set import SetType as Set
-from .string import ShortStringType as ShortString
-from .string import StringType as String
-from .string import TextType as Text
+from pkgutil import extend_path
 
-__all__ = [
-    "Custom",
-    "Categorical",
-    "Exact",
-    "Exists",
-    "Interaction",
-    "LatLong",
-    "Price",
-    "Set",
-    "ShortString",
-    "String",
-    "Text",
-]
+__path__ = extend_path(__path__, __name__)
diff --git a/dedupe/variables/base.py b/dedupe/variables/base.py
index 71d6e722e..109d3332c 100644
--- a/dedupe/variables/base.py
+++ b/dedupe/variables/base.py
@@ -1,21 +1,20 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING
 
 from dedupe import predicates
 
 if TYPE_CHECKING:
-    from typing import Any, ClassVar, Iterable, Sequence, Type
+    from typing import Any, ClassVar, Generator, Iterable, Optional, Sequence, Type
 
-    from dedupe._typing import Comparator, CustomComparator, PredicateFunction
-    from dedupe._typing import Variable as VariableProtocol
+    from dedupe._typing import Comparator, PredicateFunction, VariableDefinition
 
 
 class Variable(object):
     name: str
     type: ClassVar[str]
     predicates: list[predicates.Predicate]
-    higher_vars: Sequence["VariableProtocol"]
+    higher_vars: Sequence["Variable"]
 
     def __len__(self) -> int:
         return 1
@@ -30,8 +29,16 @@ def __eq__(self, other: Any) -> bool:
         other_name: str = other.name
         return self.name == other_name
 
-    def __init__(self, has_missing: bool = False):
-        self.has_missing = has_missing
+    def __init__(self, definition: VariableDefinition):
+        if definition.get("has missing", False):
+            self.has_missing = True
+            try:
+                exists_pred = predicates.ExistsPredicate(definition["field"])
+                self.predicates.append(exists_pred)
+            except KeyError:
+                pass
+        else:
+            self.has_missing = False
 
     def __getstate__(self) -> dict[str, Any]:
         odict = self.__dict__.copy()
@@ -39,13 +46,31 @@ def __getstate__(self) -> dict[str, Any]:
 
         return odict
 
+    @classmethod
+    def all_subclasses(
+        cls,
+    ) -> Generator[tuple[Optional[str], Type["Variable"]], None, None]:
+        for q in cls.__subclasses__():
+            yield getattr(q, "type", None), q
+            for p in q.all_subclasses():
+                yield p
+
 
 class DerivedType(Variable):
     type = "Derived"
 
-    def __init__(self, name: str, var_type: str, **kwargs):
-        self.name = "(%s: %s)" % (str(name), str(var_type))
-        super().__init__(**kwargs)
+    def __init__(self, definition: VariableDefinition):
+        self.name = "(%s: %s)" % (str(definition["name"]), str(definition["type"]))
+        super(DerivedType, self).__init__(definition)
+
+
+class MissingDataType(Variable):
+    type = "MissingData"
+
+    def __init__(self, name: str):
+        self.name = "(%s: Not Missing)" % name
+
+        self.has_missing = False
 
 
 class FieldType(Variable):
@@ -55,15 +80,13 @@ class FieldType(Variable):
     _Predicate: Type[predicates.SimplePredicate] = predicates.SimplePredicate
     comparator: Comparator
 
-    def __init__(
-        self, field: str, name: Optional[str] = None, has_missing: bool = False
-    ):
-        self.field = field
+    def __init__(self, definition: VariableDefinition):
+        self.field = definition["field"]
 
-        if name is None:
-            self.name = "(%s: %s)" % (self.field, self.type)
+        if "variable name" in definition:
+            self.name = definition["variable name"]
         else:
-            self.name = name
+            self.name = "(%s: %s)" % (self.field, self.type)
 
         self.predicates = [
             self._Predicate(pred, self.field) for pred in self._predicate_functions
@@ -73,39 +96,30 @@ def __init__(
             self._index_predicates, self._index_thresholds, self.field
         )
 
-        self.has_missing = has_missing
-        if self.has_missing:
-            exists_pred = predicates.ExistsPredicate(self.field)
-            self.predicates.append(exists_pred)
+        super(FieldType, self).__init__(definition)
 
 
 class CustomType(FieldType):
     type = "Custom"
 
-    def __init__(
-        self,
-        field: str,
-        comparator: CustomComparator,
-        name: Optional[str] = None,
-        **kwargs,
-    ):
-        super().__init__(field, **kwargs)
-
-        if comparator is None:
-            raise ValueError(
-                "You must define a comparator function for the Custom class"
+    def __init__(self, definition: VariableDefinition):
+        super(CustomType, self).__init__(definition)
+
+        try:
+            self.comparator = definition["comparator"]  # type: ignore[assignment]
+        except KeyError:
+            raise KeyError(
+                "For 'Custom' field types you must define "
+                "a 'comparator' function in the field "
+                "definition. "
             )
-        else:
-            self.comparator = comparator
 
-        if name is None:
+        if "variable name" not in definition:
             self.name = "(%s: %s, %s)" % (
                 self.field,
                 self.type,
                 self.comparator.__name__,
             )
-        else:
-            self.name = name
 
 
 def indexPredicates(
diff --git a/dedupe/variables/categorical_type.py b/dedupe/variables/categorical_type.py
index c2dc56768..b9d3ef66b 100644
--- a/dedupe/variables/categorical_type.py
+++ b/dedupe/variables/categorical_type.py
@@ -1,11 +1,9 @@
 from __future__ import annotations
 
-from typing import Sequence
-
 from categorical import CategoricalComparator
 
 from dedupe import predicates
-from dedupe._typing import PredicateFunction
+from dedupe._typing import PredicateFunction, VariableDefinition
 from dedupe.variables.base import DerivedType, FieldType
 
 
@@ -13,14 +11,26 @@ class CategoricalType(FieldType):
     type = "Categorical"
     _predicate_functions: list[PredicateFunction] = [predicates.wholeFieldPredicate]
 
-    def __init__(self, field: str, categories: Sequence[str], **kwargs):
-        super().__init__(field, **kwargs)
+    def _categories(self, definition: VariableDefinition) -> list[str]:
+        try:
+            categories = definition["categories"]
+        except KeyError:
+            raise ValueError('No "categories" defined')
+
+        return categories
+
+    def __init__(self, definition: VariableDefinition):
+        super(CategoricalType, self).__init__(definition)
+
+        categories = self._categories(definition)
 
         self.comparator = CategoricalComparator(categories)  # type: ignore[assignment]
 
         self.higher_vars = []
         for higher_var in self.comparator.dummy_names:  # type: ignore[attr-defined]
-            dummy_var = DerivedType(higher_var, "Dummy", has_missing=False)
+            dummy_var = DerivedType(
+                {"name": higher_var, "type": "Dummy", "has missing": self.has_missing}
+            )
             self.higher_vars.append(dummy_var)
 
     def __len__(self) -> int:
diff --git a/dedupe/variables/exists.py b/dedupe/variables/exists.py
index 00ca7eb46..46c36c292 100644
--- a/dedupe/variables/exists.py
+++ b/dedupe/variables/exists.py
@@ -4,7 +4,7 @@
 
 from categorical import CategoricalComparator
 
-from dedupe._typing import PredicateFunction
+from dedupe._typing import PredicateFunction, VariableDefinition
 from dedupe.variables.base import DerivedType
 from dedupe.variables.categorical_type import CategoricalType
 
@@ -13,14 +13,16 @@ class ExistsType(CategoricalType):
     type = "Exists"
     _predicate_functions: list[PredicateFunction] = []
 
-    def __init__(self, field: str, **kwargs):
-        super().__init__(field, **kwargs)
+    def __init__(self, definition: VariableDefinition):
+        super(CategoricalType, self).__init__(definition)
 
         self.cat_comparator = CategoricalComparator([0, 1])
 
         self.higher_vars = []
         for higher_var in self.cat_comparator.dummy_names:
-            dummy_var = DerivedType(higher_var, "Dummy", has_missing=self.has_missing)
+            dummy_var = DerivedType(
+                {"name": higher_var, "type": "Dummy", "has missing": self.has_missing}
+            )
             self.higher_vars.append(dummy_var)
 
     def comparator(self, field_1: Any, field_2: Any) -> list[int]:
diff --git a/dedupe/variables/interaction.py b/dedupe/variables/interaction.py
index e9fbbe591..b0370e667 100644
--- a/dedupe/variables/interaction.py
+++ b/dedupe/variables/interaction.py
@@ -1,23 +1,24 @@
 from __future__ import annotations
 
 import itertools
-from typing import List, Mapping
+from typing import Mapping
 
-from dedupe._typing import FieldVariable, InteractionVariable
+from dedupe._typing import VariableDefinition
+from dedupe.variables.base import FieldType as FieldVariable
 from dedupe.variables.base import Variable
 
 
 class InteractionType(Variable):
     type = "Interaction"
-    higher_vars: List[InteractionVariable]
+    higher_vars: list["InteractionType"]
 
-    def __init__(self, *args: str, **kwargs):
-        self.interactions = list(args)
+    def __init__(self, definition: VariableDefinition):
+        self.interactions = definition["interaction variables"]
 
         self.name = "(Interaction: %s)" % str(self.interactions)
         self.interaction_fields = self.interactions
 
-        super().__init__(**kwargs)
+        super().__init__(definition)
 
     def expandInteractions(self, field_model: Mapping[str, FieldVariable]) -> None:
         self.interaction_fields = self.atomicInteractions(
@@ -41,12 +42,14 @@ def categorical(self, field_model: Mapping[str, FieldVariable]) -> None:
             if not hasattr(field_model[field], "higher_vars")
         ]
 
-        dummies = [field_model[field].higher_vars for field in categoricals]  # type: ignore[attr-defined]
+        dummies = [field_model[field].higher_vars for field in categoricals]
 
         self.higher_vars = []
         for combo in itertools.product(*dummies):
             var_names = [field.name for field in combo] + noncategoricals
-            higher_var = InteractionType(*var_names, has_missing=self.has_missing)
+            higher_var = InteractionType(
+                {"has missing": self.has_missing, "interaction variables": var_names}
+            )
             self.higher_vars.append(higher_var)
 
     def atomicInteractions(
diff --git a/dedupe/variables/set.py b/dedupe/variables/set.py
index 8b8253e59..fddfa5c5e 100644
--- a/dedupe/variables/set.py
+++ b/dedupe/variables/set.py
@@ -1,8 +1,7 @@
-from typing import Collection, Iterable, Optional
-
 from simplecosine.cosine import CosineSetSimilarity
 
 from dedupe import predicates
+from dedupe._typing import VariableDefinition
 from dedupe.variables.base import FieldType
 
 
@@ -25,12 +24,10 @@ class SetType(FieldType):
     )
     _index_thresholds = (0.2, 0.4, 0.6, 0.8)
 
-    def __init__(
-        self, field: str, corpus: Optional[Iterable[Collection[str]]] = None, **kwargs
-    ):
-        super().__init__(field, **kwargs)
+    def __init__(self, definition: VariableDefinition):
+        super(SetType, self).__init__(definition)
 
-        if corpus is None:
-            corpus = []
+        if "corpus" not in definition:
+            definition["corpus"] = []
 
-        self.comparator = CosineSetSimilarity(corpus)  # type: ignore[assignment]
+        self.comparator = CosineSetSimilarity(definition["corpus"])  # type: ignore[assignment]
diff --git a/dedupe/variables/string.py b/dedupe/variables/string.py
index 9a2bc8ab3..4272dba09 100644
--- a/dedupe/variables/string.py
+++ b/dedupe/variables/string.py
@@ -1,11 +1,11 @@
-from typing import Iterable, Optional, Sequence, Type
+from typing import Sequence, Type
 
 from affinegap import normalizedAffineGapDistance as affineGap
 from highered import CRFEditDistance
 from simplecosine.cosine import CosineTextSimilarity
 
 from dedupe import predicates
-from dedupe._typing import PredicateFunction
+from dedupe._typing import PredicateFunction, VariableDefinition
 from dedupe.variables.base import FieldType, indexPredicates
 
 crfEd = CRFEditDistance()
@@ -36,8 +36,8 @@ class BaseStringType(FieldType):
     _Predicate = predicates.StringPredicate
     _predicate_functions: Sequence[PredicateFunction] = ()
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
+    def __init__(self, definition: VariableDefinition):
+        super(BaseStringType, self).__init__(definition)
 
         self.predicates += indexPredicates(
             (
@@ -67,12 +67,10 @@ class ShortStringType(BaseStringType):
     ]
     _index_thresholds = (0.2, 0.4, 0.6, 0.8)
 
-    def __init__(
-        self, field: str, name: Optional[str] = None, crf: bool = False, **kwargs
-    ):
-        super().__init__(field, name=name, **kwargs)
+    def __init__(self, definition: VariableDefinition):
+        super(ShortStringType, self).__init__(definition)
 
-        if crf:
+        if definition.get("crf", False) is True:
             self.comparator = crfEd  # type: ignore[assignment]
         else:
             self.comparator = affineGap  # type: ignore[assignment]
@@ -100,10 +98,10 @@ class TextType(BaseStringType):
     ]
     _index_thresholds = (0.2, 0.4, 0.6, 0.8)
 
-    def __init__(self, field: str, corpus: Optional[Iterable[str]] = None, **kwargs):
-        super().__init__(field, **kwargs)
+    def __init__(self, definition: VariableDefinition):
+        super(TextType, self).__init__(definition)
 
-        if corpus is None:
-            corpus = []
+        if "corpus" not in definition:
+            definition["corpus"] = []
 
-        self.comparator = CosineTextSimilarity(corpus)  # type: ignore[assignment]
+        self.comparator = CosineTextSimilarity(definition["corpus"])  # type: ignore[assignment]
diff --git a/docs/Variable-definition.rst b/docs/Variable-definition.rst
index 5e6d19023..73abcccc2 100644
--- a/docs/Variable-definition.rst
+++ b/docs/Variable-definition.rst
@@ -3,30 +3,31 @@
 Variable Definitions
 ====================
 
-Variables
----------
+Variable Types
+--------------
 
-A variable definition describes the records that you want to match. It is sequence
-of Variable objects. For example:-
+A variable definition describes the records that you want to match. It is
+a list of dictionaries, each of which specifies one field to compare and
+how to compare it. For example:-
 
 .. code:: python
 
-    import dedupe.variables
-	  
     [
-        dedupe.variables.String("Site Name"),
-        dedupe.variables.String("Address"),
-	dedupe.variables.ShortString("Zip", has_missing=True),
-	dedupe.variables.String("Phone", has_missing=True)
+        {'field': 'Site name', 'type': 'String'},
+        {'field': 'Address', 'type': 'String'},
+        {'field': 'Zip', 'type': 'ShortString', 'has missing': True},
+        {'field': 'Phone', 'type': 'String', 'has missing': True}
     ]
 
 
-String
-^^^^^^
+String Types
+^^^^^^^^^^^^
 
-The ``String`` takes the key of the record field to compare.
+A ``String`` type field must declare the name of the record field to compare
+and a ``String`` type declaration. The ``String`` type expects fields to be of
+class string.
 
-``String`` variables are compared using string edit distance, specifically
+``String`` types are compared using string edit distance, specifically
 `affine gap string distance <http://en.wikipedia.org/wiki/Gap_penalty#Affine>`__.
 This is a good metric for measuring fields that might have typos in them,
 such as "John" vs "Jon".
@@ -35,45 +36,44 @@ For example:-
 
 .. code:: python
 
-  dedupe.variables.String("Address")
+  {'field': 'Address', 'type': 'String'}
 
-ShortString
-^^^^^^^^^^^
+ShortString Types
+^^^^^^^^^^^^^^^^^
 
-The ``ShortString`` variable is just like the ``String`` variable except that dedupe
+A ``ShortString`` type field is just like ``String`` types except that dedupe
 will not try to learn any :ref:`index blocking rules <index-blocks-label>` for these fields, which can
 speed up the training phase considerably.
 
-Zip codes and city names are good candidates for this variable. If in doubt,
+Zip codes and city names are good candidates for this type. If in doubt,
 always use ``String``.
 
 For example:-
 
 .. code:: python
 
-  dedupe.variables.ShortString("Zipcode")
+  {'field': 'Zipcode', 'type': 'ShortString'}
 
 .. _text-types-label:
 
-Text
-^^^^
+Text Types
+^^^^^^^^^^
 
 If you want to compare fields containing blocks of text e.g. product
-descriptions or article abstracts, you should use this variable. ``Text``
-variables are compared using the `cosine similarity metric
+descriptions or article abstracts, you should use this type. ``Text`` type
+fields are compared using the `cosine similarity metric
 <http://en.wikipedia.org/wiki/Vector_space_model>`__.
 
 This is a measurement of the amount of words that two documents have in
 common. This measure can be made more useful as the overlap of rare words
 counts more than the overlap of common words.
 
-Compare this to ``String`` and ``ShortString`` variables: For strings
-containing occupations, "yoga teacher" might be fairly similar to
-"yoga instructor" when using the ``Text`` measurement, because they
-both contain the relatively rare word of "yoga". However, if you
-compared these two strings using the ``String`` or ``ShortString``
-measurements, they might be considered fairly dissimilar, because the
-actual string edit distance between them is large.
+Compare this to ``String`` and ``ShortString`` types: For strings containing
+occupations, "yoga teacher" might be fairly similar to "yoga instructor" when
+using the ``Text`` measurement, because they both contain the relatively
+rare word of "yoga". However, if you compared these two strings using the
+``String`` or ``ShortString`` measurements, they might be considered fairly
+dissimilar, because the actual string edit distance between them is large.
 
 
 If provided a sequence of example fields (i.e. a corpus) then dedupe will
@@ -81,27 +81,29 @@ learn these weights for you. For example:-
 
 .. code:: python
 
-   dedupe.variables.Text("Product description",
-                         corpus=[
-                                 'this product is great',
-                                 'this product is great and blue'
-                                ]
-			)
+   {
+    'field': 'Product description',
+    'type': 'Text', 
+    'corpus' : [
+            'this product is great',
+            'this product is great and blue'
+        ]
+   } 
 
 If you don't want to adjust the measure to your data, just leave 'corpus' out
 of the variable definition entirely.
 
 .. code:: python
 
-   dedupe.variables.Text("Product description") 
+   {'field': 'Product description', 'type': 'Text'} 
 
 
-Custom Variable
+Custom Types
 ^^^^^^^^^^^^
 
-A ``Custom`` variables allows you to use a custom function for
-comparing fields. The function must take two field values and return a
-number.
+A ``Custom`` type field must specify the field it wants to compare, a
+type declaration of ``Custom``, and a comparator declaration. The comparator
+must be a function that can take in two field values and return a number.
 
 For example, a custom comparator:
 
@@ -118,53 +120,65 @@ The corresponding variable definition:
 
 .. code:: python
 
-    dedupe.variables.Custom("Zip", comparator=same_or_not_comparator)
+    {
+        'field': 'Zip',
+        'type': 'Custom', 
+        'comparator': same_or_not_comparator
+     }
 
-``Custom`` variables do not have any blocking rules associated with them.
+``Custom`` fields do not have any blocking rules associated with them.
 Since dedupe needs blocking rules, a data model that only contains ``Custom``
 fields will raise an error.
 
 LatLong
 ^^^^^^^
 
-``LatLong`` variables are compared using the `Haversine
+A ``LatLong`` type field must have the name of a field and a type
+declaration of ``LatLong``. ``LatLong`` fields are compared using the `Haversine
 Formula <http://en.wikipedia.org/wiki/Haversine_formula>`__. 
 
-A ``LatLong`` variable field must consist of tuples of floats
-corresponding to a latitude and a longitude.
+A ``LatLong``
+type field must consist of tuples of floats corresponding to a latitude and a
+longitude.
 
 .. code:: python
 
-    dedupe.variables.LatLong("location")
+    {'field': 'Location', 'type': 'LatLong'}
 
 Set
 ^^^
 
-``Set`` variables are for comparing lists of elements, like keywords or
-client names. ``Set`` variables are very similar to :ref:`text-types-label`. They
+A ``Set`` type field is for comparing lists of elements, like keywords or
+client names. ``Set`` types are very similar to :ref:`text-types-label`. They
 use the same comparison function and you can also let dedupe learn which
 terms are common or rare by providing a corpus. Within a record, a ``Set``
-variable field has to be hashable sequences like tuples or frozensets.
+type field has to be hashable sequences like tuples or frozensets.
 
 .. code:: python
 
-    dedupe.variables.Set("Co-authors",
-                         corpus=[
-                                 ('steve edwards'),
-                                 ('steve edwards', 'steve jobs')
-                                ])
+    {
+        'field': 'Co-authors',
+        'type': 'Set',
+        'corpus' : [
+                ('steve edwards',),
+                ('steve edwards', 'steve jobs')
+            ]
+     } 
 
 or
 
 .. code:: python
 
-    dedupe.variables.Set("Co-authors")
+    {'field': 'Co-authors', 'type': 'Set'}
 
 Interaction
 ^^^^^^^^^^^
 
-An ``Interaction`` variable multiplies the values of the multiple variables.
-The arguments to an ``Interaction`` variable must be a sequence of variable names of
+An ``Interaction`` field multiplies the values of the multiple variables.
+An ``Interaction`` variable is created with type declaration of
+``Interaction`` and an ``interaction variables`` declaration.
+
+The ``interaction variables`` field must be a sequence of variable names of
 other fields you have defined in your variable definition.
 
 `Interactions <http://en.wikipedia.org/wiki/Interaction_%28statistics%29>`__
@@ -173,9 +187,10 @@ are good when the effect of two predictors is not simply additive.
 .. code:: python
 
     [
-        dedupe.variables.String("Name", name="name"),
-	dedupe.variables.Custom("Zip", comparator=same_or_not_comparator, name="zip")
-	dedupe.variables.Interaction("name", "zip")
+        { 'field': 'Name', 'variable name': 'name', 'type': 'String' },
+        { 'field': 'Zip', 'variable name': 'zip', 'type': 'Custom', 
+      'comparator' : same_or_not_comparator },
+        {'type': 'Interaction', 'interaction variables': ['name', 'zip']}
     ]
 
 Exact
@@ -185,7 +200,7 @@ Exact
 
 .. code:: python
 
-    dedupe.variables.Exact("city")
+    {'field': 'city', 'type': 'Exact'}
 
 
 Exists
@@ -201,7 +216,7 @@ different cases:
 
 .. code:: python
 
-    dedupe.variables.Exists("first_name")
+    {'field': 'first_name', 'type': 'Exists'} 
 
 
 
@@ -239,7 +254,11 @@ You would create a definition such as:
 
 .. code:: python
 
-    dedupe.variables.Categorical("Business Type", categories=['taxi', 'lawyer'])
+    {
+        'field': 'Business Type',
+        'type': 'Categorical',
+        'categories' : ['taxi', 'lawyer']
+    }
 
 Price
 ^^^^^
@@ -250,7 +269,7 @@ prices. The values of ``Price`` field must be a positive float. If the value is
 
 .. code:: python
 
-    dedupe.variables.Price("cost")
+    {'field': 'cost', 'type': 'Price'}
 
 Optional Variables
 ------------------
@@ -267,8 +286,8 @@ DateTime
 ``DateTime`` variables are useful for comparing dates and timestamps. This
 variable can accept strings or Python datetime objects as inputs.
 
-The ``DateTime`` variable a few optional arguments that can help
-improve behavior if you know your field follows an unusual format:
+The ``DateTime`` variable definition accepts a few optional arguments that
+can help improve behavior if you know your field follows an unusual format:
 
 * :code:`fuzzy` - Use fuzzy parsing to automatically extract dates from strings like "It happened on June 2nd, 2018" (default :code:`True`)
 * :code:`dayfirst` - Ambiguous dates should be parsed as dd/mm/yy (default :code:`False`)
@@ -278,24 +297,34 @@ Note that the ``DateTime`` variable defaults to mm/dd/yy for ambiguous dates.
 If both :code:`dayfirst` and :code:`yearfirst` are set to :code:`True`, then
 :code:`dayfirst` will take precedence.
 
+For example, a sample ``DateTime`` variable definition, using the defaults:
 
 .. code:: python
 
-    import datetimetype
-
-    datetimetype.DateTime("field")
+    {
+        'field': 'time_of_sale',
+        'type': 'DateTime',
+        'fuzzy': True,
+        'dayfirst': False,
+        'yearfirst': False
+    }
 
-To install:
+If you're happy with the defaults, you can simply define the :code:`field`
+and :code:`type`:
 
-.. code:: console
+.. code:: python
 
-    pip install dedupe-variable-datetime
+    {'field': 'time_of_sale', 'type': 'DateTime'}
 
+Install the `dedupe-variable-datetime
+<https://pypi.python.org/pypi/dedupe-variable-datetime>`__ package for
+``DateTime`` Type. For more info, see the `GitHub Repository
+<https://github.com/dedupeio/dedupe-variable-datetime>`__.
 
-Address
-^^^^^^^
+Address Type
+^^^^^^^^^^^^
 
-An ``USAddress`` variable should be used for United States addresses. It uses
+An ``Address`` variable should be used for United States addresses. It uses
 the `usaddress <https://usaddress.readthedocs.io/en/latest/>`__ package to
 split apart an address string into components like address number, street
 name, and street type and compares component to component.
@@ -304,22 +333,18 @@ For example:-
 
 .. code:: python
 
-    import addressvariable
-	  
-    addressvariable.USAddress("address")
-
-
-To install:
+    {'field': 'address', 'type': 'Address'}
 
-.. code:: console
 
-    pip install dedupe-variable-address
+Install the `dedupe-variable-address
+<https://pypi.python.org/pypi/dedupe-variable-address>`__ package for
+``Address`` Type. For more info, see the `GitHub Repository
+<https://github.com/dedupeio/dedupe-variable-address>`__.
 
+Name Type
+^^^^^^^^^
 
-Name
-^^^^
-
-A ``WesternName`` variable should be used for a field that contains American names,
+A ``Name`` variable should be used for a field that contains American names,
 corporations and households. It uses the `probablepeople
 <https://probablepeople.readthedocs.io/en/latest/>`__ package to split apart
 an name string into components like give name, surname, generational suffix,
@@ -330,15 +355,42 @@ For example:-
 
 .. code:: python
 
-    import namevariable
+    {'field': 'name', 'type': 'Name'}
+
+
+Install the `dedupe-variable-name
+<https://pypi.python.org/pypi/dedupe-variable-name>`__ package for ``Name``
+Type. For more info, see the `GitHub Repository
+<https://github.com/dedupeio/dedupe-variable-name>`__.
+
+Fuzzy Category
+^^^^^^^^^^^^^^
+
+A ``FuzzyCategorical`` variable should be used for categorical data that
+has variations.
 
-    namevariable.WesternName("field")
+Occupations are an example, where you may have 'Attorney', 'Counsel', and
+'Lawyer'. For this variable type, you need to supply a corpus of records that
+contain your focal record and other field types. This corpus should either be
+all the data you are trying to link or a representative sample.
+
+For example:-
+
+.. code:: python
 
-To install: 
-    
-.. code:: console
+    {
+     'field': 'occupation',
+     'type': 'FuzzyCategorical',
+     'corpus' : [
+            {'name' : 'Jim Doe', 'occupation' : 'Attorney'},
+            {'name' : 'Jim Doe', 'occupation' : 'Lawyer'}
+        ]
+    }
 
-    pip install dedupe-variable-name
+Install the `dedupe-variable-fuzzycategory
+<https://pypi.python.org/pypi/dedupe-variable-fuzzycategory>`__ package for
+the ``FuzzyCategorical`` Type. For more info, see the `GitHub Repository
+<https://github.com/dedupeio/fuzzycategory>`__.
 
 
 Missing Data 
@@ -355,13 +407,13 @@ a ``None`` object. You should also use ``None`` to represent empty strings
         {'Name': None, 'Phone': '773-555-1123'}
    ]
 
-If you want to model this missing data for a field, you can set the ``has
-missing=True`` in the variable definition. This creates a new,
+If you want to model this missing data for a field, you can set ``'has
+missing' : True`` in the variable definition. This creates a new,
 additional field representing whether the data was present or not and
 zeros out the missing data.
 
-If there is missing data, but you did not declare ``has
-missing=True`` then the missing data will simply be zeroed out and
+If there is missing data, but you did not declare ``'has
+missing' : True`` then the missing data will simply be zeroed out and
 no field will be created to account for missing data.
 
 This approach is called 'response augmented data' and is described in
@@ -378,7 +430,7 @@ This approach makes a few assumptions that are usually not completely true:
 
 
 If you define an an interaction with a field that you declared to have
-missing data, then ``has missing=True`` will also be set for the
+missing data, then ``'has missing' : True`` will also be set for the
 Interaction field.
 
 Longer example of a variable definition:
@@ -386,12 +438,12 @@ Longer example of a variable definition:
 .. code:: python
 
     [
-        dedupe.variables.String("name", name="name"),
-	dedupe.variables.String("address"),
-	dedupe.variables.String("city", name="city"),
-	dedupe.variables.Custom("zip", comparator=same_or_not_comparator),
-	dedupe.variables.String("cuisine", has_missing=True),
-	dedupe.vairables.Interaction("name", "city")
+        {'field': 'name', 'variable name' : 'name', 'type': 'String'},
+        {'field': 'address', 'type': 'String'},
+        {'field': 'city', 'variable name' : 'city', 'type': 'String'},
+        {'field': 'zip', 'type': 'Custom', 'comparator' : same_or_not_comparator},
+        {'field': 'cuisine', 'type': 'String', 'has missing': True},
+        {'type': 'Interaction', 'interaction variables' : ['name', 'city']}
     ]
 
 Multiple Variables comparing same field
@@ -404,8 +456,8 @@ For example:-
 .. code:: python
 
     [
-        dedupe.variables.String("name"),
-	dedupe.variables.Text("name")
+        {'field': 'name', 'type': 'String'},
+        {'field': 'name', 'type': 'Text'}
     ]
 
 
@@ -423,4 +475,4 @@ default edit distance.
 
 .. code:: python
 
-    dedupe.variables.String("name", crf=True)
+    {'field': 'name', 'type': 'String', 'crf': True}
diff --git a/pyproject.toml b/pyproject.toml
index 0256eee68..9cfe9afc9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,7 +1,7 @@
 [project]
 name = "dedupe"
 description = "A python library for accurate and scaleable data deduplication and entity-resolution"
-version = "3.0.0"
+version = "2.0.24"
 readme = "README.md"
 requires-python = ">=3.7"
 license = {file = "LICENSE"}
@@ -63,11 +63,10 @@ dedupe = ["py.typed"]
 
 [tool.mypy]
 plugins = "numpy.typing.mypy_plugin"
-files = ["dedupe"]
+files = "dedupe"
 show_error_codes = true
 ignore_missing_imports = true
 check_untyped_defs = true
-implicit_reexport = false
 
 [tool.pytest.ini_options]
 minversion = "7.1"
diff --git a/tests/test_api.py b/tests/test_api.py
index 4e6b92906..84ac9169a 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -46,8 +46,8 @@ def icfi(x):
 class ActiveMatch(unittest.TestCase):
     def setUp(self):
         self.field_definition = [
-            dedupe.variables.String("name"),
-            dedupe.variables.String("age"),
+            {"field": "name", "type": "String"},
+            {"field": "age", "type": "String"},
         ]
 
     def test_initialize_fields(self):
@@ -58,26 +58,23 @@ def test_initialize_fields(self):
                 [],
             )
 
-        with self.assertRaises(ValueError):
-            dedupe.api.ActiveMatching([{"field": "name", "type": "String"}])
-
         with self.assertRaises(ValueError):
             dedupe.api.ActiveMatching(
-                [dedupe.variables.Custom("name", comparator=lambda x, y: 1)],
+                [{"field": "name", "type": "Custom", "comparator": lambda x, y: 1}],
             )
 
         with self.assertRaises(ValueError):
             dedupe.api.ActiveMatching(
                 [
-                    dedupe.variables.Custom("name", comparator=lambda x, y: 1),
-                    dedupe.variables.Custom("age", comparator=lambda x, y: 1),
+                    {"field": "name", "type": "Custom", "comparator": lambda x, y: 1},
+                    {"field": "age", "type": "Custom", "comparator": lambda x, y: 1},
                 ],
             )
 
         dedupe.api.ActiveMatching(
             [
-                dedupe.variables.Custom("name", comparator=lambda x, y: 1),
-                dedupe.variables.String("age"),
+                {"field": "name", "type": "Custom", "comparator": lambda x, y: 1},
+                {"field": "age", "type": "String"},
             ],
         )
 
diff --git a/tests/test_core.py b/tests/test_core.py
index d0bc8c94a..56d1ac010 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -47,7 +47,7 @@ def setUp(self):
             ]
         )
 
-        deduper = dedupe.Dedupe([dedupe.variables.String("name")])
+        deduper = dedupe.Dedupe([{"field": "name", "type": "String"}])
         self.data_model = deduper.data_model
         self.classifier = MockClassifier()
 
@@ -104,7 +104,7 @@ def test_score_duplicates_with_zeros(self):
 
 class FieldDistances(unittest.TestCase):
     def test_exact_comparator(self):
-        deduper = dedupe.Dedupe([dedupe.variables.Exact("name")])
+        deduper = dedupe.Dedupe([{"field": "name", "type": "Exact"}])
 
         record_pairs = (
             ({"name": "Shmoo"}, {"name": "Shmee"}),
@@ -117,7 +117,7 @@ def test_exact_comparator(self):
 
     def test_comparator(self):
         deduper = dedupe.Dedupe(
-            [dedupe.variables.Categorical("type", categories=["a", "b", "c"])]
+            [{"field": "type", "type": "Categorical", "categories": ["a", "b", "c"]}]
         )
 
         record_pairs = (({"type": "a"}, {"type": "b"}), ({"type": "a"}, {"type": "c"}))
@@ -131,11 +131,14 @@ def test_comparator(self):
     def test_comparator_interaction(self):
         deduper = dedupe.Dedupe(
             [
-                dedupe.variables.Categorical(
-                    "type", categories=["a", "b"], name="type"
-                ),
-                dedupe.variables.Interaction("type", "name"),
-                dedupe.variables.Exact("name", name="name"),
+                {
+                    "field": "type",
+                    "variable name": "type",
+                    "type": "Categorical",
+                    "categories": ["a", "b"],
+                },
+                {"type": "Interaction", "interaction variables": ["type", "name"]},
+                {"field": "name", "variable name": "name", "type": "Exact"},
             ]
         )
 
diff --git a/tests/test_dedupe.py b/tests/test_dedupe.py
index 4a925e8b4..e50af63cc 100644
--- a/tests/test_dedupe.py
+++ b/tests/test_dedupe.py
@@ -6,7 +6,6 @@
 import numpy
 
 import dedupe
-import dedupe.variables
 
 DATA = {
     100: {"name": "Bob", "age": "50"},
@@ -38,9 +37,9 @@ def test_data_model(self):
 
         data_model = DataModel(
             [
-                dedupe.variables.String(field="a", name="a"),
-                dedupe.variables.String(field="b", name="b"),
-                dedupe.variables.Interaction("a", "b"),
+                {"field": "a", "variable name": "a", "type": "String"},
+                {"field": "b", "variable name": "b", "type": "String"},
+                {"type": "Interaction", "interaction variables": ["a", "b"]},
             ]
         )
 
@@ -48,9 +47,14 @@ def test_data_model(self):
 
         data_model = DataModel(
             [
-                dedupe.variables.String(field="a", name="a", has_missing=True),
-                dedupe.variables.String(field="b", name="b"),
-                dedupe.variables.Interaction("a", "b"),
+                {
+                    "field": "a",
+                    "variable name": "a",
+                    "type": "String",
+                    "has missing": True,
+                },
+                {"field": "b", "variable name": "b", "type": "String"},
+                {"type": "Interaction", "interaction variables": ["a", "b"]},
             ]
         )
 
@@ -58,9 +62,14 @@ def test_data_model(self):
 
         data_model = DataModel(
             [
-                dedupe.variables.String(field="a", name="a", has_missing=False),
-                dedupe.variables.String(field="b", name="b"),
-                dedupe.variables.Interaction("a", "b"),
+                {
+                    "field": "a",
+                    "variable name": "a",
+                    "type": "String",
+                    "has missing": False,
+                },
+                {"field": "b", "variable name": "b", "type": "String"},
+                {"type": "Interaction", "interaction variables": ["a", "b"]},
             ]
         )
 
diff --git a/tests/test_labeler.py b/tests/test_labeler.py
index 8bbc2eab5..30609ffae 100644
--- a/tests/test_labeler.py
+++ b/tests/test_labeler.py
@@ -3,7 +3,6 @@
 
 import pytest
 
-import dedupe
 from dedupe import datamodel, labeler
 from dedupe._typing import RecordDictPair
 
@@ -25,7 +24,7 @@ def freeze_record_pair(record_pair: RecordDictPair):
 class ActiveLearningTest(unittest.TestCase):
     def setUp(self):
         self.data_model = datamodel.DataModel(
-            [dedupe.variables.String("name"), dedupe.variables.String("age")]
+            [{"field": "name", "type": "String"}, {"field": "age", "type": "String"}]
         )
 
     def test_AL(self):
diff --git a/tests/test_serializer.py b/tests/test_serializer.py
index 7eb2d931e..ab8c0e471 100644
--- a/tests/test_serializer.py
+++ b/tests/test_serializer.py
@@ -53,7 +53,7 @@ def test_writeTraining(self):
         assert isinstance(loaded_training_pairs["distinct"][0][0]["bar"], frozenset)
         assert isinstance(loaded_training_pairs["distinct"][0][0]["baz"], tuple)
 
-        deduper = dedupe.Dedupe([dedupe.variables.String("foo")])
+        deduper = dedupe.Dedupe([{"field": "foo", "type": "String"}])
         deduper.classifier.cv = False
 
         encoded_file.seek(0)
diff --git a/tests/test_training.py b/tests/test_training.py
index 6b71f3aee..b908dde0c 100644
--- a/tests/test_training.py
+++ b/tests/test_training.py
@@ -7,7 +7,7 @@
 
 class TrainingTest(unittest.TestCase):
     def setUp(self):
-        field_definition = [dedupe.variables.String("name")]
+        field_definition = [{"field": "name", "type": "String"}]
         self.data_model = dedupe.Dedupe(field_definition).data_model
         self.training_pairs = {
             "match": [