From 425eb203db51d105e9e831e119600e1a7634343d Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Thu, 27 Jun 2024 06:59:06 -0400 Subject: [PATCH] Revert "more direct set up data model (#1193)" This reverts commit 88558420c15f79b6153ec3e123a7662c09edbbf4. --- .flake8 | 2 +- .pre-commit-config.yaml | 5 - CHANGELOG.md | 5 - benchmarks/benchmarks/canonical.py | 12 +- benchmarks/benchmarks/canonical_gazetteer.py | 9 +- benchmarks/benchmarks/canonical_matching.py | 9 +- benchmarks/benchmarks/common.py | 6 +- dedupe/__init__.py | 15 -- dedupe/_typing.py | 44 ++- dedupe/api.py | 7 +- dedupe/convenience.py | 5 +- dedupe/core.py | 2 +- dedupe/datamodel.py | 170 +++++++----- dedupe/predicates.py | 4 +- dedupe/training.py | 3 +- dedupe/variables/__init__.py | 26 +- dedupe/variables/base.py | 90 ++++--- dedupe/variables/categorical_type.py | 22 +- dedupe/variables/exists.py | 10 +- dedupe/variables/interaction.py | 19 +- dedupe/variables/set.py | 15 +- dedupe/variables/string.py | 26 +- docs/Variable-definition.rst | 270 +++++++++++-------- pyproject.toml | 5 +- tests/test_api.py | 17 +- tests/test_core.py | 19 +- tests/test_dedupe.py | 29 +- tests/test_labeler.py | 3 +- tests/test_serializer.py | 2 +- tests/test_training.py | 2 +- 30 files changed, 467 insertions(+), 386 deletions(-) diff --git a/.flake8 b/.flake8 index 7350ce301..0e85dce10 100644 --- a/.flake8 +++ b/.flake8 @@ -1,3 +1,3 @@ [flake8] max-line-length=160 -extend-ignore = E203 \ No newline at end of file +extend-ignore = E203 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4896e74f6..320205765 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -8,8 +8,3 @@ repos: hooks: - id: isort name: isort (python) - - repo: https://github.com/pycqa/flake8 - rev: "7.1.0" - hooks: - - id: flake8 - args: [--config=.flake8] diff --git a/CHANGELOG.md b/CHANGELOG.md index 8a9ee9c76..ef3a19bfc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,3 @@ -# 3.0.0 -- Development in python 
packaging made supporting the previous namespace approach for - variable plugins untenable. Since we had to redo the way we defined the data model, - we took the opportunity to explicity instantiate variable objects. - # 2.0.6 - fixed bug that was preventing learning of index predicates in Dedupe mode diff --git a/benchmarks/benchmarks/canonical.py b/benchmarks/benchmarks/canonical.py index 6f23bb08d..da075b9ba 100644 --- a/benchmarks/benchmarks/canonical.py +++ b/benchmarks/benchmarks/canonical.py @@ -32,19 +32,17 @@ def make_report(self, clustering): return make_report(self.data, clustering) def run(self, use_settings=False): - deduper: dedupe.StaticDedupe | dedupe.Dedupe - if use_settings and os.path.exists(self.settings_file): with open(self.settings_file, "rb") as f: deduper = dedupe.StaticDedupe(f) else: variables = [ - dedupe.variables.String("name"), - dedupe.variables.Exact("name"), - dedupe.variables.String("address"), - dedupe.variables.ShortString("cuisine", has_missing=True), - dedupe.variables.ShortString("city"), + {"field": "name", "type": "String"}, + {"field": "name", "type": "Exact"}, + {"field": "address", "type": "String"}, + {"field": "cuisine", "type": "ShortString", "has missing": True}, + {"field": "city", "type": "ShortString"}, ] deduper = dedupe.Dedupe(variables, num_cores=5) diff --git a/benchmarks/benchmarks/canonical_gazetteer.py b/benchmarks/benchmarks/canonical_gazetteer.py index 1f73b8e20..bdbc51ba1 100644 --- a/benchmarks/benchmarks/canonical_gazetteer.py +++ b/benchmarks/benchmarks/canonical_gazetteer.py @@ -25,17 +25,16 @@ def make_report(self, clustering): def run(self, kwargs, use_settings=False): data_1, data_2 = self.data - gazetteer: dedupe.StaticGazetteer | dedupe.Gazetteer if use_settings and os.path.exists(self.settings_file): with open(self.settings_file, "rb") as f: gazetteer = dedupe.StaticGazetteer(f) else: variables = [ - dedupe.variables.String("name"), - dedupe.variables.String("address"), - 
dedupe.variables.String("cuisine"), - dedupe.variables.String("city"), + {"field": "name", "type": "String"}, + {"field": "address", "type": "String"}, + {"field": "cuisine", "type": "String"}, + {"field": "city", "type": "String"}, ] gazetteer = dedupe.Gazetteer(variables) diff --git a/benchmarks/benchmarks/canonical_matching.py b/benchmarks/benchmarks/canonical_matching.py index b1c3c823b..471cd4988 100644 --- a/benchmarks/benchmarks/canonical_matching.py +++ b/benchmarks/benchmarks/canonical_matching.py @@ -42,17 +42,16 @@ def setup(self, kwargs): def run(self, kwargs, use_settings=False): data_1, data_2 = self.data - deduper: dedupe.StaticRecordLink | dedupe.RecordLink if use_settings and os.path.exists(self.settings_file): with open(self.settings_file, "rb") as f: deduper = dedupe.StaticRecordLink(f) else: variables = [ - dedupe.variables.String("name"), - dedupe.variables.String("address"), - dedupe.variables.String("cuisine"), - dedupe.variables.String("city"), + {"field": "name", "type": "String"}, + {"field": "address", "type": "String"}, + {"field": "cuisine", "type": "String"}, + {"field": "city", "type": "String"}, ] deduper = dedupe.RecordLink(variables) deduper.prepare_training( diff --git a/benchmarks/benchmarks/common.py b/benchmarks/benchmarks/common.py index 17e129f99..afe993274 100644 --- a/benchmarks/benchmarks/common.py +++ b/benchmarks/benchmarks/common.py @@ -54,9 +54,9 @@ def get_true_dupes(data: dict) -> set: sorted(data.items(), key=lambda x: x[1]["unique_id"]), key=lambda x: x[1]["unique_id"], ): - pair_l = list(pair) - if len(pair_l) == 2: - a, b = pair_l + pair = list(pair) + if len(pair) == 2: + a, b = pair duplicates.add(frozenset((a[0], b[0]))) return duplicates diff --git a/dedupe/__init__.py b/dedupe/__init__.py index 7ef7d4c77..726836a72 100644 --- a/dedupe/__init__.py +++ b/dedupe/__init__.py @@ -13,18 +13,3 @@ training_data_link, ) from dedupe.serializer import read_training, write_training # noqa: F401 - -__all__ = [ - 
"Dedupe", - "Gazetteer", - "RecordLink", - "StaticDedupe", - "StaticGazetteer", - "StaticRecordLink", - "canonicalize", - "console_label", - "training_data_dedupe", - "training_data_link", - "read_training", - "write_training", -] diff --git a/dedupe/_typing.py b/dedupe/_typing.py index 9de9eb5f7..65922f7e5 100644 --- a/dedupe/_typing.py +++ b/dedupe/_typing.py @@ -4,18 +4,17 @@ TYPE_CHECKING, Any, Callable, + Collection, Dict, FrozenSet, Iterable, Iterator, List, Mapping, - MutableSequence, Sequence, Tuple, Type, Union, - runtime_checkable, ) import numpy @@ -73,7 +72,6 @@ LookupResults = Union[LookupResultsInt, LookupResultsStr] JoinConstraint = Literal["one-to-one", "many-to-one", "many-to-many"] Comparator = Callable[[Any, Any], Union[Union[int, float], Sequence[Union[int, float]]]] -CustomComparator = Callable[[Any, Any], Union[int, float]] Scores = Union[numpy.memmap, numpy.ndarray] Labels = List[Literal[0, 1]] LabelsLike = Iterable[Literal[0, 1]] @@ -83,10 +81,28 @@ ComparisonCover = Union[ComparisonCoverInt, ComparisonCoverStr] PredicateFunction = Callable[[Any], FrozenSet[str]] +VariableDefinition = TypedDict( + "VariableDefinition", + { + "type": str, + "field": str, + "variable name": str, + "corpus": Iterable[Union[str, Collection[str]]], + "comparator": Callable[ + [Any, Any], Union[int, float] + ], # a custom comparator can only return a single float or int, not a sequence of numbers + "categories": List[str], + "interaction variables": List[str], + "has missing": bool, + "name": str, + }, + total=False, +) + class TrainingData(TypedDict): - match: MutableSequence[RecordDictPair] - distinct: MutableSequence[RecordDictPair] + match: List[RecordDictPair] + distinct: List[RecordDictPair] # Takes pairs of records and generates a (n_samples X n_features) array @@ -111,24 +127,6 @@ def close(self) -> None: ... def join(self) -> None: ... 
-class Variable(Protocol): - name: str - predicates: List["Predicate"] - has_missing: bool - - def __len__(self) -> int: ... - - -@runtime_checkable -class FieldVariable(Variable, Protocol): - field: str - comparator: Comparator - - -class InteractionVariable(Variable, Protocol): - interaction_fields: List[str] - - MapLike = Callable[[Callable[[Any], Any], Iterable], Iterable] PathLike = Union[str, os.PathLike] diff --git a/dedupe/api.py b/dedupe/api.py index 9d17bc0b0..1e2e1f438 100644 --- a/dedupe/api.py +++ b/dedupe/api.py @@ -14,7 +14,7 @@ import sqlite3 import tempfile import warnings -from typing import TYPE_CHECKING, Literal, cast, overload +from typing import TYPE_CHECKING, cast, overload import numpy import sklearn.linear_model @@ -27,6 +27,7 @@ import dedupe.labeler as labeler import dedupe.predicates import dedupe.serializer as serializer +from dedupe._typing import Literal if TYPE_CHECKING: from typing import ( @@ -69,7 +70,7 @@ Scores, TrainingData, TupleLinks, - Variable, + VariableDefinition, ) logger = logging.getLogger(__name__) @@ -1116,7 +1117,7 @@ class ActiveMatching(Matching): def __init__( self, - variable_definition: Collection[Variable], + variable_definition: Collection[VariableDefinition], num_cores: int | None = None, in_memory: bool = False, **kwargs, diff --git a/dedupe/convenience.py b/dedupe/convenience.py index 26b886a4a..fb24de259 100644 --- a/dedupe/convenience.py +++ b/dedupe/convenience.py @@ -7,7 +7,7 @@ import random import sys import warnings -from typing import Iterator, Literal, Tuple, overload +from typing import Iterator, Tuple, overload import numpy @@ -15,6 +15,7 @@ from dedupe._typing import ( DataInt, DataStr, + Literal, RecordDict, RecordDictPair, RecordID, @@ -134,7 +135,7 @@ def console_label(deduper: dedupe.api.ActiveMatching) -> None: # pragma: no cov finished = False use_previous = False - fields = unique(var.field for var in deduper.data_model.field_variables) + fields = unique(var.field for var in 
deduper.data_model.primary_variables) buffer_len = 1 # Max number of previous operations unlabeled: list[RecordDictPair] = [] diff --git a/dedupe/core.py b/dedupe/core.py index 975c08469..a9ffd55b5 100644 --- a/dedupe/core.py +++ b/dedupe/core.py @@ -22,7 +22,6 @@ Generator, Iterable, Iterator, - Literal, Optional, Sequence, Type, @@ -36,6 +35,7 @@ ClosableJoinable, Data, FeaturizerFunction, + Literal, MapLike, RecordID, RecordIDDType, diff --git a/dedupe/datamodel.py b/dedupe/datamodel.py index 1b84b8aff..49956bc6b 100644 --- a/dedupe/datamodel.py +++ b/dedupe/datamodel.py @@ -1,80 +1,54 @@ from __future__ import annotations import copyreg +import pkgutil import types -from collections.abc import Mapping from typing import TYPE_CHECKING, cast import numpy -from dedupe._typing import FieldVariable +import dedupe.variables +from dedupe.variables.base import FieldType as FieldVariable +from dedupe.variables.base import MissingDataType, Variable from dedupe.variables.interaction import InteractionType +for _, module, _ in pkgutil.iter_modules( # type: ignore + dedupe.variables.__path__, "dedupe.variables." +): + __import__(module) + if TYPE_CHECKING: - from typing import Collection, Generator, Iterable, Sequence + from typing import Generator, Iterable, Sequence from dedupe._typing import ( Comparator, - InteractionVariable, RecordDict, RecordDictPair, - Variable, + VariableDefinition, ) from dedupe.predicates import Predicate +VARIABLE_CLASSES = {k: v for k, v in FieldVariable.all_subclasses() if k} -class DataModel: - version = 2 - def __init__(self, variable_definitions: Collection[Variable]): - for item in variable_definitions: - if isinstance(item, Mapping): - raise ValueError( - "It looks like you are trying to use a variable definition " - "composed of dictionaries. dedupe 3.0 uses variable objects " - 'directly. So instead of [{"field": "name", "type": "String"}] ' - 'we now do [dedupe.variables.String("name")].' 
- ) +class DataModel(object): + version = 1 + def __init__(self, variable_definitions: Iterable[VariableDefinition]): variable_definitions = list(variable_definitions) if not variable_definitions: raise ValueError("The variable definitions cannot be empty") - if not any(variable.predicates for variable in variable_definitions): - raise ValueError( - "At least one of the variable types needs to be a type" - "other than 'Custom'. 'Custom' types have no associated" - "blocking rules" - ) - - # This is a protocol check, not a class inheritance check - self.field_variables: list[FieldVariable] = [ - variable - for variable in variable_definitions - if isinstance(variable, FieldVariable) - ] - - # we need to keep track of ordering of variables because in - # order to calculate derived fields like interaction and missing - # data fields. - columns: list[Variable] = [] - for variable in self.field_variables: - if len(variable) == 1: - columns.append(variable) - elif len(variable) > 1: - assert hasattr(variable, "higher_vars") - columns.extend(variable.higher_vars) + all_variables: list[Variable] + self.primary_variables, all_variables = typify_variables(variable_definitions) + self._derived_start = len(all_variables) - self._derived_start = len(columns) + all_variables += interactions(variable_definitions, self.primary_variables) + all_variables += missing(all_variables) - # i'm not really satisfied with how we are dealing with interactions - # here. 
seems like there should be a cleaner path, but i don't see it - # today - columns += interactions(variable_definitions, self.field_variables) + self._missing_field_indices = missing_field_indices(all_variables) + self._interaction_indices = interaction_indices(all_variables) - self._missing_field_indices = missing_field_indices(columns) - self._interaction_indices = interaction_indices(columns) - - self._len = len(columns) + len(self._missing_field_indices) + self._len = len(all_variables) def __len__(self) -> int: return self._len @@ -89,7 +63,7 @@ def _field_comparators( ) -> Generator[tuple[str, Comparator, int, int], None, None]: start = 0 stop = 0 - for var in self.field_variables: + for var in self.primary_variables: stop = start + len(var) comparator = cast("Comparator", var.comparator) yield (var.field, comparator, start, stop) @@ -98,7 +72,7 @@ def _field_comparators( @property def predicates(self) -> set[Predicate]: predicates = set() - for var in self.field_variables: + for var in self.primary_variables: for predicate in var.predicates: predicates.add(predicate) return predicates @@ -158,26 +132,100 @@ def __getstate__(self): return d def __setstate__(self, d): - version = d.pop("object_version", None) + version = d.pop("version", None) if version is None and "_variables" in d: d["_len"] = len(d.pop("_variables")) d["primary_variables"] = d.pop("primary_fields") - elif version == 1: - d["field_variables"] = d.pop("primary_variables") self.__dict__ = d +def typify_variables( + variable_definitions: Iterable[VariableDefinition], +) -> tuple[list[FieldVariable], list[Variable]]: + primary_variables: list[FieldVariable] = [] + all_variables: list[Variable] = [] + only_custom = True + + for definition in variable_definitions: + try: + variable_type = definition["type"] + except TypeError: + raise TypeError( + "Incorrect variable specification: variable " + "specifications are dictionaries that must " + "include a type definition, ex. 
" + "{'field' : 'Phone', type: 'String'}" + ) + except KeyError: + raise KeyError( + "Missing variable type: variable " + "specifications are dictionaries that must " + "include a type definition, ex. " + "{'field' : 'Phone', type: 'String'}" + ) + + if variable_type != "Custom": + only_custom = False + + if variable_type == "Interaction": + continue + + if variable_type == "FuzzyCategorical" and "other fields" not in definition: + definition["other fields"] = [ # type: ignore + d["field"] + for d in variable_definitions + if ("field" in d and d["field"] != definition["field"]) + ] + + try: + variable_class = VARIABLE_CLASSES[variable_type] + except KeyError: + raise KeyError( + "Field type %s not valid. Valid types include %s" + % (definition["type"], ", ".join(VARIABLE_CLASSES)) + ) + + variable_object = variable_class(definition) + assert isinstance(variable_object, FieldVariable) + + primary_variables.append(variable_object) + + if hasattr(variable_object, "higher_vars"): + all_variables.extend(variable_object.higher_vars) + else: + variable_object = cast(Variable, variable_object) + all_variables.append(variable_object) + + if only_custom: + raise ValueError( + "At least one of the variable types needs to be a type" + "other than 'Custom'. 
'Custom' types have no associated" + "blocking rules" + ) + + return primary_variables, all_variables + + +def missing(variables: list[Variable]) -> list[MissingDataType]: + missing_variables = [] + for var in variables: + if var.has_missing: + missing_variables.append(MissingDataType(var.name)) + return missing_variables + + def interactions( - variables: Iterable[Variable], primary_variables: Iterable[FieldVariable] -) -> list[InteractionVariable]: + definitions: Iterable[VariableDefinition], primary_variables: list[FieldVariable] +) -> list[InteractionType]: field_d = {field.name: field for field in primary_variables} - interactions: list[InteractionVariable] = [] - for variable in variables: - if isinstance(variable, InteractionType): - variable.expandInteractions(field_d) - interactions.extend(variable.higher_vars) + interactions = [] + for definition in definitions: + if definition["type"] == "Interaction": + var = InteractionType(definition) + var.expandInteractions(field_d) + interactions.extend(var.higher_vars) return interactions diff --git a/dedupe/predicates.py b/dedupe/predicates.py index 1e07c672e..2d180ee8d 100644 --- a/dedupe/predicates.py +++ b/dedupe/predicates.py @@ -17,9 +17,9 @@ from dedupe.predicate_functions import * # noqa: F401, F403 if TYPE_CHECKING: - from typing import AbstractSet, Any, FrozenSet, Iterable, Literal, Mapping, Sequence + from typing import AbstractSet, Any, FrozenSet, Iterable, Mapping, Sequence - from dedupe._typing import PredicateFunction, RecordDict + from dedupe._typing import Literal, PredicateFunction, RecordDict from dedupe.index import Index diff --git a/dedupe/training.py b/dedupe/training.py index 98c9f28df..eb6fe0a04 100644 --- a/dedupe/training.py +++ b/dedupe/training.py @@ -13,7 +13,7 @@ from . 
import blocking, branch_and_bound if TYPE_CHECKING: - from typing import Iterable, Literal, Sequence + from typing import Iterable, Sequence from ._typing import ( ComparisonCover, @@ -23,6 +23,7 @@ Data, DataInt, DataStr, + Literal, ) from ._typing import RecordDictPairs as TrainingExamples from ._typing import RecordID, RecordIDPair diff --git a/dedupe/variables/__init__.py b/dedupe/variables/__init__.py index 39c339c68..b36383a61 100644 --- a/dedupe/variables/__init__.py +++ b/dedupe/variables/__init__.py @@ -1,25 +1,3 @@ -from .base import CustomType as Custom -from .categorical_type import CategoricalType as Categorical -from .exact import ExactType as Exact -from .exists import ExistsType as Exists -from .interaction import InteractionType as Interaction -from .latlong import LatLongType as LatLong -from .price import PriceType as Price -from .set import SetType as Set -from .string import ShortStringType as ShortString -from .string import StringType as String -from .string import TextType as Text +from pkgutil import extend_path -__all__ = [ - "Custom", - "Categorical", - "Exact", - "Exists", - "Interaction", - "LatLong", - "Price", - "Set", - "ShortString", - "String", - "Text", -] +__path__ = extend_path(__path__, __name__) diff --git a/dedupe/variables/base.py b/dedupe/variables/base.py index 71d6e722e..109d3332c 100644 --- a/dedupe/variables/base.py +++ b/dedupe/variables/base.py @@ -1,21 +1,20 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING from dedupe import predicates if TYPE_CHECKING: - from typing import Any, ClassVar, Iterable, Sequence, Type + from typing import Any, ClassVar, Generator, Iterable, Optional, Sequence, Type - from dedupe._typing import Comparator, CustomComparator, PredicateFunction - from dedupe._typing import Variable as VariableProtocol + from dedupe._typing import Comparator, PredicateFunction, VariableDefinition class Variable(object): name: str type: 
ClassVar[str] predicates: list[predicates.Predicate] - higher_vars: Sequence["VariableProtocol"] + higher_vars: Sequence["Variable"] def __len__(self) -> int: return 1 @@ -30,8 +29,16 @@ def __eq__(self, other: Any) -> bool: other_name: str = other.name return self.name == other_name - def __init__(self, has_missing: bool = False): - self.has_missing = has_missing + def __init__(self, definition: VariableDefinition): + if definition.get("has missing", False): + self.has_missing = True + try: + exists_pred = predicates.ExistsPredicate(definition["field"]) + self.predicates.append(exists_pred) + except KeyError: + pass + else: + self.has_missing = False def __getstate__(self) -> dict[str, Any]: odict = self.__dict__.copy() @@ -39,13 +46,31 @@ def __getstate__(self) -> dict[str, Any]: return odict + @classmethod + def all_subclasses( + cls, + ) -> Generator[tuple[Optional[str], Type["Variable"]], None, None]: + for q in cls.__subclasses__(): + yield getattr(q, "type", None), q + for p in q.all_subclasses(): + yield p + class DerivedType(Variable): type = "Derived" - def __init__(self, name: str, var_type: str, **kwargs): - self.name = "(%s: %s)" % (str(name), str(var_type)) - super().__init__(**kwargs) + def __init__(self, definition: VariableDefinition): + self.name = "(%s: %s)" % (str(definition["name"]), str(definition["type"])) + super(DerivedType, self).__init__(definition) + + +class MissingDataType(Variable): + type = "MissingData" + + def __init__(self, name: str): + self.name = "(%s: Not Missing)" % name + + self.has_missing = False class FieldType(Variable): @@ -55,15 +80,13 @@ class FieldType(Variable): _Predicate: Type[predicates.SimplePredicate] = predicates.SimplePredicate comparator: Comparator - def __init__( - self, field: str, name: Optional[str] = None, has_missing: bool = False - ): - self.field = field + def __init__(self, definition: VariableDefinition): + self.field = definition["field"] - if name is None: - self.name = "(%s: %s)" % (self.field, 
self.type) + if "variable name" in definition: + self.name = definition["variable name"] else: - self.name = name + self.name = "(%s: %s)" % (self.field, self.type) self.predicates = [ self._Predicate(pred, self.field) for pred in self._predicate_functions @@ -73,39 +96,30 @@ def __init__( self._index_predicates, self._index_thresholds, self.field ) - self.has_missing = has_missing - if self.has_missing: - exists_pred = predicates.ExistsPredicate(self.field) - self.predicates.append(exists_pred) + super(FieldType, self).__init__(definition) class CustomType(FieldType): type = "Custom" - def __init__( - self, - field: str, - comparator: CustomComparator, - name: Optional[str] = None, - **kwargs, - ): - super().__init__(field, **kwargs) - - if comparator is None: - raise ValueError( - "You must define a comparator function for the Custom class" + def __init__(self, definition: VariableDefinition): + super(CustomType, self).__init__(definition) + + try: + self.comparator = definition["comparator"] # type: ignore[assignment] + except KeyError: + raise KeyError( + "For 'Custom' field types you must define " + "a 'comparator' function in the field " + "definition. 
" ) - else: - self.comparator = comparator - if name is None: + if "variable name" not in definition: self.name = "(%s: %s, %s)" % ( self.field, self.type, self.comparator.__name__, ) - else: - self.name = name def indexPredicates( diff --git a/dedupe/variables/categorical_type.py b/dedupe/variables/categorical_type.py index c2dc56768..b9d3ef66b 100644 --- a/dedupe/variables/categorical_type.py +++ b/dedupe/variables/categorical_type.py @@ -1,11 +1,9 @@ from __future__ import annotations -from typing import Sequence - from categorical import CategoricalComparator from dedupe import predicates -from dedupe._typing import PredicateFunction +from dedupe._typing import PredicateFunction, VariableDefinition from dedupe.variables.base import DerivedType, FieldType @@ -13,14 +11,26 @@ class CategoricalType(FieldType): type = "Categorical" _predicate_functions: list[PredicateFunction] = [predicates.wholeFieldPredicate] - def __init__(self, field: str, categories: Sequence[str], **kwargs): - super().__init__(field, **kwargs) + def _categories(self, definition: VariableDefinition) -> list[str]: + try: + categories = definition["categories"] + except KeyError: + raise ValueError('No "categories" defined') + + return categories + + def __init__(self, definition: VariableDefinition): + super(CategoricalType, self).__init__(definition) + + categories = self._categories(definition) self.comparator = CategoricalComparator(categories) # type: ignore[assignment] self.higher_vars = [] for higher_var in self.comparator.dummy_names: # type: ignore[attr-defined] - dummy_var = DerivedType(higher_var, "Dummy", has_missing=False) + dummy_var = DerivedType( + {"name": higher_var, "type": "Dummy", "has missing": self.has_missing} + ) self.higher_vars.append(dummy_var) def __len__(self) -> int: diff --git a/dedupe/variables/exists.py b/dedupe/variables/exists.py index 00ca7eb46..46c36c292 100644 --- a/dedupe/variables/exists.py +++ b/dedupe/variables/exists.py @@ -4,7 +4,7 @@ from categorical 
import CategoricalComparator -from dedupe._typing import PredicateFunction +from dedupe._typing import PredicateFunction, VariableDefinition from dedupe.variables.base import DerivedType from dedupe.variables.categorical_type import CategoricalType @@ -13,14 +13,16 @@ class ExistsType(CategoricalType): type = "Exists" _predicate_functions: list[PredicateFunction] = [] - def __init__(self, field: str, **kwargs): - super().__init__(field, **kwargs) + def __init__(self, definition: VariableDefinition): + super(CategoricalType, self).__init__(definition) self.cat_comparator = CategoricalComparator([0, 1]) self.higher_vars = [] for higher_var in self.cat_comparator.dummy_names: - dummy_var = DerivedType(higher_var, "Dummy", has_missing=self.has_missing) + dummy_var = DerivedType( + {"name": higher_var, "type": "Dummy", "has missing": self.has_missing} + ) self.higher_vars.append(dummy_var) def comparator(self, field_1: Any, field_2: Any) -> list[int]: diff --git a/dedupe/variables/interaction.py b/dedupe/variables/interaction.py index e9fbbe591..b0370e667 100644 --- a/dedupe/variables/interaction.py +++ b/dedupe/variables/interaction.py @@ -1,23 +1,24 @@ from __future__ import annotations import itertools -from typing import List, Mapping +from typing import Mapping -from dedupe._typing import FieldVariable, InteractionVariable +from dedupe._typing import VariableDefinition +from dedupe.variables.base import FieldType as FieldVariable from dedupe.variables.base import Variable class InteractionType(Variable): type = "Interaction" - higher_vars: List[InteractionVariable] + higher_vars: list["InteractionType"] - def __init__(self, *args: str, **kwargs): - self.interactions = list(args) + def __init__(self, definition: VariableDefinition): + self.interactions = definition["interaction variables"] self.name = "(Interaction: %s)" % str(self.interactions) self.interaction_fields = self.interactions - super().__init__(**kwargs) + super().__init__(definition) def 
expandInteractions(self, field_model: Mapping[str, FieldVariable]) -> None: self.interaction_fields = self.atomicInteractions( @@ -41,12 +42,14 @@ def categorical(self, field_model: Mapping[str, FieldVariable]) -> None: if not hasattr(field_model[field], "higher_vars") ] - dummies = [field_model[field].higher_vars for field in categoricals] # type: ignore[attr-defined] + dummies = [field_model[field].higher_vars for field in categoricals] self.higher_vars = [] for combo in itertools.product(*dummies): var_names = [field.name for field in combo] + noncategoricals - higher_var = InteractionType(*var_names, has_missing=self.has_missing) + higher_var = InteractionType( + {"has missing": self.has_missing, "interaction variables": var_names} + ) self.higher_vars.append(higher_var) def atomicInteractions( diff --git a/dedupe/variables/set.py b/dedupe/variables/set.py index 8b8253e59..fddfa5c5e 100644 --- a/dedupe/variables/set.py +++ b/dedupe/variables/set.py @@ -1,8 +1,7 @@ -from typing import Collection, Iterable, Optional - from simplecosine.cosine import CosineSetSimilarity from dedupe import predicates +from dedupe._typing import VariableDefinition from dedupe.variables.base import FieldType @@ -25,12 +24,10 @@ class SetType(FieldType): ) _index_thresholds = (0.2, 0.4, 0.6, 0.8) - def __init__( - self, field: str, corpus: Optional[Iterable[Collection[str]]] = None, **kwargs - ): - super().__init__(field, **kwargs) + def __init__(self, definition: VariableDefinition): + super(SetType, self).__init__(definition) - if corpus is None: - corpus = [] + if "corpus" not in definition: + definition["corpus"] = [] - self.comparator = CosineSetSimilarity(corpus) # type: ignore[assignment] + self.comparator = CosineSetSimilarity(definition["corpus"]) # type: ignore[assignment] diff --git a/dedupe/variables/string.py b/dedupe/variables/string.py index 9a2bc8ab3..4272dba09 100644 --- a/dedupe/variables/string.py +++ b/dedupe/variables/string.py @@ -1,11 +1,11 @@ -from typing 
import Iterable, Optional, Sequence, Type +from typing import Sequence, Type from affinegap import normalizedAffineGapDistance as affineGap from highered import CRFEditDistance from simplecosine.cosine import CosineTextSimilarity from dedupe import predicates -from dedupe._typing import PredicateFunction +from dedupe._typing import PredicateFunction, VariableDefinition from dedupe.variables.base import FieldType, indexPredicates crfEd = CRFEditDistance() @@ -36,8 +36,8 @@ class BaseStringType(FieldType): _Predicate = predicates.StringPredicate _predicate_functions: Sequence[PredicateFunction] = () - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + def __init__(self, definition: VariableDefinition): + super(BaseStringType, self).__init__(definition) self.predicates += indexPredicates( ( @@ -67,12 +67,10 @@ class ShortStringType(BaseStringType): ] _index_thresholds = (0.2, 0.4, 0.6, 0.8) - def __init__( - self, field: str, name: Optional[str] = None, crf: bool = False, **kwargs - ): - super().__init__(field, name=name, **kwargs) + def __init__(self, definition: VariableDefinition): + super(ShortStringType, self).__init__(definition) - if crf: + if definition.get("crf", False) is True: self.comparator = crfEd # type: ignore[assignment] else: self.comparator = affineGap # type: ignore[assignment] @@ -100,10 +98,10 @@ class TextType(BaseStringType): ] _index_thresholds = (0.2, 0.4, 0.6, 0.8) - def __init__(self, field: str, corpus: Optional[Iterable[str]] = None, **kwargs): - super().__init__(field, **kwargs) + def __init__(self, definition: VariableDefinition): + super(TextType, self).__init__(definition) - if corpus is None: - corpus = [] + if "corpus" not in definition: + definition["corpus"] = [] - self.comparator = CosineTextSimilarity(corpus) # type: ignore[assignment] + self.comparator = CosineTextSimilarity(definition["corpus"]) # type: ignore[assignment] diff --git a/docs/Variable-definition.rst b/docs/Variable-definition.rst index 
5e6d19023..73abcccc2 100644 --- a/docs/Variable-definition.rst +++ b/docs/Variable-definition.rst @@ -3,30 +3,31 @@ Variable Definitions ==================== -Variables ---------- +Variable Types +-------------- -A variable definition describes the records that you want to match. It is sequence -of Variable objects. For example:- +A variable definition describes the records that you want to match. It is +a list of dictionaries, each of which defines one variable, typically the +record field to compare and the type of comparison. For example:- .. code:: python - import dedupe.variables - [ - dedupe.variables.String("Site Name"), - dedupe.variables.String("Address"), - dedupe.variables.ShortString("Zip", has_missing=True), - dedupe.variables.String("Phone", has_missing=True) + {'field': 'Site name', 'type': 'String'}, + {'field': 'Address', 'type': 'String'}, + {'field': 'Zip', 'type': 'ShortString', 'has missing': True}, + {'field': 'Phone', 'type': 'String', 'has missing': True} ] -String -^^^^^^ +String Types +^^^^^^^^^^^^ -The ``String`` takes the key of the record field to compare. +A ``String`` type field must declare the name of the record field to compare and +a ``String`` type declaration. The ``String`` type expects fields to be of +class string. -``String`` variables are compared using string edit distance, specifically +``String`` types are compared using string edit distance, specifically `affine gap string distance `__. This is a good metric for measuring fields that might have typos in them, such as "John" vs "Jon". @@ -35,45 +36,44 @@ For example:- .. code:: python - dedupe.variables.String("Address") + {'field': 'Address', 'type': 'String'} -ShortString -^^^^^^^^^^^ +ShortString Types +^^^^^^^^^^^^^^^^^ -The ``ShortString`` variable is just like the ``String`` variable except that dedupe +A ``ShortString`` type field is just like ``String`` types except that dedupe will not try to learn any :ref:`index blocking rules ` for these fields, which can speed up the training phase considerably.
-Zip codes and city names are good candidates for this variable. If in doubt, +Zip codes and city names are good candidates for this type. If in doubt, always use ``String``. For example:- .. code:: python - dedupe.variables.ShortString("Zipcode") + {'field': 'Zipcode', 'type': 'ShortString'} .. _text-types-label: -Text -^^^^ +Text Types +^^^^^^^^^^ If you want to compare fields containing blocks of text e.g. product -descriptions or article abstracts, you should use this variable. ``Text`` -variables are compared using the `cosine similarity metric +descriptions or article abstracts, you should use this type. ``Text`` type +fields are compared using the `cosine similarity metric `__. This is a measurement of the amount of words that two documents have in common. This measure can be made more useful as the overlap of rare words counts more than the overlap of common words. -Compare this to ``String`` and ``ShortString`` variables: For strings -containing occupations, "yoga teacher" might be fairly similar to -"yoga instructor" when using the ``Text`` measurement, because they -both contain the relatively rare word of "yoga". However, if you -compared these two strings using the ``String`` or ``ShortString`` -measurements, they might be considered fairly dissimilar, because the -actual string edit distance between them is large. +Compare this to ``String`` and ``ShortString`` types: For strings containing +occupations, "yoga teacher" might be fairly similar to "yoga instructor" when +using the ``Text`` measurement, because they both contain the relatively +rare word of "yoga". However, if you compared these two strings using the +``String`` or ``ShortString`` measurements, they might be considered fairly +dissimilar, because the actual string edit distance between them is large. If provided a sequence of example fields (i.e. a corpus) then dedupe will @@ -81,27 +81,29 @@ learn these weights for you. For example:- ..
code:: python - dedupe.variables.Text("Product description", - corpus=[ - 'this product is great', - 'this product is great and blue' - ] - ) + { + 'field': 'Product description', + 'type': 'Text', + 'corpus' : [ + 'this product is great', + 'this product is great and blue' + ] + } If you don't want to adjust the measure to your data, just leave 'corpus' out of the variable definition entirely. .. code:: python - dedupe.variables.Text("Product description") + {'field': 'Product description', 'type': 'Text'} -Custom Variable +Custom Types ^^^^^^^^^^^^ -A ``Custom`` variables allows you to use a custom function for -comparing fields. The function must take two field values and return a -number. +A ``Custom`` type field must specify the field it wants to compare, a +type declaration of ``Custom``, and a comparator declaration. The comparator +must be a function that can take in two field values and return a number. For example, a custom comparator: @@ -118,53 +120,65 @@ The corresponding variable definition: .. code:: python - dedupe.variables.Custom("Zip", comparator=same_or_not_comparator) + { + 'field': 'Zip', + 'type': 'Custom', + 'comparator': same_or_not_comparator + } -``Custom`` variables do not have any blocking rules associated with them. +``Custom`` fields do not have any blocking rules associated with them. Since dedupe needs blocking rules, a data model that only contains ``Custom`` fields will raise an error. LatLong ^^^^^^^ -``LatLong`` variables are compared using the `Haversine +A ``LatLong`` type field must have the name of a field and a type +declaration of ``LatLong``. ``LatLong`` fields are compared using the `Haversine Formula `__. -A ``LatLong`` variable field must consist of tuples of floats -corresponding to a latitude and a longitude. +A ``LatLong`` +type field must consist of tuples of floats corresponding to a latitude and a +longitude. ..
code:: python - dedupe.variables.LatLong("location") + {'field': 'Location', 'type': 'LatLong'} Set ^^^ -``Set`` variables are for comparing lists of elements, like keywords or -client names. ``Set`` variables are very similar to :ref:`text-types-label`. They +A ``Set`` type field is for comparing lists of elements, like keywords or +client names. ``Set`` types are very similar to :ref:`text-types-label`. They use the same comparison function and you can also let dedupe learn which terms are common or rare by providing a corpus. Within a record, a ``Set`` -variable field has to be hashable sequences like tuples or frozensets. +type field has to be hashable sequences like tuples or frozensets. .. code:: python - dedupe.variables.Set("Co-authors", - corpus=[ - ('steve edwards'), - ('steve edwards', 'steve jobs') - ]) + { + 'field': 'Co-authors', + 'type': 'Set', + 'corpus' : [ + ('steve edwards',), + ('steve edwards', 'steve jobs') + ] + } or .. code:: python - dedupe.variables.Set("Co-authors") + {'field': 'Co-authors', 'type': 'Set'} Interaction ^^^^^^^^^^^ -An ``Interaction`` variable multiplies the values of the multiple variables. -The arguments to an ``Interaction`` variable must be a sequence of variable names of +An ``Interaction`` field multiplies the values of multiple variables. +An ``Interaction`` variable is created with a type declaration of +``Interaction`` and an ``interaction variables`` declaration. + +The ``interaction variables`` field must be a sequence of variable names of other fields you have defined in your variable definition. `Interactions `__ @@ -173,9 +187,10 @@ are good when the effect of two predictors is not simply additive. ..
code:: python [ - dedupe.variables.String("Name", name="name"), - dedupe.variables.Custom("Zip", comparator=same_or_not_comparator, name="zip") - dedupe.variables.Interaction("name", "zip") + { 'field': 'Name', 'variable name': 'name', 'type': 'String' }, + { 'field': 'Zip', 'variable name': 'zip', 'type': 'Custom', + 'comparator' : same_or_not_comparator }, + {'type': 'Interaction', 'interaction variables': ['name', 'zip']} ] Exact @@ -185,7 +200,7 @@ Exact .. code:: python - dedupe.variables.Exact("city") + {'field': 'city', 'type': 'Exact'} Exists @@ -201,7 +216,7 @@ different cases: .. code:: python - dedupe.variables.Exists("first_name") + {'field': 'first_name', 'type': 'Exists'} @@ -239,7 +254,11 @@ You would create a definition such as: .. code:: python - dedupe.variables.Categorical("Business Type", categories=['taxi', 'lawyer']) + { + 'field': 'Business Type', + 'type': 'Categorical', + 'categories' : ['taxi', 'lawyer'] + } Price ^^^^^ @@ -250,7 +269,7 @@ prices. The values of ``Price`` field must be a positive float. If the value is .. code:: python - dedupe.variables.Price("cost") + {'field': 'cost', 'type': 'Price'} Optional Variables ------------------ @@ -267,8 +286,8 @@ DateTime ``DateTime`` variables are useful for comparing dates and timestamps. This variable can accept strings or Python datetime objects as inputs. -The ``DateTime`` variable a few optional arguments that can help -improve behavior if you know your field follows an unusual format: +The ``DateTime`` variable definition accepts a few optional arguments that +can help improve behavior if you know your field follows an unusual format: * :code:`fuzzy` - Use fuzzy parsing to automatically extract dates from strings like "It happened on June 2nd, 2018" (default :code:`True`) * :code:`dayfirst` - Ambiguous dates should be parsed as dd/mm/yy (default :code:`False`) @@ -278,24 +297,34 @@ Note that the ``DateTime`` variable defaults to mm/dd/yy for ambiguous dates. 
If both :code:`dayfirst` and :code:`yearfirst` are set to :code:`True`, then :code:`dayfirst` will take precedence. +For example, a sample ``DateTime`` variable definition, using the defaults: .. code:: python - import datetimetype - - datetimetype.DateTime("field") + { + 'field': 'time_of_sale', + 'type': 'DateTime', + 'fuzzy': True, + 'dayfirst': False, + 'yearfirst': False + } -To install: +If you're happy with the defaults, you can simply define the :code:`field` +and :code:`type`: -.. code:: console +.. code:: python - pip install dedupe-variable-datetime + {'field': 'time_of_sale', 'type': 'DateTime'} +Install the `dedupe-variable-datetime +`__ package for +``DateTime`` Type. For more info, see the `GitHub Repository +`__. -Address -^^^^^^^ +Address Type +^^^^^^^^^^^^ -An ``USAddress`` variable should be used for United States addresses. It uses +An ``Address`` variable should be used for United States addresses. It uses the `usaddress `__ package to split apart an address string into components like address number, street name, and street type and compares component to component. @@ -304,22 +333,18 @@ For example:- .. code:: python - import addressvariable - - addressvariable.USAddress("address") - - -To install: + {'field': 'address', 'type': 'Address'} -.. code:: console - pip install dedupe-variable-address +Install the `dedupe-variable-address +`__ package for +``Address`` Type. For more info, see the `GitHub Repository +`__. +Name Type +^^^^^^^^^ -Name -^^^^ - -A ``WesternName`` variable should be used for a field that contains American names, +A ``Name`` variable should be used for a field that contains American names, corporations and households. It uses the `probablepeople `__ package to split apart an name string into components like give name, surname, generational suffix, @@ -330,15 +355,42 @@ For example:- .. 
code:: python - import namevariable + {'field': 'name', 'type': 'Name'} + + +Install the `dedupe-variable-name +`__ package for ``Name`` +Type. For more info, see the `GitHub Repository +`__. + +Fuzzy Category +^^^^^^^^^^^^^^ + +A ``FuzzyCategorical`` variable should be used when you have +categorical data that has variations. - namevariable.WesternName("field") +Occupations are an example, where you may have 'Attorney', 'Counsel', and +'Lawyer'. For this variable type, you need to supply a corpus of records that +contain your focal record and other field types. This corpus should either be +all the data you are trying to link or a representative sample. + +For example:- + +.. code:: python -To install: - -.. code:: console + { + 'field': 'occupation', + 'type': 'FuzzyCategorical', + 'corpus' : [ + {'name' : 'Jim Doe', 'occupation' : 'Attorney'}, + {'name' : 'Jim Doe', 'occupation' : 'Lawyer'} + ] + } - pip install dedupe-variable-name +Install the `dedupe-variable-fuzzycategory +`__ package for +the ``FuzzyCategorical`` Type. For more info, see the `GitHub Repository +`__. Missing Data @@ -355,13 +407,13 @@ a ``None`` object. You should also use ``None`` to represent empty strings {'Name': None, 'Phone': '773-555-1123'} ] -If you want to model this missing data for a field, you can set the ``has -missing=True`` in the variable definition. This creates a new, +If you want to model this missing data for a field, you can set ``'has +missing' : True`` in the variable definition. This creates a new, additional field representing whether the data was present or not and zeros out the missing data. -If there is missing data, but you did not declare ``has -missing=True`` then the missing data will simply be zeroed out and +If there is missing data, but you did not declare ``'has +missing' : True`` then the missing data will simply be zeroed out and no field will be created to account for missing data.
This approach is called 'response augmented data' and is described in @@ -378,7 +430,7 @@ This approach makes a few assumptions that are usually not completely true: If you define an an interaction with a field that you declared to have -missing data, then ``has missing=True`` will also be set for the +missing data, then ``'has missing' : True`` will also be set for the Interaction field. Longer example of a variable definition: @@ -386,12 +438,12 @@ Longer example of a variable definition: .. code:: python [ - dedupe.variables.String("name", name="name"), - dedupe.variables.String("address"), - dedupe.variables.String("city", name="city"), - dedupe.variables.Custom("zip", comparator=same_or_not_comparator), - dedupe.variables.String("cuisine", has_missing=True), - dedupe.vairables.Interaction("name", "city") + {'field': 'name', 'variable name' : 'name', 'type': 'String'}, + {'field': 'address', 'type': 'String'}, + {'field': 'city', 'variable name' : 'city', 'type': 'String'}, + {'field': 'zip', 'type': 'Custom', 'comparator' : same_or_not_comparator}, + {'field': 'cuisine', 'type': 'String', 'has missing': True}, + {'type': 'Interaction', 'interaction variables' : ['name', 'city']} ] Multiple Variables comparing same field @@ -404,8 +456,8 @@ For example:- .. code:: python [ - dedupe.variables.String("name"), - dedupe.variables.Text("name") + {'field': 'name', 'type': 'String'}, + {'field': 'name', 'type': 'Text'} ] @@ -423,4 +475,4 @@ default edit distance. ..
code:: python - dedupe.variables.String("name", crf=True) + {'field': 'name', 'type': 'String', 'crf': True} diff --git a/pyproject.toml b/pyproject.toml index 0256eee68..9cfe9afc9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "dedupe" description = "A python library for accurate and scaleable data deduplication and entity-resolution" -version = "3.0.0" +version = "2.0.24" readme = "README.md" requires-python = ">=3.7" license = {file = "LICENSE"} @@ -63,11 +63,10 @@ dedupe = ["py.typed"] [tool.mypy] plugins = "numpy.typing.mypy_plugin" -files = ["dedupe"] +files = "dedupe" show_error_codes = true ignore_missing_imports = true check_untyped_defs = true -implicit_reexport = false [tool.pytest.ini_options] minversion = "7.1" diff --git a/tests/test_api.py b/tests/test_api.py index 4e6b92906..84ac9169a 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -46,8 +46,8 @@ def icfi(x): class ActiveMatch(unittest.TestCase): def setUp(self): self.field_definition = [ - dedupe.variables.String("name"), - dedupe.variables.String("age"), + {"field": "name", "type": "String"}, + {"field": "age", "type": "String"}, ] def test_initialize_fields(self): @@ -58,26 +58,23 @@ def test_initialize_fields(self): [], ) - with self.assertRaises(ValueError): - dedupe.api.ActiveMatching([{"field": "name", "type": "String"}]) - with self.assertRaises(ValueError): dedupe.api.ActiveMatching( - [dedupe.variables.Custom("name", comparator=lambda x, y: 1)], + [{"field": "name", "type": "Custom", "comparator": lambda x, y: 1}], ) with self.assertRaises(ValueError): dedupe.api.ActiveMatching( [ - dedupe.variables.Custom("name", comparator=lambda x, y: 1), - dedupe.variables.Custom("age", comparator=lambda x, y: 1), + {"field": "name", "type": "Custom", "comparator": lambda x, y: 1}, + {"field": "age", "type": "Custom", "comparator": lambda x, y: 1}, ], ) dedupe.api.ActiveMatching( [ - dedupe.variables.Custom("name", comparator=lambda x, y: 1), - 
dedupe.variables.String("age"), + {"field": "name", "type": "Custom", "comparator": lambda x, y: 1}, + {"field": "age", "type": "String"}, ], ) diff --git a/tests/test_core.py b/tests/test_core.py index d0bc8c94a..56d1ac010 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -47,7 +47,7 @@ def setUp(self): ] ) - deduper = dedupe.Dedupe([dedupe.variables.String("name")]) + deduper = dedupe.Dedupe([{"field": "name", "type": "String"}]) self.data_model = deduper.data_model self.classifier = MockClassifier() @@ -104,7 +104,7 @@ def test_score_duplicates_with_zeros(self): class FieldDistances(unittest.TestCase): def test_exact_comparator(self): - deduper = dedupe.Dedupe([dedupe.variables.Exact("name")]) + deduper = dedupe.Dedupe([{"field": "name", "type": "Exact"}]) record_pairs = ( ({"name": "Shmoo"}, {"name": "Shmee"}), @@ -117,7 +117,7 @@ def test_exact_comparator(self): def test_comparator(self): deduper = dedupe.Dedupe( - [dedupe.variables.Categorical("type", categories=["a", "b", "c"])] + [{"field": "type", "type": "Categorical", "categories": ["a", "b", "c"]}] ) record_pairs = (({"type": "a"}, {"type": "b"}), ({"type": "a"}, {"type": "c"})) @@ -131,11 +131,14 @@ def test_comparator(self): def test_comparator_interaction(self): deduper = dedupe.Dedupe( [ - dedupe.variables.Categorical( - "type", categories=["a", "b"], name="type" - ), - dedupe.variables.Interaction("type", "name"), - dedupe.variables.Exact("name", name="name"), + { + "field": "type", + "variable name": "type", + "type": "Categorical", + "categories": ["a", "b"], + }, + {"type": "Interaction", "interaction variables": ["type", "name"]}, + {"field": "name", "variable name": "name", "type": "Exact"}, ] ) diff --git a/tests/test_dedupe.py b/tests/test_dedupe.py index 4a925e8b4..e50af63cc 100644 --- a/tests/test_dedupe.py +++ b/tests/test_dedupe.py @@ -6,7 +6,6 @@ import numpy import dedupe -import dedupe.variables DATA = { 100: {"name": "Bob", "age": "50"}, @@ -38,9 +37,9 @@ def 
test_data_model(self): data_model = DataModel( [ - dedupe.variables.String(field="a", name="a"), - dedupe.variables.String(field="b", name="b"), - dedupe.variables.Interaction("a", "b"), + {"field": "a", "variable name": "a", "type": "String"}, + {"field": "b", "variable name": "b", "type": "String"}, + {"type": "Interaction", "interaction variables": ["a", "b"]}, ] ) @@ -48,9 +47,14 @@ def test_data_model(self): data_model = DataModel( [ - dedupe.variables.String(field="a", name="a", has_missing=True), - dedupe.variables.String(field="b", name="b"), - dedupe.variables.Interaction("a", "b"), + { + "field": "a", + "variable name": "a", + "type": "String", + "has missing": True, + }, + {"field": "b", "variable name": "b", "type": "String"}, + {"type": "Interaction", "interaction variables": ["a", "b"]}, ] ) @@ -58,9 +62,14 @@ def test_data_model(self): data_model = DataModel( [ - dedupe.variables.String(field="a", name="a", has_missing=False), - dedupe.variables.String(field="b", name="b"), - dedupe.variables.Interaction("a", "b"), + { + "field": "a", + "variable name": "a", + "type": "String", + "has missing": False, + }, + {"field": "b", "variable name": "b", "type": "String"}, + {"type": "Interaction", "interaction variables": ["a", "b"]}, ] ) diff --git a/tests/test_labeler.py b/tests/test_labeler.py index 8bbc2eab5..30609ffae 100644 --- a/tests/test_labeler.py +++ b/tests/test_labeler.py @@ -3,7 +3,6 @@ import pytest -import dedupe from dedupe import datamodel, labeler from dedupe._typing import RecordDictPair @@ -25,7 +24,7 @@ def freeze_record_pair(record_pair: RecordDictPair): class ActiveLearningTest(unittest.TestCase): def setUp(self): self.data_model = datamodel.DataModel( - [dedupe.variables.String("name"), dedupe.variables.String("age")] + [{"field": "name", "type": "String"}, {"field": "age", "type": "String"}] ) def test_AL(self): diff --git a/tests/test_serializer.py b/tests/test_serializer.py index 7eb2d931e..ab8c0e471 100644 --- 
a/tests/test_serializer.py +++ b/tests/test_serializer.py @@ -53,7 +53,7 @@ def test_writeTraining(self): assert isinstance(loaded_training_pairs["distinct"][0][0]["bar"], frozenset) assert isinstance(loaded_training_pairs["distinct"][0][0]["baz"], tuple) - deduper = dedupe.Dedupe([dedupe.variables.String("foo")]) + deduper = dedupe.Dedupe([{"field": "foo", "type": "String"}]) deduper.classifier.cv = False encoded_file.seek(0) diff --git a/tests/test_training.py b/tests/test_training.py index 6b71f3aee..b908dde0c 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -7,7 +7,7 @@ class TrainingTest(unittest.TestCase): def setUp(self): - field_definition = [dedupe.variables.String("name")] + field_definition = [{"field": "name", "type": "String"}] self.data_model = dedupe.Dedupe(field_definition).data_model self.training_pairs = { "match": [