Skip to content

Commit

Permalink
simplify datamodel slightly
Browse files Browse the repository at this point in the history
  • Loading branch information
fgregg committed Jun 27, 2024
1 parent 0cac941 commit d3b0aac
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 26 deletions.
26 changes: 9 additions & 17 deletions dedupe/datamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import numpy

from dedupe._typing import FieldVariable
from dedupe.variables.base import MissingDataType
from dedupe.variables.interaction import InteractionType

if TYPE_CHECKING:
Expand Down Expand Up @@ -55,25 +54,26 @@ def __init__(self, variable_definitions: Iterable[Variable]):
]

# we need to keep track of ordering of variables because in
# order calculate derived fields like interation and missing
# data fields. This code would be much better if there was
# always a "columns" attribute on variables
# order to calculate derived fields like interaction and missing
# data fields.
columns: list[Variable] = []
for variable in self.field_variables:
if hasattr(variable, "higher_vars"):
columns.extend(variable.higher_vars)
else:
if len(variable) == 1:
columns.append(variable)
elif len(variable) > 1:
columns.extend(variable.higher_vars)

self._derived_start = len(columns)

# i'm not really satisfied with how we are dealing with interactions
# here. seems like there should be a cleaner path, but i don't see it
# today
columns += interactions(variable_definitions, self.field_variables)
columns += missing(columns)

self._missing_field_indices = missing_field_indices(columns)
self._interaction_indices = interaction_indices(columns)

self._len = len(columns)
self._len = len(columns) + len(self._missing_field_indices)

def __len__(self) -> int:
    """Return the length cached in ``self._len``."""
    return self._len
Expand Down Expand Up @@ -167,14 +167,6 @@ def __setstate__(self, d):
self.__dict__ = d


def missing(variables: list[Variable]) -> list[MissingDataType]:
    """Build one ``MissingDataType`` per variable that reports missing data.

    Each entry of *variables* whose ``has_missing`` attribute is truthy
    contributes a ``MissingDataType`` constructed from that entry's
    ``name``. Results keep the order of *variables*; entries with a falsy
    ``has_missing`` are skipped.
    """
    return [
        MissingDataType(candidate.name)
        for candidate in variables
        if candidate.has_missing
    ]


def interactions(
variables: Iterable[Variable], primary_variables: Iterable[FieldVariable]
) -> list[InteractionVariable]:
Expand Down
9 changes: 0 additions & 9 deletions dedupe/variables/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,15 +57,6 @@ def __init__(self, name, var_type, **kwargs):
super().__init__(**kwargs)


class MissingDataType(Variable):
    """Variable named after another variable as ``(<name>: Not Missing)``.

    NOTE(review): presumably this is the indicator column recording whether
    the source variable's value was present -- confirm against callers.
    """

    type = "MissingData"

    def __init__(self, name: str):
        # This variable does not itself report missing data.
        self.has_missing = False

        # Wrap the source variable's name to build this variable's own name.
        self.name = "(%s: Not Missing)" % name


class FieldType(Variable):
_index_thresholds: Sequence[float] = []
_index_predicates: Sequence[Type[predicates.IndexPredicate]] = []
Expand Down

0 comments on commit d3b0aac

Please sign in to comment.