From 7d2c79becabe375980613ff3bf66da678cbad658 Mon Sep 17 00:00:00 2001 From: Forest Gregg Date: Thu, 27 Jun 2024 08:45:16 -0400 Subject: [PATCH] require py3.8+, modernize python code (#1195) --- benchmarks/benchmarks/canonical.py | 2 +- benchmarks/benchmarks/canonical_gazetteer.py | 4 +- benchmarks/benchmarks/canonical_matching.py | 2 +- dedupe/_typing.py | 10 ++--- dedupe/api.py | 41 +++++++------------- dedupe/blocking.py | 3 +- dedupe/clustering.py | 1 - dedupe/convenience.py | 3 +- dedupe/core.py | 30 +++++--------- dedupe/labeler.py | 6 +-- dedupe/predicates.py | 31 +++++++-------- dedupe/tfidf.py | 1 - dedupe/training.py | 6 +-- dedupe/variables/base.py | 26 ++++++------- dedupe/variables/interaction.py | 4 +- pyproject.toml | 3 +- tests/duplicateCluster_memory_case.py | 2 +- tests/test_blocking.py | 26 ++++++------- tests/test_dedupe.py | 2 - tests/test_predicates.py | 34 ++++++++-------- tests/test_serializer.py | 3 -- tests/test_training.py | 36 ++++++++--------- 22 files changed, 116 insertions(+), 160 deletions(-) diff --git a/benchmarks/benchmarks/canonical.py b/benchmarks/benchmarks/canonical.py index 6f23bb08d..c3e439ff1 100644 --- a/benchmarks/benchmarks/canonical.py +++ b/benchmarks/benchmarks/canonical.py @@ -9,7 +9,7 @@ def make_report(data, clustering): true_dupes = common.get_true_dupes(data) - predicted_dupes = set([]) + predicted_dupes = set() for cluser_id, _ in clustering: for pair in combinations(cluser_id, 2): predicted_dupes.add(frozenset(pair)) diff --git a/benchmarks/benchmarks/canonical_gazetteer.py b/benchmarks/benchmarks/canonical_gazetteer.py index 1f73b8e20..fa3086fc9 100644 --- a/benchmarks/benchmarks/canonical_gazetteer.py +++ b/benchmarks/benchmarks/canonical_gazetteer.py @@ -7,9 +7,9 @@ def make_report(data, clustering): true_dupes = canonical_matching.get_true_dupes(data) - predicted_dupes = set( + predicted_dupes = { frozenset([a, b]) for a, result in clustering for b, score in result - ) + } return common.Report.from_scores(true_dupes, predicted_dupes) diff --git a/benchmarks/benchmarks/canonical_matching.py b/benchmarks/benchmarks/canonical_matching.py index b1c3c823b..fc56034bf 100644 --- a/benchmarks/benchmarks/canonical_matching.py +++ b/benchmarks/benchmarks/canonical_matching.py @@ -15,7 +15,7 @@ def get_true_dupes(data): def make_report(data, clustering): true_dupes = get_true_dupes(data) - predicted_dupes = set(frozenset(pair) for pair, _ in clustering) + predicted_dupes = {frozenset(pair) for pair, _ in clustering} return common.Report.from_scores(true_dupes, predicted_dupes) diff --git a/dedupe/_typing.py b/dedupe/_typing.py index 9de9eb5f7..2631829fd 100644 --- a/dedupe/_typing.py +++ b/dedupe/_typing.py @@ -1,5 +1,4 @@ import os -import sys from typing import ( TYPE_CHECKING, Any, @@ -9,11 +8,14 @@ Iterable, Iterator, List, + Literal, Mapping, MutableSequence, + Protocol, Sequence, Tuple, Type, + TypedDict, Union, runtime_checkable, ) @@ -21,12 +23,6 @@ import numpy import numpy.typing -if sys.version_info >= (3, 8): - from typing import Literal, Protocol, TypedDict -else: - from typing_extensions import Literal, Protocol, TypedDict - - if TYPE_CHECKING: from dedupe.predicates import Predicate diff --git a/dedupe/api.py b/dedupe/api.py index 9d17bc0b0..03b635544 100644 --- a/dedupe/api.py +++ b/dedupe/api.py @@ -1,5 +1,4 @@ #!/usr/bin/python -# -*- coding: utf-8 -*- """ dedupe provides the main user interface for the library the Dedupe class @@ -29,15 +28,7 @@ import dedupe.serializer as serializer if TYPE_CHECKING: - from typing import ( - BinaryIO, - Collection, - Generator, - Iterable, - MutableMapping, - TextIO, - Union, - ) + from typing import BinaryIO, Collection, Generator, Iterable, MutableMapping, TextIO import numpy.typing @@ -75,7 +66,7 @@ logger = logging.getLogger(__name__) -class Matching(object): +class Matching: """ Base Class for Record Matching Classes """ @@ -682,9 +673,9 @@ def __init__( self.temp_dir = tempfile.TemporaryDirectory() self.db = self.temp_dir.name + "/blocks.db" - self.indexed_data: Union[ - MutableMapping[int, RecordDict], MutableMapping[str, RecordDict] - ] + self.indexed_data: ( + MutableMapping[int, RecordDict] | MutableMapping[str, RecordDict] + ) self.indexed_data = {} # type: ignore[assignment] def _close(self) -> None: @@ -856,10 +847,10 @@ def blocks(self, data): ORDER BY a.record_id""" ) - pair_blocks: Union[ - Iterable[tuple[int, Iterable[tuple[int, int]]]], - Iterable[tuple[str, Iterable[tuple[str, str]]]], - ] + pair_blocks: ( + Iterable[tuple[int, Iterable[tuple[int, int]]]] + | Iterable[tuple[str, Iterable[tuple[str, str]]]] + ) pair_blocks = itertools.groupby(pairs, lambda x: x[0]) @@ -1313,14 +1304,12 @@ def mark_pairs(self, labeled_pairs: TrainingData) -> None: self.active_learner.mark(examples, y) except dedupe.predicates.NoIndexError as e: raise UserWarning( - ( - "The record\n" - f"{e.failing_record}\n" - "is not known to to the active learner. " - "Make sure all `labeled_pairs` " - "are in the data or training file " - "of the `prepare_training()` method" - ) + "The record\n" + f"{e.failing_record}\n" + "is not known to to the active learner. " + "Make sure all `labeled_pairs` " + "are in the data or training file " + "of the `prepare_training()` method" ) def _checkTrainingPairs(self, labeled_pairs: TrainingData) -> None: diff --git a/dedupe/blocking.py b/dedupe/blocking.py index a3c134f24..956626fd8 100644 --- a/dedupe/blocking.py +++ b/dedupe/blocking.py @@ -1,5 +1,4 @@ #!/usr/bin/python -# -*- coding: utf-8 -*- from __future__ import annotations import logging @@ -34,7 +33,7 @@ def index_list() -> IndexList: return defaultdict(list) -class Fingerprinter(object): +class Fingerprinter: """Takes in a record and returns all blocks that record belongs to""" def __init__(self, predicates: Iterable[dedupe.predicates.Predicate]) -> None: diff --git a/dedupe/clustering.py b/dedupe/clustering.py index 94e62e3cb..82719417d 100644 --- a/dedupe/clustering.py +++ b/dedupe/clustering.py @@ -1,5 +1,4 @@ #!/usr/bin/python -# -*- coding: utf-8 -*- from __future__ import annotations import array diff --git a/dedupe/convenience.py b/dedupe/convenience.py index 26b886a4a..9b6f5a950 100644 --- a/dedupe/convenience.py +++ b/dedupe/convenience.py @@ -1,5 +1,4 @@ #!/usr/bin/python -# -*- coding: utf-8 -*- from __future__ import annotations import collections @@ -162,7 +161,7 @@ def console_label(deduper: dedupe.api.ActiveMatching) -> None: # pragma: no cov for record in record_pair: for field in fields: - line = "%s : %s" % (field, record[field]) + line = "{} : {}".format(field, record[field]) _print(line) _print() _print(f"{n_match}/10 positive, {n_distinct}/10 negative") diff --git a/dedupe/core.py b/dedupe/core.py index 975c08469..710daf97b 100644 --- a/dedupe/core.py +++ b/dedupe/core.py @@ -1,5 +1,4 @@ #!/usr/bin/python -# -*- coding: utf-8 -*- from __future__ import annotations import collections @@ -17,17 +16,7 @@ from dedupe.backport import RLock if TYPE_CHECKING: - from typing import ( - Any, - Generator, - Iterable, - Iterator, - Literal, - Optional, - Sequence, - Type, - Union, - ) + from typing import Any, Generator, Iterable, Iterator, Literal, Sequence, Union from dedupe._typing import ( Block, @@ -50,7 +39,7 @@ class BlockingError(Exception): pass -class ScoreDupes(object): +class ScoreDupes: def __init__( self, featurizer: FeaturizerFunction, @@ -71,7 +60,7 @@ def __init__( def __call__(self) -> None: while True: - record_pairs: Optional[RecordPairs] = self.records_queue.get() + record_pairs: RecordPairs | None = self.records_queue.get() if record_pairs is None: break @@ -198,7 +187,7 @@ def fillQueue( break -class ScoreGazette(object): +class ScoreGazette: def __init__(self, featurizer: FeaturizerFunction, classifier: Classifier): self.featurizer = featurizer self.classifier = classifier @@ -238,8 +227,7 @@ def scoreGazette( score_records = ScoreGazette(featurizer, classifier) - for scored_pairs in imap(score_records, record_pairs): - yield scored_pairs + yield from imap(score_records, record_pairs) # The underlying processes in the pool should terminate when the # pool is garbage collected, but sometimes it takes a while @@ -248,7 +236,7 @@ def scoreGazette( pool.join() -class MockPool(object): +class MockPool: def close(self) -> None: pass @@ -273,7 +261,7 @@ def appropriate_imap(num_cores: int) -> tuple[MapLike, ClosableJoinable]: return imap, pool -def peek(seq: Iterator[Any]) -> tuple[Optional[Any], Iterator[Any]]: +def peek(seq: Iterator[Any]) -> tuple[Any | None, Iterator[Any]]: try: first = next(seq) except TypeError as e: @@ -307,11 +295,11 @@ def Enumerator(start: int = 0) -> collections.defaultdict[Any, int]: @overload -def sniff_id_type(ids: Sequence[tuple[int, int]]) -> Type[int]: ... +def sniff_id_type(ids: Sequence[tuple[int, int]]) -> type[int]: ... @overload -def sniff_id_type(ids: Sequence[tuple[str, str]]) -> tuple[Type[str], Literal[256]]: ... +def sniff_id_type(ids: Sequence[tuple[str, str]]) -> tuple[type[str], Literal[256]]: ... def sniff_id_type(ids: Sequence[tuple[RecordID, RecordID]]) -> RecordIDDType: diff --git a/dedupe/labeler.py b/dedupe/labeler.py index c5adbbf4f..bc024140a 100644 --- a/dedupe/labeler.py +++ b/dedupe/labeler.py @@ -14,7 +14,7 @@ import dedupe.training as training if TYPE_CHECKING: - from typing import Dict, Iterable, Literal, Mapping + from typing import Iterable, Literal, Mapping from dedupe._typing import ( Data, @@ -170,7 +170,7 @@ def remove(self, index: int) -> None: def _sample_indices( self, sample_size: int, max_cover: int ) -> Iterable[RecordIDPair]: - weights: Dict[RecordIDPair, float] = {} + weights: dict[RecordIDPair, float] = {} for predicate, covered in self.block_learner.comparison_cover.items(): # each predicate gets to vote for every record pair it covers. the # strength of that vote is in inverse proportion to the number of @@ -248,7 +248,7 @@ def __init__( def _index_predicates(self, candidates: TrainingExamples) -> None: blocker = self.block_learner.blocker - records = core.unique((record for pair in candidates for record in pair)) + records = core.unique(record for pair in candidates for record in pair) for field in blocker.index_fields: unique_fields = {record[field] for record in records} diff --git a/dedupe/predicates.py b/dedupe/predicates.py index 1e07c672e..2abe2149d 100644 --- a/dedupe/predicates.py +++ b/dedupe/predicates.py @@ -1,5 +1,4 @@ #!/usr/bin/python -# -*- coding: utf-8 -*- from __future__ import annotations import abc @@ -17,7 +16,7 @@ from dedupe.predicate_functions import * # noqa: F401, F403 if TYPE_CHECKING: - from typing import AbstractSet, Any, FrozenSet, Iterable, Literal, Mapping, Sequence + from typing import AbstractSet, Any, Iterable, Literal, Mapping, Sequence from dedupe._typing import PredicateFunction, RecordDict from dedupe.index import Index @@ -51,7 +50,7 @@ def __iter__(self): yield self def __repr__(self) -> str: - return "%s: %s" % (self.type, self.__name__) + return "{}: {}".format(self.type, self.__name__) def __hash__(self) -> int: try: @@ -70,7 +69,7 @@ def __len__(self) -> int: def __call__(self, record: RecordDict, **kwargs) -> AbstractSet[str]: pass - def __add__(self, other: "Predicate") -> "CompoundPredicate": + def __add__(self, other: Predicate) -> CompoundPredicate: if isinstance(other, CompoundPredicate): return CompoundPredicate((self,) + tuple(other)) elif isinstance(other, Predicate): @@ -84,10 +83,10 @@ class SimplePredicate(Predicate): def __init__(self, func: PredicateFunction, field: str): self.func = func - self.__name__ = "(%s, %s)" % (func.__name__, field) + self.__name__ = "({}, {})".format(func.__name__, field) self.field = field - def __call__(self, record: RecordDict, **kwargs) -> FrozenSet[str]: + def __call__(self, record: RecordDict, **kwargs) -> frozenset[str]: column = record[self.field] if column: return self.func(column) @@ -96,7 +95,7 @@ def __call__(self, record: RecordDict, **kwargs) -> FrozenSet[str]: class StringPredicate(SimplePredicate): - def __call__(self, record: RecordDict, **kwargs) -> FrozenSet[str]: + def __call__(self, record: RecordDict, **kwargs) -> frozenset[str]: column: str = record[self.field] if column: return self.func(" ".join(strip_punc(column).split())) @@ -108,17 +107,17 @@ class ExistsPredicate(Predicate): type = "ExistsPredicate" def __init__(self, field: str): - self.__name__ = "(Exists, %s)" % (field,) + self.__name__ = "(Exists, {})".format(field) self.field = field @staticmethod - def func(column: Any) -> FrozenSet[Literal["0", "1"]]: + def func(column: Any) -> frozenset[Literal["0", "1"]]: if column: return frozenset(("1",)) else: return frozenset(("0",)) - def __call__(self, record: RecordDict, **kwargs) -> FrozenSet[Literal["0", "1"]]: # type: ignore + def __call__(self, record: RecordDict, **kwargs) -> frozenset[Literal["0", "1"]]: # type: ignore column = record[self.field] return self.func(column) @@ -127,10 +126,10 @@ class IndexPredicate(Predicate): field: str threshold: float index: Index | None - _cache: dict[Any, FrozenSet[str]] + _cache: dict[Any, frozenset[str]] def __init__(self, threshold: float, field: str): - self.__name__ = "(%s, %s)" % (threshold, field) + self.__name__ = "({}, {})".format(threshold, field) self.field = field self.threshold = threshold self.index = None @@ -179,7 +178,7 @@ def reset(self) -> None: self.canopy = {} self.index = None - def __call__(self, record: RecordDict, **kwargs) -> FrozenSet[str]: + def __call__(self, record: RecordDict, **kwargs) -> frozenset[str]: block_key = None column = record[self.field] @@ -244,7 +243,7 @@ def reset(self) -> None: def __call__( self, record: RecordDict, target: bool = False, **kwargs - ) -> FrozenSet[str]: + ) -> frozenset[str]: column = record[self.field] if column: if (column, target) in self._cache: @@ -357,7 +356,7 @@ def __hash__(self) -> int: def __eq__(self, other: Any) -> bool: return frozenset(self) == frozenset(other) - def __call__(self, record: RecordDict, **kwargs) -> FrozenSet[str]: + def __call__(self, record: RecordDict, **kwargs) -> frozenset[str]: predicate_keys = [predicate(record, **kwargs) for predicate in self] return frozenset( ":".join( @@ -368,7 +367,7 @@ def __call__(self, record: RecordDict, **kwargs) -> FrozenSet[str]: for block_key in product(*predicate_keys) ) - def __add__(self, other: Predicate) -> "CompoundPredicate": # type: ignore + def __add__(self, other: Predicate) -> CompoundPredicate: # type: ignore if isinstance(other, CompoundPredicate): return CompoundPredicate(tuple(self) + tuple(other)) elif isinstance(other, Predicate): diff --git a/dedupe/tfidf.py b/dedupe/tfidf.py index 5faaf6cb7..f70a915d0 100644 --- a/dedupe/tfidf.py +++ b/dedupe/tfidf.py @@ -1,5 +1,4 @@ #!/usr/bin/python -# -*- coding: utf-8 -*- import logging from typing import List, Tuple diff --git a/dedupe/training.py b/dedupe/training.py index 98c9f28df..8709fb8e9 100644 --- a/dedupe/training.py +++ b/dedupe/training.py @@ -321,7 +321,7 @@ def coveredPairs(self, blocker, records_1, records_2): return pair_cover -class InfiniteSet(object): +class InfiniteSet: def __and__(self, item): return item @@ -329,7 +329,7 @@ def __rand__(self, item): return item -class Resampler(object): +class Resampler: def __init__(self, sequence: Sequence[int]): sampled = random.choices(sequence, k=len(sequence)) @@ -344,7 +344,7 @@ def __init__(self, sequence: Sequence[int]): self.replacements[k].append(max_value) max_value += 1 - @functools.lru_cache() + @functools.lru_cache def __call__(self, iterable: Iterable[int]) -> frozenset[int]: result = itertools.chain.from_iterable( self.replacements[k] for k in iterable if k in self.replacements diff --git a/dedupe/variables/base.py b/dedupe/variables/base.py index 71d6e722e..c7f256eb7 100644 --- a/dedupe/variables/base.py +++ b/dedupe/variables/base.py @@ -1,21 +1,21 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING from dedupe import predicates if TYPE_CHECKING: - from typing import Any, ClassVar, Iterable, Sequence, Type + from typing import Any, ClassVar, Iterable, Sequence from dedupe._typing import Comparator, CustomComparator, PredicateFunction from dedupe._typing import Variable as VariableProtocol -class Variable(object): +class Variable: name: str type: ClassVar[str] predicates: list[predicates.Predicate] - higher_vars: Sequence["VariableProtocol"] + higher_vars: Sequence[VariableProtocol] def __len__(self) -> int: return 1 @@ -44,24 +44,22 @@ class DerivedType(Variable): type = "Derived" def __init__(self, name: str, var_type: str, **kwargs): - self.name = "(%s: %s)" % (str(name), str(var_type)) + self.name = "({}: {})".format(str(name), str(var_type)) super().__init__(**kwargs) class FieldType(Variable): _index_thresholds: Sequence[float] = [] - _index_predicates: Sequence[Type[predicates.IndexPredicate]] = [] + _index_predicates: Sequence[type[predicates.IndexPredicate]] = [] _predicate_functions: Sequence[PredicateFunction] = () - _Predicate: Type[predicates.SimplePredicate] = predicates.SimplePredicate + _Predicate: type[predicates.SimplePredicate] = predicates.SimplePredicate comparator: Comparator - def __init__( - self, field: str, name: Optional[str] = None, has_missing: bool = False - ): + def __init__(self, field: str, name: str | None = None, has_missing: bool = False): self.field = field if name is None: - self.name = "(%s: %s)" % (self.field, self.type) + self.name = "({}: {})".format(self.field, self.type) else: self.name = name @@ -86,7 +84,7 @@ def __init__( self, field: str, comparator: CustomComparator, - name: Optional[str] = None, + name: str | None = None, **kwargs, ): super().__init__(field, **kwargs) @@ -99,7 +97,7 @@ def __init__( self.comparator = comparator if name is None: - self.name = "(%s: %s, %s)" % ( + self.name = "({}: {}, {})".format( self.field, self.type, self.comparator.__name__, @@ -109,7 +107,7 @@ def __init__( def indexPredicates( - predicates: Iterable[Type[predicates.IndexPredicate]], + predicates: Iterable[type[predicates.IndexPredicate]], thresholds: Sequence[float], field: str, ) -> list[predicates.IndexPredicate]: diff --git a/dedupe/variables/interaction.py b/dedupe/variables/interaction.py index e9fbbe591..483f17765 100644 --- a/dedupe/variables/interaction.py +++ b/dedupe/variables/interaction.py @@ -1,7 +1,7 @@ from __future__ import annotations import itertools -from typing import List, Mapping +from typing import Mapping from dedupe._typing import FieldVariable, InteractionVariable from dedupe.variables.base import Variable @@ -9,7 +9,7 @@ class InteractionType(Variable): type = "Interaction" - higher_vars: List[InteractionVariable] + higher_vars: list[InteractionVariable] def __init__(self, *args: str, **kwargs): self.interactions = list(args) diff --git a/pyproject.toml b/pyproject.toml index 0256eee68..2a9484eb1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ name = "dedupe" description = "A python library for accurate and scaleable data deduplication and entity-resolution" version = "3.0.0" readme = "README.md" -requires-python = ">=3.7" +requires-python = ">=3.8" license = {file = "LICENSE"} keywords = [] authors = [ @@ -36,7 +36,6 @@ dependencies = [ "BTrees>=4.1.4", "zope.index", "dedupe_Levenshtein_search", - "typing_extensions", ] [project.urls] diff --git a/tests/duplicateCluster_memory_case.py b/tests/duplicateCluster_memory_case.py index f210498a6..111092b55 100644 --- a/tests/duplicateCluster_memory_case.py +++ b/tests/duplicateCluster_memory_case.py @@ -23,7 +23,7 @@ def candidates_gen(): - candidate_set = set([]) + candidate_set = set() for _ in range(10**5): block = [((random.randint(0, 1000), "a"), (random.randint(0, 1000), "b"))] for candidate in block: diff --git a/tests/test_blocking.py b/tests/test_blocking.py index 6e7af98d8..41465dc9a 100644 --- a/tests/test_blocking.py +++ b/tests/test_blocking.py @@ -27,9 +27,7 @@ def setUp(self): if record not in self.training_records: self.training_records.append(record) - self.simple = lambda x: set( - [str(k) for k in x if "CompoundPredicate" not in str(k)] - ) + self.simple = lambda x: {str(k) for k in x if "CompoundPredicate" not in str(k)} class TfidfTest(unittest.TestCase): @@ -52,16 +50,16 @@ def test_unconstrained_inverted_index(self): [dedupe.predicates.TfidfTextSearchPredicate(0.0, "name")] ) - blocker.index(set(record["name"] for record in self.data_d.values()), "name") + blocker.index({record["name"] for record in self.data_d.values()}, "name") blocks = defaultdict(set) for block_key, record_id in blocker(self.data_d.items()): blocks[block_key].add(record_id) - blocks = set([frozenset(block) for block in blocks.values() if len(block) > 1]) + blocks = {frozenset(block) for block in blocks.values() if len(block) > 1} - assert blocks == set([frozenset([120, 125]), frozenset([130, 135])]) + assert blocks == {frozenset([120, 125]), frozenset([130, 135])} class TfIndexUnindex(unittest.TestCase): @@ -83,17 +81,17 @@ def setUp(self): [dedupe.predicates.TfidfTextSearchPredicate(0.0, "name")] ) - self.records_1 = dict( - (record_id, record) + self.records_1 = { + record_id: record for record_id, record in data_d.items() if record["dataset"] == 0 - ) + } - self.fields_2 = dict( - (record_id, record["name"]) + self.fields_2 = { + record_id: record["name"] for record_id, record in data_d.items() if record["dataset"] == 1 - ) + } def test_index(self): self.blocker.index(set(self.fields_2.values()), "name") @@ -103,7 +101,7 @@ def test_index(self): for block_key, record_id in self.blocker(self.records_1.items()): blocks[block_key].add(record_id) - assert list(blocks.items())[0][1] == set([130]) + assert list(blocks.items())[0][1] == {130} def test_doubled_index(self): self.blocker.index(self.fields_2.values(), "name") @@ -118,7 +116,7 @@ def test_doubled_index(self): assert len(result) == 1 - assert result[0][1] == set([130]) + assert result[0][1] == {130} def test_unindex(self): self.blocker.index(self.fields_2.values(), "name") diff --git a/tests/test_dedupe.py b/tests/test_dedupe.py index 4a925e8b4..be49c39ed 100644 --- a/tests/test_dedupe.py +++ b/tests/test_dedupe.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - import itertools import unittest diff --git a/tests/test_predicates.py b/tests/test_predicates.py index bcae4fa60..4df2c0401 100644 --- a/tests/test_predicates.py +++ b/tests/test_predicates.py @@ -17,12 +17,12 @@ def test_set(self): class TestMetaphone(unittest.TestCase): def test_metaphone_token(self): block_val = predicates.metaphoneToken("9301 S. State St. ") - assert block_val == set(["STT", "S", "ST"]) + assert block_val == {"STT", "S", "ST"} class TestWholeSet(unittest.TestCase): def setUp(self): - self.s1 = set(["red", "blue", "green"]) + self.s1 = {"red", "blue", "green"} def test_full_set(self): block_val = predicates.wholeSetPredicate(self.s1) @@ -31,11 +31,11 @@ def test_full_set(self): class TestSetElement(unittest.TestCase): def setUp(self): - self.s1 = set(["red", "blue", "green"]) + self.s1 = {"red", "blue", "green"} def test_long_set(self): block_val = predicates.commonSetElementPredicate(self.s1) - self.assertEqual(set(block_val), set(("blue", "green", "red"))) + self.assertEqual(set(block_val), {"blue", "green", "red"}) def test_empty_set(self): block_val = predicates.commonSetElementPredicate(set()) @@ -68,19 +68,21 @@ def test_precise_latlong(self): class TestAlpaNumeric(unittest.TestCase): def test_alphanumeric(self): - assert predicates.alphaNumericPredicate("a1") == set(["a1"]) - assert predicates.alphaNumericPredicate("1a") == set(["1a"]) - assert predicates.alphaNumericPredicate("a1b") == set(["a1b"]) - assert predicates.alphaNumericPredicate("1 a") == set(["1"]) - assert predicates.alphaNumericPredicate("a1 b1") == set(["a1", "b1"]) + assert predicates.alphaNumericPredicate("a1") == {"a1"} + assert predicates.alphaNumericPredicate("1a") == {"1a"} + assert predicates.alphaNumericPredicate("a1b") == {"a1b"} + assert predicates.alphaNumericPredicate("1 a") == {"1"} + assert predicates.alphaNumericPredicate("a1 b1") == {"a1", "b1"} assert predicates.alphaNumericPredicate("asdf") == set() - assert predicates.alphaNumericPredicate("1") == set(["1"]) - assert predicates.alphaNumericPredicate("a_1") == set(["1"]) - assert predicates.alphaNumericPredicate("a$1") == set(["1"]) - assert predicates.alphaNumericPredicate("a 1") == set(["1"]) - assert predicates.alphaNumericPredicate("773-555-1676") == set( - ["773", "555", "1676"] - ) + assert predicates.alphaNumericPredicate("1") == {"1"} + assert predicates.alphaNumericPredicate("a_1") == {"1"} + assert predicates.alphaNumericPredicate("a$1") == {"1"} + assert predicates.alphaNumericPredicate("a 1") == {"1"} + assert predicates.alphaNumericPredicate("773-555-1676") == { + "773", + "555", + "1676", + } class TestNumericPredicates(unittest.TestCase): diff --git a/tests/test_serializer.py b/tests/test_serializer.py index 7eb2d931e..34cf6cf56 100644 --- a/tests/test_serializer.py +++ b/tests/test_serializer.py @@ -1,6 +1,3 @@ -# -*- coding: utf-8 -*- -from __future__ import print_function - import codecs import json import sys diff --git a/tests/test_training.py b/tests/test_training.py index 6b71f3aee..24a28901a 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -28,9 +28,7 @@ def setUp(self): if record not in self.training_records: self.training_records.append(record) - self.simple = lambda x: set( - [str(k) for k in x if "CompoundPredicate" not in str(k)] - ) + self.simple = lambda x: {str(k) for k in x if "CompoundPredicate" not in str(k)} self.block_learner = training.BlockLearner self.block_learner.blocker = dedupe.blocking.Fingerprinter( @@ -43,23 +41,21 @@ def setUp(self): def test_dedupe_coverage(self): coverage = self.block_learner.cover(self.block_learner, self.training) assert self.simple(coverage.keys()).issuperset( - set( - [ - "SimplePredicate: (tokenFieldPredicate, name)", - "SimplePredicate: (commonSixGram, name)", - "TfidfTextCanopyPredicate: (0.4, name)", - "SimplePredicate: (sortedAcronym, name)", - "SimplePredicate: (sameThreeCharStartPredicate, name)", - "TfidfTextCanopyPredicate: (0.2, name)", - "SimplePredicate: (sameFiveCharStartPredicate, name)", - "TfidfTextCanopyPredicate: (0.6, name)", - "SimplePredicate: (wholeFieldPredicate, name)", - "TfidfTextCanopyPredicate: (0.8, name)", - "SimplePredicate: (commonFourGram, name)", - "SimplePredicate: (firstTokenPredicate, name)", - "SimplePredicate: (sameSevenCharStartPredicate, name)", - ] - ) + { + "SimplePredicate: (tokenFieldPredicate, name)", + "SimplePredicate: (commonSixGram, name)", + "TfidfTextCanopyPredicate: (0.4, name)", + "SimplePredicate: (sortedAcronym, name)", + "SimplePredicate: (sameThreeCharStartPredicate, name)", + "TfidfTextCanopyPredicate: (0.2, name)", + "SimplePredicate: (sameFiveCharStartPredicate, name)", + "TfidfTextCanopyPredicate: (0.6, name)", + "SimplePredicate: (wholeFieldPredicate, name)", + "TfidfTextCanopyPredicate: (0.8, name)", + "SimplePredicate: (commonFourGram, name)", + "SimplePredicate: (firstTokenPredicate, name)", + "SimplePredicate: (sameSevenCharStartPredicate, name)", + } ) def test_uncovered_by(self):