diff --git a/eyecite/helpers.py b/eyecite/helpers.py index d5f6c42e..b858e21a 100644 --- a/eyecite/helpers.py +++ b/eyecite/helpers.py @@ -7,6 +7,7 @@ from eyecite.models import ( CaseCitation, CitationBase, + CitationToken, FullCaseCitation, FullJournalCitation, FullLawCitation, @@ -294,15 +295,19 @@ def disambiguate_reporters( joke_cite: List[CitationBase] = [ FullCaseCitation( - Token("1 FLP 1", 0, 7), + CitationToken( + "1 FLP 1", + 0, + 99, + { + "volume": "1", + "reporter": "FLP", + "page": "1", + }, + ), 0, - groups={ - "volume": "1", - "reporter": "FLP", - "page": "1", - }, - year=2021, metadata={ + "year": "2021", "extra": "Eyecite is a collaborative community effort.", }, ) diff --git a/eyecite/models.py b/eyecite/models.py index 756be23b..ec169120 100644 --- a/eyecite/models.py +++ b/eyecite/models.py @@ -83,6 +83,9 @@ def __post_init__(self): if isinstance(self.metadata, dict) else self.Metadata() ) + # Set known missing page numbers to None + if re.search("^_+$", self.groups.get("page", "") or ""): + self.groups["page"] = None def __repr__(self): """Simplified repr() to be more readable than full dataclass repr(). @@ -103,8 +106,12 @@ class Metadata: def comparison_hash(self) -> int: """Return hash that will be the same if two cites are semantically - equivalent.""" - return hash((type(self), tuple(self.groups.items()))) + equivalent, unless the citation is a CaseCitation missing a page. + """ + if isinstance(self, CaseCitation) and self.groups["page"] is None: + return id(self) + else: + return hash((type(self), tuple(self.groups.items()))) def corrected_citation(self): """Return citation with any variations normalized.""" @@ -614,7 +621,12 @@ class Resource(ResourceType): def __hash__(self): """Resources are the same if their citations are semantically - equivalent.""" + equivalent. + + Note: Resources composed of citations with missing page numbers are + NOT considered the same, even if their other attributes are identical. + This is to avoid potential false positives. + """ return self.citation.comparison_hash() def __eq__(self, other): diff --git a/eyecite/regexes.py b/eyecite/regexes.py index c7dbd27c..5b29ab0b 100644 --- a/eyecite/regexes.py +++ b/eyecite/regexes.py @@ -63,7 +63,8 @@ def short_cite_re(regex): # (ordered in descending order of likelihood) # 1) A plain digit. E.g. "123" # 2) A roman numeral. -PAGE_NUMBER_REGEX = rf"(?:\d+|{ROMAN_NUMERAL_REGEX})" +# 3) A page placeholder. E.g. "Carpenter v. United States, 585 U.S. ___ (2018)" +PAGE_NUMBER_REGEX = rf"(?:\d+|{ROMAN_NUMERAL_REGEX}|_+)" # Regex to match punctuation around volume numbers and stopwords. # This could potentially be more precise. diff --git a/eyecite/resolve.py b/eyecite/resolve.py index 4bec70c6..91b1d753 100644 --- a/eyecite/resolve.py +++ b/eyecite/resolve.py @@ -88,6 +88,13 @@ def _has_invalid_pin_cite( ) -> bool: """Return True if id_cite has a pin cite that can't be correct for the given full_cite.""" + # if full cite has a known missing page, this pin cite can't be correct + if ( + type(full_cite) is FullCaseCitation + and full_cite.groups.get("page") is None + ): + return True + # if no pin cite, we're fine if not id_cite.metadata.pin_cite: return False diff --git a/eyecite/test_factories.py b/eyecite/test_factories.py index 3fa60ba3..e69a467d 100644 --- a/eyecite/test_factories.py +++ b/eyecite/test_factories.py @@ -28,7 +28,7 @@ def resource_citation( if year: metadata["year"] = str(year) elif "year" in metadata: - year = get_year(metadata.year) + year = get_year(metadata["year"]) # Avoid https://github.com/PyCQA/pylint/issues/3201 # pylint: disable=unexpected-keyword-arg token = CitationToken( diff --git a/tests/test_AnnotateTest.py b/tests/test_AnnotateTest.py index d76c1138..7ae3d711 100644 --- a/tests/test_AnnotateTest.py +++ b/tests/test_AnnotateTest.py @@ -19,6 +19,10 @@ def lower_annotator(before, text, after): ("foo 1 U.S. 1 bar", "foo <0>1 U.S. 1 bar", []), # cite with punctuation ("foo '1 U.S. 1' bar", "foo '<0>1 U.S. 1' bar", []), + # cite with missing page number (original underscores should be + # rendered in annotated text even though the missing page number + # has been normalized to None within the citation object) + ("foo 1 U.S. ____ bar", "foo <0>1 U.S. ____ bar", []), # law cite ( "foo. Mass. Gen. Laws ch. 1, § 2. bar", diff --git a/tests/test_FindTest.py b/tests/test_FindTest.py index d2302454..9511205b 100644 --- a/tests/test_FindTest.py +++ b/tests/test_FindTest.py @@ -181,8 +181,11 @@ def test_find_citations(self): # Test with page range with a weird suffix ('559 N.W.2d 826|N.D.', [case_citation(page='826', reporter='N.W.2d', volume='559')]), - # Test with malformed/missing page number + # Test with malformed page number ('1 U.S. f24601', []), + # Test with page number that is indicated as missing + ('1 U.S. ___', + [case_citation(volume='1', reporter='U.S.', page=None)]), # Test with the 'digit-REPORTER-digit' corner-case formatting ('2007-NMCERT-008', [case_citation(source_text='2007-NMCERT-008', page='008', diff --git a/tests/test_ModelsTest.py b/tests/test_ModelsTest.py index 12e256e1..8e025af7 100644 --- a/tests/test_ModelsTest.py +++ b/tests/test_ModelsTest.py @@ -1,10 +1,12 @@ from unittest import TestCase +from eyecite import get_citations +from eyecite.models import Resource from eyecite.test_factories import case_citation class ModelsTest(TestCase): - def test_comparison(self): + def test_citation_comparison(self): """Are two citation objects equal when their attributes are the same?""" citations = [ @@ -15,3 +17,38 @@ def test_comparison(self): self.assertEqual(citations[0], citations[1]) self.assertEqual(hash(citations[0]), hash(citations[1])) print("✓") + + def test_resource_comparison(self): + """Are two Resource objects equal when their citations' attributes are + the same?""" + resources = [ + Resource(case_citation(2, volume="2", reporter="U.S.", page="2")), + Resource(case_citation(2, volume="2", reporter="U.S.", page="2")), + ] + print("Testing resource comparison...", end=" ") + self.assertEqual(resources[0], resources[1]) + self.assertEqual(hash(resources[0]), hash(resources[1])) + print("✓") + + def test_resource_comparison_with_missing_page_cites(self): + """Are two Resource objects different when their citations are missing + pages, even if their other attributes are the same?""" + citations = [ + Resource(case_citation(2, volume="2", reporter="U.S.", page="__")), + Resource(case_citation(2, volume="2", reporter="U.S.", page="__")), + ] + print("Testing resource comparison with missing pages...", end=" ") + self.assertNotEqual(citations[0], citations[1]) + self.assertNotEqual(hash(citations[0]), hash(citations[1])) + print("✓") + + def test_missing_page_cite_conversion(self): + """Do citations with missing page numbers get their groups['page'] + attribute set to None?""" + + citation1 = case_citation(2, volume="2", reporter="U.S.", page="__") + citation2 = get_citations("2 U.S. __")[0] + print("Testing missing page conversion...", end=" ") + self.assertIsNone(citation1.groups["page"]) + self.assertIsNone(citation2.groups["page"]) + print("✓") diff --git a/tests/test_ResolveTest.py b/tests/test_ResolveTest.py index 2bf667f3..8461ba74 100644 --- a/tests/test_ResolveTest.py +++ b/tests/test_ResolveTest.py @@ -114,6 +114,12 @@ def test_full_resolution(self): (0, "Foo v. Bar, 1 U.S. 1."), (0, "Foo v. Bar, 1 U.S. 1."), ) + # Test resolving two full citations with missing page numbers but + # otherwise identical. These should not resolve to the same document. + self.checkResolution( + (0, "Foo v. Bar, 1 U.S. ____."), + (1, "Foo v. Bar, 1 U.S. ____."), + ) # Test resolving multiple full citations to different documents self.checkResolution( (0, "Foo v. Bar, 1 U.S. 1."), @@ -241,6 +247,12 @@ def test_id_resolution(self): (1, "Ala. Code § 92"), (1, "Id. at 2000"), ) + # Test resolving an Id. citation with a pin cite when the previous + # citation only has a placeholder page. We expect this to fail. + self.checkResolution( + (0, "Foo v. Bar, 1 U.S. ___"), + (None, "Id. at 100."), + ) def test_non_case_resolution(self): """Test law and journal resolution."""