From 4b29c44b37b4af4441ad1e7cd077bd58669ba630 Mon Sep 17 00:00:00 2001 From: Matt Dahl Date: Sat, 16 Jul 2022 12:11:46 -0400 Subject: [PATCH 01/10] test(find): Tests finding for cites with missing pages. --- tests/test_FindTest.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_FindTest.py b/tests/test_FindTest.py index d2302454..1494bd8d 100644 --- a/tests/test_FindTest.py +++ b/tests/test_FindTest.py @@ -181,8 +181,11 @@ def test_find_citations(self): # Test with page range with a weird suffix ('559 N.W.2d 826|N.D.', [case_citation(page='826', reporter='N.W.2d', volume='559')]), - # Test with malformed/missing page number + # Test with malformed page number ('1 U.S. f24601', []), + # Test with page number that is indicated as missing + ('1 U.S. ___', + [case_citation(volume='1', reporter='U.S.', page='___')]), # Test with the 'digit-REPORTER-digit' corner-case formatting ('2007-NMCERT-008', [case_citation(source_text='2007-NMCERT-008', page='008', From fcde5cbe8f4386329ab0cf2e617b3434fdf3e4ec Mon Sep 17 00:00:00 2001 From: Matt Dahl Date: Sat, 16 Jul 2022 12:12:41 -0400 Subject: [PATCH 02/10] feat(find): Adds regex for finding placeholder pages. --- eyecite/regexes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/eyecite/regexes.py b/eyecite/regexes.py index c7dbd27c..5b29ab0b 100644 --- a/eyecite/regexes.py +++ b/eyecite/regexes.py @@ -63,7 +63,8 @@ def short_cite_re(regex): # (ordered in descending order of likelihood) # 1) A plain digit. E.g. "123" # 2) A roman numeral. -PAGE_NUMBER_REGEX = rf"(?:\d+|{ROMAN_NUMERAL_REGEX})" +# 3) A page placeholder. E.g. "Carpenter v. United States, 585 U.S. ___ (2018)" +PAGE_NUMBER_REGEX = rf"(?:\d+|{ROMAN_NUMERAL_REGEX}|_+)" # Regex to match punctuation around volume numbers and stopwords. # This could potentially be more precise. From a51ba41816c17f82e4a61f2bd4b5626666211c3d Mon Sep 17 00:00:00 2001 From: Matt Dahl Date: Sat, 16 Jul 2022 12:30:43 -0400 Subject: [PATCH 03/10] test(find): Tests Id. resolution for cites with missing pages. --- tests/test_ResolveTest.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/test_ResolveTest.py b/tests/test_ResolveTest.py index 2bf667f3..5cf6fe68 100644 --- a/tests/test_ResolveTest.py +++ b/tests/test_ResolveTest.py @@ -241,6 +241,14 @@ def test_id_resolution(self): (1, "Ala. Code § 92"), (1, "Id. at 2000"), ) + # Test resolving an Id. citation with a pin cite when the previous + # citation only has a placeholder page. We expect this to still resolve + # because it's possible to pin cite to an opinion with pending final + # page numbers. + self.checkResolution( + (0, "Foo v. Bar, 1 U.S. ___"), + (0, "Id. at 100."), + ) def test_non_case_resolution(self): """Test law and journal resolution.""" From 5ee4145af4617f142352568468827efcd0911aea Mon Sep 17 00:00:00 2001 From: Matt Dahl Date: Wed, 20 Jul 2022 12:24:22 -0400 Subject: [PATCH 04/10] fix(helpers): Fixes joke cite. --- eyecite/helpers.py | 19 ++++++++++++------- eyecite/test_factories.py | 2 +- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/eyecite/helpers.py b/eyecite/helpers.py index d5f6c42e..b858e21a 100644 --- a/eyecite/helpers.py +++ b/eyecite/helpers.py @@ -7,6 +7,7 @@ from eyecite.models import ( CaseCitation, CitationBase, + CitationToken, FullCaseCitation, FullJournalCitation, FullLawCitation, @@ -294,15 +295,19 @@ def disambiguate_reporters( joke_cite: List[CitationBase] = [ FullCaseCitation( - Token("1 FLP 1", 0, 7), + CitationToken( + "1 FLP 1", + 0, + 99, + { + "volume": "1", + "reporter": "FLP", + "page": "1", + }, + ), 0, - groups={ - "volume": "1", - "reporter": "FLP", - "page": "1", - }, - year=2021, metadata={ + "year": "2021", "extra": "Eyecite is a collaborative community effort.", }, ) diff --git a/eyecite/test_factories.py b/eyecite/test_factories.py index 3fa60ba3..e69a467d 100644 --- a/eyecite/test_factories.py +++ b/eyecite/test_factories.py @@ -28,7 +28,7 @@ def resource_citation( if year: metadata["year"] = str(year) elif "year" in metadata: - year = get_year(metadata.year) + year = get_year(metadata["year"]) # Avoid https://github.com/PyCQA/pylint/issues/3201 # pylint: disable=unexpected-keyword-arg token = CitationToken( From a120ca8e571e4396d17c93119f10c9500d34e96f Mon Sep 17 00:00:00 2001 From: Matt Dahl Date: Wed, 20 Jul 2022 13:28:00 -0400 Subject: [PATCH 05/10] feat(find): Normalizes known missing page numbers to None. --- eyecite/models.py | 6 ++++++ tests/test_AnnotateTest.py | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/eyecite/models.py b/eyecite/models.py index 756be23b..4b1f0fa8 100644 --- a/eyecite/models.py +++ b/eyecite/models.py @@ -83,6 +83,12 @@ def __post_init__(self): if isinstance(self.metadata, dict) else self.Metadata() ) + # Set known missing page numbers to None + if ( + "page" in self.groups + and self.groups["page"] == len(self.groups["page"]) * "_" + ): + self.groups["page"] = None def __repr__(self): """Simplified repr() to be more readable than full dataclass repr(). diff --git a/tests/test_AnnotateTest.py b/tests/test_AnnotateTest.py index d76c1138..7ae3d711 100644 --- a/tests/test_AnnotateTest.py +++ b/tests/test_AnnotateTest.py @@ -19,6 +19,10 @@ def lower_annotator(before, text, after): ("foo 1 U.S. 1 bar", "foo <0>1 U.S. 1 bar", []), # cite with punctuation ("foo '1 U.S. 1' bar", "foo '<0>1 U.S. 1' bar", []), + # cite with missing page number (original underscores should be + # rendered in annotated text even though the missing page number + # has been normalized to None within the citation object) + ("foo 1 U.S. ____ bar", "foo <0>1 U.S. ____ bar", []), # law cite ( "foo. Mass. Gen. Laws ch. 1, § 2. bar", From 6048422a1840ec34b8e6e889fcc1907688b97eda Mon Sep 17 00:00:00 2001 From: Matt Dahl Date: Wed, 20 Jul 2022 13:30:19 -0400 Subject: [PATCH 06/10] feat(resolve): Ensures that Id. pin cites don't match missing page cites. --- eyecite/resolve.py | 7 +++++++ tests/test_ResolveTest.py | 6 ++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/eyecite/resolve.py b/eyecite/resolve.py index 4bec70c6..91b1d753 100644 --- a/eyecite/resolve.py +++ b/eyecite/resolve.py @@ -88,6 +88,13 @@ def _has_invalid_pin_cite( ) -> bool: """Return True if id_cite has a pin cite that can't be correct for the given full_cite.""" + # if full cite has a known missing page, this pin cite can't be correct + if ( + type(full_cite) is FullCaseCitation + and full_cite.groups.get("page") is None + ): + return True + # if no pin cite, we're fine if not id_cite.metadata.pin_cite: return False diff --git a/tests/test_ResolveTest.py b/tests/test_ResolveTest.py index 5cf6fe68..7357d26b 100644 --- a/tests/test_ResolveTest.py +++ b/tests/test_ResolveTest.py @@ -242,12 +242,10 @@ def test_id_resolution(self): (1, "Id. at 2000"), ) # Test resolving an Id. citation with a pin cite when the previous - # citation only has a placeholder page. We expect this to still resolve - # because it's possible to pin cite to an opinion with pending final - # page numbers. + # citation only has a placeholder page. We expect this to fail. self.checkResolution( (0, "Foo v. Bar, 1 U.S. ___"), - (0, "Id. at 100."), + (None, "Id. at 100."), ) def test_non_case_resolution(self): From 08b9625552cfcbe592526134009bf757a47e7030 Mon Sep 17 00:00:00 2001 From: Matt Dahl Date: Wed, 20 Jul 2022 13:31:09 -0400 Subject: [PATCH 07/10] feat(models): Ensures that Resources with missing page cites hash differently. --- eyecite/models.py | 15 ++++++++++++--- tests/test_ModelsTest.py | 27 ++++++++++++++++++++++++++- tests/test_ResolveTest.py | 6 ++++++ 3 files changed, 44 insertions(+), 4 deletions(-) diff --git a/eyecite/models.py b/eyecite/models.py index 4b1f0fa8..b34e8d64 100644 --- a/eyecite/models.py +++ b/eyecite/models.py @@ -109,8 +109,12 @@ class Metadata: def comparison_hash(self) -> int: """Return hash that will be the same if two cites are semantically - equivalent.""" - return hash((type(self), tuple(self.groups.items()))) + equivalent, unless the citation is a CaseCitation missing a page. + """ + if isinstance(self, CaseCitation) and self.groups["page"] is None: + return id(self) + else: + return hash((type(self), tuple(self.groups.items()))) def corrected_citation(self): """Return citation with any variations normalized.""" @@ -620,7 +624,12 @@ class Resource(ResourceType): def __hash__(self): """Resources are the same if their citations are semantically - equivalent.""" + equivalent. + + Note: Resources composed of citations with missing page numbers are + NOT considered the same, even if their other attributes are identical. + This is to avoid potential false positives. + """ return self.citation.comparison_hash() def __eq__(self, other): diff --git a/tests/test_ModelsTest.py b/tests/test_ModelsTest.py index 12e256e1..2a35676a 100644 --- a/tests/test_ModelsTest.py +++ b/tests/test_ModelsTest.py @@ -1,10 +1,11 @@ from unittest import TestCase from eyecite.test_factories import case_citation +from eyecite.models import Resource class ModelsTest(TestCase): - def test_comparison(self): + def test_citation_comparison(self): """Are two citation objects equal when their attributes are the same?""" citations = [ @@ -15,3 +16,27 @@ def test_comparison(self): self.assertEqual(citations[0], citations[1]) self.assertEqual(hash(citations[0]), hash(citations[1])) print("✓") + + def test_resource_comparison(self): + """Are two Resource objects equal when their citations' attributes are + the same?""" + resources = [ + Resource(case_citation(2, volume="2", reporter="U.S.", page="2")), + Resource(case_citation(2, volume="2", reporter="U.S.", page="2")), + ] + print("Testing resource comparison...", end=" ") + self.assertEqual(resources[0], resources[1]) + self.assertEqual(hash(resources[0]), hash(resources[1])) + print("✓") + + def test_resource_comparison_with_missing_page_cites(self): + """Are two Resource objects different when their citations are missing + pages, even if their other attributes are the same?""" + citations = [ + Resource(case_citation(2, volume="2", reporter="U.S.", page="__")), + Resource(case_citation(2, volume="2", reporter="U.S.", page="__")), + ] + print("Testing resource comparison with missing pages...", end=" ") + self.assertNotEqual(citations[0], citations[1]) + self.assertNotEqual(hash(citations[0]), hash(citations[1])) + print("✓") diff --git a/tests/test_ResolveTest.py b/tests/test_ResolveTest.py index 7357d26b..8461ba74 100644 --- a/tests/test_ResolveTest.py +++ b/tests/test_ResolveTest.py @@ -114,6 +114,12 @@ def test_full_resolution(self): (0, "Foo v. Bar, 1 U.S. 1."), (0, "Foo v. Bar, 1 U.S. 1."), ) + # Test resolving two full citations with missing page numbers but + # otherwise identical. These should not resolve to the same document. + self.checkResolution( + (0, "Foo v. Bar, 1 U.S. ____."), + (1, "Foo v. Bar, 1 U.S. ____."), + ) # Test resolving multiple full citations to different documents self.checkResolution( (0, "Foo v. Bar, 1 U.S. 1."), From 3009d1f121fec5465f25412d7de7fcd3cf47d19f Mon Sep 17 00:00:00 2001 From: Matt Dahl Date: Wed, 20 Jul 2022 13:49:00 -0400 Subject: [PATCH 08/10] fix(tests): Fixes isort. --- tests/test_ModelsTest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_ModelsTest.py b/tests/test_ModelsTest.py index 2a35676a..d517dbc7 100644 --- a/tests/test_ModelsTest.py +++ b/tests/test_ModelsTest.py @@ -1,7 +1,7 @@ from unittest import TestCase -from eyecite.test_factories import case_citation from eyecite.models import Resource +from eyecite.test_factories import case_citation class ModelsTest(TestCase): From d9966cea0438f149b547c43cff19b4f5f27e2e15 Mon Sep 17 00:00:00 2001 From: Matt Dahl Date: Thu, 21 Jul 2022 11:25:52 -0400 Subject: [PATCH 09/10] refactor(models): Uses regex to convert missing pages. --- eyecite/models.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/eyecite/models.py b/eyecite/models.py index b34e8d64..ec169120 100644 --- a/eyecite/models.py +++ b/eyecite/models.py @@ -84,10 +84,7 @@ def __post_init__(self): else self.Metadata() ) # Set known missing page numbers to None - if ( - "page" in self.groups - and self.groups["page"] == len(self.groups["page"]) * "_" - ): + if re.search("^_+$", self.groups.get("page", "") or ""): self.groups["page"] = None def __repr__(self): From 18b58711124c292feced3e925264f4b10dd282ae Mon Sep 17 00:00:00 2001 From: Matt Dahl Date: Thu, 21 Jul 2022 11:37:08 -0400 Subject: [PATCH 10/10] feat(tests): Adds tests for missing page to None conversion. --- tests/test_FindTest.py | 2 +- tests/test_ModelsTest.py | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/test_FindTest.py b/tests/test_FindTest.py index 1494bd8d..9511205b 100644 --- a/tests/test_FindTest.py +++ b/tests/test_FindTest.py @@ -185,7 +185,7 @@ def test_find_citations(self): ('1 U.S. f24601', []), # Test with page number that is indicated as missing ('1 U.S. ___', - [case_citation(volume='1', reporter='U.S.', page='___')]), + [case_citation(volume='1', reporter='U.S.', page=None)]), # Test with the 'digit-REPORTER-digit' corner-case formatting ('2007-NMCERT-008', [case_citation(source_text='2007-NMCERT-008', page='008', diff --git a/tests/test_ModelsTest.py b/tests/test_ModelsTest.py index d517dbc7..8e025af7 100644 --- a/tests/test_ModelsTest.py +++ b/tests/test_ModelsTest.py @@ -1,5 +1,6 @@ from unittest import TestCase +from eyecite import get_citations from eyecite.models import Resource from eyecite.test_factories import case_citation @@ -40,3 +41,14 @@ def test_resource_comparison_with_missing_page_cites(self): self.assertNotEqual(citations[0], citations[1]) self.assertNotEqual(hash(citations[0]), hash(citations[1])) print("✓") + + def test_missing_page_cite_conversion(self): + """Do citations with missing page numbers get their groups['page'] + attribute set to None?""" + + citation1 = case_citation(2, volume="2", reporter="U.S.", page="__") + citation2 = get_citations("2 U.S. __")[0] + print("Testing missing page conversion...", end=" ") + self.assertIsNone(citation1.groups["page"]) + self.assertIsNone(citation2.groups["page"]) + print("✓")