Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue 30 find partial page citations #116

Merged
19 changes: 12 additions & 7 deletions eyecite/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from eyecite.models import (
CaseCitation,
CitationBase,
CitationToken,
FullCaseCitation,
FullJournalCitation,
FullLawCitation,
Expand Down Expand Up @@ -294,15 +295,19 @@ def disambiguate_reporters(

joke_cite: List[CitationBase] = [
FullCaseCitation(
Token("1 FLP 1", 0, 7),
CitationToken(
"1 FLP 1",
0,
99,
{
"volume": "1",
"reporter": "FLP",
"page": "1",
},
),
0,
groups={
"volume": "1",
"reporter": "FLP",
"page": "1",
},
year=2021,
metadata={
"year": "2021",
"extra": "Eyecite is a collaborative community effort.",
},
)
Expand Down
18 changes: 15 additions & 3 deletions eyecite/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,9 @@ def __post_init__(self):
if isinstance(self.metadata, dict)
else self.Metadata()
)
# Set known missing page numbers to None
if re.search("^_+$", self.groups.get("page", "") or ""):
self.groups["page"] = None

def __repr__(self):
"""Simplified repr() to be more readable than full dataclass repr().
Expand All @@ -103,8 +106,12 @@ class Metadata:

def comparison_hash(self) -> int:
"""Return hash that will be the same if two cites are semantically
equivalent."""
return hash((type(self), tuple(self.groups.items())))
equivalent, unless the citation is a CaseCitation missing a page.
"""
if isinstance(self, CaseCitation) and self.groups["page"] is None:
return id(self)
else:
return hash((type(self), tuple(self.groups.items())))

def corrected_citation(self):
"""Return citation with any variations normalized."""
Expand Down Expand Up @@ -614,7 +621,12 @@ class Resource(ResourceType):

def __hash__(self):
"""Resources are the same if their citations are semantically
equivalent."""
equivalent.

Note: Resources composed of citations with missing page numbers are
NOT considered the same, even if their other attributes are identical.
This is to avoid potential false positives.
"""
return self.citation.comparison_hash()

def __eq__(self, other):
Expand Down
3 changes: 2 additions & 1 deletion eyecite/regexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,8 @@ def short_cite_re(regex):
# (ordered in descending order of likelihood)
# 1) A plain digit. E.g. "123"
# 2) A roman numeral.
PAGE_NUMBER_REGEX = rf"(?:\d+|{ROMAN_NUMERAL_REGEX})"
# 3) A page placeholder. E.g. "Carpenter v. United States, 585 U.S. ___ (2018)"
PAGE_NUMBER_REGEX = rf"(?:\d+|{ROMAN_NUMERAL_REGEX}|_+)"

# Regex to match punctuation around volume numbers and stopwords.
# This could potentially be more precise.
Expand Down
7 changes: 7 additions & 0 deletions eyecite/resolve.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,13 @@ def _has_invalid_pin_cite(
) -> bool:
"""Return True if id_cite has a pin cite that can't be correct for the
given full_cite."""
# if full cite has a known missing page, this pin cite can't be correct
if (
type(full_cite) is FullCaseCitation
and full_cite.groups.get("page") is None
):
return True

# if no pin cite, we're fine
if not id_cite.metadata.pin_cite:
return False
Expand Down
2 changes: 1 addition & 1 deletion eyecite/test_factories.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def resource_citation(
if year:
metadata["year"] = str(year)
elif "year" in metadata:
year = get_year(metadata.year)
year = get_year(metadata["year"])
# Avoid https://github.com/PyCQA/pylint/issues/3201
# pylint: disable=unexpected-keyword-arg
token = CitationToken(
Expand Down
4 changes: 4 additions & 0 deletions tests/test_AnnotateTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ def lower_annotator(before, text, after):
("foo 1 U.S. 1 bar", "foo <0>1 U.S. 1</0> bar", []),
# cite with punctuation
("foo '1 U.S. 1' bar", "foo '<0>1 U.S. 1</0>' bar", []),
# cite with missing page number (original underscores should be
# rendered in annotated text even though the missing page number
# has been normalized to None within the citation object)
("foo 1 U.S. ____ bar", "foo <0>1 U.S. ____</0> bar", []),
# law cite
(
"foo. Mass. Gen. Laws ch. 1, § 2. bar",
Expand Down
5 changes: 4 additions & 1 deletion tests/test_FindTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,8 +181,11 @@ def test_find_citations(self):
# Test with page range with a weird suffix
('559 N.W.2d 826|N.D.',
[case_citation(page='826', reporter='N.W.2d', volume='559')]),
# Test with malformed/missing page number
# Test with malformed page number
('1 U.S. f24601', []),
# Test with page number that is indicated as missing
('1 U.S. ___',
[case_citation(volume='1', reporter='U.S.', page=None)]),
# Test with the 'digit-REPORTER-digit' corner-case formatting
('2007-NMCERT-008',
[case_citation(source_text='2007-NMCERT-008', page='008',
Expand Down
39 changes: 38 additions & 1 deletion tests/test_ModelsTest.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from unittest import TestCase

from eyecite import get_citations
from eyecite.models import Resource
from eyecite.test_factories import case_citation


class ModelsTest(TestCase):
def test_comparison(self):
def test_citation_comparison(self):
"""Are two citation objects equal when their attributes are
the same?"""
citations = [
Expand All @@ -15,3 +17,38 @@ def test_comparison(self):
self.assertEqual(citations[0], citations[1])
self.assertEqual(hash(citations[0]), hash(citations[1]))
print("✓")

def test_resource_comparison(self):
    """Check that Resources wrapping attribute-identical citations are
    interchangeable: they must compare equal and hash to the same value."""
    # Build both resources up front, before any progress output is printed.
    first = Resource(case_citation(2, volume="2", reporter="U.S.", page="2"))
    second = Resource(case_citation(2, volume="2", reporter="U.S.", page="2"))

    print("Testing resource comparison...", end=" ")
    self.assertEqual(first, second)
    self.assertEqual(hash(first), hash(second))
    print("✓")

def test_resource_comparison_with_missing_page_cites(self):
    """Check that Resources built from citations with a placeholder page
    ("__") are treated as distinct, even when every other attribute of the
    underlying citations matches."""
    # Both resources are created from the same arguments; only the
    # placeholder page should keep them from being considered the same.
    left = Resource(case_citation(2, volume="2", reporter="U.S.", page="__"))
    right = Resource(case_citation(2, volume="2", reporter="U.S.", page="__"))

    print("Testing resource comparison with missing pages...", end=" ")
    self.assertNotEqual(left, right)
    self.assertNotEqual(hash(left), hash(right))
    print("✓")

def test_missing_page_cite_conversion(self):
    """Check that a placeholder page ("__") ends up as None in
    groups["page"], both for a citation built through the test factory and
    for one extracted from text by get_citations()."""
    from_factory = case_citation(2, volume="2", reporter="U.S.", page="__")
    from_text = get_citations("2 U.S. __")[0]

    print("Testing missing page conversion...", end=" ")
    # Factory-built citation is checked first, then the parsed one.
    for cite in (from_factory, from_text):
        self.assertIsNone(cite.groups["page"])
    print("✓")
12 changes: 12 additions & 0 deletions tests/test_ResolveTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,12 @@ def test_full_resolution(self):
(0, "Foo v. Bar, 1 U.S. 1."),
(0, "Foo v. Bar, 1 U.S. 1."),
)
# Test resolving two full citations that are textually identical but whose
# page numbers are placeholders (i.e., missing). Because the real pages are
# unknown, these should not resolve to the same document.
self.checkResolution(
(0, "Foo v. Bar, 1 U.S. ____."),
(1, "Foo v. Bar, 1 U.S. ____."),
)
# Test resolving multiple full citations to different documents
self.checkResolution(
(0, "Foo v. Bar, 1 U.S. 1."),
Expand Down Expand Up @@ -241,6 +247,12 @@ def test_id_resolution(self):
(1, "Ala. Code § 92"),
(1, "Id. at 2000"),
)
# Test resolving an Id. citation with a pin cite when the previous
# citation only has a placeholder page. We expect this to fail.
self.checkResolution(
(0, "Foo v. Bar, 1 U.S. ___"),
(None, "Id. at 100."),
)

def test_non_case_resolution(self):
"""Test law and journal resolution."""
Expand Down