Skip to content

Commit

Permalink
Merge pull request #206 from freelawproject/improve-citation-filtering
Browse files Browse the repository at this point in the history
Improve citation filtering
  • Loading branch information
flooie authored Feb 7, 2025
2 parents 42dd315 + 0e31544 commit 32ee756
Show file tree
Hide file tree
Showing 5 changed files with 100 additions and 49 deletions.
2 changes: 1 addition & 1 deletion eyecite/find.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ def is_valid_name(name: str) -> bool:
if not regexes:
return []
pin_cite_re = (
rf"\b(?:{'|'.join(regexes)})\s+at\s+(?P<pin_cite>\d{{1,5}})\b"
rf"\b(?:{'|'.join(regexes)})\s+at(\s¶)?\s+(?P<pin_cite>\d{{1,5}})\b"
)
reference_citations = []
remaining_text = plain_text[citation.span()[-1] :]
Expand Down
59 changes: 39 additions & 20 deletions eyecite/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,17 @@ def add_post_citation(citation: CaseCitation, words: Tokens) -> None:
citation.metadata.pin_cite = clean_pin_cite(m["pin_cite"]) or None
citation.metadata.extra = (m["extra"] or "").strip() or None
citation.metadata.parenthetical = process_parenthetical(m["parenthetical"])

if (
citation.full_span_end
and m["parenthetical"] is not None
and isinstance(citation.metadata.parenthetical, str)
):
if len(m["parenthetical"]) > len(citation.metadata.parenthetical):
offset = len(m["parenthetical"]) - len(
citation.metadata.parenthetical
)
citation.full_span_end = citation.full_span_end - offset
citation.metadata.year = m["year"]
if m["year"]:
citation.year = get_year(m["year"])
Expand Down Expand Up @@ -318,6 +329,15 @@ def disambiguate_reporters(
]


def overlapping_citations(
    full_span_1: Tuple[int, int], full_span_2: Tuple[int, int]
) -> bool:
    """Check whether two citation spans overlap.

    Each span is a ``(start, end)`` pair of offsets. The comparison is
    strict, so two spans that merely touch (one ends exactly where the
    other starts) are NOT reported as overlapping, and a zero-length span
    never overlaps anything.

    :param full_span_1: ``(start, end)`` offsets of the first citation
    :param full_span_2: ``(start, end)`` offsets of the second citation
    :return: True if the spans intersect, False otherwise
    """
    start_1, end_1 = full_span_1
    start_2, end_2 = full_span_2
    # The candidate intersection runs from the later start to the earlier
    # end; it is non-empty only when that start is strictly before that end.
    return max(start_1, start_2) < min(end_1, end_2)


def filter_citations(citations: List[CitationBase]) -> List[CitationBase]:
"""Filter and order citations, ensuring reference citations are in sequence
Expand All @@ -329,31 +349,30 @@ def filter_citations(citations: List[CitationBase]) -> List[CitationBase]:
:param citations: List of citations
:return: Sorted and filtered citations
"""
citations = list(
{citation.span(): citation for citation in citations}.values()
)
filtered_citations: List[CitationBase] = []
sorted_citations = sorted(citations, key=lambda citation: citation.span())
sorted_citations = sorted(
citations, key=lambda citation: citation.full_span()
)
for citation in sorted_citations:
if filtered_citations:
last_citation = filtered_citations[-1]
last_span = last_citation.span()
current_span = citation.span()

if current_span == last_span and isinstance(
last_citation, ReferenceCitation
):
# a single ReferenceCitation may be found via different
# names. Save the name metadata to account for collisions
for field in ReferenceCitation.name_fields:
if not getattr(last_citation.metadata, field):
setattr(
last_citation.metadata,
field,
getattr(citation.metadata, field),
)

if current_span[0] <= last_span[1]:
# Remove overlapping citations that can occur in edge cases
is_overlapping = overlapping_citations(
citation.full_span(), last_citation.full_span()
)
if is_overlapping and isinstance(last_citation, ReferenceCitation):
# Remove the overlapping reference citation
filtered_citations.pop(-1)
filtered_citations.append(citation)
continue
if is_overlapping and isinstance(citation, ReferenceCitation):
# Skip overlapping reference citations
continue
filtered_citations.append(citation)
filtered_citations.append(citation)
else:
filtered_citations.append(citation)
return filtered_citations


Expand Down
2 changes: 1 addition & 1 deletion eyecite/regexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ def short_cite_re(regex):
# What case does a short cite refer to? For now, we just capture the previous
# word optionally followed by a comma. Example: Adarand, 515 U.S. at 241.
SHORT_CITE_ANTECEDENT_REGEX = r"""
(?P<antecedent>[\w\-.]+),?
(?P<antecedent>[A-Za-z][\w\-.]+),?
\ # final space
"""

Expand Down
28 changes: 13 additions & 15 deletions eyecite/resolve.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,22 +91,20 @@ def _filter_by_matching_plaintiff_or_defendant_or_resolved_names(
"""Filter out reference citations that point to more than 1 Resource"""
matches: List[ResourceType] = []

for full_citation, resource in resolved_full_cites:
if not isinstance(full_citation, FullCaseCitation):
continue

for key in ReferenceCitation.name_fields:
reference_value = getattr(reference_citation.metadata, key)
full_case_value = getattr(full_citation.metadata, key)
if (
reference_value
and full_case_value
and reference_value in full_case_value
):
matches.append(resource)
break
match_count = 0
reference_values = []
for key in ReferenceCitation.name_fields:
reference_value = getattr(reference_citation.metadata, key)
if reference_value:
reference_values.append(reference_value)
for citation, resource in resolved_full_cites:
full_cite_values = list(
[value for value in citation.metadata.__dict__.values() if value]
)
if set(full_cite_values) & set(reference_values):
match_count += 1
matches.append(resource)

# Remove duplicates and only accept if one candidate remains
matches = list(set(matches))
return matches[0] if len(matches) == 1 else None

Expand Down
58 changes: 46 additions & 12 deletions tests/test_FindTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

# by default tests use a cache for speed
# call tests with `EYECITE_CACHE_DIR= python ...` to disable cache
from eyecite.models import ResourceCitation
from eyecite.models import FullCaseCitation, FullCitation, ResourceCitation
from eyecite.test_factories import (
case_citation,
id_citation,
Expand Down Expand Up @@ -730,6 +730,36 @@ def test_date_in_editions(self):
% (edition[0], year, expected, date_in_reporter),
)

def test_citation_filtering(self):
    """Ensure citations with overlapping spans are correctly filtered.

    Imagine a bug that, for the text

        .... at Conley v. Gibson, 355 Mass. 41, 42 (1999) ...

    returns two reference citations ("Conley" and "Gibson") in addition
    to the full citation. This should not occur, but if it did, the
    filtering step must drop the reference citations that overlap the
    full citation and keep only the full citation.
    """
    citations = [
        # NOTE(review): reporter_found is "U.S." although the example text
        # reads "Mass."; the assertions below only depend on spans and
        # citation types, so this looks harmless -- confirm.
        case_citation(
            volume="355",
            page="41",
            reporter_found="U.S.",
            short=False,
            span_start=26,
            span_end=38,
            full_span_start=8,
            full_span_end=49,
            metadata={"plaintiff": "Conley", "defendant": "Gibson"},
        ),
        # Offsets 8-14 and 18-24 are the two party names in the example
        # text; both lie inside the full citation's span (8-49).
        reference_citation("Conley", span_start=8, span_end=14),
        reference_citation("Gibson", span_start=18, span_end=24),
    ]
    self.assertEqual(len(citations), 3)
    filtered_citations = filter_citations(citations)
    # Only the full citation should survive the overlap filtering.
    self.assertEqual(len(filtered_citations), 1)
    self.assertIsInstance(filtered_citations[0], FullCaseCitation)

def test_disambiguate_citations(self):
# fmt: off
test_pairs = [
Expand Down Expand Up @@ -882,14 +912,18 @@ def test_reference_extraction(self):
]
for plain_text in texts:
citations = get_citations(plain_text)
citations[0].metadata.resolved_case_name = "State v. Wingler"
references = extract_reference_citations(citations[0], plain_text)
final_citations = filter_citations(citations + references)
self.assertEqual(
len(final_citations), 2, "There should only be 2 citations"
)
self.assertEqual(
len(references),
1,
"Only a reference citation should had been picked up",
)
found_cite = citations[0]
if isinstance(found_cite, FullCitation):
found_cite.metadata.resolved_case_name = "State v. Wingler"
references = extract_reference_citations(
found_cite, plain_text
)
final_citations = filter_citations(citations + references)
self.assertEqual(
len(final_citations), 2, "There should only be 2 citations"
)
self.assertEqual(
len(references),
1,
"Only a reference citation should had been picked up",
)

0 comments on commit 32ee756

Please sign in to comment.