Merge pull request #206 from freelawproject/improve-citation-filtering

Improve citation filtering
freelawproject · Feb 7, 2025 · 32ee756 · 32ee756
2 parents 42dd315 + 0e31544
commit 32ee756
Show file tree

Hide file tree

Showing 5 changed files with 100 additions and 49 deletions.
diff --git a/eyecite/find.py b/eyecite/find.py
@@ -166,7 +166,7 @@ def is_valid_name(name: str) -> bool:
     if not regexes:
         return []
     pin_cite_re = (
-        rf"\b(?:{'|'.join(regexes)})\s+at\s+(?P<pin_cite>\d{{1,5}})\b"
+        rf"\b(?:{'|'.join(regexes)})\s+at(\s¶)?\s+(?P<pin_cite>\d{{1,5}})\b"
     )
     reference_citations = []
     remaining_text = plain_text[citation.span()[-1] :]

diff --git a/eyecite/helpers.py b/eyecite/helpers.py
@@ -101,6 +101,17 @@ def add_post_citation(citation: CaseCitation, words: Tokens) -> None:
     citation.metadata.pin_cite = clean_pin_cite(m["pin_cite"]) or None
     citation.metadata.extra = (m["extra"] or "").strip() or None
     citation.metadata.parenthetical = process_parenthetical(m["parenthetical"])
+
+    if (
+        citation.full_span_end
+        and m["parenthetical"] is not None
+        and isinstance(citation.metadata.parenthetical, str)
+    ):
+        if len(m["parenthetical"]) > len(citation.metadata.parenthetical):
+            offset = len(m["parenthetical"]) - len(
+                citation.metadata.parenthetical
+            )
+            citation.full_span_end = citation.full_span_end - offset
     citation.metadata.year = m["year"]
     if m["year"]:
         citation.year = get_year(m["year"])
@@ -318,6 +329,15 @@ def disambiguate_reporters(
     ]
 
 
+def overlapping_citations(
+    full_span_1: Tuple[int, int], full_span_2: Tuple[int, int]
+) -> bool:
+    """Check if two (start, end) full spans overlap by at least one position"""
+    start_1, end_1 = full_span_1
+    start_2, end_2 = full_span_2
+    return max(start_1, start_2) < min(end_1, end_2)
+
+
 def filter_citations(citations: List[CitationBase]) -> List[CitationBase]:
     """Filter and order citations, ensuring reference citations are in sequence
 
@@ -329,31 +349,30 @@ def filter_citations(citations: List[CitationBase]) -> List[CitationBase]:
     :param citations: List of citations
     :return: Sorted and filtered citations
     """
+    citations = list(
+        {citation.span(): citation for citation in citations}.values()
+    )
     filtered_citations: List[CitationBase] = []
-    sorted_citations = sorted(citations, key=lambda citation: citation.span())
+    sorted_citations = sorted(
+        citations, key=lambda citation: citation.full_span()
+    )
     for citation in sorted_citations:
         if filtered_citations:
             last_citation = filtered_citations[-1]
-            last_span = last_citation.span()
-            current_span = citation.span()
-
-            if current_span == last_span and isinstance(
-                last_citation, ReferenceCitation
-            ):
-                # a single ReferenceCitation may be found via different
-                # names. Save the name metadata to account for collisions
-                for field in ReferenceCitation.name_fields:
-                    if not getattr(last_citation.metadata, field):
-                        setattr(
-                            last_citation.metadata,
-                            field,
-                            getattr(citation.metadata, field),
-                        )
-
-            if current_span[0] <= last_span[1]:
-                # Remove overlapping citations that can occur in edge cases
+            is_overlapping = overlapping_citations(
+                citation.full_span(), last_citation.full_span()
+            )
+            if is_overlapping and isinstance(last_citation, ReferenceCitation):
+                # Remove the overlapping reference citation
+                filtered_citations.pop(-1)
+                filtered_citations.append(citation)
+                continue
+            if is_overlapping and isinstance(citation, ReferenceCitation):
+                # Skip overlapping reference citations
                 continue
-        filtered_citations.append(citation)
+            filtered_citations.append(citation)
+        else:
+            filtered_citations.append(citation)
     return filtered_citations
 
 

diff --git a/eyecite/regexes.py b/eyecite/regexes.py
@@ -212,7 +212,7 @@ def short_cite_re(regex):
 # What case does a short cite refer to? For now, we just capture the previous
 # word optionally followed by a comma. Example: Adarand, 515 U.S. at 241.
 SHORT_CITE_ANTECEDENT_REGEX = r"""
-    (?P<antecedent>[\w\-.]+),?
+    (?P<antecedent>[A-Za-z][\w\-.]+),?
     \   # final space
 """
 

diff --git a/eyecite/resolve.py b/eyecite/resolve.py
@@ -91,22 +91,20 @@ def _filter_by_matching_plaintiff_or_defendant_or_resolved_names(
     """Filter out reference citations that point to more than 1 Resource"""
     matches: List[ResourceType] = []
 
-    for full_citation, resource in resolved_full_cites:
-        if not isinstance(full_citation, FullCaseCitation):
-            continue
-
-        for key in ReferenceCitation.name_fields:
-            reference_value = getattr(reference_citation.metadata, key)
-            full_case_value = getattr(full_citation.metadata, key)
-            if (
-                reference_value
-                and full_case_value
-                and reference_value in full_case_value
-            ):
-                matches.append(resource)
-                break
+    match_count = 0
+    reference_values = []
+    for key in ReferenceCitation.name_fields:
+        reference_value = getattr(reference_citation.metadata, key)
+        if reference_value:
+            reference_values.append(reference_value)
+    for citation, resource in resolved_full_cites:
+        full_cite_values = list(
+            [value for value in citation.metadata.__dict__.values() if value]
+        )
+        if set(full_cite_values) & set(reference_values):
+            match_count += 1
+            matches.append(resource)
 
-    # Remove duplicates and only accept if one candidate remains
     matches = list(set(matches))
     return matches[0] if len(matches) == 1 else None
 

diff --git a/tests/test_FindTest.py b/tests/test_FindTest.py
@@ -9,7 +9,7 @@
 
 # by default tests use a cache for speed
 # call tests with `EYECITE_CACHE_DIR= python ...` to disable cache
-from eyecite.models import ResourceCitation
+from eyecite.models import FullCaseCitation, FullCitation, ResourceCitation
 from eyecite.test_factories import (
     case_citation,
     id_citation,
@@ -730,6 +730,36 @@ def test_date_in_editions(self):
                 % (edition[0], year, expected, date_in_reporter),
             )
 
+    def test_citation_filtering(self):
+        """Ensure citations with overlapping spans are correctly filtered
+
+        Imagine a scenario where a bug incorrectly parses the following text:
+        .... at Conley v. Gibson, 355 Mass. 41, 42 (1999) ...
+        It returns two reference citations (Conley, Gibson) and the full cite.
+        This shouldn't occur, but if it did we would still be able to filter
+        them correctly.
+        """
+        ".... at Conley v. Gibson, 355 Mass. 41, 42 (1999) ..."
+        citations = [
+            case_citation(
+                volume="355",
+                page="41",
+                reporter_found="U.S.",
+                short=False,
+                span_start=26,
+                span_end=38,
+                full_span_start=8,
+                full_span_end=49,
+                metadata={"plaintiff": "Conley", "defendant": "Gibson"},
+            ),
+            reference_citation("Conley", span_start=8, span_end=14),
+            reference_citation("Conley", span_start=18, span_end=24),
+        ]
+        self.assertEqual(len(citations), 3)
+        filtered_citations = filter_citations(citations)
+        self.assertEqual(len(filtered_citations), 1)
+        self.assertEqual(type(filtered_citations[0]), FullCaseCitation)
+
     def test_disambiguate_citations(self):
         # fmt: off
         test_pairs = [
@@ -882,14 +912,18 @@ def test_reference_extraction(self):
         ]
         for plain_text in texts:
             citations = get_citations(plain_text)
-            citations[0].metadata.resolved_case_name = "State v. Wingler"
-            references = extract_reference_citations(citations[0], plain_text)
-            final_citations = filter_citations(citations + references)
-            self.assertEqual(
-                len(final_citations), 2, "There should only be 2 citations"
-            )
-            self.assertEqual(
-                len(references),
-                1,
-                "Only a reference citation should had been picked up",
-            )
+            found_cite = citations[0]
+            if isinstance(found_cite, FullCitation):
+                found_cite.metadata.resolved_case_name = "State v. Wingler"
+                references = extract_reference_citations(
+                    found_cite, plain_text
+                )
+                final_citations = filter_citations(citations + references)
+                self.assertEqual(
+                    len(final_citations), 2, "There should only be 2 citations"
+                )
+                self.assertEqual(
+                    len(references),
+                    1,
+                    "Only a reference citation should had been picked up",
+                )