diff --git a/eyecite/find.py b/eyecite/find.py index 90985be..5037556 100644 --- a/eyecite/find.py +++ b/eyecite/find.py @@ -29,6 +29,7 @@ ) from eyecite.regexes import SHORT_CITE_ANTECEDENT_REGEX, SUPRA_ANTECEDENT_REGEX from eyecite.tokenizers import Tokenizer, default_tokenizer +from eyecite.utils import DISALLOWED_NAMES def get_citations( @@ -153,6 +154,7 @@ def is_valid_name(name: str) -> bool: and name[0].isupper() and not name.endswith(".") and not name.isdigit() + and name.lower() not in DISALLOWED_NAMES ) regexes = [ diff --git a/eyecite/utils.py b/eyecite/utils.py index c606a32..0b920d3 100644 --- a/eyecite/utils.py +++ b/eyecite/utils.py @@ -4,6 +4,107 @@ from lxml import etree +# Lowercase names not allowed to be reference citations (matched via .lower()); +# this is partially taken from juriscraper +DISALLOWED_NAMES = frozenset({ + # Common options + "state", + "united states", + "people", + "commonwealth", + "mass", + "commissioner", + # AGs + "akerman", + "ashcroft", + "barr", + "bates", + "bell", + "berrien", + "biddle", + "black", + "bonaparte", + "bork", + "bondi", + "bradford", + "breckinridge", + "brewster", + "brownell", + "butler", + "civiletti", + "clark", + "clement", + "clifford", + "crittenden", + "cummings", + "cushing", + "daugherty", + "devens", + "evarts", + "filip", + "garland", + "gerson", + "gilpin", + "gonzales", + "gregory", + "griggs", + "grundy", + "harmon", + "hoar", + "holder", + "jackson", + "johnson", + "katzenbach", + "keisler", + "kennedy", + "kleindienst", + "knox", + "lee", + "legaré", + "levi", + "lincoln", + "lynch", + "macveagh", + "mason", + "mcgranery", + "mcgrath", + "mckenna", + "mcreynolds", + "meese", + "miller", + "mitchell", + "moody", + "mukasey", + "murphy", + "nelson", + "olney", + "palmer", + "pierrepont", + "pinkney", + "randolph", + "reno", + "richardson", + "rodney", + "rogers", + "rush", + "sargent", + "saxbe", + "sessions", + "smith", + "speed", + "stanbery", + "stanton", + "stone", + "taft", + "taney", + "thornburgh", + "toucey", + "whitaker", 
+ "wickersham", + "williams", + "wirt", +}) + def strip_punct(text: str) -> str: """Strips punctuation from a given string diff --git a/tests/test_FindTest.py b/tests/test_FindTest.py index 291c9ae..00a334a 100644 --- a/tests/test_FindTest.py +++ b/tests/test_FindTest.py @@ -502,6 +502,12 @@ def test_find_citations(self): 'defendant': 'Bar', 'pin_cite': '347-348'}), reference_citation('Foo at 62', metadata={'plaintiff': 'Foo', 'pin_cite': '62'})]), + ('Foo v. United States 1 U.S. 12, 347-348. something something ... the United States at 1776 we see that and Foo at 62', + [case_citation(page='12', + metadata={'plaintiff': 'Foo', + 'defendant': 'United States', + 'pin_cite': '347-348'}), + reference_citation('Foo at 62', metadata={'plaintiff': 'Foo', 'pin_cite': '62'})]), # Test that reference citation must occur after full case citation ('In Foo at 62 we see that, Foo v. Bar 1 U.S. 12, 347-348. something something,', [case_citation(page='12',