diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 3f9638a3..f810cb43 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -10,7 +10,7 @@ on:
 
 jobs:
   build:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-latest
     strategy:
       fail-fast: false
       matrix:
diff --git a/CHANGES.md b/CHANGES.md
index fe69f238..312c262e 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -8,10 +8,11 @@ Features:
 - None
 
 Changes:
-- None
+- Modifies the section regex and punctuation regex
 
 Fixes:
-- None
+- Fixes the GitHub Actions workflow
+- Fixes an issue in `extract_tokens()` where the regex re-run could fail on certain Hyperscan matches
 
 ## Current
 
diff --git a/eyecite/clean.py b/eyecite/clean.py
index 985ad31b..e2b672b9 100644
--- a/eyecite/clean.py
+++ b/eyecite/clean.py
@@ -78,7 +78,10 @@ def all_whitespace(text: str) -> str:
     Returns:
         Text with collapsed whitespace characters.
     """
-    return re.sub(r"\s+", " ", text)
+    WHITESPACE_REGEX = (
+        r"[ \t\n\r\f\v\u00A0\u2002\u2003\u2009\u200B\u202F\u205F]+"
+    )
+    return re.sub(WHITESPACE_REGEX, " ", text)
 
 
 def underscores(text: str) -> str:
diff --git a/eyecite/regexes.py b/eyecite/regexes.py
index 5c3a4366..65954255 100644
--- a/eyecite/regexes.py
+++ b/eyecite/regexes.py
@@ -52,7 +52,7 @@ def short_cite_re(regex):
 
 # Regex to match punctuation around volume numbers and stopwords.
 # This could potentially be more precise.
-PUNCTUATION_REGEX = r"[^\sa-zA-Z0-9]*"
+PUNCTUATION_REGEX = r"[^\sa-zA-Z0-9]{,3}"
 
 # Regex for IdToken
 ID_REGEX = space_boundaries_re(r"id\.,?|ibid\.")
@@ -79,7 +79,7 @@ def short_cite_re(regex):
 )
 
 # Regex for SectionToken
-SECTION_REGEX = r"(\S*§\S*)"
+SECTION_REGEX = space_boundaries_re(r"([\w\.\,\-]*§[\w\.\,\-]*)")
 
 # Regex for ParagraphToken
 PARAGRAPH_REGEX = r"(\n)"
diff --git a/eyecite/tokenizers.py b/eyecite/tokenizers.py
index ba9ea7d9..2b7cd495 100644
--- a/eyecite/tokenizers.py
+++ b/eyecite/tokenizers.py
@@ -466,6 +466,9 @@ def on_match(index, start, end, flags, context):
             start = byte_to_str_offset[start]
             end = byte_to_str_offset[end]
             m = extractor.compiled_regex.match(text[start:end])
+            if not m:
+                # Skip if the re-run regex fails to detect a match.
+                continue
             yield extractor.get_token(m, offset=start)
 
     @property
diff --git a/tests/test_FindTest.py b/tests/test_FindTest.py
index 17e28af7..c7871120 100644
--- a/tests/test_FindTest.py
+++ b/tests/test_FindTest.py
@@ -630,6 +630,12 @@ def test_find_citations(self):
                            metadata={'plaintiff': 'Commonwealth', 'defendant': 'Muniz',
                                      'court': 'pa'})]),
            ('Foo v. Bar, 1 F.Supp. 1 (SC 1967)', [case_citation(volume='1', reporter='F.Supp.', year=1967, page='1', metadata={'plaintiff': 'Foo', 'defendant': 'Bar', 'court': 'sc'})]),
+           ('Shady Grove Farms \xa0v Goldsmith Seeds 1 U.S. 1 (1981)', [
+               case_citation(year=1981,
+                             metadata={'defendant': 'Goldsmith Seeds',
+                                       'plaintiff': 'Farms',
+                                       'court': 'scotus'})],
+            {'clean': ['all_whitespace']}),
        )
        # fmt: on
        self.run_test_pairs(test_pairs, "Citation extraction")
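
A quick standalone illustration of the new whitespace cleaner. The character class is copied from the `eyecite/clean.py` hunk above; the surrounding driver code is only a sketch, not eyecite's implementation. Unlike `\s`, the explicit class also collapses zero-width spaces (U+200B), and a run mixing a regular space with a non-breaking space shrinks to a single space, which is what the new `Shady Grove Farms` test case relies on.

```python
import re

# Character class copied from the eyecite/clean.py hunk; everything else here
# is illustrative only.
WHITESPACE_REGEX = r"[ \t\n\r\f\v\u00A0\u2002\u2003\u2009\u200B\u202F\u205F]+"


def all_whitespace(text: str) -> str:
    """Collapse runs of the listed whitespace characters into a single space."""
    return re.sub(WHITESPACE_REGEX, " ", text)


# A space followed by a non-breaking space collapses to one space, matching the
# new test pair that is cleaned with ['all_whitespace'].
print(all_whitespace("Shady Grove Farms \u00a0v Goldsmith Seeds 1 U.S. 1 (1981)"))
# Shady Grove Farms v Goldsmith Seeds 1 U.S. 1 (1981)

# Zero-width spaces (U+200B) are not matched by \s but are collapsed here.
print(all_whitespace("1\u200bU.S.\u200b1"))
# 1 U.S. 1
```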
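The two regex changes can also be compared in isolation. The sketch below uses the raw patterns from the `eyecite/regexes.py` hunks; note that the real `SECTION_REGEX` is additionally wrapped by `space_boundaries_re()`, which is not reproduced here, and the example text is invented. The bounded section pattern no longer swallows adjacent punctuation such as parentheses, and the punctuation pattern now matches at most three characters instead of an unbounded run.

```python
import re

# Patterns taken from the diff; example strings are made up for illustration.
OLD_SECTION = r"(\S*§\S*)"
NEW_SECTION_BODY = r"([\w\.\,\-]*§[\w\.\,\-]*)"  # inner pattern, before space_boundaries_re()

text = "See 42 U.S.C. §1983(b) for details."
print(re.search(OLD_SECTION, text).group(1))       # §1983(b) -- old pattern swallows the parenthetical
print(re.search(NEW_SECTION_BODY, text).group(1))  # §1983    -- new class excludes '(' and ')'

OLD_PUNCTUATION = r"[^\sa-zA-Z0-9]*"
NEW_PUNCTUATION = r"[^\sa-zA-Z0-9]{,3}"
print(bool(re.fullmatch(OLD_PUNCTUATION, "((((")))  # True  -- unbounded run of punctuation
print(bool(re.fullmatch(NEW_PUNCTUATION, "((((")))  # False -- capped at three characters
print(bool(re.fullmatch(NEW_PUNCTUATION, "((")))    # True
```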
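The guard added in `tokenizers.py` follows the usual prefilter-then-verify pattern: Hyperscan reports candidate byte spans, the precise Python regex is re-run on each span, and spans the precise pattern cannot confirm are now skipped instead of passing `None` downstream. A minimal sketch of that flow without the hyperscan dependency; the pattern, text, and spans below are invented for illustration and are not eyecite internals.

```python
import re

# Stand-in for the precise extractor regex (extractor.compiled_regex in eyecite).
compiled = re.compile(r"\d+ F\.Supp\. \d+")

text = "Foo v. Bar, 1 F.Supp. 1 (SC 1967)"

# Stand-in for the prefilter's output: candidate (start, end) spans.
# The second span is a false positive the precise regex will reject.
candidate_spans = [(12, 23), (0, 3)]

tokens = []
for start, end in candidate_spans:
    m = compiled.match(text[start:end])
    if not m:
        # Mirrors the new guard: skip candidates the re-run regex cannot confirm.
        continue
    tokens.append((m.group(0), start))

print(tokens)  # [('1 F.Supp. 1', 12)]
```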