Add fix for hyperscan tokenizer #235

Open · wants to merge 8 commits into main
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
@@ -10,7 +10,7 @@ on:

jobs:
  build:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
5 changes: 3 additions & 2 deletions CHANGES.md
@@ -8,10 +8,11 @@ Features:
- None

Changes:
-- None
+- Modifies section regex and punctuation regex

Fixes:
-- None
+- Fixes GitHub action
+- Fixed an issue in `extract_tokens()` where regex re-run could fail on certain Hyperscan matches.


## Current
5 changes: 4 additions & 1 deletion eyecite/clean.py
@@ -78,7 +78,10 @@ def all_whitespace(text: str) -> str:
    Returns:
        Text with collapsed whitespace characters.
    """
-    return re.sub(r"\s+", " ", text)
+    WHITESPACE_REGEX = (
+        r"[ \t\n\r\f\v\u00A0\u2002\u2003\u2009\u200B\u202F\u205F]+"
+    )
+    return re.sub(WHITESPACE_REGEX, " ", text)

Contributor comment:
The only character in the list that is not included in r"\s" is \u200b, so I would suggest:

    WHITESPACE_REGEX = r"[\u200b\s]+"

In [18]: [i for i in list("\t\n\r\f\v\u00A0\u2002\u2003\u2009\u200B\u202F\u205F") if not i.isspace()]
Out[18]: ['\u200b']

or

In [26]: re.sub(r"\s+", "", "\t\n\r\f\v\u00A0\u2002\u2003\u2009\u200B\u202F\u205F")
Out[26]: '\u200b'

This would be clearer, and it covers the possibility that \s matches more characters than those listed.


def underscores(text: str) -> str:
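A quick sketch of the reviewer's point, runnable in plain Python; the sample string is invented for illustration. Every character in the PR's explicit list except \u200b is already matched by \s, so a pattern that just adds \u200b to \s behaves the same as the explicit list:

    import re

    # Sample mixing a non-breaking space (\u00A0, matched by \s) with a
    # zero-width space (\u200b, Unicode category Cf, NOT matched by \s).
    text = "1\u00A0U.S.\u200b 1"

    print(re.sub(r"\s+", " ", text))          # '1 U.S.\u200b 1' -- \u200b survives
    print(re.sub(r"[\u200b\s]+", " ", text))  # '1 U.S. 1'       -- fully collapsed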
4 changes: 2 additions & 2 deletions eyecite/regexes.py
@@ -52,7 +52,7 @@ def short_cite_re(regex):

# Regex to match punctuation around volume numbers and stopwords.
# This could potentially be more precise.
-PUNCTUATION_REGEX = r"[^\sa-zA-Z0-9]*"
+PUNCTUATION_REGEX = r"[^\sa-zA-Z0-9]{,3}"
Contributor comment:
Why make these changes? Do you have examples or comments on why?
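For reference, a minimal sketch of what the quantifier change does (the sample string is made up): * matches an unbounded run of punctuation, while {,3} caps the match at three characters:

    import re

    OLD = r"[^\sa-zA-Z0-9]*"     # unbounded punctuation run
    NEW = r"[^\sa-zA-Z0-9]{,3}"  # at most three punctuation characters

    sample = "!!!!!§ 123"
    print(re.match(OLD, sample).group())  # '!!!!!§'
    print(re.match(NEW, sample).group())  # '!!!'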


# Regex for IdToken
ID_REGEX = space_boundaries_re(r"id\.,?|ibid\.")
@@ -79,7 +79,7 @@ def short_cite_re(regex):
)

# Regex for SectionToken
-SECTION_REGEX = r"(\S*§\S*)"
+SECTION_REGEX = space_boundaries_re(r"([\w\.\,\-]*§[\w\.\,\-]*)")
Contributor comment:
Escaping with \ is not needed inside [].

Contributor comment:
From the benchmark, it seems we are losing these?

  • §15.50(a)
  • [§]52-249a
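A sketch of that regression using the bare patterns (note the PR additionally wraps the new pattern in space_boundaries_re, which makes the [§]52-249a case fail outright, since the § there is not whitespace-bounded):

    import re

    OLD_SECTION = r"(\S*§\S*)"
    NEW_CORE = r"([\w.,-]*§[\w.,-]*)"  # new pattern minus the space_boundaries_re wrapper

    for s in ["§15.50(a)", "[§]52-249a"]:
        print(s, "->", re.search(OLD_SECTION, s).group(), "vs", re.search(NEW_CORE, s).group())
    # §15.50(a)  -> §15.50(a)  vs §15.50  (parenthetical dropped)
    # [§]52-249a -> [§]52-249a vs §       (brackets stop the match)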


# Regex for ParagraphToken
PARAGRAPH_REGEX = r"(\n)"
3 changes: 3 additions & 0 deletions eyecite/tokenizers.py
@@ -466,6 +466,9 @@ def on_match(index, start, end, flags, context):
            start = byte_to_str_offset[start]
            end = byte_to_str_offset[end]
            m = extractor.compiled_regex.match(text[start:end])
+            if not m:
+                # skip if re-run regex fails to detect match
+                continue
            yield extractor.get_token(m, offset=start)

Contributor comment:
This is not expected to be common, right? Why don't we put a logger.error here, so we can analyze what's going on in the edge cases?

@property
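One shape the reviewer's logger.error suggestion could take, as a self-contained sketch; the function and logger names are illustrative, not eyecite's actual API:

    import logging
    import re

    logger = logging.getLogger(__name__)

    def rematch_spans(text, spans, compiled_regex):
        """Re-run Python's re over candidate (start, end) spans reported by
        Hyperscan, logging the rare spans where the two engines disagree."""
        for start, end in spans:
            m = compiled_regex.match(text[start:end])
            if not m:
                # Should be uncommon; log it so the edge cases can be analyzed.
                logger.error("Hyperscan span %r failed re-match against %s",
                             text[start:end], compiled_regex.pattern)
                continue
            yield m, start

    # Illustrative usage with a made-up pattern and span:
    list(rematch_spans("1 U.S. 2", [(2, 6)], re.compile(r"\d+ U\.S\.")))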
6 changes: 6 additions & 0 deletions tests/test_FindTest.py
@@ -630,6 +630,12 @@ def test_find_citations(self):
            metadata={'plaintiff': 'Commonwealth', 'defendant': 'Muniz',
                      'court': 'pa'})]),
        ('Foo v. Bar, 1 F.Supp. 1 (SC 1967)', [case_citation(volume='1', reporter='F.Supp.', year=1967, page='1', metadata={'plaintiff': 'Foo', 'defendant': 'Bar', 'court': 'sc'})]),
+        ('Shady Grove Farms \xa0v Goldsmith Seeds 1 U.S. 1 (1981)', [
+            case_citation(year=1981,
+                          metadata={'defendant': 'Goldsmith Seeds',
+                                    'plaintiff': 'Farms',
+                                    'court': 'scotus'})],
+            {'clean': ['all_whitespace']}),
        )
        # fmt: on
        self.run_test_pairs(test_pairs, "Citation extraction")

Contributor comment:
This is an example for the whitespace change; however, this was already covered by the simple r"\s" regex, as pointed out in the other comment. I think you should try to cover the error cases we have seen. For example, this would still be broken:

In [32]: clean_text(' \x08*\x07\x07\u038bþİ\u038b\u202cڋ\u202a-\x14V\u202c\u202c', ["all_whitespace"])
Out[32]: ' \x08*\x07\x07\u038bþİ\u038b\u202cڋ\u202a-\x14V\u202c\u202c'
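The reviewer's still-broken example is made of control (Cc) and bidi format (Cf) characters, which no whitespace character class will catch. A sketch of a category-based cleaner that would cover them; whether eyecite should treat Cc/Cf as whitespace is a design question, not something this PR decides:

    import unicodedata

    def collapse_exotic_whitespace(text: str) -> str:
        """Collapse runs of whitespace plus Unicode control (Cc) and
        format (Cf) characters into a single space."""
        out, in_run = [], False
        for ch in text:
            if ch.isspace() or unicodedata.category(ch) in ("Cc", "Cf"):
                if not in_run:
                    out.append(" ")
                in_run = True
            else:
                out.append(ch)
                in_run = False
        return "".join(out)

    # The reviewer's failing input: \x07, \x08, \x14 are Cc; \u202a/\u202c are Cf.
    print(collapse_exotic_whitespace(' \x08*\x07\x07\u038bþİ\u038b\u202cڋ\u202a-\x14V\u202c\u202c'))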