Partial sync of codebase

hauntsaninja · hauntsaninja · commit f51a3df4e51c · 2025-09-29T23:37:43.000-07:00
diff --git a/Cargo.toml b/Cargo.toml
@@ -14,13 +14,13 @@ python = [
 ]
 
 [dependencies]
-pyo3 = { version = "0.26", default-features = false, features = [
+pyo3 = { version = "0.26.0", default-features = false, features = [
     "extension-module",
     "macros",
 ], optional = true }
 
 # tiktoken dependencies
-fancy-regex = "0.16"
+fancy-regex = "0.13.0"
 regex = "1.10.3"
 rustc-hash = "2"
 bstr = "1.5.0"
diff --git a/pyproject.toml b/pyproject.toml
@@ -31,7 +31,6 @@ skip = [
   "*-manylinux_i686",
   "*-musllinux_i686",
   "*-win32",
-  "*-musllinux_aarch64",
 ]
 macos.archs = ["x86_64", "arm64"]
 # When cross-compiling on Intel, it is not possible to test arm64 wheels.
diff --git a/src/py.rs b/src/py.rs
@@ -28,7 +28,7 @@ impl CoreBPE {
 
     #[pyo3(name = "encode_ordinary")]
     fn py_encode_ordinary(&self, py: Python, text: &str) -> Vec<Rank> {
-        py.allow_threads(|| self.encode_ordinary(text))
+        py.detach(|| self.encode_ordinary(text))
     }
 
     #[pyo3(name = "encode")]
@@ -38,7 +38,7 @@ impl CoreBPE {
         text: &str,
         allowed_special: HashSet<PyBackedStr>,
     ) -> PyResult<Vec<Rank>> {
-        py.allow_threads(|| {
+        py.detach(|| {
             let allowed_special: HashSet<&str> =
                 allowed_special.iter().map(|s| s.as_ref()).collect();
             match self.encode(text, &allowed_special) {
@@ -54,7 +54,7 @@ impl CoreBPE {
         text: &str,
         allowed_special: HashSet<PyBackedStr>,
     ) -> PyResult<Py<PyAny>> {
-        let tokens_res = py.allow_threads(|| {
+        let tokens_res = py.detach(|| {
             let allowed_special: HashSet<&str> =
                 allowed_special.iter().map(|s| s.as_ref()).collect();
             self.encode(text, &allowed_special)
@@ -70,7 +70,7 @@ impl CoreBPE {
     }
 
     fn _encode_bytes(&self, py: Python, bytes: &[u8]) -> Vec<Rank> {
-        py.allow_threads(|| {
+        py.detach(|| {
             match std::str::from_utf8(bytes) {
                 // Straightforward case
                 Ok(text) => self.encode_ordinary(text),
@@ -121,7 +121,7 @@ impl CoreBPE {
         text: &str,
         allowed_special: HashSet<PyBackedStr>,
     ) -> PyResult<(Vec<Rank>, Py<PyList>)> {
-        let (tokens, completions): (Vec<Rank>, HashSet<Vec<Rank>>) = py.allow_threads(|| {
+        let (tokens, completions): (Vec<Rank>, HashSet<Vec<Rank>>) = py.detach(|| {
             let allowed_special: HashSet<&str> =
                 allowed_special.iter().map(|s| s.as_ref()).collect();
             self._encode_unstable_native(text, &allowed_special)
@@ -155,7 +155,7 @@ impl CoreBPE {
 
     #[pyo3(name = "decode_bytes")]
     fn py_decode_bytes(&self, py: Python, tokens: Vec<Rank>) -> Result<Py<PyBytes>, PyErr> {
-        match py.allow_threads(|| self.decode_bytes(&tokens)) {
+        match py.detach(|| self.decode_bytes(&tokens)) {
             Ok(bytes) => Ok(PyBytes::new(py, &bytes).into()),
             Err(e) => Err(pyo3::exceptions::PyKeyError::new_err(format!("{}", e))),
         }
diff --git a/tiktoken/core.py b/tiktoken/core.py
@@ -4,11 +4,11 @@
 from concurrent.futures import ThreadPoolExecutor
 from typing import TYPE_CHECKING, AbstractSet, Collection, Literal, NoReturn, Sequence
 
-import regex
-
 from tiktoken import _tiktoken
 
 if TYPE_CHECKING:
+    import re
+
     import numpy as np
     import numpy.typing as npt
 
@@ -391,6 +391,9 @@ def _encode_single_piece(self, text_or_bytes: str | bytes) -> list[int]:
 
     def _encode_only_native_bpe(self, text: str) -> list[int]:
         """Encodes a string into tokens, but do regex splitting in Python."""
+        # We need specifically `regex` in order to compile pat_str due to e.g. \p
+        import regex
+
         _unused_pat = regex.compile(self._pat_str)
         ret = []
         for piece in regex.findall(_unused_pat, text):

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -31,7 +31,6 @@ skip = [` |
| `31` | `31` | `"*-manylinux_i686",` |
| `32` | `32` | `"*-musllinux_i686",` |
| `33` | `33` | `"*-win32",` |
| `34` | | `- "*-musllinux_aarch64",` |
| `35` | `34` | `]` |
| `36` | `35` | `macos.archs = ["x86_64", "arm64"]` |
| `37` | `36` | `# When cross-compiling on Intel, it is not possible to test arm64 wheels.` |