Skip to content

Gate regex lookahead in cmake behind compile flag #59

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Apr 24, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 42 additions & 13 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,33 +18,56 @@ project(Tokenizers)

option(TOKENIZERS_BUILD_TEST "Build tests" OFF)
option(TOKENIZERS_BUILD_TOOLS "Build tools" OFF)
option(SUPPORT_REGEX_LOOKAHEAD
"Support regex lookahead patterns (requires PCRE2)" OFF)

# Ignore weak attribute warning
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-attributes")

set(ABSL_ENABLE_INSTALL ON)
set(ABSL_PROPAGATE_CXX_STD ON)

set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/abseil-cpp)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece)

# Configure PCRE2
set(PCRE2_BUILD_PCRE2_8 ON)
set(PCRE2_BUILD_PCRE2_16 OFF)
set(PCRE2_BUILD_PCRE2_32 OFF)
set(PCRE2_BUILD_TESTS OFF)
set(PCRE2_BUILD_PCRE2GREP OFF)
set(PCRE2_BUILD_PCRE2TEST OFF)
set(PCRE2_BUILD_PCRE2GPERF OFF)
set(PCRE2_BUILD_DOCS OFF)
set(PCRE2_BUILD_LIBPCRE2_PDB OFF)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2)
if(SUPPORT_REGEX_LOOKAHEAD OR TOKENIZERS_BUILD_TEST)
set(PCRE2_BUILD_PCRE2_8 ON)
set(PCRE2_BUILD_PCRE2_16 OFF)
set(PCRE2_BUILD_PCRE2_32 OFF)
set(PCRE2_BUILD_TESTS OFF)
set(PCRE2_BUILD_PCRE2GREP OFF)
set(PCRE2_BUILD_PCRE2TEST OFF)
set(PCRE2_BUILD_PCRE2GPERF OFF)
set(PCRE2_BUILD_DOCS OFF)
set(PCRE2_BUILD_LIBPCRE2_PDB OFF)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2)
endif()

set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})

file(GLOB tokenizers_source_files ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp)
set(tokenizers_source_files
${CMAKE_CURRENT_SOURCE_DIR}/src/bpe_tokenizer_base.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/hf_tokenizer.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/llama2c_tokenizer.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/pre_tokenizer.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/re2_regex.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/regex.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/sentencepiece.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/tiktoken.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/token_decoder.cpp
)
if(SUPPORT_REGEX_LOOKAHEAD OR TOKENIZERS_BUILD_TEST)
list(APPEND
tokenizers_source_files
${CMAKE_CURRENT_SOURCE_DIR}/src/pcre2_regex.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/std_regex.cpp)
endif()
file(GLOB unicode_source_files
${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/src/*.cpp)
add_library(tokenizers STATIC ${tokenizers_source_files}
Expand All @@ -58,10 +81,16 @@ target_include_directories(
${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece/src
${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2
${CMAKE_CURRENT_SOURCE_DIR}/third-party/json/single_include
${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/include
${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2/src)
${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/include)
target_link_libraries(tokenizers PUBLIC sentencepiece-static re2::re2)

target_link_libraries(tokenizers PUBLIC sentencepiece-static re2::re2 pcre2-8)
if(SUPPORT_REGEX_LOOKAHEAD OR TOKENIZERS_BUILD_TEST)
target_include_directories(tokenizers
PUBLIC
${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2/src)
target_link_libraries(tokenizers PUBLIC pcre2-8)
target_compile_definitions(tokenizers PUBLIC SUPPORT_REGEX_LOOKAHEAD)
endif()

# Build test
if(TOKENIZERS_BUILD_TEST)
Expand Down
16 changes: 14 additions & 2 deletions src/regex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@
* LICENSE file in the root directory of this source tree.
*/

#ifdef SUPPORT_REGEX_LOOKAHEAD
#include <pytorch/tokenizers/pcre2_regex.h>
#endif
#include <pytorch/tokenizers/re2_regex.h>
#include <pytorch/tokenizers/regex.h>
#include <pytorch/tokenizers/std_regex.h>
Expand All @@ -19,8 +21,8 @@ namespace tokenizers {

/**
* @brief Factory function that creates a regex object using RE2 if possible.
* Falls back to PCRE2 if RE2 rejects the pattern, then to std::regex if
* PCRE2 fails.
* Falls back to PCRE2 if RE2 rejects the pattern and
* SUPPORT_REGEX_LOOKAHEAD is enabled. Otherwise, returns an error.
*/
Result<std::unique_ptr<IRegex>> create_regex(const std::string& pattern) {
// Try RE2 first
Expand All @@ -30,6 +32,15 @@ Result<std::unique_ptr<IRegex>> create_regex(const std::string& pattern) {
return static_cast<std::unique_ptr<IRegex>>(std::move(re2));
}

#ifndef SUPPORT_REGEX_LOOKAHEAD
std::cerr << "RE2 failed to compile pattern with lookahead: " << pattern
<< "\n";
std::cerr << "Error: " << (re2->regex_->error()) << std::endl;
std::cerr
<< "Compile with SUPPORT_REGEX_LOOKAHEAD=ON to enable support for lookahead patterns."
<< std::endl;
return tokenizers::Error::LoadFailure;
#else
if (re2->regex_->error_code() == re2::RE2::ErrorBadPerlOp) {
// RE2 doesn't support some Perl features, try PCRE2
auto pcre2 = std::make_unique<Pcre2Regex>("(" + pattern + ")");
Expand All @@ -56,6 +67,7 @@ Result<std::unique_ptr<IRegex>> create_regex(const std::string& pattern) {
std::cerr << "Error: " << (re2->regex_->error()) << std::endl;
return tokenizers::Error::LoadFailure;
}
#endif
}

} // namespace tokenizers
17 changes: 8 additions & 9 deletions targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def define_common_targets():
name = "headers",
exported_headers = subdir_glob([
("include", "pytorch/tokenizers/*.h"),
]),
], exclude = ["pcre2_regex.h", "std_regex.h"]),
visibility = [
"@EXECUTORCH_CLIENTS",
"//pytorch/tokenizers/...",
Expand All @@ -23,20 +23,19 @@ def define_common_targets():
platforms = PLATFORMS,
)

# TODO: add target for regex which does lookahead with pcre2
# by adding "-DSUPPORT_REGEX_LOOKAHEAD" as a compiler flag
# and including pcre2 dependencies.
runtime.cxx_library(
name = "regex",
srcs = ["src/regex.cpp"] + glob([
"src/*_regex.cpp",
]),
exported_headers = subdir_glob([
("include", "pytorch/tokenizers/regex.h"),
("include", "pytorch/tokenizers/*_regex.h"),
]),
srcs = [
"src/regex.cpp",
"src/re2_regex.cpp",
],
exported_deps = [
":headers",
],
exported_external_deps = [
"pcre2",
"re2",
],
visibility = ["//pytorch/tokenizers/..."],
Expand Down