Skip to content

Commit 7e8c212

Browse files
committed
Gate regex lookahead in cmake
1 parent 13abc73 commit 7e8c212

File tree

2 files changed

+34
-15
lines changed

2 files changed

+34
-15
lines changed

CMakeLists.txt

+23 -13
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@ project(Tokenizers)
1818

1919
option(TOKENIZERS_BUILD_TEST "Build tests" OFF)
2020
option(TOKENIZERS_BUILD_TOOLS "Build tools" OFF)
21+
option(SUPPORT_REGEX_LOOKAHEAD "Support regex lookahead patterns (requires PCRE2)" OFF)
2122

2223
# Ignore weak attribute warning
2324
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-attributes")
@@ -31,20 +32,25 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2)
3132
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece)
3233

3334
# Configure PCRE2
34-
set(PCRE2_BUILD_PCRE2_8 ON)
35-
set(PCRE2_BUILD_PCRE2_16 OFF)
36-
set(PCRE2_BUILD_PCRE2_32 OFF)
37-
set(PCRE2_BUILD_TESTS OFF)
38-
set(PCRE2_BUILD_PCRE2GREP OFF)
39-
set(PCRE2_BUILD_PCRE2TEST OFF)
40-
set(PCRE2_BUILD_PCRE2GPERF OFF)
41-
set(PCRE2_BUILD_DOCS OFF)
42-
set(PCRE2_BUILD_LIBPCRE2_PDB OFF)
43-
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2)
35+
if(SUPPORT_REGEX_LOOKAHEAD)
36+
set(PCRE2_BUILD_PCRE2_8 ON)
37+
set(PCRE2_BUILD_PCRE2_16 OFF)
38+
set(PCRE2_BUILD_PCRE2_32 OFF)
39+
set(PCRE2_BUILD_TESTS OFF)
40+
set(PCRE2_BUILD_PCRE2GREP OFF)
41+
set(PCRE2_BUILD_PCRE2TEST OFF)
42+
set(PCRE2_BUILD_PCRE2GPERF OFF)
43+
set(PCRE2_BUILD_DOCS OFF)
44+
set(PCRE2_BUILD_LIBPCRE2_PDB OFF)
45+
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2)
46+
endif()
4447

4548
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
4649

4750
file(GLOB tokenizers_source_files ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp)
51+
if(NOT SUPPORT_REGEX_LOOKAHEAD)
52+
list(REMOVE_ITEM tokenizers_source_files ${CMAKE_CURRENT_SOURCE_DIR}/src/pcre2_regex.cpp)
53+
endif()
4854
file(GLOB unicode_source_files
4955
${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/src/*.cpp)
5056
add_library(tokenizers STATIC ${tokenizers_source_files}
@@ -58,10 +64,14 @@ target_include_directories(
5864
${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece/src
5965
${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2
6066
${CMAKE_CURRENT_SOURCE_DIR}/third-party/json/single_include
61-
${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/include
62-
${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2/src)
67+
${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/include)
68+
target_link_libraries(tokenizers PUBLIC sentencepiece-static re2::re2)
6369

64-
target_link_libraries(tokenizers PUBLIC sentencepiece-static re2::re2 pcre2-8)
70+
if(SUPPORT_REGEX_LOOKAHEAD)
71+
target_include_directories(tokenizers PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2/src)
72+
target_link_libraries(tokenizers PUBLIC pcre2-8)
73+
target_compile_definitions(tokenizers PUBLIC SUPPORT_REGEX_LOOKAHEAD)
74+
endif()
6575

6676
# Build test
6777
if(TOKENIZERS_BUILD_TEST)

src/regex.cpp

+11 -2
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,9 @@
66
* LICENSE file in the root directory of this source tree.
77
*/
88

9+
#ifdef SUPPORT_REGEX_LOOKAHEAD
910
#include <pytorch/tokenizers/pcre2_regex.h>
11+
#endif
1012
#include <pytorch/tokenizers/re2_regex.h>
1113
#include <pytorch/tokenizers/regex.h>
1214
#include <pytorch/tokenizers/std_regex.h>
@@ -19,8 +21,8 @@ namespace tokenizers {
1921

2022
/**
2123
* @brief Factory function that creates a regex object using RE2 if possible.
22-
* Falls back to PCRE2 if RE2 rejects the pattern, then to std::regex if
23-
* PCRE2 fails.
24+
* Falls back to PCRE2 if RE2 rejects the pattern for using an unsupported Perl operator (e.g. lookahead) and SUPPORT_REGEX_LOOKAHEAD is enabled.
25+
* Otherwise, returns an error.
2426
*/
2527
Result<std::unique_ptr<IRegex>> create_regex(const std::string& pattern) {
2628
// Try RE2 first
@@ -30,6 +32,12 @@ Result<std::unique_ptr<IRegex>> create_regex(const std::string& pattern) {
3032
return static_cast<std::unique_ptr<IRegex>>(std::move(re2));
3133
}
3234

35+
#ifndef SUPPORT_REGEX_LOOKAHEAD
36+
std::cerr << "RE2 failed to compile pattern with lookahead: " << pattern << "\n";
37+
std::cerr << "Error: " << (re2->regex_->error()) << std::endl;
38+
std::cerr << "Compile with SUPPORT_REGEX_LOOKAHEAD=ON to enable support for lookahead patterns." << std::endl;
39+
return tokenizers::Error::LoadFailure;
40+
#else
3341
if (re2->regex_->error_code() == re2::RE2::ErrorBadPerlOp) {
3442
// RE2 doesn't support some Perl features, try PCRE2
3543
auto pcre2 = std::make_unique<Pcre2Regex>("(" + pattern + ")");
@@ -56,6 +64,7 @@ Result<std::unique_ptr<IRegex>> create_regex(const std::string& pattern) {
5664
std::cerr << "Error: " << (re2->regex_->error()) << std::endl;
5765
return tokenizers::Error::LoadFailure;
5866
}
67+
#endif
5968
}
6069

6170
} // namespace tokenizers

0 commit comments

Comments
 (0)