Skip to content

Commit 964c934

Browse files
jackzhxng authored and facebook-github-bot committed
Gate regex lookahead in cmake behind compile flag (#59)
Summary: Make the pcre2 fallback optional to offer library size savings.

```
cmake -DSUPPORT_REGEX_LOOKAHEAD=ON -DTOKENIZERS_BUILD_TOOLS=ON . -Bbuild && cmake --build build -j9
stat -f "%z %N" build/libtokenizers.a
6394880 build/libtokenizers.a

cmake -DTOKENIZERS_BUILD_TOOLS=ON . -Bbuild && cmake --build build -j9
stat -f "%z %N" build/libtokenizers.a
6311960 build/libtokenizers.a

# Difference of 6394880 - 6311960 = 82920 bytes
```

BSB results on iOS `messenger-distribution`:

```
Changes for variation [arm64 + 3x assets]:
Compressed  : -80.4 KiB (-0.12%) => 64.92 MiB
Uncompressed: -235.2 KiB (-0.17%) => 138.98 MiB
```

Compared to the app size regression reported from 222011767:

```
uncompressed_delta_size_arm64@3x: change of 233058 (0.16%) exceeds threshold of 40960
```

Differential Revision: D73530475

Pulled By: jackzhxng
1 parent 13abc73 commit 964c934

File tree

3 files changed

+50
-24
lines changed

3 files changed

+50
-24
lines changed

CMakeLists.txt

+28-13
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ project(Tokenizers)
1818

1919
option(TOKENIZERS_BUILD_TEST "Build tests" OFF)
2020
option(TOKENIZERS_BUILD_TOOLS "Build tools" OFF)
21+
option(SUPPORT_REGEX_LOOKAHEAD
22+
"Support regex lookahead patterns (requires PCRE2)" OFF)
2123

2224
# Ignore weak attribute warning
2325
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-attributes")
@@ -31,20 +33,27 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2)
3133
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece)
3234

3335
# Configure PCRE2
34-
set(PCRE2_BUILD_PCRE2_8 ON)
35-
set(PCRE2_BUILD_PCRE2_16 OFF)
36-
set(PCRE2_BUILD_PCRE2_32 OFF)
37-
set(PCRE2_BUILD_TESTS OFF)
38-
set(PCRE2_BUILD_PCRE2GREP OFF)
39-
set(PCRE2_BUILD_PCRE2TEST OFF)
40-
set(PCRE2_BUILD_PCRE2GPERF OFF)
41-
set(PCRE2_BUILD_DOCS OFF)
42-
set(PCRE2_BUILD_LIBPCRE2_PDB OFF)
43-
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2)
36+
if(SUPPORT_REGEX_LOOKAHEAD)
37+
set(PCRE2_BUILD_PCRE2_8 ON)
38+
set(PCRE2_BUILD_PCRE2_16 OFF)
39+
set(PCRE2_BUILD_PCRE2_32 OFF)
40+
set(PCRE2_BUILD_TESTS OFF)
41+
set(PCRE2_BUILD_PCRE2GREP OFF)
42+
set(PCRE2_BUILD_PCRE2TEST OFF)
43+
set(PCRE2_BUILD_PCRE2GPERF OFF)
44+
set(PCRE2_BUILD_DOCS OFF)
45+
set(PCRE2_BUILD_LIBPCRE2_PDB OFF)
46+
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2)
47+
endif()
4448

4549
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
4650

4751
file(GLOB tokenizers_source_files ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp)
52+
if(NOT SUPPORT_REGEX_LOOKAHEAD)
53+
list(REMOVE_ITEM
54+
tokenizers_source_files
55+
${CMAKE_CURRENT_SOURCE_DIR}/src/pcre2_regex.cpp)
56+
endif()
4857
file(GLOB unicode_source_files
4958
${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/src/*.cpp)
5059
add_library(tokenizers STATIC ${tokenizers_source_files}
@@ -58,10 +67,16 @@ target_include_directories(
5867
${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece/src
5968
${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2
6069
${CMAKE_CURRENT_SOURCE_DIR}/third-party/json/single_include
61-
${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/include
62-
${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2/src)
70+
${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/include)
71+
target_link_libraries(tokenizers PUBLIC sentencepiece-static re2::re2)
6372

64-
target_link_libraries(tokenizers PUBLIC sentencepiece-static re2::re2 pcre2-8)
73+
if(SUPPORT_REGEX_LOOKAHEAD)
74+
target_include_directories(tokenizers
75+
PUBLIC
76+
${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2/src)
77+
target_link_libraries(tokenizers PUBLIC pcre2-8)
78+
target_compile_definitions(tokenizers PUBLIC SUPPORT_REGEX_LOOKAHEAD)
79+
endif()
6580

6681
# Build test
6782
if(TOKENIZERS_BUILD_TEST)

src/regex.cpp

+14-2
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,9 @@
66
* LICENSE file in the root directory of this source tree.
77
*/
88

9+
#ifdef SUPPORT_REGEX_LOOKAHEAD
910
#include <pytorch/tokenizers/pcre2_regex.h>
11+
#endif
1012
#include <pytorch/tokenizers/re2_regex.h>
1113
#include <pytorch/tokenizers/regex.h>
1214
#include <pytorch/tokenizers/std_regex.h>
@@ -19,8 +21,8 @@ namespace tokenizers {
1921

2022
/**
2123
* @brief Factory function that creates a regex object using RE2 if possible.
22-
* Falls back to PCRE2 if RE2 rejects the pattern, then to std::regex if
23-
* PCRE2 fails.
24+
* Falls back to PCRE2 if RE2 rejects the pattern and
25+
* SUPPORT_REGEX_LOOKAHEAD is enabled. Otherwise, returns an error.
2426
*/
2527
Result<std::unique_ptr<IRegex>> create_regex(const std::string& pattern) {
2628
// Try RE2 first
@@ -30,6 +32,15 @@ Result<std::unique_ptr<IRegex>> create_regex(const std::string& pattern) {
3032
return static_cast<std::unique_ptr<IRegex>>(std::move(re2));
3133
}
3234

35+
#ifndef SUPPORT_REGEX_LOOKAHEAD
36+
std::cerr << "RE2 failed to compile pattern with lookahead: " << pattern
37+
<< "\n";
38+
std::cerr << "Error: " << (re2->regex_->error()) << std::endl;
39+
std::cerr
40+
<< "Compile with SUPPORT_REGEX_LOOKAHEAD=ON to enable support for lookahead patterns."
41+
<< std::endl;
42+
return tokenizers::Error::LoadFailure;
43+
#else
3344
if (re2->regex_->error_code() == re2::RE2::ErrorBadPerlOp) {
3445
// RE2 doesn't support some Perl features, try PCRE2
3546
auto pcre2 = std::make_unique<Pcre2Regex>("(" + pattern + ")");
@@ -56,6 +67,7 @@ Result<std::unique_ptr<IRegex>> create_regex(const std::string& pattern) {
5667
std::cerr << "Error: " << (re2->regex_->error()) << std::endl;
5768
return tokenizers::Error::LoadFailure;
5869
}
70+
#endif
5971
}
6072

6173
} // namespace tokenizers

targets.bzl

+8-9
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ def define_common_targets():
1414
name = "headers",
1515
exported_headers = subdir_glob([
1616
("include", "pytorch/tokenizers/*.h"),
17-
]),
17+
], exclude = ["pcre2_regex.h", "std_regex.h"]),
1818
visibility = [
1919
"@EXECUTORCH_CLIENTS",
2020
"//pytorch/tokenizers/...",
@@ -23,20 +23,19 @@ def define_common_targets():
2323
platforms = PLATFORMS,
2424
)
2525

26+
# TODO: add target for regex which does lookahead with pcre2
27+
# by adding "-DSUPPORT_REGEX_LOOKAHEAD" as a compiler flag
28+
# and including pcre2 dependencies.
2629
runtime.cxx_library(
2730
name = "regex",
28-
srcs = ["src/regex.cpp"] + glob([
29-
"src/*_regex.cpp",
30-
]),
31-
exported_headers = subdir_glob([
32-
("include", "pytorch/tokenizers/regex.h"),
33-
("include", "pytorch/tokenizers/*_regex.h"),
34-
]),
31+
srcs = [
32+
"src/regex.cpp",
33+
"src/re2_regex.cpp",
34+
],
3535
exported_deps = [
3636
":headers",
3737
],
3838
exported_external_deps = [
39-
"pcre2",
4039
"re2",
4140
],
4241
visibility = ["//pytorch/tokenizers/..."],

0 commit comments

Comments
 (0)