From 964c93430fa3e39a3d738b00fc8e9c1ded65f3da Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Wed, 23 Apr 2025 16:39:45 -0700 Subject: [PATCH 1/2] Gate regex lookahead in cmake behind compile flag (#59) Summary: Make the pcre2 fallback optional to offer library size savings ``` cmake -DSUPPORT_REGEX_LOOKAHEAD=ON -DTOKENIZERS_BUILD_TOOLS=ON . -Bbuild && cmake --build build -j9 stat -f "%z %N" build/libtokenizers.a 6394880 build/libtokenizers.a cmake -DTOKENIZERS_BUILD_TOOLS=ON . -Bbuild && cmake --build build -j9 stat -f "%z %N" build/libtokenizers.a 6311960 build/libtokenizers.a # Difference of 6394880 - 6311960 = 82920 bytes ``` BSB results on IOS `messenger-distribution`: ``` Changes for variation [arm64 + 3x assets]: Compressed : -80.4 KiB (-0.12%) => 64.92 MiB Uncompressed: -235.2 KiB (-0.17%) => 138.98 MiB ``` Compared to app size regression reported from 222011767: ``` uncompressed_delta_size_arm64@3x: change of 233058 (0.16%) exceeds threshold of 40960 ``` Differential Revision: D73530475 Pulled By: jackzhxng --- CMakeLists.txt | 41 ++++++++++++++++++++++++++++------------- src/regex.cpp | 16 ++++++++++++++-- targets.bzl | 17 ++++++++--------- 3 files changed, 50 insertions(+), 24 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f0ce71c..46e0dee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,6 +18,8 @@ project(Tokenizers) option(TOKENIZERS_BUILD_TEST "Build tests" OFF) option(TOKENIZERS_BUILD_TOOLS "Build tools" OFF) +option(SUPPORT_REGEX_LOOKAHEAD + "Support regex lookahead patterns (requires PCRE2)" OFF) # Ignore weak attribute warning set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-attributes") @@ -31,20 +33,27 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece) # Configure PCRE2 -set(PCRE2_BUILD_PCRE2_8 ON) -set(PCRE2_BUILD_PCRE2_16 OFF) -set(PCRE2_BUILD_PCRE2_32 OFF) -set(PCRE2_BUILD_TESTS OFF) 
-set(PCRE2_BUILD_PCRE2GREP OFF) -set(PCRE2_BUILD_PCRE2TEST OFF) -set(PCRE2_BUILD_PCRE2GPERF OFF) -set(PCRE2_BUILD_DOCS OFF) -set(PCRE2_BUILD_LIBPCRE2_PDB OFF) -add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2) +if(SUPPORT_REGEX_LOOKAHEAD) + set(PCRE2_BUILD_PCRE2_8 ON) + set(PCRE2_BUILD_PCRE2_16 OFF) + set(PCRE2_BUILD_PCRE2_32 OFF) + set(PCRE2_BUILD_TESTS OFF) + set(PCRE2_BUILD_PCRE2GREP OFF) + set(PCRE2_BUILD_PCRE2TEST OFF) + set(PCRE2_BUILD_PCRE2GPERF OFF) + set(PCRE2_BUILD_DOCS OFF) + set(PCRE2_BUILD_LIBPCRE2_PDB OFF) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2) +endif() set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag}) file(GLOB tokenizers_source_files ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp) +if(NOT SUPPORT_REGEX_LOOKAHEAD) + list(REMOVE_ITEM + tokenizers_source_files + ${CMAKE_CURRENT_SOURCE_DIR}/src/pcre2_regex.cpp) +endif() file(GLOB unicode_source_files ${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/src/*.cpp) add_library(tokenizers STATIC ${tokenizers_source_files} @@ -58,10 +67,16 @@ target_include_directories( ${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece/src ${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2 ${CMAKE_CURRENT_SOURCE_DIR}/third-party/json/single_include - ${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/include - ${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2/src) + ${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/include) +target_link_libraries(tokenizers PUBLIC sentencepiece-static re2::re2) -target_link_libraries(tokenizers PUBLIC sentencepiece-static re2::re2 pcre2-8) +if(SUPPORT_REGEX_LOOKAHEAD) + target_include_directories(tokenizers + PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2/src) + target_link_libraries(tokenizers PUBLIC pcre2-8) + target_compile_definitions(tokenizers PUBLIC SUPPORT_REGEX_LOOKAHEAD) +endif() # Build test if(TOKENIZERS_BUILD_TEST) diff --git a/src/regex.cpp b/src/regex.cpp index 873b270..5b913cd 100644 --- a/src/regex.cpp +++ b/src/regex.cpp 
@@ -6,7 +6,9 @@ * LICENSE file in the root directory of this source tree. */ +#ifdef SUPPORT_REGEX_LOOKAHEAD #include +#endif #include #include #include @@ -19,8 +21,8 @@ namespace tokenizers { /** * @brief Factory function that creates a regex object using RE2 if possible. - * Falls back to PCRE2 if RE2 rejects the pattern, then to std::regex if - * PCRE2 fails. + * Falls back to PCRE2 if RE2 rejects the pattern and + * SUPPORT_REGEX_LOOKAHEAD is enabled. Otherwise, returns an error. */ Result> create_regex(const std::string& pattern) { // Try RE2 first @@ -30,6 +32,15 @@ Result> create_regex(const std::string& pattern) { return static_cast>(std::move(re2)); } +#ifndef SUPPORT_REGEX_LOOKAHEAD + std::cerr << "RE2 failed to compile pattern with lookahead: " << pattern + << "\n"; + std::cerr << "Error: " << (re2->regex_->error()) << std::endl; + std::cerr + << "Compile with SUPPORT_REGEX_LOOKAHEAD=ON to enable support for lookahead patterns." + << std::endl; + return tokenizers::Error::LoadFailure; +#else if (re2->regex_->error_code() == re2::RE2::ErrorBadPerlOp) { // RE2 doesn't support some Perl features, try PCRE2 auto pcre2 = std::make_unique("(" + pattern + ")"); @@ -56,6 +67,7 @@ Result> create_regex(const std::string& pattern) { std::cerr << "Error: " << (re2->regex_->error()) << std::endl; return tokenizers::Error::LoadFailure; } +#endif } } // namespace tokenizers diff --git a/targets.bzl b/targets.bzl index eccada8..1d1a193 100644 --- a/targets.bzl +++ b/targets.bzl @@ -14,7 +14,7 @@ def define_common_targets(): name = "headers", exported_headers = subdir_glob([ ("include", "pytorch/tokenizers/*.h"), - ]), + ], exclude = ["pcre2_regex.h", "std_regex.h"]), visibility = [ "@EXECUTORCH_CLIENTS", "//pytorch/tokenizers/...", @@ -23,20 +23,19 @@ def define_common_targets(): platforms = PLATFORMS, ) + # TODO: add target for regex which does lookahead with pcre2 + # by adding "-DSUPPORT_REGEX_LOOKAHEAD" as a compiler flag + # and including pcre2 dependencies. 
runtime.cxx_library( name = "regex", - srcs = ["src/regex.cpp"] + glob([ - "src/*_regex.cpp", - ]), - exported_headers = subdir_glob([ - ("include", "pytorch/tokenizers/regex.h"), - ("include", "pytorch/tokenizers/*_regex.h"), - ]), + srcs = [ + "src/regex.cpp", + "src/re2_regex.cpp", + ], exported_deps = [ ":headers", ], exported_external_deps = [ - "pcre2", "re2", ], visibility = ["//pytorch/tokenizers/..."], From 8e9b04ed8cd95d7f905f09d38055b04cfb5fde8c Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Wed, 23 Apr 2025 16:56:10 -0700 Subject: [PATCH 2/2] Fix test build --- CMakeLists.txt | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 46e0dee..a81987f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -26,14 +26,16 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-attributes") set(ABSL_ENABLE_INSTALL ON) set(ABSL_PROPAGATE_CXX_STD ON) + set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE}) set(CMAKE_POSITION_INDEPENDENT_CODE ON) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/abseil-cpp) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece) # Configure PCRE2 -if(SUPPORT_REGEX_LOOKAHEAD) +if(SUPPORT_REGEX_LOOKAHEAD OR TOKENIZERS_BUILD_TEST) set(PCRE2_BUILD_PCRE2_8 ON) set(PCRE2_BUILD_PCRE2_16 OFF) set(PCRE2_BUILD_PCRE2_32 OFF) @@ -49,10 +51,22 @@ endif() set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag}) file(GLOB tokenizers_source_files ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp) -if(NOT SUPPORT_REGEX_LOOKAHEAD) - list(REMOVE_ITEM +set(tokenizers_source_files + ${CMAKE_CURRENT_SOURCE_DIR}/src/bpe_tokenizer_base.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/hf_tokenizer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/llama2c_tokenizer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/pre_tokenizer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/re2_regex.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/src/regex.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/sentencepiece.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/tiktoken.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/token_decoder.cpp +) +if(SUPPORT_REGEX_LOOKAHEAD OR TOKENIZERS_BUILD_TEST) + list(APPEND tokenizers_source_files - ${CMAKE_CURRENT_SOURCE_DIR}/src/pcre2_regex.cpp) + ${CMAKE_CURRENT_SOURCE_DIR}/src/pcre2_regex.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/std_regex.cpp) endif() file(GLOB unicode_source_files ${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/src/*.cpp) @@ -70,7 +84,7 @@ target_include_directories( ${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/include) target_link_libraries(tokenizers PUBLIC sentencepiece-static re2::re2) -if(SUPPORT_REGEX_LOOKAHEAD) +if(SUPPORT_REGEX_LOOKAHEAD OR TOKENIZERS_BUILD_TEST) target_include_directories(tokenizers PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2/src)