diff --git a/CMakeLists.txt b/CMakeLists.txt
index f0ce71c..a81987f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -18,33 +18,56 @@ project(Tokenizers)
 option(TOKENIZERS_BUILD_TEST "Build tests" OFF)
 option(TOKENIZERS_BUILD_TOOLS "Build tools" OFF)
+option(SUPPORT_REGEX_LOOKAHEAD
+       "Support regex lookahead patterns (requires PCRE2)" OFF)
 
 # Ignore weak attribute warning
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-attributes")
 
 set(ABSL_ENABLE_INSTALL ON)
 set(ABSL_PROPAGATE_CXX_STD ON)
+
 set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/abseil-cpp)
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2)
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece)
 
 # Configure PCRE2
-set(PCRE2_BUILD_PCRE2_8 ON)
-set(PCRE2_BUILD_PCRE2_16 OFF)
-set(PCRE2_BUILD_PCRE2_32 OFF)
-set(PCRE2_BUILD_TESTS OFF)
-set(PCRE2_BUILD_PCRE2GREP OFF)
-set(PCRE2_BUILD_PCRE2TEST OFF)
-set(PCRE2_BUILD_PCRE2GPERF OFF)
-set(PCRE2_BUILD_DOCS OFF)
-set(PCRE2_BUILD_LIBPCRE2_PDB OFF)
-add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2)
+if(SUPPORT_REGEX_LOOKAHEAD OR TOKENIZERS_BUILD_TEST)
+  set(PCRE2_BUILD_PCRE2_8 ON)
+  set(PCRE2_BUILD_PCRE2_16 OFF)
+  set(PCRE2_BUILD_PCRE2_32 OFF)
+  set(PCRE2_BUILD_TESTS OFF)
+  set(PCRE2_BUILD_PCRE2GREP OFF)
+  set(PCRE2_BUILD_PCRE2TEST OFF)
+  set(PCRE2_BUILD_PCRE2GPERF OFF)
+  set(PCRE2_BUILD_DOCS OFF)
+  set(PCRE2_BUILD_LIBPCRE2_PDB OFF)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2)
+endif()
 set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
 
 file(GLOB tokenizers_source_files ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp)
+set(tokenizers_source_files
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/bpe_tokenizer_base.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/hf_tokenizer.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/llama2c_tokenizer.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/pre_tokenizer.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/re2_regex.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/regex.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/sentencepiece.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/tiktoken.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/token_decoder.cpp
+)
+if(SUPPORT_REGEX_LOOKAHEAD OR TOKENIZERS_BUILD_TEST)
+  list(APPEND
+       tokenizers_source_files
+       ${CMAKE_CURRENT_SOURCE_DIR}/src/pcre2_regex.cpp
+       ${CMAKE_CURRENT_SOURCE_DIR}/src/std_regex.cpp)
+endif()
 
 file(GLOB unicode_source_files
      ${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/src/*.cpp)
 
 add_library(tokenizers STATIC ${tokenizers_source_files}
@@ -58,10 +81,16 @@ target_include_directories(
   ${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece/src
   ${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2
   ${CMAKE_CURRENT_SOURCE_DIR}/third-party/json/single_include
-  ${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/include
-  ${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2/src)
+  ${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/include)
+target_link_libraries(tokenizers PUBLIC sentencepiece-static re2::re2)
 
-target_link_libraries(tokenizers PUBLIC sentencepiece-static re2::re2 pcre2-8)
+if(SUPPORT_REGEX_LOOKAHEAD OR TOKENIZERS_BUILD_TEST)
+  target_include_directories(tokenizers
+                             PUBLIC
+                             ${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2/src)
+  target_link_libraries(tokenizers PUBLIC pcre2-8)
+  target_compile_definitions(tokenizers PUBLIC SUPPORT_REGEX_LOOKAHEAD)
+endif()
 
 # Build test
 if(TOKENIZERS_BUILD_TEST)
diff --git a/src/regex.cpp b/src/regex.cpp
index 873b270..5b913cd 100644
--- a/src/regex.cpp
+++ b/src/regex.cpp
@@ -6,7 +6,9 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#ifdef SUPPORT_REGEX_LOOKAHEAD
 #include <pytorch/tokenizers/pcre2_regex.h>
+#endif
 #include <pytorch/tokenizers/re2_regex.h>
 #include <pytorch/tokenizers/regex.h>
 #include <pytorch/tokenizers/std_regex.h>
@@ -19,8 +21,8 @@ namespace tokenizers {
 
 /**
  * @brief Factory function that creates a regex object using RE2 if possible.
- *        Falls back to PCRE2 if RE2 rejects the pattern, then to std::regex if
- *        PCRE2 fails.
+ *        Falls back to PCRE2 if RE2 rejects the pattern and
+ *        SUPPORT_REGEX_LOOKAHEAD is enabled. Otherwise, returns an error.
  */
 Result<std::unique_ptr<IRegex>> create_regex(const std::string& pattern) {
   // Try RE2 first
@@ -30,6 +32,15 @@ Result<std::unique_ptr<IRegex>> create_regex(const std::string& pattern) {
     return static_cast<std::unique_ptr<IRegex>>(std::move(re2));
   }
 
+#ifndef SUPPORT_REGEX_LOOKAHEAD
+  std::cerr << "RE2 failed to compile pattern with lookahead: " << pattern
+            << "\n";
+  std::cerr << "Error: " << (re2->regex_->error()) << std::endl;
+  std::cerr
+      << "Compile with SUPPORT_REGEX_LOOKAHEAD=ON to enable support for lookahead patterns."
+      << std::endl;
+  return tokenizers::Error::LoadFailure;
+#else
   if (re2->regex_->error_code() == re2::RE2::ErrorBadPerlOp) {
     // RE2 doesn't support some Perl features, try PCRE2
     auto pcre2 = std::make_unique<Pcre2Regex>("(" + pattern + ")");
@@ -56,6 +67,7 @@ Result<std::unique_ptr<IRegex>> create_regex(const std::string& pattern) {
     std::cerr << "Error: " << (re2->regex_->error()) << std::endl;
     return tokenizers::Error::LoadFailure;
   }
+#endif
 }
 
 } // namespace tokenizers
diff --git a/targets.bzl b/targets.bzl
index eccada8..1d1a193 100644
--- a/targets.bzl
+++ b/targets.bzl
@@ -14,7 +14,7 @@ def define_common_targets():
         name = "headers",
         exported_headers = subdir_glob([
             ("include", "pytorch/tokenizers/*.h"),
-        ]),
+        ], exclude = ["pcre2_regex.h", "std_regex.h"]),
         visibility = [
             "@EXECUTORCH_CLIENTS",
             "//pytorch/tokenizers/...",
@@ -23,20 +23,19 @@ def define_common_targets():
         platforms = PLATFORMS,
     )
 
+    # TODO: add a target for regex which does lookahead with pcre2
+    # by adding "-DSUPPORT_REGEX_LOOKAHEAD" as a compiler flag
+    # and including pcre2 dependencies.
     runtime.cxx_library(
         name = "regex",
-        srcs = ["src/regex.cpp"] + glob([
-            "src/*_regex.cpp",
-        ]),
-        exported_headers = subdir_glob([
-            ("include", "pytorch/tokenizers/regex.h"),
-            ("include", "pytorch/tokenizers/*_regex.h"),
-        ]),
+        srcs = [
+            "src/regex.cpp",
+            "src/re2_regex.cpp",
+        ],
         exported_deps = [
             ":headers",
         ],
         exported_external_deps = [
-            "pcre2",
             "re2",
         ],
         visibility = ["//pytorch/tokenizers/..."],
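
Usage note (not part of the patch): a minimal sketch of how a consuming CMake project might opt into the new lookahead support. The SUPPORT_REGEX_LOOKAHEAD option and the tokenizers library target come from the diff above; the third-party/tokenizers path and the my_app target are hypothetical placeholders.

    # Hypothetical consumer CMakeLists.txt fragment (sketch).
    # Enable the PCRE2-backed lookahead fallback before adding the subdirectory;
    # without it, create_regex() reports a LoadFailure for lookahead patterns.
    set(SUPPORT_REGEX_LOOKAHEAD ON CACHE BOOL "Support regex lookahead patterns" FORCE)
    add_subdirectory(third-party/tokenizers)

    add_executable(my_app main.cpp)
    # SUPPORT_REGEX_LOOKAHEAD is a PUBLIC compile definition on tokenizers,
    # so it propagates to my_app automatically.
    target_link_libraries(my_app PRIVATE tokenizers)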