@@ -18,6 +18,7 @@ project(Tokenizers)
18
18
19
19
option (TOKENIZERS_BUILD_TEST "Build tests" OFF )
20
20
option (TOKENIZERS_BUILD_TOOLS "Build tools" OFF )
21
+ option (SUPPORT_REGEX_LOOKAHEAD "Support regex lookahead patterns (requires PCRE2)" OFF )
21
22
22
23
# Ignore weak attribute warning
23
24
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-attributes" )
@@ -31,20 +32,25 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2)
31
32
add_subdirectory (${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece)
32
33
33
34
# Configure PCRE2
34
- set (PCRE2_BUILD_PCRE2_8 ON )
35
- set (PCRE2_BUILD_PCRE2_16 OFF )
36
- set (PCRE2_BUILD_PCRE2_32 OFF )
37
- set (PCRE2_BUILD_TESTS OFF )
38
- set (PCRE2_BUILD_PCRE2GREP OFF )
39
- set (PCRE2_BUILD_PCRE2TEST OFF )
40
- set (PCRE2_BUILD_PCRE2GPERF OFF )
41
- set (PCRE2_BUILD_DOCS OFF )
42
- set (PCRE2_BUILD_LIBPCRE2_PDB OFF )
43
- add_subdirectory (${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2)
35
+ if (SUPPORT_REGEX_LOOKAHEAD)
36
+ set (PCRE2_BUILD_PCRE2_8 ON )
37
+ set (PCRE2_BUILD_PCRE2_16 OFF )
38
+ set (PCRE2_BUILD_PCRE2_32 OFF )
39
+ set (PCRE2_BUILD_TESTS OFF )
40
+ set (PCRE2_BUILD_PCRE2GREP OFF )
41
+ set (PCRE2_BUILD_PCRE2TEST OFF )
42
+ set (PCRE2_BUILD_PCRE2GPERF OFF )
43
+ set (PCRE2_BUILD_DOCS OFF )
44
+ set (PCRE2_BUILD_LIBPCRE2_PDB OFF )
45
+ add_subdirectory (${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2)
46
+ endif ()
44
47
45
48
set (CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag} )
46
49
47
50
file (GLOB tokenizers_source_files ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp)
51
+ if (NOT SUPPORT_REGEX_LOOKAHEAD)
52
+ list (REMOVE_ITEM tokenizers_source_files ${CMAKE_CURRENT_SOURCE_DIR}/src/pcre2_regex.cpp)
53
+ endif ()
48
54
file (GLOB unicode_source_files
49
55
${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/src/*.cpp)
50
56
add_library (tokenizers STATIC ${tokenizers_source_files}
@@ -58,10 +64,14 @@ target_include_directories(
58
64
${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece/src
59
65
${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2
60
66
${CMAKE_CURRENT_SOURCE_DIR}/third-party/json/single_include
61
- ${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/include
62
- ${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2/src )
67
+ ${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/include )
68
+ target_link_libraries (tokenizers PUBLIC sentencepiece-static re2::re2 )
63
69
64
- target_link_libraries (tokenizers PUBLIC sentencepiece-static re2::re2 pcre2-8)
70
+ if (SUPPORT_REGEX_LOOKAHEAD)
71
+ target_include_directories (tokenizers PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/third-party/pcre2/src)
72
+ target_link_libraries (tokenizers PUBLIC pcre2-8)
73
+ target_compile_definitions (tokenizers PUBLIC SUPPORT_REGEX_LOOKAHEAD)
74
+ endif ()
65
75
66
76
# Build test
67
77
if (TOKENIZERS_BUILD_TEST)
0 commit comments