@@ -18,6 +18,8 @@ project(Tokenizers)
18
18
19
19
option (TOKENIZERS_BUILD_TEST "Build tests" OFF )
20
20
option (TOKENIZERS_BUILD_TOOLS "Build tools" OFF )
21
+ option (SUPPORT_REGEX_LOOKAHEAD
22
+ "Support regex lookahead patterns (requires PCRE2)" OFF )
21
23
22
24
# Ignore weak attribute warning
23
25
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-attributes" )
@@ -31,20 +33,27 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2)
31
33
add_subdirectory (${CMAKE_CURRENT_SOURCE_DIR} /third-party/sentencepiece)
32
34
33
35
# Configure PCRE2
34
- set (PCRE2_BUILD_PCRE2_8 ON )
35
- set (PCRE2_BUILD_PCRE2_16 OFF )
36
- set (PCRE2_BUILD_PCRE2_32 OFF )
37
- set (PCRE2_BUILD_TESTS OFF )
38
- set (PCRE2_BUILD_PCRE2GREP OFF )
39
- set (PCRE2_BUILD_PCRE2TEST OFF )
40
- set (PCRE2_BUILD_PCRE2GPERF OFF )
41
- set (PCRE2_BUILD_DOCS OFF )
42
- set (PCRE2_BUILD_LIBPCRE2_PDB OFF )
43
- add_subdirectory (${CMAKE_CURRENT_SOURCE_DIR} /third-party/pcre2)
36
+ if (SUPPORT_REGEX_LOOKAHEAD)
37
+ set (PCRE2_BUILD_PCRE2_8 ON )
38
+ set (PCRE2_BUILD_PCRE2_16 OFF )
39
+ set (PCRE2_BUILD_PCRE2_32 OFF )
40
+ set (PCRE2_BUILD_TESTS OFF )
41
+ set (PCRE2_BUILD_PCRE2GREP OFF )
42
+ set (PCRE2_BUILD_PCRE2TEST OFF )
43
+ set (PCRE2_BUILD_PCRE2GPERF OFF )
44
+ set (PCRE2_BUILD_DOCS OFF )
45
+ set (PCRE2_BUILD_LIBPCRE2_PDB OFF )
46
+ add_subdirectory (${CMAKE_CURRENT_SOURCE_DIR} /third-party/pcre2)
47
+ endif ()
44
48
45
49
set (CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag} )
46
50
47
51
file (GLOB tokenizers_source_files ${CMAKE_CURRENT_SOURCE_DIR} /src/*.cpp)
52
+ if (NOT SUPPORT_REGEX_LOOKAHEAD)
53
+ list (REMOVE_ITEM
54
+ tokenizers_source_files
55
+ ${CMAKE_CURRENT_SOURCE_DIR} /src/pcre2_regex.cpp)
56
+ endif ()
48
57
file (GLOB unicode_source_files
49
58
${CMAKE_CURRENT_SOURCE_DIR} /third-party/llama.cpp-unicode/src/*.cpp)
50
59
add_library (tokenizers STATIC ${tokenizers_source_files}
@@ -58,10 +67,16 @@ target_include_directories(
58
67
${CMAKE_CURRENT_SOURCE_DIR} /third-party/sentencepiece/src
59
68
${CMAKE_CURRENT_SOURCE_DIR} /third-party/re2
60
69
${CMAKE_CURRENT_SOURCE_DIR} /third-party/json/single_include
61
- ${CMAKE_CURRENT_SOURCE_DIR} /third-party/llama.cpp-unicode/include
62
- ${CMAKE_CURRENT_SOURCE_DIR} /third-party/pcre2/src )
70
+ ${CMAKE_CURRENT_SOURCE_DIR} /third-party/llama.cpp-unicode/include )
71
+ target_link_libraries (tokenizers PUBLIC sentencepiece- static re2::re2 )
63
72
64
- target_link_libraries (tokenizers PUBLIC sentencepiece-static re2::re2 pcre2-8)
73
+ if (SUPPORT_REGEX_LOOKAHEAD)
74
+ target_include_directories (tokenizers
75
+ PUBLIC
76
+ ${CMAKE_CURRENT_SOURCE_DIR} /third-party/pcre2/src)
77
+ target_link_libraries (tokenizers PUBLIC pcre2-8)
78
+ target_compile_definitions (tokenizers PUBLIC SUPPORT_REGEX_LOOKAHEAD)
79
+ endif ()
65
80
66
81
# Build test
67
82
if (TOKENIZERS_BUILD_TEST)
0 commit comments