diff --git a/doc/pcre2_set_compile_extra_options.3 b/doc/pcre2_set_compile_extra_options.3 index fa15eadb2..c51e35f07 100644 --- a/doc/pcre2_set_compile_extra_options.3 +++ b/doc/pcre2_set_compile_extra_options.3 @@ -20,7 +20,7 @@ options are: .sp PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK Allow \eK in lookarounds .\" JOIN - PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES Allow \ex{d800} to \ex{dfff} + PCRE2_EXTRA_ALLOW_SURROGATES Allow \ex{d800} to \ex{dfff} in UTF-8 and UTF-32 modes .\" JOIN PCRE2_EXTRA_ALT_BSUX Extended alternate \eu, \eU, and diff --git a/doc/pcre2api.3 b/doc/pcre2api.3 index 84fdc9d49..ce3fbaa7a 100644 --- a/doc/pcre2api.3 +++ b/doc/pcre2api.3 @@ -1824,8 +1824,8 @@ Note also that setting PCRE2_NO_UTF_CHECK at compile time does not disable the error that is given if an escape sequence for an invalid Unicode code point is encountered in the pattern. In particular, the so-called "surrogate" code points (0xd800 to 0xdfff) are invalid. If you want to allow escape sequences -such as \ex{d800} you can set the PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES extra -option, as described in the section entitled "Extra compile options" +such as \ex{d800} you can set the PCRE2_EXTRA_ALLOW_SURROGATES extra option, +as described in the section entitled "Extra compile options" .\" HTML .\" below. @@ -1907,7 +1907,7 @@ assertions, following Perl's lead. This option is provided to re-enable the previous behaviour (act in positive lookarounds, ignore in negative ones) in case anybody is relying on it. .sp - PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES + PCRE2_EXTRA_ALLOW_SURROGATES .sp This option applies when compiling a pattern in UTF-8 or UTF-32 mode. It is forbidden in UTF-16 mode, and ignored in non-UTF modes. Unicode "surrogate" @@ -1924,10 +1924,16 @@ for the surrogates using escape sequences. The PCRE2_NO_UTF_CHECK option does not disable the error that occurs, because it applies only to the testing of input strings for UTF validity. 
.P -If the extra option PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is set, surrogate code -point values in UTF-8 and UTF-32 patterns no longer provoke errors and are +If the extra option PCRE2_EXTRA_ALLOW_SURROGATES is set, surrogate code point +values in UTF-8 and UTF-32 patterns no longer provoke errors and are incorporated in the compiled pattern. However, they can only match subject characters if the matching function is called with PCRE2_NO_UTF_CHECK set. +.P +Before 10.43 this option was known as PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES and that +is still available for backward compatibility, but the new name should be used +in new code to better reflect that it also applies to characters in that range +in UTF-32 as part of the pattern or subject, including characters encoded in +UTF-8 if found in the subject. .sp PCRE2_EXTRA_ALT_BSUX .sp diff --git a/doc/pcre2pattern.3 b/doc/pcre2pattern.3 index 266a6c2bd..ee4b12d25 100644 --- a/doc/pcre2pattern.3 +++ b/doc/pcre2pattern.3 @@ -538,8 +538,8 @@ limited to certain values, as follows: Invalid Unicode code points are all those in the range 0xd800 to 0xdfff (the so-called "surrogate" code points). The check for these can be disabled by the caller of \fBpcre2_compile()\fP by setting the option -PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES. However, this is possible only in UTF-8 -and UTF-32 modes, because these values are not representable in UTF-16. +PCRE2_EXTRA_ALLOW_SURROGATES. However, this is possible only in UTF-8 and +UTF-32 modes, because these values are not representable in UTF-16. . . .SS "Escape sequences in character classes" @@ -1436,8 +1436,8 @@ inclusive. They can also be used for code points specified numerically, for example [\e000-\e037]. Ranges can include any characters that are valid for the current mode. 
In any UTF mode, the so-called "surrogate" characters (those whose code points lie between 0xd800 and 0xdfff inclusive) may not be specified -explicitly by default (the PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES option disables -this check). However, ranges such as [\ex{d7ff}-\ex{e000}], which include the +explicitly by default (the PCRE2_EXTRA_ALLOW_SURROGATES option disables the +check). However, ranges such as [\ex{d7ff}-\ex{e000}], which include the surrogates, are always permitted. .P There is a special case in EBCDIC environments for ranges whose end points are diff --git a/doc/pcre2test.1 b/doc/pcre2test.1 index 616f148f5..408240ba1 100644 --- a/doc/pcre2test.1 +++ b/doc/pcre2test.1 @@ -577,7 +577,7 @@ for a description of the effects of these options. .sp allow_empty_class set PCRE2_ALLOW_EMPTY_CLASS allow_lookaround_bsk set PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK - allow_surrogate_escapes set PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES + allow_surrogates set PCRE2_EXTRA_ALLOW_SURROGATES alt_bsux set PCRE2_ALT_BSUX alt_circumflex set PCRE2_ALT_CIRCUMFLEX alt_verbnames set PCRE2_ALT_VERBNAMES diff --git a/doc/pcre2unicode.3 b/doc/pcre2unicode.3 index 1ed8aaf58..8422395b8 100644 --- a/doc/pcre2unicode.3 +++ b/doc/pcre2unicode.3 @@ -309,7 +309,7 @@ UTF-32.) Setting PCRE2_NO_UTF_CHECK at compile time does not disable the error that is given if an escape sequence for an invalid Unicode code point is encountered in the pattern. If you want to allow escape sequences such as \ex{d800} (a -surrogate code point) you can set the PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES extra +surrogate code point) you can set the PCRE2_EXTRA_ALLOW_SURROGATES extra option. However, this is possible only in UTF-8 and UTF-32 modes, because these values are not representable in UTF-16. . 
diff --git a/src/pcre2.h.in b/src/pcre2.h.in index cd7fdcf2f..10fca8e38 100644 --- a/src/pcre2.h.in +++ b/src/pcre2.h.in @@ -146,7 +146,7 @@ D is inspected during pcre2_dfa_match() execution /* An additional compile options word is available in the compile context. */ -#define PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES 0x00000001u /* C */ +#define PCRE2_EXTRA_ALLOW_SURROGATES 0x00000001u /* C */ #define PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL 0x00000002u /* C */ #define PCRE2_EXTRA_MATCH_WORD 0x00000004u /* C */ #define PCRE2_EXTRA_MATCH_LINE 0x00000008u /* C */ @@ -160,6 +160,9 @@ D is inspected during pcre2_dfa_match() execution #define PCRE2_EXTRA_ASCII_POSIX 0x00000800u /* C */ #define PCRE2_EXTRA_ASCII_DIGIT 0x00001000u /* C */ +/* Backward compatibility */ +#define PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES PCRE2_EXTRA_ALLOW_SURROGATES + /* These are for pcre2_jit_compile(). */ #define PCRE2_JIT_COMPLETE 0x00000001u /* For full matching */ diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 634360b71..c8ded7553 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -783,7 +783,7 @@ are allowed. 
*/ #define PUBLIC_COMPILE_EXTRA_OPTIONS \ (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS| \ - PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \ + PCRE2_EXTRA_ALLOW_SURROGATES|PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \ PCRE2_EXTRA_ESCAPED_CR_IS_LF|PCRE2_EXTRA_ALT_BSUX| \ PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK|PCRE2_EXTRA_ASCII_BSD| \ PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW|PCRE2_EXTRA_ASCII_POSIX| \ @@ -1691,7 +1691,7 @@ else if (c > 0x10ffffU) *errorcodeptr = ERR77; else if (c >= 0xd800 && c <= 0xdfff && - (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0) + (xoptions & PCRE2_EXTRA_ALLOW_SURROGATES) == 0) *errorcodeptr = ERR73; } else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77; @@ -1886,7 +1886,7 @@ else else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET) { if (utf && c >= 0xd800 && c <= 0xdfff && - (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0) + (xoptions & PCRE2_EXTRA_ALLOW_SURROGATES) == 0) { ptr--; *errorcodeptr = ERR73; @@ -1959,7 +1959,7 @@ else else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET) { if (utf && c >= 0xd800 && c <= 0xdfff && - (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0) + (xoptions & PCRE2_EXTRA_ALLOW_SURROGATES) == 0) { ptr--; *errorcodeptr = ERR73; @@ -10177,23 +10177,29 @@ if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0) /* Check UTF. We have the original options in 'options', with that value as modified by (*UTF) etc in cb->external_options. The extra option -PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the +PCRE2_EXTRA_ALLOW_SURROGATES is not permitted in UTF-16 mode because the surrogate code points cannot be represented in UTF-16. 
*/ utf = (cb.external_options & PCRE2_UTF) != 0; if (utf) { + BOOL strict = TRUE; + +#if PCRE2_CODE_UNIT_WIDTH != 16 + strict = (ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATES) == 0; +#endif + if ((options & PCRE2_NEVER_UTF) != 0) { errorcode = ERR74; goto HAD_EARLY_ERROR; } if ((options & PCRE2_NO_UTF_CHECK) == 0 && - (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0) + (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset, strict)) != 0) goto HAD_ERROR; /* Offset was set by valid_utf() */ #if PCRE2_CODE_UNIT_WIDTH == 16 - if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0) + if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATES) != 0) { errorcode = ERR91; goto HAD_EARLY_ERROR; diff --git a/src/pcre2_convert.c b/src/pcre2_convert.c index 36466e4b9..5fcfd5c7b 100644 --- a/src/pcre2_convert.c +++ b/src/pcre2_convert.c @@ -1090,7 +1090,7 @@ if (utf) if (utf && (options & PCRE2_CONVERT_NO_UTF_CHECK) == 0) { PCRE2_SIZE erroroffset; - rc = PRIV(valid_utf)(pattern, plength, &erroroffset); + rc = PRIV(valid_utf)(pattern, plength, &erroroffset, TRUE); if (rc != 0) { *bufflenptr = erroroffset; diff --git a/src/pcre2_dfa_match.c b/src/pcre2_dfa_match.c index fa76b32bc..0917fadae 100644 --- a/src/pcre2_dfa_match.c +++ b/src/pcre2_dfa_match.c @@ -3575,7 +3575,8 @@ if (utf && (options & PCRE2_NO_UTF_CHECK) == 0) offset to be an absolute offset in the whole string. 
*/ match_data->rc = PRIV(valid_utf)(check_subject, - length - (PCRE2_SIZE)(check_subject - subject), &(match_data->startchar)); + length - (PCRE2_SIZE)(check_subject - subject), + &(match_data->startchar), TRUE); if (match_data->rc != 0) { match_data->startchar += (PCRE2_SIZE)(check_subject - subject); diff --git a/src/pcre2_error.c b/src/pcre2_error.c index 09904c03e..5ec2074d5 100644 --- a/src/pcre2_error.c +++ b/src/pcre2_error.c @@ -177,7 +177,7 @@ static const unsigned char compile_error_texts[] = "internal error: unknown code in parsed pattern\0" /* 90 */ "internal error: bad code value in parsed_skip()\0" - "PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowed in UTF-16 mode\0" + "PCRE2_EXTRA_ALLOW_SURROGATES is not allowed in UTF-16 mode\0" "invalid option bits with PCRE2_LITERAL\0" "\\N{U+dddd} is supported only in Unicode (UTF) mode\0" "invalid hyphen in option setting\0" diff --git a/src/pcre2_internal.h b/src/pcre2_internal.h index 5e7796a9d..3c59896e5 100644 --- a/src/pcre2_internal.h +++ b/src/pcre2_internal.h @@ -206,7 +206,7 @@ Unicode doesn't go beyond 0x0010ffff. */ /* This is the largest valid UTF/Unicode code point. */ -#define MAX_UTF_CODE_POINT 0x10ffff +#define MAX_UTF_CODE_POINT 0x10ffffu /* Compile-time positive error numbers (all except UTF errors, which are negative) start at this value. 
It should probably never be changed, in case @@ -2036,7 +2036,8 @@ extern PCRE2_SIZE _pcre2_strlen(PCRE2_SPTR); extern int _pcre2_strncmp(PCRE2_SPTR, PCRE2_SPTR, size_t); extern int _pcre2_strncmp_c8(PCRE2_SPTR, const char *, size_t); extern int _pcre2_study(pcre2_real_code *); -extern int _pcre2_valid_utf(PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE *); +extern int _pcre2_valid_utf(PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE *, + BOOL); extern BOOL _pcre2_was_newline(PCRE2_SPTR, uint32_t, PCRE2_SPTR, uint32_t *, BOOL); extern BOOL _pcre2_xclass(uint32_t, PCRE2_SPTR, BOOL); diff --git a/src/pcre2_intmodedep.h b/src/pcre2_intmodedep.h index 007e4c013..9860eb937 100644 --- a/src/pcre2_intmodedep.h +++ b/src/pcre2_intmodedep.h @@ -471,7 +471,7 @@ code. */ /* These are trivial for the 32-bit library, since all UTF-32 characters fit into one PCRE2_UCHAR unit. */ -#define MAX_UTF_SINGLE_CU (0x10ffffu) +#define MAX_UTF_SINGLE_CU 0x10ffffu #define HAS_EXTRALEN(c) (0) #define GET_EXTRALEN(c) (0) #define NOT_FIRSTCU(c) (0) diff --git a/src/pcre2_match.c b/src/pcre2_match.c index ea98af3c3..4946cbebb 100644 --- a/src/pcre2_match.c +++ b/src/pcre2_match.c @@ -6551,7 +6551,7 @@ if (use_jit) invalid code point to be an absolute offset in the whole string. */ match_data->rc = PRIV(valid_utf)(start_match, - length - (start_match - subject), &(match_data->startchar)); + length - (start_match - subject), &(match_data->startchar), TRUE); if (match_data->rc != 0) { match_data->startchar += start_match - subject; @@ -6598,14 +6598,15 @@ If we get here in those circumstances, it means the subject string is valid, but for some reason JIT matching was not successful. There is no need to check the subject again. -We check only the portion of the subject that might be be inspected during +We check only the portion of the subject that might be inspected during matching - from the offset minus the maximum lookbehind to the given length. 
This saves time when a small part of a large subject is being matched by the use of a starting offset. Note that the maximum lookbehind is a number of characters, not code units. Note also that support for invalid UTF forces a check, overriding the setting -of PCRE2_NO_CHECK_UTF. */ +of PCRE2_NO_UTF_CHECK, so valid_utf() has to be told not to error if a +surrogate is found and the PCRE2_EXTRA_ALLOW_SURROGATES setting is used. */ #ifdef SUPPORT_UNICODE if (utf && @@ -6685,8 +6686,11 @@ if (utf && for (;;) { + BOOL strict = (re->extra_options & PCRE2_EXTRA_ALLOW_SURROGATES) == 0; + match_data->rc = PRIV(valid_utf)(mb->check_subject, - length - (mb->check_subject - subject), &(match_data->startchar)); + length - (mb->check_subject - subject), + &(match_data->startchar), strict); if (match_data->rc == 0) break; /* Valid UTF string */ @@ -7461,7 +7465,7 @@ if (utf && end_subject != true_end_subject && mb->check_subject = start_match; rc = PRIV(valid_utf)(start_match, length - (start_match - subject), - &(match_data->startchar)); + &(match_data->startchar), TRUE); /* The rest of the subject is valid UTF. */ diff --git a/src/pcre2_substitute.c b/src/pcre2_substitute.c index edbb78c6d..ecd2f45f5 100644 --- a/src/pcre2_substitute.c +++ b/src/pcre2_substitute.c @@ -345,7 +345,7 @@ if (length == PCRE2_ZERO_TERMINATED) #ifdef SUPPORT_UNICODE if (utf && (options & PCRE2_NO_UTF_CHECK) == 0) { - rc = PRIV(valid_utf)(replacement, rlength, &(match_data->startchar)); + rc = PRIV(valid_utf)(replacement, rlength, &(match_data->startchar), TRUE); if (rc != 0) { match_data->leftchar = 0; diff --git a/src/pcre2_valid_utf.c b/src/pcre2_valid_utf.c index de411b919..4262f8286 100644 --- a/src/pcre2_valid_utf.c +++ b/src/pcre2_valid_utf.c @@ -60,11 +60,13 @@ with the library. In this case, PCRE2_PCRE2TEST is defined. */ /* This function should never be called when Unicode is not supported. 
*/ int -PRIV(valid_utf)(PCRE2_SPTR string, PCRE2_SIZE length, PCRE2_SIZE *erroroffset) +PRIV(valid_utf)(PCRE2_SPTR string, PCRE2_SIZE length, + PCRE2_SIZE *erroroffset, BOOL strict) { (void)string; (void)length; (void)erroroffset; +(void)strict; return 0; } #else /* UTF is supported */ @@ -85,21 +87,23 @@ invalid string are then undefined. string points to the string length length of string errp pointer to an error position offset variable + strict enforce strict checks Returns: == 0 if the string is a valid UTF string != 0 otherwise, setting the offset of the bad character */ int -PRIV(valid_utf)(PCRE2_SPTR string, PCRE2_SIZE length, PCRE2_SIZE *erroroffset) +PRIV(valid_utf)(PCRE2_SPTR string, PCRE2_SIZE length, + PCRE2_SIZE *erroroffset, BOOL strict) { PCRE2_SPTR p; uint32_t c; -/* ----------------- Check a UTF-8 string ----------------- */ - #if PCRE2_CODE_UNIT_WIDTH == 8 +/* ----------------- Check a UTF-8 string ----------------- */ + /* Originally, this function checked according to RFC 2279, allowing for values in the range 0 to 0x7fffffff, up to 6 bytes long, but ensuring that they were in the canonical format. Once somebody had pointed out RFC 3629 to me (it @@ -138,7 +142,7 @@ for (p = string; length > 0; p++) c = *p; length--; - if (c < 128) continue; /* ASCII character */ + if (c <= 0x7f) continue; /* ASCII character */ if (c < 0xc0) /* Isolated 10xx xxxx byte */ { @@ -206,7 +210,7 @@ for (p = string; length > 0; p++) *erroroffset = (PCRE2_SIZE)(p - string) - 2; return PCRE2_ERROR_UTF8_ERR16; } - if (c == 0xed && d >= 0xa0) + if ((c == 0xed && d >= 0xa0) && strict) { *erroroffset = (PCRE2_SIZE)(p - string) - 2; return PCRE2_ERROR_UTF8_ERR14; @@ -303,7 +307,7 @@ for (p = string; length > 0; p++) break; } - /* Character is valid under RFC 2279, but 4-byte and 5-byte characters are + /* Character is valid under RFC 2279, but 5-byte and 6-byte characters are excluded by RFC 3629. The pointer p is currently at the last byte of the character. 
*/ @@ -313,12 +317,11 @@ for (p = string; length > 0; p++) return (ab == 4)? PCRE2_ERROR_UTF8_ERR11 : PCRE2_ERROR_UTF8_ERR12; } } -return 0; +#elif PCRE2_CODE_UNIT_WIDTH == 16 /* ----------------- Check a UTF-16 string ----------------- */ - -#elif PCRE2_CODE_UNIT_WIDTH == 16 +(void)strict; /* There's not so much work, nor so many errors, for UTF-16. PCRE2_ERROR_UTF16_ERR1 Missing low surrogate at the end of the string @@ -358,14 +361,11 @@ for (p = string; length > 0; p++) return PCRE2_ERROR_UTF16_ERR3; } } -return 0; - +#else /* ----------------- Check a UTF-32 string ----------------- */ -#else - /* There is very little to do for a UTF-32 string. PCRE2_ERROR_UTF32_ERR1 Surrogate character PCRE2_ERROR_UTF32_ERR2 Character > 0x10ffff @@ -377,21 +377,21 @@ for (p = string; length > 0; length--, p++) if ((c & 0xfffff800u) != 0xd800u) { /* Normal UTF-32 code point. Neither high nor low surrogate. */ - if (c > 0x10ffffu) + if (c > MAX_UTF_CODE_POINT) { *erroroffset = (PCRE2_SIZE)(p - string); return PCRE2_ERROR_UTF32_ERR2; } } - else + else if (strict) { /* A surrogate */ *erroroffset = (PCRE2_SIZE)(p - string); return PCRE2_ERROR_UTF32_ERR1; } } -return 0; #endif /* CODE_UNIT_WIDTH */ +return 0; } #endif /* SUPPORT_UNICODE */ diff --git a/src/pcre2test.c b/src/pcre2test.c index 21b193701..be3c46705 100644 --- a/src/pcre2test.c +++ b/src/pcre2test.c @@ -343,7 +343,7 @@ widths are actually available, because the input to pcre2test is always in 8-bit code units. So we include the UTF validity checking function for 8-bit code units. 
*/ -extern int valid_utf(PCRE2_SPTR8, PCRE2_SIZE, PCRE2_SIZE *); +extern int valid_utf(PCRE2_SPTR8, PCRE2_SIZE, PCRE2_SIZE *, BOOL); #define PCRE2_CODE_UNIT_WIDTH 8 #undef PCRE2_SPTR @@ -639,7 +639,7 @@ static modstruct modlist[] = { { "allcaptures", MOD_PND, MOD_CTL, CTL_ALLCAPTURES, PO(control) }, { "allow_empty_class", MOD_PAT, MOD_OPT, PCRE2_ALLOW_EMPTY_CLASS, PO(options) }, { "allow_lookaround_bsk", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK, CO(extra_options) }, - { "allow_surrogate_escapes", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES, CO(extra_options) }, + { "allow_surrogates", MOD_CTC, MOD_OPT, PCRE2_EXTRA_ALLOW_SURROGATES, CO(extra_options) }, { "allusedtext", MOD_PNDP, MOD_CTL, CTL_ALLUSEDTEXT, PO(control) }, { "allvector", MOD_PND, MOD_CTL, CTL2_ALLVECTOR, PO(control2) }, { "alt_bsux", MOD_PAT, MOD_OPT, PCRE2_ALT_BSUX, PO(options) }, @@ -4297,7 +4297,7 @@ show_compile_extra_options(uint32_t options, const char *before, if (options == 0) fprintf(outfile, "%s %s", before, after); else fprintf(outfile, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s", before, - ((options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)? " allow_surrogate_escapes" : "", + ((options & PCRE2_EXTRA_ALLOW_SURROGATES) != 0)? " allow_surrogates" : "", ((options & PCRE2_EXTRA_ALT_BSUX) != 0)? " alt_bsux" : "", ((options & PCRE2_EXTRA_ASCII_BSD) != 0)? " ascii_bsd" : "", ((options & PCRE2_EXTRA_ASCII_BSS) != 0)? " ascii_bss" : "", @@ -7548,7 +7548,7 @@ if (dat_datctl.replacement[0] != 0) is detected. Otherwise, UTF-8 can be used to include wide characters in a replacement. */ - if (utf) badutf = valid_utf(pr, strlen((const char *)pr), &erroroffset); + if (utf) badutf = valid_utf(pr, strlen((const char *)pr), &erroroffset, TRUE); /* Not UTF or invalid UTF-8: just copy the code units. 
*/ diff --git a/testdata/testinput10 b/testdata/testinput10 index 53e37cbca..9ede6a110 100644 --- a/testdata/testinput10 +++ b/testdata/testinput10 @@ -294,6 +294,16 @@ \= Expect no match a\x{123}aa\=offset=5 +/a+/match_invalid_utf + a\x{123}aa\=offset=1 + a\x{123}aa\=offset=2 + a\x{123}aa\=offset=3 + a\x{123}aa\=offset=4 +\= Expect bad offset value + a\x{123}aa\=offset=6 +\= Expect no match + a\x{123}aa\=offset=5 + /\x{1234}+/Ii,utf /\x{1234}+?/Ii,utf @@ -463,12 +473,18 @@ # A special extra option allows excaped surrogate code points in 8-bit mode, # but subjects containing them must not be UTF-checked. -/\x{d800}/I,utf,allow_surrogate_escapes +/\x{d800}/I,utf,allow_surrogates + \x{d800}\=no_utf_check + +/\x{d800}/I,match_invalid_utf,allow_surrogates \x{d800}\=no_utf_check -/\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes +/\udfff\o{157401}/utf,alt_bsux,allow_surrogates \x{dfff}\x{df01}\=no_utf_check - + +/\udfff\o{157401}/match_invalid_utf,alt_bsux,allow_surrogates + \x{dfff}\x{df01}\=no_utf_check + # This has different starting code units in 8-bit mode. /^[^ab]/IB,utf diff --git a/testdata/testinput12 b/testdata/testinput12 index 9b4f8d343..ab48c672c 100644 --- a/testdata/testinput12 +++ b/testdata/testinput12 @@ -367,10 +367,10 @@ # but subjects containing them must not be UTF-checked. These patterns give # errors in 16-bit mode. -/\x{d800}/I,utf,allow_surrogate_escapes +/\x{d800}/I,utf,allow_surrogates \x{d800}\=no_utf_check -/\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes +/\udfff\o{157401}/utf,alt_bsux,allow_surrogates \x{dfff}\x{df01}\=no_utf_check # This has different starting code units in 8-bit mode. 
@@ -394,7 +394,7 @@ \x{30a1}\x{3041}\x{3007}\x{3007} Katakana Hiragana Han Han \x{1100}\x{2e80}\x{2e80}\x{1101} Hangul Han Han Hangul -/^(*sr:.*)/utf,allow_surrogate_escapes +/^(*sr:.*)/utf,allow_surrogates \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana \x{d800}\x{dfff} Surrogates (Unknown) \=no_utf_check diff --git a/testdata/testinput18 b/testdata/testinput18 index de3645d84..47b0d3fbc 100644 --- a/testdata/testinput18 +++ b/testdata/testinput18 @@ -17,7 +17,7 @@ /a(())bc/parens_nest_limit=1 -/abc/allow_surrogate_escapes,max_pattern_length=2 +/abc/allow_surrogates,max_pattern_length=2 # Real tests diff --git a/testdata/testinput5 b/testdata/testinput5 index 0624a0c30..b65f05a61 100644 --- a/testdata/testinput5 +++ b/testdata/testinput5 @@ -912,6 +912,16 @@ \= Expect no match \x{09f} +/^\p{Cs}/match_invalid_utf +\= Expect no match + \x{dfff}\=no_utf_check + \x{09f} + +/^\p{Cs}/match_invalid_utf,allow_surrogates + \x{dfff}\=no_utf_check +\= Expect no match + \x{09f} + /^\p{Mn}/utf \x{1a1b} diff --git a/testdata/testoutput10 b/testdata/testoutput10 index d40851061..af5744ff3 100644 --- a/testdata/testoutput10 +++ b/testdata/testoutput10 @@ -885,6 +885,22 @@ Error -36 (bad UTF-8 offset) a\x{123}aa\=offset=5 No match +/a+/match_invalid_utf + a\x{123}aa\=offset=1 + 0: aa + a\x{123}aa\=offset=2 + 0: aa + a\x{123}aa\=offset=3 + 0: aa + a\x{123}aa\=offset=4 + 0: a +\= Expect bad offset value + a\x{123}aa\=offset=6 +Failed: error -33: bad offset value +\= Expect no match + a\x{123}aa\=offset=5 +No match + /\x{1234}+/Ii,utf Capture group count = 0 Options: caseless utf @@ -1553,20 +1569,34 @@ Failed: error 176 at offset 259: name is too long in (*MARK), (*PRUNE), (*SKIP), # A special extra option allows excaped surrogate code points in 8-bit mode, # but subjects containing them must not be UTF-checked. 
-/\x{d800}/I,utf,allow_surrogate_escapes +/\x{d800}/I,utf,allow_surrogates Capture group count = 0 Options: utf -Extra options: allow_surrogate_escapes +Extra options: allow_surrogates First code unit = \xed Last code unit = \x80 Subject length lower bound = 1 \x{d800}\=no_utf_check 0: \x{d800} -/\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes +/\x{d800}/I,match_invalid_utf,allow_surrogates +Capture group count = 0 +Options: match_invalid_utf utf +Extra options: allow_surrogates +First code unit = \xed +Last code unit = \x80 +Subject length lower bound = 1 + \x{d800}\=no_utf_check + 0: \x{d800} + +/\udfff\o{157401}/utf,alt_bsux,allow_surrogates \x{dfff}\x{df01}\=no_utf_check 0: \x{dfff}\x{df01} - + +/\udfff\o{157401}/match_invalid_utf,alt_bsux,allow_surrogates + \x{dfff}\x{df01}\=no_utf_check + 0: \x{dfff}\x{df01} + # This has different starting code units in 8-bit mode. /^[^ab]/IB,utf diff --git a/testdata/testoutput12-16 b/testdata/testoutput12-16 index 84c485817..c64695981 100644 --- a/testdata/testoutput12-16 +++ b/testdata/testoutput12-16 @@ -1425,12 +1425,12 @@ No match # but subjects containing them must not be UTF-checked. These patterns give # errors in 16-bit mode. -/\x{d800}/I,utf,allow_surrogate_escapes -Failed: error 191 at offset 0: PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowed in UTF-16 mode +/\x{d800}/I,utf,allow_surrogates +Failed: error 191 at offset 0: PCRE2_EXTRA_ALLOW_SURROGATES is not allowed in UTF-16 mode \x{d800}\=no_utf_check -/\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes -Failed: error 191 at offset 0: PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowed in UTF-16 mode +/\udfff\o{157401}/utf,alt_bsux,allow_surrogates +Failed: error 191 at offset 0: PCRE2_EXTRA_ALLOW_SURROGATES is not allowed in UTF-16 mode \x{dfff}\x{df01}\=no_utf_check # This has different starting code units in 8-bit mode. 
@@ -1491,8 +1491,8 @@ No match \x{1100}\x{2e80}\x{2e80}\x{1101} Hangul Han Han Hangul 0: \x{1100}\x{2e80}\x{2e80}\x{1101} -/^(*sr:.*)/utf,allow_surrogate_escapes -Failed: error 191 at offset 0: PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowed in UTF-16 mode +/^(*sr:.*)/utf,allow_surrogates +Failed: error 191 at offset 0: PCRE2_EXTRA_ALLOW_SURROGATES is not allowed in UTF-16 mode \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana \x{d800}\x{dfff} Surrogates (Unknown) \=no_utf_check diff --git a/testdata/testoutput12-32 b/testdata/testoutput12-32 index 03b6e3940..163ed6e61 100644 --- a/testdata/testoutput12-32 +++ b/testdata/testoutput12-32 @@ -1417,16 +1417,16 @@ No match # but subjects containing them must not be UTF-checked. These patterns give # errors in 16-bit mode. -/\x{d800}/I,utf,allow_surrogate_escapes +/\x{d800}/I,utf,allow_surrogates Capture group count = 0 Options: utf -Extra options: allow_surrogate_escapes +Extra options: allow_surrogates First code unit = \x{d800} Subject length lower bound = 1 \x{d800}\=no_utf_check 0: \x{d800} -/\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes +/\udfff\o{157401}/utf,alt_bsux,allow_surrogates \x{dfff}\x{df01}\=no_utf_check 0: \x{dfff}\x{df01} @@ -1488,7 +1488,7 @@ No match \x{1100}\x{2e80}\x{2e80}\x{1101} Hangul Han Han Hangul 0: \x{1100}\x{2e80}\x{2e80}\x{1101} -/^(*sr:.*)/utf,allow_surrogate_escapes +/^(*sr:.*)/utf,allow_surrogates \x{2e80}\x{3105}\x{2e80}\x{30a1} Han Bopomofo Han Katakana 0: \x{2e80}\x{3105}\x{2e80} \x{d800}\x{dfff} Surrogates (Unknown) \=no_utf_check diff --git a/testdata/testoutput18 b/testdata/testoutput18 index 6d64cdf5a..1b22e98ec 100644 --- a/testdata/testoutput18 +++ b/testdata/testoutput18 @@ -23,8 +23,8 @@ /a(())bc/parens_nest_limit=1 ** Ignored with POSIX interface: parens_nest_limit -/abc/allow_surrogate_escapes,max_pattern_length=2 -** Ignored with POSIX interface: allow_surrogate_escapes max_pattern_length +/abc/allow_surrogates,max_pattern_length=2 +** Ignored with 
POSIX interface: allow_surrogates max_pattern_length # Real tests diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 7069b653b..339848604 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -17764,5 +17764,5 @@ Error -1: no match Error 0: PCRE2_ERROR_BADDATA (unknown error number) Error 100: no error Error 101: \ at end of pattern -Error 191: PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not allowed in UTF-16 mode +Error 191: PCRE2_EXTRA_ALLOW_SURROGATES is not allowed in UTF-16 mode Error 200: PCRE2_ERROR_BADDATA (unknown error number) diff --git a/testdata/testoutput5 b/testdata/testoutput5 index febcc954d..b2380f7d1 100644 --- a/testdata/testoutput5 +++ b/testdata/testoutput5 @@ -1972,6 +1972,20 @@ No match \x{09f} No match +/^\p{Cs}/match_invalid_utf +\= Expect no match + \x{dfff}\=no_utf_check +No match + \x{09f} +No match + +/^\p{Cs}/match_invalid_utf,allow_surrogates + \x{dfff}\=no_utf_check + 0: \x{dfff} +\= Expect no match + \x{09f} +No match + /^\p{Mn}/utf \x{1a1b} 0: \x{1a1b}