From be654c54d3e206033eee0a6f078b47a220a60fcd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlo=20Marcelo=20Arenas=20Bel=C3=B3n?= Date: Wed, 13 Nov 2024 05:25:19 -0800 Subject: [PATCH] WIP: slightly improve substitutions Avoid at least one crash introduced with recent changes to substitute code as well as clarify what the expected offset value should be when overflowing the provided buffer. While at it, make sure that the returned string is always NUL terminated, and do some minor cleanup. --- doc/pcre2api.3 | 9 ++-- doc/pcre2test.1 | 2 +- src/pcre2_compile.c | 97 ++++++++++++++++++++---------------------- src/pcre2_substitute.c | 28 +++++++----- src/pcre2test.c | 7 +++ testdata/testinput2 | 35 ++++++++++++++- testdata/testoutput2 | 60 +++++++++++++++++++++++++- 7 files changed, 170 insertions(+), 68 deletions(-) diff --git a/doc/pcre2api.3 b/doc/pcre2api.3 index 8eb0b68dc..2f5c70528 100644 --- a/doc/pcre2api.3 +++ b/doc/pcre2api.3 @@ -3815,9 +3815,10 @@ PCRE2_SUBSTITUTE_OVERFLOW_LENGTH changes what happens when the output buffer is too small. The default action is to return PCRE2_ERROR_NOMEMORY immediately. If this option is set, however, \fBpcre2_substitute()\fP continues to go through the motions of matching and substituting (without, of course, writing anything) -in order to compute the size of buffer that is needed. This value is passed -back via the \fIoutlengthptr\fP variable, with the result of the function still -being PCRE2_ERROR_NOMEMORY. +in order to compute the size of buffer that is needed, which will include the +extra space for the terminating NUL. This value is passed back via the +\fIoutlengthptr\fP variable, with the result of the function still being +PCRE2_ERROR_NOMEMORY. .P Passing a buffer size of zero is a permitted way of finding out how much memory is needed for given substitution. However, this does mean that the entire @@ -3938,7 +3939,7 @@ that can be applied to group captures. For example, if group 1 has captured .P If either PCRE2_UTF or PCRE2_UCP was set when the pattern was compiled, Unicode properties are used for case forcing characters whose code points are greater -than 127. However, only basic case folding, as determined by the Unicode file +than 127. However, only simple case folding, as determined by the Unicode file \fBCaseFolding.txt\fP is supported. PCRE2 does not support language-specific special casing rules such as using different lower case Greek sigmas in the middle and ends of words (as defined in the Unicode file diff --git a/doc/pcre2test.1 b/doc/pcre2test.1 index 73845f65b..db385c865 100644 --- a/doc/pcre2test.1 +++ b/doc/pcre2test.1 @@ -1225,7 +1225,7 @@ command are of two types. The following modifiers set options for \fBpcre2_match()\fP or \fBpcre2_dfa_match()\fP. See .\" HREF -\fBpcreapi\fP +\fBpcre2api\fP .\" for a description of their effects. .sp diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index caf34a66f..c439b495b 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -826,12 +826,12 @@ for (;;) return; case META_CAPTURE: - fprintf(stderr, "META_CAPTURE %d", meta_arg); + fprintf(stderr, "META_CAPTURE %u", meta_arg); break; case META_RECURSE: GETOFFSET(offset, pptr); - fprintf(stderr, "META_RECURSE %d %zd", meta_arg, offset); + fprintf(stderr, "META_RECURSE %u %zu", meta_arg, offset); break; case META_BACKREF: @@ -839,7 +839,7 @@ for (;;) offset = cb->small_ref_offset[meta_arg]; else GETOFFSET(offset, pptr); - fprintf(stderr, "META_BACKREF %d %zd", meta_arg, offset); + fprintf(stderr, "META_BACKREF %u %zu", meta_arg, offset); break; case META_ESCAPE: @@ -847,7 +847,7 @@ for (;;) { uint32_t ptype = *pptr >> 16; uint32_t pvalue = *pptr++ & 0xffff; - fprintf(stderr, "META \\%c %d %d", (meta_arg == ESC_P)? CHAR_P:CHAR_p, + fprintf(stderr, "META \\%c %u %u", (meta_arg == ESC_P)? CHAR_P:CHAR_p, ptype, pvalue); } else @@ -914,7 +914,7 @@ for (;;) case META_LOOKAHEAD_NA: fprintf(stderr, "META (*napla:"); break; case META_SCRIPT_RUN: fprintf(stderr, "META (*sr:"); break; case META_KET: fprintf(stderr, "META )"); break; - case META_ALT: fprintf(stderr, "META | %d", meta_arg); break; + case META_ALT: fprintf(stderr, "META | %u", meta_arg); break; case META_CLASS: fprintf(stderr, "META ["); break; case META_CLASS_NOT: fprintf(stderr, "META [^"); break; @@ -925,8 +925,8 @@ for (;;) case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break; case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break; - case META_POSIX: fprintf(stderr, "META_POSIX %d", *pptr++); break; - case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %d", *pptr++); break; + case META_POSIX: fprintf(stderr, "META_POSIX %u", *pptr++); break; + case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %u", *pptr++); break; case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break; case META_FAIL: fprintf(stderr, "META (*FAIL)"); break; @@ -941,22 +941,22 @@ for (;;) break; case META_LOOKBEHIND: - fprintf(stderr, "META (?<= %d %d", meta_arg, *pptr); + fprintf(stderr, "META (?<= %u %u", meta_arg, *pptr); pptr += 2; break; case META_LOOKBEHIND_NA: - fprintf(stderr, "META (*naplb: %d %d", meta_arg, *pptr); + fprintf(stderr, "META (*naplb: %u %u", meta_arg, *pptr); pptr += 2; break; case META_LOOKBEHINDNOT: - fprintf(stderr, "META (?="); - fprintf(stderr, "%d.", *pptr++); - fprintf(stderr, "%d)", *pptr++); + fprintf(stderr, "%u.", *pptr++); + fprintf(stderr, "%u)", *pptr++); break; case META_COND_NAME: - fprintf(stderr, "META (?() length=%d offset=", *pptr++); + fprintf(stderr, "META (?() length=%u offset=", *pptr++); GETOFFSET(offset, pptr); - fprintf(stderr, "%zd", offset); + fprintf(stderr, "%zu", offset); break; case META_COND_RNAME: - fprintf(stderr, "META (?(R&name) length=%d offset=", *pptr++); + fprintf(stderr, "META (?(R&name) length=%u offset=", *pptr++); GETOFFSET(offset, pptr); - fprintf(stderr, "%zd", offset); + fprintf(stderr, "%zu", offset); break; /* This is kept as a name, because it might be. */ case META_COND_RNUMBER: - fprintf(stderr, "META (?(Rnumber) length=%d offset=", *pptr++); + fprintf(stderr, "META (?(Rnumber) length=%u offset=", *pptr++); GETOFFSET(offset, pptr); - fprintf(stderr, "%zd", offset); + fprintf(stderr, "%zu", offset); break; case META_SCS_NAME: - fprintf(stderr, "META (*scan_substring:() length=%d offset=", *pptr++); + fprintf(stderr, "META (*scan_substring:() length=%u offset=", *pptr++); GETOFFSET(offset, pptr); - fprintf(stderr, "%zd", offset); + fprintf(stderr, "%zu", offset); break; case META_SCS_NUMBER: - fprintf(stderr, "META_SCS_NUMBER %d offset=", pptr[SIZEOFFSET]); + fprintf(stderr, "META_SCS_NUMBER %u offset=", pptr[SIZEOFFSET]); GETOFFSET(offset, pptr); - fprintf(stderr, "%zd", offset); + fprintf(stderr, "%zu", offset); pptr++; break; case META_SCS_NEXT_NAME: - fprintf(stderr, "META_SCS_NEXT_NAME length=%d offset=", *pptr++); + fprintf(stderr, "META_SCS_NEXT_NAME length=%u offset=", *pptr++); GETOFFSET(offset, pptr); - fprintf(stderr, "%zd", offset); + fprintf(stderr, "%zu", offset); break; case META_SCS_NEXT_NUMBER: - fprintf(stderr, "META_SCS_NEXT_NUMBER %d offset=", pptr[SIZEOFFSET]); + fprintf(stderr, "META_SCS_NEXT_NUMBER %u offset=", pptr[SIZEOFFSET]); GETOFFSET(offset, pptr); - fprintf(stderr, "%zd", offset); + fprintf(stderr, "%zu", offset); pptr++; break; @@ -1099,7 +1099,6 @@ associated JIT data. */ PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION pcre2_code_copy(const pcre2_code *code) { -PCRE2_SIZE *ref_count; pcre2_code *newcode; if (code == NULL) return NULL; @@ -1113,7 +1112,7 @@ in the decoded tables. */ if ((code->flags & PCRE2_DEREF_TABLES) != 0) { - ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH); + PCRE2_SIZE *ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH); (*ref_count)++; } @@ -2597,9 +2596,7 @@ won't be recognized. */ } while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0) - { ptr++; - } } /* Check name length */ @@ -5479,7 +5476,7 @@ static BOOL find_dupname_details(PCRE2_SPTR name, uint32_t length, int *indexptr, int *countptr, int *errorcodeptr, compile_block *cb) { -uint32_t i, groupnumber; +uint32_t i; int count; PCRE2_UCHAR *slot = cb->name_table; @@ -5511,8 +5508,9 @@ count = 0; for (;;) { + uint32_t groupnumber = GET2(slot,0); + count++; - groupnumber = GET2(slot,0); cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1; if (groupnumber > cb->top_backref) cb->top_backref = groupnumber; if (++i >= cb->names_found) break; @@ -6800,7 +6798,7 @@ for (;; pptr++) #ifdef MAYBE_UTF_MULTI if (utf && NOT_FIRSTCU(code[-1])) { - PCRE2_UCHAR *lastchar = code - 1; + PCRE2_SPTR lastchar = code - 1; BACKCHAR(lastchar); mclength = (uint32_t)(code - lastchar); /* Length of UTF character */ memcpy(mcbuffer, lastchar, CU2BYTES(mclength)); /* Save the char */ @@ -7940,7 +7938,6 @@ PCRE2_UCHAR *last_branch = code; PCRE2_UCHAR *start_bracket = code; BOOL lookbehind; open_capitem capitem; -int capnumber = 0; int okreturn = 1; uint32_t *pptr = *pptrptr; uint32_t firstcu, reqcu; @@ -7998,7 +7995,7 @@ OP_SCBRAPOS, happens later, after the group has been compiled. */ if (*code == OP_CBRA) { - capnumber = GET2(code, 1 + LINK_SIZE); + int capnumber = GET2(code, 1 + LINK_SIZE); capitem.number = capnumber; capitem.next = open_caps; capitem.assert_depth = cb->assert_depth; @@ -10212,11 +10209,9 @@ show_parsed(&cb); #ifdef DEBUG_SHOW_CAPTURES { named_group *ng = cb.named_groups; - fprintf(stderr, "+++Captures: %d\n", cb.bracount); + fprintf(stderr, "+++Captures: %u\n", cb.bracount); for (i = 0; i < cb.names_found; i++, ng++) - { - fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name); - } + fprintf(stderr, "+++%3u %.*s\n", ng->number, ng->length, ng->name); } #endif @@ -10415,11 +10410,11 @@ if (errorcode == 0 && cb.had_recurse) rcode != NULL; rcode = find_recurse(rcode + 1 + LINK_SIZE, utf)) { - int p, groupnumber; + int groupnumber = (int)GET(rcode, 1); - groupnumber = (int)GET(rcode, 1); if (groupnumber == 0) rgroup = codestart; else { + int p; PCRE2_SPTR search_from = codestart; rgroup = NULL; for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7) diff --git a/src/pcre2_substitute.c b/src/pcre2_substitute.c index fa9fe4450..9125d40f8 100644 --- a/src/pcre2_substitute.c +++ b/src/pcre2_substitute.c @@ -289,13 +289,14 @@ Returns: >= 0 number of substitutions made overflow, either give an error immediately, or keep on, accumulating the length. */ -#define CHECKMEMCPY(from,length) \ +#define CHECKMEMCPY(from,length) do \ { \ if (!overflowed && lengthleft < length) \ { \ if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \ overflowed = TRUE; \ extra_needed = length - lengthleft; \ + lengthleft = 0; \ } \ else if (overflowed) \ { \ @@ -307,7 +308,7 @@ length. */ buff_offset += length; \ lengthleft -= length; \ } \ - } + } while (0) /* Here's the function */ @@ -439,8 +440,7 @@ if (subject == NULL) /* Find length of zero-terminated subject */ -if (length == PCRE2_ZERO_TERMINATED) - length = subject? PRIV(strlen)(subject) : 0; +if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject); /* Check UTF replacement string if necessary. */ @@ -597,9 +597,7 @@ do ptr = replacement; if ((suboptions & PCRE2_SUBSTITUTE_LITERAL) != 0) - { CHECKMEMCPY(ptr, rlength); - } /* Within a non-literal replacement, which must be scanned character by character, local literal mode can be set by \Q, but only in extended mode @@ -942,7 +940,7 @@ do GETCHARINCTEST(ch, subptr); if (forcecase != 0) { - if (mcontext->substitute_case_callout) + if (mcontext != NULL && mcontext->substitute_case_callout != NULL) { ch = mcontext->substitute_case_callout( ch, @@ -1104,7 +1102,7 @@ do LITERAL: if (forcecase != 0) { - if (mcontext->substitute_case_callout) + if (mcontext != NULL && mcontext->substitute_case_callout != NULL) { ch = mcontext->substitute_case_callout( ch, @@ -1194,8 +1192,17 @@ if (!replacement_only) CHECKMEMCPY(subject + start_offset, fraglength); } -temp[0] = 0; -CHECKMEMCPY(temp, 1); +/* Zero terminate the output string, truncating if needed. */ + +if (lengthleft-- > 0) ++buff_offset; +else + { + if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; + overflowed = TRUE; + extra_needed++; + lengthleft = 0; + } +if (!overflowed || lengthleft == 0) buffer[buff_offset] = 0; else extra_needed++; /* If overflowed is set it means the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set, and matching has carried on after a full buffer, in order to compute the length @@ -1222,6 +1229,7 @@ if (internal_match_data != NULL) pcre2_match_data_free(internal_match_data); return rc; NOROOM: +buffer[buff_offset] = 0; rc = PCRE2_ERROR_NOMEMORY; goto EXIT; diff --git a/src/pcre2test.c b/src/pcre2test.c index 7f01cf928..a241aba43 100644 --- a/src/pcre2test.c +++ b/src/pcre2test.c @@ -7500,6 +7500,13 @@ if (pat_patctl.replacement[0] != 0) return PR_OK; } + if (pat_patctl.replacement_case[0] != 0 && + (dat_datctl.control & CTL_NULLCONTEXT) != 0) + { + fprintf(outfile, "** Replacement case callouts are not supported with null_context.\n"); + return PR_OK; + } + if ((dat_datctl.control & CTL_ALLCAPTURES) != 0) fprintf(outfile, "** Ignored with replacement text: allcaptures\n"); } diff --git a/testdata/testinput2 b/testdata/testinput2 index 1fbb778e0..45c8c909f 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -4220,6 +4220,37 @@ apple lemon blackberry /abc/ + 123abc123\=replace=XYZ + 123abc123\=replace=[10]XYZ +\= Expect error + 123abc123\=replace=[9]XYZ + 123abc123\=substitute_overflow_length,replace=[9]XYZ + 123abc123\=substitute_overflow_length,replace=[6]XYZ + 123abc123\=substitute_overflow_length,replace=[1]XYZ + 123abc123\=substitute_overflow_length,replace=[0]XYZ + +/abc/ + 123abc123\=replace=XY + 123abc123\=replace=[9]XY + 123abc123\=replace=[9]XY,substitute_literal +\= Expect error + 123abc123\=replace=[8]XY,substitute_overflow_length + 123abc123\=replace=[8]XY,substitute_overflow_length,substitute_literal + 123abc123\=replace=[6]XY,substitute_overflow_length + 123abc123\=replace=[6]XY,substitute_overflow_length,substitute_literal + 123abc123\=replace=[5]XY,substitute_overflow_length + 123abc123\=replace=[5]XY,substitute_overflow_length,substitute_literal + 123abc123\=replace=[4]XY,substitute_overflow_length + 123abc123\=replace=[4]XY,substitute_overflow_length,substitute_literal + 123abc123\=replace=[3]XY,substitute_overflow_length + 123abc123\=replace=[3]XY,substitute_overflow_length,substitute_literal + 123abc123\=replace=[2]XY,substitute_overflow_length + 123abc123\=replace=[2]XY,substitute_overflow_length,substitute_literal + +/abc/substitute_literal + 123abc123\=replace=XYZ + 123abc123\=replace=[10]XYZ +\= Expect error 123abc123\=replace=[9]XYZ 123abc123\=substitute_overflow_length,replace=[9]XYZ 123abc123\=substitute_overflow_length,replace=[6]XYZ @@ -4714,7 +4745,7 @@ B)x/alt_verbnames,mark /abcd/null_context abcd\=null_context -\= Expect error - not allowed together +\= Expect not allowed together abcd\=null_context,find_limits abcd\=allusedtext,startchar @@ -6061,6 +6092,8 @@ a)"xI /abc/replace=\U$0,substitute_extended,substitute_case_callout=a.U.b.V. XabcY +\= Expect not supported + XabcY\=null_context /a/substitute_extended XaY\=replace=\U$0,substitute_case_callout=a.U. diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 99714596e..6e671d1ad 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -13884,6 +13884,61 @@ Failed: error -48: no more memory: 23 code units are needed 1: fruit\x00juice lemon blackberry /abc/ + 123abc123\=replace=XYZ + 1: 123XYZ123 + 123abc123\=replace=[10]XYZ + 1: 123XYZ123 +\= Expect error + 123abc123\=replace=[9]XYZ +Failed: error -48: no more memory + 123abc123\=substitute_overflow_length,replace=[9]XYZ +Failed: error -48: no more memory: 10 code units are needed + 123abc123\=substitute_overflow_length,replace=[6]XYZ +Failed: error -48: no more memory: 10 code units are needed + 123abc123\=substitute_overflow_length,replace=[1]XYZ +Failed: error -48: no more memory: 10 code units are needed + 123abc123\=substitute_overflow_length,replace=[0]XYZ +Failed: error -48: no more memory: 10 code units are needed + +/abc/ + 123abc123\=replace=XY + 1: 123XY123 + 123abc123\=replace=[9]XY + 1: 123XY123 + 123abc123\=replace=[9]XY,substitute_literal + 1: 123XY123 +\= Expect error + 123abc123\=replace=[8]XY,substitute_overflow_length +Failed: error -48: no more memory: 9 code units are needed + 123abc123\=replace=[8]XY,substitute_overflow_length,substitute_literal +Failed: error -48: no more memory: 9 code units are needed + 123abc123\=replace=[6]XY,substitute_overflow_length +Failed: error -48: no more memory: 9 code units are needed + 123abc123\=replace=[6]XY,substitute_overflow_length,substitute_literal +Failed: error -48: no more memory: 9 code units are needed + 123abc123\=replace=[5]XY,substitute_overflow_length +Failed: error -48: no more memory: 9 code units are needed + 123abc123\=replace=[5]XY,substitute_overflow_length,substitute_literal +Failed: error -48: no more memory: 9 code units are needed + 123abc123\=replace=[4]XY,substitute_overflow_length +Failed: error -48: no more memory: 9 code units are needed + 123abc123\=replace=[4]XY,substitute_overflow_length,substitute_literal +Failed: error -48: no more memory: 9 code units are needed + 123abc123\=replace=[3]XY,substitute_overflow_length +Failed: error -48: no more memory: 9 code units are needed + 123abc123\=replace=[3]XY,substitute_overflow_length,substitute_literal +Failed: error -48: no more memory: 9 code units are needed + 123abc123\=replace=[2]XY,substitute_overflow_length +Failed: error -48: no more memory: 9 code units are needed + 123abc123\=replace=[2]XY,substitute_overflow_length,substitute_literal +Failed: error -48: no more memory: 9 code units are needed + +/abc/substitute_literal + 123abc123\=replace=XYZ + 1: 123XYZ123 + 123abc123\=replace=[10]XYZ + 1: 123XYZ123 +\= Expect error 123abc123\=replace=[9]XYZ Failed: error -48: no more memory 123abc123\=substitute_overflow_length,replace=[9]XYZ @@ -15131,7 +15186,7 @@ No match /abcd/null_context abcd\=null_context 0: abcd -\= Expect error - not allowed together +\= Expect not allowed together abcd\=null_context,find_limits ** Not allowed together: find_limits null_context abcd\=allusedtext,startchar @@ -18065,6 +18120,9 @@ Failed: error -55 at offset 3 in replacement: requested value is not set /abc/replace=\U$0,substitute_extended,substitute_case_callout=a.U.b.V. XabcY 1: XUVcY +\= Expect not supported + XabcY\=null_context +** Replacement case callouts are not supported with null_context. /a/substitute_extended XaY\=replace=\U$0,substitute_case_callout=a.U.