Skip to content

WIP: slightly improve substitutions #562

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions doc/pcre2api.3
Original file line number Diff line number Diff line change
Expand Up @@ -3815,9 +3815,10 @@ PCRE2_SUBSTITUTE_OVERFLOW_LENGTH changes what happens when the output buffer is
too small. The default action is to return PCRE2_ERROR_NOMEMORY immediately. If
this option is set, however, \fBpcre2_substitute()\fP continues to go through
the motions of matching and substituting (without, of course, writing anything)
in order to compute the size of buffer that is needed. This value is passed
back via the \fIoutlengthptr\fP variable, with the result of the function still
being PCRE2_ERROR_NOMEMORY.
in order to compute the size of buffer that is needed, which will include the
extra space for the terminating NUL. This value is passed back via the
\fIoutlengthptr\fP variable, with the result of the function still being
PCRE2_ERROR_NOMEMORY.
.P
Passing a buffer size of zero is a permitted way of finding out how much memory
is needed for a given substitution. However, this does mean that the entire
Expand Down Expand Up @@ -3938,7 +3939,7 @@ that can be applied to group captures. For example, if group 1 has captured
.P
If either PCRE2_UTF or PCRE2_UCP was set when the pattern was compiled, Unicode
properties are used for case forcing characters whose code points are greater
than 127. However, only basic case folding, as determined by the Unicode file
than 127. However, only simple case folding, as determined by the Unicode file
\fBCaseFolding.txt\fP is supported. PCRE2 does not support language-specific
special casing rules such as using different lower case Greek sigmas in the
middle and ends of words (as defined in the Unicode file
Expand Down
2 changes: 1 addition & 1 deletion doc/pcre2test.1
Original file line number Diff line number Diff line change
Expand Up @@ -1225,7 +1225,7 @@ command are of two types.
The following modifiers set options for \fBpcre2_match()\fP or
\fBpcre2_dfa_match()\fP. See
.\" HREF
\fBpcreapi\fP
\fBpcre2api\fP
.\"
for a description of their effects.
.sp
Expand Down
97 changes: 46 additions & 51 deletions src/pcre2_compile.c
Original file line number Diff line number Diff line change
Expand Up @@ -826,28 +826,28 @@ for (;;)
return;

case META_CAPTURE:
fprintf(stderr, "META_CAPTURE %d", meta_arg);
fprintf(stderr, "META_CAPTURE %u", meta_arg);
break;

case META_RECURSE:
GETOFFSET(offset, pptr);
fprintf(stderr, "META_RECURSE %d %zd", meta_arg, offset);
fprintf(stderr, "META_RECURSE %u %zu", meta_arg, offset);
break;

case META_BACKREF:
if (meta_arg < 10)
offset = cb->small_ref_offset[meta_arg];
else
GETOFFSET(offset, pptr);
fprintf(stderr, "META_BACKREF %d %zd", meta_arg, offset);
fprintf(stderr, "META_BACKREF %u %zu", meta_arg, offset);
break;

case META_ESCAPE:
if (meta_arg == ESC_P || meta_arg == ESC_p)
{
uint32_t ptype = *pptr >> 16;
uint32_t pvalue = *pptr++ & 0xffff;
fprintf(stderr, "META \\%c %d %d", (meta_arg == ESC_P)? CHAR_P:CHAR_p,
fprintf(stderr, "META \\%c %u %u", (meta_arg == ESC_P)? CHAR_P:CHAR_p,
ptype, pvalue);
}
else
Expand Down Expand Up @@ -914,7 +914,7 @@ for (;;)
case META_LOOKAHEAD_NA: fprintf(stderr, "META (*napla:"); break;
case META_SCRIPT_RUN: fprintf(stderr, "META (*sr:"); break;
case META_KET: fprintf(stderr, "META )"); break;
case META_ALT: fprintf(stderr, "META | %d", meta_arg); break;
case META_ALT: fprintf(stderr, "META | %u", meta_arg); break;

case META_CLASS: fprintf(stderr, "META ["); break;
case META_CLASS_NOT: fprintf(stderr, "META [^"); break;
Expand All @@ -925,8 +925,8 @@ for (;;)
case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break;
case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break;

case META_POSIX: fprintf(stderr, "META_POSIX %d", *pptr++); break;
case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %d", *pptr++); break;
case META_POSIX: fprintf(stderr, "META_POSIX %u", *pptr++); break;
case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %u", *pptr++); break;

case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break;
case META_FAIL: fprintf(stderr, "META (*FAIL)"); break;
Expand All @@ -941,22 +941,22 @@ for (;;)
break;

case META_LOOKBEHIND:
fprintf(stderr, "META (?<= %d %d", meta_arg, *pptr);
fprintf(stderr, "META (?<= %u %u", meta_arg, *pptr);
pptr += 2;
break;

case META_LOOKBEHIND_NA:
fprintf(stderr, "META (*naplb: %d %d", meta_arg, *pptr);
fprintf(stderr, "META (*naplb: %u %u", meta_arg, *pptr);
pptr += 2;
break;

case META_LOOKBEHINDNOT:
fprintf(stderr, "META (?<! %d %d", meta_arg, *pptr);
fprintf(stderr, "META (?<! %u %u", meta_arg, *pptr);
pptr += 2;
break;

case META_CALLOUT_NUMBER:
fprintf(stderr, "META (?C%d) next=%d/%d", pptr[2], pptr[0],
fprintf(stderr, "META (?C%u) next=%u/%u", pptr[2], pptr[0],
pptr[1]);
pptr += 3;
break;
Expand All @@ -965,86 +965,86 @@ for (;;)
{
uint32_t patoffset = *pptr++; /* Offset of next pattern item */
uint32_t patlength = *pptr++; /* Length of next pattern item */
fprintf(stderr, "META (?Cstring) length=%d offset=", *pptr++);
fprintf(stderr, "META (?Cstring) length=%u offset=", *pptr++);
GETOFFSET(offset, pptr);
fprintf(stderr, "%zd next=%d/%d", offset, patoffset, patlength);
fprintf(stderr, "%zu next=%u/%u", offset, patoffset, patlength);
}
break;

case META_RECURSE_BYNAME:
fprintf(stderr, "META (?(&name) length=%d offset=", *pptr++);
fprintf(stderr, "META (?(&name) length=%u offset=", *pptr++);
GETOFFSET(offset, pptr);
fprintf(stderr, "%zd", offset);
fprintf(stderr, "%zu", offset);
break;

case META_BACKREF_BYNAME:
fprintf(stderr, "META_BACKREF_BYNAME length=%d offset=", *pptr++);
fprintf(stderr, "META_BACKREF_BYNAME length=%u offset=", *pptr++);
GETOFFSET(offset, pptr);
fprintf(stderr, "%zd", offset);
fprintf(stderr, "%zu", offset);
break;

case META_COND_NUMBER:
fprintf(stderr, "META_COND_NUMBER %d offset=", pptr[SIZEOFFSET]);
fprintf(stderr, "META_COND_NUMBER %u offset=", pptr[SIZEOFFSET]);
GETOFFSET(offset, pptr);
fprintf(stderr, "%zd", offset);
fprintf(stderr, "%zu", offset);
pptr++;
break;

case META_COND_DEFINE:
fprintf(stderr, "META (?(DEFINE) offset=");
GETOFFSET(offset, pptr);
fprintf(stderr, "%zd", offset);
fprintf(stderr, "%zu", offset);
break;

case META_COND_VERSION:
fprintf(stderr, "META (?(VERSION%s", (*pptr++ == 0)? "=" : ">=");
fprintf(stderr, "%d.", *pptr++);
fprintf(stderr, "%d)", *pptr++);
fprintf(stderr, "%u.", *pptr++);
fprintf(stderr, "%u)", *pptr++);
break;

case META_COND_NAME:
fprintf(stderr, "META (?(<name>) length=%d offset=", *pptr++);
fprintf(stderr, "META (?(<name>) length=%u offset=", *pptr++);
GETOFFSET(offset, pptr);
fprintf(stderr, "%zd", offset);
fprintf(stderr, "%zu", offset);
break;

case META_COND_RNAME:
fprintf(stderr, "META (?(R&name) length=%d offset=", *pptr++);
fprintf(stderr, "META (?(R&name) length=%u offset=", *pptr++);
GETOFFSET(offset, pptr);
fprintf(stderr, "%zd", offset);
fprintf(stderr, "%zu", offset);
break;

/* This is kept as a name, because it might be. */

case META_COND_RNUMBER:
fprintf(stderr, "META (?(Rnumber) length=%d offset=", *pptr++);
fprintf(stderr, "META (?(Rnumber) length=%u offset=", *pptr++);
GETOFFSET(offset, pptr);
fprintf(stderr, "%zd", offset);
fprintf(stderr, "%zu", offset);
break;

case META_SCS_NAME:
fprintf(stderr, "META (*scan_substring:(<name>) length=%d offset=", *pptr++);
fprintf(stderr, "META (*scan_substring:(<name>) length=%u offset=", *pptr++);
GETOFFSET(offset, pptr);
fprintf(stderr, "%zd", offset);
fprintf(stderr, "%zu", offset);
break;

case META_SCS_NUMBER:
fprintf(stderr, "META_SCS_NUMBER %d offset=", pptr[SIZEOFFSET]);
fprintf(stderr, "META_SCS_NUMBER %u offset=", pptr[SIZEOFFSET]);
GETOFFSET(offset, pptr);
fprintf(stderr, "%zd", offset);
fprintf(stderr, "%zu", offset);
pptr++;
break;

case META_SCS_NEXT_NAME:
fprintf(stderr, "META_SCS_NEXT_NAME length=%d offset=", *pptr++);
fprintf(stderr, "META_SCS_NEXT_NAME length=%u offset=", *pptr++);
GETOFFSET(offset, pptr);
fprintf(stderr, "%zd", offset);
fprintf(stderr, "%zu", offset);
break;

case META_SCS_NEXT_NUMBER:
fprintf(stderr, "META_SCS_NEXT_NUMBER %d offset=", pptr[SIZEOFFSET]);
fprintf(stderr, "META_SCS_NEXT_NUMBER %u offset=", pptr[SIZEOFFSET]);
GETOFFSET(offset, pptr);
fprintf(stderr, "%zd", offset);
fprintf(stderr, "%zu", offset);
pptr++;
break;

Expand Down Expand Up @@ -1099,7 +1099,6 @@ associated JIT data. */
PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy(const pcre2_code *code)
{
PCRE2_SIZE *ref_count;
pcre2_code *newcode;

if (code == NULL) return NULL;
Expand All @@ -1113,7 +1112,7 @@ in the decoded tables. */

if ((code->flags & PCRE2_DEREF_TABLES) != 0)
{
ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
PCRE2_SIZE *ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Personally, I'm very happy with these changes.

I know Philip likes the old style of defining variables high up, at the top of a scope, and with a blank line after variable definitions.

But I don't see any benefit to having variables available for use, but not yet initialised. Much better to define & initialise at the same time (safer).

The compiler will hoist all the variables up to the top anyway (it will bump the stack pointer just once at the start of a block, rather than bump the stack pointer multiple times, when it sees a new variable).

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Partly it's because I'm a dinosaur from the age when one had to define variables like that, but partly also I find it makes it easier when looking back up some code to find where a variable is defined. However, I am not going to try to impose my own preferences on the future. I can certainly see the advantage of always initializing at definition time. So please don't worry about me too much.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The funny thing is that this change is still valid C89 code, and the main motivation wasn't to go against Philip's advice of defining variables at the beginning of blocks, but just to reduce the scope of this variable to where it is actually needed/used.

Since we have at least one CI job with -Wshadow and I wanted to minimize churn, I didn't rename the variable to reflect its status as a "temp" holder (it might even be optimized out).

(*ref_count)++;
}

Expand Down Expand Up @@ -2597,9 +2596,7 @@ won't be recognized. */
}

while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0)
{
ptr++;
}
}

/* Check name length */
Expand Down Expand Up @@ -5479,7 +5476,7 @@ static BOOL
find_dupname_details(PCRE2_SPTR name, uint32_t length, int *indexptr,
int *countptr, int *errorcodeptr, compile_block *cb)
{
uint32_t i, groupnumber;
uint32_t i;
int count;
PCRE2_UCHAR *slot = cb->name_table;

Expand Down Expand Up @@ -5511,8 +5508,9 @@ count = 0;

for (;;)
{
uint32_t groupnumber = GET2(slot,0);

count++;
groupnumber = GET2(slot,0);
cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
if (++i >= cb->names_found) break;
Expand Down Expand Up @@ -6800,7 +6798,7 @@ for (;; pptr++)
#ifdef MAYBE_UTF_MULTI
if (utf && NOT_FIRSTCU(code[-1]))
{
PCRE2_UCHAR *lastchar = code - 1;
PCRE2_SPTR lastchar = code - 1;
BACKCHAR(lastchar);
mclength = (uint32_t)(code - lastchar); /* Length of UTF character */
memcpy(mcbuffer, lastchar, CU2BYTES(mclength)); /* Save the char */
Expand Down Expand Up @@ -7940,7 +7938,6 @@ PCRE2_UCHAR *last_branch = code;
PCRE2_UCHAR *start_bracket = code;
BOOL lookbehind;
open_capitem capitem;
int capnumber = 0;
int okreturn = 1;
uint32_t *pptr = *pptrptr;
uint32_t firstcu, reqcu;
Expand Down Expand Up @@ -7998,7 +7995,7 @@ OP_SCBRAPOS, happens later, after the group has been compiled. */

if (*code == OP_CBRA)
{
capnumber = GET2(code, 1 + LINK_SIZE);
int capnumber = GET2(code, 1 + LINK_SIZE);
capitem.number = capnumber;
capitem.next = open_caps;
capitem.assert_depth = cb->assert_depth;
Expand Down Expand Up @@ -10212,11 +10209,9 @@ show_parsed(&cb);
#ifdef DEBUG_SHOW_CAPTURES
{
named_group *ng = cb.named_groups;
fprintf(stderr, "+++Captures: %d\n", cb.bracount);
fprintf(stderr, "+++Captures: %u\n", cb.bracount);
for (i = 0; i < cb.names_found; i++, ng++)
{
fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);
}
fprintf(stderr, "+++%3u %.*s\n", ng->number, ng->length, ng->name);
}
#endif

Expand Down Expand Up @@ -10415,11 +10410,11 @@ if (errorcode == 0 && cb.had_recurse)
rcode != NULL;
rcode = find_recurse(rcode + 1 + LINK_SIZE, utf))
{
int p, groupnumber;
int groupnumber = (int)GET(rcode, 1);

groupnumber = (int)GET(rcode, 1);
if (groupnumber == 0) rgroup = codestart; else
{
int p;
PCRE2_SPTR search_from = codestart;
rgroup = NULL;
for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)
Expand Down
Loading
Loading