Skip to content

Commit a3f5415

Browse files
committed
regex capture group names must use identifier syntax
Prior to this commit, the non-first characters could be any \w character. But identifiers exclude a few \w characters, so this commit tightens what is allowed. Commit xd1e2a852fbc901b45fba20906a8f42ca227ae462 gave a list of the excluded characters, but I forgot a couple of details in generating that list, so it wasn't quite right. The complete corrected list is: GREEK YPOGEGRAMMENI; COMBINING CYRILLIC HUNDRED THOUSANDS SIGN; COMBINING CYRILLIC MILLIONS SIGN; COMBINING PARENTHESES OVERLAY; COMBINING ENCLOSING CIRCLE; COMBINING ENCLOSING SQUARE; COMBINING ENCLOSING DIAMOND; COMBINING ENCLOSING CIRCLE BACKSLASH; COMBINING ENCLOSING SCREEN; COMBINING ENCLOSING KEYCAP; COMBINING ENCLOSING UPWARD POINTING TRIANGLE; CIRCLED LATIN CAPITAL LETTER A - Z; CIRCLED LATIN SMALL LETTER A - Z; VERTICAL TILDE; COMBINING CYRILLIC TEN MILLIONS SIGN; COMBINING CYRILLIC HUNDRED MILLIONS SIGN; COMBINING CYRILLIC THOUSAND MILLIONS SIGN; ARABIC LIGATURE SHADDA WITH DAMMATAN ISOLATED FORM; ARABIC LIGATURE SHADDA WITH KASRATAN ISOLATED FORM; ARABIC LIGATURE SHADDA WITH FATHA ISOLATED FORM; ARABIC LIGATURE SHADDA WITH DAMMA ISOLATED FORM; ARABIC LIGATURE SHADDA WITH KASRA ISOLATED FORM; ARABIC LIGATURE SHADDA WITH SUPERSCRIPT ALEF ISOLATED FORM; ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM; ARABIC LIGATURE JALLAJALALOUHOU; ARABIC FATHATAN ISOLATED FORM; ARABIC DAMMATAN ISOLATED FORM; ARABIC KASRATAN ISOLATED FORM; ARABIC FATHA ISOLATED FORM; ARABIC DAMMA ISOLATED FORM; ARABIC KASRA ISOLATED FORM; ARABIC SHADDA ISOLATED FORM; ARABIC SUKUN ISOLATED FORM; SQUARED LATIN CAPITAL LETTER A - Z; NEGATIVE CIRCLED LATIN CAPITAL LETTER A - Z; NEGATIVE SQUARED LATIN CAPITAL LETTER A - Z.
1 parent 95b1020 commit a3f5415

File tree

2 files changed

+24
-2
lines changed

2 files changed

+24
-2
lines changed

regcomp.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2532,7 +2532,7 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
25322532
do {
25332533
RExC_parse_advance(advance);
25342534
} while ( RExC_parse < RExC_end
2535-
&& (advance = isWORDCHAR_utf8_safe( (U8 *) RExC_parse,
2535+
&& (advance = isIDCONT_utf8_safe( (U8 *) RExC_parse,
25362536
(U8 *) RExC_end)));
25372537
} else {
25382538
RExC_parse_inc_by(1); /* so the <- from the vFAIL is after the offending

t/re/pat.t

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ skip_all_without_unicode_tables();
2828
my $has_locales = locales_enabled('LC_CTYPE');
2929
my $utf8_locale = find_utf8_ctype_locale();
3030

31-
plan tests => 1296; # Update this when adding/deleting tests.
31+
plan tests => 1298; # Update this when adding/deleting tests.
3232

3333
run_tests() unless caller;
3434

@@ -1388,6 +1388,28 @@ EOP
13881388
fresh_perl_like($prog, qr!Group name must start with a non-digit word character!, {},
13891389
sprintf("'U+%04X not legal IDFirst'", ord($char)));
13901390
}
1391+
1392+
foreach my $char (chr(0x2115), chr(0x24B7)) {
1393+
my $prog = <<"EOP";
1394+
use utf8;;
1395+
no warnings 'utf8';
1396+
print 0 + "abc" =~ qr/(?<a${char}b>abc)/;
1397+
EOP
1398+
utf8::encode($prog);
1399+
if ($char =~ /\p{XID_Continue}/) {
1400+
fresh_perl_is($prog, 1,
1401+
{},
1402+
sprintf("U+%04X is legal IDCont",
1403+
ord($char)));
1404+
}
1405+
else {
1406+
fresh_perl_like($prog,
1407+
qr/Sequence .* not terminated/,
1408+
{},
1409+
sprintf("U+%04X not legal IDCont",
1410+
ord($char)));
1411+
}
1412+
}
13911413
}
13921414

13931415
{ # [perl #101710]

0 commit comments

Comments
 (0)