diff --git a/locale.c b/locale.c index 1f5f5eb82fd7..53760f6d4ba9 100644 --- a/locale.c +++ b/locale.c @@ -9609,6 +9609,8 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string, bool utf8 /* Is the input in UTF-8? */ ) { + PERL_ARGS_ASSERT_MEM_COLLXFRM_; + /* mem_collxfrm_() is like strxfrm() but with two important differences. * First, it handles embedded NULs. Second, it allocates a bit more memory * than needed for the transformed data itself. The real transformed data @@ -9637,11 +9639,18 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string, locale_t constructed_locale = (locale_t) 0; # endif - PERL_ARGS_ASSERT_MEM_COLLXFRM_; - /* Must be NUL-terminated */ assert(*(input_string + len) == '\0'); + /* We may have to allocate memory to hold modified versions of the input. + * Initialize to NULL here, and before any return, free them all. Those + * that do get allocated will be non-NULL then, and get freed */ + char * sans_nuls = NULL; /* NULs changed to lowest collating ctrl */ + char * sans_highs = NULL; /* >0xFF changed to highest collating byte + for non-UTF8 locales */ + void * free_me = NULL; /* some called functions may allocate memory that + this function then is required to free */ + if (PL_collxfrm_mult == 0) { /* unknown or bad */ if (PL_collxfrm_base != 0) { /* bad collation => skip */ DEBUG_L(PerlIO_printf(Perl_debug_log, @@ -9659,10 +9668,10 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string, * otherwise contain that character, but otherwise there may be * less-than-perfect results with that character and NUL. This is * unavoidable unless we replace strxfrm with our own implementation. */ + if (UNLIKELY(s_strlen < len)) { /* Only execute if there is an embedded NUL */ char * e = s + len; - char * sans_nuls; STRLEN sans_nuls_len; int try_non_controls; char this_replacement_char[] = "?\0"; /* Room for a two-byte string, @@ -9825,148 +9834,141 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string, len = strlen(s); } /* End of replacing NULs */ - /* Make sure the UTF8ness of the string and locale match */ - if (utf8 != PL_in_utf8_COLLATE_locale) { - /* XXX convert above Unicode to 10FFFF? */ - const char * const t = s; /* Temporary so we can later find where the - input was */ + if (! utf8) { - /* Here they don't match. Change the string's to be what the locale is - * expecting */ - - if (! utf8) { /* locale is UTF-8, but input isn't; upgrade the input */ - s = (char *) bytes_to_utf8((const U8 *) s, &len); + /* When the locale is UTF-8, strxfrm() is expecting a UTF-8 string. + * Here, the string isn't. Convert it to be so. */ + if (PL_in_utf8_COLLATE_locale) { + s = (char *) bytes_to_utf8_free_me((const U8 *) s, &len, &free_me); utf8 = TRUE; } - else { /* locale is not UTF-8; but input is; downgrade the input */ - - s = (char *) bytes_from_utf8((const U8 *) s, &len, &utf8); - - /* If the downgrade was successful we are done, but if the input - * contains things that require UTF-8 to represent, have to do - * damage control ... */ - if (UNLIKELY(utf8)) { - - /* What we do is construct a non-UTF-8 string with - * 1) the characters representable by a single byte converted - * to be so (if necessary); - * 2) and the rest converted to collate the same as the - * highest collating representable character. That makes - * them collate at the end. This is similar to how we - * handle embedded NULs, but we use the highest collating - * code point instead of the smallest. Like the NUL case, - * this isn't perfect, but is the best we can reasonably - * do. Every above-255 code point will sort the same as - * the highest-sorting 0-255 code point. If that code - * point can combine in a sequence with some other code - * points for weight calculations, us changing something to - * be it can adversely affect the results. But in most - * cases, it should work reasonably. And note that this is - * really an illegal situation: using code points above 255 - * on a locale where only 0-255 are valid. If two strings - * sort entirely equal, then the sort order for the - * above-255 code points will be in code point order. */ - - utf8 = FALSE; - - /* If we haven't calculated the code point with the maximum - * collating order for this locale, do so now */ - if (! PL_strxfrm_max_cp) { - int j; - - /* The current transformed string that collates the - * highest (except it also includes the prefixed collation - * index. */ - char * cur_max_x = NULL; - - /* Look through all legal code points (NUL isn't) */ - for (j = 1; j < 256; j++) { - char * x; - STRLEN x_len; - char cur_source[] = { '\0', '\0' }; - - /* Create a 1-char string of the current code point */ - cur_source[0] = (char) j; - - /* Then transform it */ - x = mem_collxfrm_(cur_source, 1, &x_len, FALSE); - - /* If something went wrong (which it shouldn't), just - * ignore this code point */ - if (! x) { - continue; - } - /* If this character's transformation is higher than - * the current highest, this one becomes the highest */ - if ( cur_max_x == NULL - || strGT(x + COLLXFRM_HDR_LEN, - cur_max_x + COLLXFRM_HDR_LEN)) - { - PL_strxfrm_max_cp = j; - Safefree(cur_max_x); - cur_max_x = x; - } - else { - Safefree(x); - } + /* We are ready to call strxfrm() */ + } + else if (! PL_in_utf8_COLLATE_locale) { + + /* Here, the string is UTF-8, but the locale isn't. strxfrm() is + * expecting a non-UTF-8 string. Convert the string to bytes. If + * that succeeds, we are ready to call strxfrm() */ + utf8 = FALSE; + if (UNLIKELY(! utf8_to_bytes_new_pv((const U8 **) &s, &len, &free_me))) + { + /* But here, it didn't succeed; have to do damage control ... + * + * What we do is construct a non-UTF-8 string with + * 1) the characters representable by a single byte converted to + * be so (if not already); + * 2) and the rest converted to collate the same as the highest + * collating representable character. That makes them collate + * at the end. This is similar to how we handle embedded NULs, + * but we use the highest collating code point instead of the + * smallest. Like the NUL case, this isn't perfect, but is the + * best we can reasonably do. Every above-255 code point will + * sort the same as the highest-sorting 0-255 code point. If + * that code point can combine in a sequence with some other + * code points for weight calculations, us changing something + * to be it can adversely affect the results. But in most + * cases, it should work reasonably. And note that this is + * really an illegal situation: using code points above 255 on + * a locale where only 0-255 are valid. If two strings sort + * entirely equal, then the sort order for the above-255 code + * points will be in code point order. + * + * If we haven't calculated the code point with the maximum + * collating order for this locale, do so now */ + if (! PL_strxfrm_max_cp) { + int j; + + /* The current transformed string that collates the + * highest (except it also includes the prefixed collation + * index. */ + char * cur_max_x = NULL; + + /* Look through all legal code points (NUL isn't) */ + for (j = 1; j < 256; j++) { + char * x; + STRLEN x_len; + char cur_source[] = { '\0', '\0' }; + + /* Create a 1-char string of the current code point */ + cur_source[0] = (char) j; + + /* Then transform it */ + x = mem_collxfrm_(cur_source, 1, &x_len, FALSE); + + /* If something went wrong (which it shouldn't), just + * ignore this code point */ + if (! x) { + continue; } - if (! cur_max_x) { - DEBUG_L(PerlIO_printf(Perl_debug_log, - "mem_collxfrm_: Couldn't find any character to" - " replace above-Latin1 chars in locale %s with", - PL_collation_name)); - goto bad; + /* If this character's transformation is higher than + * the current highest, this one becomes the highest */ + if ( cur_max_x == NULL + || strGT(x + COLLXFRM_HDR_LEN, + cur_max_x + COLLXFRM_HDR_LEN)) + { + PL_strxfrm_max_cp = j; + Safefree(cur_max_x); + cur_max_x = x; + } + else { + Safefree(x); } + } + if (! cur_max_x) { DEBUG_L(PerlIO_printf(Perl_debug_log, - "mem_collxfrm_: highest 1-byte collating character" - " in locale %s is 0x%02X\n", - PL_collation_name, - PL_strxfrm_max_cp)); - - Safefree(cur_max_x); + "mem_collxfrm_: Couldn't find any character to" + " replace above-Latin1 chars in locale %s with", + PL_collation_name)); + goto bad; } - /* Here we know which legal code point collates the highest. - * We are ready to construct the non-UTF-8 string. The length - * will be at least 1 byte smaller than the input string - * (because we changed at least one 2-byte character into a - * single byte), but that is eaten up by the trailing NUL */ - Newx(s, len, char); + DEBUG_L(PerlIO_printf(Perl_debug_log, + "mem_collxfrm_: highest 1-byte collating character" + " in locale %s is 0x%02X\n", + PL_collation_name, + PL_strxfrm_max_cp)); - { - STRLEN i; - STRLEN d= 0; - char * e = (char *) t + len; - - for (i = 0; i < len; i+= UTF8SKIP(t + i)) { - U8 cur_char = t[i]; - if (UTF8_IS_INVARIANT(cur_char)) { - s[d++] = cur_char; - } - else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(t + i, e)) { - s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]); - } - else { /* Replace illegal cp with highest collating - one */ - s[d++] = PL_strxfrm_max_cp; - } - } - s[d++] = '\0'; - Renew(s, d, char); /* Free up unused space */ + Safefree(cur_max_x); + } + + /* Here we know which legal code point collates the highest. We + * are ready to construct the non-UTF-8 string. The length will be + * at least 1 byte smaller than the input string (because we + * changed at least one 2-byte character into a single byte), but + * that is eaten up by the trailing NUL + * + * May shrink; will never grow */ + Newx(sans_highs, len, char); + char * d = sans_highs; + + const char * const e = s + len; + while (s < e) { + if (UTF8_IS_INVARIANT(*s)) { + *d++ = *s++; + continue; } + + if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, e)) { + *d++ = EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s + 1)); + } + else { /* Replace illegal cp's with highest collating one */ + *d++ = PL_strxfrm_max_cp; + } + + s+= UTF8SKIP(s); } - } - /* Here, we have constructed a modified version of the input. It could - * be that we already had a modified copy before we did this version. - * If so, that copy is no longer needed */ - if (t != input_string) { - Safefree(t); + len = d - sans_highs; + *d++ = '\0'; + + s = sans_highs; } } + /* else // Here both the locale and string are UTF-8 */ + /* XXX convert above Unicode to 10FFFF? */ length_in_chars = (utf8) ? utf8_length((U8 *) s, (U8 *) s + len) @@ -9988,6 +9990,13 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string, /* Store the collation id */ *(PERL_UINTMAX_T *)xbuf = PL_collation_ix; +# define CLEANUP_STRXFRM_COMMON \ + STMT_START { \ + Safefree(free_me); \ + Safefree(sans_nuls); \ + Safefree(sans_highs); \ + } STMT_END + # if defined(USE_POSIX_2008_LOCALE) && defined HAS_STRXFRM_L # ifdef USE_LOCALE_CTYPE @@ -10004,6 +10013,7 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string, STMT_START { \ if (constructed_locale != (locale_t) 0) \ freelocale(constructed_locale); \ + CLEANUP_STRXFRM_COMMON; \ } STMT_END # else # define my_strxfrm(dest, src, n) strxfrm(dest, src, n) @@ -10012,9 +10022,12 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string, orig_CTYPE_locale = toggle_locale_c(LC_CTYPE, PL_collation_name); # define CLEANUP_STRXFRM \ - restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale) + STMT_START { \ + restore_toggled_locale_c(LC_CTYPE, orig_CTYPE_locale); \ + CLEANUP_STRXFRM_COMMON; \ + } STMT_END # else -# define CLEANUP_STRXFRM NOOP +# define CLEANUP_STRXFRM CLEANUP_STRXFRM_COMMON # endif # endif @@ -10154,28 +10167,20 @@ Perl_mem_collxfrm_(pTHX_ const char *input_string, first_time = FALSE; } - CLEANUP_STRXFRM; - DEBUG_L(print_collxfrm_input_and_return(s, s + len, xbuf, *xlen, utf8)); + CLEANUP_STRXFRM; /* Free up unneeded space; retain enough for trailing NUL */ Renew(xbuf, COLLXFRM_HDR_LEN + *xlen + 1, char); - if (s != input_string) { - Safefree(s); - } - return xbuf; bad: - CLEANUP_STRXFRM; DEBUG_L(print_collxfrm_input_and_return(s, s + len, NULL, 0, utf8)); + CLEANUP_STRXFRM; Safefree(xbuf); - if (s != input_string) { - Safefree(s); - } *xlen = 0; return NULL; diff --git a/t/run/locale.t b/t/run/locale.t index 11f97f146ed6..dbaba7e4ed50 100644 --- a/t/run/locale.t +++ b/t/run/locale.t @@ -584,6 +584,32 @@ else { EOF } +SKIP: +{ + my @locales = find_locales( [ qw(LC_CTYPE LC_COLLATE) ] ); + my (undef, $non_utf8_ref) = classify_locales_wrt_utf8ness(\@locales); + my @non_utf8_locales = grep { $_ !~ / \b C \b | POSIX /x } + $non_utf8_ref->@*; + skip "didn't find a suitable non-UTF-8 locale", 1 unless + @non_utf8_locales; + my $locale = $non_utf8_locales[0]; + + fresh_perl_is(<<"EOF", "ok\n", {}, "Handles above Latin1 and NUL in non-UTF8 locale"); +use locale; +use POSIX qw(setlocale LC_COLLATE); +if (setlocale(LC_COLLATE, '$locale')) { + my \$x = "A\\xB5\\x00B"; + utf8::upgrade(\$x); + my \$y = "\\x{100}"; + my \$cmp = \$x cmp \$y; + print \$cmp <= 0 ? "ok\n" : "not ok\n"; +} +else { + print "ok\n"; +} +EOF +} + SKIP: { # GH #20085 my @utf8_locales = find_utf8_ctype_locales(); skip "didn't find a UTF-8 locale", 1 unless @utf8_locales;