Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

JSON.dump: avoid redundant UTF-8 validation #595

Merged
merged 1 commit into from
Oct 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 16 additions & 49 deletions ext/json/ext/generator/generator.c
Original file line number Diff line number Diff line change
Expand Up @@ -66,42 +66,6 @@ static const char trailingBytesForUTF8[256] = {
static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
0x03C82080UL, 0xFA082080UL, 0x82082080UL };

/*
 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
 * This must be called with the length pre-determined by the first byte.
 * If not calling this from ConvertUTF8to*, then the length can be set by:
 *  length = trailingBytesForUTF8[*source]+1;
 * and the sequence is illegal right away if there aren't that many bytes
 * available.
 *
 * source: pointer to the lead byte; bytes source[0] .. source[length-1]
 *         are read, so the caller must guarantee they are available.
 * length: total number of bytes in the candidate sequence (lead byte
 *         included).
 *
 * Returns 1 when the sequence is legal UTF-8 and 0 otherwise. Any length
 * outside 1..4 returns 0: the Unicode definition of UTF-8 goes up to
 * 4-byte sequences.
 */
static unsigned char isLegalUTF8(const UTF8 *source, unsigned long length)
{
    UTF8 a;
    const UTF8 *srcptr = source + length;

    switch (length) {
        default: return 0;
        /*
         * Trailing bytes are checked from the last one back to the second
         * one; every case deliberately falls through to the next when its
         * byte is acceptable.
         */
        case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
            /* fall through */
        case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
            /* fall through */
        case 2:
            /*
             * The second byte must always be a continuation byte
             * (0x80..0xBF). Checking only the upper bound here let the
             * lead bytes 0xED and 0xF4 accept a second byte below 0x80,
             * because their arms below only tighten the upper bound.
             */
            if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;

            switch (*source) {
                /* no fall-through in this inner switch */
                case 0xE0: if (a < 0xA0) return 0; break; /* overlong 3-byte form */
                case 0xED: if (a > 0x9F) return 0; break; /* UTF-16 surrogate range */
                case 0xF0: if (a < 0x90) return 0; break; /* overlong 4-byte form */
                case 0xF4: if (a > 0x8F) return 0; break; /* beyond U+10FFFF */
                default: break; /* generic range already enforced above */
            }
            /* fall through */
        case 1:
            /* 0x80..0xC1 can never start a sequence (continuation bytes
             * and the overlong leads 0xC0/0xC1). */
            if (*source >= 0x80 && *source < 0xC2) return 0;
    }
    /* Lead bytes above 0xF4 would encode code points past U+10FFFF. */
    if (*source > 0xF4) return 0;
    return 1;
}

/* Escapes the UTF16 character and stores the result in the buffer buf. */
static void unicode_escape(char *buf, UTF16 character)
{
Expand Down Expand Up @@ -130,17 +94,18 @@ static void convert_UTF8_to_JSON_ASCII(FBuffer *buffer, VALUE string, char scrip
const UTF8 *sourceEnd = source + RSTRING_LEN(string);
char buf[6] = { '\\', 'u' };

while (source < sourceEnd) {
UTF32 ch = 0;
unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
if (source + extraBytesToRead >= sourceEnd) {
rb_raise(rb_path2class("JSON::GeneratorError"),
"partial character in source, but hit end");
}
if (!isLegalUTF8(source, extraBytesToRead+1)) {
int ascii_only = rb_enc_str_asciionly_p(string);

if (!ascii_only) {
if (RB_ENCODING_GET_INLINED(string) != rb_utf8_encindex() || RB_ENC_CODERANGE(string) != RUBY_ENC_CODERANGE_VALID) {
rb_raise(rb_path2class("JSON::GeneratorError"),
"source sequence is illegal/malformed utf-8");
}
}

while (source < sourceEnd) {
UTF32 ch = 0;
unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
/*
* The cases all fall through. See "Note A" below.
*/
Expand Down Expand Up @@ -238,6 +203,13 @@ static void convert_UTF8_to_JSON(FBuffer *buffer, VALUE string, char script_safe
char buf[6] = { '\\', 'u' };
int ascii_only = rb_enc_str_asciionly_p(string);

if (!ascii_only) {
if (RB_ENCODING_GET_INLINED(string) != rb_utf8_encindex() || RB_ENC_CODERANGE(string) != RUBY_ENC_CODERANGE_VALID) {
rb_raise(rb_path2class("JSON::GeneratorError"),
"source sequence is illegal/malformed utf-8");
}
}

for (start = 0, end = 0; end < len;) {
p = ptr + end;
c = (unsigned char) *p;
Expand Down Expand Up @@ -309,11 +281,6 @@ static void convert_UTF8_to_JSON(FBuffer *buffer, VALUE string, char script_safe
continue;
}
}

if (!isLegalUTF8((UTF8 *) p, clen)) {
rb_raise(rb_path2class("JSON::GeneratorError"),
"source sequence is illegal/malformed utf-8");
}
}
end += clen;
}
Expand Down
1 change: 0 additions & 1 deletion ext/json/ext/generator/generator.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ static const int halfShift = 10; /* used for shifting by 10 bits */
static const UTF32 halfBase = 0x0010000UL;
static const UTF32 halfMask = 0x3FFUL;

static unsigned char isLegalUTF8(const UTF8 *source, unsigned long length);
static void unicode_escape(char *buf, UTF16 character);
static void unicode_escape_to_buffer(FBuffer *buffer, char buf[6], UTF16 character);
static void convert_UTF8_to_JSON_ASCII(FBuffer *buffer, VALUE string, char script_safe);
Expand Down
Loading