Skip to content

Commit

Permalink
Merge branch '2.18'
Browse files Browse the repository at this point in the history
  • Loading branch information
cowtowncoder committed Sep 18, 2024
2 parents 7a28627 + 4d47aae commit 8cab937
Show file tree
Hide file tree
Showing 8 changed files with 110 additions and 32 deletions.
9 changes: 9 additions & 0 deletions release-notes/CREDITS-2.x
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,15 @@ Antonin Janec (@xtonic)
* Contributed #1218: Simplify Unicode surrogate pair conversion for generation
(2.17.0)

Ian Roberts (@ianroberts)
* Reported #223: `UTF8JsonGenerator` writes supplementary characters as a
surrogate pair: should use 4-byte encoding
(2.18.0)

Radovan Netuka (@rnetuka)
* Contributed fix for #223: `UTF8JsonGenerator` writes supplementary characters as a
surrogate pair: should use 4-byte encoding
(2.18.0)

Jared Stehler (@jaredstehler)
* Reported, contributed fix for #1274: `NUL`-corrupted keys, values on JSON serialization
(2.18.0)
Expand Down
4 changes: 4 additions & 0 deletions release-notes/VERSION-2.x
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ a pure JSON library.

2.18.0 (not yet released)

#223: `UTF8JsonGenerator` writes supplementary characters as a surrogate pair:
should use 4-byte encoding
(reported by Ian R)
(fix contributed by Radovan N)
#1230: Improve performance of `float` and `double` parsing from `TextBuffer`
(implemented by @pjfanning)
#1251: `InternCache` replace synchronized with `ReentrantLock` - the cache
Expand Down
19 changes: 19 additions & 0 deletions src/main/java/tools/jackson/core/json/JsonWriteFeature.java
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,25 @@ public enum JsonWriteFeature
*/
ESCAPE_FORWARD_SLASHES(true),

/**
* Feature that specifies how characters outside "Basic Multilingual Plane" (BMP) -- ones encoded
* as 4-byte UTF-8 sequences but represented in JVM memory as 2 16-bit "surrogate" {@code chars} --
* should be encoded as UTF-8 by {@link JsonGenerator}.
* If enabled, surrogate pairs are combined and flushed as a
* single, 4-byte UTF-8 character.
* If disabled, each {@code char} of pair is written separately: that is, as 2
* separate 6-byte JSON Unicode escape sequences (backslash, {@code u} and 4 hex digits)
* with values in Surrogate character ranges
* ({@code 0xD800} - {@code 0xDBFF} and {@code 0xDC00} - {@code 0xDFFF})
* <p>
* Note that this feature only has effect for {@link JsonGenerator}s that directly encode
* {@code byte}-based output, as UTF-8 (target {@link java.io.OutputStream}, {@code byte[]}
* and so on); it will not (can not) change handling of
* {@code char}-based output (like {@link java.io.Writer} or {@link java.lang.String}).
* <p>
* Feature is enabled by default in Jackson 3.0 (was disabled in 2.x).
*/
COMBINE_UNICODE_SURROGATES_IN_UTF8(true),

;

final private boolean _defaultState;
Expand Down
38 changes: 38 additions & 0 deletions src/main/java/tools/jackson/core/json/UTF8JsonGenerator.java
Original file line number Diff line number Diff line change
Expand Up @@ -1545,6 +1545,16 @@ private final void _writeStringSegment2(final char[] cbuf, int offset, final int
outputBuffer[outputPtr++] = (byte) (0xc0 | (ch >> 6));
outputBuffer[outputPtr++] = (byte) (0x80 | (ch & 0x3f));
} else {
// 3- or 4-byte character
if (_isSurrogateChar(ch)) {
final boolean combineSurrogates = JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8.enabledIn(_formatWriteFeatures);
if (combineSurrogates && offset < end) {
char highSurrogate = (char) ch;
char lowSurrogate = cbuf[offset++];
outputPtr = _outputSurrogatePair(highSurrogate, lowSurrogate, outputPtr);
continue;
}
}
outputPtr = _outputMultiByteChar(ch, outputPtr);
}
}
Expand Down Expand Up @@ -1583,6 +1593,16 @@ private final void _writeStringSegment2(final String text, int offset, final int
outputBuffer[outputPtr++] = (byte) (0xc0 | (ch >> 6));
outputBuffer[outputPtr++] = (byte) (0x80 | (ch & 0x3f));
} else {
// 3- or 4-byte character
if (_isSurrogateChar(ch)) {
final boolean combineSurrogates = JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8.enabledIn(_formatWriteFeatures);
if (combineSurrogates && offset < end) {
char highSurrogate = (char) ch;
char lowSurrogate = text.charAt(offset++);
outputPtr = _outputSurrogatePair(highSurrogate, lowSurrogate, outputPtr);
continue;
}
}
outputPtr = _outputMultiByteChar(ch, outputPtr);
}
}
Expand Down Expand Up @@ -2177,6 +2197,19 @@ protected final void _outputSurrogates(int surr1, int surr2) throws JacksonExcep
bbuf[_outputTail++] = (byte) (0x80 | (c & 0x3f));
}

// Combines a UTF-16 surrogate pair into one supplementary code point and
// appends its 4-byte UTF-8 encoding to `_outputBuffer`, starting at `outputPtr`.
// Only the low 10 bits of each argument are used; no validation of the
// surrogate ranges and no buffer-space check is performed here.
// Returns the buffer index following the last byte written.
//
// @since 2.18
private int _outputSurrogatePair(char highSurrogate, char lowSurrogate, int outputPtr) {
    // 10 payload bits from each half; the two bit groups do not overlap
    final int upperBits = (highSurrogate & 0x03FF) << 10;
    final int lowerBits = lowSurrogate & 0x03FF;
    // Supplementary planes start at 0x10000
    final int codePoint = 0x10000 + (upperBits | lowerBits);

    // Lead byte 11110xxx followed by three continuation bytes 10xxxxxx
    _outputBuffer[outputPtr] = (byte) (0xF0 | ((codePoint >> 18) & 0x07));
    _outputBuffer[outputPtr + 1] = (byte) (0x80 | ((codePoint >> 12) & 0x3F));
    _outputBuffer[outputPtr + 2] = (byte) (0x80 | ((codePoint >> 6) & 0x3F));
    _outputBuffer[outputPtr + 3] = (byte) (0x80 | (codePoint & 0x3F));

    return outputPtr + 4;
}

/**
*
* @param ch
Expand Down Expand Up @@ -2262,5 +2295,10 @@ protected final void _flushBuffer() throws JacksonException
// Picks the hex-digit table to use: `HEX_BYTES_UPPER` when
// `_cfgWriteHexUppercase` is set, `HEX_BYTES_LOWER` otherwise.
private byte[] getHexBytes() {
    if (_cfgWriteHexUppercase) {
        return HEX_BYTES_UPPER;
    }
    return HEX_BYTES_LOWER;
}

// Checks whether given UTF-16 code unit is a surrogate (high or low),
// that is, lies in the range 0xD800 - 0xDFFF.
//
// All surrogates share the top five bits `11011`; masking with 0xF800
// isolates exactly those bits. (Masking with 0xD800 would leave bit 13
// unchecked and wrongly accept 0xF800 - 0xFFFF, such as U+FFFD.)
//
// @param ch Code unit to check
// @return True if `ch` is in the surrogate range; false otherwise
//
// @since 2.18
private boolean _isSurrogateChar(int ch) {
    return (ch & 0xF800) == 0xD800;
}
}

Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@
import java.math.BigDecimal;
import java.math.BigInteger;

import org.junit.jupiter.api.Test;

import tools.jackson.core.*;
import tools.jackson.core.exc.StreamWriteException;

import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.*;

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,11 @@ class StringGenerationFromReaderTest
"Longer text & other stuff:\twith some\r\n\r\n random linefeeds etc added in to cause some \"special\" handling \\\\ to occur...\n"
};

private final JsonFactory FACTORY = newStreamFactory();
// 17-Sep-2024, tatu: [core#223] change to surrogates, let's use old behavior
// for now for simpler testing
private final JsonFactory FACTORY = streamFactoryBuilder()
.disable(JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8)
.build();

@Test
void basicEscaping() throws Exception
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,11 @@ class StringGenerationTest
"Longer text & other stuff:\twith some\r\n\r\n random linefeeds etc added in to cause some \"special\" handling \\\\ to occur...\n"
};

private final JsonFactory FACTORY = new JsonFactory();
// 17-Sep-2024, tatu: [core#223] change to surrogates, let's use old behavior
// for now for simpler testing
private final JsonFactory FACTORY = streamFactoryBuilder()
.disable(JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8)
.build();

@Test
void basicEscaping() throws Exception
Expand Down
Original file line number Diff line number Diff line change
@@ -1,19 +1,26 @@
package tools.jackson.failing;
package tools.jackson.core.json;

import java.io.ByteArrayOutputStream;
import java.io.StringWriter;
import java.io.Writer;

import tools.jackson.core.*;
import tools.jackson.core.json.JsonFactory;

import org.junit.jupiter.api.Test;

import tools.jackson.core.*;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

class Surrogate223Test extends JUnit5TestBase
{
private final JsonFactory JSON_F = newStreamFactory();
private final JsonFactory DEFAULT_JSON_F = newStreamFactory();

// for [core#223]
@Test
void surrogatesDefaultSetting() throws Exception {
// default in 3.x should be enabled (it was disabled in 2.x):
assertTrue(DEFAULT_JSON_F.isEnabled(JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8));
}

// for [core#223]
@Test
Expand All @@ -24,36 +31,41 @@ void surrogatesByteBacked() throws Exception
final String toQuote = new String(Character.toChars(0x1F602));
assertEquals(2, toQuote.length()); // just sanity check

// default should be disabled:
// assertFalse(JSON_F.isEnabled(JsonGenerator.Feature.ESCAPE_UTF8_SURROGATES));

out = new ByteArrayOutputStream();
g = JSON_F.createGenerator(ObjectWriteContext.empty(), out);

JsonFactory f = JsonFactory.builder()
.enable(JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8)
.build();
g = f.createGenerator(ObjectWriteContext.empty(), out);
g.writeStartArray();
g.writeString(toQuote);
g.writeEndArray();
g.close();
assertEquals(2 + 2 + 4, out.size()); // brackets, quotes, 4-byte encoding

// Also parse back to ensure correctness
JsonParser p = JSON_F.createParser(ObjectReadContext.empty(), out.toByteArray());
JsonParser p = f.createParser(ObjectReadContext.empty(), out.toByteArray());
assertToken(JsonToken.START_ARRAY, p.nextToken());
assertToken(JsonToken.VALUE_STRING, p.nextToken());
assertEquals(toQuote, p.getText());
assertToken(JsonToken.END_ARRAY, p.nextToken());
p.close();

// but may revert back to original behavior
out = new ByteArrayOutputStream();
g = JSON_F.createGenerator(ObjectWriteContext.empty(), out);
// g.enable(JsonGenerator.Feature.ESCAPE_UTF8_SURROGATES);
f = JsonFactory.builder()
.disable(JsonWriteFeature.COMBINE_UNICODE_SURROGATES_IN_UTF8)
.build();

g = f.createGenerator(ObjectWriteContext.empty(), out);
g.writeStartArray();
g.writeString(toQuote);
g.writeEndArray();
g.close();
assertEquals(2 + 2 + 12, out.size()); // brackets, quotes, 2 x 6 byte JSON escape
}

// for [core#223]
// for [core#223]: no change for character-backed (cannot do anything)
@Test
void surrogatesCharBacked() throws Exception
{
Expand All @@ -62,32 +74,20 @@ void surrogatesCharBacked() throws Exception
final String toQuote = new String(Character.toChars(0x1F602));
assertEquals(2, toQuote.length()); // just sanity check

// default should be disabled:
// assertFalse(JSON_F.isEnabled(JsonGenerator.Feature.ESCAPE_UTF8_SURROGATES));

out = new StringWriter();
g = JSON_F.createGenerator(ObjectWriteContext.empty(), out);
g = DEFAULT_JSON_F.createGenerator(ObjectWriteContext.empty(), out);
g.writeStartArray();
g.writeString(toQuote);
g.writeEndArray();
g.close();
assertEquals(2 + 2 + 2, out.toString().length()); // brackets, quotes, 2 chars as is

// Also parse back to ensure correctness
JsonParser p = JSON_F.createParser(ObjectReadContext.empty(), out.toString());
JsonParser p = DEFAULT_JSON_F.createParser(ObjectReadContext.empty(), out.toString());
assertToken(JsonToken.START_ARRAY, p.nextToken());
assertToken(JsonToken.VALUE_STRING, p.nextToken());
assertEquals(toQuote, p.getText());
assertToken(JsonToken.END_ARRAY, p.nextToken());
p.close();

// but may revert back to original behavior
out = new StringWriter();
g = JSON_F.createGenerator(ObjectWriteContext.empty(), out);
// g.enable(JsonGenerator.Feature.ESCAPE_UTF8_SURROGATES);
g.writeStartArray();
g.writeString(toQuote);
g.writeEndArray();
g.close();
assertEquals(2 + 2 + 12, out.toString().length()); // brackets, quotes, 2 x 6 byte JSON escape
}
}

0 comments on commit 8cab937

Please sign in to comment.