Skip to content

[libc] Reworked CharacterConverter isComplete into isFull and isEmpty #144799

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 17 additions & 16 deletions libc/src/__support/wchar/character_converter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,18 +30,20 @@ CharacterConverter::CharacterConverter(mbstate *mbstate) { state = mbstate; }

// Resets the conversion state held in `state`: zeroes the partially built
// character value and every byte counter assigned below.
// NOTE(review): both bytes_processed and bytes_stored are assigned here. The
// mbstate declaration in this same view appears to rename bytes_processed to
// bytes_stored, so one of these two assignments is presumably the pre-rename
// line - confirm against the merged file before relying on either.
void CharacterConverter::clear() {
state->partial = 0;
state->bytes_processed = 0;
state->bytes_stored = 0;
state->total_bytes = 0;
}

bool CharacterConverter::isComplete() {
return state->bytes_processed == state->total_bytes;
bool CharacterConverter::isFull() {
return state->bytes_stored == state->total_bytes && state->total_bytes != 0;
}

// Reports whether the conversion state currently holds no bytes, i.e. the
// stored-byte counter in `state` is zero.
bool CharacterConverter::isEmpty() {
  const bool nothing_stored = (state->bytes_stored == 0);
  return nothing_stored;
}

int CharacterConverter::push(char8_t utf8_byte) {
uint8_t num_ones = static_cast<uint8_t>(cpp::countl_one(utf8_byte));
// Checking the first byte if first push
if (state->bytes_processed == 0) {
if (isEmpty()) {
// UTF-8 char has 1 byte total
if (num_ones == 0) {
state->total_bytes = 1;
Expand All @@ -58,21 +60,21 @@ int CharacterConverter::push(char8_t utf8_byte) {
}
// Invalid first byte
else {
// bytes_processed and total_bytes will always be 0 here
// bytes_stored and total_bytes will always be 0 here
state->partial = static_cast<char32_t>(0);
return -1;
}
state->partial = static_cast<char32_t>(utf8_byte);
state->bytes_processed++;
state->bytes_stored++;
return 0;
}
// Any subsequent push
// Adding 6 more bits so need to left shift
if (num_ones == 1 && !isComplete()) {
if (num_ones == 1 && !isFull()) {
char32_t byte = utf8_byte & MASK_ENCODED_BITS;
state->partial = state->partial << ENCODED_BITS_PER_UTF8;
state->partial |= byte;
state->bytes_processed++;
state->bytes_stored++;
return 0;
}
// Invalid byte -> reset the state
Expand All @@ -82,18 +84,18 @@ int CharacterConverter::push(char8_t utf8_byte) {

int CharacterConverter::push(char32_t utf32) {
// we can't be partially through a conversion when pushing a utf32 value
if (!isComplete())
if (!isEmpty())
return -1;

state->partial = utf32;
state->bytes_processed = 0;

// determine number of utf-8 bytes needed to represent this utf32 value
constexpr char32_t MAX_VALUE_PER_UTF8_LEN[] = {0x7f, 0x7ff, 0xffff, 0x10ffff};
constexpr int NUM_RANGES = 4;
for (uint8_t i = 0; i < NUM_RANGES; i++) {
if (state->partial <= MAX_VALUE_PER_UTF8_LEN[i]) {
state->total_bytes = i + 1;
state->bytes_stored = i + 1;
return 0;
}
}
Expand All @@ -107,7 +109,7 @@ int CharacterConverter::push(char32_t utf32) {
ErrorOr<char32_t> CharacterConverter::pop_utf32() {
// If pop is called too early, do not reset the state, use error to determine
// whether enough bytes have been pushed
if (!isComplete() || state->bytes_processed == 0)
if (!isFull())
return Error(-1);
char32_t utf32 = state->partial;
// reset if successful pop
Expand All @@ -116,7 +118,7 @@ ErrorOr<char32_t> CharacterConverter::pop_utf32() {
}

ErrorOr<char8_t> CharacterConverter::pop_utf8() {
if (isComplete())
if (isEmpty())
return Error(-1);

constexpr char8_t FIRST_BYTE_HEADERS[] = {0, 0xC0, 0xE0, 0xF0};
Expand All @@ -125,9 +127,8 @@ ErrorOr<char8_t> CharacterConverter::pop_utf8() {
char32_t output;

// Shift to get the next 6 bits from the utf32 encoding
const size_t shift_amount =
(state->total_bytes - state->bytes_processed - 1) * ENCODED_BITS_PER_UTF8;
if (state->bytes_processed == 0) {
const size_t shift_amount = (state->bytes_stored - 1) * ENCODED_BITS_PER_UTF8;
if (isFull()) {
/*
Choose the correct set of most significant bits to encode the length
of the utf8 sequence. The remaining bits contain the most significant
Expand All @@ -141,7 +142,7 @@ ErrorOr<char8_t> CharacterConverter::pop_utf8() {
((state->partial >> shift_amount) & MASK_ENCODED_BITS);
}

state->bytes_processed++;
state->bytes_stored--;
return static_cast<char8_t>(output);
}

Expand Down
3 changes: 2 additions & 1 deletion libc/src/__support/wchar/character_converter.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ class CharacterConverter {
CharacterConverter(mbstate *mbstate);

void clear();
bool isComplete();
bool isFull();
bool isEmpty();

int push(char8_t utf8_byte);
int push(char32_t utf32);
Expand Down
6 changes: 3 additions & 3 deletions libc/src/__support/wchar/mbstate.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@ struct mbstate {

/*
Progress towards a conversion
For utf8 -> utf32, increases with each CharacterConverter::push(utf8_byte)
For utf32 -> utf8, increases with each CharacterConverter::pop_utf8()
Increases with each push(...) until it reaches total_bytes
Decreases with each pop(...) until it reaches 0
*/
uint8_t bytes_processed;
uint8_t bytes_stored;

// Total number of bytes that will be needed to represent this character
uint8_t total_bytes;
Expand Down
48 changes: 28 additions & 20 deletions libc/test/src/__support/wchar/utf32_to_8_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,17 +20,19 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, OneByte) {
// utf8 1-byte encodings are identical to their utf32 representations
char32_t utf32_A = 0x41; // 'A'
cr.push(utf32_A);
ASSERT_TRUE(cr.isFull());
auto popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<char>(popped.value()), 'A');
ASSERT_TRUE(cr.isComplete());
ASSERT_TRUE(cr.isEmpty());

char32_t utf32_B = 0x42; // 'B'
cr.push(utf32_B);
ASSERT_TRUE(cr.isFull());
popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<char>(popped.value()), 'B');
ASSERT_TRUE(cr.isComplete());
ASSERT_TRUE(cr.isEmpty());

// should error if we try to pop another utf8 byte out
popped = cr.pop_utf8();
Expand All @@ -45,26 +47,28 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, TwoByte) {
// testing utf32: 0xff -> utf8: 0xc3 0xbf
char32_t utf32 = 0xff;
cr.push(utf32);
ASSERT_TRUE(cr.isFull());
auto popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<int>(popped.value()), 0xc3);
ASSERT_TRUE(!cr.isComplete());
ASSERT_TRUE(!cr.isEmpty());
popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<int>(popped.value()), 0xbf);
ASSERT_TRUE(cr.isComplete());
ASSERT_TRUE(cr.isEmpty());

// testing utf32: 0x58e -> utf8: 0xd6 0x8e
utf32 = 0x58e;
cr.push(utf32);
ASSERT_TRUE(cr.isFull());
popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<int>(popped.value()), 0xd6);
ASSERT_TRUE(!cr.isComplete());
ASSERT_TRUE(!cr.isEmpty());
popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<int>(popped.value()), 0x8e);
ASSERT_TRUE(cr.isComplete());
ASSERT_TRUE(cr.isEmpty());

// should error if we try to pop another utf8 byte out
popped = cr.pop_utf8();
Expand All @@ -79,34 +83,36 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, ThreeByte) {
// testing utf32: 0xac15 -> utf8: 0xea 0xb0 0x95
char32_t utf32 = 0xac15;
cr.push(utf32);
ASSERT_TRUE(cr.isFull());
auto popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<int>(popped.value()), 0xea);
ASSERT_TRUE(!cr.isComplete());
ASSERT_TRUE(!cr.isEmpty());
popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<int>(popped.value()), 0xb0);
ASSERT_TRUE(!cr.isComplete());
ASSERT_TRUE(!cr.isEmpty());
popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<int>(popped.value()), 0x95);
ASSERT_TRUE(cr.isComplete());
ASSERT_TRUE(cr.isEmpty());

// testing utf32: 0x267b -> utf8: 0xe2 0x99 0xbb
utf32 = 0x267b;
cr.push(utf32);
ASSERT_TRUE(cr.isFull());
popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<int>(popped.value()), 0xe2);
ASSERT_TRUE(!cr.isComplete());
ASSERT_TRUE(!cr.isEmpty());
popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<int>(popped.value()), 0x99);
ASSERT_TRUE(!cr.isComplete());
ASSERT_TRUE(!cr.isEmpty());
popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<int>(popped.value()), 0xbb);
ASSERT_TRUE(cr.isComplete());
ASSERT_TRUE(cr.isEmpty());

// should error if we try to pop another utf8 byte out
popped = cr.pop_utf8();
Expand All @@ -121,42 +127,44 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, FourByte) {
// testing utf32: 0x1f921 -> utf8: 0xf0 0x9f 0xa4 0xa1
char32_t utf32 = 0x1f921;
cr.push(utf32);
ASSERT_TRUE(cr.isFull());
auto popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<int>(popped.value()), 0xf0);
ASSERT_TRUE(!cr.isComplete());
ASSERT_TRUE(!cr.isEmpty());
popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<int>(popped.value()), 0x9f);
ASSERT_TRUE(!cr.isComplete());
ASSERT_TRUE(!cr.isEmpty());
popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<int>(popped.value()), 0xa4);
ASSERT_TRUE(!cr.isComplete());
ASSERT_TRUE(!cr.isEmpty());
popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<int>(popped.value()), 0xa1);
ASSERT_TRUE(cr.isComplete());
ASSERT_TRUE(cr.isEmpty());

// testing utf32: 0x12121 -> utf8: 0xf0 0x92 0x84 0xa1
utf32 = 0x12121;
cr.push(utf32);
ASSERT_TRUE(cr.isFull());
popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<int>(popped.value()), 0xf0);
ASSERT_TRUE(!cr.isComplete());
ASSERT_TRUE(!cr.isEmpty());
popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<int>(popped.value()), 0x92);
ASSERT_TRUE(!cr.isComplete());
ASSERT_TRUE(!cr.isEmpty());
popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<int>(popped.value()), 0x84);
ASSERT_TRUE(!cr.isComplete());
ASSERT_TRUE(!cr.isEmpty());
popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<int>(popped.value()), 0xa1);
ASSERT_TRUE(cr.isComplete());
ASSERT_TRUE(cr.isEmpty());

// should error if we try to pop another utf8 byte out
popped = cr.pop_utf8();
Expand Down
20 changes: 10 additions & 10 deletions libc/test/src/__support/wchar/utf8_to_32_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) {
LIBC_NAMESPACE::internal::mbstate state;
state.bytes_processed = 0;
state.bytes_stored = 0;
state.total_bytes = 0;
char ch = 'A';

Expand All @@ -28,7 +28,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) {

TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
LIBC_NAMESPACE::internal::mbstate state;
state.bytes_processed = 0;
state.bytes_stored = 0;
state.total_bytes = 0;
const char ch[2] = {static_cast<char>(0xC2),
static_cast<char>(0x8E)}; // Ž car symbol
Expand All @@ -44,7 +44,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {

TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
LIBC_NAMESPACE::internal::mbstate state;
state.bytes_processed = 0;
state.bytes_stored = 0;
state.total_bytes = 0;
const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88),
static_cast<char>(0x91)}; // ∑ sigma symbol
Expand All @@ -61,7 +61,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {

TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
LIBC_NAMESPACE::internal::mbstate state;
state.bytes_processed = 0;
state.bytes_stored = 0;
state.total_bytes = 0;
const char ch[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F),
static_cast<char>(0xA4),
Expand All @@ -80,7 +80,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {

TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) {
LIBC_NAMESPACE::internal::mbstate state;
state.bytes_processed = 0;
state.bytes_stored = 0;
state.total_bytes = 0;
const char ch = static_cast<char>(0x80); // invalid starting bit sequence

Expand All @@ -92,7 +92,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) {

TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) {
LIBC_NAMESPACE::internal::mbstate state;
state.bytes_processed = 0;
state.bytes_stored = 0;
state.total_bytes = 0;
const char ch[4] = {
static_cast<char>(0x80), static_cast<char>(0x00), static_cast<char>(0x80),
Expand All @@ -112,7 +112,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) {

TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidLastByte) {
LIBC_NAMESPACE::internal::mbstate state;
state.bytes_processed = 0;
state.bytes_stored = 0;
state.total_bytes = 0;
// Last byte is invalid since it does not have correct starting sequence.
// 0xC0 --> 11000000 starting sequence should be 10xxxxxx
Expand All @@ -132,7 +132,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidLastByte) {

TEST(LlvmLibcCharacterConverterUTF8To32Test, ValidTwoByteWithExtraRead) {
LIBC_NAMESPACE::internal::mbstate state;
state.bytes_processed = 0;
state.bytes_stored = 0;
state.total_bytes = 0;
const char ch[3] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
static_cast<char>(0x80)};
Expand All @@ -153,7 +153,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, ValidTwoByteWithExtraRead) {

TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) {
LIBC_NAMESPACE::internal::mbstate state;
state.bytes_processed = 0;
state.bytes_stored = 0;
state.total_bytes = 0;
const char ch[4] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
static_cast<char>(0xC7), static_cast<char>(0x8C)};
Expand All @@ -179,7 +179,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) {

TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidPop) {
LIBC_NAMESPACE::internal::mbstate state;
state.bytes_processed = 0;
state.bytes_stored = 0;
state.total_bytes = 0;
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
const char ch[2] = {static_cast<char>(0xC2), static_cast<char>(0x8E)};
Expand Down
Loading