-
Notifications
You must be signed in to change notification settings - Fork 14.2k
[libc] Reworked CharacterConverter isComplete into isFull and isEmpty #144799
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
…y for both conversions
@llvm/pr-subscribers-libc Author: Uzair Nawaz (uzairnawaz) ChangesisComplete previously meant different things for different conversion directions. Full diff: https://github.com/llvm/llvm-project/pull/144799.diff 5 Files Affected:
diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 5ab0447bb08b2..708af357d15d9 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -30,18 +30,21 @@ CharacterConverter::CharacterConverter(mbstate *mbstate) { state = mbstate; }
void CharacterConverter::clear() {
state->partial = 0;
- state->bytes_processed = 0;
+ state->bytes_stored = 0;
state->total_bytes = 0;
}
-bool CharacterConverter::isComplete() {
- return state->bytes_processed == state->total_bytes;
+bool CharacterConverter::isFull() {
+ return state->bytes_stored == state->total_bytes &&
+ state->total_bytes != 0;
}
+bool CharacterConverter::isEmpty() { return state->bytes_stored == 0; }
+
int CharacterConverter::push(char8_t utf8_byte) {
uint8_t num_ones = static_cast<uint8_t>(cpp::countl_one(utf8_byte));
// Checking the first byte if first push
- if (state->bytes_processed == 0) {
+ if (isEmpty()) {
// UTF-8 char has 1 byte total
if (num_ones == 0) {
state->total_bytes = 1;
@@ -58,21 +61,21 @@ int CharacterConverter::push(char8_t utf8_byte) {
}
// Invalid first byte
else {
- // bytes_processed and total_bytes will always be 0 here
+ // bytes_stored and total_bytes will always be 0 here
state->partial = static_cast<char32_t>(0);
return -1;
}
state->partial = static_cast<char32_t>(utf8_byte);
- state->bytes_processed++;
+ state->bytes_stored++;
return 0;
}
// Any subsequent push
// Adding 6 more bits so need to left shift
- if (num_ones == 1 && !isComplete()) {
+ if (num_ones == 1 && !isFull()) {
char32_t byte = utf8_byte & MASK_ENCODED_BITS;
state->partial = state->partial << ENCODED_BITS_PER_UTF8;
state->partial |= byte;
- state->bytes_processed++;
+ state->bytes_stored++;
return 0;
}
// Invalid byte -> reset the state
@@ -82,11 +85,10 @@ int CharacterConverter::push(char8_t utf8_byte) {
int CharacterConverter::push(char32_t utf32) {
// we can't be partially through a conversion when pushing a utf32 value
- if (!isComplete())
+ if (!isEmpty())
return -1;
state->partial = utf32;
- state->bytes_processed = 0;
// determine number of utf-8 bytes needed to represent this utf32 value
constexpr char32_t MAX_VALUE_PER_UTF8_LEN[] = {0x7f, 0x7ff, 0xffff, 0x10ffff};
@@ -94,6 +96,7 @@ int CharacterConverter::push(char32_t utf32) {
for (uint8_t i = 0; i < NUM_RANGES; i++) {
if (state->partial <= MAX_VALUE_PER_UTF8_LEN[i]) {
state->total_bytes = i + 1;
+ state->bytes_stored = i + 1;
return 0;
}
}
@@ -107,7 +110,7 @@ int CharacterConverter::push(char32_t utf32) {
ErrorOr<char32_t> CharacterConverter::pop_utf32() {
// If pop is called too early, do not reset the state, use error to determine
// whether enough bytes have been pushed
- if (!isComplete() || state->bytes_processed == 0)
+ if (!isFull())
return Error(-1);
char32_t utf32 = state->partial;
// reset if successful pop
@@ -116,7 +119,7 @@ ErrorOr<char32_t> CharacterConverter::pop_utf32() {
}
ErrorOr<char8_t> CharacterConverter::pop_utf8() {
- if (isComplete())
+ if (isEmpty())
return Error(-1);
constexpr char8_t FIRST_BYTE_HEADERS[] = {0, 0xC0, 0xE0, 0xF0};
@@ -126,8 +129,8 @@ ErrorOr<char8_t> CharacterConverter::pop_utf8() {
// Shift to get the next 6 bits from the utf32 encoding
const size_t shift_amount =
- (state->total_bytes - state->bytes_processed - 1) * ENCODED_BITS_PER_UTF8;
- if (state->bytes_processed == 0) {
+ (state->bytes_stored - 1) * ENCODED_BITS_PER_UTF8;
+ if (isFull()) {
/*
Choose the correct set of most significant bits to encode the length
of the utf8 sequence. The remaining bits contain the most significant
@@ -141,7 +144,7 @@ ErrorOr<char8_t> CharacterConverter::pop_utf8() {
((state->partial >> shift_amount) & MASK_ENCODED_BITS);
}
- state->bytes_processed++;
+ state->bytes_stored--;
return static_cast<char8_t>(output);
}
diff --git a/libc/src/__support/wchar/character_converter.h b/libc/src/__support/wchar/character_converter.h
index c4ba7cf6b689f..be0e6129df236 100644
--- a/libc/src/__support/wchar/character_converter.h
+++ b/libc/src/__support/wchar/character_converter.h
@@ -26,7 +26,8 @@ class CharacterConverter {
CharacterConverter(mbstate *mbstate);
void clear();
- bool isComplete();
+ bool isFull();
+ bool isEmpty();
int push(char8_t utf8_byte);
int push(char32_t utf32);
diff --git a/libc/src/__support/wchar/mbstate.h b/libc/src/__support/wchar/mbstate.h
index fb08fb4eaa188..fea693f73c3b5 100644
--- a/libc/src/__support/wchar/mbstate.h
+++ b/libc/src/__support/wchar/mbstate.h
@@ -22,10 +22,10 @@ struct mbstate {
/*
Progress towards a conversion
- For utf8 -> utf32, increases with each CharacterConverter::push(utf8_byte)
- For utf32 -> utf8, increases with each CharacterConverter::pop_utf8()
+ Increases with each push(...) until it reaches total_bytes
+ Decreases with each pop(...) until it reaches 0
*/
- uint8_t bytes_processed;
+ uint8_t bytes_stored;
// Total number of bytes that will be needed to represent this character
uint8_t total_bytes;
diff --git a/libc/test/src/__support/wchar/utf32_to_8_test.cpp b/libc/test/src/__support/wchar/utf32_to_8_test.cpp
index f4c5cb863ff38..a6a7bc4aa6f4c 100644
--- a/libc/test/src/__support/wchar/utf32_to_8_test.cpp
+++ b/libc/test/src/__support/wchar/utf32_to_8_test.cpp
@@ -20,17 +20,19 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, OneByte) {
// utf8 1-byte encodings are identical to their utf32 representations
char32_t utf32_A = 0x41; // 'A'
cr.push(utf32_A);
+ ASSERT_TRUE(cr.isFull());
auto popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<char>(popped.value()), 'A');
- ASSERT_TRUE(cr.isComplete());
+ ASSERT_TRUE(cr.isEmpty());
char32_t utf32_B = 0x42; // 'B'
cr.push(utf32_B);
+ ASSERT_TRUE(cr.isFull());
popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<char>(popped.value()), 'B');
- ASSERT_TRUE(cr.isComplete());
+ ASSERT_TRUE(cr.isEmpty());
// should error if we try to pop another utf8 byte out
popped = cr.pop_utf8();
@@ -45,26 +47,28 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, TwoByte) {
// testing utf32: 0xff -> utf8: 0xc3 0xbf
char32_t utf32 = 0xff;
cr.push(utf32);
+ ASSERT_TRUE(cr.isFull());
auto popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<int>(popped.value()), 0xc3);
- ASSERT_TRUE(!cr.isComplete());
+ ASSERT_TRUE(!cr.isEmpty());
popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<int>(popped.value()), 0xbf);
- ASSERT_TRUE(cr.isComplete());
+ ASSERT_TRUE(cr.isEmpty());
// testing utf32: 0x58e -> utf8: 0xd6 0x8e
utf32 = 0x58e;
cr.push(utf32);
+ ASSERT_TRUE(cr.isFull());
popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<int>(popped.value()), 0xd6);
- ASSERT_TRUE(!cr.isComplete());
+ ASSERT_TRUE(!cr.isEmpty());
popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<int>(popped.value()), 0x8e);
- ASSERT_TRUE(cr.isComplete());
+ ASSERT_TRUE(cr.isEmpty());
// should error if we try to pop another utf8 byte out
popped = cr.pop_utf8();
@@ -79,34 +83,36 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, ThreeByte) {
// testing utf32: 0xac15 -> utf8: 0xea 0xb0 0x95
char32_t utf32 = 0xac15;
cr.push(utf32);
+ ASSERT_TRUE(cr.isFull());
auto popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<int>(popped.value()), 0xea);
- ASSERT_TRUE(!cr.isComplete());
+ ASSERT_TRUE(!cr.isEmpty());
popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<int>(popped.value()), 0xb0);
- ASSERT_TRUE(!cr.isComplete());
+ ASSERT_TRUE(!cr.isEmpty());
popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<int>(popped.value()), 0x95);
- ASSERT_TRUE(cr.isComplete());
+ ASSERT_TRUE(cr.isEmpty());
// testing utf32: 0x267b -> utf8: 0xe2 0x99 0xbb
utf32 = 0x267b;
cr.push(utf32);
+ ASSERT_TRUE(cr.isFull());
popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<int>(popped.value()), 0xe2);
- ASSERT_TRUE(!cr.isComplete());
+ ASSERT_TRUE(!cr.isEmpty());
popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<int>(popped.value()), 0x99);
- ASSERT_TRUE(!cr.isComplete());
+ ASSERT_TRUE(!cr.isEmpty());
popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<int>(popped.value()), 0xbb);
- ASSERT_TRUE(cr.isComplete());
+ ASSERT_TRUE(cr.isEmpty());
// should error if we try to pop another utf8 byte out
popped = cr.pop_utf8();
@@ -121,42 +127,44 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, FourByte) {
// testing utf32: 0x1f921 -> utf8: 0xf0 0x9f 0xa4 0xa1
char32_t utf32 = 0x1f921;
cr.push(utf32);
+ ASSERT_TRUE(cr.isFull());
auto popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<int>(popped.value()), 0xf0);
- ASSERT_TRUE(!cr.isComplete());
+ ASSERT_TRUE(!cr.isEmpty());
popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<int>(popped.value()), 0x9f);
- ASSERT_TRUE(!cr.isComplete());
+ ASSERT_TRUE(!cr.isEmpty());
popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<int>(popped.value()), 0xa4);
- ASSERT_TRUE(!cr.isComplete());
+ ASSERT_TRUE(!cr.isEmpty());
popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<int>(popped.value()), 0xa1);
- ASSERT_TRUE(cr.isComplete());
+ ASSERT_TRUE(cr.isEmpty());
// testing utf32: 0x12121 -> utf8: 0xf0 0x92 0x84 0xa1
utf32 = 0x12121;
cr.push(utf32);
+ ASSERT_TRUE(cr.isFull());
popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<int>(popped.value()), 0xf0);
- ASSERT_TRUE(!cr.isComplete());
+ ASSERT_TRUE(!cr.isEmpty());
popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<int>(popped.value()), 0x92);
- ASSERT_TRUE(!cr.isComplete());
+ ASSERT_TRUE(!cr.isEmpty());
popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<int>(popped.value()), 0x84);
- ASSERT_TRUE(!cr.isComplete());
+ ASSERT_TRUE(!cr.isEmpty());
popped = cr.pop_utf8();
ASSERT_TRUE(popped.has_value());
ASSERT_EQ(static_cast<int>(popped.value()), 0xa1);
- ASSERT_TRUE(cr.isComplete());
+ ASSERT_TRUE(cr.isEmpty());
// should error if we try to pop another utf8 byte out
popped = cr.pop_utf8();
diff --git a/libc/test/src/__support/wchar/utf8_to_32_test.cpp b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
index 9cb059faa9374..36ae7d689cc00 100644
--- a/libc/test/src/__support/wchar/utf8_to_32_test.cpp
+++ b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
@@ -13,7 +13,7 @@
TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) {
LIBC_NAMESPACE::internal::mbstate state;
- state.bytes_processed = 0;
+ state.bytes_stored = 0;
state.total_bytes = 0;
char ch = 'A';
@@ -28,7 +28,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) {
TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
LIBC_NAMESPACE::internal::mbstate state;
- state.bytes_processed = 0;
+ state.bytes_stored = 0;
state.total_bytes = 0;
const char ch[2] = {static_cast<char>(0xC2),
static_cast<char>(0x8E)}; // � car symbol
@@ -44,7 +44,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
LIBC_NAMESPACE::internal::mbstate state;
- state.bytes_processed = 0;
+ state.bytes_stored = 0;
state.total_bytes = 0;
const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88),
static_cast<char>(0x91)}; // ∑ sigma symbol
@@ -61,7 +61,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
LIBC_NAMESPACE::internal::mbstate state;
- state.bytes_processed = 0;
+ state.bytes_stored = 0;
state.total_bytes = 0;
const char ch[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F),
static_cast<char>(0xA4),
@@ -80,7 +80,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) {
LIBC_NAMESPACE::internal::mbstate state;
- state.bytes_processed = 0;
+ state.bytes_stored = 0;
state.total_bytes = 0;
const char ch = static_cast<char>(0x80); // invalid starting bit sequence
@@ -92,7 +92,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) {
TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) {
LIBC_NAMESPACE::internal::mbstate state;
- state.bytes_processed = 0;
+ state.bytes_stored = 0;
state.total_bytes = 0;
const char ch[4] = {
static_cast<char>(0x80), static_cast<char>(0x00), static_cast<char>(0x80),
@@ -112,7 +112,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) {
TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidLastByte) {
LIBC_NAMESPACE::internal::mbstate state;
- state.bytes_processed = 0;
+ state.bytes_stored = 0;
state.total_bytes = 0;
// Last byte is invalid since it does not have correct starting sequence.
// 0xC0 --> 11000000 starting sequence should be 10xxxxxx
@@ -132,7 +132,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidLastByte) {
TEST(LlvmLibcCharacterConverterUTF8To32Test, ValidTwoByteWithExtraRead) {
LIBC_NAMESPACE::internal::mbstate state;
- state.bytes_processed = 0;
+ state.bytes_stored = 0;
state.total_bytes = 0;
const char ch[3] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
static_cast<char>(0x80)};
@@ -153,7 +153,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, ValidTwoByteWithExtraRead) {
TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) {
LIBC_NAMESPACE::internal::mbstate state;
- state.bytes_processed = 0;
+ state.bytes_stored = 0;
state.total_bytes = 0;
const char ch[4] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
static_cast<char>(0xC7), static_cast<char>(0x8C)};
@@ -179,7 +179,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) {
TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidPop) {
LIBC_NAMESPACE::internal::mbstate state;
- state.bytes_processed = 0;
+ state.bytes_stored = 0;
state.total_bytes = 0;
LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
const char ch[2] = {static_cast<char>(0xC2), static_cast<char>(0x8E)};
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
isComplete previously meant different things for different conversion directions.
Refactored bytes_processed to bytes_stored which now consistently increments on every push and decrements on pop making both directions more consistent with each other