Skip to content

[libc] Reworked CharacterConverter isComplete into isFull and isEmpty #144799

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from

Conversation

uzairnawaz
Copy link
Contributor

isComplete previously meant different things for different conversion directions.
Refactored bytes_processed to bytes_stored which now consistently increments on every push and decrements on pop making both directions more consistent with each other

@uzairnawaz uzairnawaz requested a review from sribee8 June 18, 2025 21:40
@llvmbot llvmbot added the libc label Jun 18, 2025
@llvmbot
Copy link
Member

llvmbot commented Jun 18, 2025

@llvm/pr-subscribers-libc

Author: Uzair Nawaz (uzairnawaz)

Changes

isComplete previously meant different things for different conversion directions.
Refactored bytes_processed to bytes_stored which now consistently increments on every push and decrements on pop making both directions more consistent with each other


Full diff: https://github.com/llvm/llvm-project/pull/144799.diff

5 Files Affected:

  • (modified) libc/src/__support/wchar/character_converter.cpp (+18-15)
  • (modified) libc/src/__support/wchar/character_converter.h (+2-1)
  • (modified) libc/src/__support/wchar/mbstate.h (+3-3)
  • (modified) libc/test/src/__support/wchar/utf32_to_8_test.cpp (+28-20)
  • (modified) libc/test/src/__support/wchar/utf8_to_32_test.cpp (+10-10)
diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 5ab0447bb08b2..708af357d15d9 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -30,18 +30,21 @@ CharacterConverter::CharacterConverter(mbstate *mbstate) { state = mbstate; }
 
 void CharacterConverter::clear() {
   state->partial = 0;
-  state->bytes_processed = 0;
+  state->bytes_stored = 0;
   state->total_bytes = 0;
 }
 
-bool CharacterConverter::isComplete() {
-  return state->bytes_processed == state->total_bytes;
+bool CharacterConverter::isFull() {
+  return state->bytes_stored == state->total_bytes &&
+         state->total_bytes != 0;
 }
 
+bool CharacterConverter::isEmpty() { return state->bytes_stored == 0; }
+
 int CharacterConverter::push(char8_t utf8_byte) {
   uint8_t num_ones = static_cast<uint8_t>(cpp::countl_one(utf8_byte));
   // Checking the first byte if first push
-  if (state->bytes_processed == 0) {
+  if (isEmpty()) {
     // UTF-8 char has 1 byte total
     if (num_ones == 0) {
       state->total_bytes = 1;
@@ -58,21 +61,21 @@ int CharacterConverter::push(char8_t utf8_byte) {
     }
     // Invalid first byte
     else {
-      // bytes_processed and total_bytes will always be 0 here
+      // bytes_stored and total_bytes will always be 0 here
       state->partial = static_cast<char32_t>(0);
       return -1;
     }
     state->partial = static_cast<char32_t>(utf8_byte);
-    state->bytes_processed++;
+    state->bytes_stored++;
     return 0;
   }
   // Any subsequent push
   // Adding 6 more bits so need to left shift
-  if (num_ones == 1 && !isComplete()) {
+  if (num_ones == 1 && !isFull()) {
     char32_t byte = utf8_byte & MASK_ENCODED_BITS;
     state->partial = state->partial << ENCODED_BITS_PER_UTF8;
     state->partial |= byte;
-    state->bytes_processed++;
+    state->bytes_stored++;
     return 0;
   }
   // Invalid byte -> reset the state
@@ -82,11 +85,10 @@ int CharacterConverter::push(char8_t utf8_byte) {
 
 int CharacterConverter::push(char32_t utf32) {
   // we can't be partially through a conversion when pushing a utf32 value
-  if (!isComplete())
+  if (!isEmpty())
     return -1;
 
   state->partial = utf32;
-  state->bytes_processed = 0;
 
   // determine number of utf-8 bytes needed to represent this utf32 value
   constexpr char32_t MAX_VALUE_PER_UTF8_LEN[] = {0x7f, 0x7ff, 0xffff, 0x10ffff};
@@ -94,6 +96,7 @@ int CharacterConverter::push(char32_t utf32) {
   for (uint8_t i = 0; i < NUM_RANGES; i++) {
     if (state->partial <= MAX_VALUE_PER_UTF8_LEN[i]) {
       state->total_bytes = i + 1;
+      state->bytes_stored = i + 1;
       return 0;
     }
   }
@@ -107,7 +110,7 @@ int CharacterConverter::push(char32_t utf32) {
 ErrorOr<char32_t> CharacterConverter::pop_utf32() {
   // If pop is called too early, do not reset the state, use error to determine
   // whether enough bytes have been pushed
-  if (!isComplete() || state->bytes_processed == 0)
+  if (!isFull())
     return Error(-1);
   char32_t utf32 = state->partial;
   // reset if successful pop
@@ -116,7 +119,7 @@ ErrorOr<char32_t> CharacterConverter::pop_utf32() {
 }
 
 ErrorOr<char8_t> CharacterConverter::pop_utf8() {
-  if (isComplete())
+  if (isEmpty())
     return Error(-1);
 
   constexpr char8_t FIRST_BYTE_HEADERS[] = {0, 0xC0, 0xE0, 0xF0};
@@ -126,8 +129,8 @@ ErrorOr<char8_t> CharacterConverter::pop_utf8() {
 
   // Shift to get the next 6 bits from the utf32 encoding
   const size_t shift_amount =
-      (state->total_bytes - state->bytes_processed - 1) * ENCODED_BITS_PER_UTF8;
-  if (state->bytes_processed == 0) {
+      (state->bytes_stored - 1) * ENCODED_BITS_PER_UTF8;
+  if (isFull()) {
     /*
       Choose the correct set of most significant bits to encode the length
       of the utf8 sequence. The remaining bits contain the most significant
@@ -141,7 +144,7 @@ ErrorOr<char8_t> CharacterConverter::pop_utf8() {
              ((state->partial >> shift_amount) & MASK_ENCODED_BITS);
   }
 
-  state->bytes_processed++;
+  state->bytes_stored--;
   return static_cast<char8_t>(output);
 }
 
diff --git a/libc/src/__support/wchar/character_converter.h b/libc/src/__support/wchar/character_converter.h
index c4ba7cf6b689f..be0e6129df236 100644
--- a/libc/src/__support/wchar/character_converter.h
+++ b/libc/src/__support/wchar/character_converter.h
@@ -26,7 +26,8 @@ class CharacterConverter {
   CharacterConverter(mbstate *mbstate);
 
   void clear();
-  bool isComplete();
+  bool isFull();
+  bool isEmpty();
 
   int push(char8_t utf8_byte);
   int push(char32_t utf32);
diff --git a/libc/src/__support/wchar/mbstate.h b/libc/src/__support/wchar/mbstate.h
index fb08fb4eaa188..fea693f73c3b5 100644
--- a/libc/src/__support/wchar/mbstate.h
+++ b/libc/src/__support/wchar/mbstate.h
@@ -22,10 +22,10 @@ struct mbstate {
 
   /*
   Progress towards a conversion
-    For utf8  -> utf32, increases with each CharacterConverter::push(utf8_byte)
-    For utf32 ->  utf8, increases with each CharacterConverter::pop_utf8()
+    Increases with each push(...) until it reaches total_bytes
+    Decreases with each pop(...) until it reaches 0
   */
-  uint8_t bytes_processed;
+  uint8_t bytes_stored;
 
   // Total number of bytes that will be needed to represent this character
   uint8_t total_bytes;
diff --git a/libc/test/src/__support/wchar/utf32_to_8_test.cpp b/libc/test/src/__support/wchar/utf32_to_8_test.cpp
index f4c5cb863ff38..a6a7bc4aa6f4c 100644
--- a/libc/test/src/__support/wchar/utf32_to_8_test.cpp
+++ b/libc/test/src/__support/wchar/utf32_to_8_test.cpp
@@ -20,17 +20,19 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, OneByte) {
   // utf8 1-byte encodings are identical to their utf32 representations
   char32_t utf32_A = 0x41; // 'A'
   cr.push(utf32_A);
+  ASSERT_TRUE(cr.isFull());
   auto popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<char>(popped.value()), 'A');
-  ASSERT_TRUE(cr.isComplete());
+  ASSERT_TRUE(cr.isEmpty());
 
   char32_t utf32_B = 0x42; // 'B'
   cr.push(utf32_B);
+  ASSERT_TRUE(cr.isFull());
   popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<char>(popped.value()), 'B');
-  ASSERT_TRUE(cr.isComplete());
+  ASSERT_TRUE(cr.isEmpty());
 
   // should error if we try to pop another utf8 byte out
   popped = cr.pop_utf8();
@@ -45,26 +47,28 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, TwoByte) {
   // testing utf32: 0xff -> utf8: 0xc3 0xbf
   char32_t utf32 = 0xff;
   cr.push(utf32);
+  ASSERT_TRUE(cr.isFull());
   auto popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<int>(popped.value()), 0xc3);
-  ASSERT_TRUE(!cr.isComplete());
+  ASSERT_TRUE(!cr.isEmpty());
   popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<int>(popped.value()), 0xbf);
-  ASSERT_TRUE(cr.isComplete());
+  ASSERT_TRUE(cr.isEmpty());
 
   // testing utf32: 0x58e -> utf8: 0xd6 0x8e
   utf32 = 0x58e;
   cr.push(utf32);
+  ASSERT_TRUE(cr.isFull());
   popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<int>(popped.value()), 0xd6);
-  ASSERT_TRUE(!cr.isComplete());
+  ASSERT_TRUE(!cr.isEmpty());
   popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<int>(popped.value()), 0x8e);
-  ASSERT_TRUE(cr.isComplete());
+  ASSERT_TRUE(cr.isEmpty());
 
   // should error if we try to pop another utf8 byte out
   popped = cr.pop_utf8();
@@ -79,34 +83,36 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, ThreeByte) {
   // testing utf32: 0xac15 -> utf8: 0xea 0xb0 0x95
   char32_t utf32 = 0xac15;
   cr.push(utf32);
+  ASSERT_TRUE(cr.isFull());
   auto popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<int>(popped.value()), 0xea);
-  ASSERT_TRUE(!cr.isComplete());
+  ASSERT_TRUE(!cr.isEmpty());
   popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<int>(popped.value()), 0xb0);
-  ASSERT_TRUE(!cr.isComplete());
+  ASSERT_TRUE(!cr.isEmpty());
   popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<int>(popped.value()), 0x95);
-  ASSERT_TRUE(cr.isComplete());
+  ASSERT_TRUE(cr.isEmpty());
 
   // testing utf32: 0x267b -> utf8: 0xe2 0x99 0xbb
   utf32 = 0x267b;
   cr.push(utf32);
+  ASSERT_TRUE(cr.isFull());
   popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<int>(popped.value()), 0xe2);
-  ASSERT_TRUE(!cr.isComplete());
+  ASSERT_TRUE(!cr.isEmpty());
   popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<int>(popped.value()), 0x99);
-  ASSERT_TRUE(!cr.isComplete());
+  ASSERT_TRUE(!cr.isEmpty());
   popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<int>(popped.value()), 0xbb);
-  ASSERT_TRUE(cr.isComplete());
+  ASSERT_TRUE(cr.isEmpty());
 
   // should error if we try to pop another utf8 byte out
   popped = cr.pop_utf8();
@@ -121,42 +127,44 @@ TEST(LlvmLibcCharacterConverterUTF32To8Test, FourByte) {
   // testing utf32: 0x1f921 -> utf8: 0xf0 0x9f 0xa4 0xa1
   char32_t utf32 = 0x1f921;
   cr.push(utf32);
+  ASSERT_TRUE(cr.isFull());
   auto popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<int>(popped.value()), 0xf0);
-  ASSERT_TRUE(!cr.isComplete());
+  ASSERT_TRUE(!cr.isEmpty());
   popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<int>(popped.value()), 0x9f);
-  ASSERT_TRUE(!cr.isComplete());
+  ASSERT_TRUE(!cr.isEmpty());
   popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<int>(popped.value()), 0xa4);
-  ASSERT_TRUE(!cr.isComplete());
+  ASSERT_TRUE(!cr.isEmpty());
   popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<int>(popped.value()), 0xa1);
-  ASSERT_TRUE(cr.isComplete());
+  ASSERT_TRUE(cr.isEmpty());
 
   // testing utf32: 0x12121 -> utf8: 0xf0 0x92 0x84 0xa1
   utf32 = 0x12121;
   cr.push(utf32);
+  ASSERT_TRUE(cr.isFull());
   popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<int>(popped.value()), 0xf0);
-  ASSERT_TRUE(!cr.isComplete());
+  ASSERT_TRUE(!cr.isEmpty());
   popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<int>(popped.value()), 0x92);
-  ASSERT_TRUE(!cr.isComplete());
+  ASSERT_TRUE(!cr.isEmpty());
   popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<int>(popped.value()), 0x84);
-  ASSERT_TRUE(!cr.isComplete());
+  ASSERT_TRUE(!cr.isEmpty());
   popped = cr.pop_utf8();
   ASSERT_TRUE(popped.has_value());
   ASSERT_EQ(static_cast<int>(popped.value()), 0xa1);
-  ASSERT_TRUE(cr.isComplete());
+  ASSERT_TRUE(cr.isEmpty());
 
   // should error if we try to pop another utf8 byte out
   popped = cr.pop_utf8();
diff --git a/libc/test/src/__support/wchar/utf8_to_32_test.cpp b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
index 9cb059faa9374..36ae7d689cc00 100644
--- a/libc/test/src/__support/wchar/utf8_to_32_test.cpp
+++ b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
@@ -13,7 +13,7 @@
 
 TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) {
   LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
+  state.bytes_stored = 0;
   state.total_bytes = 0;
   char ch = 'A';
 
@@ -28,7 +28,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) {
 
 TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
   LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
+  state.bytes_stored = 0;
   state.total_bytes = 0;
   const char ch[2] = {static_cast<char>(0xC2),
                       static_cast<char>(0x8E)}; // � car symbol
@@ -44,7 +44,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
 
 TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
   LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
+  state.bytes_stored = 0;
   state.total_bytes = 0;
   const char ch[3] = {static_cast<char>(0xE2), static_cast<char>(0x88),
                       static_cast<char>(0x91)}; // ∑ sigma symbol
@@ -61,7 +61,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
 
 TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
   LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
+  state.bytes_stored = 0;
   state.total_bytes = 0;
   const char ch[4] = {static_cast<char>(0xF0), static_cast<char>(0x9F),
                       static_cast<char>(0xA4),
@@ -80,7 +80,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
 
 TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) {
   LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
+  state.bytes_stored = 0;
   state.total_bytes = 0;
   const char ch = static_cast<char>(0x80); // invalid starting bit sequence
 
@@ -92,7 +92,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) {
 
 TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) {
   LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
+  state.bytes_stored = 0;
   state.total_bytes = 0;
   const char ch[4] = {
       static_cast<char>(0x80), static_cast<char>(0x00), static_cast<char>(0x80),
@@ -112,7 +112,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) {
 
 TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidLastByte) {
   LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
+  state.bytes_stored = 0;
   state.total_bytes = 0;
   // Last byte is invalid since it does not have correct starting sequence.
   // 0xC0 --> 11000000 starting sequence should be 10xxxxxx
@@ -132,7 +132,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidLastByte) {
 
 TEST(LlvmLibcCharacterConverterUTF8To32Test, ValidTwoByteWithExtraRead) {
   LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
+  state.bytes_stored = 0;
   state.total_bytes = 0;
   const char ch[3] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
                       static_cast<char>(0x80)};
@@ -153,7 +153,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, ValidTwoByteWithExtraRead) {
 
 TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) {
   LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
+  state.bytes_stored = 0;
   state.total_bytes = 0;
   const char ch[4] = {static_cast<char>(0xC2), static_cast<char>(0x8E),
                       static_cast<char>(0xC7), static_cast<char>(0x8C)};
@@ -179,7 +179,7 @@ TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoValidTwoBytes) {
 
 TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidPop) {
   LIBC_NAMESPACE::internal::mbstate state;
-  state.bytes_processed = 0;
+  state.bytes_stored = 0;
   state.total_bytes = 0;
   LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
   const char ch[2] = {static_cast<char>(0xC2), static_cast<char>(0x8E)};

Copy link

github-actions bot commented Jun 18, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

Copy link
Contributor

@sribee8 sribee8 left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants