Skip to content

Commit e74bb67

Browse files
authored
GH-50216: [C++][Parquet] Add RleBitPackedToBitmapDecoder (#50217)
### Rationale for this change Add a `RleBitPackedToBitmapDecoder` capable of decoding a Parquet mixed RLE / bit-packed byte stream directly into a bitmap. This is an optimization to be used when both the target and source bit_width are 1, and it will improve the decoding of nullable columns in a follow-up PR. ### What changes are included in this PR? New classes that reuse the existing runs and parser. Their API is slightly different from the existing one as it fits a particular case. - `RleRunToBitmapDecoder` - `BitPackedRunToBitmapDecoder` - `RleBitPackedToBitmapDecoder` ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * GitHub Issue: #50216 Authored-by: AntoinePrv <AntoinePrv@users.noreply.github.com> Signed-off-by: Antoine Pitrou <antoine@python.org>
1 parent b43a417 commit e74bb67

6 files changed

Lines changed: 1098 additions & 32 deletions

File tree

cpp/src/arrow/util/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ add_arrow_test(bit-utility-test
9999
bit_util_test.cc
100100
bitmap_test.cc
101101
bpacking_test.cc
102+
rle_bitmap_test.cc
102103
rle_encoding_test.cc
103104
test_common.cc)
104105

cpp/src/arrow/util/bit_util.h

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,12 @@
1818
#pragma once
1919

2020
#include <bit>
21+
#include <cassert>
2122
#include <cstdint>
23+
#include <cstring>
2224
#include <type_traits>
2325

26+
#include "arrow/util/endian.h"
2427
#include "arrow/util/macros.h"
2528
#include "arrow/util/visibility.h"
2629

@@ -176,6 +179,39 @@ static constexpr bool GetBitFromByte(uint8_t byte, uint8_t i) {
176179
return byte & kBitmask[i];
177180
}
178181

182+
/// Arguments of ``CopyBitsInInteger``.
///
/// Every member is value-initialized by default, so callers only need to set
/// the fields they care about (e.g. with designated initializers).
template <typename Uint>
struct CopyBitsParams {
  /// Word providing the bits inside the copied range.
  Uint src{};
  /// Word providing the bits outside of the copied range.
  Uint dst{};
  /// Index of the first bit of the range (inclusive).
  int start{};
  /// Index one past the last bit of the range (exclusive).
  int end{};
};
189+
190+
/// Copy a contiguous span of bits from src into dst.
191+
///
192+
/// Copy bits [start, end[ from src into the position [start, end[ in dst
193+
/// and return the result (inputs are unmodified).
194+
/// Setting ``kAllowFullCopy`` to false is an optimization when the caller can
195+
/// guarantee that the range of bits to copy does not cover the whole range.
196+
template <typename Uint, bool kAllowFullCopy = true>
197+
[[nodiscard]] constexpr Uint CopyBitsInInteger(const CopyBitsParams<Uint>& params) {
198+
constexpr auto kUintSizeBits = static_cast<int>(sizeof(Uint) * 8);
199+
assert(params.start <= params.end);
200+
assert(params.start < kUintSizeBits);
201+
assert(params.end <= kUintSizeBits);
202+
203+
const int length = params.end - params.start;
204+
if constexpr (kAllowFullCopy) {
205+
if (length == kUintSizeBits) {
206+
return params.src;
207+
}
208+
}
209+
assert(length < kUintSizeBits);
210+
const Uint mask =
211+
static_cast<Uint>(LeastSignificantBitMask<Uint, false>(length) << params.start);
212+
return (~mask & params.dst) | (mask & params.src);
213+
}
214+
179215
static inline void ClearBit(uint8_t* bits, int64_t i) {
180216
bits[i / 8] &= kFlippedBitmask[i % 8];
181217
}

cpp/src/arrow/util/bit_util_test.cc

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1065,4 +1065,47 @@ TEST(SpliceWord, SpliceWord) {
10651065
0xfedc456789abcdef);
10661066
}
10671067

1068+
// Exercises CopyBitsInInteger on empty, partial and full-word ranges, for
// several word sizes, with and without the full-copy branch enabled.
TEST(BitUtil, CopyBits) {
  // Copy bits [start, end[ from src into dst, keeping dst's other bits.
  using bit_util::CopyBitsInInteger;

  // Empty range: result equals dst.
  ASSERT_EQ(CopyBitsInInteger<uint8_t>(
                {.src = 0b11111111, .dst = 0b00010010, .start = 3, .end = 3}),
            0b00010010);
  // dst = 0101 0101, src = 1010 1010 -> 0101 1010
  ASSERT_EQ(CopyBitsInInteger<uint8_t>(
                {.src = 0b10101010, .dst = 0b01010101, .start = 0, .end = 4}),
            0b01011010);
  // Range [4, 8[ ending on the most significant bit:
  // dst = 0101 0101, src = 1010 1010 -> 1010 0101
  ASSERT_EQ(CopyBitsInInteger<uint8_t>(
                {.src = 0b10101010, .dst = 0b01010101, .start = 4, .end = 8}),
            0b10100101);
  // Copy a middle span [2, 5[ of all-ones into an all-zeros dst.
  ASSERT_EQ(CopyBitsInInteger<uint8_t>(
                {.src = 0b11111111, .dst = 0b00000000, .start = 2, .end = 5}),
            0b00011100);
  // Copy a middle span [2, 5[ of all-zeros into an all-ones dst.
  ASSERT_EQ(CopyBitsInInteger<uint8_t>(
                {.src = 0b00000000, .dst = 0b11111111, .start = 2, .end = 5}),
            0b11100011);
  // Full-word copy returns src unchanged.
  ASSERT_EQ(CopyBitsInInteger<uint8_t>(
                {.src = 0b10101011, .dst = 0b00010010, .start = 0, .end = 8}),
            0b10101011);
  // uint16_t partial range [4, 12[: dst keeps its bits outside, src fills inside.
  ASSERT_EQ(
      CopyBitsInInteger<uint16_t>(
          {.src = 0b1010101010101010, .dst = 0b0101010101010101, .start = 4, .end = 12}),
      0b0101101010100101);
  // uint64_t
  ASSERT_EQ(CopyBitsInInteger<uint64_t>({
                .src = 0x0123456789abcdef,
                .dst = 0xfedcba9876543210,
                .start = 0,
                .end = 64,
            }),
            0x0123456789abcdef);
  // uint64_t partial range [28, 36[ straddling the 32-bit boundary: the mask
  // must be shifted as a 64-bit value.
  ASSERT_EQ(CopyBitsInInteger<uint64_t>({
                .src = 0xffffffffffffffff,
                .dst = 0,
                .start = 28,
                .end = 36,
            }),
            0x0000000ff0000000);
  // With the full-copy branch disabled, partial ranges give the same results.
  // (Extra parentheses keep the template comma out of the macro arguments.)
  ASSERT_EQ((CopyBitsInInteger<uint8_t, false>(
                {.src = 0b11111111, .dst = 0b00000000, .start = 2, .end = 5})),
            0b00011100);
  ASSERT_EQ((CopyBitsInInteger<uint8_t, false>(
                {.src = 0b10101010, .dst = 0b01010101, .start = 4, .end = 8})),
            0b10100101);
  // constexpr-evaluable.
  static_assert(CopyBitsInInteger<uint8_t>(
                    {.src = 0b10101010, .dst = 0b01010101, .start = 0, .end = 4}) ==
                0b01011010);
}
1110+
10681111
} // namespace arrow

0 commit comments

Comments
 (0)