Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 33 additions & 2 deletions cpp/src/parquet/decoder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1038,7 +1038,13 @@ void DictDecoderImpl<Type>::SetDict(TypedDecoder<Type>* dictionary) {

// Loads a boolean dictionary page into dictionary_.
//
// Reads dictionary->values_left() entries from `dictionary` and stores them
// as one `bool` per entry, so a decoded index can be used to subscript the
// buffer directly.
//
// Throws ParquetException when the dictionary buffer cannot be resized or
// when `dictionary` yields fewer values than it reported as remaining.
template <>
void DictDecoderImpl<BooleanType>::SetDict(TypedDecoder<BooleanType>* dictionary) {
  dictionary_length_ = static_cast<int32_t>(dictionary->values_left());
  // Widen to 64 bits before multiplying so the byte count is not computed in
  // 32-bit arithmetic.
  PARQUET_THROW_NOT_OK(dictionary_->Resize(
      static_cast<int64_t>(dictionary_length_) * sizeof(bool), false));
  // A short read would leave part of the dictionary buffer unwritten, so it
  // is reported as an error instead of being ignored.
  if (dictionary->Decode(dictionary_->mutable_data_as<bool>(), dictionary_length_) !=
      dictionary_length_) {
    throw ParquetException("Could not decode boolean dictionary values");
  }
}

template <>
Expand Down Expand Up @@ -1257,6 +1263,31 @@ void DictDecoderImpl<ByteArrayType>::InsertDictionary(::arrow::ArrayBuilder* bui
PARQUET_THROW_NOT_OK(binary_builder->InsertMemoValues(*arr));
}

// Boolean flavour of the dictionary decoder. On top of the Decode overloads
// inherited from DictDecoderImpl<BooleanType>, it provides the uint8_t*
// overload declared by BooleanDecoder, so PLAIN_DICTIONARY and RLE_DICTIONARY
// pages can be decoded into the caller's bit-packed buffer.
class DictBooleanDecoderImpl : public DictDecoderImpl<BooleanType>,
                               virtual public BooleanDecoder {
 public:
  using BASE = DictDecoderImpl<BooleanType>;
  using BASE::BASE;
  using BASE::Decode;

  // Decodes at most `max_values` values (capped at the number of values still
  // pending) and writes value i to bit i of `buffer`.
  //
  // Returns the number of values written. Throws through
  // ParquetException::EofException() when the index decoder cannot supply
  // another index, and through PARQUET_THROW_NOT_OK when IndexInBounds
  // rejects a decoded index.
  int Decode(uint8_t* buffer, int max_values) override {
    const int num_to_decode = std::min(max_values, this->num_values_);
    const bool* dict_values = dictionary_->data_as<bool>();

    int position = 0;
    while (position < num_to_decode) {
      int32_t dict_index;
      const bool got_index = idx_decoder_.Get(&dict_index);
      if (ARROW_PREDICT_FALSE(!got_index)) {
        ParquetException::EofException();
      }
      // Validate before subscripting the dictionary buffer.
      PARQUET_THROW_NOT_OK(IndexInBounds(dict_index));
      ::arrow::bit_util::SetBitTo(buffer, position, dict_values[dict_index]);
      ++position;
    }

    this->num_values_ -= num_to_decode;
    return num_to_decode;
  }
};

class DictByteArrayDecoderImpl : public DictDecoderImpl<ByteArrayType> {
public:
using BASE = DictDecoderImpl<ByteArrayType>;
Expand Down Expand Up @@ -2459,7 +2490,7 @@ std::unique_ptr<Decoder> MakeDictDecoder(Type::type type_num,
MemoryPool* pool) {
switch (type_num) {
case Type::BOOLEAN:
ParquetException::NYI("Dictionary encoding not implemented for boolean type");
return std::make_unique<DictBooleanDecoderImpl>(descr, pool);
case Type::INT32:
return std::make_unique<DictDecoderImpl<Int32Type>>(descr, pool);
case Type::INT64:
Expand Down
29 changes: 27 additions & 2 deletions cpp/src/parquet/encoding_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -462,8 +462,33 @@ TYPED_TEST(TestDictionaryEncoding, BasicRoundTrip) {
ASSERT_NO_FATAL_FAILURE(this->Execute(2500, 2));
}

// Round trip a dictionary encoded boolean column. The dictionary holds two
// booleans, the index stream picks each by position, and the result is read
// back through both the bool buffer and the bit packed uint8_t buffer.
TEST(TestDictionaryEncoding, DictDecodesBoolean) {
  // PLAIN booleans are bit packed starting at the least significant bit, so
  // 0x02 encodes the two entry dictionary {false, true}.
  const uint8_t dict_bytes[] = {0x02};
  auto dict_plain_decoder = MakeTypedDecoder<BooleanType>(Encoding::PLAIN);
  dict_plain_decoder->SetData(2, dict_bytes, static_cast<int>(sizeof(dict_bytes)));

  auto decoder = MakeDictDecoder<BooleanType>();
  decoder->SetDict(dict_plain_decoder.get());

  // Index page: bit width 1, then header 0x03 (a bit packed run of one group
  // of eight values), then the payload 0x35 = 0b00110101 read LSB first,
  // i.e. indices 1,0,1,0,1,1,0,0.
  const uint8_t indices[] = {0x01, 0x03, 0x35};
  decoder->SetData(8, indices, static_cast<int>(sizeof(indices)));

  bool out_bool[8] = {};
  ASSERT_EQ(8, decoder->Decode(out_bool, 8));
  const bool expected[8] = {true, false, true, false, true, true, false, false};
  for (int i = 0; i < 8; ++i) {
    EXPECT_EQ(expected[i], out_bool[i]) << " at index " << i;
  }

  // Rewind the index stream and decode again through the bit packed
  // overload. Because the dictionary is {false, true}, the packed output
  // equals the index payload byte.
  decoder->SetData(8, indices, static_cast<int>(sizeof(indices)));
  uint8_t out_packed = 0;
  auto* bool_dec = dynamic_cast<BooleanDecoder*>(decoder.get());
  ASSERT_NE(bool_dec, nullptr);
  ASSERT_EQ(8, bool_dec->Decode(&out_packed, 8));
  EXPECT_EQ(0x35, out_packed);
}

// ----------------------------------------------------------------------
Expand Down
Loading