diff --git a/cpp/src/parquet/decoder.cc b/cpp/src/parquet/decoder.cc index 50ce510bb1f..697e4f64fb8 100644 --- a/cpp/src/parquet/decoder.cc +++ b/cpp/src/parquet/decoder.cc @@ -1038,7 +1038,13 @@ void DictDecoderImpl::SetDict(TypedDecoder* dictionary) { template <> void DictDecoderImpl::SetDict(TypedDecoder* dictionary) { - ParquetException::NYI("Dictionary encoding is not implemented for boolean values"); + dictionary_length_ = static_cast(dictionary->values_left()); /* take every value the dictionary decoder still holds */ + PARQUET_THROW_NOT_OK(dictionary_->Resize( + static_cast(dictionary_length_) * sizeof(bool), false)); + if (dictionary->Decode(dictionary_->mutable_data_as(), dictionary_length_) != + dictionary_length_) { /* anything but a full read is an error */ + throw ParquetException("Could not decode boolean dictionary values"); + } } template <> @@ -1257,6 +1263,31 @@ void DictDecoderImpl::InsertDictionary(::arrow::ArrayBuilder* bui PARQUET_THROW_NOT_OK(binary_builder->InsertMemoValues(*arr)); } +// Dictionary decoder for boolean column data. Decodes PLAIN_DICTIONARY +// and RLE_DICTIONARY pages into the caller's bit-packed uint8_t buffer. 
+class DictBooleanDecoderImpl : public DictDecoderImpl, + virtual public BooleanDecoder { + public: + using BASE = DictDecoderImpl; + using BASE::BASE; + using BASE::Decode; + + int Decode(uint8_t* buffer, int max_values) override { + max_values = std::min(max_values, this->num_values_); /* clamp to num_values_, which is decremented below */ + const auto* dict = dictionary_->data_as(); + for (int i = 0; i < max_values; ++i) { + int32_t index; + if (ARROW_PREDICT_FALSE(!idx_decoder_.Get(&index))) { + ParquetException::EofException(); /* index stream ran out before max_values entries */ + } + PARQUET_THROW_NOT_OK(IndexInBounds(index)); + ::arrow::bit_util::SetBitTo(buffer, i, dict[index]); /* value i lands in bit i of buffer */ + } + this->num_values_ -= max_values; + return max_values; + } +}; + class DictByteArrayDecoderImpl : public DictDecoderImpl { public: using BASE = DictDecoderImpl; @@ -2459,7 +2490,7 @@ std::unique_ptr MakeDictDecoder(Type::type type_num, MemoryPool* pool) { switch (type_num) { case Type::BOOLEAN: - ParquetException::NYI("Dictionary encoding not implemented for boolean type"); + return std::make_unique(descr, pool); case Type::INT32: return std::make_unique>(descr, pool); case Type::INT64: diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index 9c88eb468a4..97347895a09 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -462,8 +462,33 @@ TYPED_TEST(TestDictionaryEncoding, BasicRoundTrip) { ASSERT_NO_FATAL_FAILURE(this->Execute(2500, 2)); } -TEST(TestDictionaryEncoding, CannotDictDecodeBoolean) { - ASSERT_THROW(MakeDictDecoder(nullptr), ParquetException); +// Decode a dictionary-encoded boolean column from hand-written bytes. The +// dictionary holds two booleans, the index stream selects each by position, and +// the result is read through both the bool and the bit-packed uint8_t buffers. 
+TEST(TestDictionaryEncoding, DictDecodesBoolean) { + const uint8_t dict_bytes[] = {0x02}; /* 0b10; presumably entry 0 = false, entry 1 = true - TODO confirm bit order */ + auto dict_plain_decoder = MakeTypedDecoder(Encoding::PLAIN); + dict_plain_decoder->SetData(2, dict_bytes, 1); + + auto decoder = MakeDictDecoder(); + decoder->SetDict(dict_plain_decoder.get()); + + const uint8_t indices[] = {0x01, 0x03, 0x35}; /* presumably bit width, run header, payload - TODO confirm */ + decoder->SetData(8, indices, sizeof(indices)); + + bool out_bool[8] = {}; + ASSERT_EQ(8, decoder->Decode(out_bool, 8)); + const bool expected[8] = {true, false, true, false, true, true, false, false}; + for (int i = 0; i < 8; ++i) { + EXPECT_EQ(expected[i], out_bool[i]) << " at index " << i; + } + + decoder->SetData(8, indices, sizeof(indices)); /* feed the same index bytes again for the packed read */ + uint8_t out_packed = 0; + auto* bool_dec = dynamic_cast(decoder.get()); + ASSERT_NE(bool_dec, nullptr); + ASSERT_EQ(8, bool_dec->Decode(&out_packed, 8)); + EXPECT_EQ(0x35, out_packed); /* 0x35 is expected[] with value i in bit i */ } // ----------------------------------------------------------------------