diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc index 12e0f553b740..bc2612f92add 100644 --- a/cpp/src/arrow/record_batch.cc +++ b/cpp/src/arrow/record_batch.cc @@ -18,7 +18,6 @@ #include "arrow/record_batch.h" #include -#include #include #include #include diff --git a/cpp/src/arrow/record_batch.h b/cpp/src/arrow/record_batch.h index 0d1d2d4ac359..d1e5d541821f 100644 --- a/cpp/src/arrow/record_batch.h +++ b/cpp/src/arrow/record_batch.h @@ -90,11 +90,9 @@ class ARROW_EXPORT RecordBatch { /// in the resulting struct array. Result> ToStructArray() const; - /// \brief Convert record batch with one data type to Tensor + /// \brief Convert RecordBatch to Tensor /// - /// Create a Tensor object with shape (number of rows, number of columns) and - /// strides (type size in bytes, type size in bytes * number of rows). - /// Generated Tensor will have column-major layout. + /// Create a Tensor object with shape (number of rows, number of columns). /// /// \param[in] null_to_nan if true, convert nulls to NaN /// \param[in] row_major if true, create row-major Tensor else column-major Tensor diff --git a/cpp/src/arrow/record_batch_test.cc b/cpp/src/arrow/record_batch_test.cc index a037d7261efb..0cc7def76796 100644 --- a/cpp/src/arrow/record_batch_test.cc +++ b/cpp/src/arrow/record_batch_test.cc @@ -906,10 +906,11 @@ TEST_F(TestRecordBatch, ToTensorUnsupportedMissing) { auto batch = RecordBatch::Make(schema, length, {a0, a1}); - ASSERT_RAISES_WITH_MESSAGE(TypeError, - "Type error: Can only convert a RecordBatch with no nulls. " - "Set null_to_nan to true to convert nulls to NaN", - batch->ToTensor()); + ASSERT_RAISES_WITH_MESSAGE( + TypeError, + "Type error: Can only convert a Table or RecordBatch with no " + "nulls. 
Set null_to_nan to true to convert nulls to NaN", + batch->ToTensor()); } TEST_F(TestRecordBatch, ToTensorEmptyBatch) { @@ -940,10 +941,11 @@ TEST_F(TestRecordBatch, ToTensorEmptyBatch) { auto batch_no_columns = RecordBatch::Make(::arrow::schema({}), 10, std::vector>{}); - ASSERT_RAISES_WITH_MESSAGE(TypeError, - "Type error: Conversion to Tensor for RecordBatches without " - "columns/schema is not supported.", - batch_no_columns->ToTensor()); + ASSERT_RAISES_WITH_MESSAGE( + TypeError, + "Type error: Conversion to Tensor for Tables or RecordBatches " + "without columns/schema is not supported.", + batch_no_columns->ToTensor()); } template diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc index 68a8a1951f1c..2c9ed5195a6f 100644 --- a/cpp/src/arrow/table.cc +++ b/cpp/src/arrow/table.cc @@ -36,10 +36,12 @@ #include "arrow/record_batch.h" #include "arrow/result.h" #include "arrow/status.h" +#include "arrow/tensor.h" #include "arrow/type.h" #include "arrow/type_fwd.h" #include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/logging.h" #include "arrow/util/logging_internal.h" #include "arrow/util/vector.h" @@ -346,6 +348,14 @@ Result> Table::FromChunkedStructArray( array->length()); } +Result> Table::ToTensor(bool null_to_nan, bool row_major, + MemoryPool* pool) const { + std::shared_ptr tensor; + ARROW_RETURN_NOT_OK( + internal::TableToTensor(*this, null_to_nan, row_major, pool, &tensor)); + return tensor; +} + std::vector Table::ColumnNames() const { std::vector names(num_columns()); for (int i = 0; i < num_columns(); ++i) { diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h index dee6f6fdd3cb..051060a52c4b 100644 --- a/cpp/src/arrow/table.h +++ b/cpp/src/arrow/table.h @@ -102,6 +102,18 @@ class ARROW_EXPORT Table { static Result> FromChunkedStructArray( const std::shared_ptr& array); + /// \brief Convert Table to Tensor + /// + /// Create a Tensor object with shape (number of rows, number of columns). 
+ /// + /// \param[in] null_to_nan if true, convert nulls to NaN + /// \param[in] row_major if true, create row-major Tensor else column-major Tensor + /// \param[in] pool the memory pool to allocate the tensor buffer + /// \return the resulting Tensor + Result> ToTensor( + bool null_to_nan = false, bool row_major = true, + MemoryPool* pool = default_memory_pool()) const; + /// \brief Return the table schema const std::shared_ptr& schema() const { return schema_; } diff --git a/cpp/src/arrow/table_test.cc b/cpp/src/arrow/table_test.cc index 692671910b89..d3522b282df3 100644 --- a/cpp/src/arrow/table_test.cc +++ b/cpp/src/arrow/table_test.cc @@ -33,6 +33,7 @@ #include "arrow/compute/cast.h" #include "arrow/record_batch.h" #include "arrow/status.h" +#include "arrow/tensor.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" #include "arrow/type.h" @@ -520,6 +521,539 @@ TEST_F(TestTable, ConcatenateTables) { ASSERT_RAISES(Invalid, ConcatenateTables({t1, t3})); } +TEST_F(TestTable, ToTensorUnsupportedType) { + auto f0 = field("f0", int32()); + // Unsupported data type + auto f1 = field("f1", utf8()); + + std::vector> fields = {f0, f1}; + auto schema = ::arrow::schema(fields); + + auto a0 = ChunkedArrayFromJSON(int32(), {"[1, 2, 3]", "[4, 5, 6, 7, 8, 9]"}); + auto a1 = ChunkedArrayFromJSON( + utf8(), {R"(["a", "b", "c", "a", "b"])", R"(["c", "a", "b", "c"])"}); + + auto table = Table::Make(schema, {a0, a1}); + + ASSERT_RAISES_WITH_MESSAGE( + TypeError, "Type error: DataType is not supported: " + a1->type()->ToString(), + table->ToTensor()); + + // Unsupported boolean data type + auto f2 = field("f2", boolean()); + + std::vector> fields2 = {f0, f2}; + auto schema2 = ::arrow::schema(fields2); + auto a2 = ChunkedArrayFromJSON( + boolean(), {"[true, false, true, true, false, true, false, true, true]"}); + auto table2 = Table::Make(schema2, {a0, a2}); + + ASSERT_RAISES_WITH_MESSAGE( + TypeError, "Type error: DataType is not supported: " + 
a2->type()->ToString(), + table2->ToTensor()); +} + +TEST_F(TestTable, ToTensorUnsupportedMissing) { + auto f0 = field("f0", int32()); + auto f1 = field("f1", int32()); + + std::vector> fields = {f0, f1}; + auto schema = ::arrow::schema(fields); + + auto a0 = ChunkedArrayFromJSON(int32(), {"[1, 2, 3]", "[4, 5, 6, 7, 8, 9]"}); + auto a1 = ChunkedArrayFromJSON(int32(), {"[10, 20]", "[30, 40, null, 60, 70, 80, 90]"}); + + auto table = Table::Make(schema, {a0, a1}); + + ASSERT_RAISES_WITH_MESSAGE( + TypeError, + "Type error: Can only convert a Table or RecordBatch with no " + "nulls. Set null_to_nan to true to convert nulls to NaN", + table->ToTensor()); +} + +TEST_F(TestTable, ToTensorEmptyTable) { + auto f0 = field("f0", int32()); + auto f1 = field("f1", int32()); + + std::vector> fields = {f0, f1}; + auto schema = ::arrow::schema(fields); + + ASSERT_OK_AND_ASSIGN(std::shared_ptr empty, Table::MakeEmpty(schema)); + + ASSERT_OK_AND_ASSIGN(auto tensor_column, + empty->ToTensor(/*null_to_nan=*/false, /*row_major=*/false)); + ASSERT_OK(tensor_column->Validate()); + + ASSERT_OK_AND_ASSIGN(auto tensor_row, empty->ToTensor()); + ASSERT_OK(tensor_row->Validate()); + + const std::vector strides = {4, 4}; + const std::vector shape = {0, 2}; + + EXPECT_EQ(strides, tensor_column->strides()); + EXPECT_EQ(shape, tensor_column->shape()); + EXPECT_EQ(strides, tensor_row->strides()); + EXPECT_EQ(shape, tensor_row->shape()); + + std::vector> columns; + auto t2 = Table::Make(::arrow::schema({}), columns); + auto table_no_columns = + Table::Make(::arrow::schema({}), std::vector>{}); + + ASSERT_RAISES_WITH_MESSAGE( + TypeError, + "Type error: Conversion to Tensor for Tables or RecordBatches " + "without columns/schema is not supported.", + table_no_columns->ToTensor()); +} + +template +void CheckTableToTensor(const std::shared_ptr& tensor, const int size, + const std::vector shape, + const std::vector f_strides) { + EXPECT_EQ(size, tensor->size()); + 
EXPECT_EQ(TypeTraits::type_singleton(), tensor->type()); + EXPECT_EQ(shape, tensor->shape()); + EXPECT_EQ(f_strides, tensor->strides()); + EXPECT_FALSE(tensor->is_row_major()); + EXPECT_TRUE(tensor->is_column_major()); + EXPECT_TRUE(tensor->is_contiguous()); +} + +template +void CheckTableToTensorRowMajor(const std::shared_ptr& tensor, const int size, + const std::vector shape, + const std::vector strides) { + EXPECT_EQ(size, tensor->size()); + EXPECT_EQ(TypeTraits::type_singleton(), tensor->type()); + EXPECT_EQ(shape, tensor->shape()); + EXPECT_EQ(strides, tensor->strides()); + EXPECT_TRUE(tensor->is_row_major()); + EXPECT_FALSE(tensor->is_column_major()); + EXPECT_TRUE(tensor->is_contiguous()); +} + +TEST_F(TestTable, ToTensorSupportedNaN) { + auto f0 = field("f0", float32()); + auto f1 = field("f1", float32()); + + std::vector> fields = {f0, f1}; + auto schema = ::arrow::schema(fields); + + auto a0 = ChunkedArrayFromJSON(float32(), {"[NaN, 2, 3]", "[4, 5, 6, 7, 8, 9]"}); + auto a1 = + ChunkedArrayFromJSON(float32(), {"[10, 20]", "[30, 40, NaN, 60, 70, 80, 90]"}); + + auto table = Table::Make(schema, {a0, a1}); + + ASSERT_OK_AND_ASSIGN(auto tensor, + table->ToTensor(/*null_to_nan=*/false, /*row_major=*/false)); + ASSERT_OK(tensor->Validate()); + + std::vector shape = {9, 2}; + const int64_t f32_size = sizeof(float); + std::vector f_strides = {f32_size, f32_size * shape[0]}; + std::shared_ptr tensor_expected = TensorFromJSON( + float32(), "[NaN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, NaN, 60, 70, 80, 90]", + shape, f_strides); + + EXPECT_FALSE(tensor_expected->Equals(*tensor)); + EXPECT_TRUE(tensor_expected->Equals(*tensor, EqualOptions().nans_equal(true))); + CheckTableToTensor(tensor, 18, shape, f_strides); +} + +TEST_F(TestTable, ToTensorSupportedNullToNan) { + // int32 + float32 = float64 + auto f0 = field("f0", int32()); + auto f1 = field("f1", float32()); + + std::vector> fields = {f0, f1}; + auto schema = ::arrow::schema(fields); + + auto a0 = 
ChunkedArrayFromJSON(int32(), {"[null, 2, 3]", "[4, 5, 6, 7, 8, 9]"}); + auto a1 = + ChunkedArrayFromJSON(float32(), {"[10, 20]", "[30, 40, null, 60, 70, 80, 90]"}); + + auto table = Table::Make(schema, {a0, a1}); + + ASSERT_OK_AND_ASSIGN(auto tensor, + table->ToTensor(/*null_to_nan=*/true, /*row_major=*/false)); + ASSERT_OK(tensor->Validate()); + + std::vector shape = {9, 2}; + const int64_t f64_size = sizeof(double); + std::vector f_strides = {f64_size, f64_size * shape[0]}; + std::shared_ptr tensor_expected = TensorFromJSON( + float64(), "[NaN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, NaN, 60, 70, 80, 90]", + shape, f_strides); + + EXPECT_FALSE(tensor_expected->Equals(*tensor)); + EXPECT_TRUE(tensor_expected->Equals(*tensor, EqualOptions().nans_equal(true))); + + CheckTableToTensor(tensor, 18, shape, f_strides); + + ASSERT_OK_AND_ASSIGN(auto tensor_row, table->ToTensor(/*null_to_nan=*/true)); + ASSERT_OK(tensor_row->Validate()); + + std::vector strides = {f64_size * shape[1], f64_size}; + std::shared_ptr tensor_expected_row = TensorFromJSON( + float64(), "[NaN, 10, 2, 20, 3, 30, 4, 40, 5, NaN, 6, 60, 7, 70, 8, 80, 9, 90]", + shape, strides); + + EXPECT_FALSE(tensor_expected_row->Equals(*tensor_row)); + EXPECT_TRUE(tensor_expected_row->Equals(*tensor_row, EqualOptions().nans_equal(true))); + + CheckTableToTensorRowMajor(tensor_row, 18, shape, strides); + + // int32 -> float64 + auto f2 = field("f2", int32()); + + std::vector> fields1 = {f0, f2}; + auto schema1 = ::arrow::schema(fields1); + + auto a2 = ChunkedArrayFromJSON(int32(), {"[10, 20]", "[30, 40, null, 60, 70, 80, 90]"}); + auto table1 = Table::Make(schema1, {a0, a2}); + + ASSERT_OK_AND_ASSIGN(auto tensor1, + table1->ToTensor(/*null_to_nan=*/true, /*row_major=*/false)); + ASSERT_OK(tensor1->Validate()); + + EXPECT_FALSE(tensor_expected->Equals(*tensor1)); + EXPECT_TRUE(tensor_expected->Equals(*tensor1, EqualOptions().nans_equal(true))); + + CheckTableToTensor(tensor1, 18, shape, f_strides); + + 
ASSERT_OK_AND_ASSIGN(auto tensor1_row, table1->ToTensor(/*null_to_nan=*/true)); + ASSERT_OK(tensor1_row->Validate()); + + EXPECT_FALSE(tensor_expected_row->Equals(*tensor1_row)); + EXPECT_TRUE(tensor_expected_row->Equals(*tensor1_row, EqualOptions().nans_equal(true))); + + CheckTableToTensorRowMajor(tensor1_row, 18, shape, strides); + + // int8 -> float32 + auto f3 = field("f3", int8()); + auto f4 = field("f4", int8()); + + std::vector> fields2 = {f3, f4}; + auto schema2 = ::arrow::schema(fields2); + + auto a3 = ChunkedArrayFromJSON(int8(), {"[null, 2, 3]", "[4, 5, 6, 7, 8, 9]"}); + auto a4 = ChunkedArrayFromJSON(int8(), {"[10, 20]", "[30, 40, null, 60, 70, 80, 90]"}); + auto table2 = Table::Make(schema2, {a3, a4}); + + ASSERT_OK_AND_ASSIGN(auto tensor2, + table2->ToTensor(/*null_to_nan=*/true, /*row_major=*/false)); + ASSERT_OK(tensor2->Validate()); + + const int64_t f32_size = sizeof(float); + std::vector f_strides_2 = {f32_size, f32_size * shape[0]}; + std::shared_ptr tensor_expected_2 = TensorFromJSON( + float32(), "[NaN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, NaN, 60, 70, 80, 90]", + shape, f_strides_2); + + EXPECT_FALSE(tensor_expected_2->Equals(*tensor2)); + EXPECT_TRUE(tensor_expected_2->Equals(*tensor2, EqualOptions().nans_equal(true))); + + CheckTableToTensor(tensor2, 18, shape, f_strides_2); + + ASSERT_OK_AND_ASSIGN(auto tensor2_row, table2->ToTensor(/*null_to_nan=*/true)); + ASSERT_OK(tensor2_row->Validate()); + + std::vector strides_2 = {f32_size * shape[1], f32_size}; + std::shared_ptr tensor2_expected_row = TensorFromJSON( + float32(), "[NaN, 10, 2, 20, 3, 30, 4, 40, 5, NaN, 6, 60, 7, 70, 8, 80, 9, 90]", + shape, strides_2); + + EXPECT_FALSE(tensor2_expected_row->Equals(*tensor2_row)); + EXPECT_TRUE( + tensor2_expected_row->Equals(*tensor2_row, EqualOptions().nans_equal(true))); + + CheckTableToTensorRowMajor(tensor2_row, 18, shape, strides_2); +} + +TEST_F(TestTable, ToTensorSupportedTypesMixed) { + auto f0 = field("f0", uint16()); + auto f1 = 
field("f1", int16()); + auto f2 = field("f2", float32()); + + auto a0 = ChunkedArrayFromJSON(uint16(), {"[1, 2, 3]", "[4, 5, 6, 7, 8, 9]"}); + auto a1 = ChunkedArrayFromJSON(int16(), {"[10, 20]", "[30, 40, 50, 60, 70, 80, 90]"}); + auto a2 = ChunkedArrayFromJSON(float32(), + {"[100, 200, 300, NaN, 500, 600]", "[700, 800, 900]"}); + + // Single column + std::vector> fields = {f0}; + auto schema = ::arrow::schema(fields); + auto table = Table::Make(schema, {a0}); + + ASSERT_OK_AND_ASSIGN(auto tensor, + table->ToTensor(/*null_to_nan=*/false, /*row_major=*/false)); + ASSERT_OK(tensor->Validate()); + + std::vector shape = {9, 1}; + const int64_t uint16_size = sizeof(uint16_t); + std::vector f_strides = {uint16_size, uint16_size * shape[0]}; + std::shared_ptr tensor_expected = + TensorFromJSON(uint16(), "[1, 2, 3, 4, 5, 6, 7, 8, 9]", shape, f_strides); + + EXPECT_TRUE(tensor_expected->Equals(*tensor)); + CheckTableToTensor(tensor, 9, shape, f_strides); + + // uint16 + int16 = int32 + std::vector> fields1 = {f0, f1}; + auto schema1 = ::arrow::schema(fields1); + auto table1 = Table::Make(schema1, {a0, a1}); + + ASSERT_OK_AND_ASSIGN(auto tensor1, + table1->ToTensor(/*null_to_nan=*/false, /*row_major=*/false)); + ASSERT_OK(tensor1->Validate()); + + std::vector shape1 = {9, 2}; + const int64_t int32_size = sizeof(int32_t); + std::vector f_strides_1 = {int32_size, int32_size * shape1[0]}; + std::shared_ptr tensor_expected_1 = TensorFromJSON( + int32(), "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80, 90]", + shape1, f_strides_1); + + EXPECT_TRUE(tensor_expected_1->Equals(*tensor1)); + + CheckTableToTensor(tensor1, 18, shape1, f_strides_1); + + ASSERT_EQ(tensor1->type()->bit_width(), tensor_expected_1->type()->bit_width()); + + ASSERT_EQ(1, tensor_expected_1->Value({0, 0})); + ASSERT_EQ(2, tensor_expected_1->Value({1, 0})); + ASSERT_EQ(10, tensor_expected_1->Value({0, 1})); + + // uint16 + int16 + float32 = float64 + std::vector> fields2 = {f0, f1, f2}; + auto 
schema2 = ::arrow::schema(fields2); + auto table2 = Table::Make(schema2, {a0, a1, a2}); + + ASSERT_OK_AND_ASSIGN(auto tensor2, + table2->ToTensor(/*null_to_nan=*/false, /*row_major=*/false)); + ASSERT_OK(tensor2->Validate()); + + std::vector shape2 = {9, 3}; + const int64_t f64_size = sizeof(double); + std::vector f_strides_2 = {f64_size, f64_size * shape2[0]}; + std::shared_ptr tensor_expected_2 = + TensorFromJSON(float64(), + "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50," + "60, 70, 80, 90, 100, 200, 300, NaN, 500, 600, 700, 800, 900]", + shape2, f_strides_2); + + EXPECT_FALSE(tensor_expected_2->Equals(*tensor2)); + EXPECT_TRUE(tensor_expected_2->Equals(*tensor2, EqualOptions().nans_equal(true))); + + CheckTableToTensor(tensor2, 27, shape2, f_strides_2); +} + +TEST_F(TestTable, ToTensorUnsupportedMixedFloat16) { + auto f0 = field("f0", float16()); + auto f1 = field("f1", float64()); + + auto a0 = ChunkedArrayFromJSON(float16(), {"[1, 2, 3]", "[4, 5, 6, 7, 8, 9]"}); + auto a1 = ChunkedArrayFromJSON(float64(), {"[10, 20]", "[30, 40, 50, 60, 70, 80, 90]"}); + + std::vector> fields = {f0, f1}; + auto schema = ::arrow::schema(fields); + auto table = Table::Make(schema, {a0, a1}); + + ASSERT_RAISES_WITH_MESSAGE( + NotImplemented, "NotImplemented: Casting from or to halffloat is not supported.", + table->ToTensor()); + + std::vector> fields1 = {f1, f0}; + auto schema1 = ::arrow::schema(fields1); + auto table1 = Table::Make(schema1, {a1, a0}); + + ASSERT_RAISES_WITH_MESSAGE( + NotImplemented, "NotImplemented: Casting from or to halffloat is not supported.", + table1->ToTensor()); +} + +template +class TestTableToTensorColumnMajor : public ::testing::Test {}; + +TYPED_TEST_SUITE_P(TestTableToTensorColumnMajor); + +TYPED_TEST_P(TestTableToTensorColumnMajor, SupportedTypes) { + using DataType = TypeParam; + using c_data_type = typename DataType::c_type; + const int unit_size = sizeof(c_data_type); + + auto f0 = field("f0", TypeTraits::type_singleton()); + auto f1 = 
field("f1", TypeTraits::type_singleton()); + auto f2 = field("f2", TypeTraits::type_singleton()); + + std::vector> fields = {f0, f1, f2}; + auto schema = ::arrow::schema(fields); + + auto a0 = ChunkedArrayFromJSON(TypeTraits::type_singleton(), + {"[1, 2, 3]", "[4, 5, 6, 7, 8, 9]"}); + auto a1 = ChunkedArrayFromJSON(TypeTraits::type_singleton(), + {"[10, 20]", "[30, 40, 50, 60, 70, 80, 90]"}); + auto a2 = ChunkedArrayFromJSON(TypeTraits::type_singleton(), + {"[100, 100, 100, 100, 100, 100]", "[100, 100, 100]"}); + + auto table = Table::Make(schema, {a0, a1, a2}); + + ASSERT_OK_AND_ASSIGN(auto tensor, + table->ToTensor(/*null_to_nan=*/false, /*row_major=*/false)); + ASSERT_OK(tensor->Validate()); + + std::vector shape = {9, 3}; + std::vector f_strides = {unit_size, unit_size * shape[0]}; + std::shared_ptr tensor_expected = TensorFromJSON( + TypeTraits::type_singleton(), + "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, " + "80, 90, 100, 100, 100, 100, 100, 100, 100, 100, 100]", + shape, f_strides); + + EXPECT_TRUE(tensor_expected->Equals(*tensor)); + CheckTableToTensor(tensor, 27, shape, f_strides); + + // Test offsets + auto table_slice = table->Slice(1); + + ASSERT_OK_AND_ASSIGN(auto tensor_sliced, table_slice->ToTensor(/*null_to_nan=*/false, + /*row_major=*/false)); + ASSERT_OK(tensor_sliced->Validate()); + + std::vector shape_sliced = {8, 3}; + std::vector f_strides_sliced = {unit_size, unit_size * shape_sliced[0]}; + std::shared_ptr tensor_expected_sliced = + TensorFromJSON(TypeTraits::type_singleton(), + "[2, 3, 4, 5, 6, 7, 8, 9, 20, 30, 40, 50, 60, " + "70, 80, 90, 100, 100, 100, 100, 100, 100, 100, 100]", + shape_sliced, f_strides_sliced); + + EXPECT_TRUE(tensor_expected_sliced->Equals(*tensor_sliced)); + CheckTableToTensor(tensor_sliced, 24, shape_sliced, f_strides_sliced); + + auto table_slice_1 = table->Slice(1, 5); + + ASSERT_OK_AND_ASSIGN( + auto tensor_sliced_1, + table_slice_1->ToTensor(/*null_to_nan=*/false, /*row_major=*/false)); + 
ASSERT_OK(tensor_sliced_1->Validate()); + + std::vector shape_sliced_1 = {5, 3}; + std::vector f_strides_sliced_1 = {unit_size, unit_size * shape_sliced_1[0]}; + std::shared_ptr tensor_expected_sliced_1 = + TensorFromJSON(TypeTraits::type_singleton(), + "[2, 3, 4, 5, 6, 20, 30, 40, 50, 60, 100, 100, 100, 100, 100]", + shape_sliced_1, f_strides_sliced_1); + + EXPECT_TRUE(tensor_expected_sliced_1->Equals(*tensor_sliced_1)); + CheckTableToTensor(tensor_sliced_1, 15, shape_sliced_1, f_strides_sliced_1); +} + +REGISTER_TYPED_TEST_SUITE_P(TestTableToTensorColumnMajor, SupportedTypes); + +INSTANTIATE_TYPED_TEST_SUITE_P(UInt8, TestTableToTensorColumnMajor, UInt8Type); +INSTANTIATE_TYPED_TEST_SUITE_P(UInt16, TestTableToTensorColumnMajor, UInt16Type); +INSTANTIATE_TYPED_TEST_SUITE_P(UInt32, TestTableToTensorColumnMajor, UInt32Type); +INSTANTIATE_TYPED_TEST_SUITE_P(UInt64, TestTableToTensorColumnMajor, UInt64Type); +INSTANTIATE_TYPED_TEST_SUITE_P(Int8, TestTableToTensorColumnMajor, Int8Type); +INSTANTIATE_TYPED_TEST_SUITE_P(Int16, TestTableToTensorColumnMajor, Int16Type); +INSTANTIATE_TYPED_TEST_SUITE_P(Int32, TestTableToTensorColumnMajor, Int32Type); +INSTANTIATE_TYPED_TEST_SUITE_P(Int64, TestTableToTensorColumnMajor, Int64Type); +INSTANTIATE_TYPED_TEST_SUITE_P(Float16, TestTableToTensorColumnMajor, HalfFloatType); +INSTANTIATE_TYPED_TEST_SUITE_P(Float32, TestTableToTensorColumnMajor, FloatType); +INSTANTIATE_TYPED_TEST_SUITE_P(Float64, TestTableToTensorColumnMajor, DoubleType); + +template +class TestTableToTensorRowMajor : public ::testing::Test {}; + +TYPED_TEST_SUITE_P(TestTableToTensorRowMajor); + +TYPED_TEST_P(TestTableToTensorRowMajor, SupportedTypes) { + using DataType = TypeParam; + using c_data_type = typename DataType::c_type; + const int unit_size = sizeof(c_data_type); + + auto f0 = field("f0", TypeTraits::type_singleton()); + auto f1 = field("f1", TypeTraits::type_singleton()); + auto f2 = field("f2", TypeTraits::type_singleton()); + + std::vector> fields = 
{f0, f1, f2}; + auto schema = ::arrow::schema(fields); + + auto a0 = ChunkedArrayFromJSON(TypeTraits::type_singleton(), + {"[1, 2, 3]", "[4, 5, 6, 7, 8, 9]"}); + auto a1 = ChunkedArrayFromJSON(TypeTraits::type_singleton(), + {"[10, 20]", "[30, 40, 50, 60, 70, 80, 90]"}); + auto a2 = ChunkedArrayFromJSON(TypeTraits::type_singleton(), + {"[100, 100, 100, 100, 100, 100]", "[100, 100, 100]"}); + + auto table = Table::Make(schema, {a0, a1, a2}); + + ASSERT_OK_AND_ASSIGN(auto tensor, table->ToTensor()); + ASSERT_OK(tensor->Validate()); + + std::vector shape = {9, 3}; + std::vector strides = {unit_size * shape[1], unit_size}; + std::shared_ptr tensor_expected = + TensorFromJSON(TypeTraits::type_singleton(), + "[1, 10, 100, 2, 20, 100, 3, 30, 100, 4, 40, 100, 5, 50, 100, 6, " + "60, 100, 7, 70, 100, 8, 80, 100, 9, 90, 100]", + shape, strides); + + EXPECT_TRUE(tensor_expected->Equals(*tensor)); + CheckTableToTensorRowMajor(tensor, 27, shape, strides); + + // Test offsets + auto table_slice = table->Slice(1); + + ASSERT_OK_AND_ASSIGN(auto tensor_sliced, table_slice->ToTensor()); + ASSERT_OK(tensor_sliced->Validate()); + + std::vector shape_sliced = {8, 3}; + std::vector strides_sliced = {unit_size * shape[1], unit_size}; + std::shared_ptr tensor_expected_sliced = + TensorFromJSON(TypeTraits::type_singleton(), + "[2, 20, 100, 3, 30, 100, 4, 40, 100, 5, 50, 100, 6, " + "60, 100, 7, 70, 100, 8, 80, 100, 9, 90, 100]", + shape_sliced, strides_sliced); + + EXPECT_TRUE(tensor_expected_sliced->Equals(*tensor_sliced)); + CheckTableToTensorRowMajor(tensor_sliced, 24, shape_sliced, strides_sliced); + + auto table_slice_1 = table->Slice(1, 5); + + ASSERT_OK_AND_ASSIGN(auto tensor_sliced_1, table_slice_1->ToTensor()); + ASSERT_OK(tensor_sliced_1->Validate()); + + std::vector shape_sliced_1 = {5, 3}; + std::vector strides_sliced_1 = {unit_size * shape_sliced_1[1], unit_size}; + std::shared_ptr tensor_expected_sliced_1 = + TensorFromJSON(TypeTraits::type_singleton(), + "[2, 20, 100, 3, 30, 
100, 4, 40, 100, 5, 50, 100, 6, 60, 100]", + shape_sliced_1, strides_sliced_1); + + EXPECT_TRUE(tensor_expected_sliced_1->Equals(*tensor_sliced_1)); + CheckTableToTensorRowMajor(tensor_sliced_1, 15, shape_sliced_1, + strides_sliced_1); +} + +REGISTER_TYPED_TEST_SUITE_P(TestTableToTensorRowMajor, SupportedTypes); + +INSTANTIATE_TYPED_TEST_SUITE_P(UInt8, TestTableToTensorRowMajor, UInt8Type); +INSTANTIATE_TYPED_TEST_SUITE_P(UInt16, TestTableToTensorRowMajor, UInt16Type); +INSTANTIATE_TYPED_TEST_SUITE_P(UInt32, TestTableToTensorRowMajor, UInt32Type); +INSTANTIATE_TYPED_TEST_SUITE_P(UInt64, TestTableToTensorRowMajor, UInt64Type); +INSTANTIATE_TYPED_TEST_SUITE_P(Int8, TestTableToTensorRowMajor, Int8Type); +INSTANTIATE_TYPED_TEST_SUITE_P(Int16, TestTableToTensorRowMajor, Int16Type); +INSTANTIATE_TYPED_TEST_SUITE_P(Int32, TestTableToTensorRowMajor, Int32Type); +INSTANTIATE_TYPED_TEST_SUITE_P(Int64, TestTableToTensorRowMajor, Int64Type); +INSTANTIATE_TYPED_TEST_SUITE_P(Float16, TestTableToTensorRowMajor, HalfFloatType); +INSTANTIATE_TYPED_TEST_SUITE_P(Float32, TestTableToTensorRowMajor, FloatType); +INSTANTIATE_TYPED_TEST_SUITE_P(Float64, TestTableToTensorRowMajor, DoubleType); + std::shared_ptr
MakeTableWithOneNullFilledColumn( const std::string& column_name, const std::shared_ptr& data_type, const int length) { diff --git a/cpp/src/arrow/tensor.cc b/cpp/src/arrow/tensor.cc index 8cdf7f82d264..c7bb049a1fd0 100644 --- a/cpp/src/arrow/tensor.cc +++ b/cpp/src/arrow/tensor.cc @@ -28,8 +28,8 @@ #include #include -#include "arrow/record_batch.h" #include "arrow/status.h" +#include "arrow/table.h" #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" @@ -224,7 +224,7 @@ Status ValidateTensorParameters(const std::shared_ptr& type, } template -struct ConvertColumnsToTensorVisitor { +struct ConvertArrayToTensorVisitor { Out*& out_values; const ArrayData& in_data; @@ -256,11 +256,12 @@ struct ConvertColumnsToTensorVisitor { }; template -struct ConvertColumnsToTensorRowMajorVisitor { +struct ConvertArrayToTensorRowMajorVisitor { Out*& out_values; const ArrayData& in_data; - int num_cols; - int col_idx; + int64_t num_cols; + int64_t col_idx; + int64_t chunk_idx; template Status Visit(const T&) { @@ -268,13 +269,15 @@ struct ConvertColumnsToTensorRowMajorVisitor { using In = typename T::c_type; auto in_values = ArraySpan(in_data).GetSpan(1, in_data.length); + const int64_t base = chunk_idx * num_cols + col_idx; + if (in_data.null_count == 0) { for (int64_t i = 0; i < in_data.length; ++i) { - out_values[i * num_cols + col_idx] = static_cast(in_values[i]); + out_values[base + i * num_cols] = static_cast(in_values[i]); } } else { for (int64_t i = 0; i < in_data.length; ++i) { - out_values[i * num_cols + col_idx] = + out_values[base + i * num_cols] = in_data.IsNull(i) ? 
static_cast(NAN) : static_cast(in_values[i]); } } @@ -284,50 +287,73 @@ struct ConvertColumnsToTensorRowMajorVisitor { } }; -template -inline void ConvertColumnsToTensor(const RecordBatch& batch, uint8_t* out, +template +inline void ConvertColumnsToTensor(const Container& container, uint8_t* out, bool row_major) { using CType = typename arrow::TypeTraits::CType; auto* out_values = reinterpret_cast(out); - int i = 0; - for (const auto& column : batch.columns()) { - if (row_major) { - ConvertColumnsToTensorRowMajorVisitor visitor{out_values, *column->data(), - batch.num_columns(), i++}; - DCHECK_OK(VisitTypeInline(*column->type(), &visitor)); - } else { - ConvertColumnsToTensorVisitor visitor{out_values, *column->data()}; - DCHECK_OK(VisitTypeInline(*column->type(), &visitor)); + for (int col_idx = 0; col_idx < container.num_columns(); ++col_idx) { + if constexpr (std::is_same_v) { + int64_t chunk_idx = 0; + + for (const auto& chunk : container.column(col_idx)->chunks()) { + if (row_major) { + ConvertArrayToTensorRowMajorVisitor visitor{ + out_values, *chunk->data(), container.num_columns(), col_idx, chunk_idx}; + DCHECK_OK(VisitTypeInline(*chunk->type(), &visitor)); + chunk_idx += chunk->length(); + } else { + ConvertArrayToTensorVisitor visitor{out_values, *chunk->data()}; + DCHECK_OK(VisitTypeInline(*chunk->type(), &visitor)); + } + } + } else if constexpr (std::is_same_v) { + const auto& array_data = container.column_data(col_idx); + + if (row_major) { + ConvertArrayToTensorRowMajorVisitor visitor{ + out_values, *array_data, container.num_columns(), col_idx, 0}; + DCHECK_OK(VisitTypeInline(*array_data->type, &visitor)); + } else { + ConvertArrayToTensorVisitor visitor{out_values, *array_data}; + DCHECK_OK(VisitTypeInline(*array_data->type, &visitor)); + } } } } -Status RecordBatchToTensor(const RecordBatch& batch, bool null_to_nan, bool row_major, - MemoryPool* pool, std::shared_ptr* tensor) { - if (batch.num_columns() == 0) { +template +Status ToTensorImpl(const 
Container& container, bool null_to_nan, bool row_major, + MemoryPool* pool, std::shared_ptr* tensor) { + if (container.num_columns() == 0) { return Status::TypeError( - "Conversion to Tensor for RecordBatches without columns/schema is not " + "Conversion to Tensor for Tables or RecordBatches without columns/schema is not " "supported."); } // Check for no validity bitmap of each field // if null_to_nan conversion is set to false - for (int i = 0; i < batch.num_columns(); ++i) { - if (batch.column(i)->null_count() > 0 && !null_to_nan) { + for (int i = 0; i < container.num_columns(); ++i) { + int64_t null_count; + if constexpr (std::is_same_v) { + null_count = container.column(i)->null_count(); + } else if constexpr (std::is_same_v) { + null_count = container.column_data(i)->GetNullCount(); + } + if (null_count > 0 && !null_to_nan) { return Status::TypeError( - "Can only convert a RecordBatch with no nulls. Set null_to_nan to true to " - "convert nulls to NaN"); + "Can only convert a Table or RecordBatch with no nulls. 
Set null_to_nan to " + "true to convert nulls to NaN"); } } // Check for supported data types and merge fields // to get the resulting uniform data type - if (!is_integer(batch.column(0)->type()->id()) && - !is_floating(batch.column(0)->type()->id())) { - return Status::TypeError("DataType is not supported: ", - batch.column(0)->type()->ToString()); + const auto& col_0_type = container.schema()->field(0)->type(); + if (!is_integer(col_0_type->id()) && !is_floating(col_0_type->id())) { + return Status::TypeError("DataType is not supported: ", col_0_type->ToString()); } - std::shared_ptr result_field = batch.schema()->field(0); + std::shared_ptr result_field = container.schema()->field(0); std::shared_ptr result_type = result_field->type(); Field::MergeOptions options; @@ -335,24 +361,25 @@ Status RecordBatchToTensor(const RecordBatch& batch, bool null_to_nan, bool row_ options.promote_integer_sign = true; options.promote_numeric_width = true; - if (batch.num_columns() > 1) { - for (int i = 1; i < batch.num_columns(); ++i) { - if (!is_numeric(batch.column(i)->type()->id())) { - return Status::TypeError("DataType is not supported: ", - batch.column(i)->type()->ToString()); + if (container.num_columns() > 1) { + for (int i = 1; i < container.num_columns(); ++i) { + const auto& col_type = container.schema()->field(i)->type(); + + if (!is_numeric(col_type->id())) { + return Status::TypeError("DataType is not supported: ", col_type->ToString()); } // Casting of float16 is not supported, throw an error in this case - if ((batch.column(i)->type()->id() == Type::HALF_FLOAT || + if ((col_type->id() == Type::HALF_FLOAT || result_field->type()->id() == Type::HALF_FLOAT) && - batch.column(i)->type()->id() != result_field->type()->id()) { + col_type->id() != result_field->type()->id()) { return Status::NotImplemented("Casting from or to halffloat is not supported."); } ARROW_ASSIGN_OR_RAISE( result_field, result_field->MergeWith( - 
batch.schema()->field(i)->WithName(result_field->name()), options)); + container.schema()->field(i)->WithName(result_field->name()), options)); } result_type = result_field->type(); } @@ -367,42 +394,42 @@ Status RecordBatchToTensor(const RecordBatch& batch, bool null_to_nan, bool row_ } // Allocate memory - ARROW_ASSIGN_OR_RAISE( - std::shared_ptr result, - AllocateBuffer(result_type->bit_width() * batch.num_columns() * batch.num_rows(), - pool)); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr result, + AllocateBuffer(result_type->byte_width() * + container.num_columns() * container.num_rows(), + pool)); // Copy data switch (result_type->id()) { case Type::UINT8: - ConvertColumnsToTensor(batch, result->mutable_data(), row_major); + ConvertColumnsToTensor(container, result->mutable_data(), row_major); break; case Type::UINT16: case Type::HALF_FLOAT: - ConvertColumnsToTensor(batch, result->mutable_data(), row_major); + ConvertColumnsToTensor(container, result->mutable_data(), row_major); break; case Type::UINT32: - ConvertColumnsToTensor(batch, result->mutable_data(), row_major); + ConvertColumnsToTensor(container, result->mutable_data(), row_major); break; case Type::UINT64: - ConvertColumnsToTensor(batch, result->mutable_data(), row_major); + ConvertColumnsToTensor(container, result->mutable_data(), row_major); break; case Type::INT8: - ConvertColumnsToTensor(batch, result->mutable_data(), row_major); + ConvertColumnsToTensor(container, result->mutable_data(), row_major); break; case Type::INT16: - ConvertColumnsToTensor(batch, result->mutable_data(), row_major); + ConvertColumnsToTensor(container, result->mutable_data(), row_major); break; case Type::INT32: - ConvertColumnsToTensor(batch, result->mutable_data(), row_major); + ConvertColumnsToTensor(container, result->mutable_data(), row_major); break; case Type::INT64: - ConvertColumnsToTensor(batch, result->mutable_data(), row_major); + ConvertColumnsToTensor(container, result->mutable_data(), row_major); break; case 
Type::FLOAT: - ConvertColumnsToTensor(batch, result->mutable_data(), row_major); + ConvertColumnsToTensor(container, result->mutable_data(), row_major); break; case Type::DOUBLE: - ConvertColumnsToTensor(batch, result->mutable_data(), row_major); + ConvertColumnsToTensor(container, result->mutable_data(), row_major); break; default: return Status::TypeError("DataType is not supported: ", result_type->ToString()); @@ -411,7 +438,7 @@ Status RecordBatchToTensor(const RecordBatch& batch, bool null_to_nan, bool row_ // Construct Tensor object const auto& fixed_width_type = internal::checked_cast(*result_type); - std::vector shape = {batch.num_rows(), batch.num_columns()}; + std::vector shape = {container.num_rows(), container.num_columns()}; std::vector strides; if (row_major) { @@ -426,6 +453,16 @@ Status RecordBatchToTensor(const RecordBatch& batch, bool null_to_nan, bool row_ return Status::OK(); } +Status TableToTensor(const Table& table, bool null_to_nan, bool row_major, + MemoryPool* pool, std::shared_ptr* tensor) { + return ToTensorImpl(table, null_to_nan, row_major, pool, tensor); +} + +Status RecordBatchToTensor(const RecordBatch& batch, bool null_to_nan, bool row_major, + MemoryPool* pool, std::shared_ptr* tensor) { + return ToTensorImpl(batch, null_to_nan, row_major, pool, tensor); +} + } // namespace internal /// Constructor with strides and dimension names diff --git a/cpp/src/arrow/tensor.h b/cpp/src/arrow/tensor.h index beb62a11bdce..1300003c2985 100644 --- a/cpp/src/arrow/tensor.h +++ b/cpp/src/arrow/tensor.h @@ -77,6 +77,10 @@ Status ValidateTensorParameters(const std::shared_ptr& type, const std::vector& strides, const std::vector& dim_names); +ARROW_EXPORT +Status TableToTensor(const Table& table, bool null_to_nan, bool row_major, + MemoryPool* pool, std::shared_ptr* tensor); + ARROW_EXPORT Status RecordBatchToTensor(const RecordBatch& batch, bool null_to_nan, bool row_major, MemoryPool* pool, std::shared_ptr* tensor); diff --git 
a/cpp/src/arrow/tensor_benchmark.cc b/cpp/src/arrow/tensor_benchmark.cc index 91a9270ef347..30969995ee2b 100644 --- a/cpp/src/arrow/tensor_benchmark.cc +++ b/cpp/src/arrow/tensor_benchmark.cc @@ -18,6 +18,7 @@ #include "benchmark/benchmark.h" #include "arrow/record_batch.h" +#include "arrow/table.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" #include "arrow/type.h" @@ -51,6 +52,34 @@ static void BatchToTensorSimple(benchmark::State& state) { state.SetBytesProcessed(state.iterations() * ty->byte_width() * num_rows * num_cols); } +template +static void TableToTensorSimple(benchmark::State& state) { + using CType = typename ValueType::c_type; + std::shared_ptr ty = TypeTraits::type_singleton(); + + const int64_t num_cols = state.range(1); + const int64_t num_rows = state.range(0) / num_cols / sizeof(CType); + arrow::random::RandomArrayGenerator gen_{42}; + + std::vector> fields = {}; + std::vector> columns = {}; + + for (int64_t i = 0; i < num_cols; ++i) { + fields.push_back(field("f" + std::to_string(i), ty)); + ArrayVector arrays = {gen_.ArrayOf(ty, num_rows / 2), gen_.ArrayOf(ty, num_rows / 2)}; + auto chunks = std::make_shared(arrays, ty); + columns.push_back(chunks); + } + auto schema = std::make_shared(std::move(fields)); + auto table = Table::Make(schema, columns); + + for (auto _ : state) { + ASSERT_OK_AND_ASSIGN(auto tensor, table->ToTensor(/*null_to_nan=*/false, row_major)); + } + state.SetItemsProcessed(state.iterations() * num_rows * num_cols); + state.SetBytesProcessed(state.iterations() * ty->byte_width() * num_rows * num_cols); +} + void SetArgs(benchmark::internal::Benchmark* bench) { for (int64_t size : {kL1Size, kL2Size}) { for (int64_t num_columns : {3, 30, 300}) { @@ -65,4 +94,13 @@ BENCHMARK_TEMPLATE(BatchToTensorSimple, Int16Type)->Apply(SetArgs); BENCHMARK_TEMPLATE(BatchToTensorSimple, Int32Type)->Apply(SetArgs); BENCHMARK_TEMPLATE(BatchToTensorSimple, Int64Type)->Apply(SetArgs); +#define 
DECLARE_TABLE_TO_TENSOR_BENCHMARKS(row_major) \ + BENCHMARK_TEMPLATE(TableToTensorSimple, Int8Type, row_major)->Apply(SetArgs); \ + BENCHMARK_TEMPLATE(TableToTensorSimple, Int16Type, row_major)->Apply(SetArgs); \ + BENCHMARK_TEMPLATE(TableToTensorSimple, Int32Type, row_major)->Apply(SetArgs); \ + BENCHMARK_TEMPLATE(TableToTensorSimple, Int64Type, row_major)->Apply(SetArgs); + +DECLARE_TABLE_TO_TENSOR_BENCHMARKS(false); +DECLARE_TABLE_TO_TENSOR_BENCHMARKS(true); + } // namespace arrow diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index e96a7d84696d..767e21f01bda 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1139,6 +1139,9 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: const shared_ptr[CSchema]& schema, const vector[shared_ptr[CRecordBatch]]& batches) + CResult[shared_ptr[CTensor]] ToTensor(c_bool null_to_nan, c_bool row_major, + CMemoryPool* pool) const + int num_columns() int64_t num_rows() diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 2e04fa75b8b7..0c35c915015a 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -3629,7 +3629,7 @@ cdef class RecordBatch(_Tabular): b: [10,20,30,40,null] Convert a RecordBatch to row-major Tensor with null values - written as NaN values + written as ``NaN``: >>> batch.to_tensor(null_to_nan=True) @@ -3643,7 +3643,7 @@ cdef class RecordBatch(_Tabular): [ 4., 40.], [nan, nan]]) - Convert a RecordBatch to column-major Tensor + Convert a RecordBatch to column-major Tensor: >>> batch.to_tensor(null_to_nan=True, row_major=False) @@ -3659,15 +3659,11 @@ cdef class RecordBatch(_Tabular): """ self._assert_cpu() cdef: - shared_ptr[CRecordBatch] c_record_batch shared_ptr[CTensor] c_tensor CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) - c_record_batch = pyarrow_unwrap_batch(self) with nogil: - c_tensor = GetResultValue( - deref(c_record_batch).ToTensor(null_to_nan, - row_major, pool)) + 
c_tensor = GetResultValue(self.batch.ToTensor(null_to_nan, row_major, pool)) return pyarrow_wrap_tensor(c_tensor) def copy_to(self, destination): @@ -4989,7 +4985,7 @@ cdef class Table(_Tabular): animals: string ---- n_legs: [[2,4,5,100],[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"],["Flamingo","Horse","Brittle stars","Centipede"]] + animals: [["Flamingo",...,"Centipede"],["Flamingo",...,"Centipede"]] """ cdef: vector[shared_ptr[CRecordBatch]] c_batches @@ -5084,6 +5080,81 @@ cdef class Table(_Tabular): return result + def to_tensor(self, c_bool null_to_nan=False, c_bool row_major=True, MemoryPool memory_pool=None): + """ + Convert to a :class:`~pyarrow.Tensor`. + + Tables that can be converted have fields of type signed or unsigned integer or float, + including all bit-widths. + + ``null_to_nan`` is ``False`` by default and this method will raise an error in case + any nulls are present. Tables with nulls can be converted with ``null_to_nan`` set to + ``True``. In this case null values are converted to ``NaN`` and integer type arrays are + promoted to the appropriate float type. + + Parameters + ---------- + null_to_nan : bool, default False + Whether to write null values in the result as ``NaN``. + row_major : bool, default True + Whether resulting Tensor is row-major or column-major + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool + + Examples + -------- + >>> import pyarrow as pa + >>> table = pa.table( + ... [ + ... pa.chunked_array([[1, 2], [3, 4, None]], type=pa.int32()), + ... pa.chunked_array([[10, 20, 30], [40, None]], type=pa.float32()), + ... ], names = ["a", "b"] + ... 
) + + >>> table + pyarrow.Table + a: int32 + b: float + ---- + a: [[1,2],[3,4,null]] + b: [[10,20,30],[40,null]] + + Convert a Table to row-major Tensor with null values written as ``NaN``: + + >>> table.to_tensor(null_to_nan=True) + + type: double + shape: (5, 2) + strides: (16, 8) + >>> table.to_tensor(null_to_nan=True).to_numpy() + array([[ 1., 10.], + [ 2., 20.], + [ 3., 30.], + [ 4., 40.], + [nan, nan]]) + + Convert a Table to column-major Tensor + + >>> table.to_tensor(null_to_nan=True, row_major=False) + + type: double + shape: (5, 2) + strides: (8, 40) + >>> table.to_tensor(null_to_nan=True, row_major=False).to_numpy() + array([[ 1., 10.], + [ 2., 20.], + [ 3., 30.], + [ 4., 40.], + [nan, nan]]) + """ + cdef: + shared_ptr[CTensor] c_tensor + CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) + + with nogil: + c_tensor = GetResultValue(self.table.ToTensor(null_to_nan, row_major, pool)) + return pyarrow_wrap_tensor(c_tensor) + def to_reader(self, max_chunksize=None): """ Convert the Table to a RecordBatchReader. 
diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index b65fb7d952c8..57d6faa677bf 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -1080,37 +1080,44 @@ def test_recordbatch_to_tensor_uniform_float_16(): @pytest.mark.numpy -def test_recordbatch_to_tensor_mixed_type(): +@pytest.mark.parametrize( + ('cls'), + [ + (pa.Table), + (pa.RecordBatch) + ] +) +def test_to_tensor_mixed_type(cls): # uint16 + int16 = int32 arr1 = [1, 2, 3, 4, 5, 6, 7, 8, 9] arr2 = [10, 20, 30, 40, 50, 60, 70, 80, 90] arr3 = [100, 200, 300, np.nan, 500, 600, 700, 800, 900] - batch = pa.RecordBatch.from_arrays( + tabular = cls.from_arrays( [ pa.array(arr1, type=pa.uint16()), pa.array(arr2, type=pa.int16()), ], ["a", "b"] ) - result = batch.to_tensor(row_major=False) + result = tabular.to_tensor(row_major=False) x = np.column_stack([arr1, arr2]).astype(np.int32, order="F") expected = pa.Tensor.from_numpy(x) check_tensors(result, expected, pa.int32(), 18) - result = batch.to_tensor() + result = tabular.to_tensor() x = np.column_stack([arr1, arr2]).astype(np.int32, order="C") expected = pa.Tensor.from_numpy(x) check_tensors(result, expected, pa.int32(), 18) # uint16 + int16 + float32 = float64 - batch = pa.RecordBatch.from_arrays( + tabular = cls.from_arrays( [ pa.array(arr1, type=pa.uint16()), pa.array(arr2, type=pa.int16()), pa.array(arr3, type=pa.float32()), ], ["a", "b", "c"] ) - result = batch.to_tensor(row_major=False) + result = tabular.to_tensor(row_major=False) x = np.column_stack([arr1, arr2, arr3]).astype(np.float64, order="F") expected = pa.Tensor.from_numpy(x) @@ -1120,7 +1127,7 @@ def test_recordbatch_to_tensor_mixed_type(): assert result.shape == expected.shape assert result.strides == expected.strides - result = batch.to_tensor() + result = tabular.to_tensor() x = np.column_stack([arr1, arr2, arr3]).astype(np.float64, order="C") expected = pa.Tensor.from_numpy(x) @@ -1184,7 +1191,7 @@ def 
test_recordbatch_to_tensor_null(): ) with pytest.raises( pa.ArrowTypeError, - match="Can only convert a RecordBatch with no nulls." + match="Can only convert a Table or RecordBatch with no nulls." ): batch.to_tensor() @@ -1269,6 +1276,70 @@ def test_recordbatch_to_tensor_unsupported(): batch.to_tensor() +@pytest.mark.numpy +@pytest.mark.parametrize('typ_str', [ + "uint8", "uint16", "uint32", "uint64", + "int8", "int16", "int32", "int64", + "float32", "float64", +]) +def test_table_to_tensor_uniform_type(typ_str): + arr1 = [[1, 2, 3], [4, 5, 6, 7, 8, 9]] + arr2 = [[10, 20], [30, 40, 50, 60, 70, 80, 90]] + arr3 = [[100, 100, 100, 100, 100, 100], [100, 100, 100]] + table = pa.Table.from_arrays( + [ + pa.chunked_array(arr1, type=pa.from_numpy_dtype(typ_str)), + pa.chunked_array(arr2, type=pa.from_numpy_dtype(typ_str)), + pa.chunked_array(arr3, type=pa.from_numpy_dtype(typ_str)), + ], ["a", "b", "c"] + ) + + arr1_f = [1, 2, 3, 4, 5, 6, 7, 8, 9] + arr2_f = [10, 20, 30, 40, 50, 60, 70, 80, 90] + arr3_f = [100, 100, 100, 100, 100, 100, 100, 100, 100] + + result = table.to_tensor(row_major=False) + x = np.column_stack([arr1_f, arr2_f, arr3_f]).astype(typ_str, order="F") + expected = pa.Tensor.from_numpy(x) + check_tensors(result, expected, pa.from_numpy_dtype(typ_str), 27) + + result = table.to_tensor() + x = np.column_stack([arr1_f, arr2_f, arr3_f]).astype(typ_str, order="C") + expected = pa.Tensor.from_numpy(x) + check_tensors(result, expected, pa.from_numpy_dtype(typ_str), 27) + + # Test offset + table1 = table.slice(1) + arr1_f = [2, 3, 4, 5, 6, 7, 8, 9] + arr2_f = [20, 30, 40, 50, 60, 70, 80, 90] + arr3_f = [100, 100, 100, 100, 100, 100, 100, 100] + + result = table1.to_tensor(row_major=False) + x = np.column_stack([arr1_f, arr2_f, arr3_f]).astype(typ_str, order="F") + expected = pa.Tensor.from_numpy(x) + check_tensors(result, expected, pa.from_numpy_dtype(typ_str), 24) + + result = table1.to_tensor() + x = np.column_stack([arr1_f, arr2_f, arr3_f]).astype(typ_str, 
order="C") + expected = pa.Tensor.from_numpy(x) + check_tensors(result, expected, pa.from_numpy_dtype(typ_str), 24) + + table2 = table.slice(1, 5) + arr1_f = [2, 3, 4, 5, 6] + arr2_f = [20, 30, 40, 50, 60] + arr3_f = [100, 100, 100, 100, 100] + + result = table2.to_tensor(row_major=False) + x = np.column_stack([arr1_f, arr2_f, arr3_f]).astype(typ_str, order="F") + expected = pa.Tensor.from_numpy(x) + check_tensors(result, expected, pa.from_numpy_dtype(typ_str), 15) + + result = table2.to_tensor() + x = np.column_stack([arr1_f, arr2_f, arr3_f]).astype(typ_str, order="C") + expected = pa.Tensor.from_numpy(x) + check_tensors(result, expected, pa.from_numpy_dtype(typ_str), 15) + + def _table_like_slice_tests(factory): data = [ pa.array(range(5)),