From c8691ab99254af3a26f634b845ef15ac19415521 Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Tue, 28 May 2024 12:50:25 +0200 Subject: [PATCH 01/23] Add Table::ToTensor and bindings to Python with Python tests --- cpp/src/arrow/record_batch.h | 1 - cpp/src/arrow/table.cc | 210 +++++++++++++++++++ cpp/src/arrow/table.h | 13 ++ python/pyarrow/includes/libarrow.pxd | 3 + python/pyarrow/table.pxi | 85 +++++++- python/pyarrow/tests/test_table.py | 289 +++++++++++++++++++++++++++ 6 files changed, 598 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/record_batch.h b/cpp/src/arrow/record_batch.h index 0d1d2d4ac359..4601b1ba9d6a 100644 --- a/cpp/src/arrow/record_batch.h +++ b/cpp/src/arrow/record_batch.h @@ -94,7 +94,6 @@ class ARROW_EXPORT RecordBatch { /// /// Create a Tensor object with shape (number of rows, number of columns) and /// strides (type size in bytes, type size in bytes * number of rows). - /// Generated Tensor will have column-major layout. /// /// \param[in] null_to_nan if true, convert nulls to NaN /// \param[in] row_major if true, create row-major Tensor else column-major Tensor diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc index 68a8a1951f1c..c7e357ee03b9 100644 --- a/cpp/src/arrow/table.cc +++ b/cpp/src/arrow/table.cc @@ -36,11 +36,14 @@ #include "arrow/record_batch.h" #include "arrow/result.h" #include "arrow/status.h" +#include "arrow/tensor.h" #include "arrow/type.h" #include "arrow/type_fwd.h" #include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" #include "arrow/util/logging_internal.h" +#include "arrow/util/logging.h" +#include "arrow/util/unreachable.h" #include "arrow/util/vector.h" namespace arrow { @@ -346,6 +349,213 @@ Result> Table::FromChunkedStructArray( array->length()); } +template +struct ConvertChunksToTensorVisitor { + Out*& out_values; + const ArrayData& in_data; + + template + Status Visit(const T&) { + if constexpr (is_numeric(T::type_id)) { + using In = typename T::c_type; + auto in_values = 
ArraySpan(in_data).GetSpan(1, in_data.length); + + if (in_data.null_count == 0) { + if constexpr (std::is_same_v) { + memcpy(out_values, in_values.data(), in_values.size_bytes()); + out_values += in_values.size(); + } else { + for (In in_value : in_values) { + *out_values++ = static_cast(in_value); + } + } + } else { + for (int64_t i = 0; i < in_data.length; ++i) { + *out_values++ = + in_data.IsNull(i) ? static_cast(NAN) : static_cast(in_values[i]); + } + } + return Status::OK(); + } + Unreachable(); + } +}; + +template +struct ConvertChunksToTensorRowMajorVisitor { + Out*& out_values; + const ArrayData& in_data; + int num_cols; + int col_idx; + int chunk_idx; + + template + Status Visit(const T&) { + if constexpr (is_numeric(T::type_id)) { + using In = typename T::c_type; + auto in_values = ArraySpan(in_data).GetSpan(1, in_data.length); + + if (in_data.null_count == 0) { + for (int64_t data_idx = 0; data_idx < in_data.length; ++data_idx) { + out_values[(data_idx + chunk_idx) * num_cols + col_idx] = + static_cast(in_values[data_idx]); + } + } else { + for (int64_t data_idx = 0; data_idx < in_data.length; ++data_idx) { + out_values[(data_idx + chunk_idx) * num_cols + col_idx] = + in_data.IsNull(data_idx) ? 
static_cast(NAN) + : static_cast(in_values[data_idx]); + } + } + return Status::OK(); + } + Unreachable(); + } +}; + +template +inline void ConvertColumnsToTensor(const Table& table, uint8_t* out, bool row_major) { + using CType = typename arrow::TypeTraits::CType; + auto* out_values = reinterpret_cast(out); + + int i = 0; + for (const auto& column : table.columns()) { + int j = 0; + for (const auto& chunk : column->chunks()) { + if (row_major) { + ConvertChunksToTensorRowMajorVisitor visitor{out_values, *chunk->data(), + table.num_columns(), i, j}; + DCHECK_OK(VisitTypeInline(*column->type(), &visitor)); + j = j + static_cast(chunk->length()); + } else { + ConvertChunksToTensorVisitor visitor{out_values, *chunk->data()}; + DCHECK_OK(VisitTypeInline(*column->type(), &visitor)); + } + } + i++; + } +} + +Result> Table::ToTensor(bool null_to_nan, bool row_major, + MemoryPool* pool) const { + if (num_columns() == 0) { + return Status::TypeError( + "Conversion to Tensor for Table without columns/schema is not supported."); + } + // Check for no validity bitmap of each field + // if null_to_nan conversion is set to false + for (int i = 0; i < num_columns(); ++i) { + if (column(i)->null_count() > 0 && !null_to_nan) { + return Status::TypeError( + "Can only convert a Table with no nulls. 
Set null_to_nan to true to " + "convert nulls to NaN"); + } + } + + // Check for supported data types and merge fields + // to get the resulting uniform data type + if (!is_integer(column(0)->type()->id()) && !is_floating(column(0)->type()->id())) { + return Status::TypeError("DataType is not supported: ", + column(0)->type()->ToString()); + } + std::shared_ptr result_field = schema_->field(0); + std::shared_ptr result_type = result_field->type(); + + Field::MergeOptions options; + options.promote_integer_to_float = true; + options.promote_integer_sign = true; + options.promote_numeric_width = true; + + if (num_columns() > 1) { + for (int i = 1; i < num_columns(); ++i) { + if (!is_numeric(column(i)->type()->id())) { + return Status::TypeError("DataType is not supported: ", + column(i)->type()->ToString()); + } + + // Casting of float16 is not supported, throw an error in this case + if ((column(i)->type()->id() == Type::HALF_FLOAT || + result_field->type()->id() == Type::HALF_FLOAT) && + column(i)->type()->id() != result_field->type()->id()) { + return Status::NotImplemented("Casting from or to halffloat is not supported."); + } + + ARROW_ASSIGN_OR_RAISE( + result_field, result_field->MergeWith( + schema_->field(i)->WithName(result_field->name()), options)); + } + result_type = result_field->type(); + } + + // Check if result_type is signed or unsigned integer and null_to_nan is set to true + // Then all columns should be promoted to float type + if (is_integer(result_type->id()) && null_to_nan) { + ARROW_ASSIGN_OR_RAISE( + result_field, + result_field->MergeWith(arrow::field(result_field->name(), float32()), options)); + result_type = result_field->type(); + } + + // Allocate memory + ARROW_ASSIGN_OR_RAISE( + std::shared_ptr result, + AllocateBuffer(result_type->bit_width() * num_columns() * num_rows(), pool)); + // Copy data + switch (result_type->id()) { + case Type::UINT8: + ConvertColumnsToTensor(*this, result->mutable_data(), row_major); + break; + case 
Type::UINT16: + case Type::HALF_FLOAT: + ConvertColumnsToTensor(*this, result->mutable_data(), row_major); + break; + case Type::UINT32: + ConvertColumnsToTensor(*this, result->mutable_data(), row_major); + break; + case Type::UINT64: + ConvertColumnsToTensor(*this, result->mutable_data(), row_major); + break; + case Type::INT8: + ConvertColumnsToTensor(*this, result->mutable_data(), row_major); + break; + case Type::INT16: + ConvertColumnsToTensor(*this, result->mutable_data(), row_major); + break; + case Type::INT32: + ConvertColumnsToTensor(*this, result->mutable_data(), row_major); + break; + case Type::INT64: + ConvertColumnsToTensor(*this, result->mutable_data(), row_major); + break; + case Type::FLOAT: + ConvertColumnsToTensor(*this, result->mutable_data(), row_major); + break; + case Type::DOUBLE: + ConvertColumnsToTensor(*this, result->mutable_data(), row_major); + break; + default: + return Status::TypeError("DataType is not supported: ", result_type->ToString()); + } + + // Construct Tensor object + const auto& fixed_width_type = + internal::checked_cast(*result_type); + std::vector shape = {num_rows(), num_columns()}; + std::vector strides; + std::shared_ptr tensor; + + if (row_major) { + ARROW_RETURN_NOT_OK( + internal::ComputeRowMajorStrides(fixed_width_type, shape, &strides)); + } else { + ARROW_RETURN_NOT_OK( + internal::ComputeColumnMajorStrides(fixed_width_type, shape, &strides)); + } + ARROW_ASSIGN_OR_RAISE(tensor, + Tensor::Make(result_type, std::move(result), shape, strides)); + return tensor; +} + std::vector Table::ColumnNames() const { std::vector names(num_columns()); for (int i = 0; i < num_columns(); ++i) { diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h index dee6f6fdd3cb..f57e23aaf5dd 100644 --- a/cpp/src/arrow/table.h +++ b/cpp/src/arrow/table.h @@ -102,6 +102,19 @@ class ARROW_EXPORT Table { static Result> FromChunkedStructArray( const std::shared_ptr& array); + /// \brief Convert table with one data type to Tensor + /// + 
/// Create a Tensor object with shape (number of rows, number of columns) and + /// strides (type size in bytes, type size in bytes * number of rows). + /// + /// \param[in] null_to_nan if true, convert nulls to NaN + /// \param[in] row_major if true, create row-major Tensor else column-major Tensor + /// \param[in] pool the memory pool to allocate the tensor buffer + /// \return the resulting Tensor + Result> ToTensor( + bool null_to_nan = false, bool row_major = true, + MemoryPool* pool = default_memory_pool()) const; + /// \brief Return the table schema const std::shared_ptr& schema() const { return schema_; } diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index e96a7d84696d..767e21f01bda 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1139,6 +1139,9 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: const shared_ptr[CSchema]& schema, const vector[shared_ptr[CRecordBatch]]& batches) + CResult[shared_ptr[CTensor]] ToTensor(c_bool null_to_nan, c_bool row_major, + CMemoryPool* pool) const + int num_columns() int64_t num_rows() diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 2e04fa75b8b7..3299ccae9997 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -2292,7 +2292,8 @@ cdef class _Tabular(_PandasConvertible): >>> animals = pa.array(["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"]) >>> table = pa.Table.from_arrays([n_legs, animals], names=["n_legs", "animals"]) >>> table.to_pydict() - {'n_legs': [2, 2, 4, 4, 5, 100], 'animals': ['Flamingo', 'Parrot', ..., 'Centipede']} + {'n_legs': [2, 2, 4, 4, 5, 100], 'animals': [ + 'Flamingo', 'Parrot', ..., 'Centipede']} """ entries = [] for i in range(self.num_columns): @@ -4989,7 +4990,8 @@ cdef class Table(_Tabular): animals: string ---- n_legs: [[2,4,5,100],[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"],["Flamingo","Horse","Brittle 
stars","Centipede"]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"], + ["Flamingo","Horse","Brittle stars","Centipede"]] """ cdef: vector[shared_ptr[CRecordBatch]] c_batches @@ -5084,6 +5086,85 @@ cdef class Table(_Tabular): return result + def to_tensor(self, c_bool null_to_nan=False, c_bool row_major=True, MemoryPool memory_pool=None): + """ + Convert to a :class:`~pyarrow.Tensor`. + + Tables that can be converted have fields of type signed or unsigned integer or float, + including all bit-widths. + + ``null_to_nan`` is ``False`` by default and this method will raise an error in case + any nulls are present. Tables with nulls can be converted with ``null_to_nan`` set to + ``True``. In this case null values are converted to ``NaN`` and integer type arrays are + promoted to the appropriate float type. + + Parameters + ---------- + null_to_nan : bool, default False + Whether to write null values in the result as ``NaN``. + row_major : bool, default True + Whether resulting Tensor is row-major or column-major + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool + + Examples + -------- + >>> import pyarrow as pa + >>> table = pa.table( + ... [ + ... pa.chunked_array([[1, 2], [3, 4, None]], type=pa.int32()), + ... pa.chunked_array([[10, 20, 30], [40, None]], type=pa.float32()), + ... ], names = ["a", "b"] + ... 
) + + >>> table + pyarrow.Table + a: int32 + b: float + ---- + a: [[1,2],[3,4,null]] + b: [[10,20,30],[40,null]] + + Convert a Table to row-major Tensor with null values written as ``NaN``s: + + >>> table.to_tensor(null_to_nan=True) + + type: double + shape: (5, 2) + strides: (16, 8) + >>> table.to_tensor(null_to_nan=True).to_numpy() + array([[ 1., 10.], + [ 2., 20.], + [ 3., 30.], + [ 4., 40.], + [nan, nan]]) + + Convert a Table to column-major Tensor + + >>> table.to_tensor(null_to_nan=True, row_major=False) + + type: double + shape: (5, 2) + strides: (8, 40) + >>> table.to_tensor(null_to_nan=True, row_major=False).to_numpy() + array([[ 1., 10.], + [ 2., 20.], + [ 3., 30.], + [ 4., 40.], + [nan, nan]]) + """ + cdef: + shared_ptr[CTable] c_table + shared_ptr[CTensor] c_tensor + CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) + + c_table = pyarrow_unwrap_table(self) + with nogil: + c_tensor = GetResultValue( + deref(c_table).ToTensor(null_to_nan, + row_major, pool)) + return pyarrow_wrap_tensor(c_tensor) + def to_reader(self, max_chunksize=None): """ Convert the Table to a RecordBatchReader. 
diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index b65fb7d952c8..d12a61063bef 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -1269,6 +1269,295 @@ def test_recordbatch_to_tensor_unsupported(): batch.to_tensor() +@pytest.mark.parametrize('typ', [ + np.uint8, np.uint16, np.uint32, np.uint64, + np.int8, np.int16, np.int32, np.int64, + np.float32, np.float64, +]) +def test_table_to_tensor_uniform_type(typ): + arr1 = [[1, 2, 3], [4, 5, 6, 7, 8, 9]] + arr2 = [[10, 20], [30, 40, 50, 60, 70, 80, 90]] + arr3 = [[100, 100, 100, 100, 100, 100], [100, 100, 100]] + table = pa.Table.from_arrays( + [ + pa.chunked_array(arr1, type=pa.from_numpy_dtype(typ)), + pa.chunked_array(arr2, type=pa.from_numpy_dtype(typ)), + pa.chunked_array(arr3, type=pa.from_numpy_dtype(typ)), + ], ["a", "b", "c"] + ) + + arr1_f = [1, 2, 3, 4, 5, 6, 7, 8, 9] + arr2_f = [10, 20, 30, 40, 50, 60, 70, 80, 90] + arr3_f = [100, 100, 100, 100, 100, 100, 100, 100, 100] + + result = table.to_tensor(row_major=False) + x = np.column_stack([arr1_f, arr2_f, arr3_f]).astype(typ, order="F") + expected = pa.Tensor.from_numpy(x) + check_tensors(result, expected, pa.from_numpy_dtype(typ), 27) + + result = table.to_tensor() + x = np.column_stack([arr1_f, arr2_f, arr3_f]).astype(typ, order="C") + expected = pa.Tensor.from_numpy(x) + check_tensors(result, expected, pa.from_numpy_dtype(typ), 27) + + # Test offset + table1 = table.slice(1) + arr1_f = [2, 3, 4, 5, 6, 7, 8, 9] + arr2_f = [20, 30, 40, 50, 60, 70, 80, 90] + arr3_f = [100, 100, 100, 100, 100, 100, 100, 100] + + result = table1.to_tensor(row_major=False) + x = np.column_stack([arr1_f, arr2_f, arr3_f]).astype(typ, order="F") + expected = pa.Tensor.from_numpy(x) + check_tensors(result, expected, pa.from_numpy_dtype(typ), 24) + + result = table1.to_tensor() + x = np.column_stack([arr1_f, arr2_f, arr3_f]).astype(typ, order="C") + expected = pa.Tensor.from_numpy(x) + check_tensors(result, 
expected, pa.from_numpy_dtype(typ), 24) + + table2 = table.slice(1, 5) + arr1_f = [2, 3, 4, 5, 6] + arr2_f = [20, 30, 40, 50, 60] + arr3_f = [100, 100, 100, 100, 100] + + result = table2.to_tensor(row_major=False) + x = np.column_stack([arr1_f, arr2_f, arr3_f]).astype(typ, order="F") + expected = pa.Tensor.from_numpy(x) + check_tensors(result, expected, pa.from_numpy_dtype(typ), 15) + + result = table2.to_tensor() + x = np.column_stack([arr1_f, arr2_f, arr3_f]).astype(typ, order="C") + expected = pa.Tensor.from_numpy(x) + check_tensors(result, expected, pa.from_numpy_dtype(typ), 15) + + +def test_table_to_tensor_uniform_float_16(): + arr1 = [np.array([1, 2, 3], dtype=np.float16), + np.array([4, 5, 6, 7, 8, 9], dtype=np.float16)] + arr2 = [np.array([10, 20], dtype=np.float16), + np.array([30, 40, 50, 60, 70, 80, 90], dtype=np.float16)] + arr3 = [np.array([100, 100, 100, 100, 100, 100], dtype=np.float16), + np.array([100, 100, 100], dtype=np.float16)] + table = pa.Table.from_arrays( + [ + pa.chunked_array(arr1, type=pa.float16()), + pa.chunked_array(arr2, type=pa.float16()), + pa.chunked_array(arr3, type=pa.float16()), + ], ["a", "b", "c"] + ) + + arr1_f = [1, 2, 3, 4, 5, 6, 7, 8, 9] + arr2_f = [10, 20, 30, 40, 50, 60, 70, 80, 90] + arr3_f = [100, 100, 100, 100, 100, 100, 100, 100, 100] + + result = table.to_tensor(row_major=False) + x = np.column_stack([arr1_f, arr2_f, arr3_f]).astype(np.float16, order="F") + expected = pa.Tensor.from_numpy(x) + check_tensors(result, expected, pa.float16(), 27) + + result = table.to_tensor() + x = np.column_stack([arr1_f, arr2_f, arr3_f]).astype(np.float16, order="C") + expected = pa.Tensor.from_numpy(x) + check_tensors(result, expected, pa.float16(), 27) + + +def test_table_to_tensor_mixed_type(): + # uint16 + int16 = int32 + arr1 = [[1, 2, 3], [4, 5, 6, 7, 8, 9]] + arr2 = [[10, 20], [30, 40, 50, 60, 70, 80, 90]] + arr3 = [[100, 200, 300, np.nan, 500, 600], [700, 800, 900]] + table = pa.Table.from_arrays( + [ + 
pa.chunked_array(arr1, type=pa.uint16()), + pa.chunked_array(arr2, type=pa.int16()), + ], ["a", "b"] + ) + + arr1_f = [1, 2, 3, 4, 5, 6, 7, 8, 9] + arr2_f = [10, 20, 30, 40, 50, 60, 70, 80, 90] + arr3_f = [100, 200, 300, np.nan, 500, 600, 700, 800, 900] + + result = table.to_tensor(row_major=False) + x = np.column_stack([arr1_f, arr2_f]).astype(np.int32, order="F") + expected = pa.Tensor.from_numpy(x) + check_tensors(result, expected, pa.int32(), 18) + + result = table.to_tensor() + x = np.column_stack([arr1_f, arr2_f]).astype(np.int32, order="C") + expected = pa.Tensor.from_numpy(x) + check_tensors(result, expected, pa.int32(), 18) + + # uint16 + int16 + float32 = float64 + table = pa.Table.from_arrays( + [ + pa.chunked_array(arr1, type=pa.uint16()), + pa.chunked_array(arr2, type=pa.int16()), + pa.chunked_array(arr3, type=pa.float32()), + ], ["a", "b", "c"] + ) + result = table.to_tensor(row_major=False) + x = np.column_stack([arr1_f, arr2_f, arr3_f]).astype(np.float64, order="F") + expected = pa.Tensor.from_numpy(x) + + np.testing.assert_equal(result.to_numpy(), x) + assert result.size == 27 + assert result.type == pa.float64() + assert result.shape == expected.shape + assert result.strides == expected.strides + + result = table.to_tensor() + x = np.column_stack([arr1_f, arr2_f, arr3_f]).astype(np.float64, order="C") + expected = pa.Tensor.from_numpy(x) + + np.testing.assert_equal(result.to_numpy(), x) + assert result.size == 27 + assert result.type == pa.float64() + assert result.shape == expected.shape + assert result.strides == expected.strides + + +def test_table_to_tensor_unsupported_mixed_type_with_float16(): + arr1 = [[1, 2, 3], [4, 5, 6, 7, 8, 9]] + arr2 = [10, 20, 30, 40, 50, 60, 70, 80, 90] + arr3 = [[100, 200, 300, 400, 500, 600], [700, 800, 900]] + table = pa.Table.from_arrays( + [ + pa.chunked_array(arr1, type=pa.uint16()), + pa.chunked_array([np.array(arr2, dtype=np.float16)], type=pa.float16()), + pa.chunked_array(arr3, type=pa.float32()), + ], 
["a", "b", "c"] + ) + + with pytest.raises( + NotImplementedError, + match="Casting from or to halffloat is not supported." + ): + table.to_tensor() + + +def test_table_to_tensor_nan(): + arr1 = [[1, 2, 3], [4, np.nan, 6, 7, 8, 9]] + arr2 = [[10, 20], [30, 40, 50, 60, 70, np.nan, 90]] + table = pa.Table.from_arrays( + [ + pa.chunked_array(arr1, type=pa.float32()), + pa.chunked_array(arr2, type=pa.float32()), + ], ["a", "b"] + ) + + arr1_f = [1, 2, 3, 4, np.nan, 6, 7, 8, 9] + arr2_f = [10, 20, 30, 40, 50, 60, 70, np.nan, 90] + + result = table.to_tensor(row_major=False) + x = np.column_stack([arr1_f, arr2_f]).astype(np.float32, order="F") + expected = pa.Tensor.from_numpy(x) + + np.testing.assert_equal(result.to_numpy(), x) + assert result.size == 18 + assert result.type == pa.float32() + assert result.shape == expected.shape + assert result.strides == expected.strides + + +def test_table_to_tensor_null(): + arr1 = [[1, 2, 3], [4, None, 6, 7, 8, 9]] + arr2 = [[10, 20], [30, 40, 50, 60, 70, None, 90]] + table = pa.Table.from_arrays( + [ + pa.chunked_array(arr1, type=pa.int32()), + pa.chunked_array(arr2, type=pa.float32()), + ], ["a", "b"] + ) + with pytest.raises( + pa.ArrowTypeError, + match="Can only convert a Table with no nulls." 
+ ): + table.to_tensor() + + arr1_f = [1, 2, 3, 4, np.nan, 6, 7, 8, 9] + arr2_f = [10, 20, 30, 40, 50, 60, 70, np.nan, 90] + + result = table.to_tensor(null_to_nan=True, row_major=False) + x = np.column_stack([arr1_f, arr2_f]).astype(np.float64, order="F") + expected = pa.Tensor.from_numpy(x) + + np.testing.assert_equal(result.to_numpy(), x) + assert result.size == 18 + assert result.type == pa.float64() + assert result.shape == expected.shape + assert result.strides == expected.strides + + # int32 -> float64 + table = pa.Table.from_arrays( + [ + pa.chunked_array(arr1, type=pa.int32()), + pa.chunked_array(arr2, type=pa.int32()), + ], ["a", "b"] + ) + + result = table.to_tensor(null_to_nan=True, row_major=False) + + np.testing.assert_equal(result.to_numpy(), x) + assert result.size == 18 + assert result.type == pa.float64() + assert result.shape == expected.shape + assert result.strides == expected.strides + + # int8 -> float32 + table = pa.Table.from_arrays( + [ + pa.chunked_array(arr1, type=pa.int8()), + pa.chunked_array(arr2, type=pa.int8()), + ], ["a", "b"] + ) + + result = table.to_tensor(null_to_nan=True, row_major=False) + x = np.column_stack([arr1_f, arr2_f]).astype(np.float32, order="F") + expected = pa.Tensor.from_numpy(x) + + np.testing.assert_equal(result.to_numpy(), x) + assert result.size == 18 + assert result.type == pa.float32() + assert result.shape == expected.shape + assert result.strides == expected.strides + + +def test_table_to_tensor_empty(): + table = pa.Table.from_arrays( + [ + pa.chunked_array([], type=pa.float32()), + pa.chunked_array([], type=pa.float32()), + ], ["a", "b"] + ) + result = table.to_tensor() + + x = np.column_stack([[], []]).astype(np.float32, order="F") + expected = pa.Tensor.from_numpy(x) + + assert result.size == expected.size + assert result.type == pa.float32() + assert result.shape == expected.shape + assert result.strides == (4, 4) + + +def test_table_to_tensor_unsupported(): + arr1 = [[1, 2, 3], [4, 5, 6, 7, 8, 9]] + 
# Unsupported data type + arr2 = [["a", "b", "c", "a"], ["b", "c", "a", "b", "c"]] + table = pa.Table.from_arrays( + [ + pa.chunked_array(arr1, type=pa.int32()), + pa.chunked_array(arr2, type=pa.utf8()), + ], ["a", "b"] + ) + with pytest.raises( + pa.ArrowTypeError, + match="DataType is not supported" + ): + table.to_tensor() + + def _table_like_slice_tests(factory): data = [ pa.array(range(5)), From cd74794fa3b4c342592c33d336a1721b348cc4f4 Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Wed, 29 May 2024 09:16:36 +0200 Subject: [PATCH 02/23] Add C++ tests --- cpp/src/arrow/table.cc | 2 +- cpp/src/arrow/table_test.cc | 529 ++++++++++++++++++++++++++++++++++++ 2 files changed, 530 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc index c7e357ee03b9..42fc9d0ce894 100644 --- a/cpp/src/arrow/table.cc +++ b/cpp/src/arrow/table.cc @@ -440,7 +440,7 @@ Result> Table::ToTensor(bool null_to_nan, bool row_major MemoryPool* pool) const { if (num_columns() == 0) { return Status::TypeError( - "Conversion to Tensor for Table without columns/schema is not supported."); + "Conversion to Tensor for Tables without columns/schema is not supported."); } // Check for no validity bitmap of each field // if null_to_nan conversion is set to false diff --git a/cpp/src/arrow/table_test.cc b/cpp/src/arrow/table_test.cc index 692671910b89..c08a5dca81c2 100644 --- a/cpp/src/arrow/table_test.cc +++ b/cpp/src/arrow/table_test.cc @@ -33,6 +33,7 @@ #include "arrow/compute/cast.h" #include "arrow/record_batch.h" #include "arrow/status.h" +#include "arrow/tensor.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" #include "arrow/type.h" @@ -520,6 +521,534 @@ TEST_F(TestTable, ConcatenateTables) { ASSERT_RAISES(Invalid, ConcatenateTables({t1, t3})); } +TEST_F(TestTable, ToTensorUnsupportedType) { + auto f0 = field("f0", int32()); + // Unsupported data type + auto f1 = field("f1", utf8()); + + std::vector> fields = {f0, f1}; + auto schema = 
::arrow::schema(fields); + + auto a0 = ChunkedArrayFromJSON(int32(), {"[1, 2, 3]", "[4, 5, 6, 7, 8, 9]"}); + auto a1 = ChunkedArrayFromJSON( + utf8(), {R"(["a", "b", "c", "a", "b"])", R"(["c", "a", "b", "c"])"}); + + auto table = Table::Make(schema, {a0, a1}); + + ASSERT_RAISES_WITH_MESSAGE( + TypeError, "Type error: DataType is not supported: " + a1->type()->ToString(), + table->ToTensor()); + + // Unsupported boolean data type + auto f2 = field("f2", boolean()); + + std::vector> fields2 = {f0, f2}; + auto schema2 = ::arrow::schema(fields2); + auto a2 = ChunkedArrayFromJSON( + boolean(), {"[true, false, true, true, false, true, false, true, true]"}); + auto table2 = Table::Make(schema2, {a0, a2}); + + ASSERT_RAISES_WITH_MESSAGE( + TypeError, "Type error: DataType is not supported: " + a2->type()->ToString(), + table2->ToTensor()); +} + +TEST_F(TestTable, ToTensorUnsupportedMissing) { + auto f0 = field("f0", int32()); + auto f1 = field("f1", int32()); + + std::vector> fields = {f0, f1}; + auto schema = ::arrow::schema(fields); + + auto a0 = ChunkedArrayFromJSON(int32(), {"[1, 2, 3]", "[4, 5, 6, 7, 8, 9]"}); + auto a1 = ChunkedArrayFromJSON(int32(), {"[10, 20]", "[30, 40, null, 60, 70, 80, 90]"}); + + auto table = Table::Make(schema, {a0, a1}); + + ASSERT_RAISES_WITH_MESSAGE(TypeError, + "Type error: Can only convert a Table with no nulls. 
Set " + "null_to_nan to true to convert nulls to NaN", + table->ToTensor()); +} + +TEST_F(TestTable, ToTensorEmptyTable) { + auto f0 = field("f0", int32()); + auto f1 = field("f1", int32()); + + std::vector> fields = {f0, f1}; + auto schema = ::arrow::schema(fields); + + ASSERT_OK_AND_ASSIGN(std::shared_ptr empty, Table::MakeEmpty(schema)); + + ASSERT_OK_AND_ASSIGN(auto tensor_column, + empty->ToTensor(/*null_to_nan=*/false, /*row_major=*/false)); + ASSERT_OK(tensor_column->Validate()); + + ASSERT_OK_AND_ASSIGN(auto tensor_row, empty->ToTensor()); + ASSERT_OK(tensor_row->Validate()); + + const std::vector strides = {4, 4}; + const std::vector shape = {0, 2}; + + EXPECT_EQ(strides, tensor_column->strides()); + EXPECT_EQ(shape, tensor_column->shape()); + EXPECT_EQ(strides, tensor_row->strides()); + EXPECT_EQ(shape, tensor_row->shape()); + + std::vector> columns; + auto t2 = Table::Make(::arrow::schema({}), columns); + auto table_no_columns = + Table::Make(::arrow::schema({}), std::vector>{}); + + ASSERT_RAISES_WITH_MESSAGE(TypeError, + "Type error: Conversion to Tensor for Tables without " + "columns/schema is not supported.", + table_no_columns->ToTensor()); +} + +template +void CheckTensor(const std::shared_ptr& tensor, const int size, + const std::vector shape, const std::vector f_strides) { + EXPECT_EQ(size, tensor->size()); + EXPECT_EQ(TypeTraits::type_singleton(), tensor->type()); + EXPECT_EQ(shape, tensor->shape()); + EXPECT_EQ(f_strides, tensor->strides()); + EXPECT_FALSE(tensor->is_row_major()); + EXPECT_TRUE(tensor->is_column_major()); + EXPECT_TRUE(tensor->is_contiguous()); +} + +template +void CheckTensorRowMajor(const std::shared_ptr& tensor, const int size, + const std::vector shape, + const std::vector strides) { + EXPECT_EQ(size, tensor->size()); + EXPECT_EQ(TypeTraits::type_singleton(), tensor->type()); + EXPECT_EQ(shape, tensor->shape()); + EXPECT_EQ(strides, tensor->strides()); + EXPECT_TRUE(tensor->is_row_major()); + 
EXPECT_FALSE(tensor->is_column_major()); + EXPECT_TRUE(tensor->is_contiguous()); +} + +TEST_F(TestTable, ToTensorSupportedNaN) { + auto f0 = field("f0", float32()); + auto f1 = field("f1", float32()); + + std::vector> fields = {f0, f1}; + auto schema = ::arrow::schema(fields); + + auto a0 = ChunkedArrayFromJSON(float32(), {"[NaN, 2, 3]", "[4, 5, 6, 7, 8, 9]"}); + auto a1 = + ChunkedArrayFromJSON(float32(), {"[10, 20]", "[30, 40, NaN, 60, 70, 80, 90]"}); + + auto table = Table::Make(schema, {a0, a1}); + + ASSERT_OK_AND_ASSIGN(auto tensor, + table->ToTensor(/*null_to_nan=*/false, /*row_major=*/false)); + ASSERT_OK(tensor->Validate()); + + std::vector shape = {9, 2}; + const int64_t f32_size = sizeof(float); + std::vector f_strides = {f32_size, f32_size * shape[0]}; + std::shared_ptr tensor_expected = TensorFromJSON( + float32(), "[NaN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, NaN, 60, 70, 80, 90]", + shape, f_strides); + + EXPECT_FALSE(tensor_expected->Equals(*tensor)); + EXPECT_TRUE(tensor_expected->Equals(*tensor, EqualOptions().nans_equal(true))); + CheckTensor(tensor, 18, shape, f_strides); +} + +TEST_F(TestTable, ToTensorSupportedNullToNan) { + // int32 + float32 = float64 + auto f0 = field("f0", int32()); + auto f1 = field("f1", float32()); + + std::vector> fields = {f0, f1}; + auto schema = ::arrow::schema(fields); + + auto a0 = ChunkedArrayFromJSON(int32(), {"[null, 2, 3]", "[4, 5, 6, 7, 8, 9]"}); + auto a1 = + ChunkedArrayFromJSON(float32(), {"[10, 20]", "[30, 40, null, 60, 70, 80, 90]"}); + + auto table = Table::Make(schema, {a0, a1}); + + ASSERT_OK_AND_ASSIGN(auto tensor, + table->ToTensor(/*null_to_nan=*/true, /*row_major=*/false)); + ASSERT_OK(tensor->Validate()); + + std::vector shape = {9, 2}; + const int64_t f64_size = sizeof(double); + std::vector f_strides = {f64_size, f64_size * shape[0]}; + std::shared_ptr tensor_expected = TensorFromJSON( + float64(), "[NaN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, NaN, 60, 70, 80, 90]", + shape, f_strides); + + 
EXPECT_FALSE(tensor_expected->Equals(*tensor)); + EXPECT_TRUE(tensor_expected->Equals(*tensor, EqualOptions().nans_equal(true))); + + CheckTensor(tensor, 18, shape, f_strides); + + ASSERT_OK_AND_ASSIGN(auto tensor_row, table->ToTensor(/*null_to_nan=*/true)); + ASSERT_OK(tensor_row->Validate()); + + std::vector strides = {f64_size * shape[1], f64_size}; + std::shared_ptr tensor_expected_row = TensorFromJSON( + float64(), "[NaN, 10, 2, 20, 3, 30, 4, 40, 5, NaN, 6, 60, 7, 70, 8, 80, 9, 90]", + shape, strides); + + EXPECT_FALSE(tensor_expected_row->Equals(*tensor_row)); + EXPECT_TRUE(tensor_expected_row->Equals(*tensor_row, EqualOptions().nans_equal(true))); + + CheckTensorRowMajor(tensor_row, 18, shape, strides); + + // int32 -> float64 + auto f2 = field("f2", int32()); + + std::vector> fields1 = {f0, f2}; + auto schema1 = ::arrow::schema(fields1); + + auto a2 = ChunkedArrayFromJSON(int32(), {"[10, 20]", "[30, 40, null, 60, 70, 80, 90]"}); + auto table1 = Table::Make(schema1, {a0, a2}); + + ASSERT_OK_AND_ASSIGN(auto tensor1, + table1->ToTensor(/*null_to_nan=*/true, /*row_major=*/false)); + ASSERT_OK(tensor1->Validate()); + + EXPECT_FALSE(tensor_expected->Equals(*tensor1)); + EXPECT_TRUE(tensor_expected->Equals(*tensor1, EqualOptions().nans_equal(true))); + + CheckTensor(tensor1, 18, shape, f_strides); + + ASSERT_OK_AND_ASSIGN(auto tensor1_row, table1->ToTensor(/*null_to_nan=*/true)); + ASSERT_OK(tensor1_row->Validate()); + + EXPECT_FALSE(tensor_expected_row->Equals(*tensor1_row)); + EXPECT_TRUE(tensor_expected_row->Equals(*tensor1_row, EqualOptions().nans_equal(true))); + + CheckTensorRowMajor(tensor1_row, 18, shape, strides); + + // int8 -> float32 + auto f3 = field("f3", int8()); + auto f4 = field("f4", int8()); + + std::vector> fields2 = {f3, f4}; + auto schema2 = ::arrow::schema(fields2); + + auto a3 = ChunkedArrayFromJSON(int8(), {"[null, 2, 3]", "[4, 5, 6, 7, 8, 9]"}); + auto a4 = ChunkedArrayFromJSON(int8(), {"[10, 20]", "[30, 40, null, 60, 70, 80, 90]"}); + 
auto table2 = Table::Make(schema2, {a3, a4}); + + ASSERT_OK_AND_ASSIGN(auto tensor2, + table2->ToTensor(/*null_to_nan=*/true, /*row_major=*/false)); + ASSERT_OK(tensor2->Validate()); + + const int64_t f32_size = sizeof(float); + std::vector f_strides_2 = {f32_size, f32_size * shape[0]}; + std::shared_ptr tensor_expected_2 = TensorFromJSON( + float32(), "[NaN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, NaN, 60, 70, 80, 90]", + shape, f_strides_2); + + EXPECT_FALSE(tensor_expected_2->Equals(*tensor2)); + EXPECT_TRUE(tensor_expected_2->Equals(*tensor2, EqualOptions().nans_equal(true))); + + CheckTensor(tensor2, 18, shape, f_strides_2); + + ASSERT_OK_AND_ASSIGN(auto tensor2_row, table2->ToTensor(/*null_to_nan=*/true)); + ASSERT_OK(tensor2_row->Validate()); + + std::vector strides_2 = {f32_size * shape[1], f32_size}; + std::shared_ptr tensor2_expected_row = TensorFromJSON( + float32(), "[NaN, 10, 2, 20, 3, 30, 4, 40, 5, NaN, 6, 60, 7, 70, 8, 80, 9, 90]", + shape, strides_2); + + EXPECT_FALSE(tensor2_expected_row->Equals(*tensor2_row)); + EXPECT_TRUE( + tensor2_expected_row->Equals(*tensor2_row, EqualOptions().nans_equal(true))); + + CheckTensorRowMajor(tensor2_row, 18, shape, strides_2); +} + +TEST_F(TestTable, ToTensorSupportedTypesMixed) { + auto f0 = field("f0", uint16()); + auto f1 = field("f1", int16()); + auto f2 = field("f2", float32()); + + auto a0 = ChunkedArrayFromJSON(uint16(), {"[1, 2, 3]", "[4, 5, 6, 7, 8, 9]"}); + auto a1 = ChunkedArrayFromJSON(int16(), {"[10, 20]", "[30, 40, 50, 60, 70, 80, 90]"}); + auto a2 = ChunkedArrayFromJSON(float32(), + {"[100, 200, 300, NaN, 500, 600]", "[700, 800, 900]"}); + + // Single column + std::vector> fields = {f0}; + auto schema = ::arrow::schema(fields); + auto table = Table::Make(schema, {a0}); + + ASSERT_OK_AND_ASSIGN(auto tensor, + table->ToTensor(/*null_to_nan=*/false, /*row_major=*/false)); + ASSERT_OK(tensor->Validate()); + + std::vector shape = {9, 1}; + const int64_t uint16_size = sizeof(uint16_t); + std::vector 
f_strides = {uint16_size, uint16_size * shape[0]}; + std::shared_ptr tensor_expected = + TensorFromJSON(uint16(), "[1, 2, 3, 4, 5, 6, 7, 8, 9]", shape, f_strides); + + EXPECT_TRUE(tensor_expected->Equals(*tensor)); + CheckTensor(tensor, 9, shape, f_strides); + + // uint16 + int16 = int32 + std::vector> fields1 = {f0, f1}; + auto schema1 = ::arrow::schema(fields1); + auto table1 = Table::Make(schema1, {a0, a1}); + + ASSERT_OK_AND_ASSIGN(auto tensor1, + table1->ToTensor(/*null_to_nan=*/false, /*row_major=*/false)); + ASSERT_OK(tensor1->Validate()); + + std::vector shape1 = {9, 2}; + const int64_t int32_size = sizeof(int32_t); + std::vector f_strides_1 = {int32_size, int32_size * shape1[0]}; + std::shared_ptr tensor_expected_1 = TensorFromJSON( + int32(), "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80, 90]", + shape1, f_strides_1); + + EXPECT_TRUE(tensor_expected_1->Equals(*tensor1)); + + CheckTensor(tensor1, 18, shape1, f_strides_1); + + ASSERT_EQ(tensor1->type()->bit_width(), tensor_expected_1->type()->bit_width()); + + ASSERT_EQ(1, tensor_expected_1->Value({0, 0})); + ASSERT_EQ(2, tensor_expected_1->Value({1, 0})); + ASSERT_EQ(10, tensor_expected_1->Value({0, 1})); + + // uint16 + int16 + float32 = float64 + std::vector> fields2 = {f0, f1, f2}; + auto schema2 = ::arrow::schema(fields2); + auto table2 = Table::Make(schema2, {a0, a1, a2}); + + ASSERT_OK_AND_ASSIGN(auto tensor2, + table2->ToTensor(/*null_to_nan=*/false, /*row_major=*/false)); + ASSERT_OK(tensor2->Validate()); + + std::vector shape2 = {9, 3}; + const int64_t f64_size = sizeof(double); + std::vector f_strides_2 = {f64_size, f64_size * shape2[0]}; + std::shared_ptr tensor_expected_2 = + TensorFromJSON(float64(), + "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50," + "60, 70, 80, 90, 100, 200, 300, NaN, 500, 600, 700, 800, 900]", + shape2, f_strides_2); + + EXPECT_FALSE(tensor_expected_2->Equals(*tensor2)); + EXPECT_TRUE(tensor_expected_2->Equals(*tensor2, EqualOptions().nans_equal(true))); + 
+ CheckTensor(tensor2, 27, shape2, f_strides_2); +} + +TEST_F(TestTable, ToTensorUnsupportedMixedFloat16) { + auto f0 = field("f0", float16()); + auto f1 = field("f1", float64()); + + auto a0 = ChunkedArrayFromJSON(float16(), {"[1, 2, 3]", "[4, 5, 6, 7, 8, 9]"}); + auto a1 = ChunkedArrayFromJSON(float64(), {"[10, 20]", "[30, 40, 50, 60, 70, 80, 90]"}); + + std::vector> fields = {f0, f1}; + auto schema = ::arrow::schema(fields); + auto table = Table::Make(schema, {a0, a1}); + + ASSERT_RAISES_WITH_MESSAGE( + NotImplemented, "NotImplemented: Casting from or to halffloat is not supported.", + table->ToTensor()); + + std::vector> fields1 = {f1, f0}; + auto schema1 = ::arrow::schema(fields1); + auto table1 = Table::Make(schema1, {a1, a0}); + + ASSERT_RAISES_WITH_MESSAGE( + NotImplemented, "NotImplemented: Casting from or to halffloat is not supported.", + table1->ToTensor()); +} + +template +class TestTableToTensorColumnMajor : public ::testing::Test {}; + +TYPED_TEST_SUITE_P(TestTableToTensorColumnMajor); + +TYPED_TEST_P(TestTableToTensorColumnMajor, SupportedTypes) { + using DataType = TypeParam; + using c_data_type = typename DataType::c_type; + const int unit_size = sizeof(c_data_type); + + auto f0 = field("f0", TypeTraits::type_singleton()); + auto f1 = field("f1", TypeTraits::type_singleton()); + auto f2 = field("f2", TypeTraits::type_singleton()); + + std::vector> fields = {f0, f1, f2}; + auto schema = ::arrow::schema(fields); + + auto a0 = ChunkedArrayFromJSON(TypeTraits::type_singleton(), + {"[1, 2, 3]", "[4, 5, 6, 7, 8, 9]"}); + auto a1 = ChunkedArrayFromJSON(TypeTraits::type_singleton(), + {"[10, 20]", "[30, 40, 50, 60, 70, 80, 90]"}); + auto a2 = ChunkedArrayFromJSON(TypeTraits::type_singleton(), + {"[100, 100, 100, 100, 100, 100]", "[100, 100, 100]"}); + + auto table = Table::Make(schema, {a0, a1, a2}); + + ASSERT_OK_AND_ASSIGN(auto tensor, + table->ToTensor(/*null_to_nan=*/false, /*row_major=*/false)); + ASSERT_OK(tensor->Validate()); + + std::vector shape 
= {9, 3}; + std::vector f_strides = {unit_size, unit_size * shape[0]}; + std::shared_ptr tensor_expected = TensorFromJSON( + TypeTraits::type_singleton(), + "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, " + "80, 90, 100, 100, 100, 100, 100, 100, 100, 100, 100]", + shape, f_strides); + + EXPECT_TRUE(tensor_expected->Equals(*tensor)); + CheckTensor(tensor, 27, shape, f_strides); + + // Test offsets + auto table_slice = table->Slice(1); + + ASSERT_OK_AND_ASSIGN(auto tensor_sliced, table_slice->ToTensor(/*null_to_nan=*/false, + /*row_major=*/false)); + ASSERT_OK(tensor_sliced->Validate()); + + std::vector shape_sliced = {8, 3}; + std::vector f_strides_sliced = {unit_size, unit_size * shape_sliced[0]}; + std::shared_ptr tensor_expected_sliced = + TensorFromJSON(TypeTraits::type_singleton(), + "[2, 3, 4, 5, 6, 7, 8, 9, 20, 30, 40, 50, 60, " + "70, 80, 90, 100, 100, 100, 100, 100, 100, 100, 100]", + shape_sliced, f_strides_sliced); + + EXPECT_TRUE(tensor_expected_sliced->Equals(*tensor_sliced)); + CheckTensor(tensor_expected_sliced, 24, shape_sliced, f_strides_sliced); + + auto table_slice_1 = table->Slice(1, 5); + + ASSERT_OK_AND_ASSIGN( + auto tensor_sliced_1, + table_slice_1->ToTensor(/*null_to_nan=*/false, /*row_major=*/false)); + ASSERT_OK(tensor_sliced_1->Validate()); + + std::vector shape_sliced_1 = {5, 3}; + std::vector f_strides_sliced_1 = {unit_size, unit_size * shape_sliced_1[0]}; + std::shared_ptr tensor_expected_sliced_1 = + TensorFromJSON(TypeTraits::type_singleton(), + "[2, 3, 4, 5, 6, 20, 30, 40, 50, 60, 100, 100, 100, 100, 100]", + shape_sliced_1, f_strides_sliced_1); + + EXPECT_TRUE(tensor_expected_sliced_1->Equals(*tensor_sliced_1)); + CheckTensor(tensor_expected_sliced_1, 15, shape_sliced_1, f_strides_sliced_1); +} + +REGISTER_TYPED_TEST_SUITE_P(TestTableToTensorColumnMajor, SupportedTypes); + +INSTANTIATE_TYPED_TEST_SUITE_P(UInt8, TestTableToTensorColumnMajor, UInt8Type); +INSTANTIATE_TYPED_TEST_SUITE_P(UInt16, TestTableToTensorColumnMajor, 
UInt16Type); +INSTANTIATE_TYPED_TEST_SUITE_P(UInt32, TestTableToTensorColumnMajor, UInt32Type); +INSTANTIATE_TYPED_TEST_SUITE_P(UInt64, TestTableToTensorColumnMajor, UInt64Type); +INSTANTIATE_TYPED_TEST_SUITE_P(Int8, TestTableToTensorColumnMajor, Int8Type); +INSTANTIATE_TYPED_TEST_SUITE_P(Int16, TestTableToTensorColumnMajor, Int16Type); +INSTANTIATE_TYPED_TEST_SUITE_P(Int32, TestTableToTensorColumnMajor, Int32Type); +INSTANTIATE_TYPED_TEST_SUITE_P(Int64, TestTableToTensorColumnMajor, Int64Type); +INSTANTIATE_TYPED_TEST_SUITE_P(Float16, TestTableToTensorColumnMajor, HalfFloatType); +INSTANTIATE_TYPED_TEST_SUITE_P(Float32, TestTableToTensorColumnMajor, FloatType); +INSTANTIATE_TYPED_TEST_SUITE_P(Float64, TestTableToTensorColumnMajor, DoubleType); + +template +class TestTableToTensorRowMajor : public ::testing::Test {}; + +TYPED_TEST_SUITE_P(TestTableToTensorRowMajor); + +TYPED_TEST_P(TestTableToTensorRowMajor, SupportedTypes) { + using DataType = TypeParam; + using c_data_type = typename DataType::c_type; + const int unit_size = sizeof(c_data_type); + + auto f0 = field("f0", TypeTraits::type_singleton()); + auto f1 = field("f1", TypeTraits::type_singleton()); + auto f2 = field("f2", TypeTraits::type_singleton()); + + std::vector> fields = {f0, f1, f2}; + auto schema = ::arrow::schema(fields); + + auto a0 = ChunkedArrayFromJSON(TypeTraits::type_singleton(), + {"[1, 2, 3]", "[4, 5, 6, 7, 8, 9]"}); + auto a1 = ChunkedArrayFromJSON(TypeTraits::type_singleton(), + {"[10, 20]", "[30, 40, 50, 60, 70, 80, 90]"}); + auto a2 = ChunkedArrayFromJSON(TypeTraits::type_singleton(), + {"[100, 100, 100, 100, 100, 100]", "[100, 100, 100]"}); + + auto table = Table::Make(schema, {a0, a1, a2}); + + ASSERT_OK_AND_ASSIGN(auto tensor, table->ToTensor()); + ASSERT_OK(tensor->Validate()); + + std::vector shape = {9, 3}; + std::vector strides = {unit_size * shape[1], unit_size}; + std::shared_ptr tensor_expected = + TensorFromJSON(TypeTraits::type_singleton(), + "[1, 10, 100, 2, 20, 100, 3, 
30, 100, 4, 40, 100, 5, 50, 100, 6, " + "60, 100, 7, 70, 100, 8, 80, 100, 9, 90, 100]", shape, strides); + + EXPECT_TRUE(tensor_expected->Equals(*tensor)); + CheckTensorRowMajor(tensor, 27, shape, strides); + + // Test offsets + auto table_slice = table->Slice(1); + + ASSERT_OK_AND_ASSIGN(auto tensor_sliced, table_slice->ToTensor()); + ASSERT_OK(tensor_sliced->Validate()); + + std::vector shape_sliced = {8, 3}; + std::vector strides_sliced = {unit_size * shape[1], unit_size}; + std::shared_ptr tensor_expected_sliced = + TensorFromJSON(TypeTraits::type_singleton(), + "[2, 20, 100, 3, 30, 100, 4, 40, 100, 5, 50, 100, 6, " + "60, 100, 7, 70, 100, 8, 80, 100, 9, 90, 100]", + shape_sliced, strides_sliced); + + EXPECT_TRUE(tensor_expected_sliced->Equals(*tensor_sliced)); + CheckTensorRowMajor(tensor_sliced, 24, shape_sliced, strides_sliced); + + auto table_slice_1 = table->Slice(1, 5); + + ASSERT_OK_AND_ASSIGN(auto tensor_sliced_1, table_slice_1->ToTensor()); + ASSERT_OK(tensor_sliced_1->Validate()); + + std::vector shape_sliced_1 = {5, 3}; + std::vector strides_sliced_1 = {unit_size * shape_sliced_1[1], unit_size}; + std::shared_ptr tensor_expected_sliced_1 = + TensorFromJSON(TypeTraits::type_singleton(), + "[2, 20, 100, 3, 30, 100, 4, 40, 100, 5, 50, 100, 6, 60, 100]", + shape_sliced_1, strides_sliced_1); + + EXPECT_TRUE(tensor_expected_sliced_1->Equals(*tensor_sliced_1)); + CheckTensorRowMajor(tensor_sliced_1, 15, shape_sliced_1, strides_sliced_1); +} + +REGISTER_TYPED_TEST_SUITE_P(TestTableToTensorRowMajor, SupportedTypes); + +INSTANTIATE_TYPED_TEST_SUITE_P(UInt8, TestTableToTensorRowMajor, UInt8Type); +INSTANTIATE_TYPED_TEST_SUITE_P(UInt16, TestTableToTensorRowMajor, UInt16Type); +INSTANTIATE_TYPED_TEST_SUITE_P(UInt32, TestTableToTensorRowMajor, UInt32Type); +INSTANTIATE_TYPED_TEST_SUITE_P(UInt64, TestTableToTensorRowMajor, UInt64Type); +INSTANTIATE_TYPED_TEST_SUITE_P(Int8, TestTableToTensorRowMajor, Int8Type); +INSTANTIATE_TYPED_TEST_SUITE_P(Int16, 
TestTableToTensorRowMajor, Int16Type); +INSTANTIATE_TYPED_TEST_SUITE_P(Int32, TestTableToTensorRowMajor, Int32Type); +INSTANTIATE_TYPED_TEST_SUITE_P(Int64, TestTableToTensorRowMajor, Int64Type); +INSTANTIATE_TYPED_TEST_SUITE_P(Float16, TestTableToTensorRowMajor, HalfFloatType); +INSTANTIATE_TYPED_TEST_SUITE_P(Float32, TestTableToTensorRowMajor, FloatType); +INSTANTIATE_TYPED_TEST_SUITE_P(Float64, TestTableToTensorRowMajor, DoubleType); + std::shared_ptr
MakeTableWithOneNullFilledColumn( const std::string& column_name, const std::shared_ptr& data_type, const int length) { From 09843d7c4e94db51854173312a19b716514ebcf9 Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Wed, 29 May 2024 11:16:18 +0200 Subject: [PATCH 03/23] Add benchmarks --- cpp/src/arrow/tensor_benchmark.cc | 38 +++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/cpp/src/arrow/tensor_benchmark.cc b/cpp/src/arrow/tensor_benchmark.cc index 91a9270ef347..30969995ee2b 100644 --- a/cpp/src/arrow/tensor_benchmark.cc +++ b/cpp/src/arrow/tensor_benchmark.cc @@ -18,6 +18,7 @@ #include "benchmark/benchmark.h" #include "arrow/record_batch.h" +#include "arrow/table.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" #include "arrow/type.h" @@ -51,6 +52,34 @@ static void BatchToTensorSimple(benchmark::State& state) { state.SetBytesProcessed(state.iterations() * ty->byte_width() * num_rows * num_cols); } +template +static void TableToTensorSimple(benchmark::State& state) { + using CType = typename ValueType::c_type; + std::shared_ptr ty = TypeTraits::type_singleton(); + + const int64_t num_cols = state.range(1); + const int64_t num_rows = state.range(0) / num_cols / sizeof(CType); + arrow::random::RandomArrayGenerator gen_{42}; + + std::vector> fields = {}; + std::vector> columns = {}; + + for (int64_t i = 0; i < num_cols; ++i) { + fields.push_back(field("f" + std::to_string(i), ty)); + ArrayVector arrays = {gen_.ArrayOf(ty, num_rows / 2), gen_.ArrayOf(ty, num_rows / 2)}; + auto chunks = std::make_shared(arrays, ty); + columns.push_back(chunks); + } + auto schema = std::make_shared(std::move(fields)); + auto table = Table::Make(schema, columns); + + for (auto _ : state) { + ASSERT_OK_AND_ASSIGN(auto tensor, table->ToTensor(/*row_major=*/row_major)); + } + state.SetItemsProcessed(state.iterations() * num_rows * num_cols); + state.SetBytesProcessed(state.iterations() * ty->byte_width() * num_rows * num_cols); +} + void 
SetArgs(benchmark::internal::Benchmark* bench) { for (int64_t size : {kL1Size, kL2Size}) { for (int64_t num_columns : {3, 30, 300}) { @@ -65,4 +94,13 @@ BENCHMARK_TEMPLATE(BatchToTensorSimple, Int16Type)->Apply(SetArgs); BENCHMARK_TEMPLATE(BatchToTensorSimple, Int32Type)->Apply(SetArgs); BENCHMARK_TEMPLATE(BatchToTensorSimple, Int64Type)->Apply(SetArgs); +#define DECLARE_TABLE_TO_TENSOR_BENCHMARKS(row_major) \ + BENCHMARK_TEMPLATE(TableToTensorSimple, Int8Type, row_major)->Apply(SetArgs); \ + BENCHMARK_TEMPLATE(TableToTensorSimple, Int16Type, row_major)->Apply(SetArgs); \ + BENCHMARK_TEMPLATE(TableToTensorSimple, Int32Type, row_major)->Apply(SetArgs); \ + BENCHMARK_TEMPLATE(TableToTensorSimple, Int64Type, row_major)->Apply(SetArgs); + +DECLARE_TABLE_TO_TENSOR_BENCHMARKS(false); +DECLARE_TABLE_TO_TENSOR_BENCHMARKS(true); + } // namespace arrow From 9c0c6f6ce315cb36236ce46221009edc6873e585 Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Wed, 29 May 2024 11:28:21 +0200 Subject: [PATCH 04/23] Fix linter error --- cpp/src/arrow/table_test.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/table_test.cc b/cpp/src/arrow/table_test.cc index c08a5dca81c2..1d6b20142bd2 100644 --- a/cpp/src/arrow/table_test.cc +++ b/cpp/src/arrow/table_test.cc @@ -997,7 +997,8 @@ TYPED_TEST_P(TestTableToTensorRowMajor, SupportedTypes) { std::shared_ptr tensor_expected = TensorFromJSON(TypeTraits::type_singleton(), "[1, 10, 100, 2, 20, 100, 3, 30, 100, 4, 40, 100, 5, 50, 100, 6, " - "60, 100, 7, 70, 100, 8, 80, 100, 9, 90, 100]", shape, strides); + "60, 100, 7, 70, 100, 8, 80, 100, 9, 90, 100]", + shape, strides); EXPECT_TRUE(tensor_expected->Equals(*tensor)); CheckTensorRowMajor(tensor, 27, shape, strides); From e1562f41a4475fd5626955db3c2218081c907f91 Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Wed, 29 May 2024 11:37:09 +0200 Subject: [PATCH 05/23] Add cmath include --- cpp/src/arrow/table.cc | 1 + 1 file changed, 1 insertion(+) diff --git 
a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc index 42fc9d0ce894..fe2cd12373ea 100644 --- a/cpp/src/arrow/table.cc +++ b/cpp/src/arrow/table.cc @@ -18,6 +18,7 @@ #include "arrow/table.h" #include +#include #include #include #include From d5a6eafd1788fed405735798329be491a04b9dc3 Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Wed, 29 May 2024 13:25:19 +0200 Subject: [PATCH 06/23] Change helper function names in C++ tests, fix doctest errors --- cpp/src/arrow/table_test.cc | 36 ++++++++++++++++++------------------ python/pyarrow/table.pxi | 6 ++---- 2 files changed, 20 insertions(+), 22 deletions(-) diff --git a/cpp/src/arrow/table_test.cc b/cpp/src/arrow/table_test.cc index 1d6b20142bd2..beb2d48fc113 100644 --- a/cpp/src/arrow/table_test.cc +++ b/cpp/src/arrow/table_test.cc @@ -607,7 +607,7 @@ TEST_F(TestTable, ToTensorEmptyTable) { } template -void CheckTensor(const std::shared_ptr& tensor, const int size, +void CheckTableToTensor(const std::shared_ptr& tensor, const int size, const std::vector shape, const std::vector f_strides) { EXPECT_EQ(size, tensor->size()); EXPECT_EQ(TypeTraits::type_singleton(), tensor->type()); @@ -619,7 +619,7 @@ void CheckTensor(const std::shared_ptr& tensor, const int size, } template -void CheckTensorRowMajor(const std::shared_ptr& tensor, const int size, +void CheckTableToTensorRowMajor(const std::shared_ptr& tensor, const int size, const std::vector shape, const std::vector strides) { EXPECT_EQ(size, tensor->size()); @@ -657,7 +657,7 @@ TEST_F(TestTable, ToTensorSupportedNaN) { EXPECT_FALSE(tensor_expected->Equals(*tensor)); EXPECT_TRUE(tensor_expected->Equals(*tensor, EqualOptions().nans_equal(true))); - CheckTensor(tensor, 18, shape, f_strides); + CheckTableToTensor(tensor, 18, shape, f_strides); } TEST_F(TestTable, ToTensorSupportedNullToNan) { @@ -688,7 +688,7 @@ TEST_F(TestTable, ToTensorSupportedNullToNan) { EXPECT_FALSE(tensor_expected->Equals(*tensor)); EXPECT_TRUE(tensor_expected->Equals(*tensor, 
EqualOptions().nans_equal(true))); - CheckTensor(tensor, 18, shape, f_strides); + CheckTableToTensor(tensor, 18, shape, f_strides); ASSERT_OK_AND_ASSIGN(auto tensor_row, table->ToTensor(/*null_to_nan=*/true)); ASSERT_OK(tensor_row->Validate()); @@ -701,7 +701,7 @@ TEST_F(TestTable, ToTensorSupportedNullToNan) { EXPECT_FALSE(tensor_expected_row->Equals(*tensor_row)); EXPECT_TRUE(tensor_expected_row->Equals(*tensor_row, EqualOptions().nans_equal(true))); - CheckTensorRowMajor(tensor_row, 18, shape, strides); + CheckTableToTensorRowMajor(tensor_row, 18, shape, strides); // int32 -> float64 auto f2 = field("f2", int32()); @@ -719,7 +719,7 @@ TEST_F(TestTable, ToTensorSupportedNullToNan) { EXPECT_FALSE(tensor_expected->Equals(*tensor1)); EXPECT_TRUE(tensor_expected->Equals(*tensor1, EqualOptions().nans_equal(true))); - CheckTensor(tensor1, 18, shape, f_strides); + CheckTableToTensor(tensor1, 18, shape, f_strides); ASSERT_OK_AND_ASSIGN(auto tensor1_row, table1->ToTensor(/*null_to_nan=*/true)); ASSERT_OK(tensor1_row->Validate()); @@ -727,7 +727,7 @@ TEST_F(TestTable, ToTensorSupportedNullToNan) { EXPECT_FALSE(tensor_expected_row->Equals(*tensor1_row)); EXPECT_TRUE(tensor_expected_row->Equals(*tensor1_row, EqualOptions().nans_equal(true))); - CheckTensorRowMajor(tensor1_row, 18, shape, strides); + CheckTableToTensorRowMajor(tensor1_row, 18, shape, strides); // int8 -> float32 auto f3 = field("f3", int8()); @@ -753,7 +753,7 @@ TEST_F(TestTable, ToTensorSupportedNullToNan) { EXPECT_FALSE(tensor_expected_2->Equals(*tensor2)); EXPECT_TRUE(tensor_expected_2->Equals(*tensor2, EqualOptions().nans_equal(true))); - CheckTensor(tensor2, 18, shape, f_strides_2); + CheckTableToTensor(tensor2, 18, shape, f_strides_2); ASSERT_OK_AND_ASSIGN(auto tensor2_row, table2->ToTensor(/*null_to_nan=*/true)); ASSERT_OK(tensor2_row->Validate()); @@ -767,7 +767,7 @@ TEST_F(TestTable, ToTensorSupportedNullToNan) { EXPECT_TRUE( tensor2_expected_row->Equals(*tensor2_row, 
EqualOptions().nans_equal(true))); - CheckTensorRowMajor(tensor2_row, 18, shape, strides_2); + CheckTableToTensorRowMajor(tensor2_row, 18, shape, strides_2); } TEST_F(TestTable, ToTensorSupportedTypesMixed) { @@ -796,7 +796,7 @@ TEST_F(TestTable, ToTensorSupportedTypesMixed) { TensorFromJSON(uint16(), "[1, 2, 3, 4, 5, 6, 7, 8, 9]", shape, f_strides); EXPECT_TRUE(tensor_expected->Equals(*tensor)); - CheckTensor(tensor, 9, shape, f_strides); + CheckTableToTensor(tensor, 9, shape, f_strides); // uint16 + int16 = int32 std::vector> fields1 = {f0, f1}; @@ -816,7 +816,7 @@ TEST_F(TestTable, ToTensorSupportedTypesMixed) { EXPECT_TRUE(tensor_expected_1->Equals(*tensor1)); - CheckTensor(tensor1, 18, shape1, f_strides_1); + CheckTableToTensor(tensor1, 18, shape1, f_strides_1); ASSERT_EQ(tensor1->type()->bit_width(), tensor_expected_1->type()->bit_width()); @@ -845,7 +845,7 @@ TEST_F(TestTable, ToTensorSupportedTypesMixed) { EXPECT_FALSE(tensor_expected_2->Equals(*tensor2)); EXPECT_TRUE(tensor_expected_2->Equals(*tensor2, EqualOptions().nans_equal(true))); - CheckTensor(tensor2, 27, shape2, f_strides_2); + CheckTableToTensor(tensor2, 27, shape2, f_strides_2); } TEST_F(TestTable, ToTensorUnsupportedMixedFloat16) { @@ -911,7 +911,7 @@ TYPED_TEST_P(TestTableToTensorColumnMajor, SupportedTypes) { shape, f_strides); EXPECT_TRUE(tensor_expected->Equals(*tensor)); - CheckTensor(tensor, 27, shape, f_strides); + CheckTableToTensor(tensor, 27, shape, f_strides); // Test offsets auto table_slice = table->Slice(1); @@ -929,7 +929,7 @@ TYPED_TEST_P(TestTableToTensorColumnMajor, SupportedTypes) { shape_sliced, f_strides_sliced); EXPECT_TRUE(tensor_expected_sliced->Equals(*tensor_sliced)); - CheckTensor(tensor_expected_sliced, 24, shape_sliced, f_strides_sliced); + CheckTableToTensor(tensor_expected_sliced, 24, shape_sliced, f_strides_sliced); auto table_slice_1 = table->Slice(1, 5); @@ -946,7 +946,7 @@ TYPED_TEST_P(TestTableToTensorColumnMajor, SupportedTypes) { shape_sliced_1, 
f_strides_sliced_1); EXPECT_TRUE(tensor_expected_sliced_1->Equals(*tensor_sliced_1)); - CheckTensor(tensor_expected_sliced_1, 15, shape_sliced_1, f_strides_sliced_1); + CheckTableToTensor(tensor_expected_sliced_1, 15, shape_sliced_1, f_strides_sliced_1); } REGISTER_TYPED_TEST_SUITE_P(TestTableToTensorColumnMajor, SupportedTypes); @@ -1001,7 +1001,7 @@ TYPED_TEST_P(TestTableToTensorRowMajor, SupportedTypes) { shape, strides); EXPECT_TRUE(tensor_expected->Equals(*tensor)); - CheckTensorRowMajor(tensor, 27, shape, strides); + CheckTableToTensorRowMajor(tensor, 27, shape, strides); // Test offsets auto table_slice = table->Slice(1); @@ -1018,7 +1018,7 @@ TYPED_TEST_P(TestTableToTensorRowMajor, SupportedTypes) { shape_sliced, strides_sliced); EXPECT_TRUE(tensor_expected_sliced->Equals(*tensor_sliced)); - CheckTensorRowMajor(tensor_sliced, 24, shape_sliced, strides_sliced); + CheckTableToTensorRowMajor(tensor_sliced, 24, shape_sliced, strides_sliced); auto table_slice_1 = table->Slice(1, 5); @@ -1033,7 +1033,7 @@ TYPED_TEST_P(TestTableToTensorRowMajor, SupportedTypes) { shape_sliced_1, strides_sliced_1); EXPECT_TRUE(tensor_expected_sliced_1->Equals(*tensor_sliced_1)); - CheckTensorRowMajor(tensor_sliced_1, 15, shape_sliced_1, strides_sliced_1); + CheckTableToTensorRowMajor(tensor_sliced_1, 15, shape_sliced_1, strides_sliced_1); } REGISTER_TYPED_TEST_SUITE_P(TestTableToTensorRowMajor, SupportedTypes); diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 3299ccae9997..38d920e0d20f 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -2292,8 +2292,7 @@ cdef class _Tabular(_PandasConvertible): >>> animals = pa.array(["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"]) >>> table = pa.Table.from_arrays([n_legs, animals], names=["n_legs", "animals"]) >>> table.to_pydict() - {'n_legs': [2, 2, 4, 4, 5, 100], 'animals': [ - 'Flamingo', 'Parrot', ..., 'Centipede']} + {'n_legs': [2, 2, 4, 4, 5, 100], 'animals': ['Flamingo', 
'Parrot', ..., 'Centipede']} """ entries = [] for i in range(self.num_columns): @@ -4990,8 +4989,7 @@ cdef class Table(_Tabular): animals: string ---- n_legs: [[2,4,5,100],[2,4,5,100]] - animals: [["Flamingo","Horse","Brittle stars","Centipede"], - ["Flamingo","Horse","Brittle stars","Centipede"]] + animals: [["Flamingo",...,"Centipede"],["Flamingo",...,"Centipede"]] """ cdef: vector[shared_ptr[CRecordBatch]] c_batches From 8854a82cf21a5e381c9a22e9ac06e58a1f9ab9f8 Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Wed, 29 May 2024 15:41:30 +0200 Subject: [PATCH 07/23] Correct indentations --- cpp/src/arrow/table_test.cc | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/table_test.cc b/cpp/src/arrow/table_test.cc index beb2d48fc113..548e4be2278c 100644 --- a/cpp/src/arrow/table_test.cc +++ b/cpp/src/arrow/table_test.cc @@ -608,7 +608,8 @@ TEST_F(TestTable, ToTensorEmptyTable) { template void CheckTableToTensor(const std::shared_ptr& tensor, const int size, - const std::vector shape, const std::vector f_strides) { + const std::vector shape, + const std::vector f_strides) { EXPECT_EQ(size, tensor->size()); EXPECT_EQ(TypeTraits::type_singleton(), tensor->type()); EXPECT_EQ(shape, tensor->shape()); @@ -620,8 +621,8 @@ void CheckTableToTensor(const std::shared_ptr& tensor, const int size, template void CheckTableToTensorRowMajor(const std::shared_ptr& tensor, const int size, - const std::vector shape, - const std::vector strides) { + const std::vector shape, + const std::vector strides) { EXPECT_EQ(size, tensor->size()); EXPECT_EQ(TypeTraits::type_singleton(), tensor->type()); EXPECT_EQ(shape, tensor->shape()); @@ -929,7 +930,8 @@ TYPED_TEST_P(TestTableToTensorColumnMajor, SupportedTypes) { shape_sliced, f_strides_sliced); EXPECT_TRUE(tensor_expected_sliced->Equals(*tensor_sliced)); - CheckTableToTensor(tensor_expected_sliced, 24, shape_sliced, f_strides_sliced); + CheckTableToTensor(tensor_expected_sliced, 24, shape_sliced, + 
f_strides_sliced); auto table_slice_1 = table->Slice(1, 5); @@ -946,7 +948,8 @@ TYPED_TEST_P(TestTableToTensorColumnMajor, SupportedTypes) { shape_sliced_1, f_strides_sliced_1); EXPECT_TRUE(tensor_expected_sliced_1->Equals(*tensor_sliced_1)); - CheckTableToTensor(tensor_expected_sliced_1, 15, shape_sliced_1, f_strides_sliced_1); + CheckTableToTensor(tensor_expected_sliced_1, 15, shape_sliced_1, + f_strides_sliced_1); } REGISTER_TYPED_TEST_SUITE_P(TestTableToTensorColumnMajor, SupportedTypes); @@ -1033,7 +1036,8 @@ TYPED_TEST_P(TestTableToTensorRowMajor, SupportedTypes) { shape_sliced_1, strides_sliced_1); EXPECT_TRUE(tensor_expected_sliced_1->Equals(*tensor_sliced_1)); - CheckTableToTensorRowMajor(tensor_sliced_1, 15, shape_sliced_1, strides_sliced_1); + CheckTableToTensorRowMajor(tensor_sliced_1, 15, shape_sliced_1, + strides_sliced_1); } REGISTER_TYPED_TEST_SUITE_P(TestTableToTensorRowMajor, SupportedTypes); From d2be9252a68264313ea5d0c441381d265bba0665 Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Wed, 29 May 2024 17:52:07 +0200 Subject: [PATCH 08/23] Remove code from RecordBatch::ToTensor and use Table implementation --- cpp/src/arrow/record_batch_test.cc | 18 ++++++++++-------- cpp/src/arrow/table.cc | 7 ++++--- cpp/src/arrow/table_test.cc | 18 ++++++++++-------- python/pyarrow/tests/test_table.py | 4 ++-- 4 files changed, 26 insertions(+), 21 deletions(-) diff --git a/cpp/src/arrow/record_batch_test.cc b/cpp/src/arrow/record_batch_test.cc index a037d7261efb..0cc7def76796 100644 --- a/cpp/src/arrow/record_batch_test.cc +++ b/cpp/src/arrow/record_batch_test.cc @@ -906,10 +906,11 @@ TEST_F(TestRecordBatch, ToTensorUnsupportedMissing) { auto batch = RecordBatch::Make(schema, length, {a0, a1}); - ASSERT_RAISES_WITH_MESSAGE(TypeError, - "Type error: Can only convert a RecordBatch with no nulls. 
" - "Set null_to_nan to true to convert nulls to NaN", - batch->ToTensor()); + ASSERT_RAISES_WITH_MESSAGE( + TypeError, + "Type error: Can only convert a Table or RecordBatch with no " + "nulls. Set null_to_nan to true to convert nulls to NaN", + batch->ToTensor()); } TEST_F(TestRecordBatch, ToTensorEmptyBatch) { @@ -940,10 +941,11 @@ TEST_F(TestRecordBatch, ToTensorEmptyBatch) { auto batch_no_columns = RecordBatch::Make(::arrow::schema({}), 10, std::vector>{}); - ASSERT_RAISES_WITH_MESSAGE(TypeError, - "Type error: Conversion to Tensor for RecordBatches without " - "columns/schema is not supported.", - batch_no_columns->ToTensor()); + ASSERT_RAISES_WITH_MESSAGE( + TypeError, + "Type error: Conversion to Tensor for Tables or RecordBatches " + "without columns/schema is not supported.", + batch_no_columns->ToTensor()); } template diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc index fe2cd12373ea..89c69a778e8c 100644 --- a/cpp/src/arrow/table.cc +++ b/cpp/src/arrow/table.cc @@ -441,15 +441,16 @@ Result> Table::ToTensor(bool null_to_nan, bool row_major MemoryPool* pool) const { if (num_columns() == 0) { return Status::TypeError( - "Conversion to Tensor for Tables without columns/schema is not supported."); + "Conversion to Tensor for Tables or RecordBatches without columns/schema is " + "not supported."); } // Check for no validity bitmap of each field // if null_to_nan conversion is set to false for (int i = 0; i < num_columns(); ++i) { if (column(i)->null_count() > 0 && !null_to_nan) { return Status::TypeError( - "Can only convert a Table with no nulls. Set null_to_nan to true to " - "convert nulls to NaN"); + "Can only convert a Table or RecordBatch with no nulls. 
Set null_to_nan to " + "true to convert nulls to NaN"); } } diff --git a/cpp/src/arrow/table_test.cc b/cpp/src/arrow/table_test.cc index 548e4be2278c..c19541a14925 100644 --- a/cpp/src/arrow/table_test.cc +++ b/cpp/src/arrow/table_test.cc @@ -565,10 +565,11 @@ TEST_F(TestTable, ToTensorUnsupportedMissing) { auto table = Table::Make(schema, {a0, a1}); - ASSERT_RAISES_WITH_MESSAGE(TypeError, - "Type error: Can only convert a Table with no nulls. Set " - "null_to_nan to true to convert nulls to NaN", - table->ToTensor()); + ASSERT_RAISES_WITH_MESSAGE( + TypeError, + "Type error: Can only convert a Table or RecordBatch with no " + "nulls. Set null_to_nan to true to convert nulls to NaN", + table->ToTensor()); } TEST_F(TestTable, ToTensorEmptyTable) { @@ -600,10 +601,11 @@ TEST_F(TestTable, ToTensorEmptyTable) { auto table_no_columns = Table::Make(::arrow::schema({}), std::vector>{}); - ASSERT_RAISES_WITH_MESSAGE(TypeError, - "Type error: Conversion to Tensor for Tables without " - "columns/schema is not supported.", - table_no_columns->ToTensor()); + ASSERT_RAISES_WITH_MESSAGE( + TypeError, + "Type error: Conversion to Tensor for Tables or RecordBatches " + "without columns/schema is not supported.", + table_no_columns->ToTensor()); } template diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index d12a61063bef..b82cf507a890 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -1184,7 +1184,7 @@ def test_recordbatch_to_tensor_null(): ) with pytest.raises( pa.ArrowTypeError, - match="Can only convert a RecordBatch with no nulls." + match="Can only convert a Table or RecordBatch with no nulls." ): batch.to_tensor() @@ -1471,7 +1471,7 @@ def test_table_to_tensor_null(): ) with pytest.raises( pa.ArrowTypeError, - match="Can only convert a Table with no nulls." + match="Can only convert a Table or RecordBatch with no nulls." 
): table.to_tensor() From 520561c43ae6e25b4efc29b9f93f5a46e3681499 Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Wed, 5 Jun 2024 10:33:51 +0200 Subject: [PATCH 09/23] Add RecordBatchToTensor code to tensor.cc --- cpp/src/arrow/record_batch.cc | 1 - cpp/src/arrow/tensor.cc | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc index 12e0f553b740..bc2612f92add 100644 --- a/cpp/src/arrow/record_batch.cc +++ b/cpp/src/arrow/record_batch.cc @@ -18,7 +18,6 @@ #include "arrow/record_batch.h" #include -#include #include #include #include diff --git a/cpp/src/arrow/tensor.cc b/cpp/src/arrow/tensor.cc index 8cdf7f82d264..80a97c1a44fd 100644 --- a/cpp/src/arrow/tensor.cc +++ b/cpp/src/arrow/tensor.cc @@ -307,7 +307,7 @@ Status RecordBatchToTensor(const RecordBatch& batch, bool null_to_nan, bool row_ MemoryPool* pool, std::shared_ptr* tensor) { if (batch.num_columns() == 0) { return Status::TypeError( - "Conversion to Tensor for RecordBatches without columns/schema is not " + "Conversion to Tensor for Tables or RecordBatches without columns/schema is not " "supported."); } // Check for no validity bitmap of each field @@ -315,8 +315,8 @@ Status RecordBatchToTensor(const RecordBatch& batch, bool null_to_nan, bool row_ for (int i = 0; i < batch.num_columns(); ++i) { if (batch.column(i)->null_count() > 0 && !null_to_nan) { return Status::TypeError( - "Can only convert a RecordBatch with no nulls. Set null_to_nan to true to " - "convert nulls to NaN"); + "Can only convert a Table or RecordBatch with no nulls. 
Set null_to_nan to " + "true to convert nulls to NaN"); } } From 6bd177dad1196825f7494c70a023ff2225ff79eb Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Wed, 5 Jun 2024 10:56:54 +0200 Subject: [PATCH 10/23] Change RecordBatchToTensor to TableToTensor and update the code to work for Arrow Tables --- cpp/src/arrow/record_batch.cc | 3 +- cpp/src/arrow/tensor.cc | 99 +++++++++++++++++++---------------- cpp/src/arrow/tensor.h | 4 +- 3 files changed, 57 insertions(+), 49 deletions(-) diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc index bc2612f92add..3271f26be8c8 100644 --- a/cpp/src/arrow/record_batch.cc +++ b/cpp/src/arrow/record_batch.cc @@ -295,9 +295,10 @@ Result> RecordBatch::ToStructArray() const { Result> RecordBatch::ToTensor(bool null_to_nan, bool row_major, MemoryPool* pool) const { + std::shared_ptr
table = Table::Make(schema(), columns()); std::shared_ptr tensor; ARROW_RETURN_NOT_OK( - internal::RecordBatchToTensor(*this, null_to_nan, row_major, pool, &tensor)); + internal::TableToTensor(*table, null_to_nan, row_major, pool, &tensor)); return tensor; } diff --git a/cpp/src/arrow/tensor.cc b/cpp/src/arrow/tensor.cc index 80a97c1a44fd..1ecaa384631b 100644 --- a/cpp/src/arrow/tensor.cc +++ b/cpp/src/arrow/tensor.cc @@ -28,8 +28,8 @@ #include #include -#include "arrow/record_batch.h" #include "arrow/status.h" +#include "arrow/table.h" #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" @@ -224,7 +224,7 @@ Status ValidateTensorParameters(const std::shared_ptr& type, } template -struct ConvertColumnsToTensorVisitor { +struct ConvertArrayToTensorVisitor { Out*& out_values; const ArrayData& in_data; @@ -256,11 +256,12 @@ struct ConvertColumnsToTensorVisitor { }; template -struct ConvertColumnsToTensorRowMajorVisitor { +struct ConvertArrayToTensorRowMajorVisitor { Out*& out_values; const ArrayData& in_data; int num_cols; int col_idx; + int chunk_idx; template Status Visit(const T&) { @@ -269,13 +270,15 @@ struct ConvertColumnsToTensorRowMajorVisitor { auto in_values = ArraySpan(in_data).GetSpan(1, in_data.length); if (in_data.null_count == 0) { - for (int64_t i = 0; i < in_data.length; ++i) { - out_values[i * num_cols + col_idx] = static_cast(in_values[i]); + for (int64_t data_idx = 0; data_idx < in_data.length; ++data_idx) { + out_values[(data_idx + chunk_idx) * num_cols + col_idx] = + static_cast(in_values[data_idx]); } } else { - for (int64_t i = 0; i < in_data.length; ++i) { - out_values[i * num_cols + col_idx] = - in_data.IsNull(i) ? static_cast(NAN) : static_cast(in_values[i]); + for (int64_t data_idx = 0; data_idx < in_data.length; ++data_idx) { + out_values[(data_idx + chunk_idx) * num_cols + col_idx] = + in_data.IsNull(data_idx) ? 
static_cast(NAN) + : static_cast(in_values[data_idx]); } } return Status::OK(); @@ -285,35 +288,39 @@ struct ConvertColumnsToTensorRowMajorVisitor { }; template -inline void ConvertColumnsToTensor(const RecordBatch& batch, uint8_t* out, - bool row_major) { +inline void ConvertColumnsToTensor(const Table& table, uint8_t* out, bool row_major) { using CType = typename arrow::TypeTraits::CType; auto* out_values = reinterpret_cast(out); int i = 0; - for (const auto& column : batch.columns()) { - if (row_major) { - ConvertColumnsToTensorRowMajorVisitor visitor{out_values, *column->data(), - batch.num_columns(), i++}; - DCHECK_OK(VisitTypeInline(*column->type(), &visitor)); - } else { - ConvertColumnsToTensorVisitor visitor{out_values, *column->data()}; - DCHECK_OK(VisitTypeInline(*column->type(), &visitor)); + for (const auto& column : table.columns()) { + int j = 0; + for (const auto& chunk : column->chunks()) { + if (row_major) { + ConvertArrayToTensorRowMajorVisitor visitor{out_values, *chunk->data(), + table.num_columns(), i, j}; + DCHECK_OK(VisitTypeInline(*chunk->type(), &visitor)); + j = j + static_cast(chunk->length()); + } else { + ConvertArrayToTensorVisitor visitor{out_values, *chunk->data()}; + DCHECK_OK(VisitTypeInline(*chunk->type(), &visitor)); + } } + i++; } } -Status RecordBatchToTensor(const RecordBatch& batch, bool null_to_nan, bool row_major, - MemoryPool* pool, std::shared_ptr* tensor) { - if (batch.num_columns() == 0) { +Status TableToTensor(const Table& table, bool null_to_nan, bool row_major, + MemoryPool* pool, std::shared_ptr* tensor) { + if (table.num_columns() == 0) { return Status::TypeError( "Conversion to Tensor for Tables or RecordBatches without columns/schema is not " "supported."); } // Check for no validity bitmap of each field // if null_to_nan conversion is set to false - for (int i = 0; i < batch.num_columns(); ++i) { - if (batch.column(i)->null_count() > 0 && !null_to_nan) { + for (int i = 0; i < table.num_columns(); ++i) { + if 
(table.column(i)->null_count() > 0 && !null_to_nan) { return Status::TypeError( "Can only convert a Table or RecordBatch with no nulls. Set null_to_nan to " "true to convert nulls to NaN"); @@ -322,12 +329,12 @@ Status RecordBatchToTensor(const RecordBatch& batch, bool null_to_nan, bool row_ // Check for supported data types and merge fields // to get the resulting uniform data type - if (!is_integer(batch.column(0)->type()->id()) && - !is_floating(batch.column(0)->type()->id())) { + if (!is_integer(table.column(0)->type()->id()) && + !is_floating(table.column(0)->type()->id())) { return Status::TypeError("DataType is not supported: ", - batch.column(0)->type()->ToString()); + table.column(0)->type()->ToString()); } - std::shared_ptr result_field = batch.schema()->field(0); + std::shared_ptr result_field = table.schema()->field(0); std::shared_ptr result_type = result_field->type(); Field::MergeOptions options; @@ -335,24 +342,24 @@ Status RecordBatchToTensor(const RecordBatch& batch, bool null_to_nan, bool row_ options.promote_integer_sign = true; options.promote_numeric_width = true; - if (batch.num_columns() > 1) { - for (int i = 1; i < batch.num_columns(); ++i) { - if (!is_numeric(batch.column(i)->type()->id())) { + if (table.num_columns() > 1) { + for (int i = 1; i < table.num_columns(); ++i) { + if (!is_numeric(table.column(i)->type()->id())) { return Status::TypeError("DataType is not supported: ", - batch.column(i)->type()->ToString()); + table.column(i)->type()->ToString()); } // Casting of float16 is not supported, throw an error in this case - if ((batch.column(i)->type()->id() == Type::HALF_FLOAT || + if ((table.column(i)->type()->id() == Type::HALF_FLOAT || result_field->type()->id() == Type::HALF_FLOAT) && - batch.column(i)->type()->id() != result_field->type()->id()) { + table.column(i)->type()->id() != result_field->type()->id()) { return Status::NotImplemented("Casting from or to halffloat is not supported."); } ARROW_ASSIGN_OR_RAISE( result_field, 
result_field->MergeWith( - batch.schema()->field(i)->WithName(result_field->name()), options)); + table.schema()->field(i)->WithName(result_field->name()), options)); } result_type = result_field->type(); } @@ -369,40 +376,40 @@ Status RecordBatchToTensor(const RecordBatch& batch, bool null_to_nan, bool row_ // Allocate memory ARROW_ASSIGN_OR_RAISE( std::shared_ptr result, - AllocateBuffer(result_type->bit_width() * batch.num_columns() * batch.num_rows(), + AllocateBuffer(result_type->bit_width() * table.num_columns() * table.num_rows(), pool)); // Copy data switch (result_type->id()) { case Type::UINT8: - ConvertColumnsToTensor(batch, result->mutable_data(), row_major); + ConvertColumnsToTensor(table, result->mutable_data(), row_major); break; case Type::UINT16: case Type::HALF_FLOAT: - ConvertColumnsToTensor(batch, result->mutable_data(), row_major); + ConvertColumnsToTensor(table, result->mutable_data(), row_major); break; case Type::UINT32: - ConvertColumnsToTensor(batch, result->mutable_data(), row_major); + ConvertColumnsToTensor(table, result->mutable_data(), row_major); break; case Type::UINT64: - ConvertColumnsToTensor(batch, result->mutable_data(), row_major); + ConvertColumnsToTensor(table, result->mutable_data(), row_major); break; case Type::INT8: - ConvertColumnsToTensor(batch, result->mutable_data(), row_major); + ConvertColumnsToTensor(table, result->mutable_data(), row_major); break; case Type::INT16: - ConvertColumnsToTensor(batch, result->mutable_data(), row_major); + ConvertColumnsToTensor(table, result->mutable_data(), row_major); break; case Type::INT32: - ConvertColumnsToTensor(batch, result->mutable_data(), row_major); + ConvertColumnsToTensor(table, result->mutable_data(), row_major); break; case Type::INT64: - ConvertColumnsToTensor(batch, result->mutable_data(), row_major); + ConvertColumnsToTensor(table, result->mutable_data(), row_major); break; case Type::FLOAT: - ConvertColumnsToTensor(batch, result->mutable_data(), row_major); + 
ConvertColumnsToTensor(table, result->mutable_data(), row_major); break; case Type::DOUBLE: - ConvertColumnsToTensor(batch, result->mutable_data(), row_major); + ConvertColumnsToTensor(table, result->mutable_data(), row_major); break; default: return Status::TypeError("DataType is not supported: ", result_type->ToString()); @@ -411,7 +418,7 @@ Status RecordBatchToTensor(const RecordBatch& batch, bool null_to_nan, bool row_ // Construct Tensor object const auto& fixed_width_type = internal::checked_cast(*result_type); - std::vector shape = {batch.num_rows(), batch.num_columns()}; + std::vector shape = {table.num_rows(), table.num_columns()}; std::vector strides; if (row_major) { diff --git a/cpp/src/arrow/tensor.h b/cpp/src/arrow/tensor.h index beb62a11bdce..b1c98bf733c7 100644 --- a/cpp/src/arrow/tensor.h +++ b/cpp/src/arrow/tensor.h @@ -78,8 +78,8 @@ Status ValidateTensorParameters(const std::shared_ptr& type, const std::vector& dim_names); ARROW_EXPORT -Status RecordBatchToTensor(const RecordBatch& batch, bool null_to_nan, bool row_major, - MemoryPool* pool, std::shared_ptr* tensor); +Status TableToTensor(const Table& table, bool null_to_nan, bool row_major, + MemoryPool* pool, std::shared_ptr* tensor); } // namespace internal From 8306e730db490bc88e921fabeb186e265f3694e0 Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Wed, 5 Jun 2024 14:09:50 +0200 Subject: [PATCH 11/23] Use TableToTensor in Table::ToTensor --- cpp/src/arrow/table.cc | 206 +---------------------------------------- 1 file changed, 2 insertions(+), 204 deletions(-) diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc index 89c69a778e8c..b2b18bd510bc 100644 --- a/cpp/src/arrow/table.cc +++ b/cpp/src/arrow/table.cc @@ -18,7 +18,6 @@ #include "arrow/table.h" #include -#include #include #include #include @@ -44,7 +43,6 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/logging_internal.h" #include "arrow/util/logging.h" -#include "arrow/util/unreachable.h" #include "arrow/util/vector.h" 
namespace arrow { @@ -350,211 +348,11 @@ Result> Table::FromChunkedStructArray( array->length()); } -template -struct ConvertChunksToTensorVisitor { - Out*& out_values; - const ArrayData& in_data; - - template - Status Visit(const T&) { - if constexpr (is_numeric(T::type_id)) { - using In = typename T::c_type; - auto in_values = ArraySpan(in_data).GetSpan(1, in_data.length); - - if (in_data.null_count == 0) { - if constexpr (std::is_same_v) { - memcpy(out_values, in_values.data(), in_values.size_bytes()); - out_values += in_values.size(); - } else { - for (In in_value : in_values) { - *out_values++ = static_cast(in_value); - } - } - } else { - for (int64_t i = 0; i < in_data.length; ++i) { - *out_values++ = - in_data.IsNull(i) ? static_cast(NAN) : static_cast(in_values[i]); - } - } - return Status::OK(); - } - Unreachable(); - } -}; - -template -struct ConvertChunksToTensorRowMajorVisitor { - Out*& out_values; - const ArrayData& in_data; - int num_cols; - int col_idx; - int chunk_idx; - - template - Status Visit(const T&) { - if constexpr (is_numeric(T::type_id)) { - using In = typename T::c_type; - auto in_values = ArraySpan(in_data).GetSpan(1, in_data.length); - - if (in_data.null_count == 0) { - for (int64_t data_idx = 0; data_idx < in_data.length; ++data_idx) { - out_values[(data_idx + chunk_idx) * num_cols + col_idx] = - static_cast(in_values[data_idx]); - } - } else { - for (int64_t data_idx = 0; data_idx < in_data.length; ++data_idx) { - out_values[(data_idx + chunk_idx) * num_cols + col_idx] = - in_data.IsNull(data_idx) ? 
static_cast(NAN) - : static_cast(in_values[data_idx]); - } - } - return Status::OK(); - } - Unreachable(); - } -}; - -template -inline void ConvertColumnsToTensor(const Table& table, uint8_t* out, bool row_major) { - using CType = typename arrow::TypeTraits::CType; - auto* out_values = reinterpret_cast(out); - - int i = 0; - for (const auto& column : table.columns()) { - int j = 0; - for (const auto& chunk : column->chunks()) { - if (row_major) { - ConvertChunksToTensorRowMajorVisitor visitor{out_values, *chunk->data(), - table.num_columns(), i, j}; - DCHECK_OK(VisitTypeInline(*column->type(), &visitor)); - j = j + static_cast(chunk->length()); - } else { - ConvertChunksToTensorVisitor visitor{out_values, *chunk->data()}; - DCHECK_OK(VisitTypeInline(*column->type(), &visitor)); - } - } - i++; - } -} - Result> Table::ToTensor(bool null_to_nan, bool row_major, MemoryPool* pool) const { - if (num_columns() == 0) { - return Status::TypeError( - "Conversion to Tensor for Tables or RecordBatches without columns/schema is " - "not supported."); - } - // Check for no validity bitmap of each field - // if null_to_nan conversion is set to false - for (int i = 0; i < num_columns(); ++i) { - if (column(i)->null_count() > 0 && !null_to_nan) { - return Status::TypeError( - "Can only convert a Table or RecordBatch with no nulls. 
Set null_to_nan to " - "true to convert nulls to NaN"); - } - } - - // Check for supported data types and merge fields - // to get the resulting uniform data type - if (!is_integer(column(0)->type()->id()) && !is_floating(column(0)->type()->id())) { - return Status::TypeError("DataType is not supported: ", - column(0)->type()->ToString()); - } - std::shared_ptr result_field = schema_->field(0); - std::shared_ptr result_type = result_field->type(); - - Field::MergeOptions options; - options.promote_integer_to_float = true; - options.promote_integer_sign = true; - options.promote_numeric_width = true; - - if (num_columns() > 1) { - for (int i = 1; i < num_columns(); ++i) { - if (!is_numeric(column(i)->type()->id())) { - return Status::TypeError("DataType is not supported: ", - column(i)->type()->ToString()); - } - - // Casting of float16 is not supported, throw an error in this case - if ((column(i)->type()->id() == Type::HALF_FLOAT || - result_field->type()->id() == Type::HALF_FLOAT) && - column(i)->type()->id() != result_field->type()->id()) { - return Status::NotImplemented("Casting from or to halffloat is not supported."); - } - - ARROW_ASSIGN_OR_RAISE( - result_field, result_field->MergeWith( - schema_->field(i)->WithName(result_field->name()), options)); - } - result_type = result_field->type(); - } - - // Check if result_type is signed or unsigned integer and null_to_nan is set to true - // Then all columns should be promoted to float type - if (is_integer(result_type->id()) && null_to_nan) { - ARROW_ASSIGN_OR_RAISE( - result_field, - result_field->MergeWith(arrow::field(result_field->name(), float32()), options)); - result_type = result_field->type(); - } - - // Allocate memory - ARROW_ASSIGN_OR_RAISE( - std::shared_ptr result, - AllocateBuffer(result_type->bit_width() * num_columns() * num_rows(), pool)); - // Copy data - switch (result_type->id()) { - case Type::UINT8: - ConvertColumnsToTensor(*this, result->mutable_data(), row_major); - break; - case 
Type::UINT16: - case Type::HALF_FLOAT: - ConvertColumnsToTensor(*this, result->mutable_data(), row_major); - break; - case Type::UINT32: - ConvertColumnsToTensor(*this, result->mutable_data(), row_major); - break; - case Type::UINT64: - ConvertColumnsToTensor(*this, result->mutable_data(), row_major); - break; - case Type::INT8: - ConvertColumnsToTensor(*this, result->mutable_data(), row_major); - break; - case Type::INT16: - ConvertColumnsToTensor(*this, result->mutable_data(), row_major); - break; - case Type::INT32: - ConvertColumnsToTensor(*this, result->mutable_data(), row_major); - break; - case Type::INT64: - ConvertColumnsToTensor(*this, result->mutable_data(), row_major); - break; - case Type::FLOAT: - ConvertColumnsToTensor(*this, result->mutable_data(), row_major); - break; - case Type::DOUBLE: - ConvertColumnsToTensor(*this, result->mutable_data(), row_major); - break; - default: - return Status::TypeError("DataType is not supported: ", result_type->ToString()); - } - - // Construct Tensor object - const auto& fixed_width_type = - internal::checked_cast(*result_type); - std::vector shape = {num_rows(), num_columns()}; - std::vector strides; std::shared_ptr tensor; - - if (row_major) { - ARROW_RETURN_NOT_OK( - internal::ComputeRowMajorStrides(fixed_width_type, shape, &strides)); - } else { - ARROW_RETURN_NOT_OK( - internal::ComputeColumnMajorStrides(fixed_width_type, shape, &strides)); - } - ARROW_ASSIGN_OR_RAISE(tensor, - Tensor::Make(result_type, std::move(result), shape, strides)); + ARROW_RETURN_NOT_OK( + internal::TableToTensor(*this, null_to_nan, row_major, pool, &tensor)); return tensor; } From afe3d1e16f50a728b8bcf5bf20c50119b392923b Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Mon, 10 Jun 2024 17:55:53 +0200 Subject: [PATCH 12/23] Fix docstrings and change index names --- cpp/src/arrow/record_batch.h | 2 +- cpp/src/arrow/table.h | 2 +- cpp/src/arrow/tensor.cc | 25 ++++++++++++------------- 3 files changed, 14 insertions(+), 15 deletions(-) diff 
--git a/cpp/src/arrow/record_batch.h b/cpp/src/arrow/record_batch.h index 4601b1ba9d6a..a6ef744ac121 100644 --- a/cpp/src/arrow/record_batch.h +++ b/cpp/src/arrow/record_batch.h @@ -90,7 +90,7 @@ class ARROW_EXPORT RecordBatch { /// in the resulting struct array. Result> ToStructArray() const; - /// \brief Convert record batch with one data type to Tensor + /// \brief Convert RecordBatch to Tensor /// /// Create a Tensor object with shape (number of rows, number of columns) and /// strides (type size in bytes, type size in bytes * number of rows). diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h index f57e23aaf5dd..3558cb46d8c5 100644 --- a/cpp/src/arrow/table.h +++ b/cpp/src/arrow/table.h @@ -102,7 +102,7 @@ class ARROW_EXPORT Table { static Result> FromChunkedStructArray( const std::shared_ptr& array); - /// \brief Convert table with one data type to Tensor + /// \brief Convert Table to Tensor /// /// Create a Tensor object with shape (number of rows, number of columns) and /// strides (type size in bytes, type size in bytes * number of rows). diff --git a/cpp/src/arrow/tensor.cc b/cpp/src/arrow/tensor.cc index 1ecaa384631b..07563a406bee 100644 --- a/cpp/src/arrow/tensor.cc +++ b/cpp/src/arrow/tensor.cc @@ -270,15 +270,14 @@ struct ConvertArrayToTensorRowMajorVisitor { auto in_values = ArraySpan(in_data).GetSpan(1, in_data.length); if (in_data.null_count == 0) { - for (int64_t data_idx = 0; data_idx < in_data.length; ++data_idx) { - out_values[(data_idx + chunk_idx) * num_cols + col_idx] = - static_cast(in_values[data_idx]); + for (int64_t i = 0; i < in_data.length; ++i) { + out_values[(i + chunk_idx) * num_cols + col_idx] = + static_cast(in_values[i]); } } else { - for (int64_t data_idx = 0; data_idx < in_data.length; ++data_idx) { - out_values[(data_idx + chunk_idx) * num_cols + col_idx] = - in_data.IsNull(data_idx) ? 
static_cast(NAN) - : static_cast(in_values[data_idx]); + for (int64_t i = 0; i < in_data.length; ++i) { + out_values[(i + chunk_idx) * num_cols + col_idx] = + in_data.IsNull(i) ? static_cast(NAN) : static_cast(in_values[i]); } } return Status::OK(); @@ -292,21 +291,21 @@ inline void ConvertColumnsToTensor(const Table& table, uint8_t* out, bool row_ma using CType = typename arrow::TypeTraits::CType; auto* out_values = reinterpret_cast(out); - int i = 0; + int col_idx = 0; for (const auto& column : table.columns()) { - int j = 0; + int chunk_idx = 0; for (const auto& chunk : column->chunks()) { if (row_major) { - ConvertArrayToTensorRowMajorVisitor visitor{out_values, *chunk->data(), - table.num_columns(), i, j}; + ConvertArrayToTensorRowMajorVisitor visitor{ + out_values, *chunk->data(), table.num_columns(), col_idx, chunk_idx}; DCHECK_OK(VisitTypeInline(*chunk->type(), &visitor)); - j = j + static_cast(chunk->length()); + chunk_idx = chunk_idx + static_cast(chunk->length()); } else { ConvertArrayToTensorVisitor visitor{out_values, *chunk->data()}; DCHECK_OK(VisitTypeInline(*chunk->type(), &visitor)); } } - i++; + col_idx++; } } From 2fcc6b6f618404571e1e8465a0ac18a99449b3a0 Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Tue, 11 Jun 2024 08:08:49 +0200 Subject: [PATCH 13/23] Remove most of table_to_tensor tests in python and parametrize one test for both batch and table --- python/pyarrow/tests/test_table.py | 248 ++--------------------------- 1 file changed, 14 insertions(+), 234 deletions(-) diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index b82cf507a890..7158c303e53d 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -1079,38 +1079,44 @@ def test_recordbatch_to_tensor_uniform_float_16(): check_tensors(result, expected, pa.float16(), 27) -@pytest.mark.numpy -def test_recordbatch_to_tensor_mixed_type(): +@pytest.mark.parametrize( + ('cls'), + [ + (pa.Table), + (pa.RecordBatch) + ] +) +def 
test_to_tensor_mixed_type(cls): # uint16 + int16 = int32 arr1 = [1, 2, 3, 4, 5, 6, 7, 8, 9] arr2 = [10, 20, 30, 40, 50, 60, 70, 80, 90] arr3 = [100, 200, 300, np.nan, 500, 600, 700, 800, 900] - batch = pa.RecordBatch.from_arrays( + tabular = cls.from_arrays( [ pa.array(arr1, type=pa.uint16()), pa.array(arr2, type=pa.int16()), ], ["a", "b"] ) - result = batch.to_tensor(row_major=False) + result = tabular.to_tensor(row_major=False) x = np.column_stack([arr1, arr2]).astype(np.int32, order="F") expected = pa.Tensor.from_numpy(x) check_tensors(result, expected, pa.int32(), 18) - result = batch.to_tensor() + result = tabular.to_tensor() x = np.column_stack([arr1, arr2]).astype(np.int32, order="C") expected = pa.Tensor.from_numpy(x) check_tensors(result, expected, pa.int32(), 18) # uint16 + int16 + float32 = float64 - batch = pa.RecordBatch.from_arrays( + tabular = cls.from_arrays( [ pa.array(arr1, type=pa.uint16()), pa.array(arr2, type=pa.int16()), pa.array(arr3, type=pa.float32()), ], ["a", "b", "c"] ) - result = batch.to_tensor(row_major=False) + result = tabular.to_tensor(row_major=False) x = np.column_stack([arr1, arr2, arr3]).astype(np.float64, order="F") expected = pa.Tensor.from_numpy(x) @@ -1120,7 +1126,7 @@ def test_recordbatch_to_tensor_mixed_type(): assert result.shape == expected.shape assert result.strides == expected.strides - result = batch.to_tensor() + result = tabular.to_tensor() x = np.column_stack([arr1, arr2, arr3]).astype(np.float64, order="C") expected = pa.Tensor.from_numpy(x) @@ -1332,232 +1338,6 @@ def test_table_to_tensor_uniform_type(typ): check_tensors(result, expected, pa.from_numpy_dtype(typ), 15) -def test_table_to_tensor_uniform_float_16(): - arr1 = [np.array([1, 2, 3], dtype=np.float16), - np.array([4, 5, 6, 7, 8, 9], dtype=np.float16)] - arr2 = [np.array([10, 20], dtype=np.float16), - np.array([30, 40, 50, 60, 70, 80, 90], dtype=np.float16)] - arr3 = [np.array([100, 100, 100, 100, 100, 100], dtype=np.float16), - np.array([100, 100, 
100], dtype=np.float16)] - table = pa.Table.from_arrays( - [ - pa.chunked_array(arr1, type=pa.float16()), - pa.chunked_array(arr2, type=pa.float16()), - pa.chunked_array(arr3, type=pa.float16()), - ], ["a", "b", "c"] - ) - - arr1_f = [1, 2, 3, 4, 5, 6, 7, 8, 9] - arr2_f = [10, 20, 30, 40, 50, 60, 70, 80, 90] - arr3_f = [100, 100, 100, 100, 100, 100, 100, 100, 100] - - result = table.to_tensor(row_major=False) - x = np.column_stack([arr1_f, arr2_f, arr3_f]).astype(np.float16, order="F") - expected = pa.Tensor.from_numpy(x) - check_tensors(result, expected, pa.float16(), 27) - - result = table.to_tensor() - x = np.column_stack([arr1_f, arr2_f, arr3_f]).astype(np.float16, order="C") - expected = pa.Tensor.from_numpy(x) - check_tensors(result, expected, pa.float16(), 27) - - -def test_table_to_tensor_mixed_type(): - # uint16 + int16 = int32 - arr1 = [[1, 2, 3], [4, 5, 6, 7, 8, 9]] - arr2 = [[10, 20], [30, 40, 50, 60, 70, 80, 90]] - arr3 = [[100, 200, 300, np.nan, 500, 600], [700, 800, 900]] - table = pa.Table.from_arrays( - [ - pa.chunked_array(arr1, type=pa.uint16()), - pa.chunked_array(arr2, type=pa.int16()), - ], ["a", "b"] - ) - - arr1_f = [1, 2, 3, 4, 5, 6, 7, 8, 9] - arr2_f = [10, 20, 30, 40, 50, 60, 70, 80, 90] - arr3_f = [100, 200, 300, np.nan, 500, 600, 700, 800, 900] - - result = table.to_tensor(row_major=False) - x = np.column_stack([arr1_f, arr2_f]).astype(np.int32, order="F") - expected = pa.Tensor.from_numpy(x) - check_tensors(result, expected, pa.int32(), 18) - - result = table.to_tensor() - x = np.column_stack([arr1_f, arr2_f]).astype(np.int32, order="C") - expected = pa.Tensor.from_numpy(x) - check_tensors(result, expected, pa.int32(), 18) - - # uint16 + int16 + float32 = float64 - table = pa.Table.from_arrays( - [ - pa.chunked_array(arr1, type=pa.uint16()), - pa.chunked_array(arr2, type=pa.int16()), - pa.chunked_array(arr3, type=pa.float32()), - ], ["a", "b", "c"] - ) - result = table.to_tensor(row_major=False) - x = np.column_stack([arr1_f, arr2_f, 
arr3_f]).astype(np.float64, order="F") - expected = pa.Tensor.from_numpy(x) - - np.testing.assert_equal(result.to_numpy(), x) - assert result.size == 27 - assert result.type == pa.float64() - assert result.shape == expected.shape - assert result.strides == expected.strides - - result = table.to_tensor() - x = np.column_stack([arr1_f, arr2_f, arr3_f]).astype(np.float64, order="C") - expected = pa.Tensor.from_numpy(x) - - np.testing.assert_equal(result.to_numpy(), x) - assert result.size == 27 - assert result.type == pa.float64() - assert result.shape == expected.shape - assert result.strides == expected.strides - - -def test_table_to_tensor_unsupported_mixed_type_with_float16(): - arr1 = [[1, 2, 3], [4, 5, 6, 7, 8, 9]] - arr2 = [10, 20, 30, 40, 50, 60, 70, 80, 90] - arr3 = [[100, 200, 300, 400, 500, 600], [700, 800, 900]] - table = pa.Table.from_arrays( - [ - pa.chunked_array(arr1, type=pa.uint16()), - pa.chunked_array([np.array(arr2, dtype=np.float16)], type=pa.float16()), - pa.chunked_array(arr3, type=pa.float32()), - ], ["a", "b", "c"] - ) - - with pytest.raises( - NotImplementedError, - match="Casting from or to halffloat is not supported." 
- ): - table.to_tensor() - - -def test_table_to_tensor_nan(): - arr1 = [[1, 2, 3], [4, np.nan, 6, 7, 8, 9]] - arr2 = [[10, 20], [30, 40, 50, 60, 70, np.nan, 90]] - table = pa.Table.from_arrays( - [ - pa.chunked_array(arr1, type=pa.float32()), - pa.chunked_array(arr2, type=pa.float32()), - ], ["a", "b"] - ) - - arr1_f = [1, 2, 3, 4, np.nan, 6, 7, 8, 9] - arr2_f = [10, 20, 30, 40, 50, 60, 70, np.nan, 90] - - result = table.to_tensor(row_major=False) - x = np.column_stack([arr1_f, arr2_f]).astype(np.float32, order="F") - expected = pa.Tensor.from_numpy(x) - - np.testing.assert_equal(result.to_numpy(), x) - assert result.size == 18 - assert result.type == pa.float32() - assert result.shape == expected.shape - assert result.strides == expected.strides - - -def test_table_to_tensor_null(): - arr1 = [[1, 2, 3], [4, None, 6, 7, 8, 9]] - arr2 = [[10, 20], [30, 40, 50, 60, 70, None, 90]] - table = pa.Table.from_arrays( - [ - pa.chunked_array(arr1, type=pa.int32()), - pa.chunked_array(arr2, type=pa.float32()), - ], ["a", "b"] - ) - with pytest.raises( - pa.ArrowTypeError, - match="Can only convert a Table or RecordBatch with no nulls." 
- ): - table.to_tensor() - - arr1_f = [1, 2, 3, 4, np.nan, 6, 7, 8, 9] - arr2_f = [10, 20, 30, 40, 50, 60, 70, np.nan, 90] - - result = table.to_tensor(null_to_nan=True, row_major=False) - x = np.column_stack([arr1_f, arr2_f]).astype(np.float64, order="F") - expected = pa.Tensor.from_numpy(x) - - np.testing.assert_equal(result.to_numpy(), x) - assert result.size == 18 - assert result.type == pa.float64() - assert result.shape == expected.shape - assert result.strides == expected.strides - - # int32 -> float64 - table = pa.Table.from_arrays( - [ - pa.chunked_array(arr1, type=pa.int32()), - pa.chunked_array(arr2, type=pa.int32()), - ], ["a", "b"] - ) - - result = table.to_tensor(null_to_nan=True, row_major=False) - - np.testing.assert_equal(result.to_numpy(), x) - assert result.size == 18 - assert result.type == pa.float64() - assert result.shape == expected.shape - assert result.strides == expected.strides - - # int8 -> float32 - table = pa.Table.from_arrays( - [ - pa.chunked_array(arr1, type=pa.int8()), - pa.chunked_array(arr2, type=pa.int8()), - ], ["a", "b"] - ) - - result = table.to_tensor(null_to_nan=True, row_major=False) - x = np.column_stack([arr1_f, arr2_f]).astype(np.float32, order="F") - expected = pa.Tensor.from_numpy(x) - - np.testing.assert_equal(result.to_numpy(), x) - assert result.size == 18 - assert result.type == pa.float32() - assert result.shape == expected.shape - assert result.strides == expected.strides - - -def test_table_to_tensor_empty(): - table = pa.Table.from_arrays( - [ - pa.chunked_array([], type=pa.float32()), - pa.chunked_array([], type=pa.float32()), - ], ["a", "b"] - ) - result = table.to_tensor() - - x = np.column_stack([[], []]).astype(np.float32, order="F") - expected = pa.Tensor.from_numpy(x) - - assert result.size == expected.size - assert result.type == pa.float32() - assert result.shape == expected.shape - assert result.strides == (4, 4) - - -def test_table_to_tensor_unsupported(): - arr1 = [[1, 2, 3], [4, 5, 6, 7, 8, 9]] - 
# Unsupported data type - arr2 = [["a", "b", "c", "a"], ["b", "c", "a", "b", "c"]] - table = pa.Table.from_arrays( - [ - pa.chunked_array(arr1, type=pa.int32()), - pa.chunked_array(arr2, type=pa.utf8()), - ], ["a", "b"] - ) - with pytest.raises( - pa.ArrowTypeError, - match="DataType is not supported" - ): - table.to_tensor() - - def _table_like_slice_tests(factory): data = [ pa.array(range(5)), From c817081625866f718265290fcd7a31f75b8b9642 Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Tue, 11 Jun 2024 08:17:49 +0200 Subject: [PATCH 14/23] Use self.table and self.batch, run linter --- python/pyarrow/table.pxi | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 38d920e0d20f..f9dbb52597fb 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -1413,7 +1413,8 @@ cdef class ChunkedArray(_PandasConvertible): chunked = self.cast(target_type, safe=True) except ArrowInvalid as e: raise ValueError( - f"Could not cast {self.type} to requested type {target_type}: {e}" + f"Could not cast {self.type} to requested type { + target_type}: {e}" ) else: chunked = self @@ -3659,15 +3660,11 @@ cdef class RecordBatch(_Tabular): """ self._assert_cpu() cdef: - shared_ptr[CRecordBatch] c_record_batch shared_ptr[CTensor] c_tensor CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) - c_record_batch = pyarrow_unwrap_batch(self) with nogil: - c_tensor = GetResultValue( - deref(c_record_batch).ToTensor(null_to_nan, - row_major, pool)) + c_tensor = GetResultValue(self.batch.ToTensor(null_to_nan, row_major, pool)) return pyarrow_wrap_tensor(c_tensor) def copy_to(self, destination): @@ -3798,7 +3795,8 @@ cdef class RecordBatch(_Tabular): inner_batch = pyarrow_unwrap_batch(casted_batch) except ArrowInvalid as e: raise ValueError( - f"Could not cast {self.schema} to requested schema {target_schema}: {e}" + f"Could not cast {self.schema} to requested schema { + target_schema}: {e}" ) 
else: inner_batch = self.sp_batch @@ -5152,15 +5150,11 @@ cdef class Table(_Tabular): [nan, nan]]) """ cdef: - shared_ptr[CTable] c_table shared_ptr[CTensor] c_tensor CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) - c_table = pyarrow_unwrap_table(self) with nogil: - c_tensor = GetResultValue( - deref(c_table).ToTensor(null_to_nan, - row_major, pool)) + c_tensor = GetResultValue(self.table.ToTensor(null_to_nan, row_major, pool)) return pyarrow_wrap_tensor(c_tensor) def to_reader(self, max_chunksize=None): From 3e213ddd05ce65fcaa1ea073c53da2af4050f024 Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Tue, 11 Jun 2024 08:33:02 +0200 Subject: [PATCH 15/23] Revert unrelated linter changes --- python/pyarrow/table.pxi | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index f9dbb52597fb..f29b4a130e86 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -1413,8 +1413,7 @@ cdef class ChunkedArray(_PandasConvertible): chunked = self.cast(target_type, safe=True) except ArrowInvalid as e: raise ValueError( - f"Could not cast {self.type} to requested type { - target_type}: {e}" + f"Could not cast {self.type} to requested type {target_type}: {e}" ) else: chunked = self @@ -3795,8 +3794,7 @@ cdef class RecordBatch(_Tabular): inner_batch = pyarrow_unwrap_batch(casted_batch) except ArrowInvalid as e: raise ValueError( - f"Could not cast {self.schema} to requested schema { - target_schema}: {e}" + f"Could not cast {self.schema} to requested schema {target_schema}: {e}" ) else: inner_batch = self.sp_batch From 4bc7e39e65fd8528912eddc4a6ce6998b8970de1 Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Tue, 11 Jun 2024 09:16:48 +0200 Subject: [PATCH 16/23] Remove shape and strides from ToTensor docstrings --- cpp/src/arrow/record_batch.h | 3 +-- cpp/src/arrow/table.h | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/record_batch.h b/cpp/src/arrow/record_batch.h
index a6ef744ac121..d1e5d541821f 100644 --- a/cpp/src/arrow/record_batch.h +++ b/cpp/src/arrow/record_batch.h @@ -92,8 +92,7 @@ class ARROW_EXPORT RecordBatch { /// \brief Convert RecordBatch to Tensor /// - /// Create a Tensor object with shape (number of rows, number of columns) and - /// strides (type size in bytes, type size in bytes * number of rows). + /// Create a Tensor object. /// /// \param[in] null_to_nan if true, convert nulls to NaN /// \param[in] row_major if true, create row-major Tensor else column-major Tensor diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h index 3558cb46d8c5..051060a52c4b 100644 --- a/cpp/src/arrow/table.h +++ b/cpp/src/arrow/table.h @@ -104,8 +104,7 @@ class ARROW_EXPORT Table { /// \brief Convert Table to Tensor /// - /// Create a Tensor object with shape (number of rows, number of columns) and - /// strides (type size in bytes, type size in bytes * number of rows). + /// Create a Tensor object. /// /// \param[in] null_to_nan if true, convert nulls to NaN /// \param[in] row_major if true, create row-major Tensor else column-major Tensor From 7f58c559679ed6f557f054504132c4dabb515cd1 Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Tue, 11 Jun 2024 09:18:23 +0200 Subject: [PATCH 17/23] Remove s in NaNs --- python/pyarrow/table.pxi | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index f29b4a130e86..0c35c915015a 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -3629,7 +3629,7 @@ cdef class RecordBatch(_Tabular): b: [10,20,30,40,null] Convert a RecordBatch to row-major Tensor with null values - written as NaN values + written as ``NaN``: >>> batch.to_tensor(null_to_nan=True) @@ -3643,7 +3643,7 @@ cdef class RecordBatch(_Tabular): [ 4., 40.], [nan, nan]]) - Convert a RecordBatch to column-major Tensor + Convert a RecordBatch to column-major Tensor: >>> batch.to_tensor(null_to_nan=True, row_major=False) @@ -5119,7 +5119,7 @@ cdef 
class Table(_Tabular): a: [[1,2],[3,4,null]] b: [[10,20,30],[40,null]] - Convert a Table to row-major Tensor with null values written as ``NaN``s: + Convert a Table to row-major Tensor with null values written as ``NaN``: >>> table.to_tensor(null_to_nan=True) From 4a879f96b0f9efbdd71c2c28377b122591164386 Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Thu, 9 Apr 2026 19:08:57 +0200 Subject: [PATCH 18/23] Pre-calculate index and remove the need to cast --- cpp/src/arrow/tensor.cc | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/tensor.cc b/cpp/src/arrow/tensor.cc index 07563a406bee..7d1ef2160799 100644 --- a/cpp/src/arrow/tensor.cc +++ b/cpp/src/arrow/tensor.cc @@ -259,9 +259,9 @@ template struct ConvertArrayToTensorRowMajorVisitor { Out*& out_values; const ArrayData& in_data; - int num_cols; - int col_idx; - int chunk_idx; + int64_t num_cols; + int64_t col_idx; + int64_t chunk_idx; template Status Visit(const T&) { @@ -269,14 +269,15 @@ struct ConvertArrayToTensorRowMajorVisitor { using In = typename T::c_type; auto in_values = ArraySpan(in_data).GetSpan(1, in_data.length); + const int64_t base = chunk_idx * num_cols + col_idx; + if (in_data.null_count == 0) { for (int64_t i = 0; i < in_data.length; ++i) { - out_values[(i + chunk_idx) * num_cols + col_idx] = - static_cast(in_values[i]); + out_values[base + i * num_cols] = static_cast(in_values[i]); } } else { for (int64_t i = 0; i < in_data.length; ++i) { - out_values[(i + chunk_idx) * num_cols + col_idx] = + out_values[base + i * num_cols] = in_data.IsNull(i) ? 
static_cast(NAN) : static_cast(in_values[i]); } } @@ -291,7 +292,7 @@ inline void ConvertColumnsToTensor(const Table& table, uint8_t* out, bool row_ma using CType = typename arrow::TypeTraits::CType; auto* out_values = reinterpret_cast(out); - int col_idx = 0; + int64_t col_idx = 0; for (const auto& column : table.columns()) { int chunk_idx = 0; for (const auto& chunk : column->chunks()) { @@ -299,7 +300,7 @@ inline void ConvertColumnsToTensor(const Table& table, uint8_t* out, bool row_ma ConvertArrayToTensorRowMajorVisitor visitor{ out_values, *chunk->data(), table.num_columns(), col_idx, chunk_idx}; DCHECK_OK(VisitTypeInline(*chunk->type(), &visitor)); - chunk_idx = chunk_idx + static_cast(chunk->length()); + chunk_idx += chunk->length(); } else { ConvertArrayToTensorVisitor visitor{out_values, *chunk->data()}; DCHECK_OK(VisitTypeInline(*chunk->type(), &visitor)); From 1f12b9013faf76c663b79e6f62c7c1be8c9fec2f Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Fri, 10 Apr 2026 11:19:07 +0200 Subject: [PATCH 19/23] Split batch and table path to eliminate heap-allocations for unnecessary Table creation --- cpp/src/arrow/record_batch.cc | 3 +- cpp/src/arrow/table.cc | 2 +- cpp/src/arrow/tensor.cc | 120 +++++++++++++++++++++------------- cpp/src/arrow/tensor.h | 4 ++ 4 files changed, 81 insertions(+), 48 deletions(-) diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc index 3271f26be8c8..bc2612f92add 100644 --- a/cpp/src/arrow/record_batch.cc +++ b/cpp/src/arrow/record_batch.cc @@ -295,10 +295,9 @@ Result> RecordBatch::ToStructArray() const { Result> RecordBatch::ToTensor(bool null_to_nan, bool row_major, MemoryPool* pool) const { - std::shared_ptr
table = Table::Make(schema(), columns()); std::shared_ptr tensor; ARROW_RETURN_NOT_OK( - internal::TableToTensor(*table, null_to_nan, row_major, pool, &tensor)); + internal::RecordBatchToTensor(*this, null_to_nan, row_major, pool, &tensor)); return tensor; } diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc index b2b18bd510bc..2c9ed5195a6f 100644 --- a/cpp/src/arrow/table.cc +++ b/cpp/src/arrow/table.cc @@ -41,8 +41,8 @@ #include "arrow/type_fwd.h" #include "arrow/type_traits.h" #include "arrow/util/checked_cast.h" -#include "arrow/util/logging_internal.h" #include "arrow/util/logging.h" +#include "arrow/util/logging_internal.h" #include "arrow/util/vector.h" namespace arrow { diff --git a/cpp/src/arrow/tensor.cc b/cpp/src/arrow/tensor.cc index 7d1ef2160799..7e82ba1c6ab5 100644 --- a/cpp/src/arrow/tensor.cc +++ b/cpp/src/arrow/tensor.cc @@ -287,40 +287,60 @@ struct ConvertArrayToTensorRowMajorVisitor { } }; -template -inline void ConvertColumnsToTensor(const Table& table, uint8_t* out, bool row_major) { +template +inline void ConvertColumnsToTensor(const Container& container, uint8_t* out, + bool row_major) { using CType = typename arrow::TypeTraits::CType; auto* out_values = reinterpret_cast(out); - int64_t col_idx = 0; - for (const auto& column : table.columns()) { - int chunk_idx = 0; - for (const auto& chunk : column->chunks()) { + for (int col_idx = 0; col_idx < container.num_columns(); ++col_idx) { + if constexpr (std::is_same_v) { + int chunk_idx = 0; + + for (const auto& chunk : container.column(col_idx)->chunks()) { + if (row_major) { + ConvertArrayToTensorRowMajorVisitor visitor{ + out_values, *chunk->data(), container.num_columns(), col_idx, chunk_idx}; + DCHECK_OK(VisitTypeInline(*chunk->type(), &visitor)); + chunk_idx += chunk->length(); + } else { + ConvertArrayToTensorVisitor visitor{out_values, *chunk->data()}; + DCHECK_OK(VisitTypeInline(*chunk->type(), &visitor)); + } + } + } else if constexpr (std::is_same_v) { + const auto& array_data 
= container.column_data(col_idx); + if (row_major) { ConvertArrayToTensorRowMajorVisitor visitor{ - out_values, *chunk->data(), table.num_columns(), col_idx, chunk_idx}; - DCHECK_OK(VisitTypeInline(*chunk->type(), &visitor)); - chunk_idx += chunk->length(); + out_values, *array_data, container.num_columns(), col_idx, 0}; + DCHECK_OK(VisitTypeInline(*array_data->type, &visitor)); } else { - ConvertArrayToTensorVisitor visitor{out_values, *chunk->data()}; - DCHECK_OK(VisitTypeInline(*chunk->type(), &visitor)); + ConvertArrayToTensorVisitor visitor{out_values, *array_data}; + DCHECK_OK(VisitTypeInline(*array_data->type, &visitor)); } } - col_idx++; } } -Status TableToTensor(const Table& table, bool null_to_nan, bool row_major, - MemoryPool* pool, std::shared_ptr* tensor) { - if (table.num_columns() == 0) { +template +Status ToTensorImpl(const Container& container, bool null_to_nan, bool row_major, + MemoryPool* pool, std::shared_ptr* tensor) { + if (container.num_columns() == 0) { return Status::TypeError( "Conversion to Tensor for Tables or RecordBatches without columns/schema is not " "supported."); } // Check for no validity bitmap of each field // if null_to_nan conversion is set to false - for (int i = 0; i < table.num_columns(); ++i) { - if (table.column(i)->null_count() > 0 && !null_to_nan) { + for (int i = 0; i < container.num_columns(); ++i) { + int64_t null_count; + if constexpr (std::is_same_v) { + null_count = container.column(i)->null_count(); + } else if constexpr (std::is_same_v) { + null_count = container.column_data(i)->GetNullCount(); + } + if (null_count > 0 && !null_to_nan) { return Status::TypeError( "Can only convert a Table or RecordBatch with no nulls. 
Set null_to_nan to " "true to convert nulls to NaN"); @@ -329,12 +349,11 @@ Status TableToTensor(const Table& table, bool null_to_nan, bool row_major, // Check for supported data types and merge fields // to get the resulting uniform data type - if (!is_integer(table.column(0)->type()->id()) && - !is_floating(table.column(0)->type()->id())) { - return Status::TypeError("DataType is not supported: ", - table.column(0)->type()->ToString()); + const auto& col_0_type = container.schema()->field(0)->type(); + if (!is_integer(col_0_type->id()) && !is_floating(col_0_type->id())) { + return Status::TypeError("DataType is not supported: ", col_0_type->ToString()); } - std::shared_ptr result_field = table.schema()->field(0); + std::shared_ptr result_field = container.schema()->field(0); std::shared_ptr result_type = result_field->type(); Field::MergeOptions options; @@ -342,24 +361,25 @@ Status TableToTensor(const Table& table, bool null_to_nan, bool row_major, options.promote_integer_sign = true; options.promote_numeric_width = true; - if (table.num_columns() > 1) { - for (int i = 1; i < table.num_columns(); ++i) { - if (!is_numeric(table.column(i)->type()->id())) { - return Status::TypeError("DataType is not supported: ", - table.column(i)->type()->ToString()); + if (container.num_columns() > 1) { + for (int i = 1; i < container.num_columns(); ++i) { + const auto& col_type = container.schema()->field(i)->type(); + + if (!is_numeric(col_type->id())) { + return Status::TypeError("DataType is not supported: ", col_type->ToString()); } // Casting of float16 is not supported, throw an error in this case - if ((table.column(i)->type()->id() == Type::HALF_FLOAT || + if ((col_type->id() == Type::HALF_FLOAT || result_field->type()->id() == Type::HALF_FLOAT) && - table.column(i)->type()->id() != result_field->type()->id()) { + col_type->id() != result_field->type()->id()) { return Status::NotImplemented("Casting from or to halffloat is not supported."); } ARROW_ASSIGN_OR_RAISE( 
result_field, result_field->MergeWith( - table.schema()->field(i)->WithName(result_field->name()), options)); + container.schema()->field(i)->WithName(result_field->name()), options)); } result_type = result_field->type(); } @@ -374,42 +394,42 @@ Status TableToTensor(const Table& table, bool null_to_nan, bool row_major, } // Allocate memory - ARROW_ASSIGN_OR_RAISE( - std::shared_ptr result, - AllocateBuffer(result_type->bit_width() * table.num_columns() * table.num_rows(), - pool)); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr result, + AllocateBuffer(result_type->bit_width() * + container.num_columns() * container.num_rows(), + pool)); // Copy data switch (result_type->id()) { case Type::UINT8: - ConvertColumnsToTensor(table, result->mutable_data(), row_major); + ConvertColumnsToTensor(container, result->mutable_data(), row_major); break; case Type::UINT16: case Type::HALF_FLOAT: - ConvertColumnsToTensor(table, result->mutable_data(), row_major); + ConvertColumnsToTensor(container, result->mutable_data(), row_major); break; case Type::UINT32: - ConvertColumnsToTensor(table, result->mutable_data(), row_major); + ConvertColumnsToTensor(container, result->mutable_data(), row_major); break; case Type::UINT64: - ConvertColumnsToTensor(table, result->mutable_data(), row_major); + ConvertColumnsToTensor(container, result->mutable_data(), row_major); break; case Type::INT8: - ConvertColumnsToTensor(table, result->mutable_data(), row_major); + ConvertColumnsToTensor(container, result->mutable_data(), row_major); break; case Type::INT16: - ConvertColumnsToTensor(table, result->mutable_data(), row_major); + ConvertColumnsToTensor(container, result->mutable_data(), row_major); break; case Type::INT32: - ConvertColumnsToTensor(table, result->mutable_data(), row_major); + ConvertColumnsToTensor(container, result->mutable_data(), row_major); break; case Type::INT64: - ConvertColumnsToTensor(table, result->mutable_data(), row_major); + ConvertColumnsToTensor(container, 
result->mutable_data(), row_major); break; case Type::FLOAT: - ConvertColumnsToTensor(table, result->mutable_data(), row_major); + ConvertColumnsToTensor(container, result->mutable_data(), row_major); break; case Type::DOUBLE: - ConvertColumnsToTensor(table, result->mutable_data(), row_major); + ConvertColumnsToTensor(container, result->mutable_data(), row_major); break; default: return Status::TypeError("DataType is not supported: ", result_type->ToString()); @@ -418,7 +438,7 @@ Status TableToTensor(const Table& table, bool null_to_nan, bool row_major, // Construct Tensor object const auto& fixed_width_type = internal::checked_cast(*result_type); - std::vector shape = {table.num_rows(), table.num_columns()}; + std::vector shape = {container.num_rows(), container.num_columns()}; std::vector strides; if (row_major) { @@ -433,6 +453,16 @@ Status TableToTensor(const Table& table, bool null_to_nan, bool row_major, return Status::OK(); } +Status TableToTensor(const Table& table, bool null_to_nan, bool row_major, + MemoryPool* pool, std::shared_ptr* tensor) { + return ToTensorImpl(table, null_to_nan, row_major, pool, tensor); +} + +Status RecordBatchToTensor(const RecordBatch& batch, bool null_to_nan, bool row_major, + MemoryPool* pool, std::shared_ptr* tensor) { + return ToTensorImpl(batch, null_to_nan, row_major, pool, tensor); +} + } // namespace internal /// Constructor with strides and dimension names diff --git a/cpp/src/arrow/tensor.h b/cpp/src/arrow/tensor.h index b1c98bf733c7..1300003c2985 100644 --- a/cpp/src/arrow/tensor.h +++ b/cpp/src/arrow/tensor.h @@ -81,6 +81,10 @@ ARROW_EXPORT Status TableToTensor(const Table& table, bool null_to_nan, bool row_major, MemoryPool* pool, std::shared_ptr* tensor); +ARROW_EXPORT +Status RecordBatchToTensor(const RecordBatch& batch, bool null_to_nan, bool row_major, + MemoryPool* pool, std::shared_ptr* tensor); + } // namespace internal class ARROW_EXPORT Tensor { From 81baf3011bbcc1f8a2438aaff1eec9abd25becc2 Mon Sep 17 
00:00:00 2001 From: AlenkaF Date: Thu, 16 Apr 2026 16:09:31 +0200 Subject: [PATCH 20/23] Fix missing int type change --- cpp/src/arrow/tensor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/tensor.cc b/cpp/src/arrow/tensor.cc index 7e82ba1c6ab5..c7bb049a1fd0 100644 --- a/cpp/src/arrow/tensor.cc +++ b/cpp/src/arrow/tensor.cc @@ -295,7 +295,7 @@ inline void ConvertColumnsToTensor(const Container& container, uint8_t* out, for (int col_idx = 0; col_idx < container.num_columns(); ++col_idx) { if constexpr (std::is_same_v) { - int chunk_idx = 0; + int64_t chunk_idx = 0; for (const auto& chunk : container.column(col_idx)->chunks()) { if (row_major) { From e1cbc851dc718064837bbbf2473ec825c9d70ed2 Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Thu, 16 Apr 2026 16:23:23 +0200 Subject: [PATCH 21/23] Add numpy test markers --- python/pyarrow/tests/test_table.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index 7158c303e53d..a3d2c7aeda1f 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -1079,6 +1079,7 @@ def test_recordbatch_to_tensor_uniform_float_16(): check_tensors(result, expected, pa.float16(), 27) +@pytest.mark.numpy @pytest.mark.parametrize( ('cls'), [ @@ -1275,6 +1276,7 @@ def test_recordbatch_to_tensor_unsupported(): batch.to_tensor() +@pytest.mark.numpy @pytest.mark.parametrize('typ', [ np.uint8, np.uint16, np.uint32, np.uint64, np.int8, np.int16, np.int32, np.int64, From 065185d99852915b1b7f77189b12a05cfca21d3a Mon Sep 17 00:00:00 2001 From: AlenkaF Date: Fri, 17 Apr 2026 06:00:42 +0200 Subject: [PATCH 22/23] Parametrize with string --- python/pyarrow/tests/test_table.py | 40 +++++++++++++++--------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index a3d2c7aeda1f..57d6faa677bf 100644 --- 
a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -1277,20 +1277,20 @@ def test_recordbatch_to_tensor_unsupported(): @pytest.mark.numpy -@pytest.mark.parametrize('typ', [ - np.uint8, np.uint16, np.uint32, np.uint64, - np.int8, np.int16, np.int32, np.int64, - np.float32, np.float64, +@pytest.mark.parametrize('typ_str', [ + "uint8", "uint16", "uint32", "uint64", + "int8", "int16", "int32", "int64", + "float32", "float64", ]) -def test_table_to_tensor_uniform_type(typ): +def test_table_to_tensor_uniform_type(typ_str): arr1 = [[1, 2, 3], [4, 5, 6, 7, 8, 9]] arr2 = [[10, 20], [30, 40, 50, 60, 70, 80, 90]] arr3 = [[100, 100, 100, 100, 100, 100], [100, 100, 100]] table = pa.Table.from_arrays( [ - pa.chunked_array(arr1, type=pa.from_numpy_dtype(typ)), - pa.chunked_array(arr2, type=pa.from_numpy_dtype(typ)), - pa.chunked_array(arr3, type=pa.from_numpy_dtype(typ)), + pa.chunked_array(arr1, type=pa.from_numpy_dtype(typ_str)), + pa.chunked_array(arr2, type=pa.from_numpy_dtype(typ_str)), + pa.chunked_array(arr3, type=pa.from_numpy_dtype(typ_str)), ], ["a", "b", "c"] ) @@ -1299,14 +1299,14 @@ def test_table_to_tensor_uniform_type(typ): arr3_f = [100, 100, 100, 100, 100, 100, 100, 100, 100] result = table.to_tensor(row_major=False) - x = np.column_stack([arr1_f, arr2_f, arr3_f]).astype(typ, order="F") + x = np.column_stack([arr1_f, arr2_f, arr3_f]).astype(typ_str, order="F") expected = pa.Tensor.from_numpy(x) - check_tensors(result, expected, pa.from_numpy_dtype(typ), 27) + check_tensors(result, expected, pa.from_numpy_dtype(typ_str), 27) result = table.to_tensor() - x = np.column_stack([arr1_f, arr2_f, arr3_f]).astype(typ, order="C") + x = np.column_stack([arr1_f, arr2_f, arr3_f]).astype(typ_str, order="C") expected = pa.Tensor.from_numpy(x) - check_tensors(result, expected, pa.from_numpy_dtype(typ), 27) + check_tensors(result, expected, pa.from_numpy_dtype(typ_str), 27) # Test offset table1 = table.slice(1) @@ -1315,14 +1315,14 @@ def 
test_table_to_tensor_uniform_type(typ): arr3_f = [100, 100, 100, 100, 100, 100, 100, 100] result = table1.to_tensor(row_major=False) - x = np.column_stack([arr1_f, arr2_f, arr3_f]).astype(typ, order="F") + x = np.column_stack([arr1_f, arr2_f, arr3_f]).astype(typ_str, order="F") expected = pa.Tensor.from_numpy(x) - check_tensors(result, expected, pa.from_numpy_dtype(typ), 24) + check_tensors(result, expected, pa.from_numpy_dtype(typ_str), 24) result = table1.to_tensor() - x = np.column_stack([arr1_f, arr2_f, arr3_f]).astype(typ, order="C") + x = np.column_stack([arr1_f, arr2_f, arr3_f]).astype(typ_str, order="C") expected = pa.Tensor.from_numpy(x) - check_tensors(result, expected, pa.from_numpy_dtype(typ), 24) + check_tensors(result, expected, pa.from_numpy_dtype(typ_str), 24) table2 = table.slice(1, 5) arr1_f = [2, 3, 4, 5, 6] @@ -1330,14 +1330,14 @@ def test_table_to_tensor_uniform_type(typ): arr3_f = [100, 100, 100, 100, 100] result = table2.to_tensor(row_major=False) - x = np.column_stack([arr1_f, arr2_f, arr3_f]).astype(typ, order="F") + x = np.column_stack([arr1_f, arr2_f, arr3_f]).astype(typ_str, order="F") expected = pa.Tensor.from_numpy(x) - check_tensors(result, expected, pa.from_numpy_dtype(typ), 15) + check_tensors(result, expected, pa.from_numpy_dtype(typ_str), 15) result = table2.to_tensor() - x = np.column_stack([arr1_f, arr2_f, arr3_f]).astype(typ, order="C") + x = np.column_stack([arr1_f, arr2_f, arr3_f]).astype(typ_str, order="C") expected = pa.Tensor.from_numpy(x) - check_tensors(result, expected, pa.from_numpy_dtype(typ), 15) + check_tensors(result, expected, pa.from_numpy_dtype(typ_str), 15) def _table_like_slice_tests(factory): From 5d370894ce5f9a557f1787b35ea2464b8005def7 Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Thu, 30 Apr 2026 06:50:38 +0200 Subject: [PATCH 23/23] Apply suggestions from code review Co-authored-by: tadeja --- cpp/src/arrow/table_test.cc | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git 
a/cpp/src/arrow/table_test.cc b/cpp/src/arrow/table_test.cc index c19541a14925..d3522b282df3 100644 --- a/cpp/src/arrow/table_test.cc +++ b/cpp/src/arrow/table_test.cc @@ -932,8 +932,7 @@ TYPED_TEST_P(TestTableToTensorColumnMajor, SupportedTypes) { shape_sliced, f_strides_sliced); EXPECT_TRUE(tensor_expected_sliced->Equals(*tensor_sliced)); - CheckTableToTensor(tensor_expected_sliced, 24, shape_sliced, - f_strides_sliced); + CheckTableToTensor(tensor_sliced, 24, shape_sliced, f_strides_sliced); auto table_slice_1 = table->Slice(1, 5); @@ -950,8 +949,7 @@ TYPED_TEST_P(TestTableToTensorColumnMajor, SupportedTypes) { shape_sliced_1, f_strides_sliced_1); EXPECT_TRUE(tensor_expected_sliced_1->Equals(*tensor_sliced_1)); - CheckTableToTensor(tensor_expected_sliced_1, 15, shape_sliced_1, - f_strides_sliced_1); + CheckTableToTensor(tensor_sliced_1, 15, shape_sliced_1, f_strides_sliced_1); } REGISTER_TYPED_TEST_SUITE_P(TestTableToTensorColumnMajor, SupportedTypes);