Skip to content
5 changes: 4 additions & 1 deletion cpp/src/arrow/compute/kernels/hash_aggregate.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <functional>
#include <memory>
#include <string>
#include <type_traits>
#include <vector>

#include "arrow/array/builder_nested.h"
Expand Down Expand Up @@ -277,8 +278,10 @@ template <typename T>
concept CBooleanConcept = std::same_as<T, bool>;

// XXX: Ideally we want to have std::floating_point<Float16> = true.
// Note: Using std::is_floating_point_v instead of std::floating_point concept
// for compatibility with older compilers (e.g., Apple Clang 14.0.0)
template <typename T>
concept CFloatingPointConcept = std::floating_point<T> || std::same_as<T, util::Float16>;
concept CFloatingPointConcept = std::is_floating_point_v<T> || std::same_as<T, util::Float16>;

template <typename T>
concept CDecimalConcept = std::same_as<T, Decimal32> || std::same_as<T, Decimal64> ||
Expand Down
88 changes: 87 additions & 1 deletion cpp/src/arrow/json/parser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@

#include "arrow/json/parser.h"

#include <cctype>
#include <functional>
#include <limits>
#include <memory>
#include <string>
#include <string_view>
#include <tuple>
#include <unordered_map>
Expand Down Expand Up @@ -654,7 +656,8 @@ class HandlerBase : public BlockParser,
: BlockParser(pool),
builder_set_(pool),
field_index_(-1),
scalar_values_builder_(pool) {}
scalar_values_builder_(pool),
explicit_schema_(nullptr) {}

/// Retrieve a pointer to a builder from a BuilderPtr
template <Kind::type kind>
Expand All @@ -679,6 +682,15 @@ class HandlerBase : public BlockParser,
bool Bool(bool value) {
constexpr auto kind = Kind::kBoolean;
if (ARROW_PREDICT_FALSE(builder_.kind != kind)) {
// When explicit schema is provided, try to convert the value
if (explicit_schema_ != nullptr) {
std::string bool_str = value ? "true" : "false";
status_ = TryConvertAndAppend(kind, builder_, bool_str);
if (status_.ok()) {
return true;
}
// If conversion failed, fall through to error
}
status_ = IllegallyChangedTo(kind);
return status_.ok();
}
Expand Down Expand Up @@ -729,6 +741,7 @@ class HandlerBase : public BlockParser,

/// \brief Set up builders using an expected Schema
Status Initialize(const std::shared_ptr<Schema>& s) {
explicit_schema_ = s;
auto type = struct_({});
if (s) {
type = struct_(s->fields());
Expand Down Expand Up @@ -808,6 +821,14 @@ class HandlerBase : public BlockParser,
template <Kind::type kind>
Status AppendScalar(BuilderPtr builder, std::string_view scalar) {
if (ARROW_PREDICT_FALSE(builder.kind != kind)) {
// When explicit schema is provided, try to convert the value
if (explicit_schema_ != nullptr) {
Status convert_status = TryConvertAndAppend(kind, builder, scalar);
if (convert_status.ok()) {
return Status::OK();
}
// If conversion failed, fall through to error
}
return IllegallyChangedTo(kind);
}
auto index = static_cast<int32_t>(scalar_values_builder_.length());
Expand Down Expand Up @@ -918,6 +939,69 @@ class HandlerBase : public BlockParser,
" to ", Kind::Name(illegally_changed_to), " in row ", num_rows_);
}

/// \brief Return true if `text` is a plain decimal numeric literal
///
/// Accepted shape: optional sign, digits with an optional fractional part
/// (at least one digit overall), optional exponent with at least one digit.
/// Anything else (empty input, letters, embedded whitespace, "nan", ...) is
/// rejected so that non-numeric strings are never stored in a number column.
static bool LooksLikeNumber(std::string_view text) {
  const size_t size = text.size();
  size_t pos = 0;
  const auto skip_digits = [&]() {
    size_t count = 0;
    while (pos < size && text[pos] >= '0' && text[pos] <= '9') {
      ++pos;
      ++count;
    }
    return count;
  };

  if (pos < size && (text[pos] == '-' || text[pos] == '+')) {
    ++pos;
  }
  size_t mantissa_digits = skip_digits();
  if (pos < size && text[pos] == '.') {
    ++pos;
    mantissa_digits += skip_digits();
  }
  if (mantissa_digits == 0) {
    return false;
  }
  if (pos < size && (text[pos] == 'e' || text[pos] == 'E')) {
    ++pos;
    if (pos < size && (text[pos] == '-' || text[pos] == '+')) {
      ++pos;
    }
    if (skip_digits() == 0) {
      return false;
    }
  }
  // Trailing garbage (e.g. "12abc") means this is not a number.
  return pos == size;
}

/// \brief Return true if a numeric literal denotes zero
///
/// Only the mantissa matters: "0", "-0", "0.00" and "0e5" are all zero, while
/// any non-zero digit before the exponent marker makes the value non-zero.
static bool IsZeroLiteral(std::string_view number) {
  for (const char c : number) {
    if (c == 'e' || c == 'E') {
      break;
    }
    if (c >= '1' && c <= '9') {
      return false;
    }
  }
  return true;
}

/// \brief Try to coerce a JSON scalar to the kind expected by `builder`
///
/// \param json_kind the kind of the value actually found in the JSON input
/// \param builder the builder whose kind the value must be converted to
/// \param scalar the textual form of the JSON value
/// \return OK if the value was converted and appended; Invalid if the
/// (json_kind, builder.kind) pair is unsupported or the text cannot be
/// interpreted as the target kind.
///
/// Every AppendScalar<K> call below is made only where builder.kind == K, so
/// AppendScalar's own kind-mismatch branch is not re-entered from here.
Status TryConvertAndAppend(Kind::type json_kind, BuilderPtr builder,
                           std::string_view scalar) {
  switch (builder.kind) {
    case Kind::kString: {
      if (json_kind == Kind::kNumber) {
        // The number is already available as text, so it can be stored verbatim.
        return AppendScalar<Kind::kString>(builder, scalar);
      }
      if (json_kind == Kind::kBoolean) {
        const std::string_view bool_str =
            (scalar == "true" || scalar == "1") ? "true" : "false";
        return AppendScalar<Kind::kString>(builder, bool_str);
      }
      break;
    }
    case Kind::kNumber: {
      if (json_kind == Kind::kString) {
        // Only strings that actually look numeric may be stored as numbers;
        // anything else falls through to the "cannot convert" error below.
        if (LooksLikeNumber(scalar)) {
          return AppendScalar<Kind::kNumber>(builder, scalar);
        }
      } else if (json_kind == Kind::kBoolean) {
        // true -> 1, false -> 0
        const std::string_view num_str =
            (scalar == "true" || scalar == "1") ? "1" : "0";
        return AppendScalar<Kind::kNumber>(builder, num_str);
      }
      break;
    }
    case Kind::kBoolean: {
      // NOTE(review): this routes boolean values through AppendScalar, i.e. the
      // textual scalar storage; confirm the boolean builder supports that path.
      if (json_kind == Kind::kNumber) {
        // Zero (in any spelling) -> false, everything else -> true
        const std::string_view bool_str = IsZeroLiteral(scalar) ? "false" : "true";
        return AppendScalar<Kind::kBoolean>(builder, bool_str);
      }
      if (json_kind == Kind::kString) {
        // Case-insensitive match against the accepted boolean spellings.
        std::string lowered(scalar);
        for (char& c : lowered) {
          c = static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
        }
        if (lowered == "true" || lowered == "1" || lowered == "yes") {
          return AppendScalar<Kind::kBoolean>(builder, "true");
        }
        if (lowered == "false" || lowered == "0" || lowered == "no") {
          return AppendScalar<Kind::kBoolean>(builder, "false");
        }
      }
      break;
    }
    default:
      break;
  }
  // Conversion not supported
  return Status::Invalid("Cannot convert ", Kind::Name(json_kind), " to ",
                         Kind::Name(builder.kind));
}

/// Reserve storage for scalars, these can occupy almost all of the JSON buffer
Status ReserveScalarStorage(int64_t size) override {
auto available_storage = scalar_values_builder_.value_data_capacity() -
Expand All @@ -941,6 +1025,8 @@ class HandlerBase : public BlockParser,
// top of this stack == field_index_
std::vector<int> field_index_stack_;
StringBuilder scalar_values_builder_;
// Store explicit schema for type conversion
std::shared_ptr<Schema> explicit_schema_;
};

template <UnexpectedFieldBehavior>
Expand Down
26 changes: 26 additions & 0 deletions cpp/src/arrow/json/parser_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,32 @@ TEST_P(BlockParserTypeError, FailOnInconvertible) {
"JSON parse error: Column(/a) changed from number to boolean in row 1"));
}

TEST_P(BlockParserTypeError, AllowNumberToStringConversion) {
  // With an explicit utf8 schema, the bare number 456 in the second row must be
  // accepted (converted to the string "456") instead of failing the parse.
  std::shared_ptr<Array> result;
  auto parse_options = Options(schema({field("a", utf8())}));
  ASSERT_OK(ParseFromString(parse_options, "{\"a\":\"123\"}\n{\"a\":456}", &result));

  // Both rows are expected to be present in column "a".
  const auto rows = std::static_pointer_cast<StructArray>(result);
  ASSERT_NE(rows, nullptr);
  const auto column = rows->GetFieldByName("a");
  ASSERT_NE(column, nullptr);
  ASSERT_EQ(column->length(), 2);
}

TEST_P(BlockParserTypeError, AllowStringToNumberConversion) {
  // With an explicit int64 schema, the quoted value "456" in the second row must
  // be accepted (converted to the number 456) instead of failing the parse.
  std::shared_ptr<Array> result;
  auto parse_options = Options(schema({field("a", int64())}));
  ASSERT_OK(ParseFromString(parse_options, "{\"a\":123}\n{\"a\":\"456\"}", &result));

  // Both rows are expected to be present in column "a".
  const auto rows = std::static_pointer_cast<StructArray>(result);
  ASSERT_NE(rows, nullptr);
  const auto column = rows->GetFieldByName("a");
  ASSERT_NE(column, nullptr);
  ASSERT_EQ(column->length(), 2);
}

TEST_P(BlockParserTypeError, FailOnNestedInconvertible) {
auto options = Options(schema({field("a", list(struct_({field("b", int32())})))}));
std::shared_ptr<Array> parsed;
Expand Down
2 changes: 1 addition & 1 deletion dev/release/download_rc_binaries.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def _download_url(self, url, dest_path, *, extra_args=None):
os.remove(dest_path)
except IOError:
pass
if "OpenSSL" not in stderr:
if b"OpenSSL" not in stderr:
# We assume curl has already retried on other errors.
break
else:
Expand Down
2 changes: 1 addition & 1 deletion docs/source/_static/versions.json
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,6 @@
{
"name": "1.0",
"version": "1.0/",
"url": "https://arrow.apache.org/docs/dev/"
"url": "https://arrow.apache.org/docs/1.0/"
}
]
7 changes: 5 additions & 2 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -1136,7 +1136,7 @@ cdef class Array(_PandasConvertible):
result = self.ap.Diff(deref(other.ap))
return frombytes(result, safe=True)

def cast(self, object target_type=None, safe=None, options=None, memory_pool=None):
def cast(self, object target_type=None, safe=None, options=None, memory_pool=None, *, errors='raise'):
"""
Cast array values to another data type

Expand All @@ -1152,14 +1152,17 @@ cdef class Array(_PandasConvertible):
Additional checks pass by CastOptions
memory_pool : MemoryPool, optional
memory pool to use for allocations during function execution.
errors : str, default 'raise'
What to do if a value cannot be casted to the target type.
'raise' will raise an error, 'coerce' will produce a null.

Returns
-------
cast : Array
"""
self._assert_cpu()
return _pc().cast(self, target_type, safe=safe,
options=options, memory_pool=memory_pool)
options=options, memory_pool=memory_pool, errors=errors)

def view(self, object target_type):
"""
Expand Down
90 changes: 81 additions & 9 deletions python/pyarrow/compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,8 @@ def _make_global_functions():
utf8_zfill = utf8_zero_fill = globals()["utf8_zero_fill"]


def cast(arr, target_type=None, safe=None, options=None, memory_pool=None):
def cast(arr, target_type=None, safe=None, options=None, memory_pool=None, *,
errors='raise'):
"""
Cast array values to another data type. Can also be invoked as an array
instance method.
Expand All @@ -357,10 +358,11 @@ def cast(arr, target_type=None, safe=None, options=None, memory_pool=None):
Type to cast to
safe : bool, default True
Check for overflows or other unsafe conversions
options : CastOptions, default None
Additional checks pass by CastOptions
memory_pool : MemoryPool, optional
memory pool to use for allocations during function execution.
errors : str, default 'raise'
What to do if a value cannot be casted to the target type.
'raise' will raise an error, 'coerce' will produce a null.

Examples
--------
Expand Down Expand Up @@ -394,26 +396,96 @@ def cast(arr, target_type=None, safe=None, options=None, memory_pool=None):
>>> arr.cast('timestamp[ms]').type
TimestampType(timestamp[ms])

Use ``errors='coerce'`` to convert invalid values to null instead of
raising an error:

>>> arr = pa.array(["1.2", "3", "10-20", None, "nan", ""])
>>> cast(arr, pa.float64(), errors='coerce')
<pyarrow.lib.DoubleArray object at ...>
[
1.2,
3.0,
null,
null,
nan,
null
]

Returns
-------
casted : Array
The cast result as a new Array
"""
safe_vars_passed = (safe is not None) or (target_type is not None)

if safe_vars_passed and (options is not None):
raise ValueError("Must either pass values for 'target_type' and 'safe'"
" or pass a value for 'options'")

# Validate parameter combinations
if target_type is not None and options is not None:
raise ValueError("Must either pass 'target_type' (and optionally 'safe') "
"or pass 'options', but not both")

if options is None:
if target_type is None:
raise ValueError("Must provide either 'target_type' or 'options'")
target_type = pa.types.lib.ensure_type(target_type)
if safe is False:
options = CastOptions.unsafe(target_type)
else:
options = CastOptions.safe(target_type)

# Apply errors parameter regardless of whether options was provided
if errors == 'coerce':
options.null_on_error = True
elif errors == 'raise':
options.null_on_error = False
else:
raise ValueError("errors must be either 'raise' or 'coerce'")

return call_function("cast", [arr], options, memory_pool)


def is_castable(arr, target_type=None, options=None, memory_pool=None):
    """
    Check if values can be cast to another data type.

    Returns true for each value that can be successfully cast to the
    target type.

    Parameters
    ----------
    arr : Array-like
    target_type : DataType or str, optional
        The PyArrow type to check castability to.
    options : CastOptions, optional
        Casting options. If passed, 'target_type' must be None.
    memory_pool : MemoryPool, optional
        If not passed, will allocate memory from the default memory pool.

    Returns
    -------
    is_castable : Array
        A boolean array

    Raises
    ------
    ValueError
        If both 'target_type' and 'options' are passed, or if neither is.

    Examples
    --------
    >>> import pyarrow as pa
    >>> import pyarrow.compute as pc
    >>> arr = pa.array(["1.1", "2.2", "abc", "4.4"])
    >>> pc.is_castable(arr, pa.float64())
    <pyarrow.lib.BooleanArray object at ...>
    [
      true,
      true,
      false,
      true
    ]
    """
    if target_type is not None and options is not None:
        raise ValueError("Must either pass 'target_type' or 'options'")

    if options is None:
        # Fail early with a clear message instead of handing None to the
        # type-resolution helper (mirrors the check performed by cast()).
        if target_type is None:
            raise ValueError("Must provide either 'target_type' or 'options'")
        target_type = pa.types.lib.ensure_type(target_type)
        options = CastOptions.safe(target_type)

    return call_function("is_castable", [arr], options, memory_pool)


def index(data, value, start=None, end=None, *, memory_pool=None):
"""
Find the index of the first occurrence of a given value.
Expand Down
7 changes: 5 additions & 2 deletions python/pyarrow/scalar.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ cdef class Scalar(_Weakrefable):
"""
return self.wrapped.get().is_valid

def cast(self, object target_type=None, safe=None, options=None, memory_pool=None):
def cast(self, object target_type=None, safe=None, options=None, memory_pool=None, *, errors='raise'):
"""
Cast scalar value to another data type.

Expand All @@ -86,13 +86,16 @@ cdef class Scalar(_Weakrefable):
Additional checks pass by CastOptions
memory_pool : MemoryPool, optional
memory pool to use for allocations during function execution.
errors : str, default 'raise'
What to do if a value cannot be casted to the target type.
'raise' will raise an error, 'coerce' will produce a null.

Returns
-------
scalar : A Scalar of the given target data type.
"""
return _pc().cast(self, target_type, safe=safe,
options=options, memory_pool=memory_pool)
options=options, memory_pool=memory_pool, errors=errors)

def validate(self, *, full=False):
"""
Expand Down
Loading
Loading