Skip to content

Commit d8beb19

Browse files
committed
GH-49753: [C++][Gandiva] Added a helper function for concat_ws
Added a helper function, concat_ws_impl, that takes a std::initializer_list of WordArg entries (data, len, valid), which allows any number of words to be passed from the concat_ws_* functions without changing their original functionality or public APIs. Removed the redundant per-arity code. Also tested for regressions.
1 parent 72b7a85 commit d8beb19

1 file changed

Lines changed: 71 additions & 209 deletions

File tree

cpp/src/gandiva/precompiled/string_ops.cc

Lines changed: 71 additions & 209 deletions
Original file line numberDiff line numberDiff line change
@@ -2460,18 +2460,18 @@ struct SafeLengthState {
24602460
};
24612461

24622462
// Helper to safely add a word length
2463-
static inline bool safe_accumulate_word(SafeLengthState* state, int32_t word_len,
2463+
static inline bool safe_accumulate_word(SafeLengthState& state, int32_t word_len,
24642464
bool word_validity) {
2465-
if (!word_validity) return true;
2465+
if (not word_validity) return true;
24662466

24672467
int32_t temp = 0;
24682468
if (ARROW_PREDICT_FALSE(
2469-
arrow::internal::AddWithOverflow(state->total_len, word_len, &temp))) {
2470-
state->overflow = true;
2469+
arrow::internal::AddWithOverflow(state.total_len, word_len, &temp))) {
2470+
state.overflow = true;
24712471
return false;
24722472
}
2473-
state->total_len = temp;
2474-
state->num_valid++;
2473+
state.total_len = temp;
2474+
state.num_valid++;
24752475
return true;
24762476
}
24772477

@@ -2512,121 +2512,97 @@ static inline const char* handle_empty_result(bool* out_valid, int32_t* out_len)
25122512
return "";
25132513
}
25142514

2515-
FORCE_INLINE
2516-
const char* concat_ws_utf8_utf8(int64_t context, const char* separator,
2517-
int32_t separator_len, bool separator_validity,
2518-
const char* word1, int32_t word1_len, bool word1_validity,
2519-
const char* word2, int32_t word2_len, bool word2_validity,
2520-
bool* out_valid, int32_t* out_len) {
2515+
struct WordArg {
2516+
const char* data;
2517+
int32_t len;
2518+
bool valid;
2519+
};
2520+
2521+
static inline const char* concat_ws_impl(int64_t context, const char* separator,
2522+
int32_t separator_len, bool separator_validity,
2523+
bool* out_valid, int32_t* out_len,
2524+
std::initializer_list<WordArg> words) {
25212525
*out_len = 0;
2522-
// If separator is null, always return null
2523-
if (!separator_validity) {
2524-
*out_len = 0;
2526+
2527+
// Separator validity check
2528+
if (not separator_validity) {
25252529
*out_valid = false;
25262530
return "";
25272531
}
25282532

2529-
// If separator is null, always return null
2530-
if (!separator_validity) {
2531-
return handle_overflow_failure(out_valid, out_len);
2532-
}
2533-
25342533
SafeLengthState state;
25352534

2536-
// Accumulate word lengths safely
2537-
safe_accumulate_word(&state, word1_len, word1_validity);
2538-
safe_accumulate_word(&state, word2_len, word2_validity);
2539-
2540-
if (state.overflow) {
2541-
return handle_overflow_failure(out_valid, out_len);
2535+
// Accumulate all word lengths safely
2536+
for (const WordArg& w : words) {
2537+
safe_accumulate_word(state, w.len, w.valid);
2538+
if (state.overflow) {
2539+
*out_valid = false;
2540+
*out_len = 0;
2541+
return "";
2542+
}
25422543
}
25432544

25442545
// Add separator lengths
2545-
if (!safe_add_separators(&state, separator_len)) {
2546-
return handle_overflow_failure(out_valid, out_len);
2546+
if (not safe_add_separators(&state, separator_len)) {
2547+
*out_valid = false;
2548+
*out_len = 0;
2549+
return "";
25472550
}
25482551

2549-
// Handle case with no valid words
2552+
// Empty result
25502553
if (state.total_len == 0) {
2551-
return handle_empty_result(out_valid, out_len);
2554+
*out_valid = true;
2555+
*out_len = 0;
2556+
return "";
25522557
}
25532558

2554-
// Allocate and concatenate
2559+
// Allocate memory
25552560
char* out =
25562561
reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, state.total_len));
25572562
if (out == nullptr) {
25582563
gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
2559-
*out_len = 0;
25602564
*out_valid = false;
2565+
*out_len = 0;
25612566
return "";
25622567
}
25632568

2569+
// Concatenate all words
25642570
char* tmp = out;
25652571
int out_idx = 0;
25662572
bool seenAnyValidInput = false;
25672573

2568-
concat_word(tmp, &out_idx, word1, word1_len, word1_validity, separator, separator_len,
2569-
&seenAnyValidInput);
2570-
concat_word(tmp, &out_idx, word2, word2_len, word2_validity, separator, separator_len,
2571-
&seenAnyValidInput);
2574+
for (const WordArg& w : words) {
2575+
concat_word(tmp, &out_idx, w.data, w.len, w.valid, separator, separator_len,
2576+
&seenAnyValidInput);
2577+
}
25722578

25732579
*out_valid = true;
25742580
*out_len = out_idx;
25752581
return out;
25762582
}
25772583

2584+
FORCE_INLINE
2585+
const char* concat_ws_utf8_utf8(int64_t context, const char* separator,
2586+
int32_t separator_len, bool separator_validity,
2587+
const char* word1, int32_t word1_len, bool word1_validity,
2588+
const char* word2, int32_t word2_len, bool word2_validity,
2589+
bool* out_valid, int32_t* out_len) {
2590+
return concat_ws_impl(
2591+
context, separator, separator_len, separator_validity, out_valid, out_len,
2592+
{{word1, word1_len, word1_validity}, {word2, word2_len, word2_validity}});
2593+
}
2594+
25782595
FORCE_INLINE
25792596
const char* concat_ws_utf8_utf8_utf8(
25802597
int64_t context, const char* separator, int32_t separator_len,
25812598
bool separator_validity, const char* word1, int32_t word1_len, bool word1_validity,
25822599
const char* word2, int32_t word2_len, bool word2_validity, const char* word3,
25832600
int32_t word3_len, bool word3_validity, bool* out_valid, int32_t* out_len) {
2584-
*out_len = 0;
2585-
if (!separator_validity) {
2586-
return handle_overflow_failure(out_valid, out_len);
2587-
}
2588-
2589-
SafeLengthState state;
2590-
2591-
safe_accumulate_word(&state, word1_len, word1_validity);
2592-
safe_accumulate_word(&state, word2_len, word2_validity);
2593-
safe_accumulate_word(&state, word3_len, word3_validity);
2594-
2595-
if (state.overflow) {
2596-
return handle_overflow_failure(out_valid, out_len);
2597-
}
2598-
2599-
if (!safe_add_separators(&state, separator_len)) {
2600-
return handle_overflow_failure(out_valid, out_len);
2601-
}
2602-
2603-
if (state.total_len == 0) {
2604-
return handle_empty_result(out_valid, out_len);
2605-
}
2606-
2607-
char* out =
2608-
reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, state.total_len));
2609-
if (out == nullptr) {
2610-
gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
2611-
*out_len = 0;
2612-
*out_valid = false;
2613-
return "";
2614-
}
2615-
2616-
char* tmp = out;
2617-
int out_idx = 0;
2618-
bool seenAnyValidInput = false;
2619-
2620-
concat_word(tmp, &out_idx, word1, word1_len, word1_validity, separator, separator_len,
2621-
&seenAnyValidInput);
2622-
concat_word(tmp, &out_idx, word2, word2_len, word2_validity, separator, separator_len,
2623-
&seenAnyValidInput);
2624-
concat_word(tmp, &out_idx, word3, word3_len, word3_validity, separator, separator_len,
2625-
&seenAnyValidInput);
2626-
2627-
*out_valid = true;
2628-
*out_len = out_idx;
2629-
return out;
2601+
return concat_ws_impl(context, separator, separator_len, separator_validity, out_valid,
2602+
out_len,
2603+
{{word1, word1_len, word1_validity},
2604+
{word2, word2_len, word2_validity},
2605+
{word3, word3_len, word3_validity}});
26302606
}
26312607

26322608
FORCE_INLINE
@@ -2636,68 +2612,12 @@ const char* concat_ws_utf8_utf8_utf8_utf8(
26362612
const char* word2, int32_t word2_len, bool word2_validity, const char* word3,
26372613
int32_t word3_len, bool word3_validity, const char* word4, int32_t word4_len,
26382614
bool word4_validity, bool* out_valid, int32_t* out_len) {
2639-
*out_len = 0;
2640-
// If separator is null, always return null
2641-
if (!separator_validity) {
2642-
*out_len = 0;
2643-
*out_valid = false;
2644-
return "";
2645-
}
2646-
2647-
SafeLengthState state;
2648-
2649-
// Accumulate all word lengths with overflow checking
2650-
safe_accumulate_word(&state, word1_len, word1_validity);
2651-
safe_accumulate_word(&state, word2_len, word2_validity);
2652-
safe_accumulate_word(&state, word3_len, word3_validity);
2653-
safe_accumulate_word(&state, word4_len, word4_validity);
2654-
2655-
if (state.overflow) {
2656-
*out_len = 0;
2657-
*out_valid = false;
2658-
return "";
2659-
}
2660-
2661-
// Add separator lengths with overflow checking
2662-
if (!safe_add_separators(&state, separator_len)) {
2663-
*out_len = 0;
2664-
*out_valid = false;
2665-
return "";
2666-
}
2667-
2668-
// Handle case with no valid words
2669-
if (state.total_len == 0) {
2670-
*out_len = 0;
2671-
*out_valid = true;
2672-
return "";
2673-
}
2674-
2675-
// Allocate memory
2676-
char* out =
2677-
reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, state.total_len));
2678-
if (out == nullptr) {
2679-
gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
2680-
*out_valid = false;
2681-
*out_len = 0;
2682-
return "";
2683-
}
2684-
2685-
char* tmp = out;
2686-
int out_idx = 0;
2687-
bool seenAnyValidInput = false;
2688-
2689-
concat_word(tmp, &out_idx, word1, word1_len, word1_validity, separator, separator_len,
2690-
&seenAnyValidInput);
2691-
concat_word(tmp, &out_idx, word2, word2_len, word2_validity, separator, separator_len,
2692-
&seenAnyValidInput);
2693-
concat_word(tmp, &out_idx, word3, word3_len, word3_validity, separator, separator_len,
2694-
&seenAnyValidInput);
2695-
concat_word(tmp, &out_idx, word4, word4_len, word4_validity, separator, separator_len,
2696-
&seenAnyValidInput);
2697-
2698-
*out_valid = true;
2699-
*out_len = out_idx;
2700-
return out;
2615+
return concat_ws_impl(context, separator, separator_len, separator_validity, out_valid,
2616+
out_len,
2617+
{{word1, word1_len, word1_validity},
2618+
{word2, word2_len, word2_validity},
2619+
{word3, word3_len, word3_validity},
2620+
{word4, word4_len, word4_validity}});
27012621
}
27022622

27032623
FORCE_INLINE
@@ -2708,71 +2628,13 @@ const char* concat_ws_utf8_utf8_utf8_utf8_utf8(
27082628
int32_t word3_len, bool word3_validity, const char* word4, int32_t word4_len,
27092629
bool word4_validity, const char* word5, int32_t word5_len, bool word5_validity,
27102630
bool* out_valid, int32_t* out_len) {
2711-
*out_len = 0;
2712-
// If separator is null, always return null
2713-
if (!separator_validity) {
2714-
*out_len = 0;
2715-
*out_valid = false;
2716-
return "";
2717-
}
2718-
2719-
SafeLengthState state;
2720-
2721-
// Accumulate all word lengths with overflow checking
2722-
safe_accumulate_word(&state, word1_len, word1_validity);
2723-
safe_accumulate_word(&state, word2_len, word2_validity);
2724-
safe_accumulate_word(&state, word3_len, word3_validity);
2725-
safe_accumulate_word(&state, word4_len, word4_validity);
2726-
safe_accumulate_word(&state, word5_len, word5_validity);
2727-
2728-
if (state.overflow) {
2729-
*out_len = 0;
2730-
*out_valid = false;
2731-
return "";
2732-
}
2733-
2734-
// Add separator lengths with overflow checking
2735-
if (!safe_add_separators(&state, separator_len)) {
2736-
*out_len = 0;
2737-
*out_valid = false;
2738-
return "";
2739-
}
2740-
2741-
// Handle case with no valid words
2742-
if (state.total_len == 0) {
2743-
*out_len = 0;
2744-
*out_valid = true;
2745-
return "";
2746-
}
2747-
2748-
// Allocate memory
2749-
char* out =
2750-
reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, state.total_len));
2751-
if (out == nullptr) {
2752-
gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
2753-
*out_len = 0;
2754-
*out_valid = false;
2755-
return "";
2756-
}
2757-
2758-
char* tmp = out;
2759-
int out_idx = 0;
2760-
bool seenAnyValidInput = false;
2761-
2762-
concat_word(tmp, &out_idx, word1, word1_len, word1_validity, separator, separator_len,
2763-
&seenAnyValidInput);
2764-
concat_word(tmp, &out_idx, word2, word2_len, word2_validity, separator, separator_len,
2765-
&seenAnyValidInput);
2766-
concat_word(tmp, &out_idx, word3, word3_len, word3_validity, separator, separator_len,
2767-
&seenAnyValidInput);
2768-
concat_word(tmp, &out_idx, word4, word4_len, word4_validity, separator, separator_len,
2769-
&seenAnyValidInput);
2770-
concat_word(tmp, &out_idx, word5, word5_len, word5_validity, separator, separator_len,
2771-
&seenAnyValidInput);
2772-
2773-
*out_valid = true;
2774-
*out_len = out_idx;
2775-
return out;
2631+
return concat_ws_impl(context, separator, separator_len, separator_validity, out_valid,
2632+
out_len,
2633+
{{word1, word1_len, word1_validity},
2634+
{word2, word2_len, word2_validity},
2635+
{word3, word3_len, word3_validity},
2636+
{word4, word4_len, word4_validity},
2637+
{word5, word5_len, word5_validity}});
27762638
}
27772639

27782640
FORCE_INLINE

0 commit comments

Comments
 (0)