This program reproduces my problem on rzadams.
/*
TPLIBS=/usr/WS1/axom/libs/toss_4_x86_64_ib_cray/2026_01_29_13_42_44/llvm-amdgpu-6.4.2
RAJA_DIR=$TPLIBS/raja-git.3b8b59a1e9be2e1066c0d77372b3bf5956e6d6e2_develop-ayjwwho7m3chhut5igfdk6ycvuzd4tgh
CAMP_DIR=$TPLIBS/camp-git.a8caefa9f4c811b1a114b4ed2c9b681d40f12325_main-jspykahx2pbh3nddl6tz2n3arip53r4l
hipcc -o raja_scan raja_scan.cpp -I${RAJA_DIR}/include -I${CAMP_DIR}/include -L${RAJA_DIR}/lib -L${CAMP_DIR}/lib -lRAJA -lcamp
*/
#include <cassert>
#include <iostream>
#include <type_traits>
#include "RAJA/RAJA.hpp"
// We have a char mask to save memory
const char hostInput[] = {1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0};
// We want to exclusive_scan the mask to get this output:
const int hostOutput[] = {0, 1, 1, 2, 2, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 24, 24, 24, 24, 25, 25, 25, 25, 26, 26, 26, 26, 27, 27, 27, 27, 28, 28, 28, 28, 29, 29, 29, 29, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 39, 39, 39, 40, 40, 40, 40, 41, 41, 41, 41, 42, 42, 42, 42, 43, 43, 43, 43, 44, 44, 44, 44, 45, 45, 46, 46, 46, 46, 47, 47, 47, 47, 48, 48, 49, 49, 49, 49, 50, 50, 50, 50, 51, 51, 51, 51, 52, 52, 52, 52, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 55, 55, 56, 56, 56, 56, 57, 57, 57, 57, 58, 58, 58, 58, 59, 59, 59, 59, 60, 60, 61, 61, 61, 61, 62, 62, 62, 62, 63, 63, 64, 64, 64, 64, 65, 65, 65, 65, 66, 66, 66, 66, 67, 67, 67, 67, 68, 68, 68, 68, 69, 69, 69, 69, 70, 70, 70, 70, 71, 71, 71, 71, 72, 72, 72, 72, 73, 73, 73, 73, 74, 74, 74, 74, 75, 75, 76, 76, 76, 76, 77, 77, 77, 77, 78, 78, 79, 79, 79, 79, 80, 80, 80, 80, 81, 81, 81, 81, 82, 82, 82, 82, 83, 83, 83, 83, 84, 84, 84, 84, 85, 85, 85, 85, 86, 86, 86, 86, 87, 87, 87, 87, 88, 88, 88, 88, 89, 89, 89, 89, 90, 90, 91, 91, 91, 91, 92, 92, 92, 92, 93, 93, 94, 94, 94, 94, 95, 95, 95, 95, 96, 96, 96, 96, 97, 97, 97, 97, 98, 98, 98, 98, 99, 99, 99, 99, 100, 100, 100, 100, 101, 101, 101, 101, 102, 102, 102, 102, 103, 103, 103, 103, 104, 104, 104, 104, 105, 105, 106, 106, 106, 106, 107, 107, 107, 107, 108, 108, 109, 109, 109, 109, 110, 110, 110, 110, 111, 111, 111, 111, 112, 112, 112, 112, 113, 113, 113, 113, 114, 114, 114, 114, 115, 115, 115, 115, 116, 116, 116, 116, 117, 117, 117, 117, 118, 118, 118, 118, 119, 119, 119, 119, 120, 120, 121, 121, 121, 121, 122, 122, 122, 122, 123, 123, 124, 124, 124, 124, 125, 125, 125, 125, 126, 126, 126, 126, 127, 127, 127, 127, 128, 128, 128, 128, 129, 129, 129, 129, 130, 130, 130, 130, 131, 131, 131, 131, 132, 132, 132, 132, 133, 133, 133, 133, 134, 134, 134, 134, 135, 135, 136, 136, 136, 136, 137, 137, 137, 137, 138, 138, 139, 139, 139, 139, 140, 140, 140, 140, 141, 141, 141, 141, 142, 142, 142, 142, 143, 143, 143, 143, 144, 144, 144, 144, 145, 145, 145, 145, 146, 146, 146, 146, 147, 147, 147, 147, 148, 148, 148, 148, 149, 149, 149, 149, 150, 150, 151, 151, 152, 152, 153, 154, 154, 155, 155, 156, 156, 157, 157, 158, 158, 159, 159, 160, 160, 161, 161, 162, 162, 163, 163, 164, 164, 165, 166, 166, 167, 167, 167, 167, 168, 168, 168, 168, 169, 169, 169, 169, 170, 170, 170, 170, 171, 171, 171, 171, 172, 172, 172, 172, 173, 173, 173, 173, 174, 174, 174, 174, 175, 175, 175, 175, 176};
template <typename ExecPolicy, typename T_in, typename T_out>
void raja_exclusive_scan(const T_in* d_in, T_out* d_out, int n)
{
// Scan from char and sum into ints.
RAJA::exclusive_scan<ExecPolicy>(
RAJA::make_span(d_in, n),
RAJA::make_span(d_out, n),
// This operator is supposed to make the sum happen as T_out
RAJA::operators::plus<T_out>{});
}
bool check(const int *result, int n)
{
// Check results
bool ok = true;
for(int i = 0; i < n; ++i)
{
if(result[i] != hostOutput[i])
{
std::cout << "Mismatch at " << i
<< ", got " << result[i]
<< ", expected " << hostOutput[i] << "\n";
ok = false;
break;
}
}
return ok;
}
void test_exclusive_scan_char_int_seq()
{
using ExecPolicy = RAJA::seq_exec;
const int n = static_cast<int>(sizeof(hostInput) / sizeof(char));
int* d_out = new int[n];
// RAJA exclusive scan: char input, int output
raja_exclusive_scan<ExecPolicy>(hostInput, d_out, n);
// Check results
const bool ok = check(d_out, n);
std::cout << "SEQ char->int exclusive_scan: "
<< (ok ? "PASS" : "FAIL") << "\n";
delete[] d_out;
}
void test_exclusive_scan_char_int_hip()
{
using ExecPolicy = RAJA::hip_exec<256>;
const int n = static_cast<int>(sizeof(hostInput) / sizeof(char));
// Device allocations
char* d_in = nullptr;
int* d_out = nullptr;
hipMalloc(&d_in, n * sizeof(char));
hipMalloc(&d_out, n * sizeof(int));
// host -> device
hipMemcpy(d_in, hostInput, n * sizeof(char), hipMemcpyHostToDevice);
// RAJA exclusive scan: char input, int output
raja_exclusive_scan<ExecPolicy>(d_in, d_out, n);
// device -> host
int* hostResult = new int[n];
hipMemcpy(hostResult, d_out, n * sizeof(int), hipMemcpyDeviceToHost);
// Check results
const bool ok = check(hostResult, n);
std::cout << "HIP char->int exclusive_scan: "
<< (ok ? "PASS" : "FAIL") << "\n";
delete[] hostResult;
hipFree(d_in);
hipFree(d_out);
}
int main(int argc, char* argv[])
{
// Initialize HIP if needed, usually hipSetDevice(0) is enough
hipSetDevice(0);
test_exclusive_scan_char_int_seq();
test_exclusive_scan_char_int_hip();
hipDeviceSynchronize();
return 0;
}
I wanted the HIP output for exclusive_scan on char data with int output to have the same values as other backends.
I have places in my code where I do an
exclusive_scan()over a mask and make some offsets from it. I wanted to experiment with making my maskscharwhile leaving the scan output asintorlong. I wanted the scan to accumulate using the output type so I passedRAJA::operators::plus<T_out>{}as an argument to the scan. That is enough to make it work for SEQ, OMP, and CUDA. With HIP it does not work and makes some negative offsets once char overflows.This program reproduces my problem on rzadams.
Expected behavior
I wanted the HIP output for exclusive_scan on char data with int output to have the same values as other backends.
Compilers & Libraries (please complete the following information):
HIP - see above program for RAJA paths, etc.