Skip to content

Exclusive_scan makes wrong answer for char to int output on HIP. #2004

@BradWhitlock

Description

@BradWhitlock

I have places in my code where I do an exclusive_scan() over a mask and make some offsets from it. I wanted to experiment with making my masks char while leaving the scan output as int or long. I wanted the scan to accumulate using the output type so I passed RAJA::operators::plus<T_out>{} as an argument to the scan. That is enough to make it work for SEQ, OMP, and CUDA. With HIP it does not work and makes some negative offsets once char overflows.

This program reproduces my problem on rzadams.

/*
TPLIBS=/usr/WS1/axom/libs/toss_4_x86_64_ib_cray/2026_01_29_13_42_44/llvm-amdgpu-6.4.2
RAJA_DIR=$TPLIBS/raja-git.3b8b59a1e9be2e1066c0d77372b3bf5956e6d6e2_develop-ayjwwho7m3chhut5igfdk6ycvuzd4tgh
CAMP_DIR=$TPLIBS/camp-git.a8caefa9f4c811b1a114b4ed2c9b681d40f12325_main-jspykahx2pbh3nddl6tz2n3arip53r4l
hipcc -o raja_scan raja_scan.cpp -I${RAJA_DIR}/include -I${CAMP_DIR}/include -L${RAJA_DIR}/lib -L${CAMP_DIR}/lib -lRAJA -lcamp
*/

#include <cassert>
#include <iostream>
#include <type_traits>

#include "RAJA/RAJA.hpp"

// We have a char mask to save memory
const char hostInput[] = {1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0};
// We want to exclusive_scan the mask to get this output:
const int hostOutput[] = {0, 1, 1, 2, 2, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 24, 24, 24, 24, 25, 25, 25, 25, 26, 26, 26, 26, 27, 27, 27, 27, 28, 28, 28, 28, 29, 29, 29, 29, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 39, 39, 39, 40, 40, 40, 40, 41, 41, 41, 41, 42, 42, 42, 42, 43, 43, 43, 43, 44, 44, 44, 44, 45, 45, 46, 46, 46, 46, 47, 47, 47, 47, 48, 48, 49, 49, 49, 49, 50, 50, 50, 50, 51, 51, 51, 51, 52, 52, 52, 52, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 55, 55, 56, 56, 56, 56, 57, 57, 57, 57, 58, 58, 58, 58, 59, 59, 59, 59, 60, 60, 61, 61, 61, 61, 62, 62, 62, 62, 63, 63, 64, 64, 64, 64, 65, 65, 65, 65, 66, 66, 66, 66, 67, 67, 67, 67, 68, 68, 68, 68, 69, 69, 69, 69, 70, 70, 70, 70, 71, 71, 71, 71, 72, 72, 72, 72, 73, 73, 73, 73, 74, 74, 74, 74, 75, 75, 76, 76, 76, 76, 77, 77, 77, 77, 78, 78, 79, 79, 79, 79, 80, 80, 80, 80, 81, 81, 81, 81, 82, 82, 82, 82, 83, 83, 83, 83, 84, 84, 84, 84, 85, 85, 85, 85, 86, 86, 86, 86, 87, 87, 87, 87, 88, 88, 88, 88, 89, 89, 89, 89, 90, 90, 91, 91, 91, 91, 92, 92, 92, 92, 93, 93, 94, 94, 94, 94, 95, 95, 95, 95, 96, 96, 96, 96, 97, 97, 97, 97, 98, 98, 98, 98, 99, 99, 99, 99, 100, 100, 100, 100, 101, 101, 101, 101, 102, 102, 102, 102, 103, 103, 103, 103, 104, 104, 104, 104, 105, 105, 106, 106, 106, 106, 107, 107, 107, 107, 108, 108, 109, 109, 109, 109, 110, 110, 110, 110, 111, 111, 111, 111, 112, 112, 112, 112, 113, 113, 113, 113, 114, 114, 114, 114, 115, 115, 115, 115, 116, 116, 116, 116, 117, 117, 117, 117, 118, 118, 118, 118, 119, 119, 119, 119, 120, 120, 121, 121, 121, 121, 122, 122, 122, 122, 123, 123, 124, 124, 124, 124, 125, 125, 125, 125, 126, 126, 126, 126, 127, 127, 127, 127, 128, 128, 128, 128, 129, 129, 129, 129, 130, 130, 130, 130, 131, 131, 131, 131, 132, 132, 132, 132, 133, 133, 133, 133, 134, 134, 134, 134, 135, 135, 136, 136, 136, 136, 137, 137, 137, 137, 138, 138, 139, 139, 139, 139, 140, 140, 140, 140, 141, 141, 141, 141, 142, 142, 142, 142, 143, 143, 143, 143, 144, 144, 144, 144, 145, 145, 145, 145, 146, 146, 146, 146, 147, 147, 147, 147, 148, 148, 148, 148, 149, 149, 149, 149, 150, 150, 151, 151, 152, 152, 153, 154, 154, 155, 155, 156, 156, 157, 157, 158, 158, 159, 159, 160, 160, 161, 161, 162, 162, 163, 163, 164, 164, 165, 166, 166, 167, 167, 167, 167, 168, 168, 168, 168, 169, 169, 169, 169, 170, 170, 170, 170, 171, 171, 171, 171, 172, 172, 172, 172, 173, 173, 173, 173, 174, 174, 174, 174, 175, 175, 175, 175, 176};

template <typename ExecPolicy, typename T_in, typename T_out>
void raja_exclusive_scan(const T_in* d_in, T_out* d_out, int n)
{
  // Scan from char and sum into ints.
  RAJA::exclusive_scan<ExecPolicy>(
    RAJA::make_span(d_in, n),
    RAJA::make_span(d_out, n),
    // This operator is supposed to make the sum happen as T_out
    RAJA::operators::plus<T_out>{});
}

bool check(const int *result, int n)
{
  // Check results
  bool ok = true;
  for(int i = 0; i < n; ++i)
  {
    if(result[i] != hostOutput[i])
    {
      std::cout << "Mismatch at " << i
                << ", got " << result[i]
                << ", expected " << hostOutput[i] << "\n";
      ok = false;
      break;
    }
  }
  return ok;
}

void test_exclusive_scan_char_int_seq()
{
  using ExecPolicy = RAJA::seq_exec;
  const int n = static_cast<int>(sizeof(hostInput) / sizeof(char));

  int* d_out = new int[n];

  // RAJA exclusive scan: char input, int output
  raja_exclusive_scan<ExecPolicy>(hostInput, d_out, n);

  // Check results
  const bool ok = check(d_out, n);
  std::cout << "SEQ char->int exclusive_scan: "
            << (ok ? "PASS" : "FAIL") << "\n";

  delete[] d_out;
}

void test_exclusive_scan_char_int_hip()
{
  using ExecPolicy = RAJA::hip_exec<256>;
  const int n = static_cast<int>(sizeof(hostInput) / sizeof(char));

  // Device allocations
  char* d_in = nullptr;
  int* d_out = nullptr;

  hipMalloc(&d_in, n * sizeof(char));
  hipMalloc(&d_out, n * sizeof(int));

  // host -> device
  hipMemcpy(d_in, hostInput, n * sizeof(char), hipMemcpyHostToDevice);

  // RAJA exclusive scan: char input, int output
  raja_exclusive_scan<ExecPolicy>(d_in, d_out, n);

  // device -> host
  int* hostResult = new int[n];
  hipMemcpy(hostResult, d_out, n * sizeof(int), hipMemcpyDeviceToHost);

  // Check results
  const bool ok = check(hostResult, n);
  std::cout << "HIP char->int exclusive_scan: "
            << (ok ? "PASS" : "FAIL") << "\n";

  delete[] hostResult;
  hipFree(d_in);
  hipFree(d_out);
}

int main(int argc, char* argv[])
{
  // Initialize HIP if needed, usually hipSetDevice(0) is enough
  hipSetDevice(0);

  test_exclusive_scan_char_int_seq();
  test_exclusive_scan_char_int_hip();

  hipDeviceSynchronize();
  return 0;
}

Expected behavior

I wanted the HIP output for exclusive_scan on char data with int output to have the same values as other backends.

Compilers & Libraries (please complete the following information):
HIP - see above program for RAJA paths, etc.

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions