Closed
Description
Describe the bug
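The `ballot_group` object
returned from `get_ballot_group()`
is rejected by SFINAE when passed to `group_barrier()`: the `is_group_v` requirement is not satisfied for `ballot_group<sub_group>`, so the overload is dropped and the call fails to compile.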
Thanks to @nbeams for the reproducer.
Error:
sycl_group_ballot.cpp:48:14: error: no matching function for call to 'group_barrier'
sycl::group_barrier(active_threads, sycl::memory_scope_sub_group);
^~~~~~~~~~~~~~~~~~~
/soft/testing/dpcpp/bin/../include/sycl/group_barrier.hpp:35:1: note: candidate template ignored: requirement 'is_group_v<sycl::ext::oneapi::experimental::ballot_group<sycl::ext::oneapi::sub_group>>' was not satisfied [with Group = ballot_group<sub_group>]
group_barrier(Group, memory_scope FenceScope = Group::fence_scope) {
^
To Reproduce
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <sycl/sycl.hpp>
// Compares every element of `array` against `value` and reports the outcome
// on stdout.
//
// Each element whose absolute difference from `value` exceeds 1e-11 is printed
// as "A[k] = <element>". After the scan, "OK" is printed if no element was
// reported and "FAIL" otherwise.
//
//   array      - values to verify; only read, never modified
//   array_size - number of elements of `array` to check
//   value      - expected value of every element
void check_vals(double *array, int array_size, double value) {
  bool pass = true;
  for (int k = 0; k < array_size; k++) {
    // fabs() instead of unqualified abs(): depending on which overloads the
    // C headers expose, abs() on a double may resolve to int abs(int), which
    // truncates any error smaller than 1.0 to zero and hides the mismatch.
    if (fabs(array[k] - value) > 1.e-11) {
      pass = false;
      printf("A[%d] = %e\n", k, array[k]);
    }
  }
  if (pass)
    printf("OK\n\n");
  else
    printf("FAIL\n\n");
}
// Reproducer entry point.
//
// Launches a 1-D nd_range kernel in which every in-range work-item writes 2.0
// to its slot of a device buffer and then calls sycl::group_barrier() on a
// ballot group built from the predicate `idx < array_size`. Afterwards the
// buffer is copied back to the host and checked with check_vals().
//
// Returns 0 on completion, 1 if either buffer could not be allocated.
int main() {
  std::vector<sycl::platform> platforms = sycl::platform::get_platforms();
  std::vector<sycl::device> devices = platforms[0].get_devices();
  sycl::queue queue(devices[0], {sycl::property::queue::in_order()});

  int M = 200;
  int N = 1;
  int block_size = 64;
  int array_size = M * N;
  // Ceiling division: enough blocks to cover array_size, with a partial last
  // block when array_size is not a multiple of block_size. The previous
  // expression tested `array_size / block_size == 0` (quotient, not
  // remainder), which yields 0 blocks for array_size < block_size and one
  // block too many for exact multiples.
  int num_blocks = (array_size + block_size - 1) / block_size;

  double *A = static_cast<double *>(malloc(array_size * sizeof(double)));
  double *dA = static_cast<double *>(
      sycl::malloc_device(array_size * sizeof(double), queue));
  if (A == nullptr || dA == nullptr) {
    printf("allocation failed\n");
    if (dA != nullptr)
      sycl::free(dA, queue);
    free(A);
    return 1;
  }

  queue.submit([&](sycl::handler &cgh) {
    cgh.parallel_for(
        sycl::nd_range<1>(sycl::range<1>(block_size * num_blocks),
                          sycl::range<1>(block_size)),
        [=](sycl::nd_item<1> item_ct1) {
          const int idx =
              item_ct1.get_local_id(0) + block_size * item_ct1.get_group(0);
          auto sg = item_ct1.get_sub_group();
          // Built before the branch so that every work-item of the sub-group
          // makes this call, in range or not.
          auto active_threads =
              sycl::ext::oneapi::experimental::get_ballot_group(
                  sg, idx < array_size);
          if (idx < array_size) {
            dA[idx] = 2.0;
            // The call this report is about: it is rejected at compile time
            // with "no matching function for call to 'group_barrier'".
            sycl::group_barrier(active_threads, sycl::memory_scope_sub_group);
          }
        });
  }).wait();

  // Copy from device to host
  queue.memcpy(A, dA, array_size * sizeof(double));
  queue.wait();
  check_vals(A, array_size, 2.0);

  sycl::free(dA, queue);
  free(A);  // host buffer was previously leaked
  return 0;
}