Skip to content

Commit 7144536

Browse files
sbastrakov authored and psychocoderHPC committed
Add a warp concept for accessing warp functionality from kernels
Implement a compile-time accessor for the warp size and warp voting functions. CUDA and HIP use built-in warps; CPU accelerators emulate them with single-thread warps.
1 parent da9dd9f commit 7144536

11 files changed

+486
-0
lines changed

include/alpaka/acc/AccCpuFibers.hpp

+2
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include <alpaka/intrinsic/IntrinsicCpu.hpp>
2626
#include <alpaka/rand/RandStdLib.hpp>
2727
#include <alpaka/time/TimeStdLib.hpp>
28+
#include <alpaka/warp/WarpSingleThread.hpp>
2829

2930
// Specialized traits.
3031
#include <alpaka/acc/Traits.hpp>
@@ -83,6 +84,7 @@ namespace alpaka
8384
public intrinsic::IntrinsicCpu,
8485
public rand::RandStdLib,
8586
public time::TimeStdLib,
87+
public warp::WarpSingleThread,
8688
public concepts::Implements<ConceptAcc, AccCpuFibers<TDim, TIdx>>
8789
{
8890
public:

include/alpaka/acc/AccCpuOmp2Blocks.hpp

+2
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
#include <alpaka/intrinsic/IntrinsicCpu.hpp>
3131
#include <alpaka/rand/RandStdLib.hpp>
3232
#include <alpaka/time/TimeOmp.hpp>
33+
#include <alpaka/warp/WarpSingleThread.hpp>
3334

3435
// Specialized traits.
3536
#include <alpaka/acc/Traits.hpp>
@@ -84,6 +85,7 @@ namespace alpaka
8485
public intrinsic::IntrinsicCpu,
8586
public rand::RandStdLib,
8687
public time::TimeOmp,
88+
public warp::WarpSingleThread,
8789
public concepts::Implements<ConceptAcc, AccCpuOmp2Blocks<TDim, TIdx>>
8890
{
8991
public:

include/alpaka/acc/AccCpuOmp2Threads.hpp

+2
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include <alpaka/intrinsic/IntrinsicCpu.hpp>
3030
#include <alpaka/rand/RandStdLib.hpp>
3131
#include <alpaka/time/TimeOmp.hpp>
32+
#include <alpaka/warp/WarpSingleThread.hpp>
3233

3334
// Specialized traits.
3435
#include <alpaka/acc/Traits.hpp>
@@ -85,6 +86,7 @@ namespace alpaka
8586
public intrinsic::IntrinsicCpu,
8687
public rand::RandStdLib,
8788
public time::TimeOmp,
89+
public warp::WarpSingleThread,
8890
public concepts::Implements<ConceptAcc, AccCpuOmp2Threads<TDim, TIdx>>
8991
{
9092
public:

include/alpaka/acc/AccCpuOmp4.hpp

+2
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include <alpaka/intrinsic/IntrinsicCpu.hpp>
3030
#include <alpaka/rand/RandStdLib.hpp>
3131
#include <alpaka/time/TimeOmp.hpp>
32+
#include <alpaka/warp/WarpSingleThread.hpp>
3233

3334
// Specialized traits.
3435
#include <alpaka/acc/Traits.hpp>
@@ -85,6 +86,7 @@ namespace alpaka
8586
public intrinsic::IntrinsicCpu,
8687
public rand::RandStdLib,
8788
public time::TimeOmp,
89+
public warp::WarpSingleThread,
8890
public concepts::Implements<ConceptAcc, AccCpuOmp4<TDim, TIdx>>
8991
{
9092
public:

include/alpaka/acc/AccCpuSerial.hpp

+2
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include <alpaka/intrinsic/IntrinsicCpu.hpp>
2626
#include <alpaka/rand/RandStdLib.hpp>
2727
#include <alpaka/time/TimeStdLib.hpp>
28+
#include <alpaka/warp/WarpSingleThread.hpp>
2829

2930
// Specialized traits.
3031
#include <alpaka/acc/Traits.hpp>
@@ -78,6 +79,7 @@ namespace alpaka
7879
public intrinsic::IntrinsicCpu,
7980
public rand::RandStdLib,
8081
public time::TimeStdLib,
82+
public warp::WarpSingleThread,
8183
public concepts::Implements<ConceptAcc, AccCpuSerial<TDim, TIdx>>
8284
{
8385
public:

include/alpaka/acc/AccCpuTbbBlocks.hpp

+2
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include <alpaka/intrinsic/IntrinsicCpu.hpp>
2626
#include <alpaka/rand/RandStdLib.hpp>
2727
#include <alpaka/time/TimeStdLib.hpp>
28+
#include <alpaka/warp/WarpSingleThread.hpp>
2829

2930
// Specialized traits.
3031
#include <alpaka/acc/Traits.hpp>
@@ -76,6 +77,7 @@ namespace alpaka
7677
public intrinsic::IntrinsicCpu,
7778
public rand::RandStdLib,
7879
public time::TimeStdLib,
80+
public warp::WarpSingleThread,
7981
public concepts::Implements<ConceptAcc, AccCpuTbbBlocks<TDim, TIdx>>
8082
{
8183
public:

include/alpaka/acc/AccCpuThreads.hpp

+2
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include <alpaka/intrinsic/IntrinsicCpu.hpp>
2525
#include <alpaka/rand/RandStdLib.hpp>
2626
#include <alpaka/time/TimeStdLib.hpp>
27+
#include <alpaka/warp/WarpSingleThread.hpp>
2728

2829
// Specialized traits.
2930
#include <alpaka/acc/Traits.hpp>
@@ -80,6 +81,7 @@ namespace alpaka
8081
public intrinsic::IntrinsicCpu,
8182
public rand::RandStdLib,
8283
public time::TimeStdLib,
84+
public warp::WarpSingleThread,
8385
public concepts::Implements<ConceptAcc, AccCpuThreads<TDim, TIdx>>
8486
{
8587
public:

include/alpaka/acc/AccGpuUniformCudaHipRt.hpp

+2
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
#include <alpaka/intrinsic/IntrinsicUniformCudaHipBuiltIn.hpp>
3535
#include <alpaka/rand/RandUniformCudaHipRand.hpp>
3636
#include <alpaka/time/TimeUniformCudaHipBuiltIn.hpp>
37+
#include <alpaka/warp/WarpUniformCudaHipBuiltIn.hpp>
3738

3839
// Specialized traits.
3940
#include <alpaka/acc/Traits.hpp>
@@ -86,6 +87,7 @@ namespace alpaka
8687
public intrinsic::IntrinsicUniformCudaHipBuiltIn,
8788
public rand::RandUniformCudaHipRand,
8889
public time::TimeUniformCudaHipBuiltIn,
90+
public warp::WarpUniformCudaHipBuiltIn,
8991
public concepts::Implements<ConceptAcc, AccGpuUniformCudaHipRt<TDim, TIdx>>
9092
{
9193
public:

include/alpaka/warp/Traits.hpp

+200
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,200 @@
1+
/* Copyright 2020 Sergei Bastrakov
2+
*
3+
* This file is part of Alpaka.
4+
*
5+
* This Source Code Form is subject to the terms of the Mozilla Public
6+
* License, v. 2.0. If a copy of the MPL was not distributed with this
7+
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
8+
*/
9+
10+
#pragma once
11+
12+
#include <alpaka/core/Common.hpp>
13+
#include <alpaka/core/Concepts.hpp>
14+
15+
#include <cstdint>
16+
#include <type_traits>
17+
18+
namespace alpaka
19+
{
20+
//-----------------------------------------------------------------------------
21+
//! The thread warp specifics
22+
namespace warp
23+
{
24+
//! Tag type naming the warp concept. The functions of this namespace pass it
//! as the first template argument to concepts::ImplementationBase.
struct ConceptWarp{};
25+
26+
//-----------------------------------------------------------------------------
27+
//! The warp traits.
//!
//! Every trait in this namespace is only declared here, without a primary
//! definition. The free functions of namespace warp forward to a static
//! member function of the matching trait, so a specialization has to exist
//! for every warp implementation that is actually used.
//! NOTE(review): TSfinae defaults to void and is presumably intended for
//! SFINAE-constrained partial specializations - confirm against the
//! implementations.
28+
namespace traits
29+
{
30+
//#############################################################################
31+
//! The warp size trait.
//! warp::getSize() calls the static member getSize(warp) of the specialization.
32+
template<
33+
typename TWarp,
34+
typename TSfinae = void>
35+
struct GetSize;
36+
37+
//#############################################################################
38+
//! The all warp vote trait.
//! warp::all() calls the static member all(warp, predicate) of the specialization.
39+
template<
40+
typename TWarp,
41+
typename TSfinae = void>
42+
struct All;
43+
44+
//#############################################################################
45+
//! The any warp vote trait.
//! warp::any() calls the static member any(warp, predicate) of the specialization.
46+
template<
47+
typename TWarp,
48+
typename TSfinae = void>
49+
struct Any;
50+
51+
//#############################################################################
52+
//! The ballot warp vote trait.
//! warp::ballot() calls the static member ballot(warp, predicate) of the specialization.
53+
template<
54+
typename TWarp,
55+
typename TSfinae = void>
56+
struct Ballot;
57+
58+
//#############################################################################
59+
//! The active mask trait.
//! warp::activemask() calls the static member activemask(warp) of the specialization.
60+
template<
61+
typename TWarp,
62+
typename TSfinae = void>
63+
struct Activemask;
64+
}
65+
66+
//-----------------------------------------------------------------------------
67+
//! Returns the warp size.
//!
//! Forwards to the static member getSize(warp) of traits::GetSize, selected
//! for the ConceptWarp implementation base of TWarp.
68+
//!
69+
//! \tparam TWarp The warp implementation type.
70+
//! \param warp The warp implementation.
//! \return The warp size reported by the trait, as std::int32_t.
71+
ALPAKA_NO_HOST_ACC_WARNING
72+
template<
73+
typename TWarp>
74+
ALPAKA_FN_ACC auto getSize(
75+
TWarp const & warp)
76+
-> std::int32_t
77+
{
78+
// NOTE(review): presumably resolves to the type that implements ConceptWarp
// for TWarp, so the trait is selected on that type rather than on TWarp
// itself - confirm in alpaka/core/Concepts.hpp.
using ImplementationBase = concepts::ImplementationBase<
79+
ConceptWarp,
80+
TWarp>;
81+
return traits::GetSize<
82+
ImplementationBase>
83+
::getSize(
84+
warp);
85+
}
86+
87+
//-----------------------------------------------------------------------------
88+
//! Returns a 32- or 64-bit unsigned integer (depending on the
89+
//! accelerator) whose Nth bit is set if and only if the Nth thread
90+
//! of the warp is active.
91+
//!
//! Forwards to the static member activemask(warp) of traits::Activemask,
//! selected for the ConceptWarp implementation base of TWarp.
//!
92+
//! Note: decltype for the return type is required here, otherwise
93+
//! compilation with a CPU and a GPU accelerator enabled fails as it
94+
//! tries to call a device function from a host-device one. The reason
95+
//! is unclear, but likely related to deducing the return type.
96+
//!
97+
//! \tparam TWarp The warp implementation type.
98+
//! \param warp The warp implementation.
99+
//! \return 32-bit or 64-bit unsigned type depending on the accelerator.
100+
ALPAKA_NO_HOST_ACC_WARNING
101+
template<
102+
typename TWarp>
103+
ALPAKA_FN_ACC auto activemask(
104+
TWarp const & warp) -> decltype(traits::Activemask<
105+
concepts::ImplementationBase<ConceptWarp, TWarp> >::activemask(warp))
106+
{
107+
// Same type as spelled out in the trailing return type above.
using ImplementationBase = concepts::ImplementationBase<
108+
ConceptWarp,
109+
TWarp>;
110+
return traits::Activemask<
111+
ImplementationBase>
112+
::activemask(
113+
warp);
114+
}
115+
116+
//-----------------------------------------------------------------------------
117+
//! Evaluates predicate for all active threads of the warp and returns
118+
//! non-zero if and only if predicate evaluates to non-zero for all of them.
119+
//!
120+
//! It follows the logic of __all(predicate) in CUDA before version 9.0 and HIP,
121+
//! i.e. the operation is applied to all active threads.
122+
//! The modern CUDA counterpart would be __all_sync(__activemask(), predicate).
//!
//! Forwards to the static member all(warp, predicate) of traits::All,
//! selected for the ConceptWarp implementation base of TWarp.
123+
//!
124+
//! \tparam TWarp The warp implementation type.
125+
//! \param warp The warp implementation.
126+
//! \param predicate The predicate value for current thread.
//! \return The vote result produced by the trait, as std::int32_t
//!         (non-zero if and only if the vote succeeded, see above).
127+
ALPAKA_NO_HOST_ACC_WARNING
128+
template<
129+
typename TWarp>
130+
ALPAKA_FN_ACC auto all(
131+
TWarp const & warp,
132+
std::int32_t predicate)
133+
-> std::int32_t
134+
{
135+
// NOTE(review): presumably the type that implements ConceptWarp for TWarp;
// the trait is selected on it rather than on TWarp itself - confirm.
using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
136+
return traits::All<
137+
ImplementationBase>
138+
::all(
139+
warp,
140+
predicate);
141+
}
142+
143+
//-----------------------------------------------------------------------------
144+
//! Evaluates predicate for all active threads of the warp and returns
145+
//! non-zero if and only if predicate evaluates to non-zero for any of them.
146+
//!
147+
//! It follows the logic of __any(predicate) in CUDA before version 9.0 and HIP,
148+
//! i.e. the operation is applied to all active threads.
149+
//! The modern CUDA counterpart would be __any_sync(__activemask(), predicate).
//!
//! Forwards to the static member any(warp, predicate) of traits::Any,
//! selected for the ConceptWarp implementation base of TWarp.
150+
//!
151+
//! \tparam TWarp The warp implementation type.
152+
//! \param warp The warp implementation.
153+
//! \param predicate The predicate value for current thread.
//! \return The vote result produced by the trait, as std::int32_t
//!         (non-zero if and only if the vote succeeded, see above).
154+
ALPAKA_NO_HOST_ACC_WARNING
155+
template<
156+
typename TWarp>
157+
ALPAKA_FN_ACC auto any(
158+
TWarp const & warp,
159+
std::int32_t predicate)
160+
-> std::int32_t
161+
{
162+
// NOTE(review): presumably the type that implements ConceptWarp for TWarp;
// the trait is selected on it rather than on TWarp itself - confirm.
using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
163+
return traits::Any<
164+
ImplementationBase>
165+
::any(
166+
warp,
167+
predicate);
168+
}
169+
170+
//-----------------------------------------------------------------------------
171+
//! Evaluates predicate for all non-exited threads in a warp and returns
172+
//! a 32- or 64-bit unsigned integer (depending on the accelerator)
173+
//! whose Nth bit is set if and only if predicate evaluates to non-zero
174+
//! for the Nth thread of the warp and the Nth thread is active.
175+
//!
176+
//! It follows the logic of __ballot(predicate) in CUDA before version 9.0 and HIP,
177+
//! i.e. the operation is applied to all active threads.
178+
//! The modern CUDA counterpart would be __ballot_sync(__activemask(), predicate).
179+
//! The return type is deduced from the static member ballot(warp, predicate)
//! of traits::Ballot, selected for the ConceptWarp implementation base of TWarp.
//! NOTE(review): this comment previously stated that the return type is
//! 64-bit to fit all platforms, which contradicts the description above and
//! the \return entry below - confirm against the trait implementations.
//! NOTE(review): unlike activemask(), no explicit decltype trailing return
//! type is used here, although the note on activemask() reports build
//! failures without it - confirm this builds with a CPU and a GPU
//! accelerator enabled together.
180+
//!
181+
//! \tparam TWarp The warp implementation type.
182+
//! \param warp The warp implementation.
183+
//! \param predicate The predicate value for current thread.
184+
//! \return 32-bit or 64-bit unsigned type depending on the accelerator.
185+
ALPAKA_NO_HOST_ACC_WARNING
186+
template<
187+
typename TWarp>
188+
ALPAKA_FN_ACC auto ballot(
189+
TWarp const & warp,
190+
std::int32_t predicate)
191+
{
192+
// NOTE(review): presumably the type that implements ConceptWarp for TWarp;
// the trait is selected on it rather than on TWarp itself - confirm.
using ImplementationBase = concepts::ImplementationBase<ConceptWarp, TWarp>;
193+
return traits::Ballot<
194+
ImplementationBase>
195+
::ballot(
196+
warp,
197+
predicate);
198+
}
199+
}
200+
}

0 commit comments

Comments
 (0)