Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ci run bnorm tuning ocl #3642

Merged
merged 8 commits into from
Mar 24, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion fin
6 changes: 2 additions & 4 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -212,15 +212,13 @@ set( MIOpen_Source
solver/batchnorm/backward_ck.cpp
solver/batchnorm/backward_per_activation.cpp
solver/batchnorm/backward_per_activation_fused.cpp
solver/batchnorm/backward_spatial_multiple.cpp
solver/batchnorm/backward_spatial_single.cpp
solver/batchnorm/backward_spatial.cpp
solver/batchnorm/forward_inference.cpp
solver/batchnorm/forward_inference_ck.cpp
solver/batchnorm/forward_inference_fused.cpp
solver/batchnorm/forward_per_activation.cpp
solver/batchnorm/forward_per_activation_fused.cpp
solver/batchnorm/forward_spatial_multiple.cpp
solver/batchnorm/forward_spatial_single.cpp
solver/batchnorm/forward_spatial.cpp
solver/batchnorm/forward_training_ck.cpp
solver/cat/forward_cat.cpp
solver/conv/conv_asm_1x1u.cpp
Expand Down
2 changes: 2 additions & 0 deletions src/fin/fin_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,8 @@ AnySolver<miopen::ExecutionContext, miopen::batchnorm::ProblemDescription>::AnyS

switch(id)
{
case 113: SetObject<miopen::solver::batchnorm::BnFwdTrainingSpatial>(); break;
case 117: SetObject<miopen::solver::batchnorm::BnBwdTrainingSpatial>(); break;
case 142: SetObject<miopen::solver::batchnorm::BnCKFwdInference>(); break;
case 143: SetObject<miopen::solver::batchnorm::BnCKBwdBackward>(); break;
case 144: SetObject<miopen::solver::batchnorm::BnCKFwdTraining>(); break;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@

#include <miopen/batchnorm/problem_description.hpp>

#define WORKAROUND_SWDEV_253606 1

namespace miopen {

namespace solver {
Expand All @@ -37,7 +39,7 @@ namespace batchnorm {
inline void GetWGSizeNHWC(size_t c,
size_t h,
size_t w,
size_t maxCUs,
size_t min_workgroups,
bool bfp32parm,
size_t vectorsize,
size_t& xlocalsize,
Expand All @@ -48,9 +50,10 @@ inline void GetWGSizeNHWC(size_t c,
unsigned int max_localsize = 1024 / vectorsize;

size_t nworkgroups = 0;
xlocalsize = 0;
    // decrease max_localsize until the number of workgroups reaches the
    // requested minimum (min_workgroups)
while((float)nworkgroups < 0.8f * maxCUs && max_localsize >= xlocalsize_limit)
while(nworkgroups < min_workgroups && max_localsize >= xlocalsize_limit)
{
// xlocalsize must be power of 2 as reductions in the kernels rely on it, here c is rounded
// up to next power of 2.
Expand Down Expand Up @@ -92,7 +95,6 @@ inline int GetStashMethod(bool IsLayoutNHWC,
// First workgroup size is computed given a problem and vectorsize, then it checks
// if the computed workgroup is applicable (spatial multiple restrictions)
inline bool GetLocalConfigNHWC(const miopen::batchnorm::ProblemDescription& problem,
size_t maxCUs,
unsigned int stash_values,
size_t vectorsize,
size_t& xlocalsize,
Expand All @@ -103,11 +105,21 @@ inline bool GetLocalConfigNHWC(const miopen::batchnorm::ProblemDescription& prob
? false
: true;

size_t n, c, h, w;
size_t n, c, h, w = 0;
std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths());
assert((n != 0) && "n cannot be 0");
assert((c != 0) && "c cannot be 0");
assert((h != 0) && "h cannot be 0");
assert((w != 0) && "w cannot be 0");

GetWGSizeNHWC(c, h, w, maxCUs, bfp32parm, vectorsize, xlocalsize, ylocalsize);

GetWGSizeNHWC(
c, h, w, problem.GetMinWorkgroups(), bfp32parm, vectorsize, xlocalsize, ylocalsize);
assert((xlocalsize != 0) && "xlocalsize cannot be 0");
assert((ylocalsize != 0) && "ylocalsize cannot be 0");
if(ylocalsize == 0)
{
ylocalsize = 1;
}
stash_values *= (bfp32parm ? 1 : 2);
unsigned int last_ylocalsize = (h * w) % ylocalsize == 0 ? ylocalsize : (h * w) % ylocalsize;
// FP32:
Expand All @@ -129,45 +141,64 @@ inline bool GetLocalConfigNHWC(const miopen::batchnorm::ProblemDescription& prob
return true;
}

// Returns true if spatial multiple is applicable and fill NHWC configuration
// (xlocalsize, ylocalsize, vectorsize).
// Internally, it tries to use vectorization if possible. If vectorization can be
// used but spatial multiple is not applicable, it tries to see if spatial multiple
// is applicable without vectorization as a fallback.
inline bool GetConfigNHWC(const miopen::batchnorm::ProblemDescription& problem,
size_t maxCUs,
unsigned int stash_values,
size_t& xlocalsize,
size_t& ylocalsize,
size_t& vectorsize)
inline bool IsSpatialMultipleApplicable(const miopen::batchnorm::ProblemDescription& problem,
size_t vectorsize,
unsigned int stash_values)
{
size_t c = problem.GetXDesc().GetLengths()[1];
// Apply vectorization if possible, given the size of C
vectorsize = c % 4 == 0 ? 4 : 1;
int n, c, h, w = 0;
std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths());
assert((n != 0) && "n cannot be 0");
assert((c != 0) && "c cannot be 0");
assert((h != 0) && "h cannot be 0");
assert((w != 0) && "w cannot be 0");

// Check if variant 2 is applicable (with possible vectorization)
bool valid =
GetLocalConfigNHWC(problem, maxCUs, stash_values, vectorsize, xlocalsize, ylocalsize);
unsigned int in_cstride = h * w;

// If vectorization is used but variant 2 is not applicable,
// check if it's applicable without vectorization
if(!valid && vectorsize > 1)
if(problem.IsLayoutNHWC())
{
vectorsize = 1;
valid =
GetLocalConfigNHWC(problem, maxCUs, stash_values, vectorsize, xlocalsize, ylocalsize);
// check if the provided vectorsize can be used
if(c % vectorsize != 0)
{
return false;
}
// Variant 2 is the primary choice for NHWC
size_t xlocalsize, ylocalsize = 0;

// The configuration is ignored at this point, it was just computed to check
// if spatial multiple could be applied.
return GetLocalConfigNHWC(problem, stash_values, vectorsize, xlocalsize, ylocalsize);
}
else
{
// check if the provided vectorsize can be used
if(in_cstride % vectorsize != 0)
{
return false;
}

unsigned int ylocalsize = 1024;
unsigned int last_ylocalsize =
in_cstride % ylocalsize == 0 ? ylocalsize : in_cstride % ylocalsize;
// Restrictions:
// - last block must have enough space to stash intermediate results in HW dimension
// - if last block doesn't fit, intermediate results are stored in N dimension which must
// be large enough
stash_values *= (problem.GetXDesc().GetType() == miopenFloat ? 1 : 2);
if(last_ylocalsize < stash_values && n < (size_t)stash_values)
{
return false;
}
}
return valid;
return true;
}

inline void GetSpatialMultipleConfig(const miopen::batchnorm::ProblemDescription& problem,
size_t maxCUs,
unsigned int stash_values,
size_t vectorsize,
size_t& xlocalsize,
size_t& ylocalsize,
size_t& xgridsize,
size_t& ygridsize,
size_t& vectorsize,
int& stash_method)
{
int n, c, h, w;
Expand All @@ -178,14 +209,13 @@ inline void GetSpatialMultipleConfig(const miopen::batchnorm::ProblemDescription
{
// The function returns if the method is valid but we can ignore it
// at this point
GetConfigNHWC(problem, maxCUs, stash_values, xlocalsize, ylocalsize, vectorsize);
GetLocalConfigNHWC(problem, stash_values, vectorsize, xlocalsize, ylocalsize);

xgridsize = xlocalsize * ((c / vectorsize + xlocalsize - 1) / xlocalsize);
ygridsize = ylocalsize * ((in_cstride + ylocalsize - 1) / ylocalsize);
}
else
{
vectorsize = in_cstride % 4 == 0 ? 4 : 1;
xlocalsize = 1;
xgridsize = c;
ylocalsize = 1024;
Expand All @@ -206,39 +236,167 @@ inline void GetSpatialMultipleConfig(const miopen::batchnorm::ProblemDescription
ylocalsize);
}

inline bool IsSpatialMultipleApplicable(const miopen::batchnorm::ProblemDescription& problem,
size_t maxCUs,
unsigned int stash_values)
/// Parse a tuning kernel id of the form "Variant<variant>-<vectorsize>"
/// (as produced by GetKernelIdFromVariant) into its two components.
/// On a malformed or truncated id the output parameters are left untouched,
/// matching the original behavior for ids without a "Variant" prefix.
inline void GetVariantFromKernelId(const std::string& kernel_id, int& variant, size_t& vectorsize)
{
    // kernel_id has the following standard:
    // Variant<variant>-<vectorsize>
    const size_t pos = kernel_id.find("Variant");
    // Bounds check: both digit positions (pos+7 and pos+9) and the '-'
    // separator (pos+8) must exist before indexing, otherwise a truncated
    // id such as "Variant" would read past the end of the string (UB).
    if(pos != std::string::npos && pos + 9 < kernel_id.size() && kernel_id[pos + 8] == '-' &&
       std::isdigit(static_cast<unsigned char>(kernel_id[pos + 7])) != 0 &&
       std::isdigit(static_cast<unsigned char>(kernel_id[pos + 9])) != 0)
    {
        variant    = kernel_id[pos + 7] - '0';
        vectorsize = kernel_id[pos + 9] - '0';
    }
}

/// Build the tuning kernel id string "Variant<variant>-<vectorsize>"
/// for the given variant/vectorization pair.
inline std::string GetKernelIdFromVariant(int variant, size_t vectorsize)
{
    return "Variant" + std::to_string(variant) + "-" + std::to_string(vectorsize);
}

/// Heuristic choice between the spatial-single and spatial-multiple
/// batchnorm implementations. Returns true when spatial multiple is
/// expected to perform better for this problem.
/// TODO: review these conditions (variant 2 was optimized and vectorization
/// was added, so a set of benchmarks is needed to confirm they still hold).
inline bool UseMultiple(const miopen::batchnorm::ProblemDescription& problem)
{
    size_t n, c, h, w;
    std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths());

    unsigned int in_cstride = h * w;
    unsigned int in_nhw     = n * in_cstride;

    // The heuristic gates only NCHW training directions; everything else
    // (in particular NHWC) always prefers spatial multiple.
    if(problem.IsLayoutNHWC())
        return true;

    // Problem is either very large overall or has a small spatial extent...
    const bool big_or_flat =
        (in_nhw >= static_cast<size_t>(32 * 1024 * 1024) || in_cstride <= 1024);
    // ...while still having enough HW elements to parallelize over.
    const bool deep_enough = in_cstride > 512;

    switch(problem.GetDirection())
    {
    case miopen::batchnorm::Direction::Backward: return big_or_flat && deep_enough;
    case miopen::batchnorm::Direction::ForwardTraining:
        // Forward training additionally needs a minimum batch size.
        return n >= 3 && big_or_flat && deep_enough;
    default: return true;
    }
}

inline void DefaultConfigSpatialSingle(const miopen::batchnorm::ProblemDescription& problem,
std::vector<std::string>& valid_kernels)
{
int n, c, h, w;
std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths());
std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths());

unsigned int in_cstride = h * w;
unsigned int in_nhw = n * in_cstride;

if(problem.IsLayoutNHWC())
bool bfpmixparm =
problem.GetXDesc().GetType() == miopenHalf && problem.GetBnScale().GetType() == miopenFloat
? true
: false;

// NCHW supports also variants 0 and 3 which can be much faster than
// variant 1 but have more restrictions. Here we decide if we use variant
// 0, 1, 3
// In case variant 0 or 3 are selected, we add also variant 1 for tuning.
// Almost always variant 0 and 3 will be faster than variant 1 but
// we add the latter for tuning to be sure and because it is cheap
if(!problem.IsLayoutNHWC())
{
// Variant 2 is the primary choice for NHWC
size_t xlocalsize, ylocalsize, vectorsize;

// The configuration is ignored at this point, it was just computed to check
// if spatial multiple could be applied.
return GetConfigNHWC(problem, maxCUs, stash_values, xlocalsize, ylocalsize, vectorsize);
#if(WORKAROUND_SWDEV_253606 == 0)
if(n < 3 && problem.GetDirection() == miopen::batchnorm::Direction::ForwardTraining)
{
valid_kernels.push_back(GetKernelIdFromVariant(4, 1));
valid_kernels.push_back(GetKernelIdFromVariant(1, 1));
return;
}
#endif

if((in_cstride < 200) && (in_cstride > 60) && bfpmixparm)
{
valid_kernels.push_back(GetKernelIdFromVariant(1, 1));
return;
}

// N*H*W < 32M and H*W > 1024
// use batchnorm variant#1 implementation which parallelize
// work groups over channels and loop through NHW.
if((in_nhw < (32 * 1024 * 1024) && in_cstride > 1024))
{
valid_kernels.push_back(GetKernelIdFromVariant(1, 1));
return;
}
// N*H*W < 32M and H*W > 512
// use batchnorm variant#1 or variant#3 implementation which
// parallelize work groups over channels and loop through N.
else if(in_nhw < (32 * 1024 * 1024) && in_cstride > 512)
{
if(n >= 32)
{
valid_kernels.push_back(GetKernelIdFromVariant(1, 1));
return;
}
else
{
valid_kernels.push_back(GetKernelIdFromVariant(3, 1));
valid_kernels.push_back(GetKernelIdFromVariant(1, 1));
return;
}
}
// H*W < 512 use batchnorm variant#0 or variant#3 implementation
// based on batch size and H*W
else if(in_cstride <= 512)
{
if((n > 64) && (in_cstride > 160))
{
valid_kernels.push_back(GetKernelIdFromVariant(3, 1));
valid_kernels.push_back(GetKernelIdFromVariant(1, 1));
return;
}
else
{
valid_kernels.push_back(GetKernelIdFromVariant(0, 1));
valid_kernels.push_back(GetKernelIdFromVariant(1, 1));
return;
}
}
}
else
valid_kernels.push_back(GetKernelIdFromVariant(1, 1));
}

inline void DefaultConfigSpatialMultiple(const miopen::batchnorm::ProblemDescription& problem,
unsigned int stash_values,
std::vector<std::string>& valid_kernels)
{
int n, c, h, w;
std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths());
unsigned int in_cstride = h * w;

// Default configuration for spatial multiple tries to use vectorization
// for both NCHW or NHWC
size_t vectorsize =
problem.IsLayoutNHWC() ? (c % 4 == 0 ? 4 : 1) : (in_cstride % 4 == 0 ? 4 : 1);
if(IsSpatialMultipleApplicable(problem, vectorsize, stash_values))
{
unsigned int ylocalsize = 1024;
unsigned int last_ylocalsize =
in_cstride % ylocalsize == 0 ? ylocalsize : in_cstride % ylocalsize;
// Restrictions:
// - last block must have enough space to stash intermediate results in HW dimension
// - if last block doesn't fit, intermediate results are stored in N dimension which must
// be large enough
stash_values *= (problem.GetXDesc().GetType() == miopenFloat ? 1 : 2);
if(last_ylocalsize < stash_values && n < (size_t)stash_values)
valid_kernels.push_back(GetKernelIdFromVariant(2, vectorsize));
// if vectorized version is applicable, then the non vectorized version
// is also added to the list of configurations
if(vectorsize > 1)
{
return false;
valid_kernels.push_back(GetKernelIdFromVariant(2, 1));
}
return;
}

// If spatial multiple with vectorization can not be used, try without vectorization
if(vectorsize > 1 && IsSpatialMultipleApplicable(problem, 1, stash_values))
{
valid_kernels.push_back(GetKernelIdFromVariant(2, 1));
}
return true;
}

} // namespace batchnorm
Expand Down
Loading
Loading