Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Exhaustive tune reduce operators #3751

Open
wants to merge 3 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions src/targets/gpu/compile_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
#include <migraphx/par_for.hpp>
#include <migraphx/register_op.hpp>
#include <migraphx/algorithm.hpp>
#include <migraphx/pass_manager.hpp>
#include <migraphx/dead_code_elimination.hpp>
#include <migraphx/op/identity.hpp>
#include <migraphx/gpu/compiler.hpp>
#include <migraphx/gpu/compile_ops.hpp>
Expand Down Expand Up @@ -225,8 +227,8 @@ struct compile_plan
auto bench_ins = bench_mm->add_instruction(
cr->ins->get_operator(), bench_ins_inputs, cr->ins->module_inputs());
cr->replace.replace(*bench_mm, bench_ins);
// do dead code elimination by directly removing instruction
bench_mm->remove_instruction(bench_ins);
// do dead code elimination
run_passes(*bench_mm, {dead_code_elimination{}});
auto t = time_program(*ctx, bench_prog, 20);
if(trace_level > 1)
std::cout << t << "ms" << std::endl;
Expand Down
59 changes: 49 additions & 10 deletions src/targets/gpu/jit/reduce.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,22 @@
return reduce_lens;
}

// Returns the shape with the most elements among `inputs`; for a fused
// reduction this is the (pre-reduction) input shape.
static shape get_input_shape(const std::vector<shape>& inputs)
{
    auto largest = std::max_element(
        inputs.begin(), inputs.end(), [](const shape& a, const shape& b) {
            return a.elements() < b.elements();
        });
    return *largest;
}

// Returns the shape with the fewest elements among `inputs`; for a fused
// reduction this is the (post-reduction) output shape.
// NOTE(review): not referenced anywhere in this translation unit yet —
// clang-tidy flags it as clang-diagnostic-unused-function (an error in this
// build). Marked [[maybe_unused]] to keep CI green until a caller lands;
// remove the attribute (or the helper) once that is resolved.
[[maybe_unused]] static shape get_reduce_shape(const std::vector<shape>& inputs)
{
    auto it = std::min_element(inputs.begin(),
                               inputs.end(),
                               by(std::less<>{}, [](const shape& s) { return s.elements(); }));
    return *it;
}

template <class T>
static shape get_reduced_shape(const shape& s, const std::vector<T>& axes)
{
Expand Down Expand Up @@ -310,14 +326,6 @@
{
// Operator names this compiler is registered to handle.
std::vector<std::string> names() const
{
    return {"fused_reduce", "split_fused_reduce"};
}

static shape get_input_shape(const std::vector<shape>& inputs)
{
auto it = std::max_element(inputs.begin(),
inputs.end(),
by(std::less<>{}, [](const shape& s) { return s.elements(); }));
return *it;
}

operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
{
auto assign = v.get("assign", "assign_none");
Expand Down Expand Up @@ -352,7 +360,7 @@
auto relements = reduction_shape.elements() / vec.size;
if(algo == "block")
{
auto block_size = compute_block_size(ctx, relements, 256);
auto block_size = v.get("block_size", compute_block_size(ctx, relements, 256));
if(relements >= block_size * 256)
algo = "block_large";
options.set_launch_params(
Expand Down Expand Up @@ -392,16 +400,47 @@
return compile_hip_code_object(ctx, src, options);
}

// Builds the compile-time value for the fused reduce kernel and dispatches to
// compile_op. `solution` carries tuned parameters (e.g. algo, block_size)
// selected by get_tuning_config; they are merged into the op's value so
// compile_op can honor them.
compiler_replace compile(context& ctx, instruction_ref ins, const operation& op, const value& solution) const
{
    assert(not ins->module_inputs().empty());
    auto v = op.to_value();
    // Overlay the tuned solution entries onto the op's own settings.
    for(const auto& entry : solution)
        v.insert(entry);
    auto* reduce_mod = ins->module_inputs().front();
    v["preamble"]    = generate_reduce(*reduce_mod, "fused_reduce_op");
    v["lambda"]      = "MIGRAPHX_LIFT(fused_reduce_op)";
    v["kernel"]      = generate_name_from_ops(*reduce_mod) + "_kernel";
    return compile_op(ctx, to_shapes(ins->inputs()), v);
}

// Produces the candidate solution set for exhaustive tuning of fused_reduce.
// Returns nullopt when not tuning exhaustively or for split_fused_reduce
// (which is not tuned here). Each solution is a value merged into the op's
// settings by compile() above.
optional<tuning_config> get_tuning_config(const context&,
                                          instruction_ref ins,
                                          const operation& op,
                                          bool exhaustive) const
{
    if(not exhaustive)
        return nullopt;
    if(op.name() != "fused_reduce")
        return nullopt;
    tuning_config tc;
    auto shapes = to_shapes(ins->inputs());
    // Key the problem on both the input shapes and the generated kernel name:
    // two fused modules with identical shapes but different fused ops must not
    // share tuning-cache entries.
    tc.problem = {{"shapes", to_value(shapes)},
                  {"kernel", generate_name_from_ops(*ins->module_inputs().front())}};
    auto axes         = op.to_value().at("axes").to_vector<std::size_t>();
    auto input_shape  = get_input_shape(shapes);
    auto reduce_shape = get_reduced_shape(input_shape, axes);
    auto relements    = reduce_shape.elements();
    // Only offer block sizes that don't exceed the reduction size.
    for(auto block_size : {64, 128, 256, 512, 1024})
    {
        if(relements < block_size)
            continue;
        tc.solutions.push_back({{"algo", "block"}, {"block_size", block_size}});
    }
    tc.solutions.push_back({{"algo", "lane"}});
    // The wave algorithm is only profitable for smaller reductions.
    if(relements < 16384)
        tc.solutions.push_back({{"algo", "wave"}});
    return tc;
}

};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
Expand Down
Loading