From f9ed44558116bede4d9993dd7a51849fd6630383 Mon Sep 17 00:00:00 2001 From: Hugo Latendresse Date: Tue, 10 Dec 2024 18:32:07 -0500 Subject: [PATCH] ZHIHAO COMMENTS --- inference/models/mixtral.cc | 4 ++-- src/ops/aggregate.cc | 15 +++++++++++++++ src/ops/aggregate.cu | 2 ++ src/ops/inc_multihead_self_attention.cc | 4 +++- src/ops/inc_multihead_self_attention.cu | 3 ++- src/ops/sigmoid_silu_multi.cu | 2 +- 6 files changed, 25 insertions(+), 5 deletions(-) diff --git a/inference/models/mixtral.cc b/inference/models/mixtral.cc index 2c5429873a..6b48ef8668 100644 --- a/inference/models/mixtral.cc +++ b/inference/models/mixtral.cc @@ -273,7 +273,7 @@ void MIXTRAL::create_mixtral_model(FFModel &ff, Tensor aggregate_inputs[4 + mixtral_config.num_local_experts] = {nullptr}; for (int expert_idx = 0; expert_idx < mixtral_config.num_local_experts; expert_idx++) { - grouped_tokens[expert_idx] = ff_norm; // TODO this is a dirty fix. Restore using group_by! +// grouped_tokens[expert_idx] = ff_norm; // TODO this is a dirty fix. Restore using group_by! Tensor w1 = ff.dense(grouped_tokens[expert_idx], // (hidden_size, 1, result of calc in groupby) mixtral_config.intermediate_size, AC_MODE_NONE, @@ -336,7 +336,7 @@ void MIXTRAL::create_mixtral_model(FFModel &ff, aggregate_inputs[0] = topk_values; aggregate_inputs[1] = topk_indices; - aggregate_inputs[2] = topk_values; + aggregate_inputs[2] = topk_values; // TODO Causes Legion runtime error!! aggregate_inputs[3] = gate; mlp_out = aggregate_inputs[5]; // TODO don't use only one expert // mlp_out = ff.aggregate(aggregate_inputs, diff --git a/src/ops/aggregate.cc b/src/ops/aggregate.cc index de8b3a94ae..fbf6a30494 100644 --- a/src/ops/aggregate.cc +++ b/src/ops/aggregate.cc @@ -313,6 +313,7 @@ OpMeta *Aggregate::init_task(Task const *task, // ... including some steps with GenericTensorAccessorR // Shoud I include? + // Only needed to allocate memroy in the kernel AggregateMeta *m = new AggregateMeta(handle, agg, gpu_mem_allocator); for (int i = 0; i < 10; i++) { // TODO 10 is a magic number m->input_type[i] = agg->inputs[i]->data_type; @@ -489,6 +490,18 @@ void Aggregate::forward_task(Task const *task, ctx, task->regions[total_input_cnt].region.get_index_space()); + Aggregate::forward_kernel_wrapper(m, + bc, + exp_preds, + acc_gate_assign.ptr(rect_gate_assign), + acc_gate_pred.ptr(rect_gate_pred), + acc_output.ptr(rect_output), + n, + k, + rows, + batch_size, + out_dim); + // TODO One of those three linese cause the mismatch error // get gate_pred, gate_assign, output //AccessorRO const acc_gate_pred(regions[0], FID_DATA); // This one alone does cause the problem @@ -534,6 +547,7 @@ void Aggregate::forward_task(Task const *task, // printf("CALLING FOWARD_KERNEL_WRAPPER IN FORWARD_TASK\n"); + // From ZJ: we lose shape of tensors when we do this approach. Appraoch in sigmoid silu is recommended // Aggregate::forward_kernel_wrapper(m, // exp_preds, // acc_gate_assign.ptr(rect_gate_assign), @@ -604,6 +618,7 @@ void Aggregate::inference_task(Task const *task, // TODO should we have an inference_kernel wrapper? Aggregate::forward_kernel_wrapper(m, + bc, exp_preds, acc_gate_assign.ptr(rect_gate_assign), acc_gate_pred.ptr(rect_gate_pred), diff --git a/src/ops/aggregate.cu b/src/ops/aggregate.cu index 2161c50542..906e8c8026 100644 --- a/src/ops/aggregate.cu +++ b/src/ops/aggregate.cu @@ -200,6 +200,7 @@ __global__ void agg_backward_kernel(float **exp_preds, /*static*/ void Aggregate::forward_kernel_wrapper(AggregateMeta const *m, + BatchConfig const *bc, float **exp_preds, int const *acc_gate_assign_ptr, float const *acc_gate_pred_ptr, @@ -307,6 +308,7 @@ void Aggregate::backward_kernel_wrapper(AggregateMeta const *m, } } +// Only needed if we allocate memory , hwihci s not our case AggregateMeta::AggregateMeta(FFHandler handler, Aggregate const *aggr, MemoryAllocator &gpu_mem_allocator) diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index 4daf8a054c..0e305b8f91 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -493,7 +493,8 @@ OpMeta *IncMultiHeadSelfAttention::init_task( // printf("running IncMultiHeadSelfAttention::init_task\n"); IncMultiHeadSelfAttention const *attn = (IncMultiHeadSelfAttention *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - + // We call the below to get the shape info, so we can do the assertions + // I also shouldnt care about offloading GenericTensorAccessorR input = helperGetGenericTensorAccessorRO(attn->inputs[0]->data_type, regions[0], @@ -745,6 +746,7 @@ bool IncMultiHeadSelfAttention::get_int_parameter(PMParameter para, } } +// Just for benchmarking, don't need that bool IncMultiHeadSelfAttention::measure_operator_cost( Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const { return false; diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index e3565179a5..3619a1fbae 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -908,7 +908,7 @@ void compute_attention_kernel_generation(IncMultiHeadSelfAttentionMeta const *m, int const per_head_size = m->qProjSize; float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; size_t smem_sz; - if (per_head_size == 32) { + if (per_head_size == 32) { // ok to do that constexpr int THREADS_PER_VALUE_32 = threads_per_value_t::value; LAUNCH_ATTENTION_SCORE_KERNEL( DT, 32, 32, 4, THREADS_PER_VALUE_32, 128, stream); @@ -1517,6 +1517,7 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( assert(input.data_type == output.data_type); if (input.data_type == DT_HALF) { + // calling input.get_inc_ptr() below would cause a legion error type mismatch get index space doamine Kernels::IncMultiHeadAttention::inference_kernel( m, bc, shard_id, input.get_half_ptr(), output.get_half_ptr(), stream); } else if (input.data_type == DT_FLOAT) { diff --git a/src/ops/sigmoid_silu_multi.cu b/src/ops/sigmoid_silu_multi.cu index ca0168a59d..8446b0f28f 100644 --- a/src/ops/sigmoid_silu_multi.cu +++ b/src/ops/sigmoid_silu_multi.cu @@ -181,7 +181,7 @@ void SigmoidSiluMulti::inference_kernel_wrapper( min(CUDA_NUM_THREADS, num_elements), 0, stream>>>(input1.domain.get_volume(), - input1.get_float_ptr(), + input1.get_float_ptr(), // Ultimately we get pointers,whereas in mixtarl branch we pass pointers to this func. input2.get_float_ptr(), output.get_float_ptr()); } else if (m->input_type[0] == DT_HALF) {