diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index d01cd98574..e0bbbbbb18 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -134,7 +134,7 @@ void FlexFlow::top_level_task(Task const *task, bool do_sample = false; float temperature = 0.0f; float topp = 0.0f; - int max_requests_per_batch = 8; + int max_requests_per_batch = 1; // 8 int max_tokens_per_batch = 128; int max_sequence_length = 256; diff --git a/inference/models/mixtral.cc b/inference/models/mixtral.cc index 942c0f421b..f8f9c28cca 100644 --- a/inference/models/mixtral.cc +++ b/inference/models/mixtral.cc @@ -35,7 +35,6 @@ void MIXTRAL::create_mixtral_model(FFModel &ff, bool use_full_precision) { MixtralConfig mixtral_config(model_config_file_path); - mixtral_config.print(); if (ff.config.tensor_parallelism_degree > mixtral_config.num_attention_heads || mixtral_config.num_attention_heads % ff.config.tensor_parallelism_degree != @@ -45,6 +44,8 @@ void MIXTRAL::create_mixtral_model(FFModel &ff, } std::unordered_map weights_layers; + mixtral_config.num_local_experts = 5; + mixtral_config.num_experts_per_tok = 1; Tensor input; { diff --git a/prof_1.gz b/prof_1.gz new file mode 100644 index 0000000000..aba0771906 Binary files /dev/null and b/prof_1.gz differ diff --git a/prof_2.gz b/prof_2.gz new file mode 100644 index 0000000000..955c4ed035 Binary files /dev/null and b/prof_2.gz differ diff --git a/prof_3.gz b/prof_3.gz new file mode 100644 index 0000000000..f3c39fce2b Binary files /dev/null and b/prof_3.gz differ diff --git a/prof_4.gz b/prof_4.gz new file mode 100644 index 0000000000..ffd6fa293d Binary files /dev/null and b/prof_4.gz differ diff --git a/prof_5.gz b/prof_5.gz new file mode 100644 index 0000000000..ca5146f762 Binary files /dev/null and b/prof_5.gz differ diff --git a/prof_6.gz b/prof_6.gz new file mode 100644 index 0000000000..aabc8c5c04 Binary files /dev/null and b/prof_6.gz differ diff --git a/src/ops/aggregate.cc b/src/ops/aggregate.cc index 1e4fb407a0..39f42f32de 100644 --- a/src/ops/aggregate.cc +++ b/src/ops/aggregate.cc @@ -353,13 +353,13 @@ OpMeta *Aggregate::init_task(Task const *task, // Only needed to allocate memroy in the kernel AggregateMeta *m = new AggregateMeta(handle, agg, gpu_mem_allocator); - for (int i = 0; i < regions.size() - 1; i++) { + int num_inputs = agg->n + FIXED_ARG_CNT; + for (int i = 0; i < num_inputs; i++) { m->input_type[i] = agg->inputs[i]->data_type; } m->output_type[0] = agg->outputs[0]->data_type; std::strcpy(m->op_name, agg->name); - // TODO three instructions below are not in SigmoidSiluMulti::init_task m->profiling = agg->profiling; m->inference_debugging = agg->inference_debugging; std::strcpy(m->op_name, agg->name); diff --git a/src/ops/group_by.cu b/src/ops/group_by.cu index d7c471bf07..516504434a 100644 --- a/src/ops/group_by.cu +++ b/src/ops/group_by.cu @@ -37,7 +37,7 @@ __global__ void // Get pred pointers, single thread per block if (threadIdx.x == 0) { - int exp_tensor_rows = ceil(alpha * k / n * batch_size); + int exp_tensor_rows = alpha == 0.0f ? ceil(alpha * k / n * batch_size) : 128; int expert_idx[MAX_N] = {0}; for (int i = 0; i < k * batch_size; i++) { // Get pointer to chosen expert predictions