diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc
index d01cd98574..e0bbbbbb18 100644
--- a/inference/incr_decoding/incr_decoding.cc
+++ b/inference/incr_decoding/incr_decoding.cc
@@ -134,7 +134,7 @@ void FlexFlow::top_level_task(Task const *task,
   bool do_sample = false;
   float temperature = 0.0f;
   float topp = 0.0f;
-  int max_requests_per_batch = 8;
+  int max_requests_per_batch = 1; // was 8 before this change
   int max_tokens_per_batch = 128;
   int max_sequence_length = 256;
 
diff --git a/inference/models/mixtral.cc b/inference/models/mixtral.cc
index 942c0f421b..f8f9c28cca 100644
--- a/inference/models/mixtral.cc
+++ b/inference/models/mixtral.cc
@@ -35,7 +35,6 @@ void MIXTRAL::create_mixtral_model(FFModel &ff,
                                    bool use_full_precision) {
 
   MixtralConfig mixtral_config(model_config_file_path);
-  mixtral_config.print();
 
   if (ff.config.tensor_parallelism_degree > mixtral_config.num_attention_heads ||
       mixtral_config.num_attention_heads % ff.config.tensor_parallelism_degree !=
@@ -45,6 +44,8 @@ void MIXTRAL::create_mixtral_model(FFModel &ff,
   }
 
   std::unordered_map<std::string, Layer *> weights_layers;
+  mixtral_config.num_local_experts = 5;
+  mixtral_config.num_experts_per_tok = 1;
 
   Tensor input;
   {
diff --git a/prof_1.gz b/prof_1.gz
new file mode 100644
index 0000000000..aba0771906
Binary files /dev/null and b/prof_1.gz differ
diff --git a/prof_2.gz b/prof_2.gz
new file mode 100644
index 0000000000..955c4ed035
Binary files /dev/null and b/prof_2.gz differ
diff --git a/prof_3.gz b/prof_3.gz
new file mode 100644
index 0000000000..f3c39fce2b
Binary files /dev/null and b/prof_3.gz differ
diff --git a/prof_4.gz b/prof_4.gz
new file mode 100644
index 0000000000..ffd6fa293d
Binary files /dev/null and b/prof_4.gz differ
diff --git a/prof_5.gz b/prof_5.gz
new file mode 100644
index 0000000000..ca5146f762
Binary files /dev/null and b/prof_5.gz differ
diff --git a/prof_6.gz b/prof_6.gz
new file mode 100644
index 0000000000..aabc8c5c04
Binary files /dev/null and b/prof_6.gz differ
diff --git a/src/ops/aggregate.cc b/src/ops/aggregate.cc
index 1e4fb407a0..39f42f32de 100644
--- a/src/ops/aggregate.cc
+++ b/src/ops/aggregate.cc
@@ -353,13 +353,13 @@ OpMeta *Aggregate::init_task(Task const *task,
 
   // Only needed to allocate memroy in the kernel
   AggregateMeta *m = new AggregateMeta(handle, agg, gpu_mem_allocator);
-  for (int i = 0; i < regions.size() - 1; i++) {
+  int num_inputs = agg->n + FIXED_ARG_CNT;
+  for (int i = 0; i < num_inputs; i++) {
     m->input_type[i] = agg->inputs[i]->data_type;
   }
   m->output_type[0] = agg->outputs[0]->data_type;
   std::strcpy(m->op_name, agg->name);
 
-  // TODO three instructions below are not in SigmoidSiluMulti::init_task
   m->profiling = agg->profiling;
   m->inference_debugging = agg->inference_debugging;
   std::strcpy(m->op_name, agg->name);
diff --git a/src/ops/group_by.cu b/src/ops/group_by.cu
index d7c471bf07..516504434a 100644
--- a/src/ops/group_by.cu
+++ b/src/ops/group_by.cu
@@ -37,7 +37,10 @@ __global__ void
 
   // Get pred pointers, single thread per block
   if (threadIdx.x == 0) {
-    int exp_tensor_rows = ceil(alpha * k / n * batch_size);
+    // Size rows from alpha when it is set; fall back to 128 rows when it is 0.
+    // NOTE(review): 128 presumably mirrors max_tokens_per_batch - confirm.
+    int exp_tensor_rows =
+        alpha != 0.0f ? ceil(alpha * k / n * batch_size) : 128;
     int expert_idx[MAX_N] = {0};
     for (int i = 0; i < k * batch_size; i++) {
       // Get pointer to chosen expert predictions