From 43c9b5fb4408c5ce7f9dd19d1eed18737489b4e0 Mon Sep 17 00:00:00 2001 From: Hongxin Liu Date: Fri, 21 Feb 2025 15:24:23 +0800 Subject: [PATCH 001/100] [chat] add distributed impl (#6210) --- .../ColossalChat/coati/distributed/README.md | 6 + .../coati/distributed/__init__.py | 0 .../ColossalChat/coati/distributed/comm.py | 57 ++++++ .../coati/distributed/consumer.py | 190 ++++++++++++++++++ .../coati/distributed/inference_backend.py | 164 +++++++++++++++ .../ColossalChat/coati/distributed/launch.py | 87 ++++++++ .../coati/distributed/producer.py | 160 +++++++++++++++ .../ColossalChat/coati/distributed/utils.py | 40 ++++ applications/ColossalChat/rl_example.py | 94 +++++++++ 9 files changed, 798 insertions(+) create mode 100644 applications/ColossalChat/coati/distributed/README.md create mode 100644 applications/ColossalChat/coati/distributed/__init__.py create mode 100644 applications/ColossalChat/coati/distributed/comm.py create mode 100644 applications/ColossalChat/coati/distributed/consumer.py create mode 100644 applications/ColossalChat/coati/distributed/inference_backend.py create mode 100644 applications/ColossalChat/coati/distributed/launch.py create mode 100644 applications/ColossalChat/coati/distributed/producer.py create mode 100644 applications/ColossalChat/coati/distributed/utils.py create mode 100644 applications/ColossalChat/rl_example.py diff --git a/applications/ColossalChat/coati/distributed/README.md b/applications/ColossalChat/coati/distributed/README.md new file mode 100644 index 000000000000..b7bac2b2db93 --- /dev/null +++ b/applications/ColossalChat/coati/distributed/README.md @@ -0,0 +1,6 @@ +# Requirements + +```bash +pip install cupy-cuda12x +python -m cupyx.tools.install_library --cuda 12.x --library nccl +``` diff --git a/applications/ColossalChat/coati/distributed/__init__.py b/applications/ColossalChat/coati/distributed/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/applications/ColossalChat/coati/distributed/comm.py b/applications/ColossalChat/coati/distributed/comm.py new file mode 100644 index 000000000000..3824303f55bd --- /dev/null +++ b/applications/ColossalChat/coati/distributed/comm.py @@ -0,0 +1,57 @@ +from typing import Any, Dict + +import ray.util.collective as cc +import torch +import torch.distributed.distributed_c10d as c10d +from packaging.version import Version + + +def ray_broadcast_object(obj: Any, src: int = 0, device=None, group_name: str = "default") -> Any: + rank = cc.get_rank(group_name) + if rank == src: + if Version(torch.__version__) >= Version("2.3.0"): + obj_tensor, size_tensor = c10d._object_to_tensor(obj, device=device, group=None) + elif Version(torch.__version__) >= Version("1.13.0"): + obj_tensor, size_tensor = c10d._object_to_tensor(obj, device=device) + else: + obj_tensor, size_tensor = c10d._object_to_tensor(obj) + obj_tensor = obj_tensor.to(device) + size_tensor = size_tensor.to(device) + else: + size_tensor = torch.empty(1, dtype=torch.int64, device=device) + cc.broadcast(size_tensor, src, group_name) + if rank != src: + obj_tensor = torch.empty(size_tensor.item(), dtype=torch.uint8, device=device) + cc.broadcast(obj_tensor, src, group_name) + if rank != src: + if Version(torch.__version__) >= Version("2.3.0"): + obj = c10d._tensor_to_object(obj_tensor, size_tensor.item(), group=None) + else: + obj = c10d._tensor_to_object(obj, size_tensor.item()) + return obj + + +def ray_broadcast_tensor_dict( + tensor_dict: Dict[str, torch.Tensor], src: int = 0, device=None, group_name: str = "default" 
+) -> Dict[str, torch.Tensor]: + rank = cc.get_rank(group_name) + if rank == src: + metadata = [] + for k, v in tensor_dict.items(): + metadata.append((k, v.shape, v.dtype)) + else: + metadata = None + metadata = ray_broadcast_object(metadata, src, device, group_name) + if rank != src: + out_dict = {} + for k, shape, dtype in metadata: + if rank == src: + tensor = tensor_dict[k] + else: + tensor = torch.empty(shape, dtype=dtype, device=device) + cc.broadcast(tensor, src, group_name) + if rank != src: + out_dict[k] = tensor + if rank == src: + out_dict = tensor_dict + return out_dict diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py new file mode 100644 index 000000000000..61417f7e6b41 --- /dev/null +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -0,0 +1,190 @@ +from contextlib import nullcontext +from typing import Any, Dict, Optional + +import ray +import ray.util.collective as cc +import torch +import torch.distributed as dist +from tqdm import tqdm +from transformers import AutoModelForCausalLM + +from colossalai.booster import Booster +from colossalai.booster.plugin import HybridParallelPlugin +from colossalai.initialize import launch +from colossalai.nn.optimizer import HybridAdam +from colossalai.utils import get_current_device + +from .comm import ray_broadcast_tensor_dict +from .utils import bind_batch, post_recv, unbind_batch + + +class BaseConsumer: + def __init__( + self, + num_producers: int, + num_episodes: int, + rank: int, + world_size: int, + master_addr: str, + master_port: int, + num_update_per_episode: int, + num_recv_per_update: int, + batch_size: int, + model_config: Dict[str, Any], + plugin_config: Dict[str, Any], + microbatch_size: int = 1, + ): + self.num_producers = num_producers + self.num_episodes = num_episodes + self.rank = rank + self.world_size = world_size + self.master_addr = master_addr + self.master_port = master_port + self.num_update_per_episode = num_update_per_episode + self.num_recv_per_update = num_recv_per_update + self.batch_size = batch_size + self.microbatch_size = microbatch_size + assert batch_size % microbatch_size == 0, "batch_size should be divisible by microbatch_size" + self.num_microbatches = batch_size // microbatch_size + + self.model_config = model_config + self.plugin_config = plugin_config + assert self.plugin_config.get("pp_size", 1) == 1, "pp_size > 1 is not supported now" + + self.device = get_current_device() + + def setup(self) -> None: + for i in range(self.num_producers): + cc.init_collective_group(self.world_size + 1, self.rank + 1, group_name=f"sync_data_{i}") + if self.rank == 0: + cc.init_collective_group(self.num_producers + 1, self.num_producers, group_name="sync_model") + launch(self.rank, self.world_size, self.master_addr, self.master_port, local_rank=0) + + plugin_config = dict( + tp_size=1, + pp_size=1, + precision="bf16", + zero_stage=1, + ) + if self.plugin_config.get("pp_size", 1) > 1 and "num_microbatches" not in self.plugin_config: + plugin_config["microbatch_size"] = self.microbatch_size + plugin_config.update(self.plugin_config) + self.plugin = HybridParallelPlugin(**plugin_config) + self.booster = Booster(plugin=self.plugin) + self.dp_rank = dist.get_rank(self.plugin.dp_group) + self.dp_size = dist.get_world_size(self.plugin.dp_group) + + self.buffer = [] + + self.recv_cnt = 0 + + def state_dict(self) -> Dict[str, torch.Tensor]: + raise NotImplementedError + + def step(self, step_idx: int, **kwargs) -> 
Optional[float]: + raise NotImplementedError + + def loop(self) -> None: + print( + f"Consumer{self.rank} num_update: {self.num_update_per_episode}, num_recv: {self.num_recv_per_update}, nmb: {self.num_microbatches}" + ) + for episode in range(self.num_episodes): + with tqdm(range(self.num_update_per_episode), desc=f"Episode {episode}", disable=self.rank != 0) as pbar: + for step in pbar: + i = 0 + for _ in range(self.num_recv_per_update): + # receive data from producers + + for r in range(self.num_producers): + print(f"[T{dist.get_rank()}] Recv data episode {episode} step {step} from {r}") + self.buffer.extend( + unbind_batch( + ray_broadcast_tensor_dict( + None, src=0, device=self.device, group_name=f"sync_data_{r}" + ) + ) + ) + while len(self.buffer) >= self.dp_size * self.microbatch_size: + batches = self.buffer[ + self.dp_rank * self.microbatch_size : (self.dp_rank + 1) * self.microbatch_size + ] + self.buffer = self.buffer[self.dp_size * self.microbatch_size :] + batch = bind_batch(batches) + batch = post_recv(batch) + loss = self.step(i, **batch) + if loss is not None: + pbar.set_postfix({"loss": loss}) + i += 1 + assert len(self.buffer) == 0 + if episode != self.num_episodes - 1 or step != self.num_update_per_episode - 1: + print(f"[T{dist.get_rank()}] Sync model episode {episode} step {step}") + state_dict = self.state_dict() + if self.rank == 0: + ray_broadcast_tensor_dict( + state_dict, src=self.num_producers, device=self.device, group_name="sync_model" + ) + + +@ray.remote +class SimpleConsumer(BaseConsumer): + def __init__( + self, + num_producers, + num_episodes, + rank, + world_size, + master_addr, + master_port, + num_update_per_episode, + num_recv_per_update, + batch_size, + model_config, + plugin_config, + microbatch_size=1, + ): + super().__init__( + num_producers, + num_episodes, + rank, + world_size, + master_addr, + master_port, + num_update_per_episode, + num_recv_per_update, + batch_size, + model_config, + plugin_config, + microbatch_size, + ) + path = model_config.pop("path") + self.model = AutoModelForCausalLM.from_pretrained(path, **model_config) + self.model.train() + self.model.gradient_checkpointing_enable() + self.optimizer = HybridAdam(self.model.parameters(), lr=1e-3) + self.accum_loss = torch.zeros(1, device=self.device) + + def setup(self): + super().setup() + self.model, self.optimizer, *_ = self.booster.boost(self.model, self.optimizer) + + def step(self, step_idx: int, **kwargs) -> Optional[float]: + need_update = (step_idx + 1) % self.num_microbatches == 0 + + ctx = nullcontext() if need_update else self.booster.no_sync(self.model, self.optimizer) + with ctx: + out = self.model(**kwargs) + loss = out.loss / self.num_microbatches + self.accum_loss.add_(loss.data) + self.booster.backward(loss, self.optimizer) + if need_update: + self.optimizer.step() + self.optimizer.zero_grad() + loss_scalar = self.accum_loss.item() + self.accum_loss.zero_() + return loss_scalar + + def state_dict(self): + self.model._force_wait_all_gather() + model = self.model.unwrap() + state_dict = model.state_dict() + return state_dict diff --git a/applications/ColossalChat/coati/distributed/inference_backend.py b/applications/ColossalChat/coati/distributed/inference_backend.py new file mode 100644 index 000000000000..d40808ab429f --- /dev/null +++ b/applications/ColossalChat/coati/distributed/inference_backend.py @@ -0,0 +1,164 @@ +from typing import Any, Dict + +import torch +import torch.nn.functional as F +from transformers import AutoConfig, AutoModelForCausalLM, 
PreTrainedTokenizer + +from colossalai.utils import get_current_device + +try: + import sglang as sgl +except ImportError: + sgl = None + +try: + from vllm import LLM, SamplingParams +except ImportError: + LLM = None + + +class BaseInferenceBackend: + def __init__(self, model_config: Dict[str, Any], generate_config: Dict[str, Any], tokenizer: PreTrainedTokenizer): + pass + + def generate(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwargs) -> Dict[str, torch.Tensor]: + pass + + def load_state_dict(self, state_dict: Dict[str, torch.Tensor]) -> None: + pass + + +class TransformersInferenceBackend(BaseInferenceBackend): + def __init__(self, model_config: Dict[str, Any], generate_config: Dict[str, Any], tokenizer: PreTrainedTokenizer): + path = model_config.pop("path") + defaut_config = dict( + trust_remote_code=True, + torch_dtype=torch.bfloat16, + device_map="auto", + ) + defaut_config.update(model_config) + self.model: AutoModelForCausalLM = AutoModelForCausalLM.from_pretrained(path, **defaut_config) + self.generate_config = generate_config + + def generate(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwargs) -> Dict[str, torch.Tensor]: + input_ids = input_ids.to(get_current_device()) + attention_mask = attention_mask.to(get_current_device()) + out = self.model.generate(input_ids, attention_mask=attention_mask, **kwargs, **self.generate_config) + input_len = input_ids.shape[-1] + labels = out.clone() + labels[..., :input_len] = -100 + attention_mask = F.pad(attention_mask, (0, out.shape[-1] - input_len), value=1) + attention_mask = attention_mask.expand_as(labels) + data = { + "input_ids": out, + "attention_mask": attention_mask, + "labels": labels, + } + return data + + def load_state_dict(self, state_dict: Dict[str, torch.Tensor]) -> None: + self.model.load_state_dict(state_dict) + + +class SGLangInferenceBackend(BaseInferenceBackend): + def __init__(self, model_config: Dict[str, Any], generate_config: Dict[str, Any], tokenizer: PreTrainedTokenizer): + if sgl is None: + raise ImportError("sglang is not installed") + path = model_config.pop("path") + defaut_config = dict( + trust_remote_code=True, + skip_tokenizer_init=True, + ) + defaut_config.update(model_config) + self.llm = sgl.Engine(model_path=path, **defaut_config) + self.generate_config = generate_config + self.tokenizer = tokenizer + self.config = AutoConfig.from_pretrained(path) + + def generate(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwargs) -> Dict[str, torch.Tensor]: + outputs = self.llm.generate(input_ids=input_ids.tolist(), sampling_params=self.generate_config) + out_tokens = [] + out_len = [] + for out in outputs: + out_tokens.append(out["token_ids"]) + out_len.append(out["meta_info"]["completion_tokens"]) + max_len = max(out_len) + input_len = input_ids.shape[-1] + attention_mask = F.pad(attention_mask, (0, max_len), value=1) + for i in range(len(out_tokens)): + out_tokens[i] = out_tokens[i] + [self.tokenizer.pad_token_id] * (max_len - out_len[i]) + attention_mask[i, input_len + out_len[i] :] = 0 + out = torch.tensor(out_tokens) + out = torch.cat((input_ids, out), dim=1) + labels = out.clone() + labels[..., :input_len] = -100 + for i in range(len(out_len)): + labels[i, input_len + out_len[i] :] = -100 + data = { + "input_ids": out, + "attention_mask": attention_mask, + "labels": labels, + } + data = {k: v.to(get_current_device()) for k, v in data.items()} + return data + + def load_state_dict(self, state_dict: Dict[str, torch.Tensor]) -> None: + if 
self.config.tie_word_embeddings: + del state_dict["lm_head.weight"] + named_tensors = [(k, v) for k, v in state_dict.items()] + self.llm.update_weights_from_tensor(named_tensors) + + +class VLLMInferenceBackend(BaseInferenceBackend): + def __init__(self, model_config: Dict[str, Any], generate_config: Dict[str, Any], tokenizer: PreTrainedTokenizer): + if LLM is None: + raise ImportError("vllm is not installed") + path = model_config.pop("path") + defaut_config = dict( + trust_remote_code=True, + # skip_tokenizer_init=True, + ) + defaut_config.update(model_config) + self.llm = LLM(path, **defaut_config) + self.generate_config = SamplingParams(**generate_config, stop_token_ids=[tokenizer.eos_token_id]) + self.tokenizer = tokenizer + self.config = AutoConfig.from_pretrained(path) + + def generate(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwargs) -> Dict[str, torch.Tensor]: + outputs = self.llm.generate( + prompt_token_ids=input_ids.tolist(), sampling_params=self.generate_config, use_tqdm=False + ) + out_tokens = [] + out_len = [] + for out in outputs: + out_tokens.append(list(out.outputs[0].token_ids)) + out_len.append(len(out.outputs[0].token_ids)) + max_len = max(out_len) + input_len = input_ids.shape[-1] + attention_mask = F.pad(attention_mask, (0, max_len), value=1) + for i in range(len(out_tokens)): + out_tokens[i] = out_tokens[i] + [self.tokenizer.pad_token_id] * (max_len - out_len[i]) + attention_mask[i, input_len + out_len[i] :] = 0 + out = torch.tensor(out_tokens) + out = torch.cat((input_ids, out), dim=1) + labels = out.clone() + labels[..., :input_len] = -100 + for i in range(len(out_len)): + labels[i, input_len + out_len[i] :] = -100 + data = { + "input_ids": out, + "attention_mask": attention_mask, + "labels": labels, + } + data = {k: v.to(get_current_device()) for k, v in data.items()} + return data + + def load_state_dict(self, state_dict: Dict[str, torch.Tensor]) -> None: + self.llm.llm_engine.model_executor.driver_worker.model_runner.model.load_weights(state_dict.items()) + + +BACKEND_MAP = { + "transformers": TransformersInferenceBackend, + "sglang": SGLangInferenceBackend, + "vllm": VLLMInferenceBackend, +} diff --git a/applications/ColossalChat/coati/distributed/launch.py b/applications/ColossalChat/coati/distributed/launch.py new file mode 100644 index 000000000000..438c463002f0 --- /dev/null +++ b/applications/ColossalChat/coati/distributed/launch.py @@ -0,0 +1,87 @@ +from typing import Any, Dict, Optional + +import ray + +from .consumer import SimpleConsumer +from .producer import SimpleProducer + + +def get_jsonl_size_fast(path: str) -> int: + with open(path) as f: + lines = f.readlines() + lines = [line for line in lines if line.strip()] + return len(lines) - 1 + + +def get_dp_size_fast(n_procs: int, plugin_config: Dict[str, Any]) -> int: + tp_size = plugin_config.get("tp_size", 1) + pp_size = plugin_config.get("pp_size", 1) + ep_size = plugin_config.get("ep_size", 1) + sp_size = plugin_config.get("sp_size", 1) + return n_procs // (tp_size * pp_size * ep_size * sp_size) + + +def launch_distributed( + num_producers: int, + num_proc_per_producer: int, + num_consumer_procs: int, + num_episodes: int, + inference_batch_size: int, + inference_microbatch_size: int, + train_batch_size: int, + train_microbatch_size: int, + dataset_config: Dict[str, Any], + dataloaders_config: Dict[str, Any], + inference_model_config: Dict[str, Any], + generate_config: Dict[str, Any], + train_model_config: Dict[str, Any], + plugin_config: Dict[str, Any], + tokenizer_config: 
Optional[Dict[str, Any]] = None, + inference_backend: str = "transformers", + master_addr: str = "localhost", + master_port: int = 29500, +): + train_dp_size = get_dp_size_fast(num_producers, plugin_config) + assert (inference_batch_size * num_producers) % (train_batch_size * train_dp_size) == 0 + + dataset_path = dataset_config["path"] + num_samples = get_jsonl_size_fast(dataset_path) + global_inference_batch_size = inference_batch_size * num_producers + num_update_per_episode = num_samples // global_inference_batch_size + num_recv_per_update = inference_batch_size // inference_microbatch_size + + procs = [] + for i in range(num_producers): + producer = SimpleProducer.options(num_gpus=num_proc_per_producer).remote( + producer_idx=i, + num_producers=num_producers, + num_consumer_procs=num_consumer_procs, + num_episodes=num_episodes, + batch_size=inference_batch_size, + dataset_config=dataset_config, + dataloaders_config=dataloaders_config, + model_config=inference_model_config, + generate_config=generate_config, + tokenizer_config=tokenizer_config, + microbatch_size=inference_microbatch_size, + backend=inference_backend, + ) + procs.append(producer) + for i in range(num_consumer_procs): + consumer = SimpleConsumer.options(num_gpus=1).remote( + num_producers=num_producers, + num_episodes=num_episodes, + rank=i, + world_size=num_consumer_procs, + master_addr=master_addr, + master_port=master_port, + num_update_per_episode=num_update_per_episode, + num_recv_per_update=num_recv_per_update, + batch_size=train_batch_size, + model_config=train_model_config, + plugin_config=plugin_config, + microbatch_size=train_microbatch_size, + ) + procs.append(consumer) + ray.get([p.setup.remote() for p in procs]) + ray.get([p.loop.remote() for p in procs]) diff --git a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py new file mode 100644 index 000000000000..3e4a5277ad2e --- /dev/null +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -0,0 +1,160 @@ +from typing import Any, Dict, Optional + +import ray +import ray.util.collective as cc +import torch +from coati.dataset.loader import RawConversationDataset +from torch.utils.data import DataLoader, DistributedSampler +from transformers import AutoTokenizer + +from colossalai.utils import get_current_device + +from .comm import ray_broadcast_tensor_dict +from .inference_backend import BACKEND_MAP +from .utils import pre_send + + +class BaseProducer: + def __init__( + self, + producer_idx: int, + num_producers: int, + num_consumer_procs: int, + num_episodes: int, + batch_size: int, + dataset_config: Dict[str, Any], + dataloaders_config: Dict[str, Any], + model_config: Dict[str, Any], + generate_config: Dict[str, Any], + tokenizer_config: Optional[Dict[str, Any]] = None, + microbatch_size: int = 1, + backend: str = "transformers", + ): + self.producer_idx = producer_idx + self.num_producers = num_producers + self.num_consumer_procs = num_consumer_procs + self.num_episodes = num_episodes + self.batch_size = batch_size + self.microbatch_size = microbatch_size + assert batch_size % microbatch_size == 0 + self.num_microbatches = batch_size // microbatch_size + + self.dataset_config = dataset_config + self.model_config = model_config + self.generate_config = generate_config + self.tokenizer_config = tokenizer_config + + # init tokenizer + if tokenizer_config is None: + tokenizer_path = model_config["path"] + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) + else: + 
tokenizer_path = tokenizer_config.pop("path") + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, **tokenizer_config) + self.tokenizer.padding_side = "left" + + # init dataloader + dataset_path = dataset_config.pop("path") + self.dataset = RawConversationDataset(self.tokenizer, dataset_path, **dataset_config) + self.dataloader = DataLoader( + self.dataset, + batch_size=microbatch_size, + sampler=DistributedSampler( + self.dataset, + num_replicas=num_producers, + rank=producer_idx, + shuffle=True, + drop_last=True, + seed=42, + ), + num_workers=4, + ) + self.device = get_current_device() + + # init backend + if backend in BACKEND_MAP: + self.backend_cls = BACKEND_MAP[backend] + else: + raise ValueError(f"Unexpected backend {backend}") + + def setup(self) -> None: + cc.init_collective_group(1 + self.num_consumer_procs, 0, group_name=f"sync_data_{self.producer_idx}") + cc.init_collective_group(self.num_producers + 1, self.producer_idx, group_name="sync_model") + + def rollout(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwargs) -> Dict[str, torch.Tensor]: + raise NotImplementedError + + def load_state_dict(self, state_dict: Dict[str, torch.Tensor]) -> None: + raise NotImplementedError + + def loop(self) -> None: + num_update_per_episode = len(self.dataloader) // self.num_microbatches + num_valid_microbatches = num_update_per_episode * self.num_microbatches + + print( + f"[P{self.producer_idx}] num_valid_microbatches {num_valid_microbatches}, nmb: {self.num_microbatches}, dl: {len(self.dataloader)}" + ) + for episode in range(self.num_episodes): + self.dataloader.sampler.set_epoch(episode) + for i, batch in enumerate(self.dataloader): + if i >= num_valid_microbatches: + break + outputs = self.rollout(**batch) + print(f"[P{self.producer_idx}] Send data {[(k, v.shape) for k, v in outputs.items()]}") + outputs = pre_send(outputs) + ray_broadcast_tensor_dict( + outputs, src=0, device=self.device, group_name=f"sync_data_{self.producer_idx}" + ) + + if (i + 1) % self.num_microbatches == 0 and ( + episode != self.num_episodes - 1 or i != num_valid_microbatches - 1 + ): + # don't sync model for last iteration + print( + f"[P{self.producer_idx}] Sync model episode {episode} step {(i + 1) // self.num_microbatches - 1}" + ) + state_dict = ray_broadcast_tensor_dict( + None, self.num_producers, device=self.device, group_name="sync_model" + ) + self.load_state_dict(state_dict) + + +@ray.remote +class SimpleProducer(BaseProducer): + def __init__( + self, + producer_idx, + num_producers, + num_consumer_procs, + num_episodes, + batch_size, + dataset_config, + dataloaders_config, + model_config, + generate_config, + tokenizer_config=None, + microbatch_size=1, + backend="transformers", + ): + super().__init__( + producer_idx, + num_producers, + num_consumer_procs, + num_episodes, + batch_size, + dataset_config, + dataloaders_config, + model_config, + generate_config, + tokenizer_config, + microbatch_size, + backend, + ) + self.model = self.backend_cls(model_config, generate_config, self.tokenizer) + + @torch.no_grad() + def rollout(self, input_ids, attention_mask, **kwargs): + return self.model.generate(input_ids, attention_mask, **kwargs) + + def load_state_dict(self, state_dict): + self.model.load_state_dict(state_dict) diff --git a/applications/ColossalChat/coati/distributed/utils.py b/applications/ColossalChat/coati/distributed/utils.py new file mode 100644 index 000000000000..2f3267a1f494 --- /dev/null +++ b/applications/ColossalChat/coati/distributed/utils.py @@ -0,0 +1,40 @@ 
+from typing import Dict, List + +import torch + + +def unbind_batch(batch: Dict[str, torch.Tensor]) -> List[Dict[str, torch.Tensor]]: + batches = [] + for k, v in batch.items(): + if len(batches) == 0: + unbinded_tensors = v.unbind(0) + batches = [{k: tensor} for tensor in unbinded_tensors] + else: + unbinded_tensors = v.unbind(0) + assert len(batches) == len(unbinded_tensors) + for i, tensor in enumerate(unbinded_tensors): + batches[i][k] = tensor + return batches + + +def bind_batch(batches: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]: + batch = {} + for k in batches[0].keys(): + batch[k] = torch.stack([batch[k] for batch in batches], dim=0) + return batch + + +def pre_send(batch: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + # compress attention_mask to save bandwidth + if "attention_mask" in batch: + attention_mask = batch["attention_mask"] + batch["attention_mask"] = attention_mask.to(torch.bool) + return batch + + +def post_recv(batch: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + # decompress attention_mask + if "attention_mask" in batch: + attention_mask = batch["attention_mask"] + batch["attention_mask"] = attention_mask.to(torch.int) + return batch diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py new file mode 100644 index 000000000000..a6f82b3bebf9 --- /dev/null +++ b/applications/ColossalChat/rl_example.py @@ -0,0 +1,94 @@ +import argparse + +import ray +import torch +from coati.distributed.launch import launch_distributed + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-m", "--model", type=str, default="Qwen/Qwen2.5-7B") + parser.add_argument("-d", "--dataset", type=str, default="data.jsonl") + parser.add_argument("-t", "--num-trainers", type=int, default=2) + parser.add_argument("-i", "--num-inferencer", type=int, default=2) + parser.add_argument("-ibs", "--inference-batch-size", type=int, default=64) + parser.add_argument("-imbs", "--inference-microbatch-size", type=int, default=16) + parser.add_argument("-tbs", "--train-batch-size", type=int, default=16) + parser.add_argument("-tmbs", "--train-microbatch-size", type=int, default=2) + parser.add_argument("-b", "--backend", type=str, default="transformers") + args = parser.parse_args() + + ray.init(address="local", namespace="ray-example") + + inference_model_config = dict(path=args.model) + train_model_config = dict(path=args.model) + generate_config = dict( + top_k=50, + top_p=0.8, + ) + + if args.backend == "transformers": + inference_model_config.update( + dict( + attn_implementation="flash_attention_2", + torch_dtype=torch.bfloat16, + ) + ) + train_model_config.update( + dict( + attn_implementation="flash_attention_2", + torch_dtype=torch.bfloat16, + use_cache=False, + ) + ) + generate_config.update( + dict( + max_length=512, + do_sample=True, + max_new_tokens=None, + early_stopping=False, + ) + ) + elif args.backend == "vllm": + inference_model_config.update( + dict( + gpu_memory_utilization=0.6, + ) + ) + generate_config.update( + dict( + max_tokens=256, + ignore_eos=True, + ) + ) + else: + inference_model_config.update( + dict( + mem_fraction_static=0.6, + ) + ) + generate_config.update( + dict( + max_new_tokens=256, + ignore_eos=True, + ) + ) + + launch_distributed( + num_producers=args.num_inferencer, + num_proc_per_producer=1, + num_consumer_procs=args.num_trainers, + num_episodes=1, + inference_batch_size=args.inference_batch_size, + inference_microbatch_size=args.inference_microbatch_size, + 
train_batch_size=args.train_batch_size, + train_microbatch_size=args.train_microbatch_size, + dataset_config={"path": args.dataset, "max_length": 256}, + dataloaders_config={}, + inference_model_config=inference_model_config, + generate_config=generate_config, + train_model_config=train_model_config, + plugin_config={}, + inference_backend=args.backend, + master_addr="localhost", + master_port=29504, + ) From de282dd694ebd8c226017ff2bd68cff56a86e820 Mon Sep 17 00:00:00 2001 From: Hongxin Liu Date: Fri, 21 Feb 2025 17:28:19 +0800 Subject: [PATCH 002/100] [feature] fit RL style generation (#6213) * [feature] fit rl style generation * [doc] add docstr * [doc] add docstr --- .../coati/distributed/consumer.py | 5 + .../coati/distributed/inference_backend.py | 140 +++++++++++++----- .../ColossalChat/coati/distributed/utils.py | 40 ++++- 3 files changed, 138 insertions(+), 47 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index 61417f7e6b41..84a69979fb88 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -168,6 +168,11 @@ def setup(self): self.model, self.optimizer, *_ = self.booster.boost(self.model, self.optimizer) def step(self, step_idx: int, **kwargs) -> Optional[float]: + labels = kwargs["input_ids"].clone() + labels[kwargs["attention_mask"] == 0] = -100 + kwargs["labels"] = labels + assert kwargs.pop("action_mask").shape == kwargs.pop("action_log_probs").shape + need_update = (step_idx + 1) % self.num_microbatches == 0 ctx = nullcontext() if need_update else self.booster.no_sync(self.model, self.optimizer) diff --git a/applications/ColossalChat/coati/distributed/inference_backend.py b/applications/ColossalChat/coati/distributed/inference_backend.py index d40808ab429f..95b7d1e80308 100644 --- a/applications/ColossalChat/coati/distributed/inference_backend.py +++ b/applications/ColossalChat/coati/distributed/inference_backend.py @@ -2,10 +2,12 @@ import torch import torch.nn.functional as F -from transformers import AutoConfig, AutoModelForCausalLM, PreTrainedTokenizer +from transformers import AutoConfig, AutoModelForCausalLM, PreTrainedModel, PreTrainedTokenizer from colossalai.utils import get_current_device +from .utils import log_probs_from_logits, update_by_default + try: import sglang as sgl except ImportError: @@ -22,37 +24,73 @@ def __init__(self, model_config: Dict[str, Any], generate_config: Dict[str, Any] pass def generate(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwargs) -> Dict[str, torch.Tensor]: - pass + """Generate new tokens given input_ids and attention_mask. + + Args: + input_ids (torch.Tensor): shape [B, S] + attention_mask (torch.Tensor): shape [B, S] + + Returns: + Dict[str, torch.Tensor]: containing the + - input_ids (torch.Tensor): shape [B, S+N] + - attention_mask (torch.Tensor): shape [B, S+N] + - action_log_probs (torch.Tensor): shape [B, N] + - action_mask (torch.Tensor): shape [B, N] + where N is the number of generated tokens. And all tensors should be on CUDA. 
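+
+        Note (descriptive addition, based on the concrete backends below): when a
+        backend samples multiple responses per prompt, the returned batch dimension B
+        is the prompt batch size multiplied by the number of samples per prompt, and
+        prompt-level tensors are repeated along dim 0 to match.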
+ """ def load_state_dict(self, state_dict: Dict[str, torch.Tensor]) -> None: pass class TransformersInferenceBackend(BaseInferenceBackend): + DEFAULT_MODEL_CONFIG = dict( + trust_remote_code=True, + torch_dtype=torch.bfloat16, + ) + FORCE_MODEL_CONFIG = dict( + device_map="auto", + ) + FORCE_GENERATE_CONFIG = dict(output_logits=True, return_dict_in_generate=True) + def __init__(self, model_config: Dict[str, Any], generate_config: Dict[str, Any], tokenizer: PreTrainedTokenizer): + model_config = update_by_default(model_config, self.DEFAULT_MODEL_CONFIG) + model_config.update(self.FORCE_MODEL_CONFIG) path = model_config.pop("path") - defaut_config = dict( - trust_remote_code=True, - torch_dtype=torch.bfloat16, - device_map="auto", - ) - defaut_config.update(model_config) - self.model: AutoModelForCausalLM = AutoModelForCausalLM.from_pretrained(path, **defaut_config) - self.generate_config = generate_config + self.model: PreTrainedModel = AutoModelForCausalLM.from_pretrained(path, **model_config) + self.generate_config = generate_config.copy() + self.generate_config.update(self.FORCE_GENERATE_CONFIG) + self.tokenizer = tokenizer + @torch.no_grad() def generate(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwargs) -> Dict[str, torch.Tensor]: input_ids = input_ids.to(get_current_device()) attention_mask = attention_mask.to(get_current_device()) out = self.model.generate(input_ids, attention_mask=attention_mask, **kwargs, **self.generate_config) input_len = input_ids.shape[-1] - labels = out.clone() - labels[..., :input_len] = -100 - attention_mask = F.pad(attention_mask, (0, out.shape[-1] - input_len), value=1) - attention_mask = attention_mask.expand_as(labels) + new_token_ids = out.sequences[:, input_len:] + # get log probs + assert new_token_ids.shape[-1] == len(out.logits) + action_log_probs = [] + for i, logits in enumerate(out.logits): + action_log_probs.append(log_probs_from_logits(logits[:, None, :], new_token_ids[:, i : i + 1])) + action_log_probs = torch.cat(action_log_probs, dim=1) + # get action mask + action_mask = torch.ones_like(new_token_ids, dtype=attention_mask.dtype) + if self.tokenizer.eos_token_id is not None: + for indices in torch.nonzero(new_token_ids == self.tokenizer.eos_token_id): + action_mask[indices[0], indices[1] + 1 :] = 0 + + if attention_mask.size(0) != action_mask.size(0): + assert action_mask.size(0) % attention_mask.size(0) == 0 + attention_mask = attention_mask.repeat_interleave(action_mask.size(0) // attention_mask.size(0), dim=0) + + attention_mask = torch.cat((attention_mask, action_mask), dim=1) data = { - "input_ids": out, + "input_ids": out.sequences, "attention_mask": attention_mask, - "labels": labels, + "action_log_probs": action_log_probs, + "action_mask": action_mask, } return data @@ -75,6 +113,7 @@ def __init__(self, model_config: Dict[str, Any], generate_config: Dict[str, Any] self.tokenizer = tokenizer self.config = AutoConfig.from_pretrained(path) + @torch.no_grad() def generate(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwargs) -> Dict[str, torch.Tensor]: outputs = self.llm.generate(input_ids=input_ids.tolist(), sampling_params=self.generate_config) out_tokens = [] @@ -110,45 +149,66 @@ def load_state_dict(self, state_dict: Dict[str, torch.Tensor]) -> None: class VLLMInferenceBackend(BaseInferenceBackend): + DEFAULT_MODEL_CONFIG = dict( + trust_remote_code=True, + ) + FORCE_GENERATE_CONFIG = dict( + logprobs=0, + ) + def __init__(self, model_config: Dict[str, Any], generate_config: Dict[str, Any], 
tokenizer: PreTrainedTokenizer): if LLM is None: raise ImportError("vllm is not installed") + model_config = update_by_default(model_config, self.DEFAULT_MODEL_CONFIG) path = model_config.pop("path") - defaut_config = dict( - trust_remote_code=True, - # skip_tokenizer_init=True, - ) - defaut_config.update(model_config) - self.llm = LLM(path, **defaut_config) - self.generate_config = SamplingParams(**generate_config, stop_token_ids=[tokenizer.eos_token_id]) + self.llm = LLM(path, **model_config) + generate_config = generate_config.copy() + generate_config.update(self.FORCE_GENERATE_CONFIG) + self.generate_config = SamplingParams(**generate_config) self.tokenizer = tokenizer - self.config = AutoConfig.from_pretrained(path) + @torch.no_grad() def generate(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwargs) -> Dict[str, torch.Tensor]: outputs = self.llm.generate( prompt_token_ids=input_ids.tolist(), sampling_params=self.generate_config, use_tqdm=False ) out_tokens = [] out_len = [] + log_probs = [] for out in outputs: - out_tokens.append(list(out.outputs[0].token_ids)) - out_len.append(len(out.outputs[0].token_ids)) + for output_i in out.outputs: + out_len.append(len(output_i.token_ids)) + out_tokens.append(list(output_i.token_ids)) + assert len(output_i.logprobs) == len(output_i.token_ids) + p = [m[t].logprob for m, t in zip(output_i.logprobs, output_i.token_ids)] + log_probs.append(p) + + # pad them max_len = max(out_len) - input_len = input_ids.shape[-1] - attention_mask = F.pad(attention_mask, (0, max_len), value=1) - for i in range(len(out_tokens)): - out_tokens[i] = out_tokens[i] + [self.tokenizer.pad_token_id] * (max_len - out_len[i]) - attention_mask[i, input_len + out_len[i] :] = 0 - out = torch.tensor(out_tokens) - out = torch.cat((input_ids, out), dim=1) - labels = out.clone() - labels[..., :input_len] = -100 - for i in range(len(out_len)): - labels[i, input_len + out_len[i] :] = -100 + action_mask = torch.ones(len(out_tokens), max_len, dtype=attention_mask.dtype) + + for i, new_token_ids in enumerate(out_tokens): + pad_len = max_len - out_len[i] + out_tokens[i] = new_token_ids + [self.tokenizer.pad_token_id] * pad_len + log_probs[i] = log_probs[i] + [0.0] * pad_len + action_mask[i, out_len[i] :] = 0 + + out_tokens = torch.tensor(out_tokens) + log_probs = torch.tensor(log_probs) + if attention_mask.size(0) != action_mask.size(0): + assert action_mask.size(0) % attention_mask.size(0) == 0 + num_returns = action_mask.size(0) // attention_mask.size(0) + attention_mask = attention_mask.repeat_interleave(num_returns, dim=0) + input_ids = input_ids.repeat_interleave(num_returns, dim=0) + + out_tokens = torch.cat((input_ids, out_tokens), dim=1) + attention_mask = torch.cat((attention_mask, action_mask), dim=1) + data = { - "input_ids": out, + "input_ids": out_tokens, "attention_mask": attention_mask, - "labels": labels, + "action_log_probs": log_probs, + "action_mask": action_mask, } data = {k: v.to(get_current_device()) for k, v in data.items()} return data @@ -159,6 +219,6 @@ def load_state_dict(self, state_dict: Dict[str, torch.Tensor]) -> None: BACKEND_MAP = { "transformers": TransformersInferenceBackend, - "sglang": SGLangInferenceBackend, + # "sglang": SGLangInferenceBackend, # sglang backend will stuck the process due to unknown reason "vllm": VLLMInferenceBackend, } diff --git a/applications/ColossalChat/coati/distributed/utils.py b/applications/ColossalChat/coati/distributed/utils.py index 2f3267a1f494..533a5ffb22da 100644 --- 
a/applications/ColossalChat/coati/distributed/utils.py +++ b/applications/ColossalChat/coati/distributed/utils.py @@ -1,4 +1,4 @@ -from typing import Dict, List +from typing import Any, Dict, List import torch @@ -25,16 +25,42 @@ def bind_batch(batches: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor def pre_send(batch: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: - # compress attention_mask to save bandwidth + # compress mask to save bandwidth if "attention_mask" in batch: - attention_mask = batch["attention_mask"] - batch["attention_mask"] = attention_mask.to(torch.bool) + batch["attention_mask"] = batch["attention_mask"].to(torch.bool) + if "action_mask" in batch: + batch["action_mask"] = batch["action_mask"].to(torch.bool) return batch def post_recv(batch: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: - # decompress attention_mask + # decompress mask if "attention_mask" in batch: - attention_mask = batch["attention_mask"] - batch["attention_mask"] = attention_mask.to(torch.int) + batch["attention_mask"] = batch["attention_mask"].to(torch.int) + if "action_mask" in batch: + batch["action_mask"] = batch["action_mask"].to(torch.int) return batch + + +def update_by_default(data: Dict[str, Any], default: Dict[str, Any]) -> Dict[str, Any]: + data = data.copy() + for k, v in default.items(): + if k not in data: + data[k] = v + return data + + +def log_probs_from_logits(logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor: + """ + Compute the log probabilities from logits for the given labels. + + Args: + logits (torch.Tensor): The input logits. + labels (torch.Tensor): The target labels. + + Returns: + torch.Tensor: The log probabilities corresponding to the labels. + """ + log_probs = torch.log_softmax(logits, dim=-1) + per_label_logps = log_probs.gather(dim=-1, index=labels.unsqueeze(-1)) + return per_label_logps.squeeze(-1) From 8e6c9a4ab3f10a22ee5d1d3001ea64d0bda5191a Mon Sep 17 00:00:00 2001 From: Tong Li Date: Sun, 23 Feb 2025 11:02:54 +0800 Subject: [PATCH 003/100] add reward related function --- .../coati/distributed/reward/reward_fn.py | 51 +++++++++++++ .../coati/distributed/reward/reward_utils.py | 76 +++++++++++++++++++ .../distributed/reward/verifiable_reward.py | 47 ++++++++++++ 3 files changed, 174 insertions(+) create mode 100644 applications/ColossalChat/coati/distributed/reward/reward_fn.py create mode 100644 applications/ColossalChat/coati/distributed/reward/reward_utils.py create mode 100644 applications/ColossalChat/coati/distributed/reward/verifiable_reward.py diff --git a/applications/ColossalChat/coati/distributed/reward/reward_fn.py b/applications/ColossalChat/coati/distributed/reward/reward_fn.py new file mode 100644 index 000000000000..f127a1eced20 --- /dev/null +++ b/applications/ColossalChat/coati/distributed/reward/reward_fn.py @@ -0,0 +1,51 @@ +import torch + +from .reward_utils import extract_solution, validate_response_structure + + +def math_reward_fn(input_ids, **kwargs): + # apply varifiable reward + # reward 10 points if the final answer is correct, reward 1 point if format is correct + + gt_answer = kwargs["gt_answer"] + tokenizer = kwargs["tokenizer"] + s, e = kwargs["response_start"], kwargs["response_end"] + reward = torch.tensor(0.0).to(input_ids.device) + if gt_answer is None: + return reward + decoded_final_answer = tokenizer.decode(input_ids[s : e + 1], skip_special_tokens=True) + final_answer, processed_str = extract_solution(decoded_final_answer) + + format_valid = validate_response_structure(processed_str, 
kwargs["tags"]) + if not format_valid: + return reward + else: + reward += 1.0 + if gt_answer.strip().replace(" ", "").lower() == final_answer.strip().replace(" ", "").lower(): + reward = reward + 9.0 + return reward + + +def gsm8k_reward_fn(input_ids, **kwargs): + gt_answer = kwargs["gt_answer"] + tokenizer = kwargs["tokenizer"] + s, e = kwargs["response_start"], kwargs["response_end"] + reward = torch.tensor(0.0).to(input_ids.device) + if gt_answer is None: + return reward + decoded_final_answer = tokenizer.decode(input_ids[s:e], skip_special_tokens=True) + final_answer, processed_str = extract_solution(decoded_final_answer) + is_valid = True + try: + int(final_answer.strip()) + except Exception: + is_valid = False + + format_valid = validate_response_structure(processed_str, kwargs["tags"]) + if not is_valid or not format_valid: + return reward + else: + reward += 1.0 + if gt_answer.strip().replace(" ", "").lower() == final_answer.strip().replace(" ", "").lower(): + reward = reward + 9.0 + return reward diff --git a/applications/ColossalChat/coati/distributed/reward/reward_utils.py b/applications/ColossalChat/coati/distributed/reward/reward_utils.py new file mode 100644 index 000000000000..c1e73d4b9738 --- /dev/null +++ b/applications/ColossalChat/coati/distributed/reward/reward_utils.py @@ -0,0 +1,76 @@ +# Copyright Unakar +# Modified from https://github.com/Unakar/Logic-RL/blob/086373176ac198c97277ff50f4b6e7e1bfe669d3/verl/utils/reward_score/kk.py#L99 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +from typing import Dict, Optional, Tuple + + +def validate_response_structure(processed_str: str, tags: Dict = None) -> bool: + """Performs comprehensive validation of response structure. + + Args: + processed_str: Processed response string from the model + + Returns: + Boolean indicating whether all formatting requirements are met + """ + validation_passed = True + # Check required tags + if tags is None: + tags = { + "think_start": {"text": "", "num_occur": 1}, + "think_end": {"text": "", "num_occur": 1}, + "answer_start": {"text": "", "num_occur": 1}, + "answer_end": {"text": "", "num_occur": 1}, + } + positions = {} + for tag_name, tag_info in tags.items(): + tag_str = tag_info["text"] + expected_count = tag_info["num_occur"] + count = processed_str.count(tag_str) + positions[tag_name] = pos = processed_str.find(tag_str) + if count != expected_count: + validation_passed = False + # Verify tag order + if ( + positions["think_start"] > positions["think_end"] + or positions["think_end"] > positions["answer_start"] + or positions["answer_start"] > positions["answer_end"] + ): + validation_passed = False + if len(processed_str) - positions["answer_end"] != len(tags["answer_end"]["text"]): + validation_passed = False + return validation_passed + + +def extract_solution(solution_str: str) -> Tuple[Optional[str], str]: + """Extracts the final answer from the model's response string. 
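+
+    The final answer is taken from the last answer-tag span in the response; if no
+    such span is found, (None, solution_str) is returned.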
+ + Args: + solution_str: Raw response string from the language model + + Returns: + Tuple containing (extracted_answer, processed_string) + """ + + # Extract final answer using XML-style tags + answer_pattern = r"(.*?)" + matches = list(re.finditer(answer_pattern, solution_str, re.DOTALL)) + + if not matches: + return None, solution_str + + final_answer = matches[-1].group(1).strip() + return final_answer, solution_str diff --git a/applications/ColossalChat/coati/distributed/reward/verifiable_reward.py b/applications/ColossalChat/coati/distributed/reward/verifiable_reward.py new file mode 100644 index 000000000000..d1700d86f14e --- /dev/null +++ b/applications/ColossalChat/coati/distributed/reward/verifiable_reward.py @@ -0,0 +1,47 @@ +""" +Function-based reward verification module. +""" + +from typing import Any, Dict, List + +import torch + + +class VerifiableReward: + def __init__(self, reward_fn: List[callable], reward_args: List[Dict[str, Any]]): + self.reward_fn = reward_fn + self.reward_args = reward_args + + def __call__( + self, + input_ids: torch.LongTensor, + attention_mask: torch.LongTensor, + response_start: List[int] = None, + response_end: List[int] = None, + gt_answer: List[str] = None, + ) -> torch.Tensor: + # Get batch size + bs = input_ids.size(0) + # Initialize reward + reward = torch.zeros(bs, device=input_ids.device) + + # Loop through reward functions + for reward_fn in self.reward_fn_list: + # Apply the reward function to the entire batch at once + reward_batch = torch.stack( + [ + reward_fn( + input_ids[i], + attention_mask[i], + response_start=response_start[i], + response_end=response_end[i], + gt_answer=gt_answer[i], + **self.kwargs, + ) + for i in range(bs) + ], + dim=0, + ) + + rewards += reward_batch + return rewards From ffd3878a1ecaba925de4e64eb41e89bf0dfafd70 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Sun, 23 Feb 2025 22:54:26 +0800 Subject: [PATCH 004/100] add simple grpo --- .../ColossalChat/coati/dataset/loader.py | 11 ++ .../coati/distributed/grpo_consumer.py | 150 ++++++++++++++++++ .../coati/distributed/inference_backend.py | 2 + .../ColossalChat/coati/distributed/launch.py | 4 +- .../ColossalChat/coati/distributed/loss.py | 44 +++++ .../coati/distributed/reward/reward_fn.py | 10 +- .../distributed/reward/verifiable_reward.py | 18 +-- .../ColossalChat/coati/distributed/utils.py | 35 ++++ 8 files changed, 253 insertions(+), 21 deletions(-) create mode 100644 applications/ColossalChat/coati/distributed/grpo_consumer.py create mode 100644 applications/ColossalChat/coati/distributed/loss.py diff --git a/applications/ColossalChat/coati/dataset/loader.py b/applications/ColossalChat/coati/dataset/loader.py index fc3a4930c058..04093e7057ae 100755 --- a/applications/ColossalChat/coati/dataset/loader.py +++ b/applications/ColossalChat/coati/dataset/loader.py @@ -356,6 +356,12 @@ def apply_chat_template_and_mask( truncation: bool = True, ignore_idx: int = -100, ) -> Dict[str, torch.Tensor]: + # Format for RL. 
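+    # RL samples are dicts of the form {"messages": [...], "gt_answer": ...}:
+    # pull out the ground-truth answer and keep only the message list so the
+    # chat-template logic below can process it like an ordinary conversation.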
+ gt_answer = None + if "messages" in chat and "gt_answer" in chat: + gt_answer = chat["gt_answer"] + chat = [chat["messages"]] + tokens = [] assistant_mask = [] for i, msg in enumerate(chat): @@ -389,6 +395,11 @@ def apply_chat_template_and_mask( labels = input_ids.clone() labels[~torch.tensor(assistant_mask, dtype=torch.bool)] = ignore_idx + if gt_answer is not None: + gt_answer = tokenizer.encode(gt_answer, padding="max_length", max_length=64, return_tensors="pt") + gt_answer = gt_answer.squeeze(1) + return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels, "gt_answer": gt_answer} + return { "input_ids": input_ids, "attention_mask": attention_mask, diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py new file mode 100644 index 000000000000..79128b89e17a --- /dev/null +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -0,0 +1,150 @@ +from contextlib import nullcontext +from typing import Optional + +import ray +import torch +from coati.distributed.consumer import BaseConsumer +from coati.distributed.loss import PolicyLoss +from coati.distributed.reward.reward_fn import math_reward_fn +from coati.distributed.reward.verifiable_reward import VerifiableReward +from coati.distributed.utils import calc_action_log_probs +from transformers import AutoModelForCausalLM, AutoTokenizer + +from colossalai.nn.optimizer import HybridAdam + + +@ray.remote +class GRPOConsumer(BaseConsumer): + def __init__( + self, + num_producers, + num_episodes, + rank, + world_size, + master_addr, + master_port, + num_update_per_episode, + num_recv_per_update, + batch_size, + model_config, + plugin_config, + microbatch_size=1, + ): + super().__init__( + num_producers, + num_episodes, + rank, + world_size, + master_addr, + master_port, + num_update_per_episode, + num_recv_per_update, + batch_size, + model_config, + plugin_config, + microbatch_size, + ) + path = model_config.pop("path") + self.policy_model = AutoModelForCausalLM.from_pretrained(path, **model_config) + self.policy_model.train() + self.policy_model.gradient_checkpointing_enable() + self.optimizer = HybridAdam(self.policy_model.parameters(), lr=1e-4) + self.accum_loss = torch.zeros(1, device=self.device) + + # Reference model is initialized from policy model. + self.reference_model = AutoModelForCausalLM.from_pretrained(path, **model_config) + self.reference_model.eval() + + self.tokenizer = AutoTokenizer.from_pretrained(path) + self.pad_token_id = self.tokenizer.pad_token_id + + # Initialize verifiable reward. + response_format_tags = { + "think_start": {"text": "", "num_occur": 1}, + "think_end": {"text": "", "num_occur": 1}, + "answer_start": {"text": "", "num_occur": 1}, + "answer_end": {"text": "", "num_occur": 1}, + } + self.reward_model = VerifiableReward( + reward_fns=[math_reward_fn], tokenizer=self.tokenizer, tags=response_format_tags + ) + + self.policy_loss_fn = PolicyLoss() + + def setup(self): + super().setup() + self.policy_model, self.optimizer, *_ = self.booster.boost(self.policy_model, self.optimizer) + self.reference_model, *_ = self.booster.boost(self.reference_model) + + def step(self, step_idx: int, **kwargs) -> Optional[float]: + """ + Step data from policy model: + [{ + "input_ids": torch.Tensor, + "attention_mask": torch.Tensor, + "action_mask": torch.Tensor, + "action_log_probs": torch.Tensor, + }, + ...] + Format: + [batch_size, prompt_length + response_length] --- ............. 
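+
+        The update step (summary of the code below):
+            1. Recompute per-token log probs of the generated actions under the
+               current policy and under the frozen reference model.
+            2. Add a KL penalty w.r.t. the reference model to the verifiable reward.
+            3. Normalize rewards across the rollout batch to obtain GRPO advantages.
+            4. Optimize a PPO-style clipped surrogate loss, accumulating gradients
+               over num_microbatches before each optimizer step.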
+ """ + labels = kwargs["input_ids"].clone() + labels[kwargs["attention_mask"] == 0] = -100 + kwargs["labels"] = labels + sequences = kwargs["input_ids"] + action_mask = kwargs["action_mask"] + num_action = action_mask.shape[1] + old_action_log_probs = kwargs["action_log_probs"] + assert kwargs.pop("action_mask").shape == kwargs.pop("action_log_probs").shape + + need_update = (step_idx + 1) % self.num_microbatches == 0 + + ctx = nullcontext() if need_update else self.booster.no_sync(self.policy_model, self.optimizer) + with ctx: + policy_model_logits = self.policy_model( + input_ids=kwargs["input_ids"], + attention_mask=kwargs["attention_mask"], + )["logits"] + action_log_probs = calc_action_log_probs(policy_model_logits, sequences, num_action) + + reference_model_logits = self.reference_model( + input_ids=sequences, + attention_mask=kwargs["attention_mask"], + )["logits"] + reference_action_log_probs = calc_action_log_probs(reference_model_logits, sequences, num_action) + + # GRPO advantage calculation + kl = torch.sum(-0.01 * (action_log_probs - reference_action_log_probs) * action_mask, dim=-1) / torch.sum( + action_mask, dim=-1 + ) + + reward = self.reward_model(sequences, gt_answer=kwargs["gt_answer"]) + reward = reward + kl + mean = reward.view(-1, reward.size(0)).mean(dim=1) + std = reward.view(-1, reward.size(0)).std(dim=1) + advantages = (reward - mean) / (std + 1e-4) + # Calculate Loss + loss, skip_update, _ = self.policy_loss_fn( + action_log_probs, + old_action_log_probs, + advantages.unsqueeze(dim=-1).repeat_interleave(action_log_probs.size(-1), dim=-1), + action_mask, + ) + + loss = loss / self.num_microbatches + self.accum_loss.add_(loss.data) + if not skip_update: + self.booster.backward(loss, self.optimizer) + if need_update: + self.optimizer.step() + self.optimizer.zero_grad() + loss_scalar = self.accum_loss.item() + self.accum_loss.zero_() + return loss_scalar + + def state_dict(self): + self.policy_model._force_wait_all_gather() + model = self.policy_model.unwrap() + state_dict = model.state_dict() + return state_dict diff --git a/applications/ColossalChat/coati/distributed/inference_backend.py b/applications/ColossalChat/coati/distributed/inference_backend.py index 95b7d1e80308..210ed503635c 100644 --- a/applications/ColossalChat/coati/distributed/inference_backend.py +++ b/applications/ColossalChat/coati/distributed/inference_backend.py @@ -210,6 +210,8 @@ def generate(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwar "action_log_probs": log_probs, "action_mask": action_mask, } + if "gt_answer" in kwargs: + data["gt_answer"] = kwargs["gt_answer"] data = {k: v.to(get_current_device()) for k, v in data.items()} return data diff --git a/applications/ColossalChat/coati/distributed/launch.py b/applications/ColossalChat/coati/distributed/launch.py index 438c463002f0..5244cc7d9fed 100644 --- a/applications/ColossalChat/coati/distributed/launch.py +++ b/applications/ColossalChat/coati/distributed/launch.py @@ -2,7 +2,7 @@ import ray -from .consumer import SimpleConsumer +from .grpo_consumer import GRPOConsumer from .producer import SimpleProducer @@ -68,7 +68,7 @@ def launch_distributed( ) procs.append(producer) for i in range(num_consumer_procs): - consumer = SimpleConsumer.options(num_gpus=1).remote( + consumer = GRPOConsumer.options(num_gpus=1).remote( num_producers=num_producers, num_episodes=num_episodes, rank=i, diff --git a/applications/ColossalChat/coati/distributed/loss.py b/applications/ColossalChat/coati/distributed/loss.py new file mode 100644 
index 000000000000..c08acba511cf --- /dev/null +++ b/applications/ColossalChat/coati/distributed/loss.py @@ -0,0 +1,44 @@ +from typing import Optional + +import torch +import torch.nn as nn +from coati.distributed.utils import masked_mean + + +class PolicyLoss(nn.Module): + """ + Policy Loss for PPO + """ + + def __init__(self, clip_eps: float = 0.2, skip_threshold: float = 20.0) -> None: + super().__init__() + self.clip_eps = clip_eps + self.skip_threshold = skip_threshold + + def forward( + self, + log_probs: torch.Tensor, + old_log_probs: torch.Tensor, + advantages: torch.Tensor, + action_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + skip = False + if action_mask is None: + ratio_ = (log_probs - old_log_probs).exp() + else: + ratio_ = ((log_probs - old_log_probs) * action_mask).exp() + + # note that if dropout is disabled (recommanded), ratio will always be 1. + if ratio_.mean() > self.skip_threshold: + skip = True + + ratio = ratio_.clamp(0.0, 10.0) + surr1 = ratio * advantages + surr2 = ratio.clamp(1 - self.clip_eps, 1 + self.clip_eps) * advantages + loss = -torch.min(surr1, surr2) + if action_mask is not None: + loss = masked_mean(loss, action_mask) + else: + loss = loss.mean(dim=1) + loss = loss.mean() + return loss, skip, ratio_.max() diff --git a/applications/ColossalChat/coati/distributed/reward/reward_fn.py b/applications/ColossalChat/coati/distributed/reward/reward_fn.py index f127a1eced20..c7b452c54545 100644 --- a/applications/ColossalChat/coati/distributed/reward/reward_fn.py +++ b/applications/ColossalChat/coati/distributed/reward/reward_fn.py @@ -3,17 +3,13 @@ from .reward_utils import extract_solution, validate_response_structure -def math_reward_fn(input_ids, **kwargs): - # apply varifiable reward - # reward 10 points if the final answer is correct, reward 1 point if format is correct - - gt_answer = kwargs["gt_answer"] +def math_reward_fn(input_ids, gt_answer, **kwargs): tokenizer = kwargs["tokenizer"] - s, e = kwargs["response_start"], kwargs["response_end"] reward = torch.tensor(0.0).to(input_ids.device) if gt_answer is None: return reward - decoded_final_answer = tokenizer.decode(input_ids[s : e + 1], skip_special_tokens=True) + decoded_final_answer = tokenizer.decode(input_ids, skip_special_tokens=True) + gt_answer = tokenizer.decode(gt_answer.squeeze(0)) final_answer, processed_str = extract_solution(decoded_final_answer) format_valid = validate_response_structure(processed_str, kwargs["tags"]) diff --git a/applications/ColossalChat/coati/distributed/reward/verifiable_reward.py b/applications/ColossalChat/coati/distributed/reward/verifiable_reward.py index d1700d86f14e..fe889a7f4a46 100644 --- a/applications/ColossalChat/coati/distributed/reward/verifiable_reward.py +++ b/applications/ColossalChat/coati/distributed/reward/verifiable_reward.py @@ -8,33 +8,27 @@ class VerifiableReward: - def __init__(self, reward_fn: List[callable], reward_args: List[Dict[str, Any]]): - self.reward_fn = reward_fn - self.reward_args = reward_args + def __init__(self, reward_fns: List[callable], **kwargs: List[Dict[str, Any]]): + self.reward_fns = reward_fns + self.kwargs = kwargs def __call__( self, input_ids: torch.LongTensor, - attention_mask: torch.LongTensor, - response_start: List[int] = None, - response_end: List[int] = None, - gt_answer: List[str] = None, + gt_answer: List[torch.Tensor] = None, ) -> torch.Tensor: # Get batch size bs = input_ids.size(0) # Initialize reward - reward = torch.zeros(bs, device=input_ids.device) + rewards = torch.zeros(bs, 
device=input_ids.device) # Loop through reward functions - for reward_fn in self.reward_fn_list: + for reward_fn in self.reward_fns: # Apply the reward function to the entire batch at once reward_batch = torch.stack( [ reward_fn( input_ids[i], - attention_mask[i], - response_start=response_start[i], - response_end=response_end[i], gt_answer=gt_answer[i], **self.kwargs, ) diff --git a/applications/ColossalChat/coati/distributed/utils.py b/applications/ColossalChat/coati/distributed/utils.py index 533a5ffb22da..98b54815b5b4 100644 --- a/applications/ColossalChat/coati/distributed/utils.py +++ b/applications/ColossalChat/coati/distributed/utils.py @@ -64,3 +64,38 @@ def log_probs_from_logits(logits: torch.Tensor, labels: torch.Tensor) -> torch.T log_probs = torch.log_softmax(logits, dim=-1) per_label_logps = log_probs.gather(dim=-1, index=labels.unsqueeze(-1)) return per_label_logps.squeeze(-1) + + +def calc_action_log_probs(logits: torch.Tensor, sequences: torch.LongTensor, num_actions: int) -> torch.Tensor: + """Calculate action log probs. + + Args: + output (torch.Tensor): Output tensor of Actor.forward.logits. + sequences (torch.LongTensor): Input sequences. + num_actions (int): Number of actions. + + Returns: + torch.Tensor: Action log probs. + """ + log_probs = log_probs_from_logits(logits[:, :-1, :], sequences[:, 1:]) + return log_probs[:, -num_actions:] + + +def masked_mean(tensor: torch.Tensor, mask: torch.Tensor, dim: int = 1) -> torch.Tensor: + """ + Compute the masked mean of a tensor along a specified dimension. + + Args: + tensor (torch.Tensor): The input tensor. + mask (torch.Tensor): The mask tensor with the same shape as the input tensor. + dim (int, optional): The dimension along which to compute the mean. Default is 1. + + Returns: + torch.Tensor: The masked mean tensor. 
+ + """ + tensor = tensor * mask + tensor = tensor.sum(dim=dim) + mask_sum = mask.sum(dim=dim) + mean = tensor / (mask_sum + 1e-8) + return mean From f736d747e3b3c2601a15d240db669607e8aacba9 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Tue, 25 Feb 2025 18:12:04 +0800 Subject: [PATCH 005/100] update grpo --- .../coati/distributed/grpo_consumer.py | 70 +++++++++++++------ 1 file changed, 50 insertions(+), 20 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 79128b89e17a..d88df2360df7 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -3,11 +3,13 @@ import ray import torch +import wandb from coati.distributed.consumer import BaseConsumer from coati.distributed.loss import PolicyLoss from coati.distributed.reward.reward_fn import math_reward_fn from coati.distributed.reward.verifiable_reward import VerifiableReward from coati.distributed.utils import calc_action_log_probs +from coati.trainer.utils import all_reduce_mean, is_rank_0 from transformers import AutoModelForCausalLM, AutoTokenizer from colossalai.nn.optimizer import HybridAdam @@ -29,6 +31,8 @@ def __init__( model_config, plugin_config, microbatch_size=1, + num_generations=4, + use_wandb=False, ): super().__init__( num_producers, @@ -50,6 +54,8 @@ def __init__( self.policy_model.gradient_checkpointing_enable() self.optimizer = HybridAdam(self.policy_model.parameters(), lr=1e-4) self.accum_loss = torch.zeros(1, device=self.device) + self.accum_reward = torch.zeros(1, device=self.device) + self.accum_kl = torch.zeros(1, device=self.device) # Reference model is initialized from policy model. self.reference_model = AutoModelForCausalLM.from_pretrained(path, **model_config) @@ -57,6 +63,7 @@ def __init__( self.tokenizer = AutoTokenizer.from_pretrained(path) self.pad_token_id = self.tokenizer.pad_token_id + self.num_generations = num_generations # Initialize verifiable reward. response_format_tags = { @@ -70,6 +77,8 @@ def __init__( ) self.policy_loss_fn = PolicyLoss() + if is_rank_0(): + self.run = wandb.init(project="Colossal-GRPO-Test4") def setup(self): super().setup() @@ -87,43 +96,52 @@ def step(self, step_idx: int, **kwargs) -> Optional[float]: }, ...] Format: - [batch_size, prompt_length + response_length] --- ............. + [batch_size, num_of_generation, prompt_length + response_length] --- ............. 
""" - labels = kwargs["input_ids"].clone() - labels[kwargs["attention_mask"] == 0] = -100 - kwargs["labels"] = labels - sequences = kwargs["input_ids"] - action_mask = kwargs["action_mask"] + + # Reshape to [batch_size x num_of_generation, prompt_length + response_length] + data = {k: v.view(-1, v.size(-1)) for k, v in kwargs.items()} + action_mask = data["action_mask"] num_action = action_mask.shape[1] - old_action_log_probs = kwargs["action_log_probs"] - assert kwargs.pop("action_mask").shape == kwargs.pop("action_log_probs").shape + old_action_log_probs = data["action_log_probs"] need_update = (step_idx + 1) % self.num_microbatches == 0 ctx = nullcontext() if need_update else self.booster.no_sync(self.policy_model, self.optimizer) with ctx: policy_model_logits = self.policy_model( - input_ids=kwargs["input_ids"], - attention_mask=kwargs["attention_mask"], + input_ids=data["input_ids"], + attention_mask=data["attention_mask"], )["logits"] - action_log_probs = calc_action_log_probs(policy_model_logits, sequences, num_action) + action_log_probs = calc_action_log_probs(policy_model_logits, data["input_ids"], num_action) reference_model_logits = self.reference_model( - input_ids=sequences, - attention_mask=kwargs["attention_mask"], + input_ids=data["input_ids"], + attention_mask=data["attention_mask"], )["logits"] - reference_action_log_probs = calc_action_log_probs(reference_model_logits, sequences, num_action) + reference_action_log_probs = calc_action_log_probs(reference_model_logits, data["input_ids"], num_action) + + # GRPO advantage calculation + kl = torch.sum(-0.1 * (action_log_probs - reference_action_log_probs) * action_mask, dim=-1) / torch.sum( + action_mask, dim=-1 + ) + + reward = self.reward_model(data["input_ids"], gt_answer=data["gt_answer"]) + reward = kl + reward + # [batch_size, num_generations] + group_reward = reward.view(-1, self.num_generations) + + # [batch_size x num_generations] + reward_mean = group_reward.mean(dim=1).repeat_interleave(self.num_generations, dim=0) + reward_std = group_reward.std(dim=1).repeat_interleave(self.num_generations, dim=0) + # [batch_size x num_generations] + advantages = (group_reward.view(-1) - reward_mean) / (reward_std + 1e-4) # GRPO advantage calculation kl = torch.sum(-0.01 * (action_log_probs - reference_action_log_probs) * action_mask, dim=-1) / torch.sum( action_mask, dim=-1 ) - reward = self.reward_model(sequences, gt_answer=kwargs["gt_answer"]) - reward = reward + kl - mean = reward.view(-1, reward.size(0)).mean(dim=1) - std = reward.view(-1, reward.size(0)).std(dim=1) - advantages = (reward - mean) / (std + 1e-4) # Calculate Loss loss, skip_update, _ = self.policy_loss_fn( action_log_probs, @@ -133,14 +151,26 @@ def step(self, step_idx: int, **kwargs) -> Optional[float]: ) loss = loss / self.num_microbatches - self.accum_loss.add_(loss.data) if not skip_update: self.booster.backward(loss, self.optimizer) + loss = all_reduce_mean(loss) + reward = all_reduce_mean(reward.mean()) + kl = all_reduce_mean(kl.mean()) + self.accum_loss.add_(loss.data) + self.accum_reward.add_(reward.data) + self.accum_kl.add_(kl.data) if need_update: self.optimizer.step() self.optimizer.zero_grad() loss_scalar = self.accum_loss.item() + if is_rank_0(): + print("Loss:", self.accum_loss.item(), "Reward:", self.accum_reward.item(), "KL:", self.accum_kl.item()) + self.run.log( + {"loss": self.accum_loss.item(), "reward": self.accum_reward.item(), "kl": self.accum_kl.item()} + ) self.accum_loss.zero_() + self.accum_reward.zero_() + self.accum_kl.zero_() 
return loss_scalar def state_dict(self): From 070907dd7fe9d2dcf32e06b928a54a73b406bd83 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Fri, 28 Feb 2025 10:16:42 +0800 Subject: [PATCH 006/100] polish --- .../ColossalChat/coati/dataset/loader.py | 27 ++++++++----- .../coati/distributed/grpo_consumer.py | 40 ++++++++++++++----- .../coati/distributed/inference_backend.py | 15 ++++++- .../coati/distributed/reward/reward_fn.py | 8 ++-- .../distributed/reward/verifiable_reward.py | 2 + applications/ColossalChat/rl_example.py | 8 +++- 6 files changed, 74 insertions(+), 26 deletions(-) diff --git a/applications/ColossalChat/coati/dataset/loader.py b/applications/ColossalChat/coati/dataset/loader.py index 04093e7057ae..bfee5dce0304 100755 --- a/applications/ColossalChat/coati/dataset/loader.py +++ b/applications/ColossalChat/coati/dataset/loader.py @@ -356,6 +356,14 @@ def apply_chat_template_and_mask( truncation: bool = True, ignore_idx: int = -100, ) -> Dict[str, torch.Tensor]: + + system_prompt = "You are a helpful assistant. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . Now the user asks you to solve a math problem that involves reasoning. After thinking, when you finally reach a conclusion, clearly output the final answer without explanation within the tags, your final answer should be a integer without unit, currency mark, thousands separator or other text. i.e., 123 .\n" + + system_element = { + "role": "system", + "content": system_prompt, + } + # Format for RL. gt_answer = None if "messages" in chat and "gt_answer" in chat: @@ -365,7 +373,7 @@ def apply_chat_template_and_mask( tokens = [] assistant_mask = [] for i, msg in enumerate(chat): - msg_tokens = tokenizer.apply_chat_template([msg], tokenize=True) + msg_tokens = tokenizer.apply_chat_template([system_element, msg], tokenize=True, add_generation_prompt=True) # remove unexpected bos token if i > 0 and msg_tokens[0] == tokenizer.bos_token_id: msg_tokens = msg_tokens[1:] @@ -378,14 +386,15 @@ def apply_chat_template_and_mask( if max_length is not None: if padding and len(tokens) < max_length: to_pad = max_length - len(tokens) - if tokenizer.padding_side == "right": - tokens.extend([tokenizer.pad_token_id] * to_pad) - assistant_mask.extend([False] * to_pad) - attention_mask.extend([0] * to_pad) - else: - tokens = [tokenizer.pad_token_id] * to_pad + tokens - assistant_mask = [False] * to_pad + assistant_mask - attention_mask = [0] * to_pad + attention_mask + # Left padding for generation. 
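The switch to left padding matters because decoder-only models generate from the end of the prompt, so the last prompt token of every sample must sit directly before the generated continuation. A small, self-contained illustration of left padding (the function name and pad id of 0 are assumptions for the example, not taken from this patch):

```python
def left_pad(sequences, pad_id=0, max_length=None):
    # Pad each token list on the left so all rows share the same length.
    max_length = max_length or max(len(seq) for seq in sequences)
    padded, attention_masks = [], []
    for seq in sequences:
        pad = [pad_id] * (max_length - len(seq))
        padded.append(pad + list(seq))
        attention_masks.append([0] * len(pad) + [1] * len(seq))
    return padded, attention_masks
```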
+ # if tokenizer.padding_side == "right": + # tokens.extend([tokenizer.pad_token_id] * to_pad) + # assistant_mask.extend([False] * to_pad) + # attention_mask.extend([0] * to_pad) + # else: + tokens = [tokenizer.pad_token_id] * to_pad + tokens + assistant_mask = [False] * to_pad + assistant_mask + attention_mask = [0] * to_pad + attention_mask if truncation and len(tokens) > max_length: tokens = tokens[:max_length] assistant_mask = assistant_mask[:max_length] diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index d88df2360df7..2f230f5ed574 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -9,7 +9,7 @@ from coati.distributed.reward.reward_fn import math_reward_fn from coati.distributed.reward.verifiable_reward import VerifiableReward from coati.distributed.utils import calc_action_log_probs -from coati.trainer.utils import all_reduce_mean, is_rank_0 +from coati.trainer.utils import all_reduce_mean from transformers import AutoModelForCausalLM, AutoTokenizer from colossalai.nn.optimizer import HybridAdam @@ -77,8 +77,15 @@ def __init__( ) self.policy_loss_fn = PolicyLoss() - if is_rank_0(): - self.run = wandb.init(project="Colossal-GRPO-Test4") + self.global_step = 0 + if self.rank == 0: + self.wandb_run = wandb.init(project="Colossal-GRPO-Test6", sync_tensorboard=True) + # import os + # import time + + # log_dir = self.wandb_run.dir + # # log_dir = os.path.join(log_dir, time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime())) + # # self.writer = SummaryWriter(log_dir=log_dir) def setup(self): super().setup() @@ -115,10 +122,11 @@ def step(self, step_idx: int, **kwargs) -> Optional[float]: )["logits"] action_log_probs = calc_action_log_probs(policy_model_logits, data["input_ids"], num_action) - reference_model_logits = self.reference_model( - input_ids=data["input_ids"], - attention_mask=data["attention_mask"], - )["logits"] + with torch.no_grad(): + reference_model_logits = self.reference_model( + input_ids=data["input_ids"], + attention_mask=data["attention_mask"], + )["logits"] reference_action_log_probs = calc_action_log_probs(reference_model_logits, data["input_ids"], num_action) # GRPO advantage calculation @@ -126,7 +134,9 @@ def step(self, step_idx: int, **kwargs) -> Optional[float]: action_mask, dim=-1 ) - reward = self.reward_model(data["input_ids"], gt_answer=data["gt_answer"]) + reward = self.reward_model( + data["input_ids"], gt_answer=data["gt_answer"], response_idx=data["response_idx"] + ) reward = kl + reward # [batch_size, num_generations] group_reward = reward.view(-1, self.num_generations) @@ -163,11 +173,19 @@ def step(self, step_idx: int, **kwargs) -> Optional[float]: self.optimizer.step() self.optimizer.zero_grad() loss_scalar = self.accum_loss.item() - if is_rank_0(): + if self.rank == 0: print("Loss:", self.accum_loss.item(), "Reward:", self.accum_reward.item(), "KL:", self.accum_kl.item()) - self.run.log( - {"loss": self.accum_loss.item(), "reward": self.accum_reward.item(), "kl": self.accum_kl.item()} + self.wandb_run.log( + { + "train/loss": self.accum_loss.item(), + "train/reward": self.accum_reward.item(), + "train/kl": self.accum_kl.item(), + } ) + # self.writer.add_scalar("train/loss", self.accum_loss.item(), self.global_step) + # self.writer.add_scalar("train/reward", self.accum_reward.item(), self.global_step) + # self.writer.add_scalar("train/kl", self.accum_kl.item(), 
self.global_step) + # self.global_step += 1 self.accum_loss.zero_() self.accum_reward.zero_() self.accum_kl.zero_() diff --git a/applications/ColossalChat/coati/distributed/inference_backend.py b/applications/ColossalChat/coati/distributed/inference_backend.py index 210ed503635c..bc0ae5c36673 100644 --- a/applications/ColossalChat/coati/distributed/inference_backend.py +++ b/applications/ColossalChat/coati/distributed/inference_backend.py @@ -154,6 +154,7 @@ class VLLMInferenceBackend(BaseInferenceBackend): ) FORCE_GENERATE_CONFIG = dict( logprobs=0, + n=4, ) def __init__(self, model_config: Dict[str, Any], generate_config: Dict[str, Any], tokenizer: PreTrainedTokenizer): @@ -166,19 +167,24 @@ def __init__(self, model_config: Dict[str, Any], generate_config: Dict[str, Any] generate_config.update(self.FORCE_GENERATE_CONFIG) self.generate_config = SamplingParams(**generate_config) self.tokenizer = tokenizer + self.num_generations = self.FORCE_GENERATE_CONFIG["n"] @torch.no_grad() def generate(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwargs) -> Dict[str, torch.Tensor]: + micro_batch_size = input_ids.size(0) + response_start_idx = input_ids.size(1) outputs = self.llm.generate( prompt_token_ids=input_ids.tolist(), sampling_params=self.generate_config, use_tqdm=False ) out_tokens = [] out_len = [] log_probs = [] + response_idx = [] for out in outputs: for output_i in out.outputs: out_len.append(len(output_i.token_ids)) out_tokens.append(list(output_i.token_ids)) + response_idx.append((response_start_idx, response_start_idx + len(output_i.token_ids) - 1)) assert len(output_i.logprobs) == len(output_i.token_ids) p = [m[t].logprob for m, t in zip(output_i.logprobs, output_i.token_ids)] log_probs.append(p) @@ -195,6 +201,8 @@ def generate(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwar out_tokens = torch.tensor(out_tokens) log_probs = torch.tensor(log_probs) + response_idx = torch.tensor(response_idx) + if attention_mask.size(0) != action_mask.size(0): assert action_mask.size(0) % attention_mask.size(0) == 0 num_returns = action_mask.size(0) // attention_mask.size(0) @@ -209,9 +217,14 @@ def generate(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwar "attention_mask": attention_mask, "action_log_probs": log_probs, "action_mask": action_mask, + "response_idx": response_idx, } + + data = {k: v.view(micro_batch_size, self.num_generations, v.size(-1)) for k, v in data.items()} + if "gt_answer" in kwargs: - data["gt_answer"] = kwargs["gt_answer"] + # repeat gt_answer for each prompt. 
+ data["gt_answer"] = kwargs["gt_answer"].repeat_interleave(self.num_generations, dim=1) data = {k: v.to(get_current_device()) for k, v in data.items()} return data diff --git a/applications/ColossalChat/coati/distributed/reward/reward_fn.py b/applications/ColossalChat/coati/distributed/reward/reward_fn.py index c7b452c54545..c92f822f7373 100644 --- a/applications/ColossalChat/coati/distributed/reward/reward_fn.py +++ b/applications/ColossalChat/coati/distributed/reward/reward_fn.py @@ -3,12 +3,14 @@ from .reward_utils import extract_solution, validate_response_structure -def math_reward_fn(input_ids, gt_answer, **kwargs): +def math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): tokenizer = kwargs["tokenizer"] reward = torch.tensor(0.0).to(input_ids.device) + s, e = response_idx[0], response_idx[1] if gt_answer is None: return reward - decoded_final_answer = tokenizer.decode(input_ids, skip_special_tokens=True) + + decoded_final_answer = tokenizer.decode(input_ids[s : e + 1], skip_special_tokens=True) gt_answer = tokenizer.decode(gt_answer.squeeze(0)) final_answer, processed_str = extract_solution(decoded_final_answer) @@ -29,7 +31,7 @@ def gsm8k_reward_fn(input_ids, **kwargs): reward = torch.tensor(0.0).to(input_ids.device) if gt_answer is None: return reward - decoded_final_answer = tokenizer.decode(input_ids[s:e], skip_special_tokens=True) + decoded_final_answer = tokenizer.decode(input_ids[s : e + 1], skip_special_tokens=True) final_answer, processed_str = extract_solution(decoded_final_answer) is_valid = True try: diff --git a/applications/ColossalChat/coati/distributed/reward/verifiable_reward.py b/applications/ColossalChat/coati/distributed/reward/verifiable_reward.py index fe889a7f4a46..b43ba65c0ab7 100644 --- a/applications/ColossalChat/coati/distributed/reward/verifiable_reward.py +++ b/applications/ColossalChat/coati/distributed/reward/verifiable_reward.py @@ -16,6 +16,7 @@ def __call__( self, input_ids: torch.LongTensor, gt_answer: List[torch.Tensor] = None, + response_idx: List[torch.Tensor] = None, ) -> torch.Tensor: # Get batch size bs = input_ids.size(0) @@ -30,6 +31,7 @@ def __call__( reward_fn( input_ids[i], gt_answer=gt_answer[i], + response_idx=response_idx[i], **self.kwargs, ) for i in range(bs) diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index a6f82b3bebf9..1b5c18486b35 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -51,13 +51,17 @@ elif args.backend == "vllm": inference_model_config.update( dict( - gpu_memory_utilization=0.6, + gpu_memory_utilization=0.7, ) ) generate_config.update( dict( - max_tokens=256, + max_tokens=2048, ignore_eos=True, + include_stop_str_in_output=True, + stop=[""], + temperature=0.2, + top_p=0.95, ) ) else: From c15225bc528f341dbd7757e3279439a18e065a5d Mon Sep 17 00:00:00 2001 From: Tong Li Date: Thu, 6 Mar 2025 10:49:44 +0800 Subject: [PATCH 007/100] modify data loader --- applications/ColossalChat/coati/dataset/loader.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/applications/ColossalChat/coati/dataset/loader.py b/applications/ColossalChat/coati/dataset/loader.py index bfee5dce0304..265449326541 100755 --- a/applications/ColossalChat/coati/dataset/loader.py +++ b/applications/ColossalChat/coati/dataset/loader.py @@ -387,11 +387,6 @@ def apply_chat_template_and_mask( if padding and len(tokens) < max_length: to_pad = max_length - len(tokens) # Left padding for generation. 
- # if tokenizer.padding_side == "right": - # tokens.extend([tokenizer.pad_token_id] * to_pad) - # assistant_mask.extend([False] * to_pad) - # attention_mask.extend([0] * to_pad) - # else: tokens = [tokenizer.pad_token_id] * to_pad + tokens assistant_mask = [False] * to_pad + assistant_mask attention_mask = [0] * to_pad + attention_mask @@ -405,7 +400,9 @@ def apply_chat_template_and_mask( labels[~torch.tensor(assistant_mask, dtype=torch.bool)] = ignore_idx if gt_answer is not None: - gt_answer = tokenizer.encode(gt_answer, padding="max_length", max_length=64, return_tensors="pt") + gt_answer = tokenizer.encode( + gt_answer, padding="max_length", truncation=True, max_length=128, return_tensors="pt" + ) gt_answer = gt_answer.squeeze(1) return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels, "gt_answer": gt_answer} From b96d69055e693d8c84d62a00dae896e50d3a7e60 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Thu, 6 Mar 2025 10:51:27 +0800 Subject: [PATCH 008/100] grpo consumer --- .../coati/distributed/grpo_consumer.py | 56 +++++++++---------- 1 file changed, 26 insertions(+), 30 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 2f230f5ed574..49240d8da176 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -52,10 +52,11 @@ def __init__( self.policy_model = AutoModelForCausalLM.from_pretrained(path, **model_config) self.policy_model.train() self.policy_model.gradient_checkpointing_enable() - self.optimizer = HybridAdam(self.policy_model.parameters(), lr=1e-4) + self.optimizer = HybridAdam(self.policy_model.parameters(), lr=1e-6) self.accum_loss = torch.zeros(1, device=self.device) self.accum_reward = torch.zeros(1, device=self.device) self.accum_kl = torch.zeros(1, device=self.device) + self.accum_count = 0 # Reference model is initialized from policy model. 
self.reference_model = AutoModelForCausalLM.from_pretrained(path, **model_config) @@ -79,13 +80,7 @@ def __init__( self.policy_loss_fn = PolicyLoss() self.global_step = 0 if self.rank == 0: - self.wandb_run = wandb.init(project="Colossal-GRPO-Test6", sync_tensorboard=True) - # import os - # import time - - # log_dir = self.wandb_run.dir - # # log_dir = os.path.join(log_dir, time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime())) - # # self.writer = SummaryWriter(log_dir=log_dir) + self.wandb_run = wandb.init(project="GRPO-Test", sync_tensorboard=True) def setup(self): super().setup() @@ -129,15 +124,16 @@ def step(self, step_idx: int, **kwargs) -> Optional[float]: )["logits"] reference_action_log_probs = calc_action_log_probs(reference_model_logits, data["input_ids"], num_action) - # GRPO advantage calculation - kl = torch.sum(-0.1 * (action_log_probs - reference_action_log_probs) * action_mask, dim=-1) / torch.sum( - action_mask, dim=-1 + per_token_kl = ( + torch.exp(reference_action_log_probs - action_log_probs) + - (reference_action_log_probs - action_log_probs) + - 1 ) + kl = torch.sum(per_token_kl * action_mask, dim=-1) / torch.sum(action_mask, dim=-1) reward = self.reward_model( data["input_ids"], gt_answer=data["gt_answer"], response_idx=data["response_idx"] ) - reward = kl + reward # [batch_size, num_generations] group_reward = reward.view(-1, self.num_generations) @@ -145,50 +141,50 @@ def step(self, step_idx: int, **kwargs) -> Optional[float]: reward_mean = group_reward.mean(dim=1).repeat_interleave(self.num_generations, dim=0) reward_std = group_reward.std(dim=1).repeat_interleave(self.num_generations, dim=0) # [batch_size x num_generations] - advantages = (group_reward.view(-1) - reward_mean) / (reward_std + 1e-4) - - # GRPO advantage calculation - kl = torch.sum(-0.01 * (action_log_probs - reference_action_log_probs) * action_mask, dim=-1) / torch.sum( - action_mask, dim=-1 - ) + advantages = (reward - reward_mean) / (reward_std + 1e-4) # Calculate Loss loss, skip_update, _ = self.policy_loss_fn( action_log_probs, old_action_log_probs, advantages.unsqueeze(dim=-1).repeat_interleave(action_log_probs.size(-1), dim=-1), + per_token_kl, action_mask, ) - loss = loss / self.num_microbatches if not skip_update: self.booster.backward(loss, self.optimizer) - loss = all_reduce_mean(loss) - reward = all_reduce_mean(reward.mean()) - kl = all_reduce_mean(kl.mean()) + loss = all_reduce_mean(loss, self.plugin) + reward = all_reduce_mean(reward.mean(), self.plugin) + kl = all_reduce_mean(kl.mean(), self.plugin) self.accum_loss.add_(loss.data) self.accum_reward.add_(reward.data) self.accum_kl.add_(kl.data) + self.accum_count += 1 if need_update: self.optimizer.step() self.optimizer.zero_grad() loss_scalar = self.accum_loss.item() if self.rank == 0: - print("Loss:", self.accum_loss.item(), "Reward:", self.accum_reward.item(), "KL:", self.accum_kl.item()) + print( + "Loss:", + self.accum_loss.item() / self.accum_count, + "Reward:", + self.accum_reward.item() / self.accum_count, + "KL:", + self.accum_kl.item() / self.accum_count, + ) self.wandb_run.log( { - "train/loss": self.accum_loss.item(), - "train/reward": self.accum_reward.item(), - "train/kl": self.accum_kl.item(), + "train/loss": self.accum_loss.item() / self.accum_count, + "train/reward": self.accum_reward.item() / self.accum_count, + "train/kl": self.accum_kl.item() / self.accum_count, } ) - # self.writer.add_scalar("train/loss", self.accum_loss.item(), self.global_step) - # self.writer.add_scalar("train/reward", 
self.accum_reward.item(), self.global_step) - # self.writer.add_scalar("train/kl", self.accum_kl.item(), self.global_step) - # self.global_step += 1 self.accum_loss.zero_() self.accum_reward.zero_() self.accum_kl.zero_() + self.accum_count = 0 return loss_scalar def state_dict(self): From 678f5a9ecac480f5f9dfe19c9aa5668e53b77976 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Thu, 6 Mar 2025 10:53:03 +0800 Subject: [PATCH 009/100] update loss --- applications/ColossalChat/coati/distributed/loss.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/loss.py b/applications/ColossalChat/coati/distributed/loss.py index c08acba511cf..222540b9270d 100644 --- a/applications/ColossalChat/coati/distributed/loss.py +++ b/applications/ColossalChat/coati/distributed/loss.py @@ -10,16 +10,18 @@ class PolicyLoss(nn.Module): Policy Loss for PPO """ - def __init__(self, clip_eps: float = 0.2, skip_threshold: float = 20.0) -> None: + def __init__(self, clip_eps: float = 0.2, skip_threshold: float = 20.0, beta: float = 0.01) -> None: super().__init__() self.clip_eps = clip_eps self.skip_threshold = skip_threshold + self.beta = beta def forward( self, log_probs: torch.Tensor, old_log_probs: torch.Tensor, advantages: torch.Tensor, + per_token_kl: torch.Tensor, action_mask: Optional[torch.Tensor] = None, ) -> torch.Tensor: skip = False @@ -35,7 +37,8 @@ def forward( ratio = ratio_.clamp(0.0, 10.0) surr1 = ratio * advantages surr2 = ratio.clamp(1 - self.clip_eps, 1 + self.clip_eps) * advantages - loss = -torch.min(surr1, surr2) + loss = -torch.min(surr1, surr2) + self.beta * per_token_kl + if action_mask is not None: loss = masked_mean(loss, action_mask) else: From d03cdea949425f02b0f147e8b604f6d770a5cfeb Mon Sep 17 00:00:00 2001 From: Tong Li Date: Thu, 6 Mar 2025 10:53:48 +0800 Subject: [PATCH 010/100] update reward fn --- .../ColossalChat/coati/distributed/reward/reward_fn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/reward/reward_fn.py b/applications/ColossalChat/coati/distributed/reward/reward_fn.py index c92f822f7373..9e6d1066e1df 100644 --- a/applications/ColossalChat/coati/distributed/reward/reward_fn.py +++ b/applications/ColossalChat/coati/distributed/reward/reward_fn.py @@ -11,7 +11,7 @@ def math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): return reward decoded_final_answer = tokenizer.decode(input_ids[s : e + 1], skip_special_tokens=True) - gt_answer = tokenizer.decode(gt_answer.squeeze(0)) + gt_answer = tokenizer.decode(gt_answer.squeeze(0), skip_special_tokens=True) final_answer, processed_str = extract_solution(decoded_final_answer) format_valid = validate_response_structure(processed_str, kwargs["tags"]) @@ -20,7 +20,7 @@ def math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): else: reward += 1.0 if gt_answer.strip().replace(" ", "").lower() == final_answer.strip().replace(" ", "").lower(): - reward = reward + 9.0 + reward = reward + 2.0 return reward From 7f2ceac5c3acc75b8366062970eb124ce0a56e2c Mon Sep 17 00:00:00 2001 From: Tong Li Date: Thu, 6 Mar 2025 10:54:23 +0800 Subject: [PATCH 011/100] update example --- applications/ColossalChat/rl_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index 1b5c18486b35..57cab4164f9d 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -60,7 
+60,7 @@ ignore_eos=True, include_stop_str_in_output=True, stop=[""], - temperature=0.2, + temperature=0.7, top_p=0.95, ) ) From 812f4b775040c3de0fb40e973190a9cf68abe93c Mon Sep 17 00:00:00 2001 From: Tong Li Date: Thu, 6 Mar 2025 11:44:42 +0800 Subject: [PATCH 012/100] update loader --- applications/ColossalChat/coati/dataset/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/ColossalChat/coati/dataset/loader.py b/applications/ColossalChat/coati/dataset/loader.py index 265449326541..35b3deced717 100755 --- a/applications/ColossalChat/coati/dataset/loader.py +++ b/applications/ColossalChat/coati/dataset/loader.py @@ -357,7 +357,7 @@ def apply_chat_template_and_mask( ignore_idx: int = -100, ) -> Dict[str, torch.Tensor]: - system_prompt = "You are a helpful assistant. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . Now the user asks you to solve a math problem that involves reasoning. After thinking, when you finally reach a conclusion, clearly output the final answer without explanation within the tags, your final answer should be a integer without unit, currency mark, thousands separator or other text. i.e., 123 .\n" + system_prompt = "You are a helpful assistant. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . Now the user asks you to solve a math problem that involves reasoning.\n" system_element = { "role": "system", From 0f566cc2d49657fec09c34aa50389f951f55103b Mon Sep 17 00:00:00 2001 From: Tong Li Date: Thu, 6 Mar 2025 14:29:22 +0800 Subject: [PATCH 013/100] add algo selection --- .../ColossalChat/coati/distributed/launch.py | 15 ++++++++++++++- applications/ColossalChat/rl_example.py | 2 ++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/applications/ColossalChat/coati/distributed/launch.py b/applications/ColossalChat/coati/distributed/launch.py index 5244cc7d9fed..8581ff5865f8 100644 --- a/applications/ColossalChat/coati/distributed/launch.py +++ b/applications/ColossalChat/coati/distributed/launch.py @@ -2,9 +2,15 @@ import ray +from .consumer import SimpleConsumer from .grpo_consumer import GRPOConsumer from .producer import SimpleProducer +ALGO_MAP = { + "Simple": SimpleConsumer, + "GRPO": GRPOConsumer, +} + def get_jsonl_size_fast(path: str) -> int: with open(path) as f: @@ -40,7 +46,14 @@ def launch_distributed( inference_backend: str = "transformers", master_addr: str = "localhost", master_port: int = 29500, + core_algo: str = "GRPO", ): + + if core_algo not in ALGO_MAP: + raise NotImplementedError(f"{core_algo} is not supported yet.") + else: + core_consumer = ALGO_MAP.get(core_algo, SimpleConsumer) + train_dp_size = get_dp_size_fast(num_producers, plugin_config) assert (inference_batch_size * num_producers) % (train_batch_size * train_dp_size) == 0 @@ -68,7 +81,7 @@ def launch_distributed( ) procs.append(producer) for i in range(num_consumer_procs): - consumer = GRPOConsumer.options(num_gpus=1).remote( + consumer = core_consumer.options(num_gpus=1).remote( num_producers=num_producers, num_episodes=num_episodes, rank=i, diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index 57cab4164f9d..40231582d787 100644 --- 
a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -15,6 +15,7 @@ parser.add_argument("-tbs", "--train-batch-size", type=int, default=16) parser.add_argument("-tmbs", "--train-microbatch-size", type=int, default=2) parser.add_argument("-b", "--backend", type=str, default="transformers") + parser.add_argument("-a", "--algo", type=str, default="GPRO", choices=["Simple, GPRO"]) args = parser.parse_args() ray.init(address="local", namespace="ray-example") @@ -95,4 +96,5 @@ inference_backend=args.backend, master_addr="localhost", master_port=29504, + core_algo=args.algo ) From ab5b6d8432ea1287a21345993b1755e40c07a0c7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 6 Mar 2025 06:30:26 +0000 Subject: [PATCH 014/100] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- applications/ColossalChat/rl_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index 40231582d787..77c567913523 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -96,5 +96,5 @@ inference_backend=args.backend, master_addr="localhost", master_port=29504, - core_algo=args.algo + core_algo=args.algo, ) From 0cc0c843ede88f9e7ae88988b99237f40950acd1 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Thu, 6 Mar 2025 16:26:14 +0800 Subject: [PATCH 015/100] add save --- applications/ColossalChat/coati/dataset/loader.py | 4 +++- .../ColossalChat/coati/distributed/consumer.py | 14 +++++++++++++- .../coati/distributed/grpo_consumer.py | 4 ++-- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/applications/ColossalChat/coati/dataset/loader.py b/applications/ColossalChat/coati/dataset/loader.py index 35b3deced717..1a6b04d43924 100755 --- a/applications/ColossalChat/coati/dataset/loader.py +++ b/applications/ColossalChat/coati/dataset/loader.py @@ -357,7 +357,9 @@ def apply_chat_template_and_mask( ignore_idx: int = -100, ) -> Dict[str, torch.Tensor]: - system_prompt = "You are a helpful assistant. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . Now the user asks you to solve a math problem that involves reasoning.\n" + system_prompt = "You are a helpful assistant. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . Now the user asks you to solve a math problem that involves reasoning. 
After thinking, when you finally reach a conclusion, clearly output the final answer without explanation within the tags, i.e., 123 .\n\n" + + system_element = { "role": "system", diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index 84a69979fb88..a99198d7ed8f 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -1,6 +1,6 @@ from contextlib import nullcontext from typing import Any, Dict, Optional - +import os import ray import ray.util.collective as cc import torch @@ -33,6 +33,8 @@ def __init__( model_config: Dict[str, Any], plugin_config: Dict[str, Any], microbatch_size: int = 1, + save_interval: int = 100, + save_dir: str = "./model" ): self.num_producers = num_producers self.num_episodes = num_episodes @@ -44,6 +46,8 @@ def __init__( self.num_recv_per_update = num_recv_per_update self.batch_size = batch_size self.microbatch_size = microbatch_size + self.save_interval = save_interval + self.save_dir = save_dir assert batch_size % microbatch_size == 0, "batch_size should be divisible by microbatch_size" self.num_microbatches = batch_size // microbatch_size @@ -116,6 +120,14 @@ def loop(self) -> None: pbar.set_postfix({"loss": loss}) i += 1 assert len(self.buffer) == 0 + if (step + 1) % self.save_interval == 0: + if self.rank == 0: + print(f"Start saving policy model at step {step + 1}.") + save_path = os.path.join(self.save_dir, f"modeling-step-{step + 1}") + self.booster.save_model(self.policy_model, save_path, shard=True) + if self.rank == 0: + print(f"Saved model checkpoint at step {step + 1} in folder {save_path}") + if episode != self.num_episodes - 1 or step != self.num_update_per_episode - 1: print(f"[T{dist.get_rank()}] Sync model episode {episode} step {step}") state_dict = self.state_dict() diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 49240d8da176..ead0c86e032a 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -32,7 +32,7 @@ def __init__( plugin_config, microbatch_size=1, num_generations=4, - use_wandb=False, + use_wandb=True, ): super().__init__( num_producers, @@ -79,7 +79,7 @@ def __init__( self.policy_loss_fn = PolicyLoss() self.global_step = 0 - if self.rank == 0: + if use_wandb and self.rank == 0: self.wandb_run = wandb.init(project="GRPO-Test", sync_tensorboard=True) def setup(self): From 0590f10fb78f45e9b94c4a928379262b9bfad4e0 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Thu, 6 Mar 2025 16:27:13 +0800 Subject: [PATCH 016/100] update select algo --- applications/ColossalChat/rl_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index 40231582d787..347ffeeae61b 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -15,7 +15,7 @@ parser.add_argument("-tbs", "--train-batch-size", type=int, default=16) parser.add_argument("-tmbs", "--train-microbatch-size", type=int, default=2) parser.add_argument("-b", "--backend", type=str, default="transformers") - parser.add_argument("-a", "--algo", type=str, default="GPRO", choices=["Simple, GPRO"]) + parser.add_argument("-a", "--algo", type=str, default="GRPO", choices=["Simple, GPRO"]) args = parser.parse_args() 
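The `--algo` value added to the example script is forwarded to `launch_distributed(core_algo=...)` and resolved against `ALGO_MAP` in launch.py. A minimal illustration of that dispatch pattern (stand-in classes only; the real map binds SimpleConsumer and GRPOConsumer):

```python
class SimpleConsumerStub: ...
class GRPOConsumerStub: ...

ALGO_MAP = {"Simple": SimpleConsumerStub, "GRPO": GRPOConsumerStub}

def resolve_consumer(core_algo: str):
    # Mirrors the lookup in launch.py: unknown algorithm names fail early.
    if core_algo not in ALGO_MAP:
        raise NotImplementedError(f"{core_algo} is not supported yet.")
    return ALGO_MAP[core_algo]
```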
ray.init(address="local", namespace="ray-example") From eb6337f07f5b01c181762ebb9e5338005552bca6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 6 Mar 2025 08:29:58 +0000 Subject: [PATCH 017/100] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- applications/ColossalChat/coati/dataset/loader.py | 2 -- applications/ColossalChat/coati/distributed/consumer.py | 5 +++-- applications/ColossalChat/coati/distributed/grpo_consumer.py | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/applications/ColossalChat/coati/dataset/loader.py b/applications/ColossalChat/coati/dataset/loader.py index 1a6b04d43924..4518fd71f91b 100755 --- a/applications/ColossalChat/coati/dataset/loader.py +++ b/applications/ColossalChat/coati/dataset/loader.py @@ -359,8 +359,6 @@ def apply_chat_template_and_mask( system_prompt = "You are a helpful assistant. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . Now the user asks you to solve a math problem that involves reasoning. After thinking, when you finally reach a conclusion, clearly output the final answer without explanation within the tags, i.e., 123 .\n\n" - - system_element = { "role": "system", "content": system_prompt, diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index a99198d7ed8f..1e85cccb3c5b 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -1,6 +1,7 @@ +import os from contextlib import nullcontext from typing import Any, Dict, Optional -import os + import ray import ray.util.collective as cc import torch @@ -34,7 +35,7 @@ def __init__( plugin_config: Dict[str, Any], microbatch_size: int = 1, save_interval: int = 100, - save_dir: str = "./model" + save_dir: str = "./model", ): self.num_producers = num_producers self.num_episodes = num_episodes diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index ead0c86e032a..15f7e340ebb3 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -79,7 +79,7 @@ def __init__( self.policy_loss_fn = PolicyLoss() self.global_step = 0 - if use_wandb and self.rank == 0: + if use_wandb and self.rank == 0: self.wandb_run = wandb.init(project="GRPO-Test", sync_tensorboard=True) def setup(self): From 9d9d51614e6aa9d047dc5ef4c7f89535f1e071a8 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Mon, 10 Mar 2025 14:12:04 +0800 Subject: [PATCH 018/100] update grpo --- .../coati/distributed/grpo_consumer.py | 28 ++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 15f7e340ebb3..ae9f2c400bce 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -56,6 +56,9 @@ def __init__( self.accum_loss = torch.zeros(1, device=self.device) self.accum_reward = torch.zeros(1, device=self.device) self.accum_kl = torch.zeros(1, device=self.device) + self.accum_format_reward 
= torch.zeros(1, device=self.device) + self.accum_acc_reward = torch.zeros(1, device=self.device) + self.accum_advantages = torch.zeros(1, device=self.device) self.accum_count = 0 # Reference model is initialized from policy model. @@ -131,9 +134,14 @@ def step(self, step_idx: int, **kwargs) -> Optional[float]: ) kl = torch.sum(per_token_kl * action_mask, dim=-1) / torch.sum(action_mask, dim=-1) - reward = self.reward_model( + reward_group = self.reward_model( data["input_ids"], gt_answer=data["gt_answer"], response_idx=data["response_idx"] ) + + reward = torch.tensor([value[0] for value in reward_group]).to(data["input_ids"].device) + format_reward = torch.tensor([value[1] for value in reward_group]).to(data["input_ids"].device) + acc_reward = torch.tensor([value[2] for value in reward_group]).to(data["input_ids"].device) + # [batch_size, num_generations] group_reward = reward.view(-1, self.num_generations) @@ -157,9 +165,16 @@ def step(self, step_idx: int, **kwargs) -> Optional[float]: loss = all_reduce_mean(loss, self.plugin) reward = all_reduce_mean(reward.mean(), self.plugin) kl = all_reduce_mean(kl.mean(), self.plugin) + format_reward = all_reduce_mean(format_reward.mean(), self.plugin) + acc_reward = all_reduce_mean(acc_reward.mean(), self.plugin) + advantages = all_reduce_mean(advantages.mean(), self.plugin) + # Calculate accumulate value. self.accum_loss.add_(loss.data) self.accum_reward.add_(reward.data) self.accum_kl.add_(kl.data) + self.accum_format_reward.add_(format_reward.data) + self.accum_acc_reward.add_(acc_reward.data) + self.accum_advantages.add_(advantages.data) self.accum_count += 1 if need_update: self.optimizer.step() @@ -173,17 +188,28 @@ def step(self, step_idx: int, **kwargs) -> Optional[float]: self.accum_reward.item() / self.accum_count, "KL:", self.accum_kl.item() / self.accum_count, + "Format Reward:", + self.accum_format_reward.item() / self.accum_count, + "Acc Reward:", + self.accum_acc_reward.item() / self.accum_count, + "Advantages:", + self.accum_advantages.item() / self.accum_count, ) self.wandb_run.log( { "train/loss": self.accum_loss.item() / self.accum_count, "train/reward": self.accum_reward.item() / self.accum_count, "train/kl": self.accum_kl.item() / self.accum_count, + "train/format_reward": self.accum_format_reward.item() / self.accum_count, + "train/acc_reward": self.accum_acc_reward.item() / self.accum_count, + "train/advantages": self.accum_advantages.item() / self.accum_count, } ) self.accum_loss.zero_() self.accum_reward.zero_() self.accum_kl.zero_() + self.accum_acc_reward.zero_() + self.accum_format_reward.zero_() self.accum_count = 0 return loss_scalar From 754b16dfbf5e83d7764925dae631b9e65ae67a7a Mon Sep 17 00:00:00 2001 From: Tong Li Date: Mon, 10 Mar 2025 14:18:22 +0800 Subject: [PATCH 019/100] update reward fn --- .../coati/distributed/reward/reward_fn.py | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/reward/reward_fn.py b/applications/ColossalChat/coati/distributed/reward/reward_fn.py index 9e6d1066e1df..da19c7d22458 100644 --- a/applications/ColossalChat/coati/distributed/reward/reward_fn.py +++ b/applications/ColossalChat/coati/distributed/reward/reward_fn.py @@ -5,7 +5,9 @@ def math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): tokenizer = kwargs["tokenizer"] - reward = torch.tensor(0.0).to(input_ids.device) + reward = torch.tensor(0.0) + format_reward = torch.tensor(0.0) + acc_reward = torch.tensor(0.0) s, e = response_idx[0], 
response_idx[1] if gt_answer is None: return reward @@ -15,13 +17,21 @@ def math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): final_answer, processed_str = extract_solution(decoded_final_answer) format_valid = validate_response_structure(processed_str, kwargs["tags"]) - if not format_valid: - return reward - else: + + # Check format accuracy + if format_valid: + format_reward += 1.0 reward += 1.0 - if gt_answer.strip().replace(" ", "").lower() == final_answer.strip().replace(" ", "").lower(): - reward = reward + 2.0 - return reward + + # Check answer accuracy + if ( + final_answer is not None + and gt_answer.strip().replace(" ", "").lower() == final_answer.strip().replace(" ", "").lower() + ): + acc_reward += 5.0 + reward += 5.0 + + return torch.tensor([reward, format_reward, acc_reward]).to(input_ids.device) def gsm8k_reward_fn(input_ids, **kwargs): From 71a0181fcec3da8768918576b05796be8dddbe0b Mon Sep 17 00:00:00 2001 From: Tong Li Date: Mon, 10 Mar 2025 14:19:10 +0800 Subject: [PATCH 020/100] update reward --- .../ColossalChat/coati/distributed/reward/verifiable_reward.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/ColossalChat/coati/distributed/reward/verifiable_reward.py b/applications/ColossalChat/coati/distributed/reward/verifiable_reward.py index b43ba65c0ab7..ba83f7787586 100644 --- a/applications/ColossalChat/coati/distributed/reward/verifiable_reward.py +++ b/applications/ColossalChat/coati/distributed/reward/verifiable_reward.py @@ -21,7 +21,7 @@ def __call__( # Get batch size bs = input_ids.size(0) # Initialize reward - rewards = torch.zeros(bs, device=input_ids.device) + rewards = torch.zeros((bs, 3), device=input_ids.device) # Loop through reward functions for reward_fn in self.reward_fns: From abca66e69f2b1cc58ab98f0efaaf09164da59fe5 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Tue, 11 Mar 2025 10:17:32 +0800 Subject: [PATCH 021/100] fix reward score --- .../ColossalChat/coati/distributed/reward/reward_fn.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/reward/reward_fn.py b/applications/ColossalChat/coati/distributed/reward/reward_fn.py index da19c7d22458..53bc15e25a8c 100644 --- a/applications/ColossalChat/coati/distributed/reward/reward_fn.py +++ b/applications/ColossalChat/coati/distributed/reward/reward_fn.py @@ -4,6 +4,8 @@ def math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): + format_score = 1.0 + acc_score = 9.0 tokenizer = kwargs["tokenizer"] reward = torch.tensor(0.0) format_reward = torch.tensor(0.0) @@ -20,16 +22,16 @@ def math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): # Check format accuracy if format_valid: - format_reward += 1.0 - reward += 1.0 + format_reward += format_score + reward += format_score # Check answer accuracy if ( final_answer is not None and gt_answer.strip().replace(" ", "").lower() == final_answer.strip().replace(" ", "").lower() ): - acc_reward += 5.0 - reward += 5.0 + acc_reward += acc_score + reward += acc_score return torch.tensor([reward, format_reward, acc_reward]).to(input_ids.device) From 47d64937782b4e2b0d21b84eef07b5e258b82abb Mon Sep 17 00:00:00 2001 From: Tong Li Date: Tue, 11 Mar 2025 13:06:09 +0800 Subject: [PATCH 022/100] add response length --- .../coati/distributed/grpo_consumer.py | 28 +++++++++++++------ 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py 
b/applications/ColossalChat/coati/distributed/grpo_consumer.py index ae9f2c400bce..55dfd09ab244 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -59,6 +59,7 @@ def __init__( self.accum_format_reward = torch.zeros(1, device=self.device) self.accum_acc_reward = torch.zeros(1, device=self.device) self.accum_advantages = torch.zeros(1, device=self.device) + self.accum_response_length = torch.zeros(1, device=self.device) self.accum_count = 0 # Reference model is initialized from policy model. @@ -83,7 +84,7 @@ def __init__( self.policy_loss_fn = PolicyLoss() self.global_step = 0 if use_wandb and self.rank == 0: - self.wandb_run = wandb.init(project="GRPO-Test", sync_tensorboard=True) + self.wandb_run = wandb.init(project="GRPO-V1", sync_tensorboard=True) def setup(self): super().setup() @@ -109,6 +110,7 @@ def step(self, step_idx: int, **kwargs) -> Optional[float]: action_mask = data["action_mask"] num_action = action_mask.shape[1] old_action_log_probs = data["action_log_probs"] + response_length = torch.sum(action_mask, dim=1).to(torch.float32) need_update = (step_idx + 1) % self.num_microbatches == 0 @@ -168,6 +170,7 @@ def step(self, step_idx: int, **kwargs) -> Optional[float]: format_reward = all_reduce_mean(format_reward.mean(), self.plugin) acc_reward = all_reduce_mean(acc_reward.mean(), self.plugin) advantages = all_reduce_mean(advantages.mean(), self.plugin) + response_length = all_reduce_mean(response_length.mean(), self.plugin) # Calculate accumulate value. self.accum_loss.add_(loss.data) self.accum_reward.add_(reward.data) @@ -175,6 +178,7 @@ def step(self, step_idx: int, **kwargs) -> Optional[float]: self.accum_format_reward.add_(format_reward.data) self.accum_acc_reward.add_(acc_reward.data) self.accum_advantages.add_(advantages.data) + self.accum_response_length.add_(response_length.data) self.accum_count += 1 if need_update: self.optimizer.step() @@ -184,32 +188,38 @@ def step(self, step_idx: int, **kwargs) -> Optional[float]: print( "Loss:", self.accum_loss.item() / self.accum_count, - "Reward:", + "\nReward:", self.accum_reward.item() / self.accum_count, - "KL:", - self.accum_kl.item() / self.accum_count, - "Format Reward:", + "\nFormat Reward:", self.accum_format_reward.item() / self.accum_count, - "Acc Reward:", + "\nAcc Reward:", self.accum_acc_reward.item() / self.accum_count, - "Advantages:", + "\nKL:", + self.accum_kl.item() / self.accum_count, + "\nAdvantages:", self.accum_advantages.item() / self.accum_count, + "\nResponse Length:", + self.accum_response_length.item() / self.accum_count, ) self.wandb_run.log( { "train/loss": self.accum_loss.item() / self.accum_count, "train/reward": self.accum_reward.item() / self.accum_count, - "train/kl": self.accum_kl.item() / self.accum_count, "train/format_reward": self.accum_format_reward.item() / self.accum_count, "train/acc_reward": self.accum_acc_reward.item() / self.accum_count, + "train/kl": self.accum_kl.item() / self.accum_count, "train/advantages": self.accum_advantages.item() / self.accum_count, + "train/response_length": self.accum_response_length.item() / self.accum_count, } ) self.accum_loss.zero_() self.accum_reward.zero_() - self.accum_kl.zero_() self.accum_acc_reward.zero_() self.accum_format_reward.zero_() + self.accum_kl.zero_() + self.accum_advantages.zero_() + self.accum_response_length.zero_() + self.accum_count = 0 return loss_scalar From 704866a240fd9dfd10f8ecdd91e812026b0af09e Mon Sep 17 00:00:00 2001 From: Tong Li 
Date: Tue, 11 Mar 2025 16:17:02 +0800 Subject: [PATCH 023/100] detach --- applications/ColossalChat/coati/distributed/loss.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/loss.py b/applications/ColossalChat/coati/distributed/loss.py index 222540b9270d..af5776731a25 100644 --- a/applications/ColossalChat/coati/distributed/loss.py +++ b/applications/ColossalChat/coati/distributed/loss.py @@ -26,15 +26,10 @@ def forward( ) -> torch.Tensor: skip = False if action_mask is None: - ratio_ = (log_probs - old_log_probs).exp() + ratio = (log_probs - log_probs.detach()).exp() else: - ratio_ = ((log_probs - old_log_probs) * action_mask).exp() + ratio = ((log_probs - log_probs.detach()) * action_mask).exp() - # note that if dropout is disabled (recommanded), ratio will always be 1. - if ratio_.mean() > self.skip_threshold: - skip = True - - ratio = ratio_.clamp(0.0, 10.0) surr1 = ratio * advantages surr2 = ratio.clamp(1 - self.clip_eps, 1 + self.clip_eps) * advantages loss = -torch.min(surr1, surr2) + self.beta * per_token_kl @@ -44,4 +39,4 @@ def forward( else: loss = loss.mean(dim=1) loss = loss.mean() - return loss, skip, ratio_.max() + return loss, skip, ratio.max() From 131eeceb5d93d5f4eef44f1e04a72a5815a3df1a Mon Sep 17 00:00:00 2001 From: Tong Li Date: Thu, 13 Mar 2025 14:52:09 +0800 Subject: [PATCH 024/100] fix tp bug --- colossalai/shardformer/policies/qwen2.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/colossalai/shardformer/policies/qwen2.py b/colossalai/shardformer/policies/qwen2.py index 84d2b2fdbd99..8e150fef1f3d 100644 --- a/colossalai/shardformer/policies/qwen2.py +++ b/colossalai/shardformer/policies/qwen2.py @@ -13,6 +13,7 @@ PaddingEmbedding, RMSNorm, VocabParallelEmbedding1D, + VocabParallelLMHead1D, ) from ..modeling.qwen2 import ( @@ -446,7 +447,16 @@ def module_policy(self): suffix="lm_head", target_module=LinearWithGradAccum, kwargs=dict(fp8_communication=self.shard_config.fp8_communication, use_zbv=use_zbv), - ) + ), + SubModuleReplacementDescription( + suffix="lm_head", + target_module=VocabParallelLMHead1D, + kwargs={ + "gather_output": not self.shard_config.parallel_output, + "make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by, + "fp8_communication": self.shard_config.fp8_communication, + }, + ), ], method_replacement={"forward": get_lm_forward_with_dist_cross_entropy(self.shard_config)}, ) From afddfde2dd2855258d413b1e95c5b9170b841b94 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Thu, 13 Mar 2025 14:55:26 +0800 Subject: [PATCH 025/100] fix consumer --- applications/ColossalChat/coati/distributed/consumer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index 1e85cccb3c5b..380a2ee1b78a 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -73,6 +73,8 @@ def setup(self) -> None: ) if self.plugin_config.get("pp_size", 1) > 1 and "num_microbatches" not in self.plugin_config: plugin_config["microbatch_size"] = self.microbatch_size + if self.plugin_config.get("tp_size", 1) > 1: + plugin_config["parallel_output"] = False plugin_config.update(self.plugin_config) self.plugin = HybridParallelPlugin(**plugin_config) self.booster = Booster(plugin=self.plugin) From 4702d5784145b6164fe3dd90b35ba32fcd85b491 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Thu, 
13 Mar 2025 16:49:02 +0800 Subject: [PATCH 026/100] convert to 8 generation --- .../ColossalChat/coati/distributed/inference_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/ColossalChat/coati/distributed/inference_backend.py b/applications/ColossalChat/coati/distributed/inference_backend.py index bc0ae5c36673..8711d0b8c89e 100644 --- a/applications/ColossalChat/coati/distributed/inference_backend.py +++ b/applications/ColossalChat/coati/distributed/inference_backend.py @@ -154,7 +154,7 @@ class VLLMInferenceBackend(BaseInferenceBackend): ) FORCE_GENERATE_CONFIG = dict( logprobs=0, - n=4, + n=8, ) def __init__(self, model_config: Dict[str, Any], generate_config: Dict[str, Any], tokenizer: PreTrainedTokenizer): From 45ac6c6cb29d7d1cfd9bb31fbbc8a81a2aa8f5d6 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Thu, 13 Mar 2025 16:51:22 +0800 Subject: [PATCH 027/100] print results --- applications/ColossalChat/coati/distributed/producer.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py index 3e4a5277ad2e..a3ae22a7935c 100644 --- a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -154,7 +154,11 @@ def __init__( @torch.no_grad() def rollout(self, input_ids, attention_mask, **kwargs): - return self.model.generate(input_ids, attention_mask, **kwargs) + rollouts = self.model.generate(input_ids, attention_mask, **kwargs) + if self.producer_idx == 1: + print("Rollout example:\n", self.tokenizer.decode(rollouts["input_ids"][0][0], skip_special_tokens=True)) + + return rollouts def load_state_dict(self, state_dict): self.model.load_state_dict(state_dict) From 57b49da5e4932327a4576be3901a3cb08c81bae8 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Thu, 13 Mar 2025 16:52:15 +0800 Subject: [PATCH 028/100] setup update --- applications/ColossalChat/rl_example.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index 3d4b8a575cad..30d56c90b4ad 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -10,12 +10,12 @@ parser.add_argument("-d", "--dataset", type=str, default="data.jsonl") parser.add_argument("-t", "--num-trainers", type=int, default=2) parser.add_argument("-i", "--num-inferencer", type=int, default=2) - parser.add_argument("-ibs", "--inference-batch-size", type=int, default=64) + parser.add_argument("-ibs", "--inference-batch-size", type=int, default=32) parser.add_argument("-imbs", "--inference-microbatch-size", type=int, default=16) parser.add_argument("-tbs", "--train-batch-size", type=int, default=16) - parser.add_argument("-tmbs", "--train-microbatch-size", type=int, default=2) + parser.add_argument("-tmbs", "--train-microbatch-size", type=int, default=1) parser.add_argument("-b", "--backend", type=str, default="transformers") - parser.add_argument("-a", "--algo", type=str, default="GRPO", choices=["Simple, GPRO"]) + parser.add_argument("-a", "--algo", type=str, default="GRPO", choices=["Simple, GRPO"]) args = parser.parse_args() ray.init(address="local", namespace="ray-example") From bc0171d392fcc5d32f93f40791c8733ee1880908 Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Fri, 14 Mar 2025 18:12:35 +0800 Subject: [PATCH 029/100] fix transformers backend --- .gitignore | 1 + 
.../coati/distributed/inference_backend.py | 23 ++++++++++++++++++- applications/ColossalChat/rl_example.py | 20 ++++++++-------- 3 files changed, 34 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index 8bc74b4c8c2c..16f764c1b1ef 100644 --- a/.gitignore +++ b/.gitignore @@ -163,3 +163,4 @@ coverage.xml # log, test files - ColossalChat applications/ColossalChat/logs applications/ColossalChat/tests/logs +applications/ColossalChat/wandb diff --git a/applications/ColossalChat/coati/distributed/inference_backend.py b/applications/ColossalChat/coati/distributed/inference_backend.py index 8711d0b8c89e..58414b29fd47 100644 --- a/applications/ColossalChat/coati/distributed/inference_backend.py +++ b/applications/ColossalChat/coati/distributed/inference_backend.py @@ -61,12 +61,22 @@ def __init__(self, model_config: Dict[str, Any], generate_config: Dict[str, Any] self.generate_config = generate_config.copy() self.generate_config.update(self.FORCE_GENERATE_CONFIG) self.tokenizer = tokenizer + self.num_generations = 8 @torch.no_grad() def generate(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwargs) -> Dict[str, torch.Tensor]: + micro_batch_size = input_ids.size(0) input_ids = input_ids.to(get_current_device()) attention_mask = attention_mask.to(get_current_device()) - out = self.model.generate(input_ids, attention_mask=attention_mask, **kwargs, **self.generate_config) + gt_answer = None + if "gt_answer" in kwargs: + gt_answer = kwargs.pop("gt_answer") + if self.num_generations > 1: + input_ids = input_ids.repeat_interleave(self.num_generations, dim=0) + attention_mask = attention_mask.repeat_interleave(self.num_generations, dim=0) + out = self.model.generate( + input_ids, attention_mask=attention_mask, **kwargs, **self.generate_config, tokenizer=self.tokenizer + ) input_len = input_ids.shape[-1] new_token_ids = out.sequences[:, input_len:] # get log probs @@ -76,10 +86,13 @@ def generate(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwar action_log_probs.append(log_probs_from_logits(logits[:, None, :], new_token_ids[:, i : i + 1])) action_log_probs = torch.cat(action_log_probs, dim=1) # get action mask + response_idx = torch.zeros((new_token_ids.size(0), 2), dtype=torch.int).to(get_current_device()) action_mask = torch.ones_like(new_token_ids, dtype=attention_mask.dtype) if self.tokenizer.eos_token_id is not None: for indices in torch.nonzero(new_token_ids == self.tokenizer.eos_token_id): action_mask[indices[0], indices[1] + 1 :] = 0 + response_idx[:, 0] = input_len + response_idx[:, 1] = input_len + action_mask.sum(dim=1) - 1 if attention_mask.size(0) != action_mask.size(0): assert action_mask.size(0) % attention_mask.size(0) == 0 @@ -91,7 +104,15 @@ def generate(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwar "attention_mask": attention_mask, "action_log_probs": action_log_probs, "action_mask": action_mask, + "response_idx": response_idx, } + + data = {k: v.view(micro_batch_size, self.num_generations, v.size(-1)) for k, v in data.items()} + + if gt_answer is not None: + # repeat gt_answer for each prompt. 
+ data["gt_answer"] = gt_answer.repeat_interleave(self.num_generations, dim=1) + data = {k: v.to(get_current_device()) for k, v in data.items()} return data def load_state_dict(self, state_dict: Dict[str, torch.Tensor]) -> None: diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index 30d56c90b4ad..1de8b649d5d1 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -10,9 +10,9 @@ parser.add_argument("-d", "--dataset", type=str, default="data.jsonl") parser.add_argument("-t", "--num-trainers", type=int, default=2) parser.add_argument("-i", "--num-inferencer", type=int, default=2) - parser.add_argument("-ibs", "--inference-batch-size", type=int, default=32) - parser.add_argument("-imbs", "--inference-microbatch-size", type=int, default=16) - parser.add_argument("-tbs", "--train-batch-size", type=int, default=16) + parser.add_argument("-ibs", "--inference-batch-size", type=int, default=64) + parser.add_argument("-imbs", "--inference-microbatch-size", type=int, default=8) + parser.add_argument("-tbs", "--train-batch-size", type=int, default=32) parser.add_argument("-tmbs", "--train-microbatch-size", type=int, default=1) parser.add_argument("-b", "--backend", type=str, default="transformers") parser.add_argument("-a", "--algo", type=str, default="GRPO", choices=["Simple, GRPO"]) @@ -24,29 +24,31 @@ train_model_config = dict(path=args.model) generate_config = dict( top_k=50, - top_p=0.8, + top_p=0.9, + temperature=1.0, ) if args.backend == "transformers": inference_model_config.update( dict( - attn_implementation="flash_attention_2", + use_flash_attention_2=True, torch_dtype=torch.bfloat16, ) ) train_model_config.update( dict( - attn_implementation="flash_attention_2", + use_flash_attention_2=True, torch_dtype=torch.bfloat16, use_cache=False, ) ) generate_config.update( dict( - max_length=512, + max_length=1024 + 512, do_sample=True, max_new_tokens=None, early_stopping=False, + stop_strings=[""], ) ) elif args.backend == "vllm": @@ -82,12 +84,12 @@ num_producers=args.num_inferencer, num_proc_per_producer=1, num_consumer_procs=args.num_trainers, - num_episodes=1, + num_episodes=10, inference_batch_size=args.inference_batch_size, inference_microbatch_size=args.inference_microbatch_size, train_batch_size=args.train_batch_size, train_microbatch_size=args.train_microbatch_size, - dataset_config={"path": args.dataset, "max_length": 256}, + dataset_config={"path": args.dataset, "max_length": 300}, dataloaders_config={}, inference_model_config=inference_model_config, generate_config=generate_config, From 7795d4c50d727fea5c1a45b665fe30de63166518 Mon Sep 17 00:00:00 2001 From: duanjunwen <935724073@qq.com> Date: Tue, 18 Mar 2025 17:47:55 +0800 Subject: [PATCH 030/100] [Feature] Support Distributed LogProb for GRPO Training (#6247) * [fix] fix qwen VocabParallelLMHead1D and gather output * fix tp bug * fix consumer * [feat] Support Distributed LogProb for GRPO Training * [fix] fix loss func * [fix] fix log prob plugin * [fix] fix qwen modeling param * [fix] rm comments * [fix] rm hard-code;fix non-dist version * [fix] fix test file param name and benchmark tp gather output=True/False * [fix] rm non-dist version in dist log prob * [fix] fix comments * [fix] fix dis log prob plugin * [fix] fix test case * [fix] fix qwen VocabParallelLMHead1D and gather output * [fix] fix DistLogProb comments * [fix] restore tp size * [fix] fix comments * [fix] fix comment; fix LogSoftmax usage --------- Co-authored-by: Tong Li --- 
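The commit above adds a distributed per-token log-probability so that GRPO training can keep logits sharded along the vocabulary dimension under tensor parallelism. As a minimal single-device sketch of the quantity being distributed (illustrative names only, not the patch's API):

```python
import torch
import torch.nn.functional as F

def per_token_log_probs(logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
    # logits: [B, S, vocab], labels: [B, S]; returns log P(label_t | context) per position.
    # The distributed variant introduced in this patch computes the same values when
    # the vocab dimension is split across tensor-parallel ranks, by all-reducing the
    # per-row max and the sum of exponentials over the process group before selecting
    # the entries at the label indices.
    log_probs = F.log_softmax(logits.float(), dim=-1)
    return log_probs.gather(dim=-1, index=labels.unsqueeze(-1)).squeeze(-1)
```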
.../coati/distributed/consumer.py | 2 - .../coati/distributed/grpo_consumer.py | 8 +- .../ColossalChat/coati/distributed/utils.py | 20 ++- colossalai/shardformer/layer/__init__.py | 4 +- colossalai/shardformer/layer/loss.py | 150 +++++++++++++++++- colossalai/shardformer/modeling/qwen2.py | 1 - colossalai/shardformer/policies/qwen2.py | 8 +- .../test_layer/test_dist_log_prob.py | 52 ++++++ 8 files changed, 233 insertions(+), 12 deletions(-) create mode 100644 tests/test_shardformer/test_layer/test_dist_log_prob.py diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index 380a2ee1b78a..1e85cccb3c5b 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -73,8 +73,6 @@ def setup(self) -> None: ) if self.plugin_config.get("pp_size", 1) > 1 and "num_microbatches" not in self.plugin_config: plugin_config["microbatch_size"] = self.microbatch_size - if self.plugin_config.get("tp_size", 1) > 1: - plugin_config["parallel_output"] = False plugin_config.update(self.plugin_config) self.plugin = HybridParallelPlugin(**plugin_config) self.booster = Booster(plugin=self.plugin) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 55dfd09ab244..b1edb89bb0e5 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -120,14 +120,18 @@ def step(self, step_idx: int, **kwargs) -> Optional[float]: input_ids=data["input_ids"], attention_mask=data["attention_mask"], )["logits"] - action_log_probs = calc_action_log_probs(policy_model_logits, data["input_ids"], num_action) + action_log_probs = calc_action_log_probs( + policy_model_logits, data["input_ids"], num_action, self.plugin.shard_config + ) with torch.no_grad(): reference_model_logits = self.reference_model( input_ids=data["input_ids"], attention_mask=data["attention_mask"], )["logits"] - reference_action_log_probs = calc_action_log_probs(reference_model_logits, data["input_ids"], num_action) + reference_action_log_probs = calc_action_log_probs( + reference_model_logits, data["input_ids"], num_action, self.plugin.shard_config + ) per_token_kl = ( torch.exp(reference_action_log_probs - action_log_probs) diff --git a/applications/ColossalChat/coati/distributed/utils.py b/applications/ColossalChat/coati/distributed/utils.py index 98b54815b5b4..919e4434faa6 100644 --- a/applications/ColossalChat/coati/distributed/utils.py +++ b/applications/ColossalChat/coati/distributed/utils.py @@ -2,6 +2,8 @@ import torch +from colossalai.shardformer.layer.loss import dist_log_prob + def unbind_batch(batch: Dict[str, torch.Tensor]) -> List[Dict[str, torch.Tensor]]: batches = [] @@ -66,18 +68,30 @@ def log_probs_from_logits(logits: torch.Tensor, labels: torch.Tensor) -> torch.T return per_label_logps.squeeze(-1) -def calc_action_log_probs(logits: torch.Tensor, sequences: torch.LongTensor, num_actions: int) -> torch.Tensor: +def calc_action_log_probs( + logits: torch.Tensor, + sequences: torch.LongTensor, + num_actions: int, + shard_config, + vocab_size: int = None, +) -> torch.Tensor: """Calculate action log probs. Args: - output (torch.Tensor): Output tensor of Actor.forward.logits. + logits (torch.Tensor): Output tensor of Actor.forward.logits. sequences (torch.LongTensor): Input sequences. num_actions (int): Number of actions. 
+ shard_config + vocab_size + Returns: torch.Tensor: Action log probs. """ - log_probs = log_probs_from_logits(logits[:, :-1, :], sequences[:, 1:]) + # labels: torch.Tensor, # [B, S] or [B, S, Vocab_size] + # logits: torch.Tensor, # [B, S, Vocab_size] + log_probs = dist_log_prob(sequences, logits, shard_config, vocab_size, logits.dtype) + log_probs = log_probs.squeeze(-1) return log_probs[:, -num_actions:] diff --git a/colossalai/shardformer/layer/__init__.py b/colossalai/shardformer/layer/__init__.py index 0bd1b60923e9..a1b80bf56b63 100644 --- a/colossalai/shardformer/layer/__init__.py +++ b/colossalai/shardformer/layer/__init__.py @@ -3,7 +3,7 @@ from .dropout import DropoutForParallelInput, DropoutForReplicatedInput from .embedding import Embedding1D, PaddingEmbedding, VocabParallelEmbedding1D from .linear import Linear1D_Col, Linear1D_Row, LinearWithGradAccum, PaddingLMHead, VocabParallelLMHead1D -from .loss import cross_entropy_1d, dist_cross_entropy +from .loss import cross_entropy_1d, dist_cross_entropy, dist_log_prob, dist_log_prob_1d from .normalization import FusedLayerNorm, FusedRMSNorm, LayerNorm, RMSNorm from .parallel_module import ParallelModule from .qkv_fused_linear import ( @@ -28,6 +28,8 @@ "DropoutForReplicatedInput", "cross_entropy_1d", "dist_cross_entropy", + "dist_log_prob_1d", + "dist_log_prob", "BaseLayerNorm", "LayerNorm", "RMSNorm", diff --git a/colossalai/shardformer/layer/loss.py b/colossalai/shardformer/layer/loss.py index 0e2241af9fc9..51419a38a0ed 100644 --- a/colossalai/shardformer/layer/loss.py +++ b/colossalai/shardformer/layer/loss.py @@ -3,13 +3,21 @@ from torch.autograd import Function from torch.distributed import ProcessGroup from torch.nn import CrossEntropyLoss +from torch.nn.functional import log_softmax from colossalai.shardformer.layer._operation import reduce_forward from colossalai.shardformer.shard import ShardConfig from .utils import is_share_sp_tp -__all__ = ["DistCrossEntropy", "cross_entropy_1d", "dist_cross_entropy"] +__all__ = [ + "DistCrossEntropy", + "cross_entropy_1d", + "dist_cross_entropy", + "DistLogProb", + "dist_log_prob_1d", + "dist_log_prob", +] _IGNORE_IDX = -100 @@ -137,6 +145,98 @@ def backward(ctx, grad_output): return grad_logits, None, None, None, None, None, None +class DistLogProb(Function): + r""" + Overwrite the forward and backward function to calculate the log prob before gather + + Args: + Function (:class:`torch.autograd.Function`): default + """ + + @staticmethod + def forward( + ctx, + vocab_logits: torch.Tensor, + target: torch.Tensor, + process_group: ProcessGroup, + vocab_size: int, + dtype=torch.float32, + ): + + ################## + # Step1:Find the global maximum value of logits + ################## + logits_max = torch.max(vocab_logits, dim=-1)[0] + handle = dist.all_reduce(logits_max, op=dist.ReduceOp.MAX, group=process_group, async_op=True) + + ################## + # Step2:Find the local mask. local mask will be use to select log_probs value in Step 4. 
+ # For accleration, we overlap Step 2 and Step 3 + ################## + rank = dist.get_rank(group=process_group) + world_size = dist.get_world_size(group=process_group) + if vocab_size is None: + partition_vocab_size = vocab_logits.size()[-1] + global_vocab_size = partition_vocab_size * world_size + else: + global_vocab_size = vocab_size + partition_vocab_size = global_vocab_size // world_size + # down and up threshold for local logits + delta = (global_vocab_size + world_size - 1) // world_size + down_threshold = rank * delta + up_threshold = down_threshold + delta + if up_threshold > global_vocab_size: + up_threshold = global_vocab_size + # mask + mask = (target < down_threshold) | (target >= up_threshold) + masked_target = target.clone() - down_threshold + masked_target[mask] = 0 + masked_target_1d = masked_target.view(-1).contiguous() + handle.wait() + + ################## + # Step3:Calculate global summation exp logits + ################## + vocab_logits = vocab_logits - logits_max.unsqueeze(dim=-1) + exp_logits = torch.exp(vocab_logits) + sum_exp_logits = torch.sum(exp_logits, dim=-1, dtype=torch.float32) # local summation exp logits + dist.all_reduce(sum_exp_logits, op=dist.ReduceOp.SUM, group=process_group) + + ################## + # Step4:Calculate local prob. We first cal log_softmax, then select log probs via local mask + ################## + log_probs = vocab_logits - torch.log(sum_exp_logits.unsqueeze(dim=-1)) # cal log_softmax + log_probs = log_probs.gather(dim=-1, index=masked_target.unsqueeze(-1)) + log_probs[mask.unsqueeze(-1)] = 0 # set masked val to zero + dist.all_reduce(log_probs, op=dist.ReduceOp.SUM, group=process_group) + + ctx.save_for_backward(exp_logits, mask, masked_target_1d, sum_exp_logits) + ctx.dtype = dtype + return log_probs + + @staticmethod + def backward(ctx, grad_output): + exp_logits, mask, masked_target_1d, sum_exp_logits = ctx.saved_tensors + ################## + # Step1:Find the global sofmax value + ################## + softmax_logits = exp_logits / sum_exp_logits.unsqueeze(dim=-1) + + ################## + # Step2:Update softmax value based on local target index + ################## + partion_vocab_size = softmax_logits.shape[-1] + softmax_logits_2d = softmax_logits.view(-1, partion_vocab_size) + update = 1.0 - mask.view(-1).float().to(ctx.dtype) + softmax_logits_2d[torch.arange(0, softmax_logits_2d.shape[0]), masked_target_1d] -= update + + ################## + # Step3:Calculate grad_output, which is the gradient of the loss function with respect to the output of logsoftmax + ################## + grad_logits = -softmax_logits.mul_(grad_output) + return grad_logits, None, None, None, None, None, None + + def cross_entropy_1d( vocab_logits: torch.Tensor, labels: torch.Tensor, @@ -149,6 +249,16 @@ def cross_entropy_1d( return DistCrossEntropy.apply(vocab_logits, labels, ignore_index, process_group, vocab_size, dtype, mode) +def dist_log_prob_1d( + vocab_logits: torch.Tensor, + labels: torch.Tensor, + process_group: ProcessGroup = None, + vocab_size: int = None, + dtype: torch.dtype = None, +) -> torch.Tensor: + return DistLogProb.apply(vocab_logits, labels, process_group, vocab_size, dtype) + + def dist_cross_entropy( labels: torch.Tensor, # [B, S] or [B, S, Vocab_size] logits: torch.Tensor, # [B, S, Vocab_size] @@ -243,3 +353,41 @@ def dist_cross_entropy( loss, num_nonzero = loss[0], loss[1].detach() loss = (loss / num_nonzero).squeeze() return loss + + +def dist_log_prob( + labels: torch.Tensor, # [B, S] or [B, S, Vocab_size] + logits: 
torch.Tensor, # [B, S, Vocab_size] + shard_config: ShardConfig, + vocab_size: int, + dtype: torch.dtype, + seq_dim: int = 1, +) -> torch.Tensor: + """ + Helper to compute log prob for most shardformer models supporting PP, TP. + """ + # Split labels if not gather output + parallel_output = shard_config.parallel_output + is_tp = shard_config.enable_tensor_parallelism + + # TODO:support sp + labels = labels[..., 1:] + logits = logits[..., :-1, :] + labels = labels.contiguous() + logits = logits.contiguous() + assert labels.shape == logits.shape[:-1], f"label shape {labels.shape} does not match logit shape {logits.shape}" + + # Flatten the tokens + if is_tp and parallel_output: + log_prob = dist_log_prob_1d( + logits, + labels, + process_group=shard_config.tensor_parallel_process_group, + vocab_size=vocab_size, + dtype=dtype, + ) + else: + log_prob = log_softmax(logits) + log_prob = log_prob.gather(dim=-1, index=labels.unsqueeze(-1)) + + return log_prob diff --git a/colossalai/shardformer/modeling/qwen2.py b/colossalai/shardformer/modeling/qwen2.py index 569fc4a459c5..71e3557fe214 100644 --- a/colossalai/shardformer/modeling/qwen2.py +++ b/colossalai/shardformer/modeling/qwen2.py @@ -832,7 +832,6 @@ def forward( loss = None if labels is not None: loss = dist_cross_entropy(labels, logits, shard_config, self.lm_head.out_features, logits.dtype) - if not return_dict: output = (logits,) + outputs[1:] return (loss,) + output if loss is not None else output diff --git a/colossalai/shardformer/policies/qwen2.py b/colossalai/shardformer/policies/qwen2.py index 8e150fef1f3d..0adcdfdbd553 100644 --- a/colossalai/shardformer/policies/qwen2.py +++ b/colossalai/shardformer/policies/qwen2.py @@ -430,8 +430,12 @@ def module_policy(self): sub_module_replacement=[ SubModuleReplacementDescription( suffix="lm_head", - target_module=Linear1D_Col, - kwargs=dict(fp8_communication=self.shard_config.fp8_communication, use_zbv=use_zbv), + target_module=VocabParallelLMHead1D, + kwargs=dict( + gather_output=not self.shard_config.parallel_output, + fp8_communication=self.shard_config.fp8_communication, + use_zbv=use_zbv, + ), ) ], method_replacement={"forward": get_lm_forward_with_dist_cross_entropy(self.shard_config)}, diff --git a/tests/test_shardformer/test_layer/test_dist_log_prob.py b/tests/test_shardformer/test_layer/test_dist_log_prob.py new file mode 100644 index 000000000000..05a6a5d4766f --- /dev/null +++ b/tests/test_shardformer/test_layer/test_dist_log_prob.py @@ -0,0 +1,52 @@ +import pytest +import torch +from coati.distributed.utils import log_probs_from_logits + +import colossalai +from colossalai.logging import disable_existing_loggers +from colossalai.shardformer.layer import dist_log_prob_1d +from colossalai.testing import rerun_if_address_is_in_use, spawn + +CONFIG = dict( + parallel=dict(data=1, pipeline=1, tensor=dict(size=2, mode="1d")), +) + + +def check_dist_log_prob(rank, world_size, port): + disable_existing_loggers() + colossalai.launch(rank=rank, world_size=world_size, port=port, host="localhost", backend="nccl") + + # prepare data + pred = torch.randn(2, 4, 8, requires_grad=True).cuda() + labels = torch.randint(8, (2, 4)).cuda() + + logprob = log_probs_from_logits(pred, labels) + + pred.retain_grad() + logprob.mean().backward() + + dist_pred = pred.clone().chunk(world_size, -1)[rank].detach() + dist_pred.requires_grad = True + dist_logprob = dist_log_prob_1d(dist_pred, labels) + + dist_pred.retain_grad() + dist_logprob.squeeze(-1).mean().backward() + + assert torch.allclose( + logprob, 
dist_logprob.squeeze(-1), atol=1e-5 + ), f"dist cross entropy logprob is not equal to orgin logprob\n{logprob}\n{dist_logprob.squeeze(-1)}" + + pred_grad_partial = pred.grad.clone().chunk(world_size, -1)[rank].detach() + assert torch.allclose( + pred_grad_partial, dist_pred.grad + ), f"dist grad is not equal to orgin grad\n{pred.grad}\n{dist_pred.grad}" + + +@pytest.mark.dist +@rerun_if_address_is_in_use() +def test_dist_log_prob(): + spawn(check_dist_log_prob, 2) + + +if __name__ == "__main__": + test_dist_log_prob() From 7ee4452f8c5851a2dea715a920f6b6972c2ff9ee Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Wed, 19 Mar 2025 17:07:20 +0800 Subject: [PATCH 031/100] fix vllm --- .../coati/distributed/grpo_consumer.py | 148 +++++++++++++++++- .../coati/distributed/inference_backend.py | 11 +- .../ColossalChat/coati/distributed/launch.py | 16 +- .../ColossalChat/coati/distributed/loss.py | 3 + applications/ColossalChat/rl_example.py | 18 +-- 5 files changed, 172 insertions(+), 24 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index b1edb89bb0e5..785fa820ec8b 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -1,8 +1,11 @@ +import json +import os from contextlib import nullcontext from typing import Optional import ray import torch +import torch.distributed as dist import wandb from coati.distributed.consumer import BaseConsumer from coati.distributed.loss import PolicyLoss @@ -33,6 +36,8 @@ def __init__( microbatch_size=1, num_generations=4, use_wandb=True, + generator_config=None, + filter_range=None, ): super().__init__( num_producers, @@ -69,6 +74,9 @@ def __init__( self.tokenizer = AutoTokenizer.from_pretrained(path) self.pad_token_id = self.tokenizer.pad_token_id self.num_generations = num_generations + self.filter_range = filter_range + if self.filter_range is not None: + assert len(self.filter_range) == 2, "Filter range should have 2 values." # Initialize verifiable reward. 
response_format_tags = { @@ -84,7 +92,11 @@ def __init__( self.policy_loss_fn = PolicyLoss() self.global_step = 0 if use_wandb and self.rank == 0: - self.wandb_run = wandb.init(project="GRPO-V1", sync_tensorboard=True) + if "repetition_penalty" in generator_config: + name = f"{generator_config['backend']}_bs_{self.batch_size*self.world_size}_temp_{generator_config['temperature']:.01f}_rep_penalty_{generator_config['repetition_penalty']:.01f}" + else: + name = f"{generator_config['backend']}_bs_{self.batch_size*self.world_size}_temp_{generator_config['temperature']:.01f}" + self.wandb_run = wandb.init(project="GRPO-V1", sync_tensorboard=True, dir="./wandb", name=name) def setup(self): super().setup() @@ -121,7 +133,7 @@ def step(self, step_idx: int, **kwargs) -> Optional[float]: attention_mask=data["attention_mask"], )["logits"] action_log_probs = calc_action_log_probs( - policy_model_logits, data["input_ids"], num_action, self.plugin.shard_config + policy_model_logits / generator_config["temperature"], data["input_ids"], num_action, self.plugin.shard_config ) with torch.no_grad(): @@ -130,7 +142,7 @@ def step(self, step_idx: int, **kwargs) -> Optional[float]: attention_mask=data["attention_mask"], )["logits"] reference_action_log_probs = calc_action_log_probs( - reference_model_logits, data["input_ids"], num_action, self.plugin.shard_config + reference_model_logits / generator_config["temperature"], data["input_ids"], num_action, self.plugin.shard_config ) per_token_kl = ( @@ -149,7 +161,14 @@ def step(self, step_idx: int, **kwargs) -> Optional[float]: acc_reward = torch.tensor([value[2] for value in reward_group]).to(data["input_ids"].device) # [batch_size, num_generations] + # filter out the reward that is too high (all sample gets full score) or too low (all sample gets 0 score), + loss_mask = ( + None + if self.filter_range is None + else torch.logical_and(reward > self.filter_range[0], reward < self.filter_range[1]) + ) group_reward = reward.view(-1, self.num_generations) + reward_mean = group_reward.mean(dim=1) # [batch_size x num_generations] reward_mean = group_reward.mean(dim=1).repeat_interleave(self.num_generations, dim=0) @@ -164,6 +183,7 @@ def step(self, step_idx: int, **kwargs) -> Optional[float]: advantages.unsqueeze(dim=-1).repeat_interleave(action_log_probs.size(-1), dim=-1), per_token_kl, action_mask, + loss_mask=loss_mask, ) if not skip_update: @@ -232,3 +252,125 @@ def state_dict(self): model = self.policy_model.unwrap() state_dict = model.state_dict() return state_dict + + +@ray.remote +class GRPOEvalConsumer(BaseConsumer): + def __init__( + self, + num_producers, + num_episodes, + rank, + world_size, + master_addr, + master_port, + num_update_per_episode, + num_recv_per_update, + batch_size, + model_config, + plugin_config, + microbatch_size=1, + num_generations=4, + use_wandb=True, + log_dir="./results", + ): + super().__init__( + num_producers, + num_episodes, + rank, + world_size, + master_addr, + master_port, + num_update_per_episode, + num_recv_per_update, + batch_size, + model_config, + plugin_config, + microbatch_size, + ) + path = model_config.pop("path") + self.policy_model = AutoModelForCausalLM.from_pretrained(path, **model_config) + self.policy_model.train() + self.accum_reward = torch.zeros(1, device=self.device) + self.accum_format_reward = torch.zeros(1, device=self.device) + self.accum_acc_reward = torch.zeros(1, device=self.device) + self.accum_response_length = torch.zeros(1, device=self.device) + self.accum_count = torch.zeros(1, 
device=self.device) + + self.tokenizer = AutoTokenizer.from_pretrained(path) + self.pad_token_id = self.tokenizer.pad_token_id + self.num_generations = num_generations + + # Initialize verifiable reward. + response_format_tags = { + "think_start": {"text": "", "num_occur": 1}, + "think_end": {"text": "", "num_occur": 1}, + "answer_start": {"text": "", "num_occur": 1}, + "answer_end": {"text": "", "num_occur": 1}, + } + self.reward_model = VerifiableReward( + reward_fns=[math_reward_fn], tokenizer=self.tokenizer, tags=response_format_tags + ) + + self.log_dir = log_dir + if not os.path.exists(self.log_dir): + os.makedirs(self.log_dir) + else: + os.system(f"rm -rf {self.log_dir}/*") + + def setup(self): + super().setup() + self.policy_model, _, *_ = self.booster.boost(self.policy_model) + + def step(self, step_idx: int, **kwargs) -> Optional[float]: + rank = dist.get_rank() + data = {k: v.view(-1, v.size(-1)).cpu() for k, v in kwargs.items()} + kwargs["input_ids"].size(0) + reward_group = self.reward_model( + data["input_ids"], gt_answer=data["gt_answer"], response_idx=data["response_idx"] + ) + reward = [value[0].item() for value in reward_group] + format_reward = [value[1].item() for value in reward_group] + acc_reward = [value[2].item() for value in reward_group] + response_length = [(data["response_idx"][i][1] - data["response_idx"][i][0]).item() for i in range(len(reward))] + + response = self.tokenizer.batch_decode(data["input_ids"], skip_special_tokens=True) + with open(f"{self.log_dir}/eval_results_rank_{rank}.jsonl", "a", encoding="utf8") as f: + for i in range(len(response)): + f.write( + json.dumps( + { + "response": response[i], + "reward": reward[i], + "format_reward": format_reward[i], + "acc_reward": acc_reward[i], + "response_length": response_length[i], + }, + ensure_ascii=False, + ) + + "\n" + ) + + self.accum_reward += sum(reward) + self.accum_format_reward += sum(format_reward) + self.accum_acc_reward += sum(acc_reward) + self.accum_response_length += sum(response_length) + self.accum_count += len(reward) + + # print results + total_count = all_reduce_mean(self.accum_count, self.plugin) + mean_reward = all_reduce_mean(self.accum_reward, self.plugin) / total_count + mean_format_reward = all_reduce_mean(self.accum_format_reward, self.plugin) / total_count + mean_acc_reward = all_reduce_mean(self.accum_acc_reward, self.plugin) / total_count + mean_response_length = all_reduce_mean(self.accum_response_length, self.plugin) / total_count + if rank == 0: + print( + f"Step {step_idx}: Mean Reward: {mean_reward}, Mean Format Reward: {mean_format_reward}, Mean Acc Reward: {mean_acc_reward}, Mean Response Length: {mean_response_length}" + ) + return None + + def state_dict(self): + self.policy_model._force_wait_all_gather() + model = self.policy_model.unwrap() + state_dict = model.state_dict() + return state_dict diff --git a/applications/ColossalChat/coati/distributed/inference_backend.py b/applications/ColossalChat/coati/distributed/inference_backend.py index 58414b29fd47..b4cfffa4dc81 100644 --- a/applications/ColossalChat/coati/distributed/inference_backend.py +++ b/applications/ColossalChat/coati/distributed/inference_backend.py @@ -183,7 +183,7 @@ def __init__(self, model_config: Dict[str, Any], generate_config: Dict[str, Any] raise ImportError("vllm is not installed") model_config = update_by_default(model_config, self.DEFAULT_MODEL_CONFIG) path = model_config.pop("path") - self.llm = LLM(path, **model_config) + self.llm = LLM(model=path, **model_config) generate_config = 
generate_config.copy() generate_config.update(self.FORCE_GENERATE_CONFIG) self.generate_config = SamplingParams(**generate_config) @@ -194,8 +194,15 @@ def __init__(self, model_config: Dict[str, Any], generate_config: Dict[str, Any] def generate(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwargs) -> Dict[str, torch.Tensor]: micro_batch_size = input_ids.size(0) response_start_idx = input_ids.size(1) + micro_batch_input_ids = input_ids.tolist() + micro_batch_input_ids_no_padding = [] + for i in range(micro_batch_size): + for j in range(input_ids.size(1)): + if micro_batch_input_ids[i][j] != self.tokenizer.pad_token_id: + micro_batch_input_ids_no_padding.append(micro_batch_input_ids[i][j:]) + break outputs = self.llm.generate( - prompt_token_ids=input_ids.tolist(), sampling_params=self.generate_config, use_tqdm=False + prompt_token_ids=micro_batch_input_ids_no_padding, sampling_params=self.generate_config, use_tqdm=False ) out_tokens = [] out_len = [] diff --git a/applications/ColossalChat/coati/distributed/launch.py b/applications/ColossalChat/coati/distributed/launch.py index 8581ff5865f8..512e7261fd6c 100644 --- a/applications/ColossalChat/coati/distributed/launch.py +++ b/applications/ColossalChat/coati/distributed/launch.py @@ -1,15 +1,13 @@ +import copy from typing import Any, Dict, Optional import ray from .consumer import SimpleConsumer -from .grpo_consumer import GRPOConsumer +from .grpo_consumer import GRPOConsumer, GRPOEvalConsumer from .producer import SimpleProducer -ALGO_MAP = { - "Simple": SimpleConsumer, - "GRPO": GRPOConsumer, -} +ALGO_MAP = {"Simple": SimpleConsumer, "GRPO": GRPOConsumer, "EvalGRPO": GRPOEvalConsumer} def get_jsonl_size_fast(path: str) -> int: @@ -80,6 +78,12 @@ def launch_distributed( backend=inference_backend, ) procs.append(producer) + generate_config_consumer = copy.deepcopy(generate_config) + generate_config_consumer.update( + dict( + backend=inference_backend, + ) + ) for i in range(num_consumer_procs): consumer = core_consumer.options(num_gpus=1).remote( num_producers=num_producers, @@ -94,6 +98,8 @@ def launch_distributed( model_config=train_model_config, plugin_config=plugin_config, microbatch_size=train_microbatch_size, + generate_config=generate_config_consumer, + filter_range=[0.05, 9.0], ) procs.append(consumer) ray.get([p.setup.remote() for p in procs]) diff --git a/applications/ColossalChat/coati/distributed/loss.py b/applications/ColossalChat/coati/distributed/loss.py index af5776731a25..90ad09736281 100644 --- a/applications/ColossalChat/coati/distributed/loss.py +++ b/applications/ColossalChat/coati/distributed/loss.py @@ -23,6 +23,7 @@ def forward( advantages: torch.Tensor, per_token_kl: torch.Tensor, action_mask: Optional[torch.Tensor] = None, + loss_mask: Optional[torch.Tensor] = None, ) -> torch.Tensor: skip = False if action_mask is None: @@ -38,5 +39,7 @@ def forward( loss = masked_mean(loss, action_mask) else: loss = loss.mean(dim=1) + if loss_mask is not None: + loss = loss * loss_mask loss = loss.mean() return loss, skip, ratio.max() diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index 1de8b649d5d1..fc32ece21cec 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -15,18 +15,14 @@ parser.add_argument("-tbs", "--train-batch-size", type=int, default=32) parser.add_argument("-tmbs", "--train-microbatch-size", type=int, default=1) parser.add_argument("-b", "--backend", type=str, default="transformers") - 
parser.add_argument("-a", "--algo", type=str, default="GRPO", choices=["Simple, GRPO"]) + parser.add_argument("-a", "--algo", type=str, default="GRPO", choices=["Simple", "GRPO", "EvalGRPO"]) args = parser.parse_args() ray.init(address="local", namespace="ray-example") inference_model_config = dict(path=args.model) train_model_config = dict(path=args.model) - generate_config = dict( - top_k=50, - top_p=0.9, - temperature=1.0, - ) + generate_config = dict(top_k=50, top_p=0.9, temperature=0.7) if args.backend == "transformers": inference_model_config.update( @@ -52,19 +48,13 @@ ) ) elif args.backend == "vllm": - inference_model_config.update( - dict( - gpu_memory_utilization=0.7, - ) - ) + inference_model_config.update(dict(gpu_memory_utilization=0.7, enforce_eager=True, enable_chunked_prefill=True)) generate_config.update( dict( max_tokens=2048, ignore_eos=True, include_stop_str_in_output=True, stop=[""], - temperature=0.7, - top_p=0.95, ) ) else: @@ -97,6 +87,6 @@ plugin_config={}, inference_backend=args.backend, master_addr="localhost", - master_port=29504, + master_port=29503, core_algo=args.algo, ) From 0472f44163c232d78506e6b2ebf7932780332951 Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Fri, 21 Mar 2025 10:24:24 +0800 Subject: [PATCH 032/100] fix logprob, add filtering, temperature annealing, lr descent --- .../coati/distributed/consumer.py | 3 ++ .../coati/distributed/grpo_consumer.py | 50 ++++++++++++------- .../coati/distributed/inference_backend.py | 30 ++++++++--- .../ColossalChat/coati/distributed/launch.py | 5 +- .../coati/distributed/producer.py | 9 +++- applications/ColossalChat/rl_example.py | 2 +- colossalai/shardformer/layer/loss.py | 2 +- 7 files changed, 74 insertions(+), 27 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index 1e85cccb3c5b..de289738347c 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -57,6 +57,7 @@ def __init__( assert self.plugin_config.get("pp_size", 1) == 1, "pp_size > 1 is not supported now" self.device = get_current_device() + self.lr_scheduler = None def setup(self) -> None: for i in range(self.num_producers): @@ -121,6 +122,8 @@ def loop(self) -> None: pbar.set_postfix({"loss": loss}) i += 1 assert len(self.buffer) == 0 + if self.lr_scheduler is not None: + self.lr_scheduler.step() if (step + 1) % self.save_interval == 0: if self.rank == 0: print(f"Start saving policy model at step {step + 1}.") diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 785fa820ec8b..5a488f5aaf91 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -15,6 +15,7 @@ from coati.trainer.utils import all_reduce_mean from transformers import AutoModelForCausalLM, AutoTokenizer +from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR from colossalai.nn.optimizer import HybridAdam @@ -34,10 +35,10 @@ def __init__( model_config, plugin_config, microbatch_size=1, - num_generations=4, + num_generations=8, use_wandb=True, - generator_config=None, - filter_range=None, + generate_config=None, + training_config={}, ): super().__init__( num_producers, @@ -57,7 +58,7 @@ def __init__( self.policy_model = AutoModelForCausalLM.from_pretrained(path, **model_config) self.policy_model.train() 
self.policy_model.gradient_checkpointing_enable() - self.optimizer = HybridAdam(self.policy_model.parameters(), lr=1e-6) + self.optimizer = HybridAdam(self.policy_model.parameters(), lr=training_config.get("lr", 1e-6)) self.accum_loss = torch.zeros(1, device=self.device) self.accum_reward = torch.zeros(1, device=self.device) self.accum_kl = torch.zeros(1, device=self.device) @@ -66,6 +67,7 @@ def __init__( self.accum_advantages = torch.zeros(1, device=self.device) self.accum_response_length = torch.zeros(1, device=self.device) self.accum_count = 0 + self.generate_config = generate_config # Reference model is initialized from policy model. self.reference_model = AutoModelForCausalLM.from_pretrained(path, **model_config) @@ -74,7 +76,7 @@ def __init__( self.tokenizer = AutoTokenizer.from_pretrained(path) self.pad_token_id = self.tokenizer.pad_token_id self.num_generations = num_generations - self.filter_range = filter_range + self.filter_range = training_config.get("filter_range", None) if self.filter_range is not None: assert len(self.filter_range) == 2, "Filter range should have 2 values." @@ -92,15 +94,21 @@ def __init__( self.policy_loss_fn = PolicyLoss() self.global_step = 0 if use_wandb and self.rank == 0: - if "repetition_penalty" in generator_config: - name = f"{generator_config['backend']}_bs_{self.batch_size*self.world_size}_temp_{generator_config['temperature']:.01f}_rep_penalty_{generator_config['repetition_penalty']:.01f}" - else: - name = f"{generator_config['backend']}_bs_{self.batch_size*self.world_size}_temp_{generator_config['temperature']:.01f}" + name = f"{generate_config['backend']}_bs_{self.batch_size*self.world_size}_temp_{generate_config['temperature']:.01f}_top_p_{generate_config['top_p']:.02f}" self.wandb_run = wandb.init(project="GRPO-V1", sync_tensorboard=True, dir="./wandb", name=name) + self.lr_scheduler = CosineAnnealingWarmupLR( + optimizer=self.optimizer, + total_steps=min(self.num_episodes, 4) * self.num_update_per_episode, + warmup_steps=0, + eta_min=0.1 * training_config.get("lr", 1e-6), + ) + def setup(self): super().setup() - self.policy_model, self.optimizer, *_ = self.booster.boost(self.policy_model, self.optimizer) + self.policy_model, self.optimizer, _, _, self.lr_scheduler = self.booster.boost( + self.policy_model, self.optimizer, lr_scheduler=self.lr_scheduler + ) self.reference_model, *_ = self.booster.boost(self.reference_model) def step(self, step_idx: int, **kwargs) -> Optional[float]: @@ -133,7 +141,10 @@ def step(self, step_idx: int, **kwargs) -> Optional[float]: attention_mask=data["attention_mask"], )["logits"] action_log_probs = calc_action_log_probs( - policy_model_logits / generator_config["temperature"], data["input_ids"], num_action, self.plugin.shard_config + policy_model_logits / self.generate_config["temperature"], + data["input_ids"], + num_action, + self.plugin.shard_config, ) with torch.no_grad(): @@ -142,7 +153,10 @@ def step(self, step_idx: int, **kwargs) -> Optional[float]: attention_mask=data["attention_mask"], )["logits"] reference_action_log_probs = calc_action_log_probs( - reference_model_logits / generator_config["temperature"], data["input_ids"], num_action, self.plugin.shard_config + reference_model_logits / self.generate_config["temperature"], + data["input_ids"], + num_action, + self.plugin.shard_config, ) per_token_kl = ( @@ -161,22 +175,24 @@ def step(self, step_idx: int, **kwargs) -> Optional[float]: acc_reward = torch.tensor([value[2] for value in reward_group]).to(data["input_ids"].device) # [batch_size, 
num_generations] + + group_reward = reward.view(-1, self.num_generations) + reward_mean = group_reward.mean(dim=1) # filter out the reward that is too high (all sample gets full score) or too low (all sample gets 0 score), loss_mask = ( None if self.filter_range is None - else torch.logical_and(reward > self.filter_range[0], reward < self.filter_range[1]) + else torch.logical_and( + reward_mean > self.filter_range[0], reward_mean < self.filter_range[1] + ).repeat_interleave(self.num_generations, dim=0) ) - group_reward = reward.view(-1, self.num_generations) - reward_mean = group_reward.mean(dim=1) # [batch_size x num_generations] - reward_mean = group_reward.mean(dim=1).repeat_interleave(self.num_generations, dim=0) + reward_mean = reward_mean.repeat_interleave(self.num_generations, dim=0) reward_std = group_reward.std(dim=1).repeat_interleave(self.num_generations, dim=0) # [batch_size x num_generations] advantages = (reward - reward_mean) / (reward_std + 1e-4) - # Calculate Loss loss, skip_update, _ = self.policy_loss_fn( action_log_probs, old_action_log_probs, diff --git a/applications/ColossalChat/coati/distributed/inference_backend.py b/applications/ColossalChat/coati/distributed/inference_backend.py index b4cfffa4dc81..5039d89f5df5 100644 --- a/applications/ColossalChat/coati/distributed/inference_backend.py +++ b/applications/ColossalChat/coati/distributed/inference_backend.py @@ -53,7 +53,13 @@ class TransformersInferenceBackend(BaseInferenceBackend): ) FORCE_GENERATE_CONFIG = dict(output_logits=True, return_dict_in_generate=True) - def __init__(self, model_config: Dict[str, Any], generate_config: Dict[str, Any], tokenizer: PreTrainedTokenizer): + def __init__( + self, + model_config: Dict[str, Any], + generate_config: Dict[str, Any], + tokenizer: PreTrainedTokenizer, + num_generations: int = 8, + ): model_config = update_by_default(model_config, self.DEFAULT_MODEL_CONFIG) model_config.update(self.FORCE_MODEL_CONFIG) path = model_config.pop("path") @@ -61,7 +67,7 @@ def __init__(self, model_config: Dict[str, Any], generate_config: Dict[str, Any] self.generate_config = generate_config.copy() self.generate_config.update(self.FORCE_GENERATE_CONFIG) self.tokenizer = tokenizer - self.num_generations = 8 + self.num_generations = num_generations @torch.no_grad() def generate(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwargs) -> Dict[str, torch.Tensor]: @@ -120,7 +126,13 @@ def load_state_dict(self, state_dict: Dict[str, torch.Tensor]) -> None: class SGLangInferenceBackend(BaseInferenceBackend): - def __init__(self, model_config: Dict[str, Any], generate_config: Dict[str, Any], tokenizer: PreTrainedTokenizer): + def __init__( + self, + model_config: Dict[str, Any], + generate_config: Dict[str, Any], + tokenizer: PreTrainedTokenizer, + num_generations: int = 8, + ): if sgl is None: raise ImportError("sglang is not installed") path = model_config.pop("path") @@ -175,10 +187,15 @@ class VLLMInferenceBackend(BaseInferenceBackend): ) FORCE_GENERATE_CONFIG = dict( logprobs=0, - n=8, ) - def __init__(self, model_config: Dict[str, Any], generate_config: Dict[str, Any], tokenizer: PreTrainedTokenizer): + def __init__( + self, + model_config: Dict[str, Any], + generate_config: Dict[str, Any], + tokenizer: PreTrainedTokenizer, + num_generations: int = 8, + ): if LLM is None: raise ImportError("vllm is not installed") model_config = update_by_default(model_config, self.DEFAULT_MODEL_CONFIG) @@ -186,9 +203,10 @@ def __init__(self, model_config: Dict[str, Any], generate_config: 
Dict[str, Any] self.llm = LLM(model=path, **model_config) generate_config = generate_config.copy() generate_config.update(self.FORCE_GENERATE_CONFIG) + generate_config.update({"n": num_generations}) self.generate_config = SamplingParams(**generate_config) self.tokenizer = tokenizer - self.num_generations = self.FORCE_GENERATE_CONFIG["n"] + self.num_generations = num_generations @torch.no_grad() def generate(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwargs) -> Dict[str, torch.Tensor]: diff --git a/applications/ColossalChat/coati/distributed/launch.py b/applications/ColossalChat/coati/distributed/launch.py index 512e7261fd6c..c50db1378e16 100644 --- a/applications/ColossalChat/coati/distributed/launch.py +++ b/applications/ColossalChat/coati/distributed/launch.py @@ -42,6 +42,7 @@ def launch_distributed( plugin_config: Dict[str, Any], tokenizer_config: Optional[Dict[str, Any]] = None, inference_backend: str = "transformers", + num_generations: int = 8, master_addr: str = "localhost", master_port: int = 29500, core_algo: str = "GRPO", @@ -76,6 +77,7 @@ def launch_distributed( tokenizer_config=tokenizer_config, microbatch_size=inference_microbatch_size, backend=inference_backend, + num_generations=num_generations, ) procs.append(producer) generate_config_consumer = copy.deepcopy(generate_config) @@ -99,7 +101,8 @@ def launch_distributed( plugin_config=plugin_config, microbatch_size=train_microbatch_size, generate_config=generate_config_consumer, - filter_range=[0.05, 9.0], + training_config={"filter_range": [0.05, 9.0], "lr": 1e-6}, + num_generations=num_generations, ) procs.append(consumer) ray.get([p.setup.remote() for p in procs]) diff --git a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py index a3ae22a7935c..6cc9b33305a6 100644 --- a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -117,6 +117,12 @@ def loop(self) -> None: None, self.num_producers, device=self.device, group_name="sync_model" ) self.load_state_dict(state_dict) + # linear annealing for 1 episode, temperature from initial to 0.7 + if episode <= 0: + ratio = 1 - (len(self.dataloader) - i) / len(self.dataloader) + self.model.generate_config.temperature = ( + ratio * self.generate_config["temperature"] + (1 - ratio) * 0.7 + ) @ray.remote @@ -135,6 +141,7 @@ def __init__( tokenizer_config=None, microbatch_size=1, backend="transformers", + num_generations: int = 8, ): super().__init__( producer_idx, @@ -150,7 +157,7 @@ def __init__( microbatch_size, backend, ) - self.model = self.backend_cls(model_config, generate_config, self.tokenizer) + self.model = self.backend_cls(model_config, generate_config, self.tokenizer, num_generations) @torch.no_grad() def rollout(self, input_ids, attention_mask, **kwargs): diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index fc32ece21cec..a67a10bc5b35 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -22,7 +22,7 @@ inference_model_config = dict(path=args.model) train_model_config = dict(path=args.model) - generate_config = dict(top_k=50, top_p=0.9, temperature=0.7) + generate_config = dict(top_k=50, top_p=0.75, temperature=0.9) if args.backend == "transformers": inference_model_config.update( diff --git a/colossalai/shardformer/layer/loss.py b/colossalai/shardformer/layer/loss.py index 51419a38a0ed..a9bb76fc7d6b 100644 --- 
a/colossalai/shardformer/layer/loss.py +++ b/colossalai/shardformer/layer/loss.py @@ -387,7 +387,7 @@ def dist_log_prob( dtype=dtype, ) else: - log_prob = log_softmax(logits) + log_prob = log_softmax(logits, dim=-1) log_prob = log_prob.gather(dim=-1, index=labels.unsqueeze(-1)) return log_prob From d8eaf0d4734af11f4e57600509413ef6242966e3 Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Fri, 21 Mar 2025 15:03:10 +0800 Subject: [PATCH 033/100] simplify vllm preprocessing input ids --- .../coati/distributed/inference_backend.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/inference_backend.py b/applications/ColossalChat/coati/distributed/inference_backend.py index 5039d89f5df5..17c71c8a85b1 100644 --- a/applications/ColossalChat/coati/distributed/inference_backend.py +++ b/applications/ColossalChat/coati/distributed/inference_backend.py @@ -212,13 +212,11 @@ def __init__( def generate(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwargs) -> Dict[str, torch.Tensor]: micro_batch_size = input_ids.size(0) response_start_idx = input_ids.size(1) + first_non_padding_token_idx = (input_ids != self.tokenizer.pad_token_id).int().argmax(dim=1) micro_batch_input_ids = input_ids.tolist() - micro_batch_input_ids_no_padding = [] - for i in range(micro_batch_size): - for j in range(input_ids.size(1)): - if micro_batch_input_ids[i][j] != self.tokenizer.pad_token_id: - micro_batch_input_ids_no_padding.append(micro_batch_input_ids[i][j:]) - break + micro_batch_input_ids_no_padding = [ + micro_batch_input_ids[i][first_non_padding_token_idx[i] :] for i in range(micro_batch_size) + ] outputs = self.llm.generate( prompt_token_ids=micro_batch_input_ids_no_padding, sampling_params=self.generate_config, use_tqdm=False ) From 2aa7385c88a1f271bb49741e56c19989421e3756 Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Fri, 21 Mar 2025 16:12:07 +0800 Subject: [PATCH 034/100] update logging --- .../ColossalChat/coati/distributed/grpo_consumer.py | 11 ++++++----- .../ColossalChat/coati/distributed/producer.py | 3 +++ 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 5a488f5aaf91..1c0773f4e9bc 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -133,7 +133,6 @@ def step(self, step_idx: int, **kwargs) -> Optional[float]: response_length = torch.sum(action_mask, dim=1).to(torch.float32) need_update = (step_idx + 1) % self.num_microbatches == 0 - ctx = nullcontext() if need_update else self.booster.no_sync(self.policy_model, self.optimizer) with ctx: policy_model_logits = self.policy_model( @@ -243,13 +242,15 @@ def step(self, step_idx: int, **kwargs) -> Optional[float]: ) self.wandb_run.log( { + "metrics/reward": self.accum_reward.item() / self.accum_count, + "metrics/format_reward": self.accum_format_reward.item() / self.accum_count, + "metrics/acc_reward": self.accum_acc_reward.item() / self.accum_count, + "metrics/response_length": self.accum_response_length.item() / self.accum_count, "train/loss": self.accum_loss.item() / self.accum_count, - "train/reward": self.accum_reward.item() / self.accum_count, - "train/format_reward": self.accum_format_reward.item() / self.accum_count, - "train/acc_reward": self.accum_acc_reward.item() / self.accum_count, "train/kl": self.accum_kl.item() / self.accum_count, 
"train/advantages": self.accum_advantages.item() / self.accum_count, - "train/response_length": self.accum_response_length.item() / self.accum_count, + "train/learning_rate": self.lr_scheduler.get_last_lr()[0], + "rollout/temperature": data["temperature"].cpu().numpy()[0][0], } ) self.accum_loss.zero_() diff --git a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py index 6cc9b33305a6..51a1af332f25 100644 --- a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -101,6 +101,9 @@ def loop(self) -> None: break outputs = self.rollout(**batch) print(f"[P{self.producer_idx}] Send data {[(k, v.shape) for k, v in outputs.items()]}") + outputs["temperature"] = torch.tensor( + [self.model.generate_config.temperature] * outputs["input_ids"].size(0) + ).to(outputs["input_ids"].device) outputs = pre_send(outputs) ray_broadcast_tensor_dict( outputs, src=0, device=self.device, group_name=f"sync_data_{self.producer_idx}" From 50153005b4c2af4dfecc3b07adc91b20e97d00b0 Mon Sep 17 00:00:00 2001 From: YeAnbang <44796419+YeAnbang@users.noreply.github.com> Date: Fri, 28 Mar 2025 10:24:58 +0800 Subject: [PATCH 035/100] [feat] add microbatch forwarding (#6251) * add microbatch forwarding * fix forward microbatch * fix producer OOM * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * change project name * fix temperature annealing * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * address conversation --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../coati/distributed/consumer.py | 7 +- .../coati/distributed/grpo_consumer.py | 133 +++++++++++------- .../ColossalChat/coati/distributed/launch.py | 9 +- .../coati/distributed/producer.py | 10 +- applications/ColossalChat/rl_example.py | 27 ++-- 5 files changed, 113 insertions(+), 73 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index de289738347c..027acc2e7537 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -66,12 +66,7 @@ def setup(self) -> None: cc.init_collective_group(self.num_producers + 1, self.num_producers, group_name="sync_model") launch(self.rank, self.world_size, self.master_addr, self.master_port, local_rank=0) - plugin_config = dict( - tp_size=1, - pp_size=1, - precision="bf16", - zero_stage=1, - ) + plugin_config = dict(tp_size=1, pp_size=1, precision="bf16", zero_stage=2) if self.plugin_config.get("pp_size", 1) > 1 and "num_microbatches" not in self.plugin_config: plugin_config["microbatch_size"] = self.microbatch_size plugin_config.update(self.plugin_config) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 1c0773f4e9bc..4174f96514b8 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -68,6 +68,7 @@ def __init__( self.accum_response_length = torch.zeros(1, device=self.device) self.accum_count = 0 self.generate_config = generate_config + self.training_config = training_config # Reference model is initialized from policy model. 
self.reference_model = AutoModelForCausalLM.from_pretrained(path, **model_config) @@ -131,40 +132,16 @@ def step(self, step_idx: int, **kwargs) -> Optional[float]: num_action = action_mask.shape[1] old_action_log_probs = data["action_log_probs"] response_length = torch.sum(action_mask, dim=1).to(torch.float32) + forward_batch_size = self.training_config.get("train_microbatch_size", data["input_ids"].size(0)) need_update = (step_idx + 1) % self.num_microbatches == 0 - ctx = nullcontext() if need_update else self.booster.no_sync(self.policy_model, self.optimizer) + # Gradient must be synchronized if zero2 is enabled. https://github.com/hpcaitech/ColossalAI/blob/44d4053fec005fe0b06b6bc755fdc962463145df/colossalai/booster/plugin/hybrid_parallel_plugin.py#L1500 + ctx = ( + nullcontext() + if need_update or self.booster.plugin.zero_stage == 2 + else self.booster.no_sync(self.policy_model, self.optimizer) + ) with ctx: - policy_model_logits = self.policy_model( - input_ids=data["input_ids"], - attention_mask=data["attention_mask"], - )["logits"] - action_log_probs = calc_action_log_probs( - policy_model_logits / self.generate_config["temperature"], - data["input_ids"], - num_action, - self.plugin.shard_config, - ) - - with torch.no_grad(): - reference_model_logits = self.reference_model( - input_ids=data["input_ids"], - attention_mask=data["attention_mask"], - )["logits"] - reference_action_log_probs = calc_action_log_probs( - reference_model_logits / self.generate_config["temperature"], - data["input_ids"], - num_action, - self.plugin.shard_config, - ) - - per_token_kl = ( - torch.exp(reference_action_log_probs - action_log_probs) - - (reference_action_log_probs - action_log_probs) - - 1 - ) - kl = torch.sum(per_token_kl * action_mask, dim=-1) / torch.sum(action_mask, dim=-1) - reward_group = self.reward_model( data["input_ids"], gt_answer=data["gt_answer"], response_idx=data["response_idx"] ) @@ -177,6 +154,11 @@ def step(self, step_idx: int, **kwargs) -> Optional[float]: group_reward = reward.view(-1, self.num_generations) reward_mean = group_reward.mean(dim=1) + # [batch_size x num_generations] + reward_mean = reward_mean.repeat_interleave(self.num_generations, dim=0) + reward_std = group_reward.std(dim=1).repeat_interleave(self.num_generations, dim=0) + # [batch_size x num_generations] + advantages = ((reward - reward_mean) / (reward_std + 1e-4)).unsqueeze(dim=-1) # filter out the reward that is too high (all sample gets full score) or too low (all sample gets 0 score), loss_mask = ( None @@ -185,35 +167,82 @@ def step(self, step_idx: int, **kwargs) -> Optional[float]: reward_mean > self.filter_range[0], reward_mean < self.filter_range[1] ).repeat_interleave(self.num_generations, dim=0) ) + mean_kl, mean_loss = [], [] + for forward_micro_batch_start in range(0, data["input_ids"].size(0), forward_batch_size): + input_ids_forward_micro_batch = data["input_ids"][ + forward_micro_batch_start : forward_micro_batch_start + forward_batch_size + ] + attention_mask_forward_micro_batch = data["attention_mask"][ + forward_micro_batch_start : forward_micro_batch_start + forward_batch_size + ] + action_mask_forward_micro_batch = action_mask[ + forward_micro_batch_start : forward_micro_batch_start + forward_batch_size + ] + loss_mask_forward_micro_batch = ( + loss_mask[forward_micro_batch_start : forward_micro_batch_start + forward_batch_size] + if loss_mask is not None + else None + ) + advantages_forward_micro_batch = advantages[ + forward_micro_batch_start : forward_micro_batch_start + 
forward_batch_size + ] + policy_model_logits = self.policy_model( + input_ids=input_ids_forward_micro_batch, + attention_mask=attention_mask_forward_micro_batch, + ).logits + action_log_probs = calc_action_log_probs( + policy_model_logits / self.generate_config["temperature"], + input_ids_forward_micro_batch, + num_action, + self.plugin.shard_config, + ) - # [batch_size x num_generations] - reward_mean = reward_mean.repeat_interleave(self.num_generations, dim=0) - reward_std = group_reward.std(dim=1).repeat_interleave(self.num_generations, dim=0) - # [batch_size x num_generations] - advantages = (reward - reward_mean) / (reward_std + 1e-4) - - loss, skip_update, _ = self.policy_loss_fn( - action_log_probs, - old_action_log_probs, - advantages.unsqueeze(dim=-1).repeat_interleave(action_log_probs.size(-1), dim=-1), - per_token_kl, - action_mask, - loss_mask=loss_mask, - ) + with torch.no_grad(): + reference_model_logits = self.reference_model( + input_ids=input_ids_forward_micro_batch, + attention_mask=attention_mask_forward_micro_batch, + ).logits + reference_action_log_probs = calc_action_log_probs( + reference_model_logits / self.generate_config["temperature"], + input_ids_forward_micro_batch, + num_action, + self.plugin.shard_config, + ) + + per_token_kl = ( + torch.exp(reference_action_log_probs - action_log_probs) + - (reference_action_log_probs - action_log_probs) + - 1 + ) + kl = torch.sum(per_token_kl * action_mask_forward_micro_batch, dim=-1) / torch.sum( + action_mask_forward_micro_batch, dim=-1 + ) + + loss, skip_update, _ = self.policy_loss_fn( + action_log_probs, + old_action_log_probs, + advantages_forward_micro_batch.repeat_interleave(action_log_probs.size(-1), dim=-1), + per_token_kl, + action_mask_forward_micro_batch, + loss_mask=loss_mask_forward_micro_batch, + ) + + if not skip_update: + self.booster.backward(loss, self.optimizer) + loss = all_reduce_mean(loss, self.plugin) + kl = all_reduce_mean(kl.mean(), self.plugin) + # Calculate accumulate value. + mean_kl.append(kl.data) + mean_loss.append(loss.data) - if not skip_update: - self.booster.backward(loss, self.optimizer) - loss = all_reduce_mean(loss, self.plugin) reward = all_reduce_mean(reward.mean(), self.plugin) - kl = all_reduce_mean(kl.mean(), self.plugin) format_reward = all_reduce_mean(format_reward.mean(), self.plugin) acc_reward = all_reduce_mean(acc_reward.mean(), self.plugin) advantages = all_reduce_mean(advantages.mean(), self.plugin) response_length = all_reduce_mean(response_length.mean(), self.plugin) - # Calculate accumulate value. 
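The group-relative advantage introduced above can be reproduced on toy rewards; everything except the 1e-4 epsilon and the reshaping pattern is illustrative:

```python
import torch

num_generations = 4               # generations per prompt (illustrative)
reward = torch.rand(8)            # flat [num_prompts * num_generations] rewards

# Statistics are taken per prompt group, then broadcast back to sample level.
group_reward = reward.view(-1, num_generations)
reward_mean = group_reward.mean(dim=1).repeat_interleave(num_generations, dim=0)
reward_std = group_reward.std(dim=1).repeat_interleave(num_generations, dim=0)

# A sample's advantage measures how it compares to its siblings generated from
# the same prompt; the epsilon guards against a zero std when all siblings tie.
advantages = ((reward - reward_mean) / (reward_std + 1e-4)).unsqueeze(dim=-1)
print(advantages.shape)  # torch.Size([8, 1])
```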
- self.accum_loss.add_(loss.data) + self.accum_loss.add_(sum(mean_loss) / len(mean_loss)) + self.accum_kl.add_(sum(mean_kl) / len(mean_kl)) self.accum_reward.add_(reward.data) - self.accum_kl.add_(kl.data) self.accum_format_reward.add_(format_reward.data) self.accum_acc_reward.add_(acc_reward.data) self.accum_advantages.add_(advantages.data) diff --git a/applications/ColossalChat/coati/distributed/launch.py b/applications/ColossalChat/coati/distributed/launch.py index c50db1378e16..ba5d3a9d4fd8 100644 --- a/applications/ColossalChat/coati/distributed/launch.py +++ b/applications/ColossalChat/coati/distributed/launch.py @@ -34,6 +34,7 @@ def launch_distributed( inference_microbatch_size: int, train_batch_size: int, train_microbatch_size: int, + train_minibatch_size: int, dataset_config: Dict[str, Any], dataloaders_config: Dict[str, Any], inference_model_config: Dict[str, Any], @@ -99,9 +100,13 @@ def launch_distributed( batch_size=train_batch_size, model_config=train_model_config, plugin_config=plugin_config, - microbatch_size=train_microbatch_size, + microbatch_size=train_minibatch_size, generate_config=generate_config_consumer, - training_config={"filter_range": [0.05, 9.0], "lr": 1e-6}, + training_config={ + "filter_range": [0.05, 9.0], + "lr": 1e-6, + "train_microbatch_size": train_microbatch_size, + }, num_generations=num_generations, ) procs.append(consumer) diff --git a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py index 51a1af332f25..2c6a24a36711 100644 --- a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -100,6 +100,7 @@ def loop(self) -> None: if i >= num_valid_microbatches: break outputs = self.rollout(**batch) + print(f"[P{self.producer_idx}] Send data {[(k, v.shape) for k, v in outputs.items()]}") outputs["temperature"] = torch.tensor( [self.model.generate_config.temperature] * outputs["input_ids"].size(0) @@ -116,16 +117,19 @@ def loop(self) -> None: print( f"[P{self.producer_idx}] Sync model episode {episode} step {(i + 1) // self.num_microbatches - 1}" ) + state_dict = ray_broadcast_tensor_dict( None, self.num_producers, device=self.device, group_name="sync_model" ) self.load_state_dict(state_dict) + del state_dict + torch.cuda.empty_cache() # linear annealing for 1 episode, temperature from initial to 0.7 if episode <= 0: ratio = 1 - (len(self.dataloader) - i) / len(self.dataloader) - self.model.generate_config.temperature = ( - ratio * self.generate_config["temperature"] + (1 - ratio) * 0.7 - ) + self.model.generate_config.temperature = (1 - ratio) * self.generate_config[ + "temperature" + ] + ratio * 0.7 @ray.remote diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index a67a10bc5b35..4a4a4c3404e9 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -10,18 +10,30 @@ parser.add_argument("-d", "--dataset", type=str, default="data.jsonl") parser.add_argument("-t", "--num-trainers", type=int, default=2) parser.add_argument("-i", "--num-inferencer", type=int, default=2) + parser.add_argument("-g", "--num-generations", type=int, default=8) parser.add_argument("-ibs", "--inference-batch-size", type=int, default=64) parser.add_argument("-imbs", "--inference-microbatch-size", type=int, default=8) parser.add_argument("-tbs", "--train-batch-size", type=int, default=32) - parser.add_argument("-tmbs", "--train-microbatch-size", type=int, default=1) + 
parser.add_argument("-tMbs", "--train-minibatch-size", type=int, default=1) + parser.add_argument("-tmbs", "--train-microbatch-size", type=int, default=2) parser.add_argument("-b", "--backend", type=str, default="transformers") parser.add_argument("-a", "--algo", type=str, default="GRPO", choices=["Simple", "GRPO", "EvalGRPO"]) args = parser.parse_args() + assert args.train_minibatch_size > 0, "Train mini batch size must be greater than 0" + assert ( + args.train_minibatch_size * args.num_generations >= args.train_microbatch_size + and args.train_microbatch_size > 0 + ), "Train micro batch size must be greater than 0 less than train mini batch size * num generations" + ray.init(address="local", namespace="ray-example") inference_model_config = dict(path=args.model) - train_model_config = dict(path=args.model) + train_model_config = dict( + path=args.model, + # use_flash_attention_2=True, + # use_cache=False + ) generate_config = dict(top_k=50, top_p=0.75, temperature=0.9) if args.backend == "transformers": @@ -31,13 +43,6 @@ torch_dtype=torch.bfloat16, ) ) - train_model_config.update( - dict( - use_flash_attention_2=True, - torch_dtype=torch.bfloat16, - use_cache=False, - ) - ) generate_config.update( dict( max_length=1024 + 512, @@ -78,15 +83,17 @@ inference_batch_size=args.inference_batch_size, inference_microbatch_size=args.inference_microbatch_size, train_batch_size=args.train_batch_size, + train_minibatch_size=args.train_minibatch_size, train_microbatch_size=args.train_microbatch_size, dataset_config={"path": args.dataset, "max_length": 300}, dataloaders_config={}, inference_model_config=inference_model_config, generate_config=generate_config, + num_generations=args.num_generations, train_model_config=train_model_config, plugin_config={}, inference_backend=args.backend, master_addr="localhost", - master_port=29503, + master_port=29505, core_algo=args.algo, ) From ed43a4be046a3fe7650b061086681e1bd1599b12 Mon Sep 17 00:00:00 2001 From: YeAnbang <44796419+YeAnbang@users.noreply.github.com> Date: Wed, 9 Apr 2025 13:23:24 +0800 Subject: [PATCH 036/100] [Distributed RLHF] Integration of PP (#6257) * update help information * update style * fix * minor fix * support PP training * add pp support * remove unused code * address conversation --------- Co-authored-by: Tong Li --- .gitignore | 1 + .../coati/distributed/consumer.py | 2 - .../coati/distributed/grpo_consumer.py | 306 ++++++++++++------ .../ColossalChat/coati/distributed/launch.py | 2 + applications/ColossalChat/rl_example.py | 63 +++- .../booster/plugin/hybrid_parallel_plugin.py | 6 +- colossalai/shardformer/modeling/qwen2.py | 1 + 7 files changed, 264 insertions(+), 117 deletions(-) diff --git a/.gitignore b/.gitignore index 16f764c1b1ef..533450a7cce1 100644 --- a/.gitignore +++ b/.gitignore @@ -164,3 +164,4 @@ coverage.xml applications/ColossalChat/logs applications/ColossalChat/tests/logs applications/ColossalChat/wandb +applications/ColossalChat/model diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index 027acc2e7537..c6ae7be2daf9 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -54,7 +54,6 @@ def __init__( self.model_config = model_config self.plugin_config = plugin_config - assert self.plugin_config.get("pp_size", 1) == 1, "pp_size > 1 is not supported now" self.device = get_current_device() self.lr_scheduler = None @@ -95,7 +94,6 @@ def loop(self) -> None: i = 0 for 
_ in range(self.num_recv_per_update): # receive data from producers - for r in range(self.num_producers): print(f"[T{dist.get_rank()}] Recv data episode {episode} step {step} from {r}") self.buffer.extend( diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 4174f96514b8..a282439cbd33 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -39,6 +39,7 @@ def __init__( use_wandb=True, generate_config=None, training_config={}, + project_name=None, ): super().__init__( num_producers, @@ -69,6 +70,7 @@ def __init__( self.accum_count = 0 self.generate_config = generate_config self.training_config = training_config + self.project_name = project_name # Reference model is initialized from policy model. self.reference_model = AutoModelForCausalLM.from_pretrained(path, **model_config) @@ -94,9 +96,7 @@ def __init__( self.policy_loss_fn = PolicyLoss() self.global_step = 0 - if use_wandb and self.rank == 0: - name = f"{generate_config['backend']}_bs_{self.batch_size*self.world_size}_temp_{generate_config['temperature']:.01f}_top_p_{generate_config['top_p']:.02f}" - self.wandb_run = wandb.init(project="GRPO-V1", sync_tensorboard=True, dir="./wandb", name=name) + self.use_wandb = use_wandb self.lr_scheduler = CosineAnnealingWarmupLR( optimizer=self.optimizer, @@ -107,10 +107,19 @@ def __init__( def setup(self): super().setup() + if self.use_wandb and ( + (not self.plugin.pp_size > 1 and self.rank == 0) + or (self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage()) + ): + # Initialize wandb. + name = f"{self.generate_config['backend']}_bs_{self.batch_size*self.dp_size}_temp_{self.generate_config['temperature']:.01f}_top_p_{self.generate_config['top_p']:.02f}" + self.wandb_run = wandb.init(project=self.project_name, sync_tensorboard=True, dir="./wandb", name=name) + self.policy_model, self.optimizer, _, _, self.lr_scheduler = self.booster.boost( self.policy_model, self.optimizer, lr_scheduler=self.lr_scheduler ) self.reference_model, *_ = self.booster.boost(self.reference_model) + self.plugin.logger.set_level("ERROR") def step(self, step_idx: int, **kwargs) -> Optional[float]: """ @@ -168,6 +177,7 @@ def step(self, step_idx: int, **kwargs) -> Optional[float]: ).repeat_interleave(self.num_generations, dim=0) ) mean_kl, mean_loss = [], [] + for forward_micro_batch_start in range(0, data["input_ids"].size(0), forward_batch_size): input_ids_forward_micro_batch = data["input_ids"][ forward_micro_batch_start : forward_micro_batch_start + forward_batch_size @@ -186,112 +196,210 @@ def step(self, step_idx: int, **kwargs) -> Optional[float]: advantages_forward_micro_batch = advantages[ forward_micro_batch_start : forward_micro_batch_start + forward_batch_size ] - policy_model_logits = self.policy_model( - input_ids=input_ids_forward_micro_batch, - attention_mask=attention_mask_forward_micro_batch, - ).logits - action_log_probs = calc_action_log_probs( - policy_model_logits / self.generate_config["temperature"], - input_ids_forward_micro_batch, - num_action, - self.plugin.shard_config, - ) - with torch.no_grad(): - reference_model_logits = self.reference_model( + if self.plugin.pp_size > 1: + # Support training with PP. 
+ + with torch.no_grad(): + reference_model_outputs = self.booster.execute_pipeline( + iter( + [ + { + "input_ids": input_ids_forward_micro_batch, + "attention_mask": attention_mask_forward_micro_batch, + } + ] + ), + self.reference_model, + criterion=lambda outputs, inputs: torch.tensor( + [0.0], device=action_mask.device + ), # dummy criterion + optimizer=None, + return_loss=False, + return_outputs=True, + ) + + if self.booster.plugin.stage_manager.is_last_stage(): + reference_model_logits = reference_model_outputs["outputs"]["logits"] + reference_action_log_probs = calc_action_log_probs( + reference_model_logits / self.generate_config["temperature"], + input_ids_forward_micro_batch, + num_action, + self.plugin.shard_config, + ) + else: + # Dummy reference logprobs for data iterator. + reference_action_log_probs = None + + data_policy_forward = { + "input_ids": input_ids_forward_micro_batch, + "attention_mask": attention_mask_forward_micro_batch, + "action_mask": action_mask_forward_micro_batch, + "reference_action_log_probs": reference_action_log_probs, + "advantages": advantages_forward_micro_batch, + "loss_mask": loss_mask_forward_micro_batch, + "source": self.rank, + } + + kl = [] + + def _criterion(outputs, inputs): + action_logits = outputs.logits + action_log_probs = calc_action_log_probs( + action_logits / self.generate_config["temperature"], + inputs["input_ids"], + num_action, + self.plugin.shard_config, + ) + per_token_kl = ( + torch.exp(inputs["reference_action_log_probs"] - action_log_probs) + - (inputs["reference_action_log_probs"] - action_log_probs) + - 1 + ) + appox_kl = torch.sum(per_token_kl * inputs["action_mask"], dim=-1) / torch.sum( + inputs["action_mask"], dim=-1 + ) + kl.append(appox_kl.mean()) + loss, skip_update, _ = self.policy_loss_fn( + action_log_probs, + action_log_probs, + inputs["advantages"].repeat_interleave(action_log_probs.size(-1), dim=-1), + per_token_kl, + inputs["action_mask"], + loss_mask=inputs["loss_mask"], + ) + return loss + + policy_model_outputs = self.booster.execute_pipeline( + iter([data_policy_forward]), + self.policy_model, + criterion=_criterion, + optimizer=self.optimizer, + return_loss=True, + return_outputs=True, + ) + loss = policy_model_outputs["loss"] + + if self.booster.plugin.stage_manager.is_last_stage(): + if len(kl) > 0: + kl = all_reduce_mean(torch.mean(torch.stack(kl)).to(loss.device), self.plugin) + mean_kl.append(kl) + loss = all_reduce_mean(loss, self.plugin) + mean_loss.append(loss.data) + else: + + policy_model_logits = self.policy_model( input_ids=input_ids_forward_micro_batch, attention_mask=attention_mask_forward_micro_batch, ).logits - reference_action_log_probs = calc_action_log_probs( - reference_model_logits / self.generate_config["temperature"], - input_ids_forward_micro_batch, - num_action, - self.plugin.shard_config, - ) + action_log_probs = calc_action_log_probs( + policy_model_logits / self.generate_config["temperature"], + input_ids_forward_micro_batch, + num_action, + self.plugin.shard_config, + ) - per_token_kl = ( - torch.exp(reference_action_log_probs - action_log_probs) - - (reference_action_log_probs - action_log_probs) - - 1 - ) - kl = torch.sum(per_token_kl * action_mask_forward_micro_batch, dim=-1) / torch.sum( - action_mask_forward_micro_batch, dim=-1 - ) + with torch.no_grad(): + reference_model_logits = self.reference_model( + input_ids=input_ids_forward_micro_batch, + attention_mask=attention_mask_forward_micro_batch, + ).logits + reference_action_log_probs = calc_action_log_probs( + 
reference_model_logits / self.generate_config["temperature"], + input_ids_forward_micro_batch, + num_action, + self.plugin.shard_config, + ) + per_token_kl = ( + torch.exp(reference_action_log_probs - action_log_probs) + - (reference_action_log_probs - action_log_probs) + - 1 + ) + kl = torch.sum(per_token_kl * action_mask_forward_micro_batch, dim=-1) / torch.sum( + action_mask_forward_micro_batch, dim=-1 + ) - loss, skip_update, _ = self.policy_loss_fn( - action_log_probs, - old_action_log_probs, - advantages_forward_micro_batch.repeat_interleave(action_log_probs.size(-1), dim=-1), - per_token_kl, - action_mask_forward_micro_batch, - loss_mask=loss_mask_forward_micro_batch, - ) + loss, skip_update, _ = self.policy_loss_fn( + action_log_probs, + old_action_log_probs, + advantages_forward_micro_batch.repeat_interleave(action_log_probs.size(-1), dim=-1), + per_token_kl, + action_mask_forward_micro_batch, + loss_mask=loss_mask_forward_micro_batch, + ) - if not skip_update: - self.booster.backward(loss, self.optimizer) - loss = all_reduce_mean(loss, self.plugin) - kl = all_reduce_mean(kl.mean(), self.plugin) - # Calculate accumulate value. - mean_kl.append(kl.data) - mean_loss.append(loss.data) - - reward = all_reduce_mean(reward.mean(), self.plugin) - format_reward = all_reduce_mean(format_reward.mean(), self.plugin) - acc_reward = all_reduce_mean(acc_reward.mean(), self.plugin) - advantages = all_reduce_mean(advantages.mean(), self.plugin) - response_length = all_reduce_mean(response_length.mean(), self.plugin) - self.accum_loss.add_(sum(mean_loss) / len(mean_loss)) - self.accum_kl.add_(sum(mean_kl) / len(mean_kl)) - self.accum_reward.add_(reward.data) - self.accum_format_reward.add_(format_reward.data) - self.accum_acc_reward.add_(acc_reward.data) - self.accum_advantages.add_(advantages.data) - self.accum_response_length.add_(response_length.data) - self.accum_count += 1 + if not skip_update: + self.booster.backward(loss, self.optimizer) + loss = all_reduce_mean(loss, self.plugin) + kl = all_reduce_mean(kl.mean(), self.plugin) + # Calculate accumulate value. 
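Both the pipeline and non-pipeline branches above use the same per-token KL estimate between the policy and the frozen reference model. A self-contained sketch with random log-probabilities (illustrative shapes only):

```python
import torch

action_log_probs = torch.randn(2, 5)            # policy log-probs per token
reference_action_log_probs = torch.randn(2, 5)  # reference log-probs per token
action_mask = torch.ones(2, 5)                  # 1 for response tokens

# exp(d) - d - 1 with d = log q - log p is non-negative and zero only when the
# two models assign the same log-probability to the sampled token; its
# expectation under the policy estimates KL(policy || reference).
delta = reference_action_log_probs - action_log_probs
per_token_kl = torch.exp(delta) - delta - 1

# Average over response tokens only.
kl = torch.sum(per_token_kl * action_mask, dim=-1) / torch.sum(action_mask, dim=-1)
print(kl.shape)  # torch.Size([2])
```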
+ mean_kl.append(kl.data) + mean_loss.append(loss.data) + if not self.plugin.pp_size > 1 or ( + self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() + ): + reward = all_reduce_mean(reward.mean(), self.plugin) + format_reward = all_reduce_mean(format_reward.mean(), self.plugin) + acc_reward = all_reduce_mean(acc_reward.mean(), self.plugin) + advantages = all_reduce_mean(advantages.mean(), self.plugin) + response_length = all_reduce_mean(response_length.mean(), self.plugin) + self.accum_loss.add_(sum(mean_loss) / len(mean_loss)) + self.accum_kl.add_(sum(mean_kl) / len(mean_kl)) + self.accum_reward.add_(reward.data) + self.accum_format_reward.add_(format_reward.data) + self.accum_acc_reward.add_(acc_reward.data) + self.accum_advantages.add_(advantages.data) + self.accum_response_length.add_(response_length.data) + self.accum_count += 1 if need_update: self.optimizer.step() self.optimizer.zero_grad() - loss_scalar = self.accum_loss.item() - if self.rank == 0: - print( - "Loss:", - self.accum_loss.item() / self.accum_count, - "\nReward:", - self.accum_reward.item() / self.accum_count, - "\nFormat Reward:", - self.accum_format_reward.item() / self.accum_count, - "\nAcc Reward:", - self.accum_acc_reward.item() / self.accum_count, - "\nKL:", - self.accum_kl.item() / self.accum_count, - "\nAdvantages:", - self.accum_advantages.item() / self.accum_count, - "\nResponse Length:", - self.accum_response_length.item() / self.accum_count, - ) - self.wandb_run.log( - { - "metrics/reward": self.accum_reward.item() / self.accum_count, - "metrics/format_reward": self.accum_format_reward.item() / self.accum_count, - "metrics/acc_reward": self.accum_acc_reward.item() / self.accum_count, - "metrics/response_length": self.accum_response_length.item() / self.accum_count, - "train/loss": self.accum_loss.item() / self.accum_count, - "train/kl": self.accum_kl.item() / self.accum_count, - "train/advantages": self.accum_advantages.item() / self.accum_count, - "train/learning_rate": self.lr_scheduler.get_last_lr()[0], - "rollout/temperature": data["temperature"].cpu().numpy()[0][0], - } - ) - self.accum_loss.zero_() - self.accum_reward.zero_() - self.accum_acc_reward.zero_() - self.accum_format_reward.zero_() - self.accum_kl.zero_() - self.accum_advantages.zero_() - self.accum_response_length.zero_() - - self.accum_count = 0 - return loss_scalar + if not self.plugin.pp_size > 1 or ( + self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() + ): + loss_scalar = self.accum_loss.item() + if (not self.plugin.pp_size > 1 and self.rank == 0) or ( + self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() + ): + print( + "Loss:", + self.accum_loss.item() / self.accum_count, + "\nReward:", + self.accum_reward.item() / self.accum_count, + "\nFormat Reward:", + self.accum_format_reward.item() / self.accum_count, + "\nAcc Reward:", + self.accum_acc_reward.item() / self.accum_count, + "\nKL:", + self.accum_kl.item() / self.accum_count, + "\nAdvantages:", + self.accum_advantages.item() / self.accum_count, + "\nResponse Length:", + self.accum_response_length.item() / self.accum_count, + ) + self.wandb_run.log( + { + "metrics/reward": self.accum_reward.item() / self.accum_count, + "metrics/format_reward": self.accum_format_reward.item() / self.accum_count, + "metrics/acc_reward": self.accum_acc_reward.item() / self.accum_count, + "metrics/response_length": self.accum_response_length.item() / self.accum_count, + "train/loss": self.accum_loss.item() / self.accum_count, 
+ "train/kl": self.accum_kl.item() / self.accum_count, + "train/advantages": self.accum_advantages.item() / self.accum_count, + "train/learning_rate": self.lr_scheduler.get_last_lr()[0], + "rollout/temperature": data["temperature"].cpu().numpy()[0][0], + } + ) + self.accum_loss.zero_() + self.accum_reward.zero_() + self.accum_acc_reward.zero_() + self.accum_format_reward.zero_() + self.accum_kl.zero_() + self.accum_advantages.zero_() + self.accum_response_length.zero_() + + self.accum_count = 0 + return loss_scalar def state_dict(self): self.policy_model._force_wait_all_gather() diff --git a/applications/ColossalChat/coati/distributed/launch.py b/applications/ColossalChat/coati/distributed/launch.py index ba5d3a9d4fd8..699d90a8cdff 100644 --- a/applications/ColossalChat/coati/distributed/launch.py +++ b/applications/ColossalChat/coati/distributed/launch.py @@ -47,6 +47,7 @@ def launch_distributed( master_addr: str = "localhost", master_port: int = 29500, core_algo: str = "GRPO", + project_name: Optional[str] = None, ): if core_algo not in ALGO_MAP: @@ -108,6 +109,7 @@ def launch_distributed( "train_microbatch_size": train_microbatch_size, }, num_generations=num_generations, + project_name=project_name, ) procs.append(consumer) ray.get([p.setup.remote() for p in procs]) diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index 4a4a4c3404e9..f87f12ed23ca 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -10,13 +10,44 @@ parser.add_argument("-d", "--dataset", type=str, default="data.jsonl") parser.add_argument("-t", "--num-trainers", type=int, default=2) parser.add_argument("-i", "--num-inferencer", type=int, default=2) - parser.add_argument("-g", "--num-generations", type=int, default=8) - parser.add_argument("-ibs", "--inference-batch-size", type=int, default=64) - parser.add_argument("-imbs", "--inference-microbatch-size", type=int, default=8) - parser.add_argument("-tbs", "--train-batch-size", type=int, default=32) - parser.add_argument("-tMbs", "--train-minibatch-size", type=int, default=1) - parser.add_argument("-tmbs", "--train-microbatch-size", type=int, default=2) - parser.add_argument("-b", "--backend", type=str, default="transformers") + parser.add_argument("-g", "--num-generations", type=int, default=8, help="Number of generations per prompt.") + parser.add_argument("-p", "--project", type=str, default="GRPO", help="Project name.") + parser.add_argument( + "-ibs", + "--inference-batch-size", + type=int, + default=64, + help="Number of prompts to generate per inference step. It should be divisible by tbs, and the weights on the inference backend will be synced every ibs/tbs training steps of the policy model.", + ) + parser.add_argument( + "-imbs", + "--inference-microbatch-size", + type=int, + default=8, + help="Effective batch size for the inference backend to run generation. Please select based on memory constraint.", + ) + parser.add_argument( + "-tbs", + "--train-batch-size", + type=int, + default=32, + help="Number of unique prompts to update policy model per step per dp group. Gradient is accumulated across tbs * dp_size unique prompts, equivalently tbs * g * dp_size samples", + ) + parser.add_argument( + "-tMbs", + "--train-minibatch-size", + type=int, + default=1, + help="Number of unique prompts in each training batch per dp group. The inference backend must generate tMbs * g * dp_size samples before forwarding. 
Satisfy tMbs * g >= tmbs", + ) + parser.add_argument( + "-tmbs", + "--train-microbatch-size", + type=int, + default=2, + help="Effective batch size per dp group for forwarding and backwarding. Please select based on the availiable memory.", + ) + parser.add_argument("-b", "--backend", type=str, default="transformers", choices=["transformers", "vllm"]) parser.add_argument("-a", "--algo", type=str, default="GRPO", choices=["Simple", "GRPO", "EvalGRPO"]) args = parser.parse_args() @@ -29,11 +60,7 @@ ray.init(address="local", namespace="ray-example") inference_model_config = dict(path=args.model) - train_model_config = dict( - path=args.model, - # use_flash_attention_2=True, - # use_cache=False - ) + train_model_config = dict(path=args.model, use_flash_attention_2=True, use_cache=False) generate_config = dict(top_k=50, top_p=0.75, temperature=0.9) if args.backend == "transformers": @@ -91,9 +118,17 @@ generate_config=generate_config, num_generations=args.num_generations, train_model_config=train_model_config, - plugin_config={}, + # plugin_config={}, # for zero + plugin_config={ + "pp_size": 2, + "tp_size": 1, + "microbatch_size": args.train_microbatch_size // 2, + "zero_stage": 0, + "max_norm": 1.0, + }, # for pp inference_backend=args.backend, master_addr="localhost", - master_port=29505, + master_port=29506, core_algo=args.algo, + project_name=args.project, ) diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py index 1684fd702e70..74349091b4d4 100644 --- a/colossalai/booster/plugin/hybrid_parallel_plugin.py +++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py @@ -1411,8 +1411,10 @@ def execute_pipeline( ) # run with gradients accumulation - if model.require_grad_sync == False or ( - isinstance(optimizer, HybridParallelZeroOptimizer) and optimizer.require_grad_sync == False + if ( + not torch.is_grad_enabled() + or model.require_grad_sync == False + or (isinstance(optimizer, HybridParallelZeroOptimizer) and optimizer.require_grad_sync == False) ): return outputs diff --git a/colossalai/shardformer/modeling/qwen2.py b/colossalai/shardformer/modeling/qwen2.py index 71e3557fe214..27571309e453 100644 --- a/colossalai/shardformer/modeling/qwen2.py +++ b/colossalai/shardformer/modeling/qwen2.py @@ -284,6 +284,7 @@ def qwen2_for_causal_lm_forward( hidden_states: Optional[torch.FloatTensor] = None, stage_index: Optional[List[int]] = None, shard_config: ShardConfig = None, + **kwargs, ): r""" Args: From 9467c106904d62bcfc00d2484ce1374f74da5826 Mon Sep 17 00:00:00 2001 From: YeAnbang <44796419+YeAnbang@users.noreply.github.com> Date: Thu, 10 Apr 2025 10:52:18 +0800 Subject: [PATCH 037/100] [hot-fix] Fix memory leakage bug, support TP+PP (#6258) * update help information * update style * fix * minor fix * support PP training * add pp support * remove unused code * address conversation * fix memory leakage support tp+pp * move empty cache * move empty cache --------- Co-authored-by: Tong Li --- .../ColossalChat/coati/distributed/consumer.py | 5 +++++ .../ColossalChat/coati/distributed/grpo_consumer.py | 13 ++++++------- applications/ColossalChat/rl_example.py | 2 +- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index c6ae7be2daf9..79beb2a2dba6 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -72,6 +72,8 @@ def setup(self) 
-> None: self.plugin = HybridParallelPlugin(**plugin_config) self.booster = Booster(plugin=self.plugin) self.dp_rank = dist.get_rank(self.plugin.dp_group) + self.tp_rank = dist.get_rank(self.plugin.tp_group) + self.dp_size = dist.get_world_size(self.plugin.dp_group) self.buffer = [] @@ -127,11 +129,14 @@ def loop(self) -> None: if episode != self.num_episodes - 1 or step != self.num_update_per_episode - 1: print(f"[T{dist.get_rank()}] Sync model episode {episode} step {step}") + torch.cuda.empty_cache() state_dict = self.state_dict() if self.rank == 0: ray_broadcast_tensor_dict( state_dict, src=self.num_producers, device=self.device, group_name="sync_model" ) + del state_dict + torch.cuda.empty_cache() @ray.remote diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index a282439cbd33..e23254d1b07a 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -109,7 +109,7 @@ def setup(self): super().setup() if self.use_wandb and ( (not self.plugin.pp_size > 1 and self.rank == 0) - or (self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage()) + or (self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() and self.tp_rank == 0) ): # Initialize wandb. name = f"{self.generate_config['backend']}_bs_{self.batch_size*self.dp_size}_temp_{self.generate_config['temperature']:.01f}_top_p_{self.generate_config['top_p']:.02f}" @@ -282,10 +282,9 @@ def _criterion(outputs, inputs): if self.booster.plugin.stage_manager.is_last_stage(): if len(kl) > 0: - kl = all_reduce_mean(torch.mean(torch.stack(kl)).to(loss.device), self.plugin) + kl = all_reduce_mean(torch.mean(torch.stack(kl)).to(loss.device), self.plugin).data mean_kl.append(kl) - loss = all_reduce_mean(loss, self.plugin) - mean_loss.append(loss.data) + mean_loss.append(all_reduce_mean(loss, self.plugin).data) else: policy_model_logits = self.policy_model( @@ -336,7 +335,7 @@ def _criterion(outputs, inputs): mean_kl.append(kl.data) mean_loss.append(loss.data) if not self.plugin.pp_size > 1 or ( - self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() + self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() and self.tp_rank == 0 ): reward = all_reduce_mean(reward.mean(), self.plugin) format_reward = all_reduce_mean(format_reward.mean(), self.plugin) @@ -355,11 +354,11 @@ def _criterion(outputs, inputs): self.optimizer.step() self.optimizer.zero_grad() if not self.plugin.pp_size > 1 or ( - self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() + self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() and self.tp_rank == 0 ): loss_scalar = self.accum_loss.item() if (not self.plugin.pp_size > 1 and self.rank == 0) or ( - self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() + self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() and self.tp_rank == 0 ): print( "Loss:", diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index f87f12ed23ca..6c43ccd1960f 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -121,7 +121,7 @@ # plugin_config={}, # for zero plugin_config={ "pp_size": 2, - "tp_size": 1, + "tp_size": 2, "microbatch_size": args.train_microbatch_size // 2, "zero_stage": 0, "max_norm": 1.0, From 
03f4b1dde31d80c548e383bc514f10dde07195a1 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Tue, 22 Apr 2025 10:39:47 +0800 Subject: [PATCH 038/100] add prompt template (#6273) Co-authored-by: Tong Li --- .../ColossalChat/coati/dataset/loader.py | 9 ++++++--- applications/ColossalChat/rl_example.py | 19 ++++++++++--------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/applications/ColossalChat/coati/dataset/loader.py b/applications/ColossalChat/coati/dataset/loader.py index 4518fd71f91b..43cf78383015 100755 --- a/applications/ColossalChat/coati/dataset/loader.py +++ b/applications/ColossalChat/coati/dataset/loader.py @@ -352,12 +352,14 @@ def apply_chat_template_and_mask( tokenizer: PreTrainedTokenizer, chat: List[Dict[str, str]], max_length: Optional[int] = None, + system_prompt: str = None, padding: bool = True, truncation: bool = True, ignore_idx: int = -100, ) -> Dict[str, torch.Tensor]: - system_prompt = "You are a helpful assistant. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>. Now the user asks you to solve a math problem that involves reasoning. After thinking, when you finally reach a conclusion, clearly output the final answer without explanation within the <answer> </answer> tags, i.e., <answer> 123 </answer>.\n\n" + if system_prompt is None: + system_prompt = "You are a helpful assistant. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>. Now the user asks you to solve a math problem that involves reasoning. After thinking, when you finally reach a conclusion, clearly output the final answer without explanation within the <answer> </answer> tags, i.e., <answer> 123 </answer>.\n\n" system_element = { "role": "system", @@ -419,7 +421,7 @@ class RawConversationDataset(Dataset): Each instance is a dictionary with fields `system`, `roles`, `messages`, `offset`, `sep_style`, `seps`.
""" - def __init__(self, tokenizer: PreTrainedTokenizer, input_file: str, max_length: int) -> None: + def __init__(self, tokenizer: PreTrainedTokenizer, input_file: str, max_length: int, system_prompt: str) -> None: self.tokenizer = tokenizer self.raw_texts = [] with jsonlines.open(input_file) as f: @@ -427,6 +429,7 @@ def __init__(self, tokenizer: PreTrainedTokenizer, input_file: str, max_length: self.raw_texts.append(line) self.tokenized_texts = [None] * len(self.raw_texts) self.max_length = max_length + self.system_prompt = system_prompt def __len__(self) -> int: return len(self.raw_texts) @@ -434,6 +437,6 @@ def __len__(self) -> int: def __getitem__(self, index: int): if self.tokenized_texts[index] is None: message = self.raw_texts[index] - tokens = apply_chat_template_and_mask(self.tokenizer, message, self.max_length) + tokens = apply_chat_template_and_mask(self.tokenizer, message, self.max_length, self.system_prompt) self.tokenized_texts[index] = dict(tokens) return self.tokenized_texts[index] diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index 6c43ccd1960f..317446695036 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -49,6 +49,7 @@ ) parser.add_argument("-b", "--backend", type=str, default="transformers", choices=["transformers", "vllm"]) parser.add_argument("-a", "--algo", type=str, default="GRPO", choices=["Simple", "GRPO", "EvalGRPO"]) + parser.add_argument("-s", "--system-prompt", type=str, default=None, help="System prompt for data construction.") args = parser.parse_args() assert args.train_minibatch_size > 0, "Train mini batch size must be greater than 0" @@ -112,20 +113,20 @@ train_batch_size=args.train_batch_size, train_minibatch_size=args.train_minibatch_size, train_microbatch_size=args.train_microbatch_size, - dataset_config={"path": args.dataset, "max_length": 300}, + dataset_config={"path": args.dataset, "max_length": 300, "system_prompt": args.system_prompt}, dataloaders_config={}, inference_model_config=inference_model_config, generate_config=generate_config, num_generations=args.num_generations, train_model_config=train_model_config, - # plugin_config={}, # for zero - plugin_config={ - "pp_size": 2, - "tp_size": 2, - "microbatch_size": args.train_microbatch_size // 2, - "zero_stage": 0, - "max_norm": 1.0, - }, # for pp + plugin_config={}, # Default setting: zero. 
+ # plugin_config={ + # "pp_size": 2, + # "tp_size": 2, + # "microbatch_size": args.train_microbatch_size // 2, + # "zero_stage": 0, + # "max_norm": 1.0, + # }, # for pp inference_backend=args.backend, master_addr="localhost", master_port=29506, From b823c6eec757a03dbc8b48d8bf25a5748e99d770 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Wed, 23 Apr 2025 10:03:46 +0800 Subject: [PATCH 039/100] [feat] Add final save at the end (#6274) * add final save * default 1 episode --- applications/ColossalChat/coati/distributed/consumer.py | 2 +- applications/ColossalChat/rl_example.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index 79beb2a2dba6..b7b865b26286 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -119,7 +119,7 @@ def loop(self) -> None: assert len(self.buffer) == 0 if self.lr_scheduler is not None: self.lr_scheduler.step() - if (step + 1) % self.save_interval == 0: + if (step + 1) % self.save_interval == 0 or (step + 1) == self.num_update_per_episode: if self.rank == 0: print(f"Start saving policy model at step {step + 1}.") save_path = os.path.join(self.save_dir, f"modeling-step-{step + 1}") diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index 317446695036..f42a660b78ff 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -107,7 +107,7 @@ num_producers=args.num_inferencer, num_proc_per_producer=1, num_consumer_procs=args.num_trainers, - num_episodes=10, + num_episodes=1, inference_batch_size=args.inference_batch_size, inference_microbatch_size=args.inference_microbatch_size, train_batch_size=args.train_batch_size, From 26d859f68e12f3da00bff974d864f54fb9e6df72 Mon Sep 17 00:00:00 2001 From: YeAnbang <44796419+YeAnbang@users.noreply.github.com> Date: Fri, 25 Apr 2025 17:39:17 +0800 Subject: [PATCH 040/100] [feat] Support DAPO (#6263) * update help information * update style * fix * minor fix * support PP training * add pp support * remove unused code * address conversation * fix memory leakage support tp+pp * move empty cache * move empty cache * add DAPO support * remove format reward * fix filtering, still buggy * small fix * add DAPO support * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * tested multi-node training; fix bind_batch bug * fix conversation; support sleep mode * support reusing excessive samples * add dynamic batching control flag * add dynamic batching control flag * refactored * fix logging --------- Co-authored-by: Tong Li Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../coati/distributed/consumer.py | 38 +- .../coati/distributed/grpo_consumer.py | 552 +++++++++--------- .../coati/distributed/inference_backend.py | 2 + .../ColossalChat/coati/distributed/launch.py | 17 +- .../ColossalChat/coati/distributed/loss.py | 49 +- .../coati/distributed/producer.py | 31 +- .../coati/distributed/reward/reward_fn.py | 31 +- .../ColossalChat/coati/distributed/utils.py | 39 ++ .../ColossalChat/coati/trainer/utils.py | 8 +- applications/ColossalChat/rl_example.py | 144 ++++- 10 files changed, 552 insertions(+), 359 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index 
b7b865b26286..28d83fa40588 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -16,7 +16,7 @@ from colossalai.utils import get_current_device from .comm import ray_broadcast_tensor_dict -from .utils import bind_batch, post_recv, unbind_batch +from .utils import bind_batch, pad_batch, post_recv, unbind_batch class BaseConsumer: @@ -33,7 +33,7 @@ def __init__( batch_size: int, model_config: Dict[str, Any], plugin_config: Dict[str, Any], - microbatch_size: int = 1, + minibatch_size: int = 1, save_interval: int = 100, save_dir: str = "./model", ): @@ -46,11 +46,11 @@ def __init__( self.num_update_per_episode = num_update_per_episode self.num_recv_per_update = num_recv_per_update self.batch_size = batch_size - self.microbatch_size = microbatch_size + self.minibatch_size = minibatch_size self.save_interval = save_interval self.save_dir = save_dir - assert batch_size % microbatch_size == 0, "batch_size should be divisible by microbatch_size" - self.num_microbatches = batch_size // microbatch_size + assert batch_size % minibatch_size == 0, "batch_size should be divisible by microbatch_size" + self.num_microbatches = batch_size // minibatch_size self.model_config = model_config self.plugin_config = plugin_config @@ -67,7 +67,7 @@ def setup(self) -> None: plugin_config = dict(tp_size=1, pp_size=1, precision="bf16", zero_stage=2) if self.plugin_config.get("pp_size", 1) > 1 and "num_microbatches" not in self.plugin_config: - plugin_config["microbatch_size"] = self.microbatch_size + plugin_config["microbatch_size"] = self.minibatch_size plugin_config.update(self.plugin_config) self.plugin = HybridParallelPlugin(**plugin_config) self.booster = Booster(plugin=self.plugin) @@ -105,18 +105,26 @@ def loop(self) -> None: ) ) ) - while len(self.buffer) >= self.dp_size * self.microbatch_size: + while len(self.buffer) >= self.dp_size * self.minibatch_size: batches = self.buffer[ - self.dp_rank * self.microbatch_size : (self.dp_rank + 1) * self.microbatch_size + self.dp_rank * self.minibatch_size : (self.dp_rank + 1) * self.minibatch_size ] - self.buffer = self.buffer[self.dp_size * self.microbatch_size :] + batch = pad_batch( + batches + ) # when `imbs` is smaller than `tMbs`, samples may have differ in size, need to pad before stacking batch = bind_batch(batches) batch = post_recv(batch) - loss = self.step(i, **batch) + loss, num_excessive_prompts = self.step(i, pbar, **batch) + self.buffer = ( + self.buffer[ + (self.dp_rank + 1) * self.minibatch_size + - num_excessive_prompts : (self.dp_rank + 1) * self.minibatch_size + ] + + self.buffer[self.dp_size * self.minibatch_size :] + ) if loss is not None: pbar.set_postfix({"loss": loss}) i += 1 - assert len(self.buffer) == 0 if self.lr_scheduler is not None: self.lr_scheduler.step() if (step + 1) % self.save_interval == 0 or (step + 1) == self.num_update_per_episode: @@ -154,7 +162,9 @@ def __init__( batch_size, model_config, plugin_config, - microbatch_size=1, + minibatch_size=1, + save_interval: int = 100, + save_dir="./model", ): super().__init__( num_producers, @@ -168,7 +178,7 @@ def __init__( batch_size, model_config, plugin_config, - microbatch_size, + minibatch_size, ) path = model_config.pop("path") self.model = AutoModelForCausalLM.from_pretrained(path, **model_config) @@ -181,7 +191,7 @@ def setup(self): super().setup() self.model, self.optimizer, *_ = self.booster.boost(self.model, self.optimizer) - def step(self, step_idx: int, **kwargs) -> Optional[float]: + def 
step(self, step_idx: int, pbar: Any, **kwargs) -> Optional[float]: labels = kwargs["input_ids"].clone() labels[kwargs["attention_mask"] == 0] = -100 kwargs["labels"] = labels diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index e23254d1b07a..d4589b0e2909 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -1,18 +1,16 @@ -import json -import os +import warnings from contextlib import nullcontext -from typing import Optional +from typing import Any, Optional import ray import torch -import torch.distributed as dist import wandb from coati.distributed.consumer import BaseConsumer from coati.distributed.loss import PolicyLoss from coati.distributed.reward.reward_fn import math_reward_fn from coati.distributed.reward.verifiable_reward import VerifiableReward from coati.distributed.utils import calc_action_log_probs -from coati.trainer.utils import all_reduce_mean +from coati.trainer.utils import all_reduce_mean, all_reduce_sum from transformers import AutoModelForCausalLM, AutoTokenizer from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR @@ -34,13 +32,23 @@ def __init__( batch_size, model_config, plugin_config, - microbatch_size=1, + minibatch_size=1, num_generations=8, use_wandb=True, generate_config=None, - training_config={}, + grpo_config={}, project_name=None, + save_interval: int = 100, + save_dir="./model", ): + print(f"Using GRPO config: {grpo_config}") + if grpo_config.get("loss_variation", "sample_level") == "token_level": + if batch_size != minibatch_size: + warnings.warn( + f"Applied token_level loss, force overwrite mini-batch-size with batch-size: mini-batch-size: {minibatch_size}->{batch_size}", + UserWarning, + ) + minibatch_size = batch_size super().__init__( num_producers, num_episodes, @@ -53,36 +61,58 @@ def __init__( batch_size, model_config, plugin_config, - microbatch_size, + minibatch_size, + save_dir=save_dir, ) path = model_config.pop("path") self.policy_model = AutoModelForCausalLM.from_pretrained(path, **model_config) self.policy_model.train() self.policy_model.gradient_checkpointing_enable() - self.optimizer = HybridAdam(self.policy_model.parameters(), lr=training_config.get("lr", 1e-6)) + self.optimizer = HybridAdam(self.policy_model.parameters(), lr=grpo_config.get("lr", 1e-6)) self.accum_loss = torch.zeros(1, device=self.device) self.accum_reward = torch.zeros(1, device=self.device) self.accum_kl = torch.zeros(1, device=self.device) - self.accum_format_reward = torch.zeros(1, device=self.device) - self.accum_acc_reward = torch.zeros(1, device=self.device) + self.accum_format_acc = torch.zeros(1, device=self.device) + self.accum_ans_acc = torch.zeros(1, device=self.device) self.accum_advantages = torch.zeros(1, device=self.device) self.accum_response_length = torch.zeros(1, device=self.device) self.accum_count = 0 self.generate_config = generate_config - self.training_config = training_config + self.grpo_config = grpo_config self.project_name = project_name + self.effective_sample_count = 0 + self.total_sample_count = 0 + + self.policy_loss_fn = PolicyLoss( + clip_eps_low=grpo_config.get("clip_eps_low", 0.2), + clip_eps_high=grpo_config.get("clip_eps_high", 0.2), + beta=grpo_config.get("beta", 0.01), + loss_variation=grpo_config.get("loss_variation", "sample_level"), + ) # Reference model is initialized from policy model. 
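The clip_eps_low / clip_eps_high pair configured above lets the lower and upper clipping bounds of the PPO-style surrogate differ (DAPO's clip-higher idea). A rough sketch of that objective under the stated assumptions; it is not the repository's PolicyLoss implementation and ignores the KL term and the sample/token-level loss_variation switch:

```python
import torch

def clipped_surrogate(log_probs, old_log_probs, advantages, clip_eps_low=0.2, clip_eps_high=0.2):
    """Per-token clipped surrogate with independent lower/upper clip bounds."""
    ratio = torch.exp(log_probs - old_log_probs)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_eps_low, 1.0 + clip_eps_high) * advantages
    # Negative sign because the trainer minimizes a loss; raising clip_eps_high
    # above clip_eps_low relaxes clipping for tokens whose probability grows.
    return -torch.min(surr1, surr2)

# Toy per-token tensors, shape [batch, response_length].
loss = clipped_surrogate(torch.randn(2, 5), torch.randn(2, 5), torch.randn(2, 5))
print(loss.shape)  # torch.Size([2, 5])
```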
- self.reference_model = AutoModelForCausalLM.from_pretrained(path, **model_config) - self.reference_model.eval() + if self.policy_loss_fn.beta > 0: + self.reference_model = AutoModelForCausalLM.from_pretrained(path, **model_config) + self.reference_model.eval() self.tokenizer = AutoTokenizer.from_pretrained(path) self.pad_token_id = self.tokenizer.pad_token_id self.num_generations = num_generations - self.filter_range = training_config.get("filter_range", None) + self.filter_range = grpo_config.get("filter_range", None) if self.filter_range is not None: assert len(self.filter_range) == 2, "Filter range should have 2 values." + self.filter_truncated_response = grpo_config.get("filter_truncated_response", False) + if self.filter_truncated_response: + self.max_length = 0 + if "max_tokens" in self.generate_config: + self.max_length = self.generate_config["max_tokens"] + elif "max_new_tokens" in self.generate_config: + self.max_length = self.generate_config["max_new_tokens"] + else: + raise ValueError( + "either max_tokens (vllm) or max_new_tokens (transformers) must be set in generate_config." + ) # Initialize verifiable reward. response_format_tags = { "think_start": {"text": "", "num_occur": 1}, @@ -90,11 +120,12 @@ def __init__( "answer_start": {"text": "", "num_occur": 1}, "answer_end": {"text": "", "num_occur": 1}, } + reward_model_kwargs = { + k: v for k, v in grpo_config.items() if k in ["soft_over_length_punishment", "max_length", "cache_length"] + } self.reward_model = VerifiableReward( - reward_fns=[math_reward_fn], tokenizer=self.tokenizer, tags=response_format_tags + reward_fns=[math_reward_fn], tokenizer=self.tokenizer, tags=response_format_tags, **reward_model_kwargs ) - - self.policy_loss_fn = PolicyLoss() self.global_step = 0 self.use_wandb = use_wandb @@ -102,7 +133,7 @@ def __init__( optimizer=self.optimizer, total_steps=min(self.num_episodes, 4) * self.num_update_per_episode, warmup_steps=0, - eta_min=0.1 * training_config.get("lr", 1e-6), + eta_min=0.1 * grpo_config.get("lr", 1e-6), ) def setup(self): @@ -118,10 +149,11 @@ def setup(self): self.policy_model, self.optimizer, _, _, self.lr_scheduler = self.booster.boost( self.policy_model, self.optimizer, lr_scheduler=self.lr_scheduler ) - self.reference_model, *_ = self.booster.boost(self.reference_model) + if self.policy_loss_fn.beta > 0: + self.reference_model, *_ = self.booster.boost(self.reference_model) self.plugin.logger.set_level("ERROR") - def step(self, step_idx: int, **kwargs) -> Optional[float]: + def step(self, step_idx: int, pbar: Any, **kwargs) -> Optional[float]: """ Step data from policy model: [{ @@ -132,18 +164,108 @@ def step(self, step_idx: int, **kwargs) -> Optional[float]: }, ...] Format: - [batch_size, num_of_generation, prompt_length + response_length] --- ............. + [minibatch_size, num_of_generation, prompt_length + response_length] --- ............. 
""" - # Reshape to [batch_size x num_of_generation, prompt_length + response_length] + # Reshape to [minibatch_size x num_of_generation, prompt_length + response_length] data = {k: v.view(-1, v.size(-1)) for k, v in kwargs.items()} action_mask = data["action_mask"] num_action = action_mask.shape[1] old_action_log_probs = data["action_log_probs"] response_length = torch.sum(action_mask, dim=1).to(torch.float32) - forward_batch_size = self.training_config.get("train_microbatch_size", data["input_ids"].size(0)) + train_microbatch_size = self.grpo_config.get("train_microbatch_size", data["input_ids"].size(0)) + + reward_group = self.reward_model( + data["input_ids"], + gt_answer=data["gt_answer"], + response_idx=data["response_idx"], + ) + + reward = torch.tensor([value[0] for value in reward_group]).to(data["input_ids"].device) + format_acc = torch.tensor([value[1] for value in reward_group]).to(data["input_ids"].device) + ans_acc = torch.tensor([value[2] for value in reward_group]).to(data["input_ids"].device) + + # [minibatch_size, num_generations] + + group_reward = reward.view(-1, self.num_generations) + reward_mean = group_reward.mean(dim=1) + # [minibatch_size x num_generations] + reward_mean = reward_mean.repeat_interleave(self.num_generations, dim=0) + + reward_std = group_reward.std(dim=1).repeat_interleave(self.num_generations, dim=0) + # [minibatch_size x num_generations] + advantages = ((reward - reward_mean) / (reward_std + 1e-4)).unsqueeze(dim=-1) + # filter out the reward that is too high (all sample gets full score) or too low (all sample gets 0 score), + group_ans_acc = ( + ans_acc.view(-1, self.num_generations).mean(dim=1).repeat_interleave(self.num_generations, dim=0) + ) + loss_mask = ( + torch.ones(action_mask.size(0), device=action_mask.device).bool() + if self.filter_range is None + else torch.logical_and(group_ans_acc > self.filter_range[0], group_ans_acc < self.filter_range[1]) + ) + # filter out overlength samples + if self.filter_truncated_response and action_mask.size(1) == self.max_length: + loss_mask = torch.logical_and( + loss_mask, + action_mask[:, -1] == False, + ) + effective_tokens_count = torch.sum(action_mask, dim=-1) * loss_mask + effective_samples = all_reduce_sum(torch.sum(loss_mask), self.plugin) + total_effective_tokens_count = all_reduce_sum(torch.sum(effective_tokens_count), self.plugin) + total_samples = all_reduce_sum(torch.sum(torch.ones_like(loss_mask, device=loss_mask.device)), self.plugin) + self.effective_sample_count += effective_samples.item() + self.total_sample_count += total_samples.item() + + mean_kl, mean_loss = [], [] + + if self.grpo_config.get("dynamic_batching", True): + need_update = self.effective_sample_count >= self.batch_size * self.dp_size * self.num_generations + # to exclude the excessive samples from the last batch, the last num_excessive_samples samples are not used for training and will be kept in buffer for the next iteration. 
+ num_excessive_samples = ( + int( + (self.effective_sample_count - self.batch_size * self.dp_size * self.num_generations) + / self.num_generations + / self.dp_size + ) + * self.num_generations + ) + if num_excessive_samples > 0: + data = { + k: ( + v[: -num_excessive_samples if num_excessive_samples != 0 else v.size(0)] + if k + in [ + "input_ids", + "attention_mask", + "action_log_probs", + "action_mask", + "response_idx", + "gt_answer", + ] + else v + ) + for k, v in data.items() + } + action_mask = action_mask[ + : -num_excessive_samples if num_excessive_samples != 0 else action_mask.size(0) + ] + loss_mask = loss_mask[: -num_excessive_samples if num_excessive_samples != 0 else loss_mask.size(0)] + advantages = advantages[: -num_excessive_samples if num_excessive_samples != 0 else advantages.size(0)] + else: + num_excessive_samples = 0 + else: + # If dynamic batching is disabled, we need to use all samples for training. + need_update = (step_idx + 1) % self.num_microbatches == 0 + num_excessive_samples = 0 + + pbar.set_postfix( + { + "Step": self.global_step + 1, + "Status": f"Collecting: {self.effective_sample_count}/{self.batch_size * self.dp_size * self.num_generations}", + } + ) - need_update = (step_idx + 1) % self.num_microbatches == 0 # Gradient must be synchronized if zero2 is enabled. https://github.com/hpcaitech/ColossalAI/blob/44d4053fec005fe0b06b6bc755fdc962463145df/colossalai/booster/plugin/hybrid_parallel_plugin.py#L1500 ctx = ( nullcontext() @@ -151,95 +273,71 @@ def step(self, step_idx: int, **kwargs) -> Optional[float]: else self.booster.no_sync(self.policy_model, self.optimizer) ) with ctx: - reward_group = self.reward_model( - data["input_ids"], gt_answer=data["gt_answer"], response_idx=data["response_idx"] - ) - - reward = torch.tensor([value[0] for value in reward_group]).to(data["input_ids"].device) - format_reward = torch.tensor([value[1] for value in reward_group]).to(data["input_ids"].device) - acc_reward = torch.tensor([value[2] for value in reward_group]).to(data["input_ids"].device) - - # [batch_size, num_generations] - - group_reward = reward.view(-1, self.num_generations) - reward_mean = group_reward.mean(dim=1) - # [batch_size x num_generations] - reward_mean = reward_mean.repeat_interleave(self.num_generations, dim=0) - reward_std = group_reward.std(dim=1).repeat_interleave(self.num_generations, dim=0) - # [batch_size x num_generations] - advantages = ((reward - reward_mean) / (reward_std + 1e-4)).unsqueeze(dim=-1) - # filter out the reward that is too high (all sample gets full score) or too low (all sample gets 0 score), - loss_mask = ( - None - if self.filter_range is None - else torch.logical_and( - reward_mean > self.filter_range[0], reward_mean < self.filter_range[1] - ).repeat_interleave(self.num_generations, dim=0) - ) - mean_kl, mean_loss = [], [] - - for forward_micro_batch_start in range(0, data["input_ids"].size(0), forward_batch_size): + for forward_micro_batch_start in range(0, data["input_ids"].size(0), train_microbatch_size): input_ids_forward_micro_batch = data["input_ids"][ - forward_micro_batch_start : forward_micro_batch_start + forward_batch_size + forward_micro_batch_start : forward_micro_batch_start + train_microbatch_size ] attention_mask_forward_micro_batch = data["attention_mask"][ - forward_micro_batch_start : forward_micro_batch_start + forward_batch_size + forward_micro_batch_start : forward_micro_batch_start + train_microbatch_size ] action_mask_forward_micro_batch = action_mask[ - forward_micro_batch_start : 
forward_micro_batch_start + forward_batch_size + forward_micro_batch_start : forward_micro_batch_start + train_microbatch_size ] loss_mask_forward_micro_batch = ( - loss_mask[forward_micro_batch_start : forward_micro_batch_start + forward_batch_size] + loss_mask[forward_micro_batch_start : forward_micro_batch_start + train_microbatch_size] if loss_mask is not None else None ) advantages_forward_micro_batch = advantages[ - forward_micro_batch_start : forward_micro_batch_start + forward_batch_size + forward_micro_batch_start : forward_micro_batch_start + train_microbatch_size ] if self.plugin.pp_size > 1: # Support training with PP. - - with torch.no_grad(): - reference_model_outputs = self.booster.execute_pipeline( - iter( - [ - { - "input_ids": input_ids_forward_micro_batch, - "attention_mask": attention_mask_forward_micro_batch, - } - ] - ), - self.reference_model, - criterion=lambda outputs, inputs: torch.tensor( - [0.0], device=action_mask.device - ), # dummy criterion - optimizer=None, - return_loss=False, - return_outputs=True, - ) - - if self.booster.plugin.stage_manager.is_last_stage(): - reference_model_logits = reference_model_outputs["outputs"]["logits"] - reference_action_log_probs = calc_action_log_probs( - reference_model_logits / self.generate_config["temperature"], - input_ids_forward_micro_batch, - num_action, - self.plugin.shard_config, - ) + if self.policy_loss_fn.beta > 0: + with torch.no_grad(): + reference_model_outputs = self.booster.execute_pipeline( + iter( + [ + { + "input_ids": input_ids_forward_micro_batch, + "attention_mask": attention_mask_forward_micro_batch, + } + ] + ), + self.reference_model, + criterion=lambda outputs, inputs: torch.tensor( + [0.0], device=action_mask.device + ), # dummy criterion + optimizer=None, + return_loss=False, + return_outputs=True, + ) + + if self.booster.plugin.stage_manager.is_last_stage(): + reference_model_logits = reference_model_outputs["outputs"]["logits"] + reference_action_log_probs = calc_action_log_probs( + reference_model_logits / self.generate_config["temperature"], + input_ids_forward_micro_batch, + num_action, + self.plugin.shard_config, + ) + else: + # Dummy reference logprobs for data iterator. + reference_action_log_probs = None else: - # Dummy reference logprobs for data iterator. 
reference_action_log_probs = None data_policy_forward = { "input_ids": input_ids_forward_micro_batch, "attention_mask": attention_mask_forward_micro_batch, "action_mask": action_mask_forward_micro_batch, - "reference_action_log_probs": reference_action_log_probs, "advantages": advantages_forward_micro_batch, "loss_mask": loss_mask_forward_micro_batch, "source": self.rank, } + if reference_action_log_probs is not None: + data_policy_forward["reference_action_log_probs"] = reference_action_log_probs kl = [] @@ -251,24 +349,30 @@ def _criterion(outputs, inputs): num_action, self.plugin.shard_config, ) - per_token_kl = ( - torch.exp(inputs["reference_action_log_probs"] - action_log_probs) - - (inputs["reference_action_log_probs"] - action_log_probs) - - 1 - ) - appox_kl = torch.sum(per_token_kl * inputs["action_mask"], dim=-1) / torch.sum( - inputs["action_mask"], dim=-1 - ) - kl.append(appox_kl.mean()) - loss, skip_update, _ = self.policy_loss_fn( + if "reference_action_log_probs" in inputs: + per_token_kl = ( + torch.exp(inputs["reference_action_log_probs"] - action_log_probs) + - (inputs["reference_action_log_probs"] - action_log_probs) + - 1 + ) + appox_kl = torch.sum(per_token_kl * inputs["action_mask"], dim=-1) / torch.sum( + inputs["action_mask"], dim=-1 + ) + kl.append(appox_kl.mean()) + else: + per_token_kl = 0.0 + kl.append(0.0) + + loss, _ = self.policy_loss_fn( action_log_probs, action_log_probs, inputs["advantages"].repeat_interleave(action_log_probs.size(-1), dim=-1), per_token_kl, inputs["action_mask"], loss_mask=inputs["loss_mask"], + total_effective_tokens_in_batch=total_effective_tokens_count, ) - return loss + return loss, num_excessive_samples // self.num_generations policy_model_outputs = self.booster.execute_pipeline( iter([data_policy_forward]), @@ -298,61 +402,71 @@ def _criterion(outputs, inputs): self.plugin.shard_config, ) - with torch.no_grad(): - reference_model_logits = self.reference_model( - input_ids=input_ids_forward_micro_batch, - attention_mask=attention_mask_forward_micro_batch, - ).logits - reference_action_log_probs = calc_action_log_probs( - reference_model_logits / self.generate_config["temperature"], - input_ids_forward_micro_batch, - num_action, - self.plugin.shard_config, - ) - per_token_kl = ( - torch.exp(reference_action_log_probs - action_log_probs) - - (reference_action_log_probs - action_log_probs) - - 1 - ) - kl = torch.sum(per_token_kl * action_mask_forward_micro_batch, dim=-1) / torch.sum( - action_mask_forward_micro_batch, dim=-1 - ) + if self.policy_loss_fn.beta > 0: + with torch.no_grad(): + reference_model_logits = self.reference_model( + input_ids=input_ids_forward_micro_batch, + attention_mask=attention_mask_forward_micro_batch, + ).logits + reference_action_log_probs = calc_action_log_probs( + reference_model_logits / self.generate_config["temperature"], + input_ids_forward_micro_batch, + num_action, + self.plugin.shard_config, + ) + per_token_kl = ( + torch.exp(reference_action_log_probs - action_log_probs) + - (reference_action_log_probs - action_log_probs) + - 1 + ) + kl = torch.sum(per_token_kl * action_mask_forward_micro_batch, dim=-1) / torch.sum( + action_mask_forward_micro_batch, dim=-1 + ) + else: + per_token_kl = 0.0 + kl = None - loss, skip_update, _ = self.policy_loss_fn( + loss, _ = self.policy_loss_fn( action_log_probs, old_action_log_probs, advantages_forward_micro_batch.repeat_interleave(action_log_probs.size(-1), dim=-1), per_token_kl, action_mask_forward_micro_batch, loss_mask=loss_mask_forward_micro_batch, + 
total_effective_tokens_in_batch=total_effective_tokens_count, ) - if not skip_update: - self.booster.backward(loss, self.optimizer) + self.booster.backward(loss, self.optimizer) loss = all_reduce_mean(loss, self.plugin) - kl = all_reduce_mean(kl.mean(), self.plugin) # Calculate accumulate value. - mean_kl.append(kl.data) + if kl is not None: + kl = all_reduce_mean(kl.mean(), self.plugin) + mean_kl.append(kl.data) mean_loss.append(loss.data) if not self.plugin.pp_size > 1 or ( self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() and self.tp_rank == 0 ): reward = all_reduce_mean(reward.mean(), self.plugin) - format_reward = all_reduce_mean(format_reward.mean(), self.plugin) - acc_reward = all_reduce_mean(acc_reward.mean(), self.plugin) + format_acc = all_reduce_mean(format_acc.mean(), self.plugin) + ans_acc = all_reduce_mean(ans_acc.mean(), self.plugin) advantages = all_reduce_mean(advantages.mean(), self.plugin) response_length = all_reduce_mean(response_length.mean(), self.plugin) self.accum_loss.add_(sum(mean_loss) / len(mean_loss)) - self.accum_kl.add_(sum(mean_kl) / len(mean_kl)) + if self.policy_loss_fn.beta > 0: + self.accum_kl.add_(sum(mean_kl) / len(mean_kl)) self.accum_reward.add_(reward.data) - self.accum_format_reward.add_(format_reward.data) - self.accum_acc_reward.add_(acc_reward.data) + self.accum_format_acc.add_(format_acc.data) + self.accum_ans_acc.add_(ans_acc.data) self.accum_advantages.add_(advantages.data) self.accum_response_length.add_(response_length.data) self.accum_count += 1 if need_update: self.optimizer.step() self.optimizer.zero_grad() + self.global_step += 1 + sample_utilization = self.effective_sample_count / self.total_sample_count + self.effective_sample_count = 0 + self.total_sample_count = 0 if not self.plugin.pp_size > 1 or ( self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() and self.tp_rank == 0 ): @@ -360,167 +474,41 @@ def _criterion(outputs, inputs): if (not self.plugin.pp_size > 1 and self.rank == 0) or ( self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() and self.tp_rank == 0 ): - print( - "Loss:", - self.accum_loss.item() / self.accum_count, - "\nReward:", - self.accum_reward.item() / self.accum_count, - "\nFormat Reward:", - self.accum_format_reward.item() / self.accum_count, - "\nAcc Reward:", - self.accum_acc_reward.item() / self.accum_count, - "\nKL:", - self.accum_kl.item() / self.accum_count, - "\nAdvantages:", - self.accum_advantages.item() / self.accum_count, - "\nResponse Length:", - self.accum_response_length.item() / self.accum_count, - ) - self.wandb_run.log( - { - "metrics/reward": self.accum_reward.item() / self.accum_count, - "metrics/format_reward": self.accum_format_reward.item() / self.accum_count, - "metrics/acc_reward": self.accum_acc_reward.item() / self.accum_count, - "metrics/response_length": self.accum_response_length.item() / self.accum_count, - "train/loss": self.accum_loss.item() / self.accum_count, - "train/kl": self.accum_kl.item() / self.accum_count, - "train/advantages": self.accum_advantages.item() / self.accum_count, - "train/learning_rate": self.lr_scheduler.get_last_lr()[0], - "rollout/temperature": data["temperature"].cpu().numpy()[0][0], - } - ) + to_log_msg = [ + f"Loss: {self.accum_loss.item() / self.accum_count:.4f}", + f"Reward: {self.accum_reward.item() / self.accum_count:.4f}", + f"format Reward: {self.accum_format_acc.item() / self.accum_count:.4f}", + f"Acc Reward: {self.accum_ans_acc.item() / self.accum_count:.4f}", + 
f"Advantages: {self.accum_advantages.item() / self.accum_count:.4f}", + f"Response Length: {self.accum_response_length.item() / self.accum_count:.4f}", + ] + ([f"KL: {self.accum_kl.item() / self.accum_count:.4f}"] if self.policy_loss_fn.beta > 0 else []) + print("\n".join(to_log_msg)) + metrics = { + "metrics/reward": self.accum_reward.item() / self.accum_count, + "metrics/format_acc": self.accum_format_acc.item() / self.accum_count, + "metrics/ans_acc": self.accum_ans_acc.item() / self.accum_count, + "metrics/response_length": self.accum_response_length.item() / self.accum_count, + "train/loss": self.accum_loss.item() / self.accum_count, + "train/advantages": self.accum_advantages.item() / self.accum_count, + "train/learning_rate": self.lr_scheduler.get_last_lr()[0], + "train/sample_utilization": sample_utilization, + "rollout/temperature": data["temperature"].cpu().numpy()[0][0], + } + if self.policy_loss_fn.beta > 0: + metrics["train/kl"] = self.accum_kl.item() / self.accum_count + + self.wandb_run.log(metrics) self.accum_loss.zero_() self.accum_reward.zero_() - self.accum_acc_reward.zero_() - self.accum_format_reward.zero_() + self.accum_ans_acc.zero_() + self.accum_format_acc.zero_() self.accum_kl.zero_() self.accum_advantages.zero_() self.accum_response_length.zero_() - self.accum_count = 0 - return loss_scalar - - def state_dict(self): - self.policy_model._force_wait_all_gather() - model = self.policy_model.unwrap() - state_dict = model.state_dict() - return state_dict - - -@ray.remote -class GRPOEvalConsumer(BaseConsumer): - def __init__( - self, - num_producers, - num_episodes, - rank, - world_size, - master_addr, - master_port, - num_update_per_episode, - num_recv_per_update, - batch_size, - model_config, - plugin_config, - microbatch_size=1, - num_generations=4, - use_wandb=True, - log_dir="./results", - ): - super().__init__( - num_producers, - num_episodes, - rank, - world_size, - master_addr, - master_port, - num_update_per_episode, - num_recv_per_update, - batch_size, - model_config, - plugin_config, - microbatch_size, - ) - path = model_config.pop("path") - self.policy_model = AutoModelForCausalLM.from_pretrained(path, **model_config) - self.policy_model.train() - self.accum_reward = torch.zeros(1, device=self.device) - self.accum_format_reward = torch.zeros(1, device=self.device) - self.accum_acc_reward = torch.zeros(1, device=self.device) - self.accum_response_length = torch.zeros(1, device=self.device) - self.accum_count = torch.zeros(1, device=self.device) - - self.tokenizer = AutoTokenizer.from_pretrained(path) - self.pad_token_id = self.tokenizer.pad_token_id - self.num_generations = num_generations - - # Initialize verifiable reward. 
- response_format_tags = { - "think_start": {"text": "", "num_occur": 1}, - "think_end": {"text": "", "num_occur": 1}, - "answer_start": {"text": "", "num_occur": 1}, - "answer_end": {"text": "", "num_occur": 1}, - } - self.reward_model = VerifiableReward( - reward_fns=[math_reward_fn], tokenizer=self.tokenizer, tags=response_format_tags - ) - - self.log_dir = log_dir - if not os.path.exists(self.log_dir): - os.makedirs(self.log_dir) + return loss_scalar, num_excessive_samples // self.num_generations else: - os.system(f"rm -rf {self.log_dir}/*") - - def setup(self): - super().setup() - self.policy_model, _, *_ = self.booster.boost(self.policy_model) - - def step(self, step_idx: int, **kwargs) -> Optional[float]: - rank = dist.get_rank() - data = {k: v.view(-1, v.size(-1)).cpu() for k, v in kwargs.items()} - kwargs["input_ids"].size(0) - reward_group = self.reward_model( - data["input_ids"], gt_answer=data["gt_answer"], response_idx=data["response_idx"] - ) - reward = [value[0].item() for value in reward_group] - format_reward = [value[1].item() for value in reward_group] - acc_reward = [value[2].item() for value in reward_group] - response_length = [(data["response_idx"][i][1] - data["response_idx"][i][0]).item() for i in range(len(reward))] - - response = self.tokenizer.batch_decode(data["input_ids"], skip_special_tokens=True) - with open(f"{self.log_dir}/eval_results_rank_{rank}.jsonl", "a", encoding="utf8") as f: - for i in range(len(response)): - f.write( - json.dumps( - { - "response": response[i], - "reward": reward[i], - "format_reward": format_reward[i], - "acc_reward": acc_reward[i], - "response_length": response_length[i], - }, - ensure_ascii=False, - ) - + "\n" - ) - - self.accum_reward += sum(reward) - self.accum_format_reward += sum(format_reward) - self.accum_acc_reward += sum(acc_reward) - self.accum_response_length += sum(response_length) - self.accum_count += len(reward) - - # print results - total_count = all_reduce_mean(self.accum_count, self.plugin) - mean_reward = all_reduce_mean(self.accum_reward, self.plugin) / total_count - mean_format_reward = all_reduce_mean(self.accum_format_reward, self.plugin) / total_count - mean_acc_reward = all_reduce_mean(self.accum_acc_reward, self.plugin) / total_count - mean_response_length = all_reduce_mean(self.accum_response_length, self.plugin) / total_count - if rank == 0: - print( - f"Step {step_idx}: Mean Reward: {mean_reward}, Mean Format Reward: {mean_format_reward}, Mean Acc Reward: {mean_acc_reward}, Mean Response Length: {mean_response_length}" - ) - return None + return None, num_excessive_samples // self.num_generations def state_dict(self): self.policy_model._force_wait_all_gather() diff --git a/applications/ColossalChat/coati/distributed/inference_backend.py b/applications/ColossalChat/coati/distributed/inference_backend.py index 17c71c8a85b1..7d32cd52a41e 100644 --- a/applications/ColossalChat/coati/distributed/inference_backend.py +++ b/applications/ColossalChat/coati/distributed/inference_backend.py @@ -184,6 +184,7 @@ def load_state_dict(self, state_dict: Dict[str, torch.Tensor]) -> None: class VLLMInferenceBackend(BaseInferenceBackend): DEFAULT_MODEL_CONFIG = dict( trust_remote_code=True, + enable_sleep_mode=False, ) FORCE_GENERATE_CONFIG = dict( logprobs=0, @@ -205,6 +206,7 @@ def __init__( generate_config.update(self.FORCE_GENERATE_CONFIG) generate_config.update({"n": num_generations}) self.generate_config = SamplingParams(**generate_config) + self.model_config = model_config self.tokenizer = tokenizer 
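# Illustrative sketch, not from the patch itself: model_config is kept on the backend so the
# producer can check the new enable_sleep_mode flag around weight sync. A minimal sketch of
# that usage, assuming vLLM's LLM.sleep()/wake_up() API:
#
#   if self.model.model_config.get("enable_sleep_mode", False):
#       self.model.llm.sleep()     # free the KV cache before receiving new weights
#   ... broadcast and load the updated state dict ...
#   if self.model.model_config.get("enable_sleep_mode", False):
#       self.model.llm.wake_up()   # restore the engine before the next rollout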
self.num_generations = num_generations diff --git a/applications/ColossalChat/coati/distributed/launch.py b/applications/ColossalChat/coati/distributed/launch.py index 699d90a8cdff..b193dc8bc86d 100644 --- a/applications/ColossalChat/coati/distributed/launch.py +++ b/applications/ColossalChat/coati/distributed/launch.py @@ -4,10 +4,10 @@ import ray from .consumer import SimpleConsumer -from .grpo_consumer import GRPOConsumer, GRPOEvalConsumer +from .grpo_consumer import GRPOConsumer from .producer import SimpleProducer -ALGO_MAP = {"Simple": SimpleConsumer, "GRPO": GRPOConsumer, "EvalGRPO": GRPOEvalConsumer} +ALGO_MAP = {"Simple": SimpleConsumer, "GRPO": GRPOConsumer, "DAPO": GRPOConsumer} def get_jsonl_size_fast(path: str) -> int: @@ -40,6 +40,7 @@ def launch_distributed( inference_model_config: Dict[str, Any], generate_config: Dict[str, Any], train_model_config: Dict[str, Any], + grpo_config: Dict[str, Any], plugin_config: Dict[str, Any], tokenizer_config: Optional[Dict[str, Any]] = None, inference_backend: str = "transformers", @@ -48,6 +49,8 @@ def launch_distributed( master_port: int = 29500, core_algo: str = "GRPO", project_name: Optional[str] = None, + save_interval: int = 100, + save_dir: str = "./model", ): if core_algo not in ALGO_MAP: @@ -101,15 +104,13 @@ def launch_distributed( batch_size=train_batch_size, model_config=train_model_config, plugin_config=plugin_config, - microbatch_size=train_minibatch_size, + minibatch_size=train_minibatch_size, generate_config=generate_config_consumer, - training_config={ - "filter_range": [0.05, 9.0], - "lr": 1e-6, - "train_microbatch_size": train_microbatch_size, - }, + grpo_config=grpo_config, num_generations=num_generations, project_name=project_name, + save_interval=save_interval, + save_dir=save_dir, ) procs.append(consumer) ray.get([p.setup.remote() for p in procs]) diff --git a/applications/ColossalChat/coati/distributed/loss.py b/applications/ColossalChat/coati/distributed/loss.py index 90ad09736281..36057b24faf5 100644 --- a/applications/ColossalChat/coati/distributed/loss.py +++ b/applications/ColossalChat/coati/distributed/loss.py @@ -2,7 +2,7 @@ import torch import torch.nn as nn -from coati.distributed.utils import masked_mean +from coati.distributed.utils import masked_mean, masked_sum class PolicyLoss(nn.Module): @@ -10,11 +10,19 @@ class PolicyLoss(nn.Module): Policy Loss for PPO """ - def __init__(self, clip_eps: float = 0.2, skip_threshold: float = 20.0, beta: float = 0.01) -> None: + def __init__( + self, + clip_eps_low: float = 0.2, + clip_eps_high: float = 0.2, + beta: float = 0.01, + loss_variation: str = "sample_level", + ) -> None: super().__init__() - self.clip_eps = clip_eps - self.skip_threshold = skip_threshold + self.clip_eps_low = clip_eps_low + self.clip_eps_high = clip_eps_high self.beta = beta + self.loss_variation = loss_variation + assert loss_variation in ["sample_level", "token_level"], f"Unsupported loss variation: {loss_variation}" def forward( self, @@ -24,22 +32,37 @@ def forward( per_token_kl: torch.Tensor, action_mask: Optional[torch.Tensor] = None, loss_mask: Optional[torch.Tensor] = None, + total_effective_tokens_in_batch: torch.Tensor = None, ) -> torch.Tensor: - skip = False if action_mask is None: ratio = (log_probs - log_probs.detach()).exp() else: ratio = ((log_probs - log_probs.detach()) * action_mask).exp() surr1 = ratio * advantages - surr2 = ratio.clamp(1 - self.clip_eps, 1 + self.clip_eps) * advantages + surr2 = ratio.clamp(1 - self.clip_eps_low, 1 + self.clip_eps_high) * advantages + if 
self.beta == 0: + # skip kl term if kl coefficient is zero + per_token_kl = 0.0 loss = -torch.min(surr1, surr2) + self.beta * per_token_kl - if action_mask is not None: - loss = masked_mean(loss, action_mask) + if self.loss_variation == "sample_level": + if action_mask is not None: + loss = masked_mean(loss, action_mask) + else: + loss = loss.mean(dim=1) + if loss_mask is not None: + loss = loss * loss_mask + loss = loss.mean() + elif self.loss_variation == "token_level": + if action_mask is not None: + loss = masked_sum(loss, action_mask) + else: + loss = loss.sum(dim=1) + if loss_mask is not None: + loss = loss * loss_mask + loss = loss.sum() / (total_effective_tokens_in_batch + 1e-8) else: - loss = loss.mean(dim=1) - if loss_mask is not None: - loss = loss * loss_mask - loss = loss.mean() - return loss, skip, ratio.max() + raise ValueError(f"Unsupported loss variation: {self.loss_variation}") + + return loss, ratio.max() diff --git a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py index 2c6a24a36711..b0624f72ace3 100644 --- a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -103,7 +103,14 @@ def loop(self) -> None: print(f"[P{self.producer_idx}] Send data {[(k, v.shape) for k, v in outputs.items()]}") outputs["temperature"] = torch.tensor( - [self.model.generate_config.temperature] * outputs["input_ids"].size(0) + [ + ( + self.model.generate_config["temperature"] + if isinstance(self.model.generate_config.temperature, dict) + else self.model.generate_config.temperature + ) + ] + * outputs["input_ids"].size(0) ).to(outputs["input_ids"].device) outputs = pre_send(outputs) ray_broadcast_tensor_dict( @@ -113,10 +120,15 @@ def loop(self) -> None: if (i + 1) % self.num_microbatches == 0 and ( episode != self.num_episodes - 1 or i != num_valid_microbatches - 1 ): + if isinstance(self.model, BACKEND_MAP["vllm"]) and self.model.model_config.get( + "enable_sleep_mode", False + ): + self.model.llm.sleep() # revict KV_cache to avoid OOM # don't sync model for last iteration print( f"[P{self.producer_idx}] Sync model episode {episode} step {(i + 1) // self.num_microbatches - 1}" ) + torch.cuda.empty_cache() state_dict = ray_broadcast_tensor_dict( None, self.num_producers, device=self.device, group_name="sync_model" @@ -124,12 +136,21 @@ def loop(self) -> None: self.load_state_dict(state_dict) del state_dict torch.cuda.empty_cache() - # linear annealing for 1 episode, temperature from initial to 0.7 + if isinstance(self.model, BACKEND_MAP["vllm"]) and self.model.model_config.get( + "enable_sleep_mode", False + ): + self.model.llm.wake_up() + # linear annealing for 1 episode, temperature from initial to 0.9 if episode <= 0: ratio = 1 - (len(self.dataloader) - i) / len(self.dataloader) - self.model.generate_config.temperature = (1 - ratio) * self.generate_config[ - "temperature" - ] + ratio * 0.7 + if isinstance(self.model.generate_config.temperature, dict): + self.model.generate_config["temperature"] = (1 - ratio) * self.generate_config[ + "temperature" + ] + ratio * 0.9 + else: + self.model.generate_config.temperature = (1 - ratio) * self.generate_config[ + "temperature" + ] + ratio * 0.9 @ray.remote diff --git a/applications/ColossalChat/coati/distributed/reward/reward_fn.py b/applications/ColossalChat/coati/distributed/reward/reward_fn.py index 53bc15e25a8c..fd91291301c4 100644 --- a/applications/ColossalChat/coati/distributed/reward/reward_fn.py +++ 
b/applications/ColossalChat/coati/distributed/reward/reward_fn.py @@ -4,13 +4,22 @@ def math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): - format_score = 1.0 - acc_score = 9.0 tokenizer = kwargs["tokenizer"] + soft_over_length_punishment = kwargs.get("soft_over_length_punishment", False) + acc_score = 10.0 reward = torch.tensor(0.0) - format_reward = torch.tensor(0.0) - acc_reward = torch.tensor(0.0) + format_acc = torch.tensor(0.0) + ans_acc = torch.tensor(0.0) s, e = response_idx[0], response_idx[1] + + length_reward = 0.0 + if soft_over_length_punishment: + max_length = kwargs.get("max_length", 1024 * 4) + cache_length = kwargs.get("cache_length", 512) + res_length = e.item() - s.item() + 1 + if max_length - cache_length < res_length < max_length: + length_reward = ((max_length - cache_length) - res_length) / cache_length * acc_score + if gt_answer is None: return reward @@ -22,18 +31,20 @@ def math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): # Check format accuracy if format_valid: - format_reward += format_score - reward += format_score + format_acc += 1 - # Check answer accuracy + # Check answer accuracy, answer is considered correct if the answer is correct and the format is valid if ( - final_answer is not None + format_valid + and final_answer is not None and gt_answer.strip().replace(" ", "").lower() == final_answer.strip().replace(" ", "").lower() ): - acc_reward += acc_score + ans_acc += 1 reward += acc_score - return torch.tensor([reward, format_reward, acc_reward]).to(input_ids.device) + reward = reward + length_reward + + return torch.tensor([reward, format_acc, ans_acc]).to(input_ids.device) def gsm8k_reward_fn(input_ids, **kwargs): diff --git a/applications/ColossalChat/coati/distributed/utils.py b/applications/ColossalChat/coati/distributed/utils.py index 919e4434faa6..ce4923dc493e 100644 --- a/applications/ColossalChat/coati/distributed/utils.py +++ b/applications/ColossalChat/coati/distributed/utils.py @@ -1,3 +1,4 @@ +from collections import defaultdict from typing import Any, Dict, List import torch @@ -26,6 +27,27 @@ def bind_batch(batches: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor return batch +def pad_batch(batches: List[Dict[str, torch.Tensor]], tokenizer: Any = None) -> List[Dict[str, torch.Tensor]]: + max_len = defaultdict(int) + for sample in batches: + for k in sample: + if k in ["input_ids", "attention_mask", "action_log_probs", "action_mask"]: + max_len[k] = max(max_len[k], sample[k].size(-1)) + for idx, sample in enumerate(batches): + for k in sample: + if k in ["input_ids", "attention_mask", "action_log_probs", "action_mask"]: + # right pad with 0s + if k in ["attention_mask", "action_mask"]: + batches[idx][k] = torch.nn.functional.pad( + batches[idx][k], (0, max_len[k] - batches[idx][k].size(-1)), "constant", False + ) + else: + batches[idx][k] = torch.nn.functional.pad( + batches[idx][k], (0, max_len[k] - batches[idx][k].size(-1)), "constant", 0 + ) + return batches + + def pre_send(batch: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: # compress mask to save bandwidth if "attention_mask" in batch: @@ -113,3 +135,20 @@ def masked_mean(tensor: torch.Tensor, mask: torch.Tensor, dim: int = 1) -> torch mask_sum = mask.sum(dim=dim) mean = tensor / (mask_sum + 1e-8) return mean + + +def masked_sum(tensor: torch.Tensor, mask: torch.Tensor, dim: int = 1) -> torch.Tensor: + """ + Compute the masked sum of a tensor along a specified dimension. + + Args: + tensor (torch.Tensor): The input tensor. 
+ mask (torch.Tensor): The mask tensor with the same shape as the input tensor. + dim (int, optional): The dimension along which to compute the sum. Default is 1. + + Returns: + torch.Tensor: The masked sum tensor. + + """ + tensor = tensor * mask + return tensor.sum(dim=dim) diff --git a/applications/ColossalChat/coati/trainer/utils.py b/applications/ColossalChat/coati/trainer/utils.py index 22a5f492ebd2..5153ce3adad5 100755 --- a/applications/ColossalChat/coati/trainer/utils.py +++ b/applications/ColossalChat/coati/trainer/utils.py @@ -128,7 +128,7 @@ def all_reduce_mean(tensor: torch.Tensor, plugin: Plugin = None) -> torch.Tensor return tensor -def all_reduce_sum(tensor: torch.Tensor) -> torch.Tensor: +def all_reduce_sum(tensor: torch.Tensor, plugin: Plugin = None) -> torch.Tensor: """ Performs an all-reduce operation to sum the values of the given tensor across all processes. @@ -138,5 +138,9 @@ def all_reduce_sum(tensor: torch.Tensor) -> torch.Tensor: Returns: torch.Tensor: The reduced tensor with the sum of values across all processes. """ - dist.all_reduce(tensor=tensor, op=dist.ReduceOp.SUM) + # All reduce sum across DP group + if plugin is not None: + dist.all_reduce(tensor=tensor, op=dist.ReduceOp.SUM, group=plugin.dp_group) + else: + dist.all_reduce(tensor=tensor, op=dist.ReduceOp.SUM) return tensor diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index f42a660b78ff..9c0ec79224be 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -1,4 +1,5 @@ import argparse +import os import ray import torch @@ -8,15 +9,17 @@ parser = argparse.ArgumentParser() parser.add_argument("-m", "--model", type=str, default="Qwen/Qwen2.5-7B") parser.add_argument("-d", "--dataset", type=str, default="data.jsonl") + parser.add_argument("-p", "--project", type=str, default="GRPO", help="Project name.") + + # Distributed training parameters parser.add_argument("-t", "--num-trainers", type=int, default=2) parser.add_argument("-i", "--num-inferencer", type=int, default=2) parser.add_argument("-g", "--num-generations", type=int, default=8, help="Number of generations per prompt.") - parser.add_argument("-p", "--project", type=str, default="GRPO", help="Project name.") parser.add_argument( "-ibs", "--inference-batch-size", type=int, - default=64, + default=None, help="Number of prompts to generate per inference step. It should be divisible by tbs, and the weights on the inference backend will be synced every ibs/tbs training steps of the policy model.", ) parser.add_argument( @@ -37,7 +40,7 @@ "-tMbs", "--train-minibatch-size", type=int, - default=1, + default=None, help="Number of unique prompts in each training batch per dp group. The inference backend must generate tMbs * g * dp_size samples before forwarding. Satisfy tMbs * g >= tmbs", ) parser.add_argument( @@ -47,22 +50,78 @@ default=2, help="Effective batch size per dp group for forwarding and backwarding. 
Please select based on the availiable memory.", ) + parser.add_argument( + "--ray_dir", type=str, default=None, help="Custom temperary directory for storing ray cluster data, Optional" + ) + parser.add_argument( + "--master_address", type=str, default=None, help="Master address for multi-node distributed training, Optional" + ) + parser.add_argument( + "--master_port", type=int, default=29506, help="Master port for multi-node distributed training, Optional" + ) + + # Sampling parameters parser.add_argument("-b", "--backend", type=str, default="transformers", choices=["transformers", "vllm"]) - parser.add_argument("-a", "--algo", type=str, default="GRPO", choices=["Simple", "GRPO", "EvalGRPO"]) + parser.add_argument("-temp", "--temperature", type=float, default=1.0, help="Temperature for sampling.") + parser.add_argument( + "-topk", + "--top-k", + type=int, + default=None, + help="Top k for sampling. Please check the generation arguments documentation for your backend.", + ) + parser.add_argument( + "-topp", + "--top-p", + type=float, + default=1.0, + help="Top p for sampling. Please check the generation arguments documentation for your backend.", + ) parser.add_argument("-s", "--system-prompt", type=str, default=None, help="System prompt for data construction.") + parser.add_argument("-mnt", "--max-new-tokens", type=int, default=1024 * 4 - 512, help="Max length for generation.") + parser.add_argument("-mpt", "--max-prompt-tokens", type=int, default=512, help="Max length for prompt.") + + # GRPO parameters + parser.add_argument("-a", "--algo", type=str, default="GRPO", choices=["DAPO", "GRPO"]) + parser.add_argument("-lr", "--learning-rate", type=float, default=1e-6, help="Learning rate for GRPO.") + parser.add_argument("-kl", "--kl-coeff", type=float, default=0.01, help="KL penalty coefficient for GRPO.") + + # Logging/Checkpointing parameters + parser.add_argument("-si", "--save-interval", type=int, default=100, help="Interval for saving checkpoints.") + parser.add_argument("-sd", "--save-dir", type=str, default="./model", help="Directory for saving checkpoints.") + args = parser.parse_args() - assert args.train_minibatch_size > 0, "Train mini batch size must be greater than 0" + if args.train_minibatch_size is None: + # Default settings: Using train batch size as mini batch size + args.train_minibatch_size = args.train_batch_size + if args.inference_batch_size is None: + # Default settings: Using train batch size as inference batch size, sync every inference model every train step + args.inference_batch_size = args.train_batch_size assert ( args.train_minibatch_size * args.num_generations >= args.train_microbatch_size and args.train_microbatch_size > 0 ), "Train micro batch size must be greater than 0 less than train mini batch size * num generations" + assert ( + args.train_minibatch_size <= args.train_batch_size + ), "Train mini batch size must be less than or equals to train batch size" - ray.init(address="local", namespace="ray-example") + if args.master_address is None: + # Default settings: Using single machine + ray.init(address="local", namespace="ray-example") + else: + # For ray distributed multi-machine training, Please change _node_ip_address to your IP address of your master node + ray.init(_node_ip_address=args.master_address, namespace="ray-example", _temp_dir=args.ray_dir) + + if args.top_k is None: + if args.backend == "transformers": + args.top_k = 50 + elif args.backend == "vllm": + args.top_k = -1 inference_model_config = dict(path=args.model) train_model_config = 
dict(path=args.model, use_flash_attention_2=True, use_cache=False) - generate_config = dict(top_k=50, top_p=0.75, temperature=0.9) + generate_config = dict(top_k=args.top_k, top_p=args.top_p, temperature=args.temperature) if args.backend == "transformers": inference_model_config.update( @@ -73,7 +132,7 @@ ) generate_config.update( dict( - max_length=1024 + 512, + max_length=args.max_new_tokens + args.max_prompt_tokens, do_sample=True, max_new_tokens=None, early_stopping=False, @@ -81,31 +140,57 @@ ) ) elif args.backend == "vllm": - inference_model_config.update(dict(gpu_memory_utilization=0.7, enforce_eager=True, enable_chunked_prefill=True)) - generate_config.update( - dict( - max_tokens=2048, - ignore_eos=True, - include_stop_str_in_output=True, - stop=[""], - ) - ) - else: inference_model_config.update( dict( - mem_fraction_static=0.6, + gpu_memory_utilization=0.7, + enforce_eager=True, + enable_chunked_prefill=True, + max_model_len=args.max_new_tokens + args.max_prompt_tokens, + tensor_parallel_size=1, ) ) generate_config.update( dict( - max_new_tokens=256, + max_tokens=args.max_new_tokens, # max new tokens ignore_eos=True, + include_stop_str_in_output=True, + stop=[""], ) ) + else: + raise ValueError(f"Unsupported backend: {args.backend}") + + if args.algo == "GRPO": + # Default Settings + grpo_config = { + "lr": args.learning_rate, + "train_microbatch_size": args.train_microbatch_size, + "beta": args.kl_coeff, # KL penalty coefficient + "loss_variation": "sample_level", + } + elif args.algo == "DAPO": + # DAPO variant settings + grpo_config = { + "filter_range": [0.01, 0.99], # only filter out all zero batch and all one batch + "lr": args.learning_rate, + "train_microbatch_size": args.train_microbatch_size, + "dynamic_batching": True, + "clip_eps_low": 0.2, + "clip_eps_high": 0.28, + "skip_threshold": 20.0, + "beta": 0, # no KL penalty for DAPO + "loss_variation": "token_level", + "soft_over_length_punishment": True, + "max_length": args.max_new_tokens + args.max_prompt_tokens, + "cache_length": min(1024, int(args.max_new_tokens / 4)), + "filter_truncated_response": True, + } + else: + raise ValueError(f"Unsupported algorithm: {args.algo}") launch_distributed( num_producers=args.num_inferencer, - num_proc_per_producer=1, + num_proc_per_producer=inference_model_config.get("tensor_parallel_size", 1), num_consumer_procs=args.num_trainers, num_episodes=1, inference_batch_size=args.inference_batch_size, @@ -113,15 +198,22 @@ train_batch_size=args.train_batch_size, train_minibatch_size=args.train_minibatch_size, train_microbatch_size=args.train_microbatch_size, - dataset_config={"path": args.dataset, "max_length": 300, "system_prompt": args.system_prompt}, + dataset_config={ + "path": args.dataset, + "max_length": args.max_prompt_tokens, + "system_prompt": args.system_prompt, + }, dataloaders_config={}, inference_model_config=inference_model_config, generate_config=generate_config, num_generations=args.num_generations, train_model_config=train_model_config, - plugin_config={}, # Default setting: zero. 
+ grpo_config=grpo_config, + plugin_config={ + "zero_stage": 2, + }, # for zero + # currently not support tp/pp # plugin_config={ - # "pp_size": 2, # "tp_size": 2, # "microbatch_size": args.train_microbatch_size // 2, # "zero_stage": 0, @@ -129,7 +221,9 @@ # }, # for pp inference_backend=args.backend, master_addr="localhost", - master_port=29506, + master_port=args.master_port, core_algo=args.algo, project_name=args.project, + save_interval=args.save_interval, + save_dir=os.path.join(args.save_dir, args.project.replace(" ", "_")), ) From 38008858e4bf5409d58374ada0bb4178bffae061 Mon Sep 17 00:00:00 2001 From: YeAnbang <44796419+YeAnbang@users.noreply.github.com> Date: Sat, 26 Apr 2025 14:00:28 +0800 Subject: [PATCH 041/100] fix checkpoint naming; add num_epoch parameter (#6277) --- applications/ColossalChat/coati/distributed/consumer.py | 2 +- applications/ColossalChat/rl_example.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index 28d83fa40588..47f08cc0bc14 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -130,7 +130,7 @@ def loop(self) -> None: if (step + 1) % self.save_interval == 0 or (step + 1) == self.num_update_per_episode: if self.rank == 0: print(f"Start saving policy model at step {step + 1}.") - save_path = os.path.join(self.save_dir, f"modeling-step-{step + 1}") + save_path = os.path.join(self.save_dir, f"modeling-episode-{episode}-step-{step + 1}") self.booster.save_model(self.policy_model, save_path, shard=True) if self.rank == 0: print(f"Saved model checkpoint at step {step + 1} in folder {save_path}") diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index 9c0ec79224be..b20e9dc38973 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -10,6 +10,7 @@ parser.add_argument("-m", "--model", type=str, default="Qwen/Qwen2.5-7B") parser.add_argument("-d", "--dataset", type=str, default="data.jsonl") parser.add_argument("-p", "--project", type=str, default="GRPO", help="Project name.") + parser.add_argument("-e", "--num-episodes", type=int, default=1, help="Number of episodes to train.") # Distributed training parameters parser.add_argument("-t", "--num-trainers", type=int, default=2) @@ -192,7 +193,7 @@ num_producers=args.num_inferencer, num_proc_per_producer=inference_model_config.get("tensor_parallel_size", 1), num_consumer_procs=args.num_trainers, - num_episodes=1, + num_episodes=args.num_episodes, inference_batch_size=args.inference_batch_size, inference_microbatch_size=args.inference_microbatch_size, train_batch_size=args.train_batch_size, From 28795f560c717c33f7fa9c7410719e2ad1181c80 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Sun, 27 Apr 2025 17:54:06 +0800 Subject: [PATCH 042/100] fix save issue (#6279) Co-authored-by: Tong Li --- applications/ColossalChat/coati/distributed/grpo_consumer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index d4589b0e2909..29307e1d3f49 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -62,6 +62,7 @@ def __init__( model_config, plugin_config, minibatch_size, + save_interval=save_interval, save_dir=save_dir, ) 
path = model_config.pop("path") From 2ca1e3c630f3ff4b4f066b11ff9809989e1bac5e Mon Sep 17 00:00:00 2001 From: YeAnbang <44796419+YeAnbang@users.noreply.github.com> Date: Mon, 28 Apr 2025 17:10:00 +0800 Subject: [PATCH 043/100] fix pp+tp, fix dataloader (#6280) --- .../ColossalChat/coati/distributed/consumer.py | 6 +++++- .../ColossalChat/coati/distributed/grpo_consumer.py | 12 +++++++++--- .../ColossalChat/coati/distributed/launch.py | 1 - .../ColossalChat/coati/distributed/producer.py | 2 +- applications/ColossalChat/rl_example.py | 4 ++-- 5 files changed, 17 insertions(+), 8 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index 47f08cc0bc14..439d4d70271a 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -66,7 +66,11 @@ def setup(self) -> None: launch(self.rank, self.world_size, self.master_addr, self.master_port, local_rank=0) plugin_config = dict(tp_size=1, pp_size=1, precision="bf16", zero_stage=2) - if self.plugin_config.get("pp_size", 1) > 1 and "num_microbatches" not in self.plugin_config: + if ( + self.plugin_config.get("pp_size", 1) > 1 + and "num_microbatches" not in self.plugin_config + and "microbatch_size" not in self.plugin_config + ): plugin_config["microbatch_size"] = self.minibatch_size plugin_config.update(self.plugin_config) self.plugin = HybridParallelPlugin(**plugin_config) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 29307e1d3f49..3da4a4f47bff 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -49,6 +49,12 @@ def __init__( UserWarning, ) minibatch_size = batch_size + if ( + plugin_config.get("pp_size", 1) > 1 + and "num_microbatches" not in plugin_config + and "microbatch_size" not in plugin_config + ): + plugin_config["microbatch_size"] = max(1, grpo_config.get("train_microbatch_size") // 2) super().__init__( num_producers, num_episodes, @@ -373,7 +379,7 @@ def _criterion(outputs, inputs): loss_mask=inputs["loss_mask"], total_effective_tokens_in_batch=total_effective_tokens_count, ) - return loss, num_excessive_samples // self.num_generations + return loss policy_model_outputs = self.booster.execute_pipeline( iter([data_policy_forward]), @@ -468,10 +474,10 @@ def _criterion(outputs, inputs): sample_utilization = self.effective_sample_count / self.total_sample_count self.effective_sample_count = 0 self.total_sample_count = 0 + loss_scalar = self.accum_loss.item() if not self.plugin.pp_size > 1 or ( self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() and self.tp_rank == 0 ): - loss_scalar = self.accum_loss.item() if (not self.plugin.pp_size > 1 and self.rank == 0) or ( self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() and self.tp_rank == 0 ): @@ -507,7 +513,7 @@ def _criterion(outputs, inputs): self.accum_advantages.zero_() self.accum_response_length.zero_() self.accum_count = 0 - return loss_scalar, num_excessive_samples // self.num_generations + return loss_scalar, num_excessive_samples // self.num_generations else: return None, num_excessive_samples // self.num_generations diff --git a/applications/ColossalChat/coati/distributed/launch.py b/applications/ColossalChat/coati/distributed/launch.py index b193dc8bc86d..4e100ebd1074 100644 --- 
a/applications/ColossalChat/coati/distributed/launch.py +++ b/applications/ColossalChat/coati/distributed/launch.py @@ -33,7 +33,6 @@ def launch_distributed( inference_batch_size: int, inference_microbatch_size: int, train_batch_size: int, - train_microbatch_size: int, train_minibatch_size: int, dataset_config: Dict[str, Any], dataloaders_config: Dict[str, Any], diff --git a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py index b0624f72ace3..fcb1a184a27e 100644 --- a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -68,6 +68,7 @@ def __init__( seed=42, ), num_workers=4, + drop_last=True, ) self.device = get_current_device() @@ -116,7 +117,6 @@ def loop(self) -> None: ray_broadcast_tensor_dict( outputs, src=0, device=self.device, group_name=f"sync_data_{self.producer_idx}" ) - if (i + 1) % self.num_microbatches == 0 and ( episode != self.num_episodes - 1 or i != num_valid_microbatches - 1 ): diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index b20e9dc38973..bdfcadbb05cb 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -198,7 +198,6 @@ inference_microbatch_size=args.inference_microbatch_size, train_batch_size=args.train_batch_size, train_minibatch_size=args.train_minibatch_size, - train_microbatch_size=args.train_microbatch_size, dataset_config={ "path": args.dataset, "max_length": args.max_prompt_tokens, @@ -216,7 +215,8 @@ # currently not support tp/pp # plugin_config={ # "tp_size": 2, - # "microbatch_size": args.train_microbatch_size // 2, + # "pp_size": 2, + # "microbatch_size": max(1, args.train_microbatch_size // 2), # "zero_stage": 0, # "max_norm": 1.0, # }, # for pp From 14f237ce7ede98a2efe1f7d9fe30769903d6cf42 Mon Sep 17 00:00:00 2001 From: YeAnbang <44796419+YeAnbang@users.noreply.github.com> Date: Tue, 29 Apr 2025 16:46:47 +0800 Subject: [PATCH 044/100] [feat] Support boxed math reward (#6284) * fix pp+tp, fix dataloader * fixed plugin micro-batch size * support boxed reward * add boxed reward * fix pp state dict incomplete issue * Revert "fix pp state dict incomplete issue" This reverts commit 6c1b3b694f4898c04575c891d5ccc205a82a3c40. 
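For reference, a minimal illustration (not part of the patch) of how the boxed reward added
below is meant to grade completions; the example strings are made up and assume the new
extract_boxed_solution() helper:

    extract_boxed_solution("So the total is \\boxed{27}.")    # -> "27"
    extract_boxed_solution("answer: \\boxed{\\frac{1}{2}}")   # -> "\\frac{1}{2}"
    extract_boxed_solution("no box here")                     # -> None

boxed_math_reward_fn() then adds acc_score (10.0) only when the extracted string matches
gt_answer after stripping and lower-casing; a missing or unmatched box leaves format_acc and
ans_acc at 0, so the only remaining term is the (zero or negative) soft length penalty when
that option is enabled.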
--- .../coati/distributed/consumer.py | 2 +- .../coati/distributed/grpo_consumer.py | 13 +++-- .../coati/distributed/reward/reward_fn.py | 42 +++++++++++++++- .../coati/distributed/reward/reward_utils.py | 48 +++++++++++++++++++ applications/ColossalChat/rl_example.py | 25 +++++++--- 5 files changed, 118 insertions(+), 12 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index 439d4d70271a..40fa67e1a23e 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -71,7 +71,7 @@ def setup(self) -> None: and "num_microbatches" not in self.plugin_config and "microbatch_size" not in self.plugin_config ): - plugin_config["microbatch_size"] = self.minibatch_size + plugin_config["microbatch_size"] = max(1, self.minibatch_size // plugin_config.get("pp_size", 1)) plugin_config.update(self.plugin_config) self.plugin = HybridParallelPlugin(**plugin_config) self.booster = Booster(plugin=self.plugin) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 3da4a4f47bff..877ff98ec55f 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -7,7 +7,7 @@ import wandb from coati.distributed.consumer import BaseConsumer from coati.distributed.loss import PolicyLoss -from coati.distributed.reward.reward_fn import math_reward_fn +from coati.distributed.reward.reward_fn import boxed_math_reward_fn, math_reward_fn from coati.distributed.reward.verifiable_reward import VerifiableReward from coati.distributed.utils import calc_action_log_probs from coati.trainer.utils import all_reduce_mean, all_reduce_sum @@ -54,7 +54,9 @@ def __init__( and "num_microbatches" not in plugin_config and "microbatch_size" not in plugin_config ): - plugin_config["microbatch_size"] = max(1, grpo_config.get("train_microbatch_size") // 2) + plugin_config["microbatch_size"] = max( + 1, grpo_config.get("train_microbatch_size") // plugin_config.get("pp_size", 1) + ) super().__init__( num_producers, num_episodes, @@ -131,7 +133,12 @@ def __init__( k: v for k, v in grpo_config.items() if k in ["soft_over_length_punishment", "max_length", "cache_length"] } self.reward_model = VerifiableReward( - reward_fns=[math_reward_fn], tokenizer=self.tokenizer, tags=response_format_tags, **reward_model_kwargs + reward_fns=[ + math_reward_fn if grpo_config.get("reward_fn_type") == "think_answer_tags" else boxed_math_reward_fn + ], + tokenizer=self.tokenizer, + tags=response_format_tags, + **reward_model_kwargs, ) self.global_step = 0 self.use_wandb = use_wandb diff --git a/applications/ColossalChat/coati/distributed/reward/reward_fn.py b/applications/ColossalChat/coati/distributed/reward/reward_fn.py index fd91291301c4..b68c1a92fc6c 100644 --- a/applications/ColossalChat/coati/distributed/reward/reward_fn.py +++ b/applications/ColossalChat/coati/distributed/reward/reward_fn.py @@ -1,6 +1,6 @@ import torch -from .reward_utils import extract_solution, validate_response_structure +from .reward_utils import extract_boxed_solution, extract_solution, validate_response_structure def math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): @@ -70,3 +70,43 @@ def gsm8k_reward_fn(input_ids, **kwargs): if gt_answer.strip().replace(" ", "").lower() == final_answer.strip().replace(" ", "").lower(): reward = reward + 9.0 return reward 
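# Illustrative sketch, not from the patch itself: the soft over-length penalty applied in the
# reward functions of this file ramps linearly from 0 down to roughly -acc_score over the
# final `cache_length` tokens. Assuming max_length=4096, cache_length=512 and acc_score=10.0:
#
#   res_length <= 3584  ->  length_reward = 0.0
#   res_length == 3840  ->  ((3584 - 3840) / 512) * 10.0 = -5.0
#   res_length == 4095  ->  ((3584 - 4095) / 512) * 10.0 ~= -9.98
#
# Responses that hit max_length exactly are not penalized here; they can instead be dropped
# from the loss via the filter_truncated_response option in the consumer.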
+ + +def boxed_math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): + tokenizer = kwargs["tokenizer"] + soft_over_length_punishment = kwargs.get("soft_over_length_punishment", False) + format_score = 0.0 + acc_score = 10.0 + reward = torch.tensor(0.0) + format_acc = torch.tensor(0.0) + ans_acc = torch.tensor(0.0) + s, e = response_idx[0], response_idx[1] + + length_reward = 0.0 + if soft_over_length_punishment: + max_length = kwargs.get("max_length", 1024 * 4) + cache_length = kwargs.get("cache_length", 512) + res_length = e.item() - s.item() + 1 + if max_length - cache_length < res_length < max_length: + length_reward = ((max_length - cache_length) - res_length) / cache_length * acc_score + + if gt_answer is None: + return reward + + decoded_final_answer = tokenizer.decode(input_ids[s : e + 1], skip_special_tokens=True) + gt_answer = tokenizer.decode(gt_answer.squeeze(0), skip_special_tokens=True) + final_answer = extract_boxed_solution(decoded_final_answer) + format_valid = final_answer is not None + # Check format accuracy + if format_valid: + format_acc += 1 + reward += format_score + + # Check answer accuracy, answer is considered correct if the answer is correct and the format is valid + if format_valid and final_answer is not None and gt_answer.strip().lower() == final_answer.strip().lower(): + ans_acc += 1 + reward += acc_score + + reward = reward + length_reward + + return torch.tensor([reward, format_acc, ans_acc]).to(input_ids.device) diff --git a/applications/ColossalChat/coati/distributed/reward/reward_utils.py b/applications/ColossalChat/coati/distributed/reward/reward_utils.py index c1e73d4b9738..ffc220846ea7 100644 --- a/applications/ColossalChat/coati/distributed/reward/reward_utils.py +++ b/applications/ColossalChat/coati/distributed/reward/reward_utils.py @@ -74,3 +74,51 @@ def extract_solution(solution_str: str) -> Tuple[Optional[str], str]: final_answer = matches[-1].group(1).strip() return final_answer, solution_str + + +def extract_boxed_solution(text: str) -> Optional[str]: + """ + Modified from: https://gist.github.com/lewtun/9c2ce1937b741404090a3dc4c7c022b3 + Retrieves the content from the last occurrence of `\boxed{}` in a LaTeX-like string. + + Args: + text (str): A string potentially containing LaTeX-style boxed expressions. + + Returns: + Optional[str]: The text inside the final `\boxed{}` if successfully extracted; + returns `None` if no properly closed box is found. 
+ + Examples: + >>> extract_boxed_solution("The answer is \\boxed{42}.") + '42' + >>> extract_boxed_solution("Here is an unmatched \\boxed{42") + None + """ + try: + # Find the last occurrence of "\boxed{" + start_idx = text.rindex("\\boxed{") + # Move past "\boxed{" to find the start of the content + content_start = start_idx + len("\\boxed{") + open_braces = 1 + pos = content_start + + # Traverse the string to find the matching closing brace + while open_braces > 0 and pos < len(text): + if text[pos] == "{": + open_braces += 1 + elif text[pos] == "}": + open_braces -= 1 + pos += 1 + + # If all braces are matched, extract and return the content + if open_braces == 0: + return text[content_start : pos - 1].strip() + else: + return None + + except ValueError: + # "\boxed{" not found + return None + except Exception: + # Any other unexpected error + return None diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index bdfcadbb05cb..39584750c1df 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -86,6 +86,14 @@ parser.add_argument("-a", "--algo", type=str, default="GRPO", choices=["DAPO", "GRPO"]) parser.add_argument("-lr", "--learning-rate", type=float, default=1e-6, help="Learning rate for GRPO.") parser.add_argument("-kl", "--kl-coeff", type=float, default=0.01, help="KL penalty coefficient for GRPO.") + parser.add_argument( + "-rt", + "--reward-type", + type=str, + default="think_answer_tags", + choices=["think_answer_tags", "boxed"], + help="Reward type for GRPO.", + ) # Logging/Checkpointing parameters parser.add_argument("-si", "--save-interval", type=int, default=100, help="Interval for saving checkpoints.") @@ -136,8 +144,8 @@ max_length=args.max_new_tokens + args.max_prompt_tokens, do_sample=True, max_new_tokens=None, - early_stopping=False, - stop_strings=[""], + early_stopping=False if args.reward_type == "think_answer_tags" else True, + stop_strings=[""] if args.reward_type == "think_answer_tags" else None, ) ) elif args.backend == "vllm": @@ -153,9 +161,9 @@ generate_config.update( dict( max_tokens=args.max_new_tokens, # max new tokens - ignore_eos=True, + ignore_eos=True if args.reward_type == "think_answer_tags" else False, include_stop_str_in_output=True, - stop=[""], + stop=[""] if args.reward_type == "think_answer_tags" else None, ) ) else: @@ -168,6 +176,7 @@ "train_microbatch_size": args.train_microbatch_size, "beta": args.kl_coeff, # KL penalty coefficient "loss_variation": "sample_level", + "reward_fn_type": args.reward_type, } elif args.algo == "DAPO": # DAPO variant settings @@ -185,6 +194,7 @@ "max_length": args.max_new_tokens + args.max_prompt_tokens, "cache_length": min(1024, int(args.max_new_tokens / 4)), "filter_truncated_response": True, + "reward_fn_type": args.reward_type, } else: raise ValueError(f"Unsupported algorithm: {args.algo}") @@ -212,14 +222,15 @@ plugin_config={ "zero_stage": 2, }, # for zero - # currently not support tp/pp # plugin_config={ # "tp_size": 2, # "pp_size": 2, - # "microbatch_size": max(1, args.train_microbatch_size // 2), + # "microbatch_size": max( + # 1, args.train_microbatch_size // 2 + # ), # microbatch size should be set to train_microbatch_size // pp_size # "zero_stage": 0, # "max_norm": 1.0, - # }, # for pp + # }, # for pp, tp inference_backend=args.backend, master_addr="localhost", master_port=args.master_port, From 5fd4bcb9d80b5035c2fa457483d8c91d0f92d1ae Mon Sep 17 00:00:00 2001 From: Tong Li Date: Wed, 30 Apr 2025 14:47:01 +0800 
Subject: [PATCH 045/100] [feat] Sync shard model (#6289) * [feat] support hybrid parallel model sync * update consumer and producer * update files * update producer * remove print * update --------- Co-authored-by: duanjunwen <935724073@qq.com> Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com> Co-authored-by: Tong Li --- .../coati/distributed/consumer.py | 44 +++++++++++++++---- .../ColossalChat/coati/distributed/launch.py | 3 +- .../coati/distributed/producer.py | 35 +++++++++++---- applications/ColossalChat/rl_example.py | 4 +- 4 files changed, 66 insertions(+), 20 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index 40fa67e1a23e..1cebcb40eacb 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -59,10 +59,6 @@ def __init__( self.lr_scheduler = None def setup(self) -> None: - for i in range(self.num_producers): - cc.init_collective_group(self.world_size + 1, self.rank + 1, group_name=f"sync_data_{i}") - if self.rank == 0: - cc.init_collective_group(self.num_producers + 1, self.num_producers, group_name="sync_model") launch(self.rank, self.world_size, self.master_addr, self.master_port, local_rank=0) plugin_config = dict(tp_size=1, pp_size=1, precision="bf16", zero_stage=2) @@ -77,8 +73,24 @@ def setup(self) -> None: self.booster = Booster(plugin=self.plugin) self.dp_rank = dist.get_rank(self.plugin.dp_group) self.tp_rank = dist.get_rank(self.plugin.tp_group) + self.pp_rank = dist.get_rank(self.plugin.pp_group) self.dp_size = dist.get_world_size(self.plugin.dp_group) + self.tp_size = dist.get_world_size(self.plugin.tp_group) + self.pp_size = dist.get_world_size(self.plugin.pp_group) + + # Init Hybrid ray process group + for i in range(self.num_producers): + cc.init_collective_group(self.world_size + 1, self.rank + 1, group_name=f"sync_data_{i}") + if self.pp_size > 1: + # use hybrid tp + pp + if self.tp_rank == 0 and self.dp_rank == 0: + cc.init_collective_group( + self.num_producers + 1, self.num_producers, group_name=f"sync_model_{self.pp_rank}" + ) + else: + if self.rank == 0: + cc.init_collective_group(self.num_producers + 1, self.num_producers, group_name="sync_model") self.buffer = [] @@ -140,13 +152,27 @@ def loop(self) -> None: print(f"Saved model checkpoint at step {step + 1} in folder {save_path}") if episode != self.num_episodes - 1 or step != self.num_update_per_episode - 1: - print(f"[T{dist.get_rank()}] Sync model episode {episode} step {step}") + if self.pp_size > 1: + print( + f"[T{dist.get_rank()}] Sync model PP stage {self.pp_rank} episode {episode} step {step}" + ) + else: + print(f"[T{dist.get_rank()}] Sync model episode {episode} step {step}") torch.cuda.empty_cache() state_dict = self.state_dict() - if self.rank == 0: - ray_broadcast_tensor_dict( - state_dict, src=self.num_producers, device=self.device, group_name="sync_model" - ) + if self.pp_size > 1: + if self.tp_rank == 0 and self.dp_rank == 0: + ray_broadcast_tensor_dict( + state_dict, + src=self.num_producers, + device=self.device, + group_name=f"sync_model_{self.pp_rank}", + ) + else: + if self.rank == 0: + ray_broadcast_tensor_dict( + state_dict, src=self.num_producers, device=self.device, group_name="sync_model" + ) del state_dict torch.cuda.empty_cache() diff --git a/applications/ColossalChat/coati/distributed/launch.py b/applications/ColossalChat/coati/distributed/launch.py index 4e100ebd1074..a346d1d4fae9 100644 
--- a/applications/ColossalChat/coati/distributed/launch.py +++ b/applications/ColossalChat/coati/distributed/launch.py @@ -57,7 +57,7 @@ def launch_distributed( else: core_consumer = ALGO_MAP.get(core_algo, SimpleConsumer) - train_dp_size = get_dp_size_fast(num_producers, plugin_config) + train_dp_size = get_dp_size_fast(num_consumer_procs, plugin_config) assert (inference_batch_size * num_producers) % (train_batch_size * train_dp_size) == 0 dataset_path = dataset_config["path"] @@ -82,6 +82,7 @@ def launch_distributed( microbatch_size=inference_microbatch_size, backend=inference_backend, num_generations=num_generations, + consumer_plugin_config=plugin_config, ) procs.append(producer) generate_config_consumer = copy.deepcopy(generate_config) diff --git a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py index fcb1a184a27e..a2d675870fc2 100644 --- a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -29,6 +29,7 @@ def __init__( tokenizer_config: Optional[Dict[str, Any]] = None, microbatch_size: int = 1, backend: str = "transformers", + consumer_plugin_config: Dict[str, Any] = None, ): self.producer_idx = producer_idx self.num_producers = num_producers @@ -78,9 +79,15 @@ def __init__( else: raise ValueError(f"Unexpected backend {backend}") + self.consumer_pp_size = consumer_plugin_config["pp_size"] # consumer pp size + def setup(self) -> None: cc.init_collective_group(1 + self.num_consumer_procs, 0, group_name=f"sync_data_{self.producer_idx}") - cc.init_collective_group(self.num_producers + 1, self.producer_idx, group_name="sync_model") + if self.consumer_pp_size > 1: + for i in range(self.consumer_pp_size): + cc.init_collective_group(self.num_producers + 1, self.producer_idx, group_name=f"sync_model_{i}") + else: + cc.init_collective_group(self.num_producers + 1, self.producer_idx, group_name="sync_model") def rollout(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwargs) -> Dict[str, torch.Tensor]: raise NotImplementedError @@ -125,15 +132,25 @@ def loop(self) -> None: ): self.model.llm.sleep() # revict KV_cache to avoid OOM # don't sync model for last iteration - print( - f"[P{self.producer_idx}] Sync model episode {episode} step {(i + 1) // self.num_microbatches - 1}" - ) torch.cuda.empty_cache() - state_dict = ray_broadcast_tensor_dict( - None, self.num_producers, device=self.device, group_name="sync_model" - ) - self.load_state_dict(state_dict) + if self.consumer_pp_size > 1: + for pp_idx in range(self.consumer_pp_size): + print( + f"[P{self.producer_idx}] Sync model PP stage {pp_idx} episode {episode} step {(i + 1) // self.num_microbatches - 1}" + ) + state_dict = ray_broadcast_tensor_dict( + None, self.num_producers, device=self.device, group_name=f"sync_model_{pp_idx}" + ) + self.load_state_dict(state_dict) + else: + print( + f"[P{self.producer_idx}] Sync model episode {episode} step {(i + 1) // self.num_microbatches - 1}" + ) + state_dict = ray_broadcast_tensor_dict( + None, self.num_producers, device=self.device, group_name="sync_model" + ) + self.load_state_dict(state_dict) del state_dict torch.cuda.empty_cache() if isinstance(self.model, BACKEND_MAP["vllm"]) and self.model.model_config.get( @@ -170,6 +187,7 @@ def __init__( microbatch_size=1, backend="transformers", num_generations: int = 8, + consumer_plugin_config=None, ): super().__init__( producer_idx, @@ -184,6 +202,7 @@ def __init__( tokenizer_config, microbatch_size, 
backend, + consumer_plugin_config, ) self.model = self.backend_cls(model_config, generate_config, self.tokenizer, num_generations) diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index 39584750c1df..788e60c2edac 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -58,7 +58,7 @@ "--master_address", type=str, default=None, help="Master address for multi-node distributed training, Optional" ) parser.add_argument( - "--master_port", type=int, default=29506, help="Master port for multi-node distributed training, Optional" + "--master_port", type=int, default=29505, help="Master port for multi-node distributed training, Optional" ) # Sampling parameters @@ -223,7 +223,7 @@ "zero_stage": 2, }, # for zero # plugin_config={ - # "tp_size": 2, + # "tp_size": 1, # "pp_size": 2, # "microbatch_size": max( # 1, args.train_microbatch_size // 2 From 57a88395feef9d3c4478fab126bdea8508308dbf Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Wed, 30 Apr 2025 18:13:40 +0800 Subject: [PATCH 046/100] Support evaluation during training --- .gitignore | 1 + .../coati/distributed/consumer.py | 28 +++- .../coati/distributed/grpo_consumer.py | 3 + .../coati/distributed/inference_backend.py | 10 +- .../ColossalChat/coati/distributed/launch.py | 16 +- .../coati/distributed/producer.py | 149 +++++++++++++++--- .../coati/distributed/reward/reward_fn.py | 54 +++---- .../ColossalChat/coati/distributed/utils.py | 13 ++ applications/ColossalChat/rl_example.py | 25 ++- 9 files changed, 234 insertions(+), 65 deletions(-) diff --git a/.gitignore b/.gitignore index 533450a7cce1..d48035a5b6df 100644 --- a/.gitignore +++ b/.gitignore @@ -165,3 +165,4 @@ applications/ColossalChat/logs applications/ColossalChat/tests/logs applications/ColossalChat/wandb applications/ColossalChat/model +applications/ColossalChat/eval diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index 1cebcb40eacb..38e55a65cae2 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -36,6 +36,7 @@ def __init__( minibatch_size: int = 1, save_interval: int = 100, save_dir: str = "./model", + eval_interval: int = -1, ): self.num_producers = num_producers self.num_episodes = num_episodes @@ -51,6 +52,7 @@ def __init__( self.save_dir = save_dir assert batch_size % minibatch_size == 0, "batch_size should be divisible by microbatch_size" self.num_microbatches = batch_size // minibatch_size + self.eval_interval = eval_interval self.model_config = model_config self.plugin_config = plugin_config @@ -92,8 +94,10 @@ def setup(self) -> None: if self.rank == 0: cc.init_collective_group(self.num_producers + 1, self.num_producers, group_name="sync_model") - self.buffer = [] + for i in range(self.num_producers): + cc.init_collective_group(self.world_size + 1, self.rank + 1, group_name=f"sync_eval_statistics_{i}") + self.buffer = [] self.recv_cnt = 0 def state_dict(self) -> Dict[str, torch.Tensor]: @@ -110,6 +114,24 @@ def loop(self) -> None: with tqdm(range(self.num_update_per_episode), desc=f"Episode {episode}", disable=self.rank != 0) as pbar: for step in pbar: i = 0 + if self.eval_interval > 0 and step % self.eval_interval == 0: + eval_statistics = None + for r in range(self.num_producers): + print(f"[T{dist.get_rank()}] Recv eval result episode {episode} step {step} from {r}") + local_eval_result = ray_broadcast_tensor_dict( + None, 
src=0, device=self.device, group_name=f"sync_eval_statistics_{r}" + ) + if eval_statistics is None: + eval_statistics = local_eval_result + else: + eval_statistics = { + k: eval_statistics[k] + local_eval_result[k] for k in eval_statistics + } + eval_statistics = {k: (v[0] / v[1]).item() for k, v in eval_statistics.items()} + if dist.get_rank() == 0: + if hasattr(self, "wandb_run") and hasattr(self, "global_step"): + self.wandb_run.log(eval_statistics, step=self.global_step) + print(f"Eval statistics: {eval_statistics}") for _ in range(self.num_recv_per_update): # receive data from producers for r in range(self.num_producers): @@ -195,6 +217,7 @@ def __init__( minibatch_size=1, save_interval: int = 100, save_dir="./model", + eval_interval: int = -1, ): super().__init__( num_producers, @@ -209,6 +232,9 @@ def __init__( model_config, plugin_config, minibatch_size, + save_interval, + save_dir, + eval_interval, ) path = model_config.pop("path") self.model = AutoModelForCausalLM.from_pretrained(path, **model_config) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 877ff98ec55f..0336125d17af 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -40,6 +40,7 @@ def __init__( project_name=None, save_interval: int = 100, save_dir="./model", + eval_interval: int = -1, ): print(f"Using GRPO config: {grpo_config}") if grpo_config.get("loss_variation", "sample_level") == "token_level": @@ -72,6 +73,7 @@ def __init__( minibatch_size, save_interval=save_interval, save_dir=save_dir, + eval_interval=eval_interval, ) path = model_config.pop("path") self.policy_model = AutoModelForCausalLM.from_pretrained(path, **model_config) @@ -528,4 +530,5 @@ def state_dict(self): self.policy_model._force_wait_all_gather() model = self.policy_model.unwrap() state_dict = model.state_dict() + state_dict["consumer_global_step"] = torch.tensor([self.global_step], device=self.device) return state_dict diff --git a/applications/ColossalChat/coati/distributed/inference_backend.py b/applications/ColossalChat/coati/distributed/inference_backend.py index 7d32cd52a41e..a07da7a64793 100644 --- a/applications/ColossalChat/coati/distributed/inference_backend.py +++ b/applications/ColossalChat/coati/distributed/inference_backend.py @@ -205,7 +205,8 @@ def __init__( generate_config = generate_config.copy() generate_config.update(self.FORCE_GENERATE_CONFIG) generate_config.update({"n": num_generations}) - self.generate_config = SamplingParams(**generate_config) + self.generate_config = generate_config + self.sample_params = SamplingParams(**generate_config) self.model_config = model_config self.tokenizer = tokenizer self.num_generations = num_generations @@ -219,8 +220,9 @@ def generate(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwar micro_batch_input_ids_no_padding = [ micro_batch_input_ids[i][first_non_padding_token_idx[i] :] for i in range(micro_batch_size) ] + sample_params = kwargs.get("sample_params", self.sample_params) outputs = self.llm.generate( - prompt_token_ids=micro_batch_input_ids_no_padding, sampling_params=self.generate_config, use_tqdm=False + prompt_token_ids=micro_batch_input_ids_no_padding, sampling_params=sample_params, use_tqdm=False ) out_tokens = [] out_len = [] @@ -266,11 +268,11 @@ def generate(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwar "response_idx": response_idx, } - data = {k: 
v.view(micro_batch_size, self.num_generations, v.size(-1)) for k, v in data.items()} + data = {k: v.view(micro_batch_size, -1, v.size(-1)) for k, v in data.items()} if "gt_answer" in kwargs: # repeat gt_answer for each prompt. - data["gt_answer"] = kwargs["gt_answer"].repeat_interleave(self.num_generations, dim=1) + data["gt_answer"] = kwargs["gt_answer"].repeat_interleave(data["input_ids"].size(1), dim=1) data = {k: v.to(get_current_device()) for k, v in data.items()} return data diff --git a/applications/ColossalChat/coati/distributed/launch.py b/applications/ColossalChat/coati/distributed/launch.py index a346d1d4fae9..d1b6158e5b2b 100644 --- a/applications/ColossalChat/coati/distributed/launch.py +++ b/applications/ColossalChat/coati/distributed/launch.py @@ -34,7 +34,7 @@ def launch_distributed( inference_microbatch_size: int, train_batch_size: int, train_minibatch_size: int, - dataset_config: Dict[str, Any], + train_dataset_config: Dict[str, Any], dataloaders_config: Dict[str, Any], inference_model_config: Dict[str, Any], generate_config: Dict[str, Any], @@ -50,6 +50,9 @@ def launch_distributed( project_name: Optional[str] = None, save_interval: int = 100, save_dir: str = "./model", + eval_dataset_config: Optional[Dict[str, Any]] = None, + eval_interval: int = 100, + eval_save_dir: Optional[str] = None, ): if core_algo not in ALGO_MAP: @@ -60,9 +63,9 @@ def launch_distributed( train_dp_size = get_dp_size_fast(num_consumer_procs, plugin_config) assert (inference_batch_size * num_producers) % (train_batch_size * train_dp_size) == 0 - dataset_path = dataset_config["path"] + dataset_path = train_dataset_config["path"] num_samples = get_jsonl_size_fast(dataset_path) - global_inference_batch_size = inference_batch_size * num_producers + global_inference_batch_size = inference_batch_size * num_producers # TODO: this doesn't support TP on producer num_update_per_episode = num_samples // global_inference_batch_size num_recv_per_update = inference_batch_size // inference_microbatch_size @@ -74,7 +77,7 @@ def launch_distributed( num_consumer_procs=num_consumer_procs, num_episodes=num_episodes, batch_size=inference_batch_size, - dataset_config=dataset_config, + train_dataset_config=train_dataset_config, dataloaders_config=dataloaders_config, model_config=inference_model_config, generate_config=generate_config, @@ -83,6 +86,10 @@ def launch_distributed( backend=inference_backend, num_generations=num_generations, consumer_plugin_config=plugin_config, + eval_dataset_config=eval_dataset_config, + eval_interval=eval_interval, + evaluation_function_type=grpo_config["reward_fn_type"], + eval_save_dir=eval_save_dir, ) procs.append(producer) generate_config_consumer = copy.deepcopy(generate_config) @@ -111,6 +118,7 @@ def launch_distributed( project_name=project_name, save_interval=save_interval, save_dir=save_dir, + eval_interval=eval_interval, ) procs.append(consumer) ray.get([p.setup.remote() for p in procs]) diff --git a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py index a2d675870fc2..529e19bf4e14 100644 --- a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -1,9 +1,13 @@ +import copy +import os from typing import Any, Dict, Optional import ray import ray.util.collective as cc import torch +import tqdm from coati.dataset.loader import RawConversationDataset +from coati.distributed.reward.reward_fn import boxed_math_reward_fn, math_reward_fn from torch.utils.data 
import DataLoader, DistributedSampler from transformers import AutoTokenizer @@ -11,7 +15,12 @@ from .comm import ray_broadcast_tensor_dict from .inference_backend import BACKEND_MAP -from .utils import pre_send +from .utils import pre_send, safe_write_jsonl + +try: + from vllm import SamplingParams +except ImportError: + LLM = None class BaseProducer: @@ -22,7 +31,7 @@ def __init__( num_consumer_procs: int, num_episodes: int, batch_size: int, - dataset_config: Dict[str, Any], + train_dataset_config: Dict[str, Any], dataloaders_config: Dict[str, Any], model_config: Dict[str, Any], generate_config: Dict[str, Any], @@ -30,6 +39,10 @@ def __init__( microbatch_size: int = 1, backend: str = "transformers", consumer_plugin_config: Dict[str, Any] = None, + eval_dataset_config=None, + eval_interval=-1, # disable evaluation + evaluation_function_type="think_answer_tags", + eval_save_dir: str = "./eval", ): self.producer_idx = producer_idx self.num_producers = num_producers @@ -40,10 +53,17 @@ def __init__( assert batch_size % microbatch_size == 0 self.num_microbatches = batch_size // microbatch_size - self.dataset_config = dataset_config + self.train_dataset_config = train_dataset_config self.model_config = model_config self.generate_config = generate_config self.tokenizer_config = tokenizer_config + self.consumer_plugin_config = consumer_plugin_config + self.eval_interval = eval_interval + self.eval_save_dir = eval_save_dir + self.consumer_global_step = 0 + + if os.path.exists(self.eval_save_dir): + raise ValueError(f"Eval save dir {self.eval_save_dir} already exists. Please delete it or change the name.") # init tokenizer if tokenizer_config is None: @@ -55,13 +75,13 @@ def __init__( self.tokenizer.padding_side = "left" # init dataloader - dataset_path = dataset_config.pop("path") - self.dataset = RawConversationDataset(self.tokenizer, dataset_path, **dataset_config) - self.dataloader = DataLoader( - self.dataset, + train_dataset_path = train_dataset_config.pop("path") + self.train_dataset = RawConversationDataset(self.tokenizer, train_dataset_path, **train_dataset_config) + self.train_dataloader = DataLoader( + self.train_dataset, batch_size=microbatch_size, sampler=DistributedSampler( - self.dataset, + self.train_dataset, num_replicas=num_producers, rank=producer_idx, shuffle=True, @@ -71,6 +91,36 @@ def __init__( num_workers=4, drop_last=True, ) + + self.eval_dataset_config = eval_dataset_config + if self.eval_dataset_config is not None: + self.eval_dataloaders = {} + for eval_task_name in self.eval_dataset_config: + eval_dataset_path = eval_dataset_config[eval_task_name].pop("path") + eval_dataset = RawConversationDataset( + self.tokenizer, eval_dataset_path, **eval_dataset_config[eval_task_name] + ) + print(f"[P{self.producer_idx}] eval dataset {eval_task_name} size: {len(eval_dataset)}") + self.eval_dataloaders[eval_task_name] = DataLoader( + eval_dataset, + batch_size=microbatch_size, + sampler=DistributedSampler( + eval_dataset, + num_replicas=num_producers, + rank=producer_idx, + shuffle=False, + drop_last=False, + seed=42, + ), + ) + if evaluation_function_type == "think_answer_tags": + self.evaluation_function = math_reward_fn + elif evaluation_function_type == "boxed": + self.evaluation_function = boxed_math_reward_fn + else: + raise ValueError(f"Unknown evaluation function type {evaluation_function_type}") + else: + raise ValueError("eval_dataset_config is not defined") self.device = get_current_device() # init backend @@ -88,6 +138,7 @@ def setup(self) -> None: 
cc.init_collective_group(self.num_producers + 1, self.producer_idx, group_name=f"sync_model_{i}") else: cc.init_collective_group(self.num_producers + 1, self.producer_idx, group_name="sync_model") + cc.init_collective_group(1 + self.num_consumer_procs, 0, group_name=f"sync_eval_statistics_{self.producer_idx}") def rollout(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwargs) -> Dict[str, torch.Tensor]: raise NotImplementedError @@ -96,29 +147,64 @@ def load_state_dict(self, state_dict: Dict[str, torch.Tensor]) -> None: raise NotImplementedError def loop(self) -> None: - num_update_per_episode = len(self.dataloader) // self.num_microbatches + num_update_per_episode = len(self.train_dataloader) // self.num_microbatches num_valid_microbatches = num_update_per_episode * self.num_microbatches print( - f"[P{self.producer_idx}] num_valid_microbatches {num_valid_microbatches}, nmb: {self.num_microbatches}, dl: {len(self.dataloader)}" + f"[P{self.producer_idx}] num_valid_microbatches {num_valid_microbatches}, nmb: {self.num_microbatches}, dl: {len(self.train_dataloader)}" ) for episode in range(self.num_episodes): - self.dataloader.sampler.set_epoch(episode) - for i, batch in enumerate(self.dataloader): + self.train_dataloader.sampler.set_epoch(episode) + for i, batch in enumerate(self.train_dataloader): if i >= num_valid_microbatches: break + if self.eval_interval > 0 and self.eval_dataset_config is not None: + if i % self.eval_interval == 0: + eval_statistics = {} + for eval_task_name in self.eval_dataloaders: + print( + f"[P{self.producer_idx}] Evaluate episode {episode} step {i} on task {eval_task_name}" + ) + eval_results = [] + eval_statistics[eval_task_name] = torch.zeros(2, device=self.device) + for eval_batch in tqdm.tqdm( + self.eval_dataloaders[eval_task_name], disable=self.producer_idx != 0 + ): + eval_outputs = self.rollout(**eval_batch, sample_params=self.eval_sample_params) + eval_results = eval_results + [ + self.evaluation_function( + eval_outputs["input_ids"][m][n], + eval_outputs["gt_answer"][m][n], + eval_outputs["response_idx"][m][n], + tokenizer=self.tokenizer, + eval_mode=True, + ) + for m in range(eval_outputs["input_ids"].size(0)) + for n in range(eval_outputs["input_ids"].size(1)) + ] + eval_statistics[eval_task_name][0] += len( + [res for res in eval_results if res["ans_valid"] == 1] + ) + eval_statistics[eval_task_name][1] += len(eval_results) + # save eval results + result_file_name = os.path.join( + self.eval_save_dir, + f"{eval_task_name}_episode_{episode}_step_{self.consumer_global_step}.jsonl", + ) + # delete the file if it exists + safe_write_jsonl(result_file_name, eval_results) + print(f"[P{self.producer_idx}] Send eval statistics episode {episode} step {i}") + ray_broadcast_tensor_dict( + eval_statistics, + src=0, + device=self.device, + group_name=f"sync_eval_statistics_{self.producer_idx}", + ) outputs = self.rollout(**batch) print(f"[P{self.producer_idx}] Send data {[(k, v.shape) for k, v in outputs.items()]}") outputs["temperature"] = torch.tensor( - [ - ( - self.model.generate_config["temperature"] - if isinstance(self.model.generate_config.temperature, dict) - else self.model.generate_config.temperature - ) - ] - * outputs["input_ids"].size(0) + [self.model.generate_config["temperature"]] * outputs["input_ids"].size(0) ).to(outputs["input_ids"].device) outputs = pre_send(outputs) ray_broadcast_tensor_dict( @@ -150,6 +236,8 @@ def loop(self) -> None: state_dict = ray_broadcast_tensor_dict( None, self.num_producers, device=self.device, 
group_name="sync_model" ) + if "consumer_global_step" in state_dict: + self.consumer_global_step = state_dict.pop("consumer_global_step").item() self.load_state_dict(state_dict) del state_dict torch.cuda.empty_cache() @@ -159,7 +247,7 @@ def loop(self) -> None: self.model.llm.wake_up() # linear annealing for 1 episode, temperature from initial to 0.9 if episode <= 0: - ratio = 1 - (len(self.dataloader) - i) / len(self.dataloader) + ratio = 1 - (len(self.train_dataloader) - i) / len(self.train_dataloader) if isinstance(self.model.generate_config.temperature, dict): self.model.generate_config["temperature"] = (1 - ratio) * self.generate_config[ "temperature" @@ -179,7 +267,7 @@ def __init__( num_consumer_procs, num_episodes, batch_size, - dataset_config, + train_dataset_config, dataloaders_config, model_config, generate_config, @@ -188,6 +276,10 @@ def __init__( backend="transformers", num_generations: int = 8, consumer_plugin_config=None, + eval_dataset_config=None, + eval_interval=-1, # disable evaluation + evaluation_function_type="think_answer_tags", + eval_save_dir: str = "./eval", ): super().__init__( producer_idx, @@ -195,7 +287,7 @@ def __init__( num_consumer_procs, num_episodes, batch_size, - dataset_config, + train_dataset_config, dataloaders_config, model_config, generate_config, @@ -203,14 +295,21 @@ def __init__( microbatch_size, backend, consumer_plugin_config, + eval_dataset_config=eval_dataset_config, + eval_interval=eval_interval, + evaluation_function_type=evaluation_function_type, + eval_save_dir=eval_save_dir, ) self.model = self.backend_cls(model_config, generate_config, self.tokenizer, num_generations) + self.eval_generation_config = copy.deepcopy(self.model.generate_config) + self.eval_generation_config["n"] = 1 # use 1 generation for evaluation + self.eval_sample_params = SamplingParams(**self.eval_generation_config) @torch.no_grad() def rollout(self, input_ids, attention_mask, **kwargs): rollouts = self.model.generate(input_ids, attention_mask, **kwargs) - if self.producer_idx == 1: - print("Rollout example:\n", self.tokenizer.decode(rollouts["input_ids"][0][0], skip_special_tokens=True)) + # if self.producer_idx == 1: + # print("Rollout example:\n", self.tokenizer.decode(rollouts["input_ids"][0][0], skip_special_tokens=True)) return rollouts diff --git a/applications/ColossalChat/coati/distributed/reward/reward_fn.py b/applications/ColossalChat/coati/distributed/reward/reward_fn.py index b68c1a92fc6c..14d340dc4932 100644 --- a/applications/ColossalChat/coati/distributed/reward/reward_fn.py +++ b/applications/ColossalChat/coati/distributed/reward/reward_fn.py @@ -5,6 +5,7 @@ def math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): tokenizer = kwargs["tokenizer"] + eval_mode = kwargs.get("eval_mode", False) soft_over_length_punishment = kwargs.get("soft_over_length_punishment", False) acc_score = 10.0 reward = torch.tensor(0.0) @@ -44,36 +45,23 @@ def math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): reward = reward + length_reward - return torch.tensor([reward, format_acc, ans_acc]).to(input_ids.device) - - -def gsm8k_reward_fn(input_ids, **kwargs): - gt_answer = kwargs["gt_answer"] - tokenizer = kwargs["tokenizer"] - s, e = kwargs["response_start"], kwargs["response_end"] - reward = torch.tensor(0.0).to(input_ids.device) - if gt_answer is None: - return reward - decoded_final_answer = tokenizer.decode(input_ids[s : e + 1], skip_special_tokens=True) - final_answer, processed_str = extract_solution(decoded_final_answer) - is_valid = True - try: - 
int(final_answer.strip()) - except Exception: - is_valid = False - - format_valid = validate_response_structure(processed_str, kwargs["tags"]) - if not is_valid or not format_valid: - return reward + if not eval_mode: + return torch.tensor([reward, format_acc, ans_acc]).to(input_ids.device) else: - reward += 1.0 - if gt_answer.strip().replace(" ", "").lower() == final_answer.strip().replace(" ", "").lower(): - reward = reward + 9.0 - return reward + prompt = tokenizer.decode(input_ids[:s], skip_special_tokens=True) + return { + "prompt": prompt, + "prediction": decoded_final_answer, + "gold": gt_answer, + "parsed": final_answer, + "format_valid": format_acc.item(), + "ans_valid": ans_acc.item(), + } def boxed_math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): tokenizer = kwargs["tokenizer"] + eval_mode = kwargs.get("eval_mode", False) soft_over_length_punishment = kwargs.get("soft_over_length_punishment", False) format_score = 0.0 acc_score = 10.0 @@ -91,7 +79,7 @@ def boxed_math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): length_reward = ((max_length - cache_length) - res_length) / cache_length * acc_score if gt_answer is None: - return reward + return torch.tensor([reward, format_acc, ans_acc]).to(input_ids.device) decoded_final_answer = tokenizer.decode(input_ids[s : e + 1], skip_special_tokens=True) gt_answer = tokenizer.decode(gt_answer.squeeze(0), skip_special_tokens=True) @@ -108,5 +96,15 @@ def boxed_math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): reward += acc_score reward = reward + length_reward - - return torch.tensor([reward, format_acc, ans_acc]).to(input_ids.device) + if not eval_mode: + return torch.tensor([reward, format_acc, ans_acc]).to(input_ids.device) + else: + prompt = tokenizer.decode(input_ids[:s], skip_special_tokens=True) + return { + "prompt": prompt, + "prediction": decoded_final_answer, + "gold": gt_answer, + "parsed": final_answer, + "format_valid": format_acc.item(), + "ans_valid": ans_acc.item(), + } diff --git a/applications/ColossalChat/coati/distributed/utils.py b/applications/ColossalChat/coati/distributed/utils.py index ce4923dc493e..f9afcaacc21c 100644 --- a/applications/ColossalChat/coati/distributed/utils.py +++ b/applications/ColossalChat/coati/distributed/utils.py @@ -1,7 +1,10 @@ +import json +import os from collections import defaultdict from typing import Any, Dict, List import torch +from filelock import FileLock from colossalai.shardformer.layer.loss import dist_log_prob @@ -152,3 +155,13 @@ def masked_sum(tensor: torch.Tensor, mask: torch.Tensor, dim: int = 1) -> torch. 
""" tensor = tensor * mask return tensor.sum(dim=dim) + + +def safe_write_jsonl(file_path, data): + with FileLock(file_path + ".lock"): + # Ensure file exists + os.makedirs(os.path.dirname(file_path), exist_ok=True) + with open(file_path, "a", encoding="utf8") as f: + for entry in data: + json_line = json.dumps(entry, ensure_ascii=False) + f.write(json_line + "\n") diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index 788e60c2edac..b2fafd42e31f 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -1,4 +1,5 @@ import argparse +import json import os import ray @@ -8,7 +9,16 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("-m", "--model", type=str, default="Qwen/Qwen2.5-7B") - parser.add_argument("-d", "--dataset", type=str, default="data.jsonl") + parser.add_argument("-d", "--dataset", type=str, default="data_train.jsonl") + parser.add_argument( + "-ed", + "--eval-dataset", + type=str, + default='{"eval task name":"data_eval.jsonl"}', + help="Evaluation dataset for each task, please use json format to specify the dataset for each task. \ + For example: {'task1':'data_eval_task1.jsonl', 'task2':'data_eval_task2.jsonl'}, the jsonl file should be in the same format as the training dataset. \ + The key is the task name, and the value is the path to the jsonl file", + ) parser.add_argument("-p", "--project", type=str, default="GRPO", help="Project name.") parser.add_argument("-e", "--num-episodes", type=int, default=1, help="Number of episodes to train.") @@ -94,11 +104,14 @@ choices=["think_answer_tags", "boxed"], help="Reward type for GRPO.", ) + parser.add_argument("-ei", "--eval-interval", type=int, default=100, help="Interval for evaluation.") # Logging/Checkpointing parameters parser.add_argument("-si", "--save-interval", type=int, default=100, help="Interval for saving checkpoints.") parser.add_argument("-sd", "--save-dir", type=str, default="./model", help="Directory for saving checkpoints.") - + parser.add_argument( + "-esd", "--eval-save-dir", type=str, default="./eval", help="Directory for saving evaluation results." 
+ ) args = parser.parse_args() if args.train_minibatch_size is None: @@ -208,7 +221,7 @@ inference_microbatch_size=args.inference_microbatch_size, train_batch_size=args.train_batch_size, train_minibatch_size=args.train_minibatch_size, - dataset_config={ + train_dataset_config={ "path": args.dataset, "max_length": args.max_prompt_tokens, "system_prompt": args.system_prompt, @@ -238,4 +251,10 @@ project_name=args.project, save_interval=args.save_interval, save_dir=os.path.join(args.save_dir, args.project.replace(" ", "_")), + eval_dataset_config={ + k: {"path": v, "max_length": args.max_prompt_tokens, "system_prompt": args.system_prompt} + for k, v in json.loads(args.eval_dataset).items() + }, + eval_interval=args.eval_interval, + eval_save_dir=os.path.join(args.eval_save_dir, args.project.replace(" ", "_")), ) From bd61918dcfe7f4028f8ade4e8be351c4df991bf9 Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Wed, 30 Apr 2025 21:36:11 +0800 Subject: [PATCH 047/100] reuse comm-group --- .../ColossalChat/coati/distributed/consumer.py | 12 ++++++------ .../ColossalChat/coati/distributed/producer.py | 6 ++++-- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index 38e55a65cae2..31bd73e88380 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -94,9 +94,6 @@ def setup(self) -> None: if self.rank == 0: cc.init_collective_group(self.num_producers + 1, self.num_producers, group_name="sync_model") - for i in range(self.num_producers): - cc.init_collective_group(self.world_size + 1, self.rank + 1, group_name=f"sync_eval_statistics_{i}") - self.buffer = [] self.recv_cnt = 0 @@ -116,11 +113,14 @@ def loop(self) -> None: i = 0 if self.eval_interval > 0 and step % self.eval_interval == 0: eval_statistics = None + eval_global_step = None for r in range(self.num_producers): print(f"[T{dist.get_rank()}] Recv eval result episode {episode} step {step} from {r}") local_eval_result = ray_broadcast_tensor_dict( - None, src=0, device=self.device, group_name=f"sync_eval_statistics_{r}" + None, src=0, device=self.device, group_name=f"sync_data_{r}" ) + assert "consumer_global_step" in local_eval_result + eval_global_step = local_eval_result.pop("consumer_global_step").item() if eval_statistics is None: eval_statistics = local_eval_result else: @@ -129,8 +129,8 @@ def loop(self) -> None: } eval_statistics = {k: (v[0] / v[1]).item() for k, v in eval_statistics.items()} if dist.get_rank() == 0: - if hasattr(self, "wandb_run") and hasattr(self, "global_step"): - self.wandb_run.log(eval_statistics, step=self.global_step) + if hasattr(self, "wandb_run"): + self.wandb_run.log(eval_statistics, step=eval_global_step) print(f"Eval statistics: {eval_statistics}") for _ in range(self.num_recv_per_update): # receive data from producers diff --git a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py index 529e19bf4e14..b7e1d8f2a960 100644 --- a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -138,7 +138,6 @@ def setup(self) -> None: cc.init_collective_group(self.num_producers + 1, self.producer_idx, group_name=f"sync_model_{i}") else: cc.init_collective_group(self.num_producers + 1, self.producer_idx, group_name="sync_model") - cc.init_collective_group(1 + self.num_consumer_procs, 0, 
group_name=f"sync_eval_statistics_{self.producer_idx}") def rollout(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwargs) -> Dict[str, torch.Tensor]: raise NotImplementedError @@ -194,11 +193,14 @@ def loop(self) -> None: # delete the file if it exists safe_write_jsonl(result_file_name, eval_results) print(f"[P{self.producer_idx}] Send eval statistics episode {episode} step {i}") + eval_statistics["consumer_global_step"] = torch.tensor( + [self.consumer_global_step], device=self.device + ) ray_broadcast_tensor_dict( eval_statistics, src=0, device=self.device, - group_name=f"sync_eval_statistics_{self.producer_idx}", + group_name=f"sync_data_{self.producer_idx}", ) outputs = self.rollout(**batch) From 01640ebd650baa173929743b1a609693b0380065 Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Wed, 30 Apr 2025 22:53:12 +0800 Subject: [PATCH 048/100] fix bug --- .../ColossalChat/coati/distributed/launch.py | 2 +- .../coati/distributed/producer.py | 19 ++++++++----------- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/launch.py b/applications/ColossalChat/coati/distributed/launch.py index d1b6158e5b2b..400a62928bdc 100644 --- a/applications/ColossalChat/coati/distributed/launch.py +++ b/applications/ColossalChat/coati/distributed/launch.py @@ -87,7 +87,7 @@ def launch_distributed( num_generations=num_generations, consumer_plugin_config=plugin_config, eval_dataset_config=eval_dataset_config, - eval_interval=eval_interval, + eval_interval=eval_interval * num_recv_per_update, evaluation_function_type=grpo_config["reward_fn_type"], eval_save_dir=eval_save_dir, ) diff --git a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py index b7e1d8f2a960..de4d60b89ea3 100644 --- a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -129,7 +129,7 @@ def __init__( else: raise ValueError(f"Unexpected backend {backend}") - self.consumer_pp_size = consumer_plugin_config["pp_size"] # consumer pp size + self.consumer_pp_size = consumer_plugin_config.get("pp_size", 1) # consumer pp size def setup(self) -> None: cc.init_collective_group(1 + self.num_consumer_procs, 0, group_name=f"sync_data_{self.producer_idx}") @@ -250,14 +250,11 @@ def loop(self) -> None: # linear annealing for 1 episode, temperature from initial to 0.9 if episode <= 0: ratio = 1 - (len(self.train_dataloader) - i) / len(self.train_dataloader) - if isinstance(self.model.generate_config.temperature, dict): - self.model.generate_config["temperature"] = (1 - ratio) * self.generate_config[ - "temperature" - ] + ratio * 0.9 - else: - self.model.generate_config.temperature = (1 - ratio) * self.generate_config[ - "temperature" - ] + ratio * 0.9 + self.model.generate_config["temperature"] = (1 - ratio) * self.generate_config[ + "temperature" + ] + ratio * 0.9 + if hasattr(self.model, "sample_params"): + self.model.sample_params.temperature = self.model.generate_config["temperature"] @ray.remote @@ -310,8 +307,8 @@ def __init__( @torch.no_grad() def rollout(self, input_ids, attention_mask, **kwargs): rollouts = self.model.generate(input_ids, attention_mask, **kwargs) - # if self.producer_idx == 1: - # print("Rollout example:\n", self.tokenizer.decode(rollouts["input_ids"][0][0], skip_special_tokens=True)) + if self.producer_idx == 1: + print("Rollout example:\n", self.tokenizer.decode(rollouts["input_ids"][0][0], skip_special_tokens=True)) return rollouts 
From a6085ff6765e7440bdcb94a823d8e8e88c1d2383 Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Wed, 30 Apr 2025 22:59:54 +0800 Subject: [PATCH 049/100] upgrade reward math verification --- .../ColossalChat/coati/distributed/reward/reward_fn.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/reward/reward_fn.py b/applications/ColossalChat/coati/distributed/reward/reward_fn.py index 14d340dc4932..6844d700af28 100644 --- a/applications/ColossalChat/coati/distributed/reward/reward_fn.py +++ b/applications/ColossalChat/coati/distributed/reward/reward_fn.py @@ -1,4 +1,5 @@ import torch +from math_verify import parse, verify from .reward_utils import extract_boxed_solution, extract_solution, validate_response_structure @@ -35,11 +36,7 @@ def math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): format_acc += 1 # Check answer accuracy, answer is considered correct if the answer is correct and the format is valid - if ( - format_valid - and final_answer is not None - and gt_answer.strip().replace(" ", "").lower() == final_answer.strip().replace(" ", "").lower() - ): + if format_valid and final_answer is not None and verify(parse(gt_answer.strip()), parse(final_answer.strip())): ans_acc += 1 reward += acc_score @@ -91,7 +88,7 @@ def boxed_math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): reward += format_score # Check answer accuracy, answer is considered correct if the answer is correct and the format is valid - if format_valid and final_answer is not None and gt_answer.strip().lower() == final_answer.strip().lower(): + if format_valid and final_answer is not None and verify(parse(gt_answer.strip()), parse(final_answer.strip())): ans_acc += 1 reward += acc_score From d06042b434396d5f2001e9540f308f932bd7477e Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Thu, 1 May 2025 11:28:05 +0800 Subject: [PATCH 050/100] rewrite reward fn --- .../coati/distributed/consumer.py | 2 +- .../coati/distributed/reward/reward_fn.py | 74 +++++++++++++++++-- 2 files changed, 68 insertions(+), 8 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index 31bd73e88380..a3a0948bfa19 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -127,7 +127,7 @@ def loop(self) -> None: eval_statistics = { k: eval_statistics[k] + local_eval_result[k] for k in eval_statistics } - eval_statistics = {k: (v[0] / v[1]).item() for k, v in eval_statistics.items()} + eval_statistics = {"eval/" + k: (v[0] / v[1]).item() for k, v in eval_statistics.items()} if dist.get_rank() == 0: if hasattr(self, "wandb_run"): self.wandb_run.log(eval_statistics, step=eval_global_step) diff --git a/applications/ColossalChat/coati/distributed/reward/reward_fn.py b/applications/ColossalChat/coati/distributed/reward/reward_fn.py index 6844d700af28..467a9b41489a 100644 --- a/applications/ColossalChat/coati/distributed/reward/reward_fn.py +++ b/applications/ColossalChat/coati/distributed/reward/reward_fn.py @@ -1,8 +1,70 @@ import torch -from math_verify import parse, verify +from latex2sympy2_extended import NormalizationConfig +from math_verify import ExprExtractionConfig, LatexExtractionConfig, parse, verify from .reward_utils import extract_boxed_solution, extract_solution, validate_response_structure +CANNOT_PARSE_GT_ANSWER = -1 +CANNOT_PARSE_PREDICTION = -2 +SUCCESS = 1 +MATCHING_FAIL = 0 + + +def 
verify_math_representation(completion, gt_answer): + """ + Verify if the completion is a valid math representation of the gt_answer. + """ + target = ( + ExprExtractionConfig(), + LatexExtractionConfig( + normalization_config=NormalizationConfig( + nits=False, + malformed_operators=False, + basic_latex=True, + boxed="all", + units=True, + ), + boxed_match_priority=0, + ), + ) + if not isinstance(gt_answer, str) or len(gt_answer) == 0: + raise ValueError("gt_answer should be a string, please verify your training data.") + if not isinstance(completion, str) or len(completion) == 0: + return MATCHING_FAIL + try: + parsed_gt_answer = parse(gt_answer, extraction_config=target) + if len(parsed_gt_answer) == 0: + return CANNOT_PARSE_GT_ANSWER + parsed_completion = parse(completion, extraction_config=target) + if len(parsed_completion) == 0: + return CANNOT_PARSE_PREDICTION + if verify(parsed_gt_answer, parsed_completion): + return SUCCESS + else: + return MATCHING_FAIL + except Exception: + return MATCHING_FAIL + + +def verify_model_answer(decoded_final_answer, gt_answer, ans_acc, acc_score, reward): + math_verify_result = verify_math_representation(decoded_final_answer, gt_answer) + if math_verify_result == SUCCESS: + ans_acc += 1 + reward += acc_score + elif math_verify_result == CANNOT_PARSE_GT_ANSWER or math_verify_result == CANNOT_PARSE_PREDICTION: + if decoded_final_answer.strip().replace(" ", "").replace("{", "").replace("}", "").replace( + ",", "" + ) == gt_answer.strip().replace(" ", "").replace("{", "").replace("}", "").replace(",", ""): + ans_acc += 1 + if math_verify_result == CANNOT_PARSE_GT_ANSWER: + # plain text answer cannot be parsed, but is correct + reward += acc_score + else: + reward += ( + acc_score / 2 + ) # not a valid latex math representation, but the answer is correct, receive half of the score + return reward, ans_acc + def math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): tokenizer = kwargs["tokenizer"] @@ -36,9 +98,8 @@ def math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): format_acc += 1 # Check answer accuracy, answer is considered correct if the answer is correct and the format is valid - if format_valid and final_answer is not None and verify(parse(gt_answer.strip()), parse(final_answer.strip())): - ans_acc += 1 - reward += acc_score + if format_valid and final_answer is not None: + reward, ans_acc = verify_model_answer(decoded_final_answer, gt_answer, ans_acc, acc_score, reward) reward = reward + length_reward @@ -88,9 +149,8 @@ def boxed_math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): reward += format_score # Check answer accuracy, answer is considered correct if the answer is correct and the format is valid - if format_valid and final_answer is not None and verify(parse(gt_answer.strip()), parse(final_answer.strip())): - ans_acc += 1 - reward += acc_score + if format_valid and final_answer is not None: + reward, ans_acc = verify_model_answer(decoded_final_answer, gt_answer, ans_acc, acc_score, reward) reward = reward + length_reward if not eval_mode: From eb6b5dd62ebfa907acb044ecdce208bcf088a310 Mon Sep 17 00:00:00 2001 From: YeAnbang <44796419+YeAnbang@users.noreply.github.com> Date: Wed, 7 May 2025 10:56:47 +0800 Subject: [PATCH 051/100] [fix] revert reward update and evaluation (#6295) * Revert "rewrite reward fn" This reverts commit d06042b434396d5f2001e9540f308f932bd7477e. * Revert "upgrade reward math verification" This reverts commit a6085ff6765e7440bdcb94a823d8e8e88c1d2383. 
* Revert "fix bug" This reverts commit 01640ebd650baa173929743b1a609693b0380065. * Revert "reuse comm-group" This reverts commit bd61918dcfe7f4028f8ade4e8be351c4df991bf9. * Revert "Support evaluation during training" This reverts commit 57a88395feef9d3c4478fab126bdea8508308dbf. --- .gitignore | 1 - .../coati/distributed/consumer.py | 28 +-- .../coati/distributed/grpo_consumer.py | 3 - .../coati/distributed/inference_backend.py | 10 +- .../ColossalChat/coati/distributed/launch.py | 16 +- .../coati/distributed/producer.py | 162 ++++-------------- .../coati/distributed/reward/reward_fn.py | 131 ++++---------- .../ColossalChat/coati/distributed/utils.py | 13 -- applications/ColossalChat/rl_example.py | 25 +-- 9 files changed, 82 insertions(+), 307 deletions(-) diff --git a/.gitignore b/.gitignore index d48035a5b6df..533450a7cce1 100644 --- a/.gitignore +++ b/.gitignore @@ -165,4 +165,3 @@ applications/ColossalChat/logs applications/ColossalChat/tests/logs applications/ColossalChat/wandb applications/ColossalChat/model -applications/ColossalChat/eval diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index a3a0948bfa19..1cebcb40eacb 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -36,7 +36,6 @@ def __init__( minibatch_size: int = 1, save_interval: int = 100, save_dir: str = "./model", - eval_interval: int = -1, ): self.num_producers = num_producers self.num_episodes = num_episodes @@ -52,7 +51,6 @@ def __init__( self.save_dir = save_dir assert batch_size % minibatch_size == 0, "batch_size should be divisible by microbatch_size" self.num_microbatches = batch_size // minibatch_size - self.eval_interval = eval_interval self.model_config = model_config self.plugin_config = plugin_config @@ -95,6 +93,7 @@ def setup(self) -> None: cc.init_collective_group(self.num_producers + 1, self.num_producers, group_name="sync_model") self.buffer = [] + self.recv_cnt = 0 def state_dict(self) -> Dict[str, torch.Tensor]: @@ -111,27 +110,6 @@ def loop(self) -> None: with tqdm(range(self.num_update_per_episode), desc=f"Episode {episode}", disable=self.rank != 0) as pbar: for step in pbar: i = 0 - if self.eval_interval > 0 and step % self.eval_interval == 0: - eval_statistics = None - eval_global_step = None - for r in range(self.num_producers): - print(f"[T{dist.get_rank()}] Recv eval result episode {episode} step {step} from {r}") - local_eval_result = ray_broadcast_tensor_dict( - None, src=0, device=self.device, group_name=f"sync_data_{r}" - ) - assert "consumer_global_step" in local_eval_result - eval_global_step = local_eval_result.pop("consumer_global_step").item() - if eval_statistics is None: - eval_statistics = local_eval_result - else: - eval_statistics = { - k: eval_statistics[k] + local_eval_result[k] for k in eval_statistics - } - eval_statistics = {"eval/" + k: (v[0] / v[1]).item() for k, v in eval_statistics.items()} - if dist.get_rank() == 0: - if hasattr(self, "wandb_run"): - self.wandb_run.log(eval_statistics, step=eval_global_step) - print(f"Eval statistics: {eval_statistics}") for _ in range(self.num_recv_per_update): # receive data from producers for r in range(self.num_producers): @@ -217,7 +195,6 @@ def __init__( minibatch_size=1, save_interval: int = 100, save_dir="./model", - eval_interval: int = -1, ): super().__init__( num_producers, @@ -232,9 +209,6 @@ def __init__( model_config, plugin_config, minibatch_size, - save_interval, - 
save_dir, - eval_interval, ) path = model_config.pop("path") self.model = AutoModelForCausalLM.from_pretrained(path, **model_config) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 0336125d17af..877ff98ec55f 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -40,7 +40,6 @@ def __init__( project_name=None, save_interval: int = 100, save_dir="./model", - eval_interval: int = -1, ): print(f"Using GRPO config: {grpo_config}") if grpo_config.get("loss_variation", "sample_level") == "token_level": @@ -73,7 +72,6 @@ def __init__( minibatch_size, save_interval=save_interval, save_dir=save_dir, - eval_interval=eval_interval, ) path = model_config.pop("path") self.policy_model = AutoModelForCausalLM.from_pretrained(path, **model_config) @@ -530,5 +528,4 @@ def state_dict(self): self.policy_model._force_wait_all_gather() model = self.policy_model.unwrap() state_dict = model.state_dict() - state_dict["consumer_global_step"] = torch.tensor([self.global_step], device=self.device) return state_dict diff --git a/applications/ColossalChat/coati/distributed/inference_backend.py b/applications/ColossalChat/coati/distributed/inference_backend.py index a07da7a64793..7d32cd52a41e 100644 --- a/applications/ColossalChat/coati/distributed/inference_backend.py +++ b/applications/ColossalChat/coati/distributed/inference_backend.py @@ -205,8 +205,7 @@ def __init__( generate_config = generate_config.copy() generate_config.update(self.FORCE_GENERATE_CONFIG) generate_config.update({"n": num_generations}) - self.generate_config = generate_config - self.sample_params = SamplingParams(**generate_config) + self.generate_config = SamplingParams(**generate_config) self.model_config = model_config self.tokenizer = tokenizer self.num_generations = num_generations @@ -220,9 +219,8 @@ def generate(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwar micro_batch_input_ids_no_padding = [ micro_batch_input_ids[i][first_non_padding_token_idx[i] :] for i in range(micro_batch_size) ] - sample_params = kwargs.get("sample_params", self.sample_params) outputs = self.llm.generate( - prompt_token_ids=micro_batch_input_ids_no_padding, sampling_params=sample_params, use_tqdm=False + prompt_token_ids=micro_batch_input_ids_no_padding, sampling_params=self.generate_config, use_tqdm=False ) out_tokens = [] out_len = [] @@ -268,11 +266,11 @@ def generate(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwar "response_idx": response_idx, } - data = {k: v.view(micro_batch_size, -1, v.size(-1)) for k, v in data.items()} + data = {k: v.view(micro_batch_size, self.num_generations, v.size(-1)) for k, v in data.items()} if "gt_answer" in kwargs: # repeat gt_answer for each prompt. 
- data["gt_answer"] = kwargs["gt_answer"].repeat_interleave(data["input_ids"].size(1), dim=1) + data["gt_answer"] = kwargs["gt_answer"].repeat_interleave(self.num_generations, dim=1) data = {k: v.to(get_current_device()) for k, v in data.items()} return data diff --git a/applications/ColossalChat/coati/distributed/launch.py b/applications/ColossalChat/coati/distributed/launch.py index 400a62928bdc..a346d1d4fae9 100644 --- a/applications/ColossalChat/coati/distributed/launch.py +++ b/applications/ColossalChat/coati/distributed/launch.py @@ -34,7 +34,7 @@ def launch_distributed( inference_microbatch_size: int, train_batch_size: int, train_minibatch_size: int, - train_dataset_config: Dict[str, Any], + dataset_config: Dict[str, Any], dataloaders_config: Dict[str, Any], inference_model_config: Dict[str, Any], generate_config: Dict[str, Any], @@ -50,9 +50,6 @@ def launch_distributed( project_name: Optional[str] = None, save_interval: int = 100, save_dir: str = "./model", - eval_dataset_config: Optional[Dict[str, Any]] = None, - eval_interval: int = 100, - eval_save_dir: Optional[str] = None, ): if core_algo not in ALGO_MAP: @@ -63,9 +60,9 @@ def launch_distributed( train_dp_size = get_dp_size_fast(num_consumer_procs, plugin_config) assert (inference_batch_size * num_producers) % (train_batch_size * train_dp_size) == 0 - dataset_path = train_dataset_config["path"] + dataset_path = dataset_config["path"] num_samples = get_jsonl_size_fast(dataset_path) - global_inference_batch_size = inference_batch_size * num_producers # TODO: this doesn't support TP on producer + global_inference_batch_size = inference_batch_size * num_producers num_update_per_episode = num_samples // global_inference_batch_size num_recv_per_update = inference_batch_size // inference_microbatch_size @@ -77,7 +74,7 @@ def launch_distributed( num_consumer_procs=num_consumer_procs, num_episodes=num_episodes, batch_size=inference_batch_size, - train_dataset_config=train_dataset_config, + dataset_config=dataset_config, dataloaders_config=dataloaders_config, model_config=inference_model_config, generate_config=generate_config, @@ -86,10 +83,6 @@ def launch_distributed( backend=inference_backend, num_generations=num_generations, consumer_plugin_config=plugin_config, - eval_dataset_config=eval_dataset_config, - eval_interval=eval_interval * num_recv_per_update, - evaluation_function_type=grpo_config["reward_fn_type"], - eval_save_dir=eval_save_dir, ) procs.append(producer) generate_config_consumer = copy.deepcopy(generate_config) @@ -118,7 +111,6 @@ def launch_distributed( project_name=project_name, save_interval=save_interval, save_dir=save_dir, - eval_interval=eval_interval, ) procs.append(consumer) ray.get([p.setup.remote() for p in procs]) diff --git a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py index de4d60b89ea3..a2d675870fc2 100644 --- a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -1,13 +1,9 @@ -import copy -import os from typing import Any, Dict, Optional import ray import ray.util.collective as cc import torch -import tqdm from coati.dataset.loader import RawConversationDataset -from coati.distributed.reward.reward_fn import boxed_math_reward_fn, math_reward_fn from torch.utils.data import DataLoader, DistributedSampler from transformers import AutoTokenizer @@ -15,12 +11,7 @@ from .comm import ray_broadcast_tensor_dict from .inference_backend import BACKEND_MAP -from .utils import 
pre_send, safe_write_jsonl - -try: - from vllm import SamplingParams -except ImportError: - LLM = None +from .utils import pre_send class BaseProducer: @@ -31,7 +22,7 @@ def __init__( num_consumer_procs: int, num_episodes: int, batch_size: int, - train_dataset_config: Dict[str, Any], + dataset_config: Dict[str, Any], dataloaders_config: Dict[str, Any], model_config: Dict[str, Any], generate_config: Dict[str, Any], @@ -39,10 +30,6 @@ def __init__( microbatch_size: int = 1, backend: str = "transformers", consumer_plugin_config: Dict[str, Any] = None, - eval_dataset_config=None, - eval_interval=-1, # disable evaluation - evaluation_function_type="think_answer_tags", - eval_save_dir: str = "./eval", ): self.producer_idx = producer_idx self.num_producers = num_producers @@ -53,17 +40,10 @@ def __init__( assert batch_size % microbatch_size == 0 self.num_microbatches = batch_size // microbatch_size - self.train_dataset_config = train_dataset_config + self.dataset_config = dataset_config self.model_config = model_config self.generate_config = generate_config self.tokenizer_config = tokenizer_config - self.consumer_plugin_config = consumer_plugin_config - self.eval_interval = eval_interval - self.eval_save_dir = eval_save_dir - self.consumer_global_step = 0 - - if os.path.exists(self.eval_save_dir): - raise ValueError(f"Eval save dir {self.eval_save_dir} already exists. Please delete it or change the name.") # init tokenizer if tokenizer_config is None: @@ -75,13 +55,13 @@ def __init__( self.tokenizer.padding_side = "left" # init dataloader - train_dataset_path = train_dataset_config.pop("path") - self.train_dataset = RawConversationDataset(self.tokenizer, train_dataset_path, **train_dataset_config) - self.train_dataloader = DataLoader( - self.train_dataset, + dataset_path = dataset_config.pop("path") + self.dataset = RawConversationDataset(self.tokenizer, dataset_path, **dataset_config) + self.dataloader = DataLoader( + self.dataset, batch_size=microbatch_size, sampler=DistributedSampler( - self.train_dataset, + self.dataset, num_replicas=num_producers, rank=producer_idx, shuffle=True, @@ -91,36 +71,6 @@ def __init__( num_workers=4, drop_last=True, ) - - self.eval_dataset_config = eval_dataset_config - if self.eval_dataset_config is not None: - self.eval_dataloaders = {} - for eval_task_name in self.eval_dataset_config: - eval_dataset_path = eval_dataset_config[eval_task_name].pop("path") - eval_dataset = RawConversationDataset( - self.tokenizer, eval_dataset_path, **eval_dataset_config[eval_task_name] - ) - print(f"[P{self.producer_idx}] eval dataset {eval_task_name} size: {len(eval_dataset)}") - self.eval_dataloaders[eval_task_name] = DataLoader( - eval_dataset, - batch_size=microbatch_size, - sampler=DistributedSampler( - eval_dataset, - num_replicas=num_producers, - rank=producer_idx, - shuffle=False, - drop_last=False, - seed=42, - ), - ) - if evaluation_function_type == "think_answer_tags": - self.evaluation_function = math_reward_fn - elif evaluation_function_type == "boxed": - self.evaluation_function = boxed_math_reward_fn - else: - raise ValueError(f"Unknown evaluation function type {evaluation_function_type}") - else: - raise ValueError("eval_dataset_config is not defined") self.device = get_current_device() # init backend @@ -129,7 +79,7 @@ def __init__( else: raise ValueError(f"Unexpected backend {backend}") - self.consumer_pp_size = consumer_plugin_config.get("pp_size", 1) # consumer pp size + self.consumer_pp_size = consumer_plugin_config["pp_size"] # consumer pp size def 
setup(self) -> None: cc.init_collective_group(1 + self.num_consumer_procs, 0, group_name=f"sync_data_{self.producer_idx}") @@ -146,67 +96,29 @@ def load_state_dict(self, state_dict: Dict[str, torch.Tensor]) -> None: raise NotImplementedError def loop(self) -> None: - num_update_per_episode = len(self.train_dataloader) // self.num_microbatches + num_update_per_episode = len(self.dataloader) // self.num_microbatches num_valid_microbatches = num_update_per_episode * self.num_microbatches print( - f"[P{self.producer_idx}] num_valid_microbatches {num_valid_microbatches}, nmb: {self.num_microbatches}, dl: {len(self.train_dataloader)}" + f"[P{self.producer_idx}] num_valid_microbatches {num_valid_microbatches}, nmb: {self.num_microbatches}, dl: {len(self.dataloader)}" ) for episode in range(self.num_episodes): - self.train_dataloader.sampler.set_epoch(episode) - for i, batch in enumerate(self.train_dataloader): + self.dataloader.sampler.set_epoch(episode) + for i, batch in enumerate(self.dataloader): if i >= num_valid_microbatches: break - if self.eval_interval > 0 and self.eval_dataset_config is not None: - if i % self.eval_interval == 0: - eval_statistics = {} - for eval_task_name in self.eval_dataloaders: - print( - f"[P{self.producer_idx}] Evaluate episode {episode} step {i} on task {eval_task_name}" - ) - eval_results = [] - eval_statistics[eval_task_name] = torch.zeros(2, device=self.device) - for eval_batch in tqdm.tqdm( - self.eval_dataloaders[eval_task_name], disable=self.producer_idx != 0 - ): - eval_outputs = self.rollout(**eval_batch, sample_params=self.eval_sample_params) - eval_results = eval_results + [ - self.evaluation_function( - eval_outputs["input_ids"][m][n], - eval_outputs["gt_answer"][m][n], - eval_outputs["response_idx"][m][n], - tokenizer=self.tokenizer, - eval_mode=True, - ) - for m in range(eval_outputs["input_ids"].size(0)) - for n in range(eval_outputs["input_ids"].size(1)) - ] - eval_statistics[eval_task_name][0] += len( - [res for res in eval_results if res["ans_valid"] == 1] - ) - eval_statistics[eval_task_name][1] += len(eval_results) - # save eval results - result_file_name = os.path.join( - self.eval_save_dir, - f"{eval_task_name}_episode_{episode}_step_{self.consumer_global_step}.jsonl", - ) - # delete the file if it exists - safe_write_jsonl(result_file_name, eval_results) - print(f"[P{self.producer_idx}] Send eval statistics episode {episode} step {i}") - eval_statistics["consumer_global_step"] = torch.tensor( - [self.consumer_global_step], device=self.device - ) - ray_broadcast_tensor_dict( - eval_statistics, - src=0, - device=self.device, - group_name=f"sync_data_{self.producer_idx}", - ) outputs = self.rollout(**batch) print(f"[P{self.producer_idx}] Send data {[(k, v.shape) for k, v in outputs.items()]}") outputs["temperature"] = torch.tensor( - [self.model.generate_config["temperature"]] * outputs["input_ids"].size(0) + [ + ( + self.model.generate_config["temperature"] + if isinstance(self.model.generate_config.temperature, dict) + else self.model.generate_config.temperature + ) + ] + * outputs["input_ids"].size(0) ).to(outputs["input_ids"].device) outputs = pre_send(outputs) ray_broadcast_tensor_dict( @@ -238,8 +150,6 @@ def loop(self) -> None: state_dict = ray_broadcast_tensor_dict( None, self.num_producers, device=self.device, group_name="sync_model" ) - if "consumer_global_step" in state_dict: - self.consumer_global_step = state_dict.pop("consumer_global_step").item() self.load_state_dict(state_dict) del state_dict torch.cuda.empty_cache() @@ -249,12 
+159,15 @@ def loop(self) -> None: self.model.llm.wake_up() # linear annealing for 1 episode, temperature from initial to 0.9 if episode <= 0: - ratio = 1 - (len(self.train_dataloader) - i) / len(self.train_dataloader) - self.model.generate_config["temperature"] = (1 - ratio) * self.generate_config[ - "temperature" - ] + ratio * 0.9 - if hasattr(self.model, "sample_params"): - self.model.sample_params.temperature = self.model.generate_config["temperature"] + ratio = 1 - (len(self.dataloader) - i) / len(self.dataloader) + if isinstance(self.model.generate_config.temperature, dict): + self.model.generate_config["temperature"] = (1 - ratio) * self.generate_config[ + "temperature" + ] + ratio * 0.9 + else: + self.model.generate_config.temperature = (1 - ratio) * self.generate_config[ + "temperature" + ] + ratio * 0.9 @ray.remote @@ -266,7 +179,7 @@ def __init__( num_consumer_procs, num_episodes, batch_size, - train_dataset_config, + dataset_config, dataloaders_config, model_config, generate_config, @@ -275,10 +188,6 @@ def __init__( backend="transformers", num_generations: int = 8, consumer_plugin_config=None, - eval_dataset_config=None, - eval_interval=-1, # disable evaluation - evaluation_function_type="think_answer_tags", - eval_save_dir: str = "./eval", ): super().__init__( producer_idx, @@ -286,7 +195,7 @@ def __init__( num_consumer_procs, num_episodes, batch_size, - train_dataset_config, + dataset_config, dataloaders_config, model_config, generate_config, @@ -294,15 +203,8 @@ def __init__( microbatch_size, backend, consumer_plugin_config, - eval_dataset_config=eval_dataset_config, - eval_interval=eval_interval, - evaluation_function_type=evaluation_function_type, - eval_save_dir=eval_save_dir, ) self.model = self.backend_cls(model_config, generate_config, self.tokenizer, num_generations) - self.eval_generation_config = copy.deepcopy(self.model.generate_config) - self.eval_generation_config["n"] = 1 # use 1 generation for evaluation - self.eval_sample_params = SamplingParams(**self.eval_generation_config) @torch.no_grad() def rollout(self, input_ids, attention_mask, **kwargs): diff --git a/applications/ColossalChat/coati/distributed/reward/reward_fn.py b/applications/ColossalChat/coati/distributed/reward/reward_fn.py index 467a9b41489a..b68c1a92fc6c 100644 --- a/applications/ColossalChat/coati/distributed/reward/reward_fn.py +++ b/applications/ColossalChat/coati/distributed/reward/reward_fn.py @@ -1,74 +1,10 @@ import torch -from latex2sympy2_extended import NormalizationConfig -from math_verify import ExprExtractionConfig, LatexExtractionConfig, parse, verify from .reward_utils import extract_boxed_solution, extract_solution, validate_response_structure -CANNOT_PARSE_GT_ANSWER = -1 -CANNOT_PARSE_PREDICTION = -2 -SUCCESS = 1 -MATCHING_FAIL = 0 - - -def verify_math_representation(completion, gt_answer): - """ - Verify if the completion is a valid math representation of the gt_answer. 
- """ - target = ( - ExprExtractionConfig(), - LatexExtractionConfig( - normalization_config=NormalizationConfig( - nits=False, - malformed_operators=False, - basic_latex=True, - boxed="all", - units=True, - ), - boxed_match_priority=0, - ), - ) - if not isinstance(gt_answer, str) or len(gt_answer) == 0: - raise ValueError("gt_answer should be a string, please verify your training data.") - if not isinstance(completion, str) or len(completion) == 0: - return MATCHING_FAIL - try: - parsed_gt_answer = parse(gt_answer, extraction_config=target) - if len(parsed_gt_answer) == 0: - return CANNOT_PARSE_GT_ANSWER - parsed_completion = parse(completion, extraction_config=target) - if len(parsed_completion) == 0: - return CANNOT_PARSE_PREDICTION - if verify(parsed_gt_answer, parsed_completion): - return SUCCESS - else: - return MATCHING_FAIL - except Exception: - return MATCHING_FAIL - - -def verify_model_answer(decoded_final_answer, gt_answer, ans_acc, acc_score, reward): - math_verify_result = verify_math_representation(decoded_final_answer, gt_answer) - if math_verify_result == SUCCESS: - ans_acc += 1 - reward += acc_score - elif math_verify_result == CANNOT_PARSE_GT_ANSWER or math_verify_result == CANNOT_PARSE_PREDICTION: - if decoded_final_answer.strip().replace(" ", "").replace("{", "").replace("}", "").replace( - ",", "" - ) == gt_answer.strip().replace(" ", "").replace("{", "").replace("}", "").replace(",", ""): - ans_acc += 1 - if math_verify_result == CANNOT_PARSE_GT_ANSWER: - # plain text answer cannot be parsed, but is correct - reward += acc_score - else: - reward += ( - acc_score / 2 - ) # not a valid latex math representation, but the answer is correct, receive half of the score - return reward, ans_acc - def math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): tokenizer = kwargs["tokenizer"] - eval_mode = kwargs.get("eval_mode", False) soft_over_length_punishment = kwargs.get("soft_over_length_punishment", False) acc_score = 10.0 reward = torch.tensor(0.0) @@ -98,28 +34,46 @@ def math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): format_acc += 1 # Check answer accuracy, answer is considered correct if the answer is correct and the format is valid - if format_valid and final_answer is not None: - reward, ans_acc = verify_model_answer(decoded_final_answer, gt_answer, ans_acc, acc_score, reward) + if ( + format_valid + and final_answer is not None + and gt_answer.strip().replace(" ", "").lower() == final_answer.strip().replace(" ", "").lower() + ): + ans_acc += 1 + reward += acc_score reward = reward + length_reward - if not eval_mode: - return torch.tensor([reward, format_acc, ans_acc]).to(input_ids.device) + return torch.tensor([reward, format_acc, ans_acc]).to(input_ids.device) + + +def gsm8k_reward_fn(input_ids, **kwargs): + gt_answer = kwargs["gt_answer"] + tokenizer = kwargs["tokenizer"] + s, e = kwargs["response_start"], kwargs["response_end"] + reward = torch.tensor(0.0).to(input_ids.device) + if gt_answer is None: + return reward + decoded_final_answer = tokenizer.decode(input_ids[s : e + 1], skip_special_tokens=True) + final_answer, processed_str = extract_solution(decoded_final_answer) + is_valid = True + try: + int(final_answer.strip()) + except Exception: + is_valid = False + + format_valid = validate_response_structure(processed_str, kwargs["tags"]) + if not is_valid or not format_valid: + return reward else: - prompt = tokenizer.decode(input_ids[:s], skip_special_tokens=True) - return { - "prompt": prompt, - "prediction": decoded_final_answer, - 
"gold": gt_answer, - "parsed": final_answer, - "format_valid": format_acc.item(), - "ans_valid": ans_acc.item(), - } + reward += 1.0 + if gt_answer.strip().replace(" ", "").lower() == final_answer.strip().replace(" ", "").lower(): + reward = reward + 9.0 + return reward def boxed_math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): tokenizer = kwargs["tokenizer"] - eval_mode = kwargs.get("eval_mode", False) soft_over_length_punishment = kwargs.get("soft_over_length_punishment", False) format_score = 0.0 acc_score = 10.0 @@ -137,7 +91,7 @@ def boxed_math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): length_reward = ((max_length - cache_length) - res_length) / cache_length * acc_score if gt_answer is None: - return torch.tensor([reward, format_acc, ans_acc]).to(input_ids.device) + return reward decoded_final_answer = tokenizer.decode(input_ids[s : e + 1], skip_special_tokens=True) gt_answer = tokenizer.decode(gt_answer.squeeze(0), skip_special_tokens=True) @@ -149,19 +103,10 @@ def boxed_math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): reward += format_score # Check answer accuracy, answer is considered correct if the answer is correct and the format is valid - if format_valid and final_answer is not None: - reward, ans_acc = verify_model_answer(decoded_final_answer, gt_answer, ans_acc, acc_score, reward) + if format_valid and final_answer is not None and gt_answer.strip().lower() == final_answer.strip().lower(): + ans_acc += 1 + reward += acc_score reward = reward + length_reward - if not eval_mode: - return torch.tensor([reward, format_acc, ans_acc]).to(input_ids.device) - else: - prompt = tokenizer.decode(input_ids[:s], skip_special_tokens=True) - return { - "prompt": prompt, - "prediction": decoded_final_answer, - "gold": gt_answer, - "parsed": final_answer, - "format_valid": format_acc.item(), - "ans_valid": ans_acc.item(), - } + + return torch.tensor([reward, format_acc, ans_acc]).to(input_ids.device) diff --git a/applications/ColossalChat/coati/distributed/utils.py b/applications/ColossalChat/coati/distributed/utils.py index f9afcaacc21c..ce4923dc493e 100644 --- a/applications/ColossalChat/coati/distributed/utils.py +++ b/applications/ColossalChat/coati/distributed/utils.py @@ -1,10 +1,7 @@ -import json -import os from collections import defaultdict from typing import Any, Dict, List import torch -from filelock import FileLock from colossalai.shardformer.layer.loss import dist_log_prob @@ -155,13 +152,3 @@ def masked_sum(tensor: torch.Tensor, mask: torch.Tensor, dim: int = 1) -> torch. 
""" tensor = tensor * mask return tensor.sum(dim=dim) - - -def safe_write_jsonl(file_path, data): - with FileLock(file_path + ".lock"): - # Ensure file exists - os.makedirs(os.path.dirname(file_path), exist_ok=True) - with open(file_path, "a", encoding="utf8") as f: - for entry in data: - json_line = json.dumps(entry, ensure_ascii=False) - f.write(json_line + "\n") diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index b2fafd42e31f..788e60c2edac 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -1,5 +1,4 @@ import argparse -import json import os import ray @@ -9,16 +8,7 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("-m", "--model", type=str, default="Qwen/Qwen2.5-7B") - parser.add_argument("-d", "--dataset", type=str, default="data_train.jsonl") - parser.add_argument( - "-ed", - "--eval-dataset", - type=str, - default='{"eval task name":"data_eval.jsonl"}', - help="Evaluation dataset for each task, please use json format to specify the dataset for each task. \ - For example: {'task1':'data_eval_task1.jsonl', 'task2':'data_eval_task2.jsonl'}, the jsonl file should be in the same format as the training dataset. \ - The key is the task name, and the value is the path to the jsonl file", - ) + parser.add_argument("-d", "--dataset", type=str, default="data.jsonl") parser.add_argument("-p", "--project", type=str, default="GRPO", help="Project name.") parser.add_argument("-e", "--num-episodes", type=int, default=1, help="Number of episodes to train.") @@ -104,14 +94,11 @@ choices=["think_answer_tags", "boxed"], help="Reward type for GRPO.", ) - parser.add_argument("-ei", "--eval-interval", type=int, default=100, help="Interval for evaluation.") # Logging/Checkpointing parameters parser.add_argument("-si", "--save-interval", type=int, default=100, help="Interval for saving checkpoints.") parser.add_argument("-sd", "--save-dir", type=str, default="./model", help="Directory for saving checkpoints.") - parser.add_argument( - "-esd", "--eval-save-dir", type=str, default="./eval", help="Directory for saving evaluation results." 
- ) + args = parser.parse_args() if args.train_minibatch_size is None: @@ -221,7 +208,7 @@ inference_microbatch_size=args.inference_microbatch_size, train_batch_size=args.train_batch_size, train_minibatch_size=args.train_minibatch_size, - train_dataset_config={ + dataset_config={ "path": args.dataset, "max_length": args.max_prompt_tokens, "system_prompt": args.system_prompt, @@ -251,10 +238,4 @@ project_name=args.project, save_interval=args.save_interval, save_dir=os.path.join(args.save_dir, args.project.replace(" ", "_")), - eval_dataset_config={ - k: {"path": v, "max_length": args.max_prompt_tokens, "system_prompt": args.system_prompt} - for k, v in json.loads(args.eval_dataset).items() - }, - eval_interval=args.eval_interval, - eval_save_dir=os.path.join(args.eval_save_dir, args.project.replace(" ", "_")), ) From b920af427bb47803482ab8a41c4517b2b07b7f59 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Tue, 13 May 2025 16:51:27 +0800 Subject: [PATCH 052/100] update pad seq (#6303) Co-authored-by: Tong Li --- .../coati/distributed/consumer.py | 5 +---- .../coati/distributed/inference_backend.py | 2 +- .../coati/distributed/producer.py | 2 +- .../ColossalChat/coati/distributed/utils.py | 22 ------------------- 4 files changed, 3 insertions(+), 28 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index 1cebcb40eacb..9503d65eb34d 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -16,7 +16,7 @@ from colossalai.utils import get_current_device from .comm import ray_broadcast_tensor_dict -from .utils import bind_batch, pad_batch, post_recv, unbind_batch +from .utils import bind_batch, post_recv, unbind_batch class BaseConsumer: @@ -125,9 +125,6 @@ def loop(self) -> None: batches = self.buffer[ self.dp_rank * self.minibatch_size : (self.dp_rank + 1) * self.minibatch_size ] - batch = pad_batch( - batches - ) # when `imbs` is smaller than `tMbs`, samples may have differ in size, need to pad before stacking batch = bind_batch(batches) batch = post_recv(batch) loss, num_excessive_prompts = self.step(i, pbar, **batch) diff --git a/applications/ColossalChat/coati/distributed/inference_backend.py b/applications/ColossalChat/coati/distributed/inference_backend.py index 7d32cd52a41e..8ff4b15bfd5d 100644 --- a/applications/ColossalChat/coati/distributed/inference_backend.py +++ b/applications/ColossalChat/coati/distributed/inference_backend.py @@ -236,7 +236,7 @@ def generate(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwar log_probs.append(p) # pad them - max_len = max(out_len) + max_len = self.generate_config.max_tokens action_mask = torch.ones(len(out_tokens), max_len, dtype=attention_mask.dtype) for i, new_token_ids in enumerate(out_tokens): diff --git a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py index a2d675870fc2..ffe3a44285ec 100644 --- a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -79,7 +79,7 @@ def __init__( else: raise ValueError(f"Unexpected backend {backend}") - self.consumer_pp_size = consumer_plugin_config["pp_size"] # consumer pp size + self.consumer_pp_size = consumer_plugin_config.get("pp_size", 1) # consumer pp size def setup(self) -> None: cc.init_collective_group(1 + self.num_consumer_procs, 0, group_name=f"sync_data_{self.producer_idx}") diff --git 
a/applications/ColossalChat/coati/distributed/utils.py b/applications/ColossalChat/coati/distributed/utils.py index ce4923dc493e..5f7879669d08 100644 --- a/applications/ColossalChat/coati/distributed/utils.py +++ b/applications/ColossalChat/coati/distributed/utils.py @@ -1,4 +1,3 @@ -from collections import defaultdict from typing import Any, Dict, List import torch @@ -27,27 +26,6 @@ def bind_batch(batches: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor return batch -def pad_batch(batches: List[Dict[str, torch.Tensor]], tokenizer: Any = None) -> List[Dict[str, torch.Tensor]]: - max_len = defaultdict(int) - for sample in batches: - for k in sample: - if k in ["input_ids", "attention_mask", "action_log_probs", "action_mask"]: - max_len[k] = max(max_len[k], sample[k].size(-1)) - for idx, sample in enumerate(batches): - for k in sample: - if k in ["input_ids", "attention_mask", "action_log_probs", "action_mask"]: - # right pad with 0s - if k in ["attention_mask", "action_mask"]: - batches[idx][k] = torch.nn.functional.pad( - batches[idx][k], (0, max_len[k] - batches[idx][k].size(-1)), "constant", False - ) - else: - batches[idx][k] = torch.nn.functional.pad( - batches[idx][k], (0, max_len[k] - batches[idx][k].size(-1)), "constant", 0 - ) - return batches - - def pre_send(batch: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: # compress mask to save bandwidth if "attention_mask" in batch: From 47a7dc7142586859c73e1f07bf06ee9829e09adf Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Wed, 30 Apr 2025 18:13:40 +0800 Subject: [PATCH 053/100] Support evaluation during training --- .gitignore | 1 + .../coati/distributed/consumer.py | 28 +++- .../coati/distributed/grpo_consumer.py | 3 + .../coati/distributed/inference_backend.py | 10 +- .../ColossalChat/coati/distributed/launch.py | 16 +- .../coati/distributed/producer.py | 149 +++++++++++++++--- .../coati/distributed/reward/reward_fn.py | 54 +++---- .../ColossalChat/coati/distributed/utils.py | 13 ++ applications/ColossalChat/rl_example.py | 25 ++- 9 files changed, 234 insertions(+), 65 deletions(-) diff --git a/.gitignore b/.gitignore index 533450a7cce1..d48035a5b6df 100644 --- a/.gitignore +++ b/.gitignore @@ -165,3 +165,4 @@ applications/ColossalChat/logs applications/ColossalChat/tests/logs applications/ColossalChat/wandb applications/ColossalChat/model +applications/ColossalChat/eval diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index 9503d65eb34d..620ab6ff63b2 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -36,6 +36,7 @@ def __init__( minibatch_size: int = 1, save_interval: int = 100, save_dir: str = "./model", + eval_interval: int = -1, ): self.num_producers = num_producers self.num_episodes = num_episodes @@ -51,6 +52,7 @@ def __init__( self.save_dir = save_dir assert batch_size % minibatch_size == 0, "batch_size should be divisible by microbatch_size" self.num_microbatches = batch_size // minibatch_size + self.eval_interval = eval_interval self.model_config = model_config self.plugin_config = plugin_config @@ -92,8 +94,10 @@ def setup(self) -> None: if self.rank == 0: cc.init_collective_group(self.num_producers + 1, self.num_producers, group_name="sync_model") - self.buffer = [] + for i in range(self.num_producers): + cc.init_collective_group(self.world_size + 1, self.rank + 1, group_name=f"sync_eval_statistics_{i}") + self.buffer = [] self.recv_cnt = 0 def 
state_dict(self) -> Dict[str, torch.Tensor]: @@ -110,6 +114,24 @@ def loop(self) -> None: with tqdm(range(self.num_update_per_episode), desc=f"Episode {episode}", disable=self.rank != 0) as pbar: for step in pbar: i = 0 + if self.eval_interval > 0 and step % self.eval_interval == 0: + eval_statistics = None + for r in range(self.num_producers): + print(f"[T{dist.get_rank()}] Recv eval result episode {episode} step {step} from {r}") + local_eval_result = ray_broadcast_tensor_dict( + None, src=0, device=self.device, group_name=f"sync_eval_statistics_{r}" + ) + if eval_statistics is None: + eval_statistics = local_eval_result + else: + eval_statistics = { + k: eval_statistics[k] + local_eval_result[k] for k in eval_statistics + } + eval_statistics = {k: (v[0] / v[1]).item() for k, v in eval_statistics.items()} + if dist.get_rank() == 0: + if hasattr(self, "wandb_run") and hasattr(self, "global_step"): + self.wandb_run.log(eval_statistics, step=self.global_step) + print(f"Eval statistics: {eval_statistics}") for _ in range(self.num_recv_per_update): # receive data from producers for r in range(self.num_producers): @@ -192,6 +214,7 @@ def __init__( minibatch_size=1, save_interval: int = 100, save_dir="./model", + eval_interval: int = -1, ): super().__init__( num_producers, @@ -206,6 +229,9 @@ def __init__( model_config, plugin_config, minibatch_size, + save_interval, + save_dir, + eval_interval, ) path = model_config.pop("path") self.model = AutoModelForCausalLM.from_pretrained(path, **model_config) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 877ff98ec55f..0336125d17af 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -40,6 +40,7 @@ def __init__( project_name=None, save_interval: int = 100, save_dir="./model", + eval_interval: int = -1, ): print(f"Using GRPO config: {grpo_config}") if grpo_config.get("loss_variation", "sample_level") == "token_level": @@ -72,6 +73,7 @@ def __init__( minibatch_size, save_interval=save_interval, save_dir=save_dir, + eval_interval=eval_interval, ) path = model_config.pop("path") self.policy_model = AutoModelForCausalLM.from_pretrained(path, **model_config) @@ -528,4 +530,5 @@ def state_dict(self): self.policy_model._force_wait_all_gather() model = self.policy_model.unwrap() state_dict = model.state_dict() + state_dict["consumer_global_step"] = torch.tensor([self.global_step], device=self.device) return state_dict diff --git a/applications/ColossalChat/coati/distributed/inference_backend.py b/applications/ColossalChat/coati/distributed/inference_backend.py index 8ff4b15bfd5d..64dfe5cfd531 100644 --- a/applications/ColossalChat/coati/distributed/inference_backend.py +++ b/applications/ColossalChat/coati/distributed/inference_backend.py @@ -205,7 +205,8 @@ def __init__( generate_config = generate_config.copy() generate_config.update(self.FORCE_GENERATE_CONFIG) generate_config.update({"n": num_generations}) - self.generate_config = SamplingParams(**generate_config) + self.generate_config = generate_config + self.sample_params = SamplingParams(**generate_config) self.model_config = model_config self.tokenizer = tokenizer self.num_generations = num_generations @@ -219,8 +220,9 @@ def generate(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwar micro_batch_input_ids_no_padding = [ micro_batch_input_ids[i][first_non_padding_token_idx[i] :] for i in 
range(micro_batch_size) ] + sample_params = kwargs.get("sample_params", self.sample_params) outputs = self.llm.generate( - prompt_token_ids=micro_batch_input_ids_no_padding, sampling_params=self.generate_config, use_tqdm=False + prompt_token_ids=micro_batch_input_ids_no_padding, sampling_params=sample_params, use_tqdm=False ) out_tokens = [] out_len = [] @@ -266,11 +268,11 @@ def generate(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwar "response_idx": response_idx, } - data = {k: v.view(micro_batch_size, self.num_generations, v.size(-1)) for k, v in data.items()} + data = {k: v.view(micro_batch_size, -1, v.size(-1)) for k, v in data.items()} if "gt_answer" in kwargs: # repeat gt_answer for each prompt. - data["gt_answer"] = kwargs["gt_answer"].repeat_interleave(self.num_generations, dim=1) + data["gt_answer"] = kwargs["gt_answer"].repeat_interleave(data["input_ids"].size(1), dim=1) data = {k: v.to(get_current_device()) for k, v in data.items()} return data diff --git a/applications/ColossalChat/coati/distributed/launch.py b/applications/ColossalChat/coati/distributed/launch.py index a346d1d4fae9..d1b6158e5b2b 100644 --- a/applications/ColossalChat/coati/distributed/launch.py +++ b/applications/ColossalChat/coati/distributed/launch.py @@ -34,7 +34,7 @@ def launch_distributed( inference_microbatch_size: int, train_batch_size: int, train_minibatch_size: int, - dataset_config: Dict[str, Any], + train_dataset_config: Dict[str, Any], dataloaders_config: Dict[str, Any], inference_model_config: Dict[str, Any], generate_config: Dict[str, Any], @@ -50,6 +50,9 @@ def launch_distributed( project_name: Optional[str] = None, save_interval: int = 100, save_dir: str = "./model", + eval_dataset_config: Optional[Dict[str, Any]] = None, + eval_interval: int = 100, + eval_save_dir: Optional[str] = None, ): if core_algo not in ALGO_MAP: @@ -60,9 +63,9 @@ def launch_distributed( train_dp_size = get_dp_size_fast(num_consumer_procs, plugin_config) assert (inference_batch_size * num_producers) % (train_batch_size * train_dp_size) == 0 - dataset_path = dataset_config["path"] + dataset_path = train_dataset_config["path"] num_samples = get_jsonl_size_fast(dataset_path) - global_inference_batch_size = inference_batch_size * num_producers + global_inference_batch_size = inference_batch_size * num_producers # TODO: this doesn't support TP on producer num_update_per_episode = num_samples // global_inference_batch_size num_recv_per_update = inference_batch_size // inference_microbatch_size @@ -74,7 +77,7 @@ def launch_distributed( num_consumer_procs=num_consumer_procs, num_episodes=num_episodes, batch_size=inference_batch_size, - dataset_config=dataset_config, + train_dataset_config=train_dataset_config, dataloaders_config=dataloaders_config, model_config=inference_model_config, generate_config=generate_config, @@ -83,6 +86,10 @@ def launch_distributed( backend=inference_backend, num_generations=num_generations, consumer_plugin_config=plugin_config, + eval_dataset_config=eval_dataset_config, + eval_interval=eval_interval, + evaluation_function_type=grpo_config["reward_fn_type"], + eval_save_dir=eval_save_dir, ) procs.append(producer) generate_config_consumer = copy.deepcopy(generate_config) @@ -111,6 +118,7 @@ def launch_distributed( project_name=project_name, save_interval=save_interval, save_dir=save_dir, + eval_interval=eval_interval, ) procs.append(consumer) ray.get([p.setup.remote() for p in procs]) diff --git a/applications/ColossalChat/coati/distributed/producer.py 
b/applications/ColossalChat/coati/distributed/producer.py index ffe3a44285ec..483ad74b383b 100644 --- a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -1,9 +1,13 @@ +import copy +import os from typing import Any, Dict, Optional import ray import ray.util.collective as cc import torch +import tqdm from coati.dataset.loader import RawConversationDataset +from coati.distributed.reward.reward_fn import boxed_math_reward_fn, math_reward_fn from torch.utils.data import DataLoader, DistributedSampler from transformers import AutoTokenizer @@ -11,7 +15,12 @@ from .comm import ray_broadcast_tensor_dict from .inference_backend import BACKEND_MAP -from .utils import pre_send +from .utils import pre_send, safe_write_jsonl + +try: + from vllm import SamplingParams +except ImportError: + LLM = None class BaseProducer: @@ -22,7 +31,7 @@ def __init__( num_consumer_procs: int, num_episodes: int, batch_size: int, - dataset_config: Dict[str, Any], + train_dataset_config: Dict[str, Any], dataloaders_config: Dict[str, Any], model_config: Dict[str, Any], generate_config: Dict[str, Any], @@ -30,6 +39,10 @@ def __init__( microbatch_size: int = 1, backend: str = "transformers", consumer_plugin_config: Dict[str, Any] = None, + eval_dataset_config=None, + eval_interval=-1, # disable evaluation + evaluation_function_type="think_answer_tags", + eval_save_dir: str = "./eval", ): self.producer_idx = producer_idx self.num_producers = num_producers @@ -40,10 +53,17 @@ def __init__( assert batch_size % microbatch_size == 0 self.num_microbatches = batch_size // microbatch_size - self.dataset_config = dataset_config + self.train_dataset_config = train_dataset_config self.model_config = model_config self.generate_config = generate_config self.tokenizer_config = tokenizer_config + self.consumer_plugin_config = consumer_plugin_config + self.eval_interval = eval_interval + self.eval_save_dir = eval_save_dir + self.consumer_global_step = 0 + + if os.path.exists(self.eval_save_dir): + raise ValueError(f"Eval save dir {self.eval_save_dir} already exists. 
Please delete it or change the name.") # init tokenizer if tokenizer_config is None: @@ -55,13 +75,13 @@ def __init__( self.tokenizer.padding_side = "left" # init dataloader - dataset_path = dataset_config.pop("path") - self.dataset = RawConversationDataset(self.tokenizer, dataset_path, **dataset_config) - self.dataloader = DataLoader( - self.dataset, + train_dataset_path = train_dataset_config.pop("path") + self.train_dataset = RawConversationDataset(self.tokenizer, train_dataset_path, **train_dataset_config) + self.train_dataloader = DataLoader( + self.train_dataset, batch_size=microbatch_size, sampler=DistributedSampler( - self.dataset, + self.train_dataset, num_replicas=num_producers, rank=producer_idx, shuffle=True, @@ -71,6 +91,36 @@ def __init__( num_workers=4, drop_last=True, ) + + self.eval_dataset_config = eval_dataset_config + if self.eval_dataset_config is not None: + self.eval_dataloaders = {} + for eval_task_name in self.eval_dataset_config: + eval_dataset_path = eval_dataset_config[eval_task_name].pop("path") + eval_dataset = RawConversationDataset( + self.tokenizer, eval_dataset_path, **eval_dataset_config[eval_task_name] + ) + print(f"[P{self.producer_idx}] eval dataset {eval_task_name} size: {len(eval_dataset)}") + self.eval_dataloaders[eval_task_name] = DataLoader( + eval_dataset, + batch_size=microbatch_size, + sampler=DistributedSampler( + eval_dataset, + num_replicas=num_producers, + rank=producer_idx, + shuffle=False, + drop_last=False, + seed=42, + ), + ) + if evaluation_function_type == "think_answer_tags": + self.evaluation_function = math_reward_fn + elif evaluation_function_type == "boxed": + self.evaluation_function = boxed_math_reward_fn + else: + raise ValueError(f"Unknown evaluation function type {evaluation_function_type}") + else: + raise ValueError("eval_dataset_config is not defined") self.device = get_current_device() # init backend @@ -88,6 +138,7 @@ def setup(self) -> None: cc.init_collective_group(self.num_producers + 1, self.producer_idx, group_name=f"sync_model_{i}") else: cc.init_collective_group(self.num_producers + 1, self.producer_idx, group_name="sync_model") + cc.init_collective_group(1 + self.num_consumer_procs, 0, group_name=f"sync_eval_statistics_{self.producer_idx}") def rollout(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwargs) -> Dict[str, torch.Tensor]: raise NotImplementedError @@ -96,29 +147,64 @@ def load_state_dict(self, state_dict: Dict[str, torch.Tensor]) -> None: raise NotImplementedError def loop(self) -> None: - num_update_per_episode = len(self.dataloader) // self.num_microbatches + num_update_per_episode = len(self.train_dataloader) // self.num_microbatches num_valid_microbatches = num_update_per_episode * self.num_microbatches print( - f"[P{self.producer_idx}] num_valid_microbatches {num_valid_microbatches}, nmb: {self.num_microbatches}, dl: {len(self.dataloader)}" + f"[P{self.producer_idx}] num_valid_microbatches {num_valid_microbatches}, nmb: {self.num_microbatches}, dl: {len(self.train_dataloader)}" ) for episode in range(self.num_episodes): - self.dataloader.sampler.set_epoch(episode) - for i, batch in enumerate(self.dataloader): + self.train_dataloader.sampler.set_epoch(episode) + for i, batch in enumerate(self.train_dataloader): if i >= num_valid_microbatches: break + if self.eval_interval > 0 and self.eval_dataset_config is not None: + if i % self.eval_interval == 0: + eval_statistics = {} + for eval_task_name in self.eval_dataloaders: + print( + f"[P{self.producer_idx}] Evaluate episode {episode} 
step {i} on task {eval_task_name}" + ) + eval_results = [] + eval_statistics[eval_task_name] = torch.zeros(2, device=self.device) + for eval_batch in tqdm.tqdm( + self.eval_dataloaders[eval_task_name], disable=self.producer_idx != 0 + ): + eval_outputs = self.rollout(**eval_batch, sample_params=self.eval_sample_params) + eval_results = eval_results + [ + self.evaluation_function( + eval_outputs["input_ids"][m][n], + eval_outputs["gt_answer"][m][n], + eval_outputs["response_idx"][m][n], + tokenizer=self.tokenizer, + eval_mode=True, + ) + for m in range(eval_outputs["input_ids"].size(0)) + for n in range(eval_outputs["input_ids"].size(1)) + ] + eval_statistics[eval_task_name][0] += len( + [res for res in eval_results if res["ans_valid"] == 1] + ) + eval_statistics[eval_task_name][1] += len(eval_results) + # save eval results + result_file_name = os.path.join( + self.eval_save_dir, + f"{eval_task_name}_episode_{episode}_step_{self.consumer_global_step}.jsonl", + ) + # delete the file if it exists + safe_write_jsonl(result_file_name, eval_results) + print(f"[P{self.producer_idx}] Send eval statistics episode {episode} step {i}") + ray_broadcast_tensor_dict( + eval_statistics, + src=0, + device=self.device, + group_name=f"sync_eval_statistics_{self.producer_idx}", + ) outputs = self.rollout(**batch) print(f"[P{self.producer_idx}] Send data {[(k, v.shape) for k, v in outputs.items()]}") outputs["temperature"] = torch.tensor( - [ - ( - self.model.generate_config["temperature"] - if isinstance(self.model.generate_config.temperature, dict) - else self.model.generate_config.temperature - ) - ] - * outputs["input_ids"].size(0) + [self.model.generate_config["temperature"]] * outputs["input_ids"].size(0) ).to(outputs["input_ids"].device) outputs = pre_send(outputs) ray_broadcast_tensor_dict( @@ -150,6 +236,8 @@ def loop(self) -> None: state_dict = ray_broadcast_tensor_dict( None, self.num_producers, device=self.device, group_name="sync_model" ) + if "consumer_global_step" in state_dict: + self.consumer_global_step = state_dict.pop("consumer_global_step").item() self.load_state_dict(state_dict) del state_dict torch.cuda.empty_cache() @@ -159,7 +247,7 @@ def loop(self) -> None: self.model.llm.wake_up() # linear annealing for 1 episode, temperature from initial to 0.9 if episode <= 0: - ratio = 1 - (len(self.dataloader) - i) / len(self.dataloader) + ratio = 1 - (len(self.train_dataloader) - i) / len(self.train_dataloader) if isinstance(self.model.generate_config.temperature, dict): self.model.generate_config["temperature"] = (1 - ratio) * self.generate_config[ "temperature" @@ -179,7 +267,7 @@ def __init__( num_consumer_procs, num_episodes, batch_size, - dataset_config, + train_dataset_config, dataloaders_config, model_config, generate_config, @@ -188,6 +276,10 @@ def __init__( backend="transformers", num_generations: int = 8, consumer_plugin_config=None, + eval_dataset_config=None, + eval_interval=-1, # disable evaluation + evaluation_function_type="think_answer_tags", + eval_save_dir: str = "./eval", ): super().__init__( producer_idx, @@ -195,7 +287,7 @@ def __init__( num_consumer_procs, num_episodes, batch_size, - dataset_config, + train_dataset_config, dataloaders_config, model_config, generate_config, @@ -203,14 +295,21 @@ def __init__( microbatch_size, backend, consumer_plugin_config, + eval_dataset_config=eval_dataset_config, + eval_interval=eval_interval, + evaluation_function_type=evaluation_function_type, + eval_save_dir=eval_save_dir, ) self.model = self.backend_cls(model_config, 
generate_config, self.tokenizer, num_generations) + self.eval_generation_config = copy.deepcopy(self.model.generate_config) + self.eval_generation_config["n"] = 1 # use 1 generation for evaluation + self.eval_sample_params = SamplingParams(**self.eval_generation_config) @torch.no_grad() def rollout(self, input_ids, attention_mask, **kwargs): rollouts = self.model.generate(input_ids, attention_mask, **kwargs) - if self.producer_idx == 1: - print("Rollout example:\n", self.tokenizer.decode(rollouts["input_ids"][0][0], skip_special_tokens=True)) + # if self.producer_idx == 1: + # print("Rollout example:\n", self.tokenizer.decode(rollouts["input_ids"][0][0], skip_special_tokens=True)) return rollouts diff --git a/applications/ColossalChat/coati/distributed/reward/reward_fn.py b/applications/ColossalChat/coati/distributed/reward/reward_fn.py index b68c1a92fc6c..14d340dc4932 100644 --- a/applications/ColossalChat/coati/distributed/reward/reward_fn.py +++ b/applications/ColossalChat/coati/distributed/reward/reward_fn.py @@ -5,6 +5,7 @@ def math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): tokenizer = kwargs["tokenizer"] + eval_mode = kwargs.get("eval_mode", False) soft_over_length_punishment = kwargs.get("soft_over_length_punishment", False) acc_score = 10.0 reward = torch.tensor(0.0) @@ -44,36 +45,23 @@ def math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): reward = reward + length_reward - return torch.tensor([reward, format_acc, ans_acc]).to(input_ids.device) - - -def gsm8k_reward_fn(input_ids, **kwargs): - gt_answer = kwargs["gt_answer"] - tokenizer = kwargs["tokenizer"] - s, e = kwargs["response_start"], kwargs["response_end"] - reward = torch.tensor(0.0).to(input_ids.device) - if gt_answer is None: - return reward - decoded_final_answer = tokenizer.decode(input_ids[s : e + 1], skip_special_tokens=True) - final_answer, processed_str = extract_solution(decoded_final_answer) - is_valid = True - try: - int(final_answer.strip()) - except Exception: - is_valid = False - - format_valid = validate_response_structure(processed_str, kwargs["tags"]) - if not is_valid or not format_valid: - return reward + if not eval_mode: + return torch.tensor([reward, format_acc, ans_acc]).to(input_ids.device) else: - reward += 1.0 - if gt_answer.strip().replace(" ", "").lower() == final_answer.strip().replace(" ", "").lower(): - reward = reward + 9.0 - return reward + prompt = tokenizer.decode(input_ids[:s], skip_special_tokens=True) + return { + "prompt": prompt, + "prediction": decoded_final_answer, + "gold": gt_answer, + "parsed": final_answer, + "format_valid": format_acc.item(), + "ans_valid": ans_acc.item(), + } def boxed_math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): tokenizer = kwargs["tokenizer"] + eval_mode = kwargs.get("eval_mode", False) soft_over_length_punishment = kwargs.get("soft_over_length_punishment", False) format_score = 0.0 acc_score = 10.0 @@ -91,7 +79,7 @@ def boxed_math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): length_reward = ((max_length - cache_length) - res_length) / cache_length * acc_score if gt_answer is None: - return reward + return torch.tensor([reward, format_acc, ans_acc]).to(input_ids.device) decoded_final_answer = tokenizer.decode(input_ids[s : e + 1], skip_special_tokens=True) gt_answer = tokenizer.decode(gt_answer.squeeze(0), skip_special_tokens=True) @@ -108,5 +96,15 @@ def boxed_math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): reward += acc_score reward = reward + length_reward - - return 
torch.tensor([reward, format_acc, ans_acc]).to(input_ids.device) + if not eval_mode: + return torch.tensor([reward, format_acc, ans_acc]).to(input_ids.device) + else: + prompt = tokenizer.decode(input_ids[:s], skip_special_tokens=True) + return { + "prompt": prompt, + "prediction": decoded_final_answer, + "gold": gt_answer, + "parsed": final_answer, + "format_valid": format_acc.item(), + "ans_valid": ans_acc.item(), + } diff --git a/applications/ColossalChat/coati/distributed/utils.py b/applications/ColossalChat/coati/distributed/utils.py index 5f7879669d08..fb2a0dfd3e90 100644 --- a/applications/ColossalChat/coati/distributed/utils.py +++ b/applications/ColossalChat/coati/distributed/utils.py @@ -1,6 +1,9 @@ +import json +import os from typing import Any, Dict, List import torch +from filelock import FileLock from colossalai.shardformer.layer.loss import dist_log_prob @@ -130,3 +133,13 @@ def masked_sum(tensor: torch.Tensor, mask: torch.Tensor, dim: int = 1) -> torch. """ tensor = tensor * mask return tensor.sum(dim=dim) + + +def safe_write_jsonl(file_path, data): + with FileLock(file_path + ".lock"): + # Ensure file exists + os.makedirs(os.path.dirname(file_path), exist_ok=True) + with open(file_path, "a", encoding="utf8") as f: + for entry in data: + json_line = json.dumps(entry, ensure_ascii=False) + f.write(json_line + "\n") diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index 788e60c2edac..b2fafd42e31f 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -1,4 +1,5 @@ import argparse +import json import os import ray @@ -8,7 +9,16 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("-m", "--model", type=str, default="Qwen/Qwen2.5-7B") - parser.add_argument("-d", "--dataset", type=str, default="data.jsonl") + parser.add_argument("-d", "--dataset", type=str, default="data_train.jsonl") + parser.add_argument( + "-ed", + "--eval-dataset", + type=str, + default='{"eval task name":"data_eval.jsonl"}', + help="Evaluation dataset for each task, please use json format to specify the dataset for each task. \ + For example: {'task1':'data_eval_task1.jsonl', 'task2':'data_eval_task2.jsonl'}, the jsonl file should be in the same format as the training dataset. \ + The key is the task name, and the value is the path to the jsonl file", + ) parser.add_argument("-p", "--project", type=str, default="GRPO", help="Project name.") parser.add_argument("-e", "--num-episodes", type=int, default=1, help="Number of episodes to train.") @@ -94,11 +104,14 @@ choices=["think_answer_tags", "boxed"], help="Reward type for GRPO.", ) + parser.add_argument("-ei", "--eval-interval", type=int, default=100, help="Interval for evaluation.") # Logging/Checkpointing parameters parser.add_argument("-si", "--save-interval", type=int, default=100, help="Interval for saving checkpoints.") parser.add_argument("-sd", "--save-dir", type=str, default="./model", help="Directory for saving checkpoints.") - + parser.add_argument( + "-esd", "--eval-save-dir", type=str, default="./eval", help="Directory for saving evaluation results." 
+ ) args = parser.parse_args() if args.train_minibatch_size is None: @@ -208,7 +221,7 @@ inference_microbatch_size=args.inference_microbatch_size, train_batch_size=args.train_batch_size, train_minibatch_size=args.train_minibatch_size, - dataset_config={ + train_dataset_config={ "path": args.dataset, "max_length": args.max_prompt_tokens, "system_prompt": args.system_prompt, @@ -238,4 +251,10 @@ project_name=args.project, save_interval=args.save_interval, save_dir=os.path.join(args.save_dir, args.project.replace(" ", "_")), + eval_dataset_config={ + k: {"path": v, "max_length": args.max_prompt_tokens, "system_prompt": args.system_prompt} + for k, v in json.loads(args.eval_dataset).items() + }, + eval_interval=args.eval_interval, + eval_save_dir=os.path.join(args.eval_save_dir, args.project.replace(" ", "_")), ) From aca547623f9d23bda7e0881d65fc7fe8affbbde8 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Wed, 14 May 2025 16:40:35 +0800 Subject: [PATCH 054/100] [feat] Support prompt level dynamic (#6300) * adjust to dynamic prompt bs * remove debug * update pad seq (#6303) Co-authored-by: Tong Li * adjust to dynamic prompt bs * remove debug * fix dp issue * fix * fix default settings --------- Co-authored-by: Tong Li --- .../coati/distributed/consumer.py | 69 ++++++----- .../coati/distributed/grpo_consumer.py | 107 +++++++++--------- .../ColossalChat/coati/trainer/utils.py | 26 +++++ applications/ColossalChat/rl_example.py | 10 +- 4 files changed, 121 insertions(+), 91 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index 9503d65eb34d..38b31398d125 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -107,9 +107,14 @@ def loop(self) -> None: f"Consumer{self.rank} num_update: {self.num_update_per_episode}, num_recv: {self.num_recv_per_update}, nmb: {self.num_microbatches}" ) for episode in range(self.num_episodes): - with tqdm(range(self.num_update_per_episode), desc=f"Episode {episode}", disable=self.rank != 0) as pbar: + with tqdm( + range(self.num_update_per_episode), + desc=f"Episode {episode} with rollout step(s)", + disable=self.rank != 0, + ) as pbar: for step in pbar: i = 0 + allow_sync_model = False for _ in range(self.num_recv_per_update): # receive data from producers for r in range(self.num_producers): @@ -127,15 +132,15 @@ def loop(self) -> None: ] batch = bind_batch(batches) batch = post_recv(batch) - loss, num_excessive_prompts = self.step(i, pbar, **batch) - self.buffer = ( - self.buffer[ - (self.dp_rank + 1) * self.minibatch_size - - num_excessive_prompts : (self.dp_rank + 1) * self.minibatch_size - ] - + self.buffer[self.dp_size * self.minibatch_size :] - ) + loss, excessive_prompts_idx = self.step(i, pbar, **batch) + + if excessive_prompts_idx is not None: + excessive_prompts = [self.buffer[idx] for idx in excessive_prompts_idx] + self.buffer = excessive_prompts + self.buffer[self.dp_size * self.minibatch_size :] + else: + self.buffer = self.buffer[self.dp_size * self.minibatch_size :] if loss is not None: + allow_sync_model = True pbar.set_postfix({"loss": loss}) i += 1 if self.lr_scheduler is not None: @@ -149,29 +154,31 @@ def loop(self) -> None: print(f"Saved model checkpoint at step {step + 1} in folder {save_path}") if episode != self.num_episodes - 1 or step != self.num_update_per_episode - 1: - if self.pp_size > 1: - print( - f"[T{dist.get_rank()}] Sync model PP stage {self.pp_rank} episode {episode} 
step {step}" - ) - else: - print(f"[T{dist.get_rank()}] Sync model episode {episode} step {step}") - torch.cuda.empty_cache() - state_dict = self.state_dict() - if self.pp_size > 1: - if self.tp_rank == 0 and self.dp_rank == 0: - ray_broadcast_tensor_dict( - state_dict, - src=self.num_producers, - device=self.device, - group_name=f"sync_model_{self.pp_rank}", - ) - else: - if self.rank == 0: - ray_broadcast_tensor_dict( - state_dict, src=self.num_producers, device=self.device, group_name="sync_model" + if allow_sync_model: + if self.pp_size > 1: + print( + f"[T{dist.get_rank()}] Sync model PP stage {self.pp_rank} episode {episode} step {step}" ) - del state_dict - torch.cuda.empty_cache() + else: + print(f"[T{dist.get_rank()}] Sync model episode {episode} step {step}") + torch.cuda.empty_cache() + state_dict = self.state_dict() + if self.pp_size > 1: + if self.tp_rank == 0 and self.dp_rank == 0: + ray_broadcast_tensor_dict( + state_dict, + src=self.num_producers, + device=self.device, + group_name=f"sync_model_{self.pp_rank}", + ) + else: + if self.rank == 0: + ray_broadcast_tensor_dict( + state_dict, src=self.num_producers, device=self.device, group_name="sync_model" + ) + del state_dict + torch.cuda.empty_cache() + allow_sync_model = False @ray.remote diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 877ff98ec55f..31b6876397fc 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -1,4 +1,3 @@ -import warnings from contextlib import nullcontext from typing import Any, Optional @@ -10,7 +9,7 @@ from coati.distributed.reward.reward_fn import boxed_math_reward_fn, math_reward_fn from coati.distributed.reward.verifiable_reward import VerifiableReward from coati.distributed.utils import calc_action_log_probs -from coati.trainer.utils import all_reduce_mean, all_reduce_sum +from coati.trainer.utils import all_gather_tensors, all_reduce_mean, all_reduce_sum from transformers import AutoModelForCausalLM, AutoTokenizer from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR @@ -42,13 +41,6 @@ def __init__( save_dir="./model", ): print(f"Using GRPO config: {grpo_config}") - if grpo_config.get("loss_variation", "sample_level") == "token_level": - if batch_size != minibatch_size: - warnings.warn( - f"Applied token_level loss, force overwrite mini-batch-size with batch-size: mini-batch-size: {minibatch_size}->{batch_size}", - UserWarning, - ) - minibatch_size = batch_size if ( plugin_config.get("pp_size", 1) > 1 and "num_microbatches" not in plugin_config @@ -90,6 +82,7 @@ def __init__( self.grpo_config = grpo_config self.project_name = project_name self.effective_sample_count = 0 + self.effective_prompt_count = 0 self.total_sample_count = 0 self.policy_loss_fn = PolicyLoss( @@ -213,70 +206,66 @@ def step(self, step_idx: int, pbar: Any, **kwargs) -> Optional[float]: group_ans_acc = ( ans_acc.view(-1, self.num_generations).mean(dim=1).repeat_interleave(self.num_generations, dim=0) ) + # [minibatch_size x num_of_generation] loss_mask = ( torch.ones(action_mask.size(0), device=action_mask.device).bool() if self.filter_range is None else torch.logical_and(group_ans_acc > self.filter_range[0], group_ans_acc < self.filter_range[1]) ) + # filter out overlength samples if self.filter_truncated_response and action_mask.size(1) == self.max_length: loss_mask = torch.logical_and( loss_mask, action_mask[:, -1] == False, ) - 
effective_tokens_count = torch.sum(action_mask, dim=-1) * loss_mask - effective_samples = all_reduce_sum(torch.sum(loss_mask), self.plugin) - total_effective_tokens_count = all_reduce_sum(torch.sum(effective_tokens_count), self.plugin) - total_samples = all_reduce_sum(torch.sum(torch.ones_like(loss_mask, device=loss_mask.device)), self.plugin) - self.effective_sample_count += effective_samples.item() - self.total_sample_count += total_samples.item() + prompt_level_mask = loss_mask.view(self.minibatch_size, self.num_generations) + + # [minibatch_size] -> calculate the number of effective prompts + effective_prompts_mask = prompt_level_mask.any(dim=1) + effective_prompts = all_reduce_sum(torch.sum(effective_prompts_mask), self.plugin) + self.effective_prompt_count += effective_prompts.item() + excessive_prompts_idx = None mean_kl, mean_loss = [], [] if self.grpo_config.get("dynamic_batching", True): - need_update = self.effective_sample_count >= self.batch_size * self.dp_size * self.num_generations - # to exclude the excessive samples from the last batch, the last num_excessive_samples samples are not used for training and will be kept in buffer for the next iteration. - num_excessive_samples = ( - int( - (self.effective_sample_count - self.batch_size * self.dp_size * self.num_generations) - / self.num_generations - / self.dp_size - ) - * self.num_generations - ) - if num_excessive_samples > 0: - data = { - k: ( - v[: -num_excessive_samples if num_excessive_samples != 0 else v.size(0)] - if k - in [ - "input_ids", - "attention_mask", - "action_log_probs", - "action_mask", - "response_idx", - "gt_answer", - ] - else v - ) - for k, v in data.items() - } - action_mask = action_mask[ - : -num_excessive_samples if num_excessive_samples != 0 else action_mask.size(0) - ] - loss_mask = loss_mask[: -num_excessive_samples if num_excessive_samples != 0 else loss_mask.size(0)] - advantages = advantages[: -num_excessive_samples if num_excessive_samples != 0 else advantages.size(0)] - else: - num_excessive_samples = 0 + need_update = self.effective_prompt_count >= self.batch_size * self.dp_size + excessive_prompts = self.effective_prompt_count - self.batch_size * self.dp_size + + if excessive_prompts > 0: + excessive_prompts_per_rank = excessive_prompts // self.dp_size + # Only count excessive prompts if they are greater than 1 per rank. + # TODO: customize excessive prompts calculation. + if excessive_prompts_per_rank != 0: + # Mask excessive prompts to False + true_indices = torch.nonzero(effective_prompts_mask).squeeze() + if excessive_prompts_per_rank <= len(true_indices): + excessive_prompts_idx = true_indices[-excessive_prompts_per_rank:] + else: + excessive_prompts_idx = true_indices + effective_prompts_mask[excessive_prompts_idx] = False + + for mask_idx in range(len(effective_prompts_mask)): + if effective_prompts_mask[mask_idx] == False: + # Update loss mask. + loss_mask[mask_idx] = False else: # If dynamic batching is disabled, we need to use all samples for training. 
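The branch above handles the dynamic case: only prompts with at least one usable sample count toward the per-rank target, and any overflow is kept for the next step; the non-dynamic branch continues below. A toy sketch of that selection, using illustrative names and sizes rather than the trainer's actual API:

```python
import torch

num_generations, target_prompts_per_rank = 4, 1   # hypothetical sizes
loss_mask = torch.tensor([1, 1, 0, 0,  0, 0, 0, 0,  1, 0, 1, 1], dtype=torch.bool)

prompt_level_mask = loss_mask.view(-1, num_generations)   # [num_prompts, num_generations]
effective = prompt_level_mask.any(dim=1)                   # prompt has >= 1 usable sample
effective_idx = torch.nonzero(effective).squeeze(-1)       # here: tensor([0, 2])

# Defer prompts beyond the per-rank target to the next optimizer step.
overflow = max(int(effective.sum()) - target_prompts_per_rank, 0)
carry_over_idx = effective_idx[-overflow:] if overflow > 0 else effective_idx[:0]
print(effective_idx.tolist(), carry_over_idx.tolist())     # [0, 2] [2]
```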
need_update = (step_idx + 1) % self.num_microbatches == 0 - num_excessive_samples = 0 + + effective_samples = all_reduce_sum(torch.sum(loss_mask), self.plugin) + effective_tokens_count = torch.sum(action_mask, dim=-1) * loss_mask + total_effective_tokens_count = all_reduce_sum(torch.sum(effective_tokens_count), self.plugin) + total_samples = all_reduce_sum(torch.sum(torch.ones_like(loss_mask, device=loss_mask.device)), self.plugin) + self.effective_sample_count += effective_samples.item() + self.total_sample_count += total_samples.item() pbar.set_postfix( { - "Step": self.global_step + 1, - "Status": f"Collecting: {self.effective_sample_count}/{self.batch_size * self.dp_size * self.num_generations}", + "Global Step": self.global_step, + "Effective prompts": f"{self.effective_prompt_count}/{self.batch_size * self.dp_size}", + "Effective samples": f"{self.effective_sample_count}/{self.batch_size * self.dp_size * self.num_generations}", } ) @@ -375,7 +364,7 @@ def _criterion(outputs, inputs): kl.append(appox_kl.mean()) else: per_token_kl = 0.0 - kl.append(0.0) + kl.append(torch.tensor(0.0)) loss, _ = self.policy_loss_fn( action_log_probs, @@ -479,6 +468,7 @@ def _criterion(outputs, inputs): self.optimizer.zero_grad() self.global_step += 1 sample_utilization = self.effective_sample_count / self.total_sample_count + self.effective_prompt_count = 0 self.effective_sample_count = 0 self.total_sample_count = 0 loss_scalar = self.accum_loss.item() @@ -495,6 +485,7 @@ def _criterion(outputs, inputs): f"Acc Reward: {self.accum_ans_acc.item() / self.accum_count:.4f}", f"Advantages: {self.accum_advantages.item() / self.accum_count:.4f}", f"Response Length: {self.accum_response_length.item() / self.accum_count:.4f}", + f"Sample_utilization: {sample_utilization:.4f}", ] + ([f"KL: {self.accum_kl.item() / self.accum_count:.4f}"] if self.policy_loss_fn.beta > 0 else []) print("\n".join(to_log_msg)) metrics = { @@ -520,9 +511,15 @@ def _criterion(outputs, inputs): self.accum_advantages.zero_() self.accum_response_length.zero_() self.accum_count = 0 - return loss_scalar, num_excessive_samples // self.num_generations + + if excessive_prompts_idx is not None: + # All gather excessive prompts index across DP ranks. + excessive_prompts_idx = [idx + self.dp_rank * self.minibatch_size for idx in excessive_prompts_idx] + excessive_prompts_idx = all_gather_tensors(excessive_prompts_idx, self.plugin) + + return loss_scalar, excessive_prompts_idx else: - return None, num_excessive_samples // self.num_generations + return None, excessive_prompts_idx def state_dict(self): self.policy_model._force_wait_all_gather() diff --git a/applications/ColossalChat/coati/trainer/utils.py b/applications/ColossalChat/coati/trainer/utils.py index 5153ce3adad5..25615d19184f 100755 --- a/applications/ColossalChat/coati/trainer/utils.py +++ b/applications/ColossalChat/coati/trainer/utils.py @@ -144,3 +144,29 @@ def all_reduce_sum(tensor: torch.Tensor, plugin: Plugin = None) -> torch.Tensor: else: dist.all_reduce(tensor=tensor, op=dist.ReduceOp.SUM) return tensor + + +def all_gather_tensors(local_tensor_list: torch.Tensor, plugin: Plugin = None) -> torch.Tensor: + """ + Gathers tensors from all processes and concatenates them along the first dimension. + + Args: + tensor (torch.Tensor): The input tensor to be gathered. + + Returns: + torch.Tensor: The gathered tensor. 
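A minimal, single-process illustration (not part of the patch) of the primitive the new `all_gather_tensors` helper builds on: `dist.all_gather_object` exchanges arbitrary picklable objects, so per-rank lists of different lengths can be gathered and then flattened in rank order. The `gloo` backend and world size of 1 are assumptions so the snippet runs anywhere on CPU.

```python
import os
import torch.distributed as dist

# Assumed single-process setup for the demo; the real helper gathers over the
# plugin's DP group rather than the default group.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group("gloo", rank=0, world_size=1)

local_list = [1, 2, 3]                         # each rank may hold a different number of items
gathered = [None] * dist.get_world_size()
dist.all_gather_object(gathered, local_list)   # works for any picklable object
flat = [item for per_rank in gathered for item in per_rank]
print(flat)                                    # [1, 2, 3] here; concatenated across ranks otherwise
dist.destroy_process_group()
```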
+ """ + # Gather tensors across DP group + if plugin is not None: + all_tensor_lists = [None] * plugin.dp_size + dist.all_gather_object(all_tensor_lists, local_tensor_list, group=plugin.dp_group) + gathered_tensor_list = [] + for tensors in all_tensor_lists: + gathered_tensor_list.extend(tensors) + else: + all_tensor_lists = [None] * dist.get_world_size() + dist.all_gather_object(all_tensor_lists, local_tensor_list) + gathered_tensor_list = [] + for tensors in all_tensor_lists: + gathered_tensor_list.extend(tensors) + return gathered_tensor_list diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index 788e60c2edac..f5609c8908d6 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -9,7 +9,7 @@ parser = argparse.ArgumentParser() parser.add_argument("-m", "--model", type=str, default="Qwen/Qwen2.5-7B") parser.add_argument("-d", "--dataset", type=str, default="data.jsonl") - parser.add_argument("-p", "--project", type=str, default="GRPO", help="Project name.") + parser.add_argument("-p", "--project", type=str, default="GRPO-V3", help="Project name.") parser.add_argument("-e", "--num-episodes", type=int, default=1, help="Number of episodes to train.") # Distributed training parameters @@ -20,7 +20,7 @@ "-ibs", "--inference-batch-size", type=int, - default=None, + default=64, help="Number of prompts to generate per inference step. It should be divisible by tbs, and the weights on the inference backend will be synced every ibs/tbs training steps of the policy model.", ) parser.add_argument( @@ -41,7 +41,7 @@ "-tMbs", "--train-minibatch-size", type=int, - default=None, + default=8, help="Number of unique prompts in each training batch per dp group. The inference backend must generate tMbs * g * dp_size samples before forwarding. 
Satisfy tMbs * g >= tmbs", ) parser.add_argument( @@ -58,7 +58,7 @@ "--master_address", type=str, default=None, help="Master address for multi-node distributed training, Optional" ) parser.add_argument( - "--master_port", type=int, default=29505, help="Master port for multi-node distributed training, Optional" + "--master_port", type=int, default=29506, help="Master port for multi-node distributed training, Optional" ) # Sampling parameters @@ -223,7 +223,7 @@ "zero_stage": 2, }, # for zero # plugin_config={ - # "tp_size": 1, + # "tp_size": 2, # "pp_size": 2, # "microbatch_size": max( # 1, args.train_microbatch_size // 2 From 50070c1e84a36f11d21efddd3c6828cd32b56f16 Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Wed, 14 May 2025 18:10:57 +0800 Subject: [PATCH 055/100] move logging to producer --- .../coati/distributed/consumer.py | 25 ------ .../coati/distributed/grpo_consumer.py | 30 ++++--- .../coati/distributed/inference_backend.py | 2 +- .../ColossalChat/coati/distributed/launch.py | 14 ++- .../coati/distributed/producer.py | 86 +++++++++++++------ .../ColossalChat/coati/distributed/utils.py | 2 +- applications/ColossalChat/rl_example.py | 3 + 7 files changed, 92 insertions(+), 70 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index 620ab6ff63b2..800a3c799f0a 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -36,7 +36,6 @@ def __init__( minibatch_size: int = 1, save_interval: int = 100, save_dir: str = "./model", - eval_interval: int = -1, ): self.num_producers = num_producers self.num_episodes = num_episodes @@ -52,7 +51,6 @@ def __init__( self.save_dir = save_dir assert batch_size % minibatch_size == 0, "batch_size should be divisible by microbatch_size" self.num_microbatches = batch_size // minibatch_size - self.eval_interval = eval_interval self.model_config = model_config self.plugin_config = plugin_config @@ -94,9 +92,6 @@ def setup(self) -> None: if self.rank == 0: cc.init_collective_group(self.num_producers + 1, self.num_producers, group_name="sync_model") - for i in range(self.num_producers): - cc.init_collective_group(self.world_size + 1, self.rank + 1, group_name=f"sync_eval_statistics_{i}") - self.buffer = [] self.recv_cnt = 0 @@ -114,24 +109,6 @@ def loop(self) -> None: with tqdm(range(self.num_update_per_episode), desc=f"Episode {episode}", disable=self.rank != 0) as pbar: for step in pbar: i = 0 - if self.eval_interval > 0 and step % self.eval_interval == 0: - eval_statistics = None - for r in range(self.num_producers): - print(f"[T{dist.get_rank()}] Recv eval result episode {episode} step {step} from {r}") - local_eval_result = ray_broadcast_tensor_dict( - None, src=0, device=self.device, group_name=f"sync_eval_statistics_{r}" - ) - if eval_statistics is None: - eval_statistics = local_eval_result - else: - eval_statistics = { - k: eval_statistics[k] + local_eval_result[k] for k in eval_statistics - } - eval_statistics = {k: (v[0] / v[1]).item() for k, v in eval_statistics.items()} - if dist.get_rank() == 0: - if hasattr(self, "wandb_run") and hasattr(self, "global_step"): - self.wandb_run.log(eval_statistics, step=self.global_step) - print(f"Eval statistics: {eval_statistics}") for _ in range(self.num_recv_per_update): # receive data from producers for r in range(self.num_producers): @@ -214,7 +191,6 @@ def __init__( minibatch_size=1, save_interval: int = 100, save_dir="./model", - eval_interval: 
int = -1, ): super().__init__( num_producers, @@ -231,7 +207,6 @@ def __init__( minibatch_size, save_interval, save_dir, - eval_interval, ) path = model_config.pop("path") self.model = AutoModelForCausalLM.from_pretrained(path, **model_config) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 0336125d17af..f301345b6c0d 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -34,13 +34,13 @@ def __init__( plugin_config, minibatch_size=1, num_generations=8, - use_wandb=True, generate_config=None, grpo_config={}, - project_name=None, save_interval: int = 100, save_dir="./model", - eval_interval: int = -1, + project_name: str = None, + run_name: str = None, + wandb_group_name: str = None, ): print(f"Using GRPO config: {grpo_config}") if grpo_config.get("loss_variation", "sample_level") == "token_level": @@ -73,7 +73,6 @@ def __init__( minibatch_size, save_interval=save_interval, save_dir=save_dir, - eval_interval=eval_interval, ) path = model_config.pop("path") self.policy_model = AutoModelForCausalLM.from_pretrained(path, **model_config) @@ -93,6 +92,9 @@ def __init__( self.project_name = project_name self.effective_sample_count = 0 self.total_sample_count = 0 + self.project_name = project_name + self.run_name = run_name + self.wandb_group_name = wandb_group_name self.policy_loss_fn = PolicyLoss( clip_eps_low=grpo_config.get("clip_eps_low", 0.2), @@ -143,7 +145,6 @@ def __init__( **reward_model_kwargs, ) self.global_step = 0 - self.use_wandb = use_wandb self.lr_scheduler = CosineAnnealingWarmupLR( optimizer=self.optimizer, @@ -154,13 +155,16 @@ def __init__( def setup(self): super().setup() - if self.use_wandb and ( - (not self.plugin.pp_size > 1 and self.rank == 0) - or (self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() and self.tp_rank == 0) + if (not self.plugin.pp_size > 1 and self.rank == 0) or ( + self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() and self.tp_rank == 0 ): - # Initialize wandb. 
- name = f"{self.generate_config['backend']}_bs_{self.batch_size*self.dp_size}_temp_{self.generate_config['temperature']:.01f}_top_p_{self.generate_config['top_p']:.02f}" - self.wandb_run = wandb.init(project=self.project_name, sync_tensorboard=True, dir="./wandb", name=name) + self.wandb_run = wandb.init( + project=self.project_name, + sync_tensorboard=True, + dir="./wandb", + name=self.run_name, + group=self.wandb_group_name, + ) self.policy_model, self.optimizer, _, _, self.lr_scheduler = self.booster.boost( self.policy_model, self.optimizer, lr_scheduler=self.lr_scheduler @@ -512,8 +516,8 @@ def _criterion(outputs, inputs): } if self.policy_loss_fn.beta > 0: metrics["train/kl"] = self.accum_kl.item() / self.accum_count - - self.wandb_run.log(metrics) + if self.wandb_run is not None: + self.wandb_run.log(metrics) self.accum_loss.zero_() self.accum_reward.zero_() self.accum_ans_acc.zero_() diff --git a/applications/ColossalChat/coati/distributed/inference_backend.py b/applications/ColossalChat/coati/distributed/inference_backend.py index 64dfe5cfd531..ce2a2f28c12e 100644 --- a/applications/ColossalChat/coati/distributed/inference_backend.py +++ b/applications/ColossalChat/coati/distributed/inference_backend.py @@ -238,7 +238,7 @@ def generate(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwar log_probs.append(p) # pad them - max_len = self.generate_config.max_tokens + max_len = self.sample_params.max_tokens action_mask = torch.ones(len(out_tokens), max_len, dtype=attention_mask.dtype) for i, new_token_ids in enumerate(out_tokens): diff --git a/applications/ColossalChat/coati/distributed/launch.py b/applications/ColossalChat/coati/distributed/launch.py index d1b6158e5b2b..cb5b6e4e2080 100644 --- a/applications/ColossalChat/coati/distributed/launch.py +++ b/applications/ColossalChat/coati/distributed/launch.py @@ -1,4 +1,5 @@ import copy +import uuid from typing import Any, Dict, Optional import ray @@ -53,6 +54,7 @@ def launch_distributed( eval_dataset_config: Optional[Dict[str, Any]] = None, eval_interval: int = 100, eval_save_dir: Optional[str] = None, + eval_generation_config: Optional[Dict[str, Any]] = None, ): if core_algo not in ALGO_MAP: @@ -69,6 +71,9 @@ def launch_distributed( num_update_per_episode = num_samples // global_inference_batch_size num_recv_per_update = inference_batch_size // inference_microbatch_size + run_name = f"{inference_backend}_bs_{train_batch_size * train_dp_size}_temp_{generate_config['temperature']:.01f}_top_p_{generate_config['top_p']:.02f}" + wandb_group_name = str(uuid.uuid4()) + procs = [] for i in range(num_producers): producer = SimpleProducer.options(num_gpus=num_proc_per_producer).remote( @@ -90,6 +95,10 @@ def launch_distributed( eval_interval=eval_interval, evaluation_function_type=grpo_config["reward_fn_type"], eval_save_dir=eval_save_dir, + eval_generation_config=eval_generation_config, + project_name=project_name, + run_name=run_name, + wandb_group_name=wandb_group_name, ) procs.append(producer) generate_config_consumer = copy.deepcopy(generate_config) @@ -115,10 +124,11 @@ def launch_distributed( generate_config=generate_config_consumer, grpo_config=grpo_config, num_generations=num_generations, - project_name=project_name, save_interval=save_interval, save_dir=save_dir, - eval_interval=eval_interval, + project_name=project_name, + run_name=run_name, + wandb_group_name=wandb_group_name, ) procs.append(consumer) ray.get([p.setup.remote() for p in procs]) diff --git a/applications/ColossalChat/coati/distributed/producer.py 
b/applications/ColossalChat/coati/distributed/producer.py index 483ad74b383b..fa2d1cc8d7f4 100644 --- a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -6,8 +6,11 @@ import ray.util.collective as cc import torch import tqdm +import wandb from coati.dataset.loader import RawConversationDataset from coati.distributed.reward.reward_fn import boxed_math_reward_fn, math_reward_fn +from ray.util.collective import allreduce +from ray.util.collective.types import Backend, ReduceOp from torch.utils.data import DataLoader, DistributedSampler from transformers import AutoTokenizer @@ -15,7 +18,7 @@ from .comm import ray_broadcast_tensor_dict from .inference_backend import BACKEND_MAP -from .utils import pre_send, safe_write_jsonl +from .utils import pre_send, safe_append_to_jsonl_file try: from vllm import SamplingParams @@ -43,6 +46,9 @@ def __init__( eval_interval=-1, # disable evaluation evaluation_function_type="think_answer_tags", eval_save_dir: str = "./eval", + project_name: str = None, + run_name: str = None, + wandb_group_name: str = None, ): self.producer_idx = producer_idx self.num_producers = num_producers @@ -61,6 +67,14 @@ def __init__( self.eval_interval = eval_interval self.eval_save_dir = eval_save_dir self.consumer_global_step = 0 + if self.producer_idx == 0: + self.wandb_run = wandb.init( + project=project_name, + sync_tensorboard=True, + dir="./wandb", + name=run_name + "_eval", + group=wandb_group_name, + ) if os.path.exists(self.eval_save_dir): raise ValueError(f"Eval save dir {self.eval_save_dir} already exists. Please delete it or change the name.") @@ -132,13 +146,18 @@ def __init__( self.consumer_pp_size = consumer_plugin_config.get("pp_size", 1) # consumer pp size def setup(self) -> None: + cc.init_collective_group( + world_size=self.num_producers, + rank=self.producer_idx, + backend=Backend.NCCL, + group_name="producer_group", + ) cc.init_collective_group(1 + self.num_consumer_procs, 0, group_name=f"sync_data_{self.producer_idx}") if self.consumer_pp_size > 1: for i in range(self.consumer_pp_size): cc.init_collective_group(self.num_producers + 1, self.producer_idx, group_name=f"sync_model_{i}") else: cc.init_collective_group(self.num_producers + 1, self.producer_idx, group_name="sync_model") - cc.init_collective_group(1 + self.num_consumer_procs, 0, group_name=f"sync_eval_statistics_{self.producer_idx}") def rollout(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwargs) -> Dict[str, torch.Tensor]: raise NotImplementedError @@ -160,13 +179,14 @@ def loop(self) -> None: break if self.eval_interval > 0 and self.eval_dataset_config is not None: if i % self.eval_interval == 0: - eval_statistics = {} + to_log_msg = {} for eval_task_name in self.eval_dataloaders: - print( - f"[P{self.producer_idx}] Evaluate episode {episode} step {i} on task {eval_task_name}" - ) + if self.producer_idx == 0: + print( + f"[P{self.producer_idx}] Evaluate episode {episode} step {i} on task {eval_task_name}" + ) eval_results = [] - eval_statistics[eval_task_name] = torch.zeros(2, device=self.device) + eval_statistics_tensor = torch.zeros((2,), dtype=torch.float32).to(self.device) for eval_batch in tqdm.tqdm( self.eval_dataloaders[eval_task_name], disable=self.producer_idx != 0 ): @@ -182,24 +202,27 @@ def loop(self) -> None: for m in range(eval_outputs["input_ids"].size(0)) for n in range(eval_outputs["input_ids"].size(1)) ] - eval_statistics[eval_task_name][0] += len( - [res for res in eval_results if 
res["ans_valid"] == 1] + eval_statistics_tensor[0] += len([res for res in eval_results if res["ans_valid"] == 1]) + eval_statistics_tensor[1] += len(eval_results) + allreduce(eval_statistics_tensor, op=ReduceOp.SUM, group_name="producer_group") + to_log_msg[f"eval/{eval_task_name}"] = ( + eval_statistics_tensor[0].item() / eval_statistics_tensor[1].item() ) - eval_statistics[eval_task_name][1] += len(eval_results) + if self.producer_idx == 0: + print( + f"[P{self.producer_idx}]: Accuracy on {eval_task_name}: {to_log_msg[f'eval/{eval_task_name}']}" + ) # save eval results - result_file_name = os.path.join( - self.eval_save_dir, - f"{eval_task_name}_episode_{episode}_step_{self.consumer_global_step}.jsonl", + safe_append_to_jsonl_file( + os.path.join( + self.eval_save_dir, + f"{eval_task_name}_episode_{episode}_step_{self.consumer_global_step}.jsonl", + ), + eval_results, ) - # delete the file if it exists - safe_write_jsonl(result_file_name, eval_results) - print(f"[P{self.producer_idx}] Send eval statistics episode {episode} step {i}") - ray_broadcast_tensor_dict( - eval_statistics, - src=0, - device=self.device, - group_name=f"sync_eval_statistics_{self.producer_idx}", - ) + + if self.producer_idx == 0: + self.wandb_run.log(to_log_msg, step=self.consumer_global_step) outputs = self.rollout(**batch) print(f"[P{self.producer_idx}] Send data {[(k, v.shape) for k, v in outputs.items()]}") @@ -248,12 +271,11 @@ def loop(self) -> None: # linear annealing for 1 episode, temperature from initial to 0.9 if episode <= 0: ratio = 1 - (len(self.train_dataloader) - i) / len(self.train_dataloader) - if isinstance(self.model.generate_config.temperature, dict): - self.model.generate_config["temperature"] = (1 - ratio) * self.generate_config[ - "temperature" - ] + ratio * 0.9 - else: - self.model.generate_config.temperature = (1 - ratio) * self.generate_config[ + self.model.generate_config["temperature"] = (1 - ratio) * self.generate_config[ + "temperature" + ] + ratio * 0.9 + if isinstance(self.model, BACKEND_MAP["vllm"]): + self.model.sample_params.temperature = (1 - ratio) * self.generate_config[ "temperature" ] + ratio * 0.9 @@ -280,6 +302,10 @@ def __init__( eval_interval=-1, # disable evaluation evaluation_function_type="think_answer_tags", eval_save_dir: str = "./eval", + eval_generation_config={}, + project_name: str = None, + run_name: str = None, + wandb_group_name: str = None, ): super().__init__( producer_idx, @@ -299,10 +325,14 @@ def __init__( eval_interval=eval_interval, evaluation_function_type=evaluation_function_type, eval_save_dir=eval_save_dir, + project_name=project_name, + run_name=run_name, + wandb_group_name=wandb_group_name, ) self.model = self.backend_cls(model_config, generate_config, self.tokenizer, num_generations) self.eval_generation_config = copy.deepcopy(self.model.generate_config) self.eval_generation_config["n"] = 1 # use 1 generation for evaluation + self.eval_generation_config.update(eval_generation_config) self.eval_sample_params = SamplingParams(**self.eval_generation_config) @torch.no_grad() diff --git a/applications/ColossalChat/coati/distributed/utils.py b/applications/ColossalChat/coati/distributed/utils.py index fb2a0dfd3e90..a40ebbcfbe92 100644 --- a/applications/ColossalChat/coati/distributed/utils.py +++ b/applications/ColossalChat/coati/distributed/utils.py @@ -135,7 +135,7 @@ def masked_sum(tensor: torch.Tensor, mask: torch.Tensor, dim: int = 1) -> torch. 
return tensor.sum(dim=dim) -def safe_write_jsonl(file_path, data): +def safe_append_to_jsonl_file(file_path, data): with FileLock(file_path + ".lock"): # Ensure file exists os.makedirs(os.path.dirname(file_path), exist_ok=True) diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index b2fafd42e31f..9fecdb52da51 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -161,6 +161,7 @@ stop_strings=[""] if args.reward_type == "think_answer_tags" else None, ) ) + eval_generation_config = {"temperature": 0.6} # used to update generation config for evaluation elif args.backend == "vllm": inference_model_config.update( dict( @@ -179,6 +180,7 @@ stop=[""] if args.reward_type == "think_answer_tags" else None, ) ) + eval_generation_config = {"temperature": 0.6} # used to update generation config for evaluation else: raise ValueError(f"Unsupported backend: {args.backend}") @@ -257,4 +259,5 @@ }, eval_interval=args.eval_interval, eval_save_dir=os.path.join(args.eval_save_dir, args.project.replace(" ", "_")), + eval_generation_config=eval_generation_config, ) From 4ec73298b22315ba27ef68df4c4836ce9be0e5ee Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Thu, 15 May 2025 14:15:40 +0800 Subject: [PATCH 056/100] use consumer global step --- .../ColossalChat/coati/distributed/grpo_consumer.py | 2 -- applications/ColossalChat/coati/distributed/launch.py | 1 - applications/ColossalChat/coati/distributed/producer.py | 9 +++++++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 0b0c8a9ad50a..5ad73f45cc01 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -266,7 +266,6 @@ def step(self, step_idx: int, pbar: Any, **kwargs) -> Optional[float]: total_samples = all_reduce_sum(torch.sum(torch.ones_like(loss_mask, device=loss_mask.device)), self.plugin) self.effective_sample_count += effective_samples.item() self.total_sample_count += total_samples.item() - pbar.set_postfix( { "Global Step": self.global_step, @@ -522,7 +521,6 @@ def _criterion(outputs, inputs): # All gather excessive prompts index across DP ranks. 
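As a side note on the `safe_write_jsonl` to `safe_append_to_jsonl_file` rename above, a simplified sketch of an append-style JSONL writer guarded by a file lock is shown below; it is illustrative only and the real helper's body differs in detail. The `filelock` dependency is assumed, matching the lock already used in `utils.py`.

```python
import json
import os
from filelock import FileLock  # assumed dependency

def append_jsonl(file_path, records):
    # Serialize concurrent appends from multiple producer processes.
    with FileLock(file_path + ".lock"):
        os.makedirs(os.path.dirname(file_path) or ".", exist_ok=True)
        with open(file_path, "a", encoding="utf8") as f:
            for record in records:
                f.write(json.dumps(record, ensure_ascii=False) + "\n")

append_jsonl("./eval/demo_step_0.jsonl", [{"ans_valid": 1, "response": "42"}])
```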
excessive_prompts_idx = [idx + self.dp_rank * self.minibatch_size for idx in excessive_prompts_idx] excessive_prompts_idx = all_gather_tensors(excessive_prompts_idx, self.plugin) - return loss_scalar, excessive_prompts_idx else: return None, excessive_prompts_idx diff --git a/applications/ColossalChat/coati/distributed/launch.py b/applications/ColossalChat/coati/distributed/launch.py index cb5b6e4e2080..327db1b5560a 100644 --- a/applications/ColossalChat/coati/distributed/launch.py +++ b/applications/ColossalChat/coati/distributed/launch.py @@ -56,7 +56,6 @@ def launch_distributed( eval_save_dir: Optional[str] = None, eval_generation_config: Optional[Dict[str, Any]] = None, ): - if core_algo not in ALGO_MAP: raise NotImplementedError(f"{core_algo} is not supported yet.") else: diff --git a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py index fa2d1cc8d7f4..f7c17bf56b9e 100644 --- a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -58,6 +58,7 @@ def __init__( self.microbatch_size = microbatch_size assert batch_size % microbatch_size == 0 self.num_microbatches = batch_size // microbatch_size + self.lastest_eval_step = -1 self.train_dataset_config = train_dataset_config self.model_config = model_config @@ -178,12 +179,15 @@ def loop(self) -> None: if i >= num_valid_microbatches: break if self.eval_interval > 0 and self.eval_dataset_config is not None: - if i % self.eval_interval == 0: + if ( + self.consumer_global_step % self.eval_interval == 0 + and self.consumer_global_step > self.lastest_eval_step + ): to_log_msg = {} for eval_task_name in self.eval_dataloaders: if self.producer_idx == 0: print( - f"[P{self.producer_idx}] Evaluate episode {episode} step {i} on task {eval_task_name}" + f"[P{self.producer_idx}] Evaluate episode {episode} step {self.consumer_global_step} on task {eval_task_name}" ) eval_results = [] eval_statistics_tensor = torch.zeros((2,), dtype=torch.float32).to(self.device) @@ -223,6 +227,7 @@ def loop(self) -> None: if self.producer_idx == 0: self.wandb_run.log(to_log_msg, step=self.consumer_global_step) + self.lastest_eval_step = self.consumer_global_step outputs = self.rollout(**batch) print(f"[P{self.producer_idx}] Send data {[(k, v.shape) for k, v in outputs.items()]}") From 957e3a521aa41d0580c6ed32d77b4a9cf8581210 Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Thu, 15 May 2025 16:52:31 +0800 Subject: [PATCH 057/100] disable wandb tb syncing --- applications/ColossalChat/coati/distributed/grpo_consumer.py | 2 +- applications/ColossalChat/coati/distributed/producer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 5ad73f45cc01..7ea2dbf18c57 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -153,7 +153,7 @@ def setup(self): ): self.wandb_run = wandb.init( project=self.project_name, - sync_tensorboard=True, + sync_tensorboard=False, dir="./wandb", name=self.run_name, group=self.wandb_group_name, diff --git a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py index f7c17bf56b9e..8b94521606c7 100644 --- a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -71,7 
+71,7 @@ def __init__( if self.producer_idx == 0: self.wandb_run = wandb.init( project=project_name, - sync_tensorboard=True, + sync_tensorboard=False, dir="./wandb", name=run_name + "_eval", group=wandb_group_name, From 55eee129d29185878bdb5a41b1b1b6fda1483af0 Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Thu, 15 May 2025 18:16:50 +0800 Subject: [PATCH 058/100] move prompt-level-filtering to buffer side --- .../coati/distributed/consumer.py | 22 +++-- .../coati/distributed/grpo_consumer.py | 83 +++++++++++++++---- 2 files changed, 81 insertions(+), 24 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index ecfd6545818d..6385df2cfc2f 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -113,18 +113,24 @@ def loop(self) -> None: ) as pbar: for step in pbar: i = 0 - allow_sync_model = False + allow_sync_model = True for _ in range(self.num_recv_per_update): # receive data from producers for r in range(self.num_producers): print(f"[T{dist.get_rank()}] Recv data episode {episode} step {step} from {r}") - self.buffer.extend( - unbind_batch( - ray_broadcast_tensor_dict( - None, src=0, device=self.device, group_name=f"sync_data_{r}" - ) - ) + raw_batch = unbind_batch( + ray_broadcast_tensor_dict(None, src=0, device=self.device, group_name=f"sync_data_{r}") ) + filtered_batch = [ + t + for t in [ + self.prompt_level_filtering(self.calculate_group_reward(group)) + for group in raw_batch + ] + if t is not None + ] + + self.buffer.extend(filtered_batch) while len(self.buffer) >= self.dp_size * self.minibatch_size: batches = self.buffer[ self.dp_rank * self.minibatch_size : (self.dp_rank + 1) * self.minibatch_size @@ -177,7 +183,7 @@ def loop(self) -> None: ) del state_dict torch.cuda.empty_cache() - allow_sync_model = False + allow_sync_model = True @ray.remote diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 7ea2dbf18c57..fcf7b0740fad 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -1,5 +1,5 @@ from contextlib import nullcontext -from typing import Any, Optional +from typing import Any, Dict, Optional import ray import torch @@ -179,7 +179,6 @@ def step(self, step_idx: int, pbar: Any, **kwargs) -> Optional[float]: Format: [minibatch_size, num_of_generation, prompt_length + response_length] --- ............. 
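Illustrative only (not part of the patch): how the grouped rollout layout described in the docstring above is flattened to sample level for the loss computation, and re-grouped when per-prompt statistics are needed. The shapes are small assumed values.

```python
import torch

minibatch_size, num_generations, seq_len = 2, 4, 6
batch = {
    "input_ids": torch.arange(minibatch_size * num_generations * seq_len).view(
        minibatch_size, num_generations, seq_len
    )
}

# Flatten to sample level: [minibatch_size * num_generations, seq_len]
flat = {k: v.view(-1, v.size(-1)) for k, v in batch.items()}
print(flat["input_ids"].shape)   # torch.Size([8, 6])

# Per-prompt statistics recover the grouping with a reshape, e.g. a group mean
per_sample_acc = torch.tensor([1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0])
print(per_sample_acc.view(minibatch_size, num_generations).mean(dim=1))  # tensor([0.7500, 0.2500])
```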
""" - # Reshape to [minibatch_size x num_of_generation, prompt_length + response_length] data = {k: v.view(-1, v.size(-1)) for k, v in kwargs.items()} action_mask = data["action_mask"] @@ -188,15 +187,9 @@ def step(self, step_idx: int, pbar: Any, **kwargs) -> Optional[float]: response_length = torch.sum(action_mask, dim=1).to(torch.float32) train_microbatch_size = self.grpo_config.get("train_microbatch_size", data["input_ids"].size(0)) - reward_group = self.reward_model( - data["input_ids"], - gt_answer=data["gt_answer"], - response_idx=data["response_idx"], - ) - - reward = torch.tensor([value[0] for value in reward_group]).to(data["input_ids"].device) - format_acc = torch.tensor([value[1] for value in reward_group]).to(data["input_ids"].device) - ans_acc = torch.tensor([value[2] for value in reward_group]).to(data["input_ids"].device) + reward = data["reward"].view((-1)) + format_acc = data["format_acc"].view((-1)) + ans_acc = data["ans_acc"].view((-1)) # [minibatch_size, num_generations] @@ -213,11 +206,7 @@ def step(self, step_idx: int, pbar: Any, **kwargs) -> Optional[float]: ans_acc.view(-1, self.num_generations).mean(dim=1).repeat_interleave(self.num_generations, dim=0) ) # [minibatch_size x num_of_generation] - loss_mask = ( - torch.ones(action_mask.size(0), device=action_mask.device).bool() - if self.filter_range is None - else torch.logical_and(group_ans_acc > self.filter_range[0], group_ans_acc < self.filter_range[1]) - ) + loss_mask = torch.ones(action_mask.size(0), device=action_mask.device).bool() # filter out overlength samples if self.filter_truncated_response and action_mask.size(1) == self.max_length: @@ -525,6 +514,68 @@ def _criterion(outputs, inputs): else: return None, excessive_prompts_idx + def calculate_group_reward(self, rollout_group: Dict[str, Any]) -> Dict[str, Any]: + """ + Calculate the group reward for the given rollout group. + + Args: + rollout_group (Dict[str, Any]): + a group of samples generated by the model from the same prompt + contain the following keys: + "input_ids": torch.Tensor, [num_of_generation, prompt_length + response_length] + "attention_mask": torch.Tensor, [num_of_generation, prompt_length + response_length] + "action_mask": torch.Tensor, [num_of_generation, response_length] + "action_log_probs": torch.Tensor, [num_of_generation, response_length] + "response_idx": int, torch.Tensor, [num_of_generation, 2] + "gt_answer": torch.Tensor, [num_of_generation, 128] + "temperature": torch.Tensor, [] (scalar) + + Returns: + Dict[str, Any]: The new group data with calculated reward. 
+ """ + reward_group = self.reward_model( + rollout_group["input_ids"], + gt_answer=rollout_group["gt_answer"], + response_idx=rollout_group["response_idx"], + ) + # [num_of_generation] + reward = torch.tensor([value[0] for value in reward_group]).to(rollout_group["input_ids"].device) + format_acc = torch.tensor([value[1] for value in reward_group]).to(rollout_group["input_ids"].device) + ans_acc = torch.tensor([value[2] for value in reward_group]).to(rollout_group["input_ids"].device) + + rollout_group["reward"] = reward.view((-1, 1)) + rollout_group["format_acc"] = format_acc.view((-1, 1)) + rollout_group["ans_acc"] = ans_acc.view((-1, 1)) + return rollout_group + + def prompt_level_filtering(self, rollout_group: Dict[str, Any]) -> Dict[str, Any]: + """ + rollout_group: Dict[str, Any] + a group of samples generated by the model from the same prompt + contain the following keys: + "input_ids": torch.Tensor, [num_of_generation, prompt_length + response_length] + "attention_mask": torch.Tensor, [num_of_generation, prompt_length + response_length] + "action_mask": torch.Tensor, [num_of_generation, response_length] + "action_log_probs": torch.Tensor, [num_of_generation, response_length] + "response_idx": int, torch.Tensor, [num_of_generation, 2] + "gt_answer": torch.Tensor, [num_of_generation, 128] + "temperature": torch.Tensor, [] (scalar) + "reward": torch.Tensor, [num_of_generation] + "format_acc": torch.Tensor, [num_of_generation] + "ans_acc": torch.Tensor, [num_of_generation] + """ + if self.filter_range is not None: + # filter prompt whoes accuracy is too high or too low (out of range) + group_ans_acc = torch.mean(rollout_group["ans_acc"]) + if group_ans_acc < self.filter_range[0] or group_ans_acc > self.filter_range[1]: + # filter out the prompt + return None + else: + return rollout_group + else: + # no filter + return rollout_group + def state_dict(self): self.policy_model._force_wait_all_gather() model = self.policy_model.unwrap() From a528921944df44046a02366c45f93aec1ed254fa Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Thu, 15 May 2025 18:30:32 +0800 Subject: [PATCH 059/100] move prompt-level-filtering to buffer side --- applications/ColossalChat/coati/distributed/grpo_consumer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index fcf7b0740fad..e709c8aed2be 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -254,7 +254,6 @@ def step(self, step_idx: int, pbar: Any, **kwargs) -> Optional[float]: total_effective_tokens_count = all_reduce_sum(torch.sum(effective_tokens_count), self.plugin) total_samples = all_reduce_sum(torch.sum(torch.ones_like(loss_mask, device=loss_mask.device)), self.plugin) self.effective_sample_count += effective_samples.item() - self.total_sample_count += total_samples.item() pbar.set_postfix( { "Global Step": self.global_step, @@ -461,6 +460,9 @@ def _criterion(outputs, inputs): self.optimizer.step() self.optimizer.zero_grad() self.global_step += 1 + self.total_sample_count = all_reduce_sum( + torch.tensor(self.total_sample_count).to(self.accum_loss.device), self.plugin + ).item() sample_utilization = self.effective_sample_count / self.total_sample_count self.effective_prompt_count = 0 self.effective_sample_count = 0 @@ -564,6 +566,7 @@ def prompt_level_filtering(self, rollout_group: Dict[str, Any]) -> Dict[str, Any 
"format_acc": torch.Tensor, [num_of_generation] "ans_acc": torch.Tensor, [num_of_generation] """ + self.total_sample_count += rollout_group["input_ids"].size(0) if self.filter_range is not None: # filter prompt whoes accuracy is too high or too low (out of range) group_ans_acc = torch.mean(rollout_group["ans_acc"]) From 1644adf6843b735af84e76c3346bea2e7ceee85d Mon Sep 17 00:00:00 2001 From: Tong Li Date: Thu, 15 May 2025 18:30:27 +0800 Subject: [PATCH 060/100] handle empty index --- .../coati/distributed/consumer.py | 48 +++++++++---------- .../coati/distributed/grpo_consumer.py | 25 ++++++---- 2 files changed, 37 insertions(+), 36 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index ecfd6545818d..816cab50a93d 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -113,7 +113,6 @@ def loop(self) -> None: ) as pbar: for step in pbar: i = 0 - allow_sync_model = False for _ in range(self.num_recv_per_update): # receive data from producers for r in range(self.num_producers): @@ -139,7 +138,6 @@ def loop(self) -> None: else: self.buffer = self.buffer[self.dp_size * self.minibatch_size :] if loss is not None: - allow_sync_model = True pbar.set_postfix({"loss": loss}) i += 1 if self.lr_scheduler is not None: @@ -153,31 +151,29 @@ def loop(self) -> None: print(f"Saved model checkpoint at step {step + 1} in folder {save_path}") if episode != self.num_episodes - 1 or step != self.num_update_per_episode - 1: - if allow_sync_model: - if self.pp_size > 1: - print( - f"[T{dist.get_rank()}] Sync model PP stage {self.pp_rank} episode {episode} step {step}" + if self.pp_size > 1: + print( + f"[T{dist.get_rank()}] Sync model PP stage {self.pp_rank} episode {episode} step {step}" + ) + else: + print(f"[T{dist.get_rank()}] Sync model episode {episode} step {step}") + torch.cuda.empty_cache() + state_dict = self.state_dict() + if self.pp_size > 1: + if self.tp_rank == 0 and self.dp_rank == 0: + ray_broadcast_tensor_dict( + state_dict, + src=self.num_producers, + device=self.device, + group_name=f"sync_model_{self.pp_rank}", ) - else: - print(f"[T{dist.get_rank()}] Sync model episode {episode} step {step}") - torch.cuda.empty_cache() - state_dict = self.state_dict() - if self.pp_size > 1: - if self.tp_rank == 0 and self.dp_rank == 0: - ray_broadcast_tensor_dict( - state_dict, - src=self.num_producers, - device=self.device, - group_name=f"sync_model_{self.pp_rank}", - ) - else: - if self.rank == 0: - ray_broadcast_tensor_dict( - state_dict, src=self.num_producers, device=self.device, group_name="sync_model" - ) - del state_dict - torch.cuda.empty_cache() - allow_sync_model = False + else: + if self.rank == 0: + ray_broadcast_tensor_dict( + state_dict, src=self.num_producers, device=self.device, group_name="sync_model" + ) + del state_dict + torch.cuda.empty_cache() @ray.remote diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 7ea2dbf18c57..eae4ff54e994 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -245,17 +245,22 @@ def step(self, step_idx: int, pbar: Any, **kwargs) -> Optional[float]: # TODO: customize excessive prompts calculation. 
if excessive_prompts_per_rank != 0: # Mask excessive prompts to False - true_indices = torch.nonzero(effective_prompts_mask).squeeze() - if excessive_prompts_per_rank <= len(true_indices): - excessive_prompts_idx = true_indices[-excessive_prompts_per_rank:] - else: - excessive_prompts_idx = true_indices - effective_prompts_mask[excessive_prompts_idx] = False + true_indices = torch.nonzero(effective_prompts_mask) + # Make sure the indices are not empty. + if true_indices.numel() > 0: + true_indices = true_indices.squeeze() + if excessive_prompts_per_rank <= len(true_indices): + excessive_prompts_idx = true_indices[-excessive_prompts_per_rank:] + else: + excessive_prompts_idx = true_indices + effective_prompts_mask[excessive_prompts_idx] = False - for mask_idx in range(len(effective_prompts_mask)): - if effective_prompts_mask[mask_idx] == False: - # Update loss mask. - loss_mask[mask_idx] = False + for mask_idx in range(len(effective_prompts_mask)): + if effective_prompts_mask[mask_idx] == False: + # Update loss mask. + loss_mask[mask_idx] = False + else: + excessive_prompts_idx = torch.empty([0]) else: # If dynamic batching is disabled, we need to use all samples for training. need_update = (step_idx + 1) % self.num_microbatches == 0 From 6abffb9100fe6abe002f484ee2b1eedb17ed204b Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Fri, 16 May 2025 09:42:35 +0800 Subject: [PATCH 061/100] fix evaluation --- applications/ColossalChat/coati/distributed/producer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py index 8b94521606c7..8b8be8e16f53 100644 --- a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -180,7 +180,7 @@ def loop(self) -> None: break if self.eval_interval > 0 and self.eval_dataset_config is not None: if ( - self.consumer_global_step % self.eval_interval == 0 + self.consumer_global_step - self.lastest_eval_step >= self.eval_interval and self.consumer_global_step > self.lastest_eval_step ): to_log_msg = {} @@ -256,6 +256,8 @@ def loop(self) -> None: state_dict = ray_broadcast_tensor_dict( None, self.num_producers, device=self.device, group_name=f"sync_model_{pp_idx}" ) + if "consumer_global_step" in state_dict: + self.consumer_global_step = state_dict.pop("consumer_global_step").item() self.load_state_dict(state_dict) else: print( From ab956249155408559b3930634a6ebe21c0f60c32 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Fri, 16 May 2025 10:00:10 +0800 Subject: [PATCH 062/100] handle empty index (#6311) Co-authored-by: Tong Li --- .../coati/distributed/consumer.py | 48 +++++++++---------- .../coati/distributed/grpo_consumer.py | 25 ++++++---- 2 files changed, 37 insertions(+), 36 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index 38b31398d125..b07f8d7a1aa0 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -114,7 +114,6 @@ def loop(self) -> None: ) as pbar: for step in pbar: i = 0 - allow_sync_model = False for _ in range(self.num_recv_per_update): # receive data from producers for r in range(self.num_producers): @@ -140,7 +139,6 @@ def loop(self) -> None: else: self.buffer = self.buffer[self.dp_size * self.minibatch_size :] if loss is not None: - allow_sync_model = True pbar.set_postfix({"loss": loss}) i += 1 
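Why the `numel()` guard added in the "handle empty index" change matters, shown standalone (illustrative, not part of the patch): `torch.nonzero(mask).squeeze()` yields a 0-d tensor for a single match and an empty tensor for no match, and both break `len()` or negative slicing unless checked first. The helper below is hypothetical.

```python
import torch

def last_true_indices(mask, k):
    idx = torch.nonzero(mask)
    if idx.numel() == 0:              # no matching prompts at all
        return torch.empty(0, dtype=torch.long)
    idx = idx.squeeze(-1)             # squeeze only the column dim so a single hit stays 1-D
    return idx[-min(k, len(idx)):]

print(last_true_indices(torch.tensor([False, False, False]), k=2))  # tensor([], dtype=torch.int64)
print(last_true_indices(torch.tensor([False, True, False]), k=2))   # tensor([1])
print(last_true_indices(torch.tensor([True, True, True]), k=2))     # tensor([1, 2])
```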
if self.lr_scheduler is not None: @@ -154,31 +152,29 @@ def loop(self) -> None: print(f"Saved model checkpoint at step {step + 1} in folder {save_path}") if episode != self.num_episodes - 1 or step != self.num_update_per_episode - 1: - if allow_sync_model: - if self.pp_size > 1: - print( - f"[T{dist.get_rank()}] Sync model PP stage {self.pp_rank} episode {episode} step {step}" + if self.pp_size > 1: + print( + f"[T{dist.get_rank()}] Sync model PP stage {self.pp_rank} episode {episode} step {step}" + ) + else: + print(f"[T{dist.get_rank()}] Sync model episode {episode} step {step}") + torch.cuda.empty_cache() + state_dict = self.state_dict() + if self.pp_size > 1: + if self.tp_rank == 0 and self.dp_rank == 0: + ray_broadcast_tensor_dict( + state_dict, + src=self.num_producers, + device=self.device, + group_name=f"sync_model_{self.pp_rank}", ) - else: - print(f"[T{dist.get_rank()}] Sync model episode {episode} step {step}") - torch.cuda.empty_cache() - state_dict = self.state_dict() - if self.pp_size > 1: - if self.tp_rank == 0 and self.dp_rank == 0: - ray_broadcast_tensor_dict( - state_dict, - src=self.num_producers, - device=self.device, - group_name=f"sync_model_{self.pp_rank}", - ) - else: - if self.rank == 0: - ray_broadcast_tensor_dict( - state_dict, src=self.num_producers, device=self.device, group_name="sync_model" - ) - del state_dict - torch.cuda.empty_cache() - allow_sync_model = False + else: + if self.rank == 0: + ray_broadcast_tensor_dict( + state_dict, src=self.num_producers, device=self.device, group_name="sync_model" + ) + del state_dict + torch.cuda.empty_cache() @ray.remote diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 31b6876397fc..72e54b0dede9 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -239,17 +239,22 @@ def step(self, step_idx: int, pbar: Any, **kwargs) -> Optional[float]: # TODO: customize excessive prompts calculation. if excessive_prompts_per_rank != 0: # Mask excessive prompts to False - true_indices = torch.nonzero(effective_prompts_mask).squeeze() - if excessive_prompts_per_rank <= len(true_indices): - excessive_prompts_idx = true_indices[-excessive_prompts_per_rank:] - else: - excessive_prompts_idx = true_indices - effective_prompts_mask[excessive_prompts_idx] = False + true_indices = torch.nonzero(effective_prompts_mask) + # Make sure the indices are not empty. + if true_indices.numel() > 0: + true_indices = true_indices.squeeze() + if excessive_prompts_per_rank <= len(true_indices): + excessive_prompts_idx = true_indices[-excessive_prompts_per_rank:] + else: + excessive_prompts_idx = true_indices + effective_prompts_mask[excessive_prompts_idx] = False - for mask_idx in range(len(effective_prompts_mask)): - if effective_prompts_mask[mask_idx] == False: - # Update loss mask. - loss_mask[mask_idx] = False + for mask_idx in range(len(effective_prompts_mask)): + if effective_prompts_mask[mask_idx] == False: + # Update loss mask. + loss_mask[mask_idx] = False + else: + excessive_prompts_idx = torch.empty([0]) else: # If dynamic batching is disabled, we need to use all samples for training. 
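A condensed sketch (not part of the patch) of the flow introduced by the "move prompt-level-filtering to buffer side" commits: each received group is scored first, then kept only if its mean answer accuracy falls inside the filter range, so groups that would contribute no gradient never enter the buffer. The toy checker and field names are assumptions.

```python
import torch

def score_group(group, is_correct):
    # Toy stand-in for calculate_group_reward: attach per-sample accuracy.
    group["ans_acc"] = torch.tensor([is_correct(r) for r in group["responses"]])
    return group

def filter_group(group, filter_range=(0.01, 0.99)):
    # Toy stand-in for prompt_level_filtering: drop all-correct / all-wrong groups.
    acc = group["ans_acc"].float().mean().item()
    return group if filter_range[0] < acc < filter_range[1] else None

raw_batch = [
    {"responses": ["42", "41", "42", "40"]},   # mixed accuracy: kept
    {"responses": ["7", "7", "7", "7"]},       # all correct: dropped
]
is_correct = lambda r: 1.0 if r in ("42", "7") else 0.0   # assumed answer checker
buffer = [g for g in (filter_group(score_group(g, is_correct)) for g in raw_batch) if g is not None]
print(len(buffer))   # 1
```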
need_update = (step_idx + 1) % self.num_microbatches == 0 From 11a5854b50c67afebfbd6250db36f6e0940e471d Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Fri, 16 May 2025 14:08:23 +0800 Subject: [PATCH 063/100] remove redundant code and fix bugs --- .../coati/distributed/consumer.py | 23 ++++----- .../coati/distributed/grpo_consumer.py | 47 +++---------------- .../ColossalChat/coati/distributed/launch.py | 2 +- .../coati/distributed/producer.py | 2 +- applications/ColossalChat/rl_example.py | 12 +++-- 5 files changed, 27 insertions(+), 59 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index 6385df2cfc2f..531cf363c581 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -121,14 +121,14 @@ def loop(self) -> None: raw_batch = unbind_batch( ray_broadcast_tensor_dict(None, src=0, device=self.device, group_name=f"sync_data_{r}") ) - filtered_batch = [ - t - for t in [ - self.prompt_level_filtering(self.calculate_group_reward(group)) - for group in raw_batch - ] - if t is not None + processed_batch = [ + self.prompt_level_filtering(self.calculate_group_reward(group)) for group in raw_batch ] + filtered_batch = [t for t in processed_batch if t is not None] + if self.filter_range is not None: + print( + f"[T{dist.get_rank()}] Filter recv data: {len(processed_batch)} -> {len(filtered_batch)}" + ) self.buffer.extend(filtered_batch) while len(self.buffer) >= self.dp_size * self.minibatch_size: @@ -137,13 +137,8 @@ def loop(self) -> None: ] batch = bind_batch(batches) batch = post_recv(batch) - loss, excessive_prompts_idx = self.step(i, pbar, **batch) - - if excessive_prompts_idx is not None: - excessive_prompts = [self.buffer[idx] for idx in excessive_prompts_idx] - self.buffer = excessive_prompts + self.buffer[self.dp_size * self.minibatch_size :] - else: - self.buffer = self.buffer[self.dp_size * self.minibatch_size :] + loss = self.step(i, pbar, **batch) + self.buffer = self.buffer[self.dp_size * self.minibatch_size :] if loss is not None: allow_sync_model = True pbar.set_postfix({"loss": loss}) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index e709c8aed2be..f73f2d96f256 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -9,7 +9,7 @@ from coati.distributed.reward.reward_fn import boxed_math_reward_fn, math_reward_fn from coati.distributed.reward.verifiable_reward import VerifiableReward from coati.distributed.utils import calc_action_log_probs -from coati.trainer.utils import all_gather_tensors, all_reduce_mean, all_reduce_sum +from coati.trainer.utils import all_reduce_mean, all_reduce_sum from transformers import AutoModelForCausalLM, AutoTokenizer from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR @@ -201,10 +201,7 @@ def step(self, step_idx: int, pbar: Any, **kwargs) -> Optional[float]: reward_std = group_reward.std(dim=1).repeat_interleave(self.num_generations, dim=0) # [minibatch_size x num_generations] advantages = ((reward - reward_mean) / (reward_std + 1e-4)).unsqueeze(dim=-1) - # filter out the reward that is too high (all sample gets full score) or too low (all sample gets 0 score), - group_ans_acc = ( - ans_acc.view(-1, self.num_generations).mean(dim=1).repeat_interleave(self.num_generations, dim=0) - ) + # [minibatch_size x 
num_of_generation] loss_mask = torch.ones(action_mask.size(0), device=action_mask.device).bool() @@ -214,37 +211,14 @@ def step(self, step_idx: int, pbar: Any, **kwargs) -> Optional[float]: loss_mask, action_mask[:, -1] == False, ) - prompt_level_mask = loss_mask.view(self.minibatch_size, self.num_generations) - - # [minibatch_size] -> calculate the number of effective prompts - effective_prompts_mask = prompt_level_mask.any(dim=1) - effective_prompts = all_reduce_sum(torch.sum(effective_prompts_mask), self.plugin) - self.effective_prompt_count += effective_prompts.item() - excessive_prompts_idx = None + self.effective_prompt_count += group_reward.size(0) * self.dp_size mean_kl, mean_loss = [], [] if self.grpo_config.get("dynamic_batching", True): need_update = self.effective_prompt_count >= self.batch_size * self.dp_size excessive_prompts = self.effective_prompt_count - self.batch_size * self.dp_size - - if excessive_prompts > 0: - excessive_prompts_per_rank = excessive_prompts // self.dp_size - # Only count excessive prompts if they are greater than 1 per rank. - # TODO: customize excessive prompts calculation. - if excessive_prompts_per_rank != 0: - # Mask excessive prompts to False - true_indices = torch.nonzero(effective_prompts_mask).squeeze() - if excessive_prompts_per_rank <= len(true_indices): - excessive_prompts_idx = true_indices[-excessive_prompts_per_rank:] - else: - excessive_prompts_idx = true_indices - effective_prompts_mask[excessive_prompts_idx] = False - - for mask_idx in range(len(effective_prompts_mask)): - if effective_prompts_mask[mask_idx] == False: - # Update loss mask. - loss_mask[mask_idx] = False + assert excessive_prompts <= 0, "Debug: Excessive prompts should always be less than 0. Bug!!!!" else: # If dynamic batching is disabled, we need to use all samples for training. need_update = (step_idx + 1) % self.num_microbatches == 0 @@ -460,9 +434,7 @@ def _criterion(outputs, inputs): self.optimizer.step() self.optimizer.zero_grad() self.global_step += 1 - self.total_sample_count = all_reduce_sum( - torch.tensor(self.total_sample_count).to(self.accum_loss.device), self.plugin - ).item() + # no need to run all_reduce_sum on total_sample_count, because all dp ranks recieves a complete inference batch from all producers. sample_utilization = self.effective_sample_count / self.total_sample_count self.effective_prompt_count = 0 self.effective_sample_count = 0 @@ -507,14 +479,9 @@ def _criterion(outputs, inputs): self.accum_advantages.zero_() self.accum_response_length.zero_() self.accum_count = 0 - - if excessive_prompts_idx is not None: - # All gather excessive prompts index across DP ranks. 
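A schematic of the dynamic-batching bookkeeping after this cleanup (illustrative, not part of the patch): since out-of-range groups are already dropped before they reach the buffer, the consumer only counts surviving prompt groups and triggers an optimizer step once a full global batch has accumulated, with no overshoot to carry over. The numbers below are assumed.

```python
# Assumed configuration for the demo
batch_size, dp_size, minibatch_size = 8, 2, 4

effective_prompt_count = 0
optimizer_steps = 0
incoming_minibatches = [minibatch_size] * 5    # groups surviving the prompt-level filter

for groups in incoming_minibatches:
    # every DP rank trains on the same number of groups per step
    effective_prompt_count += groups * dp_size
    if effective_prompt_count >= batch_size * dp_size:
        optimizer_steps += 1
        # batch_size is divisible by minibatch_size, so there is never an excess
        assert effective_prompt_count - batch_size * dp_size == 0
        effective_prompt_count = 0

print(optimizer_steps)   # 2 optimizer steps out of 5 received minibatches
```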
- excessive_prompts_idx = [idx + self.dp_rank * self.minibatch_size for idx in excessive_prompts_idx] - excessive_prompts_idx = all_gather_tensors(excessive_prompts_idx, self.plugin) - return loss_scalar, excessive_prompts_idx + return loss_scalar else: - return None, excessive_prompts_idx + return None def calculate_group_reward(self, rollout_group: Dict[str, Any]) -> Dict[str, Any]: """ diff --git a/applications/ColossalChat/coati/distributed/launch.py b/applications/ColossalChat/coati/distributed/launch.py index 327db1b5560a..e6367d2d16df 100644 --- a/applications/ColossalChat/coati/distributed/launch.py +++ b/applications/ColossalChat/coati/distributed/launch.py @@ -66,7 +66,7 @@ def launch_distributed( dataset_path = train_dataset_config["path"] num_samples = get_jsonl_size_fast(dataset_path) - global_inference_batch_size = inference_batch_size * num_producers # TODO: this doesn't support TP on producer + global_inference_batch_size = inference_batch_size * num_producers num_update_per_episode = num_samples // global_inference_batch_size num_recv_per_update = inference_batch_size // inference_microbatch_size diff --git a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py index 8b94521606c7..d796bff48a17 100644 --- a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -187,7 +187,7 @@ def loop(self) -> None: for eval_task_name in self.eval_dataloaders: if self.producer_idx == 0: print( - f"[P{self.producer_idx}] Evaluate episode {episode} step {self.consumer_global_step} on task {eval_task_name}" + f"[P{self.producer_idx}] Evaluate consumer step {self.consumer_global_step} on task {eval_task_name}" ) eval_results = [] eval_statistics_tensor = torch.zeros((2,), dtype=torch.float32).to(self.device) diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index 2ed9ef62c422..9f89eb5466be 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -104,7 +104,13 @@ choices=["think_answer_tags", "boxed"], help="Reward type for GRPO.", ) - parser.add_argument("-ei", "--eval-interval", type=int, default=100, help="Interval for evaluation.") + parser.add_argument( + "-ei", + "--eval-interval", + type=int, + default=100, + help="Interval for evaluation. 
Evaluate every ei consumer steps.", + ) # Logging/Checkpointing parameters parser.add_argument("-si", "--save-interval", type=int, default=100, help="Interval for saving checkpoints.") @@ -125,8 +131,8 @@ and args.train_microbatch_size > 0 ), "Train micro batch size must be greater than 0 less than train mini batch size * num generations" assert ( - args.train_minibatch_size <= args.train_batch_size - ), "Train mini batch size must be less than or equals to train batch size" + args.train_minibatch_size <= args.train_batch_size and args.train_batch_size % args.train_minibatch_size == 0 + ), "Train mini batch size must be less than or equals to train batch size and train batch size must be divisible by train mini batch size" if args.master_address is None: # Default settings: Using single machine From 203dfb15365beb2c63a364665e342ac44da25111 Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Fri, 16 May 2025 14:15:35 +0800 Subject: [PATCH 064/100] address conversation --- applications/ColossalChat/coati/distributed/launch.py | 2 +- applications/ColossalChat/coati/distributed/producer.py | 4 ++-- applications/ColossalChat/rl_example.py | 8 +++++++- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/launch.py b/applications/ColossalChat/coati/distributed/launch.py index 327db1b5560a..e6367d2d16df 100644 --- a/applications/ColossalChat/coati/distributed/launch.py +++ b/applications/ColossalChat/coati/distributed/launch.py @@ -66,7 +66,7 @@ def launch_distributed( dataset_path = train_dataset_config["path"] num_samples = get_jsonl_size_fast(dataset_path) - global_inference_batch_size = inference_batch_size * num_producers # TODO: this doesn't support TP on producer + global_inference_batch_size = inference_batch_size * num_producers num_update_per_episode = num_samples // global_inference_batch_size num_recv_per_update = inference_batch_size // inference_microbatch_size diff --git a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py index 8b8be8e16f53..82f57094d5d9 100644 --- a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -187,7 +187,7 @@ def loop(self) -> None: for eval_task_name in self.eval_dataloaders: if self.producer_idx == 0: print( - f"[P{self.producer_idx}] Evaluate episode {episode} step {self.consumer_global_step} on task {eval_task_name}" + f"[P{self.producer_idx}] Evaluate model at training step {self.consumer_global_step} on task {eval_task_name}" ) eval_results = [] eval_statistics_tensor = torch.zeros((2,), dtype=torch.float32).to(self.device) @@ -220,7 +220,7 @@ def loop(self) -> None: safe_append_to_jsonl_file( os.path.join( self.eval_save_dir, - f"{eval_task_name}_episode_{episode}_step_{self.consumer_global_step}.jsonl", + f"{eval_task_name}_training_step_{self.consumer_global_step}.jsonl", ), eval_results, ) diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index 2ed9ef62c422..8d1f25e74e3a 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -104,7 +104,13 @@ choices=["think_answer_tags", "boxed"], help="Reward type for GRPO.", ) - parser.add_argument("-ei", "--eval-interval", type=int, default=100, help="Interval for evaluation.") + parser.add_argument( + "-ei", + "--eval-interval", + type=int, + default=100, + help="Interval for evaluation. 
Evaluate every ei training steps.", + ) # Logging/Checkpointing parameters parser.add_argument("-si", "--save-interval", type=int, default=100, help="Interval for saving checkpoints.") From 021914c565ed5310671b974cc28b4525bf3a5a86 Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Fri, 16 May 2025 15:56:03 +0800 Subject: [PATCH 065/100] support logging rollouts to wandb --- .../coati/distributed/producer.py | 42 +++++++++++++++---- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py index 82f57094d5d9..0d91f43f154c 100644 --- a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -49,6 +49,7 @@ def __init__( project_name: str = None, run_name: str = None, wandb_group_name: str = None, + wandb_log_rollout_interval: int = 20, ): self.producer_idx = producer_idx self.num_producers = num_producers @@ -58,7 +59,7 @@ def __init__( self.microbatch_size = microbatch_size assert batch_size % microbatch_size == 0 self.num_microbatches = batch_size // microbatch_size - self.lastest_eval_step = -1 + self.latest_eval_step = -1 self.train_dataset_config = train_dataset_config self.model_config = model_config @@ -68,6 +69,10 @@ def __init__( self.eval_interval = eval_interval self.eval_save_dir = eval_save_dir self.consumer_global_step = 0 + self.eval_mode = False + self.wandb_rollout_data = [] + self.wandb_log_rollout_interval = wandb_log_rollout_interval + self.latest_rollout_log_step = -1 if self.producer_idx == 0: self.wandb_run = wandb.init( project=project_name, @@ -77,7 +82,7 @@ def __init__( group=wandb_group_name, ) - if os.path.exists(self.eval_save_dir): + if os.path.exists(self.eval_save_dir) and self.eval_interval > 0: raise ValueError(f"Eval save dir {self.eval_save_dir} already exists. 
Please delete it or change the name.") # init tokenizer @@ -180,10 +185,11 @@ def loop(self) -> None: break if self.eval_interval > 0 and self.eval_dataset_config is not None: if ( - self.consumer_global_step - self.lastest_eval_step >= self.eval_interval - and self.consumer_global_step > self.lastest_eval_step - ): + self.consumer_global_step - self.latest_eval_step >= self.eval_interval + and self.consumer_global_step > self.latest_eval_step + ) or self.latest_eval_step == -1: to_log_msg = {} + self.eval_mode = True for eval_task_name in self.eval_dataloaders: if self.producer_idx == 0: print( @@ -227,7 +233,8 @@ def loop(self) -> None: if self.producer_idx == 0: self.wandb_run.log(to_log_msg, step=self.consumer_global_step) - self.lastest_eval_step = self.consumer_global_step + self.eval_mode = False + self.latest_eval_step = self.consumer_global_step outputs = self.rollout(**batch) print(f"[P{self.producer_idx}] Send data {[(k, v.shape) for k, v in outputs.items()]}") @@ -345,9 +352,26 @@ def __init__( @torch.no_grad() def rollout(self, input_ids, attention_mask, **kwargs): rollouts = self.model.generate(input_ids, attention_mask, **kwargs) - # if self.producer_idx == 1: - # print("Rollout example:\n", self.tokenizer.decode(rollouts["input_ids"][0][0], skip_special_tokens=True)) - + if self.producer_idx == 0 and not self.eval_mode: + wandb_rollout_data = self.wandb_rollout_data + [ + [ + str(self.consumer_global_step), + str(self.tokenizer.decode(rollouts["input_ids"][0][0], skip_special_tokens=True)), + ] + ] + if ( + self.consumer_global_step - self.latest_rollout_log_step >= self.wandb_log_rollout_interval + or self.latest_rollout_log_step == -1 + ): + self.wandb_rollout_data = wandb_rollout_data + self.latest_rollout_log_step = self.consumer_global_step + self.wandb_run.log( + { + "rollout/rollout_examples": wandb.Table( + columns=["train_step", "rollout_examples"], data=wandb_rollout_data + ) + } + ) return rollouts def load_state_dict(self, state_dict): From 03b41d6fb5f49b724e0a50d7b6bb3affee173794 Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Fri, 16 May 2025 18:04:38 +0800 Subject: [PATCH 066/100] upgrade reward functions --- .../coati/distributed/grpo_consumer.py | 4 +- .../coati/distributed/reward/reward_fn.py | 143 ++++++++++++++---- applications/ColossalChat/rl_example.py | 3 + 3 files changed, 123 insertions(+), 27 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index eae4ff54e994..8de8b774e333 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -127,7 +127,9 @@ def __init__( "answer_end": {"text": "", "num_occur": 1}, } reward_model_kwargs = { - k: v for k, v in grpo_config.items() if k in ["soft_over_length_punishment", "max_length", "cache_length"] + k: v + for k, v in grpo_config.items() + if k in ["soft_over_length_punishment", "max_new_tokens", "cache_length"] } self.reward_model = VerifiableReward( reward_fns=[ diff --git a/applications/ColossalChat/coati/distributed/reward/reward_fn.py b/applications/ColossalChat/coati/distributed/reward/reward_fn.py index 14d340dc4932..a4042ae97707 100644 --- a/applications/ColossalChat/coati/distributed/reward/reward_fn.py +++ b/applications/ColossalChat/coati/distributed/reward/reward_fn.py @@ -1,7 +1,77 @@ import torch +from latex2sympy2_extended import NormalizationConfig +from math_verify import ExprExtractionConfig, 
LatexExtractionConfig, parse, verify from .reward_utils import extract_boxed_solution, extract_solution, validate_response_structure +CANNOT_PARSE_GT_ANSWER = -1 +CANNOT_PARSE_PREDICTION = -2 +SUCCESS = 1 +MATCHING_FAIL = 0 + + +def verify_math_representation(completion, gt_answer): + """ + Verify if the completion is a valid math representation of the gt_answer. + """ + if not completion.startswith("\\boxed{"): + completion = "\\boxed{" + completion + "}" + if not gt_answer.startswith("\\boxed{"): + gt_answer = "\\boxed{" + gt_answer + "}" + target = ( + ExprExtractionConfig(), + LatexExtractionConfig( + normalization_config=NormalizationConfig( + nits=False, + malformed_operators=False, + basic_latex=True, + boxed="all", + units=True, + ), + boxed_match_priority=0, + ), + ) + if not isinstance(gt_answer, str) or len(gt_answer) == 0: + raise ValueError("gt_answer should be a string, please verify your training data.") + if not isinstance(completion, str) or len(completion) == 0: + return MATCHING_FAIL + try: + parsed_gt_answer = parse(gt_answer, extraction_config=target) + if len(parsed_gt_answer) == 0: + return CANNOT_PARSE_GT_ANSWER + parsed_completion = parse(completion, extraction_config=target) + if len(parsed_completion) == 0: + return CANNOT_PARSE_PREDICTION + if verify(parsed_gt_answer, parsed_completion): + return SUCCESS + else: + return MATCHING_FAIL + except Exception: + return MATCHING_FAIL + + +def verify_model_answer(decoded_final_answer, gt_answer, ans_acc, acc_score, reward): + math_verify_result = verify_math_representation(decoded_final_answer, gt_answer) + exact_match_result = ( + SUCCESS + if decoded_final_answer.strip().replace(" ", "").replace("{", "").replace("}", "").replace(",", "") + == gt_answer.strip().replace(" ", "").replace("{", "").replace("}", "").replace(",", "") + else MATCHING_FAIL + ) + if math_verify_result == SUCCESS: + ans_acc += 1 + reward += acc_score + elif exact_match_result == SUCCESS: + # sometimes for answers that's not a (valid) math expression, math_verify will fail + ans_acc += 1 + if math_verify_result == CANNOT_PARSE_PREDICTION: + reward += ( + acc_score / 2 + ) # not a valid latex math representation, but the answer is correct, receive half of the score + else: + reward += acc_score + return reward, ans_acc + def math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): tokenizer = kwargs["tokenizer"] @@ -14,15 +84,18 @@ def math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): s, e = response_idx[0], response_idx[1] length_reward = 0.0 - if soft_over_length_punishment: - max_length = kwargs.get("max_length", 1024 * 4) - cache_length = kwargs.get("cache_length", 512) - res_length = e.item() - s.item() + 1 - if max_length - cache_length < res_length < max_length: - length_reward = ((max_length - cache_length) - res_length) / cache_length * acc_score + res_length = e.item() - s.item() + 1 + if not eval_mode: + max_new_tokens = kwargs["max_new_tokens"] + else: + max_new_tokens = -1 # for eval mode, we don't need to check the length + if not eval_mode and soft_over_length_punishment: + cache_length = kwargs["cache_length"] + if max_new_tokens - cache_length < res_length < max_new_tokens: + length_reward = ((max_new_tokens - cache_length) - res_length) / cache_length * acc_score if gt_answer is None: - return reward + raise ValueError("no gt_answer is provided, please check your training dataset.") decoded_final_answer = tokenizer.decode(input_ids[s : e + 1], skip_special_tokens=True) gt_answer = 
tokenizer.decode(gt_answer.squeeze(0), skip_special_tokens=True) @@ -35,15 +108,15 @@ def math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): format_acc += 1 # Check answer accuracy, answer is considered correct if the answer is correct and the format is valid - if ( - format_valid - and final_answer is not None - and gt_answer.strip().replace(" ", "").lower() == final_answer.strip().replace(" ", "").lower() - ): - ans_acc += 1 - reward += acc_score + if final_answer is not None: + if eval_mode or format_valid: + reward, ans_acc = verify_model_answer(final_answer, gt_answer, ans_acc, acc_score, reward) + if not eval_mode: + reward = reward + length_reward - reward = reward + length_reward + # Check if the sequence is over length + if not eval_mode and res_length >= max_new_tokens: + reward *= 0.0 if not eval_mode: return torch.tensor([reward, format_acc, ans_acc]).to(input_ids.device) @@ -56,6 +129,8 @@ def math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): "parsed": final_answer, "format_valid": format_acc.item(), "ans_valid": ans_acc.item(), + "response_length": res_length, + "reward": reward.item(), } @@ -71,31 +146,45 @@ def boxed_math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): s, e = response_idx[0], response_idx[1] length_reward = 0.0 - if soft_over_length_punishment: - max_length = kwargs.get("max_length", 1024 * 4) - cache_length = kwargs.get("cache_length", 512) - res_length = e.item() - s.item() + 1 - if max_length - cache_length < res_length < max_length: - length_reward = ((max_length - cache_length) - res_length) / cache_length * acc_score + res_length = e.item() - s.item() + 1 + if not eval_mode: + max_new_tokens = kwargs["max_new_tokens"] + else: + max_new_tokens = -1 # for eval mode, we don't need to check the length + if not eval_mode and soft_over_length_punishment: + cache_length = kwargs["cache_length"] + if max_new_tokens - cache_length < res_length < max_new_tokens: + length_reward = ((max_new_tokens - cache_length) - res_length) / cache_length * acc_score if gt_answer is None: - return torch.tensor([reward, format_acc, ans_acc]).to(input_ids.device) + raise ValueError("no gt_answer is provided, please check your training dataset.") decoded_final_answer = tokenizer.decode(input_ids[s : e + 1], skip_special_tokens=True) + gt_answer = tokenizer.decode(gt_answer.squeeze(0), skip_special_tokens=True) final_answer = extract_boxed_solution(decoded_final_answer) format_valid = final_answer is not None + if "tags" in kwargs and kwargs["tags"]: + tags = kwargs["tags"] + format_valid = format_valid and all( + [decoded_final_answer.count(tags[tag]["text"]) == tags[tag]["num_occur"] for tag in tags] + ) # Check format accuracy if format_valid: format_acc += 1 reward += format_score # Check answer accuracy, answer is considered correct if the answer is correct and the format is valid - if format_valid and final_answer is not None and gt_answer.strip().lower() == final_answer.strip().lower(): - ans_acc += 1 - reward += acc_score + if final_answer is not None: + if eval_mode or format_valid: + reward, ans_acc = verify_model_answer(final_answer, gt_answer, ans_acc, acc_score, reward) + if not eval_mode: + reward = reward + length_reward + + # Check if the sequence is over length + if not eval_mode and res_length >= max_new_tokens: + reward *= 0.0 - reward = reward + length_reward if not eval_mode: return torch.tensor([reward, format_acc, ans_acc]).to(input_ids.device) else: @@ -107,4 +196,6 @@ def boxed_math_reward_fn(input_ids, gt_answer, 
response_idx, **kwargs): "parsed": final_answer, "format_valid": format_acc.item(), "ans_valid": ans_acc.item(), + "response_length": res_length, + "reward": reward.item(), } diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index 8d1f25e74e3a..071912ddf440 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -198,6 +198,8 @@ "beta": args.kl_coeff, # KL penalty coefficient "loss_variation": "sample_level", "reward_fn_type": args.reward_type, + "max_length": args.max_new_tokens + args.max_prompt_tokens, + "max_new_tokens": args.max_new_tokens, } elif args.algo == "DAPO": # DAPO variant settings @@ -213,6 +215,7 @@ "loss_variation": "token_level", "soft_over_length_punishment": True, "max_length": args.max_new_tokens + args.max_prompt_tokens, + "max_new_tokens": args.max_new_tokens, "cache_length": min(1024, int(args.max_new_tokens / 4)), "filter_truncated_response": True, "reward_fn_type": args.reward_type, From 107470a36092996bbd2fcf564156ce3bb8416971 Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Sat, 17 May 2025 21:12:58 +0800 Subject: [PATCH 067/100] fix logging rollouts --- .gitignore | 1 + .../coati/distributed/grpo_consumer.py | 16 +++--- .../ColossalChat/coati/distributed/launch.py | 4 ++ .../coati/distributed/producer.py | 54 ++++++++++++------- applications/ColossalChat/rl_example.py | 5 ++ 5 files changed, 56 insertions(+), 24 deletions(-) diff --git a/.gitignore b/.gitignore index d48035a5b6df..0503f3c9522d 100644 --- a/.gitignore +++ b/.gitignore @@ -166,3 +166,4 @@ applications/ColossalChat/tests/logs applications/ColossalChat/wandb applications/ColossalChat/model applications/ColossalChat/eval +applications/ColossalChat/rollouts diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 8de8b774e333..70e2201fefb8 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -120,12 +120,16 @@ def __init__( "either max_tokens (vllm) or max_new_tokens (transformers) must be set in generate_config." ) # Initialize verifiable reward. 
- response_format_tags = { - "think_start": {"text": "", "num_occur": 1}, - "think_end": {"text": "", "num_occur": 1}, - "answer_start": {"text": "", "num_occur": 1}, - "answer_end": {"text": "", "num_occur": 1}, - } + response_format_tags = ( + { + "think_start": {"text": "", "num_occur": 1}, + "think_end": {"text": "", "num_occur": 1}, + "answer_start": {"text": "", "num_occur": 1}, + "answer_end": {"text": "", "num_occur": 1}, + } + if grpo_config.get("reward_fn_type") == "think_answer_tags" + else None + ) reward_model_kwargs = { k: v for k, v in grpo_config.items() diff --git a/applications/ColossalChat/coati/distributed/launch.py b/applications/ColossalChat/coati/distributed/launch.py index e6367d2d16df..6eeb5d379b62 100644 --- a/applications/ColossalChat/coati/distributed/launch.py +++ b/applications/ColossalChat/coati/distributed/launch.py @@ -55,6 +55,8 @@ def launch_distributed( eval_interval: int = 100, eval_save_dir: Optional[str] = None, eval_generation_config: Optional[Dict[str, Any]] = None, + log_rollout_interval: int = 20, + rollout_log_file: str = "./rollout_log.jsonl", ): if core_algo not in ALGO_MAP: raise NotImplementedError(f"{core_algo} is not supported yet.") @@ -98,6 +100,8 @@ def launch_distributed( project_name=project_name, run_name=run_name, wandb_group_name=wandb_group_name, + log_rollout_interval=log_rollout_interval, + rollout_log_file=rollout_log_file, ) procs.append(producer) generate_config_consumer = copy.deepcopy(generate_config) diff --git a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py index 0d91f43f154c..75879278caa8 100644 --- a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -1,4 +1,5 @@ import copy +import json import os from typing import Any, Dict, Optional @@ -49,7 +50,8 @@ def __init__( project_name: str = None, run_name: str = None, wandb_group_name: str = None, - wandb_log_rollout_interval: int = 20, + log_rollout_interval: int = 20, + rollout_log_file: str = "./rollout_log.jsonl", ): self.producer_idx = producer_idx self.num_producers = num_producers @@ -70,9 +72,16 @@ def __init__( self.eval_save_dir = eval_save_dir self.consumer_global_step = 0 self.eval_mode = False - self.wandb_rollout_data = [] - self.wandb_log_rollout_interval = wandb_log_rollout_interval + self.log_rollout_interval = log_rollout_interval self.latest_rollout_log_step = -1 + if producer_idx == 0: + if os.path.exists(rollout_log_file): + raise ValueError( + f"Rollout log file {rollout_log_file} already exists. Please delete it or change the name." 
+ ) + else: + os.makedirs(os.path.dirname(rollout_log_file), exist_ok=True) + self.rollout_log_file = open(rollout_log_file, "w", encoding="utf8") if self.producer_idx == 0: self.wandb_run = wandb.init( project=project_name, @@ -320,6 +329,8 @@ def __init__( project_name: str = None, run_name: str = None, wandb_group_name: str = None, + log_rollout_interval: int = 20, + rollout_log_file: str = "./rollout_log.jsonl", ): super().__init__( producer_idx, @@ -342,6 +353,8 @@ def __init__( project_name=project_name, run_name=run_name, wandb_group_name=wandb_group_name, + log_rollout_interval=log_rollout_interval, + rollout_log_file=rollout_log_file, ) self.model = self.backend_cls(model_config, generate_config, self.tokenizer, num_generations) self.eval_generation_config = copy.deepcopy(self.model.generate_config) @@ -353,26 +366,31 @@ def __init__( def rollout(self, input_ids, attention_mask, **kwargs): rollouts = self.model.generate(input_ids, attention_mask, **kwargs) if self.producer_idx == 0 and not self.eval_mode: - wandb_rollout_data = self.wandb_rollout_data + [ - [ - str(self.consumer_global_step), - str(self.tokenizer.decode(rollouts["input_ids"][0][0], skip_special_tokens=True)), - ] - ] if ( - self.consumer_global_step - self.latest_rollout_log_step >= self.wandb_log_rollout_interval + self.consumer_global_step - self.latest_rollout_log_step >= self.log_rollout_interval or self.latest_rollout_log_step == -1 ): - self.wandb_rollout_data = wandb_rollout_data - self.latest_rollout_log_step = self.consumer_global_step - self.wandb_run.log( - { - "rollout/rollout_examples": wandb.Table( - columns=["train_step", "rollout_examples"], data=wandb_rollout_data + new_record = ( + json.dumps( + { + "train_step": self.consumer_global_step, + "rollout": self.tokenizer.batch_decode( + rollouts["input_ids"][:, 0], skip_special_tokens=True + ), + } ) - } - ) + + "\n" + ) + self.rollout_log_file.write(new_record) + self.rollout_log_file.flush() + self.latest_rollout_log_step = self.consumer_global_step return rollouts + def __del__(self): + if self.producer_idx == 0: + self.wandb_run.finish() + if hasattr(self, "rollout_log_file"): + self.rollout_log_file.close() + def load_state_dict(self, state_dict): self.model.load_state_dict(state_dict) diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index 071912ddf440..98c139f14e04 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -118,6 +118,9 @@ parser.add_argument( "-esd", "--eval-save-dir", type=str, default="./eval", help="Directory for saving evaluation results." ) + parser.add_argument( + "-rsd", "--rollout-save-dir", type=str, default="./rollouts", help="Directory for saving rollout loggings." 
+ ) args = parser.parse_args() if args.train_minibatch_size is None: @@ -269,4 +272,6 @@ eval_interval=args.eval_interval, eval_save_dir=os.path.join(args.eval_save_dir, args.project.replace(" ", "_")), eval_generation_config=eval_generation_config, + log_rollout_interval=20, + rollout_log_file=os.path.join(args.rollout_save_dir, args.project.replace(" ", "_") + ".jsonl"), ) From f8bd2db33fa3b42ef8b1ddcf2c96ce6535ab672c Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Tue, 20 May 2025 09:45:56 +0800 Subject: [PATCH 068/100] add uuid to rollout log --- applications/ColossalChat/coati/distributed/launch.py | 6 +++++- applications/ColossalChat/rl_example.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/launch.py b/applications/ColossalChat/coati/distributed/launch.py index 6eeb5d379b62..ef81bcbdd65d 100644 --- a/applications/ColossalChat/coati/distributed/launch.py +++ b/applications/ColossalChat/coati/distributed/launch.py @@ -56,7 +56,7 @@ def launch_distributed( eval_save_dir: Optional[str] = None, eval_generation_config: Optional[Dict[str, Any]] = None, log_rollout_interval: int = 20, - rollout_log_file: str = "./rollout_log.jsonl", + rollout_save_dir: str = "./rollout", ): if core_algo not in ALGO_MAP: raise NotImplementedError(f"{core_algo} is not supported yet.") @@ -74,6 +74,10 @@ def launch_distributed( run_name = f"{inference_backend}_bs_{train_batch_size * train_dp_size}_temp_{generate_config['temperature']:.01f}_top_p_{generate_config['top_p']:.02f}" wandb_group_name = str(uuid.uuid4()) + rollout_log_file = os.path.join( + rollout_save_dir, + f"{project_name.replace(' ','_')}_run_{wandb_group_name}.jsonl", + ) procs = [] for i in range(num_producers): diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index 98c139f14e04..bfa0ab7d05eb 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -273,5 +273,5 @@ eval_save_dir=os.path.join(args.eval_save_dir, args.project.replace(" ", "_")), eval_generation_config=eval_generation_config, log_rollout_interval=20, - rollout_log_file=os.path.join(args.rollout_save_dir, args.project.replace(" ", "_") + ".jsonl"), + rollout_save_dir=args.rollout_save_dir, ) From 32afa7bf29387045999df227faea2da8eed2faee Mon Sep 17 00:00:00 2001 From: Tong Li Date: Tue, 20 May 2025 17:41:44 +0800 Subject: [PATCH 069/100] fix empty tensor (#6319) Co-authored-by: Tong Li --- applications/ColossalChat/coati/distributed/grpo_consumer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 70e2201fefb8..eaf3521b6381 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -254,7 +254,7 @@ def step(self, step_idx: int, pbar: Any, **kwargs) -> Optional[float]: true_indices = torch.nonzero(effective_prompts_mask) # Make sure the indices are not empty. 
if true_indices.numel() > 0: - true_indices = true_indices.squeeze() + true_indices = true_indices.squeeze(-1) if excessive_prompts_per_rank <= len(true_indices): excessive_prompts_idx = true_indices[-excessive_prompts_per_rank:] else: From 37663386bc5aec28e0aedee27ec65be0c9affff0 Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Tue, 20 May 2025 18:14:05 +0800 Subject: [PATCH 070/100] fix metric calculation --- .../coati/distributed/consumer.py | 85 ++++++++++++++++--- .../coati/distributed/grpo_consumer.py | 50 ++++++----- .../ColossalChat/coati/distributed/launch.py | 7 +- applications/ColossalChat/rl_example.py | 53 +++++++++--- 4 files changed, 147 insertions(+), 48 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index 8a2221b92379..026056783dd4 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -120,24 +120,85 @@ def loop(self) -> None: raw_batch = unbind_batch( ray_broadcast_tensor_dict(None, src=0, device=self.device, group_name=f"sync_data_{r}") ) - processed_batch = [ - self.prompt_level_filtering(self.calculate_group_reward(group)) for group in raw_batch - ] - filtered_batch = [t for t in processed_batch if t is not None] + recv_effective_count = 0 + # calculate group reward et al. filtering. As only the filtered group will be used for training (which is incomplete), + # we need to calculate the metrics before filtering here for logging + for group in raw_batch: + group_with_reward = self.calculate_group_reward(group) + group_reward_mean = group_with_reward["reward"].mean().cpu().item() + group_format_acc_mean = group_with_reward["format_acc"].mean().cpu().item() + group_ans_acc_mean = group_with_reward["ans_acc"].mean().cpu().item() + group_response_len = ( + ( + group_with_reward["response_idx"][:, 1] + - group_with_reward["response_idx"][:, 0] + + 1 + ) + .type(torch.float32) + .mean() + .cpu() + .item() + ) + filtered_group = self.prompt_level_filtering(group_with_reward) + recv_effective_count += 1 if filtered_group is not None else 0 + self.buffer.append( + [ + filtered_group, + group_reward_mean, + group_format_acc_mean, + group_ans_acc_mean, + group_response_len, + ] + ) if self.filter_range is not None: print( - f"[T{dist.get_rank()}] Filter recv data: {len(processed_batch)} -> {len(filtered_batch)}" + f"[T{dist.get_rank()}] Filter recv data: {len(raw_batch)} -> {recv_effective_count}" ) + # mapping the effective group to the raw group for indexing + effective_group_to_raw_group_mapping = {} + for buffer_idx in range(len(self.buffer)): + if self.buffer[buffer_idx][0] is not None: + effective_group_to_raw_group_mapping[len(effective_group_to_raw_group_mapping)] = ( + buffer_idx + ) - self.buffer.extend(filtered_batch) - while len(self.buffer) >= self.dp_size * self.minibatch_size: - batches = self.buffer[ - self.dp_rank * self.minibatch_size : (self.dp_rank + 1) * self.minibatch_size + while len(effective_group_to_raw_group_mapping) >= self.dp_size * self.minibatch_size: + # on each dp_rank, we use minibatch_size effective samples to form a batch + batches = [ + self.buffer[effective_group_to_raw_group_mapping[i]] + for i in range( + self.dp_rank * self.minibatch_size, (self.dp_rank + 1) * self.minibatch_size + ) ] - batch = bind_batch(batches) + # every dp_rank will receive a complete mini-batch, no need to sync within step() later + # each mini-batch use the first self.dp_size * minibatch_size 
effective samples + raw_mini_batches = self.buffer[ + : effective_group_to_raw_group_mapping[self.dp_size * self.minibatch_size - 1] + 1 + ] # include the last effective sample + raw_mini_batches_metric_dict = { + "raw_train_mini_batch_reward": [t[1] for t in raw_mini_batches], + "raw_train_mini_batch_format_acc": [t[2] for t in raw_mini_batches], + "raw_train_mini_batch_ans_acc": [t[3] for t in raw_mini_batches], + "raw_train_mini_batch_response_len": [t[4] for t in raw_mini_batches], + } + batch = bind_batch([t[0] for t in batches]) batch = post_recv(batch) - loss = self.step(i, pbar, **batch) - self.buffer = self.buffer[self.dp_size * self.minibatch_size :] + loss = self.step(i, pbar, **batch, **raw_mini_batches_metric_dict) + self.buffer = self.buffer[ + effective_group_to_raw_group_mapping[self.dp_size * self.minibatch_size - 1] + 1 : + ] + # recalculate the effective group to raw group mapping + effective_group_to_raw_group_mapping_size_before = len(effective_group_to_raw_group_mapping) + effective_group_to_raw_group_mapping = {} + for buffer_idx in range(len(self.buffer)): + if self.buffer[buffer_idx][0] is not None: + effective_group_to_raw_group_mapping[len(effective_group_to_raw_group_mapping)] = ( + buffer_idx + ) + assert ( + len(effective_group_to_raw_group_mapping) + == effective_group_to_raw_group_mapping_size_before - self.dp_size * self.minibatch_size + ) if loss is not None: pbar.set_postfix({"loss": loss}) i += 1 diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index bc2bd45d1767..3690239773af 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -72,12 +72,12 @@ def __init__( self.policy_model.gradient_checkpointing_enable() self.optimizer = HybridAdam(self.policy_model.parameters(), lr=grpo_config.get("lr", 1e-6)) self.accum_loss = torch.zeros(1, device=self.device) - self.accum_reward = torch.zeros(1, device=self.device) self.accum_kl = torch.zeros(1, device=self.device) - self.accum_format_acc = torch.zeros(1, device=self.device) - self.accum_ans_acc = torch.zeros(1, device=self.device) self.accum_advantages = torch.zeros(1, device=self.device) - self.accum_response_length = torch.zeros(1, device=self.device) + self.raw_train_batch_reward = [] + self.raw_train_batch_format_acc = [] + self.raw_train_batch_ans_acc = [] + self.raw_train_batch_response_len = [] self.accum_count = 0 self.generate_config = generate_config self.grpo_config = grpo_config @@ -186,7 +186,11 @@ def step(self, step_idx: int, pbar: Any, **kwargs) -> Optional[float]: [minibatch_size, num_of_generation, prompt_length + response_length] --- ............. 
""" # Reshape to [minibatch_size x num_of_generation, prompt_length + response_length] - data = {k: v.view(-1, v.size(-1)) for k, v in kwargs.items()} + data = {k: v.view(-1, v.size(-1)) for k, v in kwargs.items() if "raw_train_mini_batch_" not in k} + self.raw_train_batch_reward.extend(kwargs["raw_train_mini_batch_reward"]) + self.raw_train_batch_format_acc.extend(kwargs["raw_train_mini_batch_format_acc"]) + self.raw_train_batch_ans_acc.extend(kwargs["raw_train_mini_batch_ans_acc"]) + self.raw_train_batch_response_len.extend(kwargs["raw_train_mini_batch_response_len"]) action_mask = data["action_mask"] num_action = action_mask.shape[1] old_action_log_probs = data["action_log_probs"] @@ -430,11 +434,7 @@ def _criterion(outputs, inputs): self.accum_loss.add_(sum(mean_loss) / len(mean_loss)) if self.policy_loss_fn.beta > 0: self.accum_kl.add_(sum(mean_kl) / len(mean_kl)) - self.accum_reward.add_(reward.data) - self.accum_format_acc.add_(format_acc.data) - self.accum_ans_acc.add_(ans_acc.data) self.accum_advantages.add_(advantages.data) - self.accum_response_length.add_(response_length.data) self.accum_count += 1 if need_update: self.optimizer.step() @@ -452,21 +452,33 @@ def _criterion(outputs, inputs): if (not self.plugin.pp_size > 1 and self.rank == 0) or ( self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() and self.tp_rank == 0 ): + raw_batch_reward_mean = sum(self.raw_train_batch_reward) / len(self.raw_train_batch_reward) + raw_batch_format_acc_mean = sum(self.raw_train_batch_format_acc) / len( + self.raw_train_batch_format_acc + ) + raw_batch_ans_acc_mean = sum(self.raw_train_batch_ans_acc) / len(self.raw_train_batch_ans_acc) + raw_batch_response_len_mean = sum(self.raw_train_batch_response_len) / len( + self.raw_train_batch_response_len + ) + self.raw_train_batch_reward = [] + self.raw_train_batch_format_acc = [] + self.raw_train_batch_ans_acc = [] + self.raw_train_batch_response_len = [] to_log_msg = [ f"Loss: {self.accum_loss.item() / self.accum_count:.4f}", - f"Reward: {self.accum_reward.item() / self.accum_count:.4f}", - f"format Reward: {self.accum_format_acc.item() / self.accum_count:.4f}", - f"Acc Reward: {self.accum_ans_acc.item() / self.accum_count:.4f}", + f"Reward: {raw_batch_reward_mean:.4f}", + f"format Reward: {raw_batch_format_acc_mean:.4f}", + f"Acc Reward: {raw_batch_ans_acc_mean:.4f}", f"Advantages: {self.accum_advantages.item() / self.accum_count:.4f}", - f"Response Length: {self.accum_response_length.item() / self.accum_count:.4f}", + f"Response Length: {raw_batch_response_len_mean:.4f}", f"Sample_utilization: {sample_utilization:.4f}", ] + ([f"KL: {self.accum_kl.item() / self.accum_count:.4f}"] if self.policy_loss_fn.beta > 0 else []) print("\n".join(to_log_msg)) metrics = { - "metrics/reward": self.accum_reward.item() / self.accum_count, - "metrics/format_acc": self.accum_format_acc.item() / self.accum_count, - "metrics/ans_acc": self.accum_ans_acc.item() / self.accum_count, - "metrics/response_length": self.accum_response_length.item() / self.accum_count, + "metrics/reward": raw_batch_reward_mean, + "metrics/format_acc": raw_batch_format_acc_mean, + "metrics/ans_acc": raw_batch_ans_acc_mean, + "metrics/response_length": raw_batch_response_len_mean, "train/loss": self.accum_loss.item() / self.accum_count, "train/advantages": self.accum_advantages.item() / self.accum_count, "train/learning_rate": self.lr_scheduler.get_last_lr()[0], @@ -478,12 +490,8 @@ def _criterion(outputs, inputs): if self.wandb_run is not None: 
self.wandb_run.log(metrics) self.accum_loss.zero_() - self.accum_reward.zero_() - self.accum_ans_acc.zero_() - self.accum_format_acc.zero_() self.accum_kl.zero_() self.accum_advantages.zero_() - self.accum_response_length.zero_() self.accum_count = 0 return loss_scalar else: diff --git a/applications/ColossalChat/coati/distributed/launch.py b/applications/ColossalChat/coati/distributed/launch.py index 6eeb5d379b62..220a47eb8cbc 100644 --- a/applications/ColossalChat/coati/distributed/launch.py +++ b/applications/ColossalChat/coati/distributed/launch.py @@ -1,4 +1,5 @@ import copy +import os import uuid from typing import Any, Dict, Optional @@ -56,7 +57,7 @@ def launch_distributed( eval_save_dir: Optional[str] = None, eval_generation_config: Optional[Dict[str, Any]] = None, log_rollout_interval: int = 20, - rollout_log_file: str = "./rollout_log.jsonl", + rollout_save_dir: str = "./rollout", ): if core_algo not in ALGO_MAP: raise NotImplementedError(f"{core_algo} is not supported yet.") @@ -74,6 +75,10 @@ def launch_distributed( run_name = f"{inference_backend}_bs_{train_batch_size * train_dp_size}_temp_{generate_config['temperature']:.01f}_top_p_{generate_config['top_p']:.02f}" wandb_group_name = str(uuid.uuid4()) + rollout_log_file = os.path.join( + rollout_save_dir, + f"{project_name}_run_{wandb_group_name}.jsonl", + ) procs = [] for i in range(num_producers): diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index a97c2b210313..53f166d66d16 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -121,6 +121,34 @@ parser.add_argument( "-rsd", "--rollout-save-dir", type=str, default="./rollouts", help="Directory for saving rollout loggings." ) + parser.add_argument( + "-tp", + "--tensor-parallel-size", + type=int, + default=1, + help="Tensor parallel size for the inference backend. Please check the generation arguments documentation for your backend.", + ) + parser.add_argument( + "-pp", + "--pipeline-parallel-size", + type=int, + default=1, + help="Pipeline parallel size for the inference backend. Please check the generation arguments documentation for your backend.", + ) + parser.add_argument( + "-zero", + "--zero-stage", + type=int, + default=0, + help="Zero stage for the inference backend. Please check the generation arguments documentation for your backend.", + ) + parser.add_argument( + "-ptp", + "--produce-tensor-parallel-size", + type=int, + default=1, + help="Tensor parallel size for the producer. 
Please check the generation arguments documentation for your backend.", + ) args = parser.parse_args() if args.train_minibatch_size is None: @@ -178,7 +206,7 @@ enforce_eager=True, enable_chunked_prefill=True, max_model_len=args.max_new_tokens + args.max_prompt_tokens, - tensor_parallel_size=1, + tensor_parallel_size=args.produce_tensor_parallel_size, ) ) generate_config.update( @@ -228,7 +256,7 @@ launch_distributed( num_producers=args.num_inferencer, - num_proc_per_producer=inference_model_config.get("tensor_parallel_size", 1), + num_proc_per_producer=inference_model_config.get("tensor_parallel_size", args.produce_tensor_parallel_size), num_consumer_procs=args.num_trainers, num_episodes=args.num_episodes, inference_batch_size=args.inference_batch_size, @@ -247,17 +275,14 @@ train_model_config=train_model_config, grpo_config=grpo_config, plugin_config={ - "zero_stage": 2, - }, # for zero - # plugin_config={ - # "tp_size": 2, - # "pp_size": 2, - # "microbatch_size": max( - # 1, args.train_microbatch_size // 2 - # ), # microbatch size should be set to train_microbatch_size // pp_size - # "zero_stage": 0, - # "max_norm": 1.0, - # }, # for pp, tp + "tp_size": args.tensor_parallel_size, + "pp_size": args.pipeline_parallel_size, + "microbatch_size": max( + 1, args.train_microbatch_size // args.pipeline_parallel_size + ), # microbatch size should be set to train_microbatch_size // pp_size + "zero_stage": args.zero_stage, + "max_norm": 1.0, + }, # for pp, tp inference_backend=args.backend, master_addr="localhost", master_port=args.master_port, @@ -273,5 +298,5 @@ eval_save_dir=os.path.join(args.eval_save_dir, args.project.replace(" ", "_")), eval_generation_config=eval_generation_config, log_rollout_interval=20, - rollout_log_file=os.path.join(args.rollout_save_dir, args.project.replace(" ", "_") + ".jsonl"), + rollout_save_dir=args.rollout_save_dir, ) From 78a06f5ce37832bcc635ca7aa1c623b199a0b807 Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Wed, 21 May 2025 10:51:32 +0800 Subject: [PATCH 071/100] fix missing tags parameter --- .../coati/distributed/grpo_consumer.py | 11 +--------- .../ColossalChat/coati/distributed/launch.py | 1 + .../coati/distributed/producer.py | 5 +++++ applications/ColossalChat/rl_example.py | 20 +++++++++++++++++++ 4 files changed, 27 insertions(+), 10 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 3690239773af..152689e06f04 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -120,16 +120,7 @@ def __init__( "either max_tokens (vllm) or max_new_tokens (transformers) must be set in generate_config." ) # Initialize verifiable reward. 
- response_format_tags = ( - { - "think_start": {"text": "", "num_occur": 1}, - "think_end": {"text": "", "num_occur": 1}, - "answer_start": {"text": "", "num_occur": 1}, - "answer_end": {"text": "", "num_occur": 1}, - } - if grpo_config.get("reward_fn_type") == "think_answer_tags" - else None - ) + response_format_tags = grpo_config.get("response_format_tags", None) reward_model_kwargs = { k: v for k, v in grpo_config.items() diff --git a/applications/ColossalChat/coati/distributed/launch.py b/applications/ColossalChat/coati/distributed/launch.py index c9e8d2ab253d..764325cdd78c 100644 --- a/applications/ColossalChat/coati/distributed/launch.py +++ b/applications/ColossalChat/coati/distributed/launch.py @@ -100,6 +100,7 @@ def launch_distributed( eval_dataset_config=eval_dataset_config, eval_interval=eval_interval, evaluation_function_type=grpo_config["reward_fn_type"], + response_format_tags=grpo_config["response_format_tags"], eval_save_dir=eval_save_dir, eval_generation_config=eval_generation_config, project_name=project_name, diff --git a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py index 75879278caa8..916ae33268cd 100644 --- a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -46,6 +46,7 @@ def __init__( eval_dataset_config=None, eval_interval=-1, # disable evaluation evaluation_function_type="think_answer_tags", + response_format_tags=None, eval_save_dir: str = "./eval", project_name: str = None, run_name: str = None, @@ -148,6 +149,7 @@ def __init__( self.evaluation_function = boxed_math_reward_fn else: raise ValueError(f"Unknown evaluation function type {evaluation_function_type}") + self.response_format_tags = response_format_tags else: raise ValueError("eval_dataset_config is not defined") self.device = get_current_device() @@ -217,6 +219,7 @@ def loop(self) -> None: eval_outputs["response_idx"][m][n], tokenizer=self.tokenizer, eval_mode=True, + tags=self.response_format_tags, ) for m in range(eval_outputs["input_ids"].size(0)) for n in range(eval_outputs["input_ids"].size(1)) @@ -324,6 +327,7 @@ def __init__( eval_dataset_config=None, eval_interval=-1, # disable evaluation evaluation_function_type="think_answer_tags", + response_format_tags=None, eval_save_dir: str = "./eval", eval_generation_config={}, project_name: str = None, @@ -349,6 +353,7 @@ def __init__( eval_dataset_config=eval_dataset_config, eval_interval=eval_interval, evaluation_function_type=evaluation_function_type, + response_format_tags=response_format_tags, eval_save_dir=eval_save_dir, project_name=project_name, run_name=run_name, diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index 53f166d66d16..e1025442ceee 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -231,6 +231,16 @@ "reward_fn_type": args.reward_type, "max_length": args.max_new_tokens + args.max_prompt_tokens, "max_new_tokens": args.max_new_tokens, + "response_format_tags": ( + { + "think_start": {"text": "", "num_occur": 1}, + "think_end": {"text": "", "num_occur": 1}, + "answer_start": {"text": "", "num_occur": 1}, + "answer_end": {"text": "", "num_occur": 1}, + } + if args.reward_type == "think_answer_tags" + else None + ), } elif args.algo == "DAPO": # DAPO variant settings @@ -250,6 +260,16 @@ "cache_length": min(1024, int(args.max_new_tokens / 4)), "filter_truncated_response": True, "reward_fn_type": 
args.reward_type, + "response_format_tags": ( + { + "think_start": {"text": "", "num_occur": 1}, + "think_end": {"text": "", "num_occur": 1}, + "answer_start": {"text": "", "num_occur": 1}, + "answer_end": {"text": "", "num_occur": 1}, + } + if args.reward_type == "think_answer_tags" + else None + ), } else: raise ValueError(f"Unsupported algorithm: {args.algo}") From de2ad3b206a6814dbeae803b4c8429e048535f56 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Thu, 22 May 2025 11:52:41 +0800 Subject: [PATCH 072/100] fix default eval setting (#6321) Co-authored-by: Tong Li --- .../ColossalChat/coati/distributed/launch.py | 1 + .../ColossalChat/coati/distributed/producer.py | 2 +- applications/ColossalChat/rl_example.py | 14 +++++++++----- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/launch.py b/applications/ColossalChat/coati/distributed/launch.py index ef81bcbdd65d..c9e8d2ab253d 100644 --- a/applications/ColossalChat/coati/distributed/launch.py +++ b/applications/ColossalChat/coati/distributed/launch.py @@ -1,4 +1,5 @@ import copy +import os import uuid from typing import Any, Dict, Optional diff --git a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py index 75879278caa8..78964afa18a6 100644 --- a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -149,7 +149,7 @@ def __init__( else: raise ValueError(f"Unknown evaluation function type {evaluation_function_type}") else: - raise ValueError("eval_dataset_config is not defined") + print("No eval dataset provided, skip eval") self.device = get_current_device() # init backend diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index bfa0ab7d05eb..5aec7f5a681a 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -14,7 +14,7 @@ "-ed", "--eval-dataset", type=str, - default='{"eval task name":"data_eval.jsonl"}', + default=None, help="Evaluation dataset for each task, please use json format to specify the dataset for each task. \ For example: {'task1':'data_eval_task1.jsonl', 'task2':'data_eval_task2.jsonl'}, the jsonl file should be in the same format as the training dataset. 
\ The key is the task name, and the value is the path to the jsonl file", @@ -265,10 +265,14 @@ project_name=args.project, save_interval=args.save_interval, save_dir=os.path.join(args.save_dir, args.project.replace(" ", "_")), - eval_dataset_config={ - k: {"path": v, "max_length": args.max_prompt_tokens, "system_prompt": args.system_prompt} - for k, v in json.loads(args.eval_dataset).items() - }, + eval_dataset_config=( + { + k: {"path": v, "max_length": args.max_prompt_tokens, "system_prompt": args.system_prompt} + for k, v in json.loads(args.eval_dataset).items() + } + if args.eval_dataset + else None + ), eval_interval=args.eval_interval, eval_save_dir=os.path.join(args.eval_save_dir, args.project.replace(" ", "_")), eval_generation_config=eval_generation_config, From 4c3656870ae25c959e310c559958c4a21d3f75cd Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Wed, 28 May 2025 17:34:11 +0800 Subject: [PATCH 073/100] address conversation --- .../coati/distributed/consumer.py | 11 ++++---- .../coati/distributed/grpo_consumer.py | 28 +++++++++---------- 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index 026056783dd4..6f8f1b497f62 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -117,14 +117,12 @@ def loop(self) -> None: # receive data from producers for r in range(self.num_producers): print(f"[T{dist.get_rank()}] Recv data episode {episode} step {step} from {r}") - raw_batch = unbind_batch( - ray_broadcast_tensor_dict(None, src=0, device=self.device, group_name=f"sync_data_{r}") - ) + raw_batch = ray_broadcast_tensor_dict(None, src=0, device=self.device, group_name=f"sync_data_{r}") recv_effective_count = 0 # calculate group reward et al. filtering. As only the filtered group will be used for training (which is incomplete), # we need to calculate the metrics before filtering here for logging - for group in raw_batch: - group_with_reward = self.calculate_group_reward(group) + raw_batch_with_reward = unbind_batch(self.calculate_reward(raw_batch)) + for group_with_reward in raw_batch_with_reward: group_reward_mean = group_with_reward["reward"].mean().cpu().item() group_format_acc_mean = group_with_reward["format_acc"].mean().cpu().item() group_ans_acc_mean = group_with_reward["ans_acc"].mean().cpu().item() @@ -139,7 +137,8 @@ def loop(self) -> None: .cpu() .item() ) - filtered_group = self.prompt_level_filtering(group_with_reward) + if self.grpo_config.get("dynamic_batching", True): + filtered_group = self.prompt_level_filtering(group_with_reward) recv_effective_count += 1 if filtered_group is not None else 0 self.buffer.append( [ diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 152689e06f04..891a6f8429a7 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -218,8 +218,6 @@ def step(self, step_idx: int, pbar: Any, **kwargs) -> Optional[float]: if self.grpo_config.get("dynamic_batching", True): need_update = self.effective_prompt_count >= self.batch_size * self.dp_size - excessive_prompts = self.effective_prompt_count - self.batch_size * self.dp_size - assert excessive_prompts <= 0, "Debug: Excessive prompts should always be less than 0. Bug!!!!" 
else: # If dynamic batching is disabled, we need to use all samples for training. need_update = (step_idx + 1) % self.num_microbatches == 0 @@ -488,7 +486,7 @@ def _criterion(outputs, inputs): else: return None - def calculate_group_reward(self, rollout_group: Dict[str, Any]) -> Dict[str, Any]: + def calculate_reward(self, rollout: Dict[str, Any]) -> Dict[str, Any]: """ Calculate the group reward for the given rollout group. @@ -507,20 +505,20 @@ def calculate_group_reward(self, rollout_group: Dict[str, Any]) -> Dict[str, Any Returns: Dict[str, Any]: The new group data with calculated reward. """ - reward_group = self.reward_model( - rollout_group["input_ids"], - gt_answer=rollout_group["gt_answer"], - response_idx=rollout_group["response_idx"], + reward_model_output = self.reward_model( + rollout["input_ids"], + gt_answer=rollout["gt_answer"], + response_idx=rollout["response_idx"], ) # [num_of_generation] - reward = torch.tensor([value[0] for value in reward_group]).to(rollout_group["input_ids"].device) - format_acc = torch.tensor([value[1] for value in reward_group]).to(rollout_group["input_ids"].device) - ans_acc = torch.tensor([value[2] for value in reward_group]).to(rollout_group["input_ids"].device) - - rollout_group["reward"] = reward.view((-1, 1)) - rollout_group["format_acc"] = format_acc.view((-1, 1)) - rollout_group["ans_acc"] = ans_acc.view((-1, 1)) - return rollout_group + reward = torch.tensor([value[0] for value in reward_model_output]).to(rollout["input_ids"].device) + format_acc = torch.tensor([value[1] for value in reward_model_output]).to(rollout["input_ids"].device) + ans_acc = torch.tensor([value[2] for value in reward_model_output]).to(rollout["input_ids"].device) + + rollout["reward"] = reward.view((-1, 1)) + rollout["format_acc"] = format_acc.view((-1, 1)) + rollout["ans_acc"] = ans_acc.view((-1, 1)) + return rollout def prompt_level_filtering(self, rollout_group: Dict[str, Any]) -> Dict[str, Any]: """ From c8b368c294a76fd06b45b5f683cde466e03f987c Mon Sep 17 00:00:00 2001 From: Tong Li Date: Wed, 28 May 2025 19:18:09 +0800 Subject: [PATCH 074/100] add overlength sample count (#6332) Co-authored-by: Tong Li --- .../coati/distributed/grpo_consumer.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index eaf3521b6381..1666ac582de6 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -85,6 +85,8 @@ def __init__( self.effective_sample_count = 0 self.effective_prompt_count = 0 self.total_sample_count = 0 + self.overlength_samples = 0 + self.total_overlength_samples = 0 self.project_name = project_name self.run_name = run_name self.wandb_group_name = wandb_group_name @@ -227,10 +229,18 @@ def step(self, step_idx: int, pbar: Any, **kwargs) -> Optional[float]: # filter out overlength samples if self.filter_truncated_response and action_mask.size(1) == self.max_length: + old_loss_mask = loss_mask.clone() loss_mask = torch.logical_and( loss_mask, action_mask[:, -1] == False, ) + + self.overlength_samples = (old_loss_mask & ~loss_mask).sum().item() + self.overlength_samples = all_reduce_sum( + torch.tensor(self.overlength_samples, device=loss_mask.device), self.plugin + ) + self.total_overlength_samples += self.overlength_samples.item() + prompt_level_mask = loss_mask.view(self.minibatch_size, self.num_generations) # [minibatch_size] -> calculate 
the number of effective prompts @@ -484,9 +494,11 @@ def _criterion(outputs, inputs): self.optimizer.zero_grad() self.global_step += 1 sample_utilization = self.effective_sample_count / self.total_sample_count + overlength_samples_percentage = self.total_overlength_samples / self.total_sample_count self.effective_prompt_count = 0 self.effective_sample_count = 0 self.total_sample_count = 0 + self.total_overlength_samples = 0 loss_scalar = self.accum_loss.item() if not self.plugin.pp_size > 1 or ( self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() and self.tp_rank == 0 @@ -502,6 +514,7 @@ def _criterion(outputs, inputs): f"Advantages: {self.accum_advantages.item() / self.accum_count:.4f}", f"Response Length: {self.accum_response_length.item() / self.accum_count:.4f}", f"Sample_utilization: {sample_utilization:.4f}", + f"Percentage of overlength samples: {overlength_samples_percentage:.4f}", ] + ([f"KL: {self.accum_kl.item() / self.accum_count:.4f}"] if self.policy_loss_fn.beta > 0 else []) print("\n".join(to_log_msg)) metrics = { @@ -513,6 +526,7 @@ def _criterion(outputs, inputs): "train/advantages": self.accum_advantages.item() / self.accum_count, "train/learning_rate": self.lr_scheduler.get_last_lr()[0], "train/sample_utilization": sample_utilization, + "train/percentage_overlength_samples": overlength_samples_percentage, "rollout/temperature": data["temperature"].cpu().numpy()[0][0], } if self.policy_loss_fn.beta > 0: From ee939d9aa506b4c5266fd2136e23bc6503b1d5b7 Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Thu, 29 May 2025 10:25:59 +0800 Subject: [PATCH 075/100] address conversation --- .../coati/distributed/consumer.py | 67 ++++++++++--------- .../coati/distributed/grpo_consumer.py | 34 +--------- .../coati/distributed/producer.py | 3 +- 3 files changed, 36 insertions(+), 68 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index 6f8f1b497f62..593e0f4ec4f4 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -118,48 +118,49 @@ def loop(self) -> None: for r in range(self.num_producers): print(f"[T{dist.get_rank()}] Recv data episode {episode} step {step} from {r}") raw_batch = ray_broadcast_tensor_dict(None, src=0, device=self.device, group_name=f"sync_data_{r}") - recv_effective_count = 0 # calculate group reward et al. filtering. As only the filtered group will be used for training (which is incomplete), # we need to calculate the metrics before filtering here for logging - raw_batch_with_reward = unbind_batch(self.calculate_reward(raw_batch)) - for group_with_reward in raw_batch_with_reward: - group_reward_mean = group_with_reward["reward"].mean().cpu().item() - group_format_acc_mean = group_with_reward["format_acc"].mean().cpu().item() - group_ans_acc_mean = group_with_reward["ans_acc"].mean().cpu().item() - group_response_len = ( - ( - group_with_reward["response_idx"][:, 1] - - group_with_reward["response_idx"][:, 0] - + 1 - ) - .type(torch.float32) - .mean() - .cpu() - .item() + # [batch_size, num_generations, ...] -> [batch_size * num_generations, ...] 
+ raw_batch_with_reward = self.calculate_reward({k:v.view(-1, v.size(-1)) if k!='temperature' else v for k, v in raw_batch.items()}) + raw_batch_with_reward = {k: v.view(-1, self.num_generations, v.size(-1)) if k!='temperature' else v for k, v in raw_batch_with_reward.items()} + # [batch_size, num_generations] -> [batch_size] + group_reward_mean = raw_batch_with_reward["reward"][:,:,0].mean(dim=-1) + group_format_acc_mean = raw_batch_with_reward["format_acc"][:,:,0].mean(dim=-1) + group_ans_acc_mean = raw_batch_with_reward["ans_acc"][:,:,0].mean(dim=-1) + group_response_len = ( + (raw_batch_with_reward["response_idx"][:, :, 1] - raw_batch_with_reward["response_idx"][:, :, 0] + 1) + .type(torch.float32) + .mean(dim=-1) + ) + effective_group_mask = None + if self.filter_range is not None and self.grpo_config.get("dynamic_batching", True): + # filter the group based on the reward and accuracy + effective_group_mask = torch.logical_and( + group_ans_acc_mean > self.filter_range[0], group_ans_acc_mean < self.filter_range[1] ) - if self.grpo_config.get("dynamic_batching", True): - filtered_group = self.prompt_level_filtering(group_with_reward) - recv_effective_count += 1 if filtered_group is not None else 0 + raw_batch_with_reward = unbind_batch(raw_batch_with_reward) # List[Dict[str, torch.Tensor]] + for group_idx, group_with_reward in enumerate(raw_batch_with_reward): self.buffer.append( [ - filtered_group, - group_reward_mean, - group_format_acc_mean, - group_ans_acc_mean, - group_response_len, + group_with_reward if effective_group_mask is None or effective_group_mask[group_idx] else None, + group_reward_mean[group_idx], + group_format_acc_mean[group_idx], + group_ans_acc_mean[group_idx], + group_response_len[group_idx], ] ) - if self.filter_range is not None: + if effective_group_mask is not None: print( - f"[T{dist.get_rank()}] Filter recv data: {len(raw_batch)} -> {recv_effective_count}" + f"[T{dist.get_rank()}] Filter recv data: {len(raw_batch)} -> {torch.sum(effective_group_mask).cpu().item()} effective groups" ) - # mapping the effective group to the raw group for indexing - effective_group_to_raw_group_mapping = {} - for buffer_idx in range(len(self.buffer)): - if self.buffer[buffer_idx][0] is not None: - effective_group_to_raw_group_mapping[len(effective_group_to_raw_group_mapping)] = ( - buffer_idx - ) + # mapping the effective group to the raw group for indexing + effective_group_to_raw_group_mapping = {} + for buffer_idx in range(len(self.buffer)): + if self.buffer[buffer_idx][0] is not None: + effective_group_to_raw_group_mapping[len(effective_group_to_raw_group_mapping)] = ( + buffer_idx + ) + pbar.set_postfix({"Collect Effective Prompt": f"{len(effective_group_to_raw_group_mapping)}/{self.dp_size * self.minibatch_size}"}) while len(effective_group_to_raw_group_mapping) >= self.dp_size * self.minibatch_size: # on each dp_rank, we use minibatch_size effective samples to form a batch diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 891a6f8429a7..50702683fb96 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -84,7 +84,6 @@ def __init__( self.project_name = project_name self.effective_sample_count = 0 self.effective_prompt_count = 0 - self.total_sample_count = 0 self.project_name = project_name self.run_name = run_name self.wandb_group_name = wandb_group_name @@ -429,11 +428,9 @@ def 
_criterion(outputs, inputs): self.optimizer.step() self.optimizer.zero_grad() self.global_step += 1 - # no need to run all_reduce_sum on total_sample_count, because all dp ranks recieves a complete inference batch from all producers. - sample_utilization = self.effective_sample_count / self.total_sample_count + sample_utilization = self.effective_sample_count / len(self.raw_train_batch_reward) / self.num_generations self.effective_prompt_count = 0 self.effective_sample_count = 0 - self.total_sample_count = 0 loss_scalar = self.accum_loss.item() if not self.plugin.pp_size > 1 or ( self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() and self.tp_rank == 0 @@ -520,35 +517,6 @@ def calculate_reward(self, rollout: Dict[str, Any]) -> Dict[str, Any]: rollout["ans_acc"] = ans_acc.view((-1, 1)) return rollout - def prompt_level_filtering(self, rollout_group: Dict[str, Any]) -> Dict[str, Any]: - """ - rollout_group: Dict[str, Any] - a group of samples generated by the model from the same prompt - contain the following keys: - "input_ids": torch.Tensor, [num_of_generation, prompt_length + response_length] - "attention_mask": torch.Tensor, [num_of_generation, prompt_length + response_length] - "action_mask": torch.Tensor, [num_of_generation, response_length] - "action_log_probs": torch.Tensor, [num_of_generation, response_length] - "response_idx": int, torch.Tensor, [num_of_generation, 2] - "gt_answer": torch.Tensor, [num_of_generation, 128] - "temperature": torch.Tensor, [] (scalar) - "reward": torch.Tensor, [num_of_generation] - "format_acc": torch.Tensor, [num_of_generation] - "ans_acc": torch.Tensor, [num_of_generation] - """ - self.total_sample_count += rollout_group["input_ids"].size(0) - if self.filter_range is not None: - # filter prompt whoes accuracy is too high or too low (out of range) - group_ans_acc = torch.mean(rollout_group["ans_acc"]) - if group_ans_acc < self.filter_range[0] or group_ans_acc > self.filter_range[1]: - # filter out the prompt - return None - else: - return rollout_group - else: - # no filter - return rollout_group - def state_dict(self): self.policy_model._force_wait_all_gather() model = self.policy_model.unwrap() diff --git a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py index f5bdc68357a1..623ed7ab90d1 100644 --- a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -248,11 +248,10 @@ def loop(self) -> None: self.eval_mode = False self.latest_eval_step = self.consumer_global_step outputs = self.rollout(**batch) - - print(f"[P{self.producer_idx}] Send data {[(k, v.shape) for k, v in outputs.items()]}") outputs["temperature"] = torch.tensor( [self.model.generate_config["temperature"]] * outputs["input_ids"].size(0) ).to(outputs["input_ids"].device) + print(f"[P{self.producer_idx}] Send data {[(k, v.shape) for k, v in outputs.items()]}") outputs = pre_send(outputs) ray_broadcast_tensor_dict( outputs, src=0, device=self.device, group_name=f"sync_data_{self.producer_idx}" From 0d008110e7a4fe2eb5d90cfd8fed6b9e2f294e00 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 29 May 2025 10:16:55 +0000 Subject: [PATCH 076/100] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../coati/distributed/consumer.py | 38 +++++++++++++------ .../coati/distributed/grpo_consumer.py | 10 +++-- 2 files changed, 33 
insertions(+), 15 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index 690a10608c25..22de6911ea29 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -117,20 +117,28 @@ def loop(self) -> None: # receive data from producers for r in range(self.num_producers): print(f"[T{dist.get_rank()}] Recv data episode {episode} step {step} from {r}") - raw_batch = ray_broadcast_tensor_dict(None, src=0, device=self.device, group_name=f"sync_data_{r}") + raw_batch = ray_broadcast_tensor_dict( + None, src=0, device=self.device, group_name=f"sync_data_{r}" + ) # calculate group reward et al. filtering. As only the filtered group will be used for training (which is incomplete), # we need to calculate the metrics before filtering here for logging # [batch_size, num_generations, ...] -> [batch_size * num_generations, ...] - raw_batch_with_reward = self.calculate_reward({k:v.view(-1, v.size(-1)) if k!='temperature' else v for k, v in raw_batch.items()}) - raw_batch_with_reward = {k: v.view(-1, self.num_generations, v.size(-1)) if k!='temperature' else v for k, v in raw_batch_with_reward.items()} + raw_batch_with_reward = self.calculate_reward( + {k: v.view(-1, v.size(-1)) if k != "temperature" else v for k, v in raw_batch.items()} + ) + raw_batch_with_reward = { + k: v.view(-1, self.num_generations, v.size(-1)) if k != "temperature" else v + for k, v in raw_batch_with_reward.items() + } # [batch_size, num_generations] -> [batch_size] - reward = raw_batch_with_reward["reward"][:,:,0] - format_acc = raw_batch_with_reward["format_acc"][:,:,0] - ans_acc = raw_batch_with_reward["ans_acc"][:,:,0] + reward = raw_batch_with_reward["reward"][:, :, 0] + format_acc = raw_batch_with_reward["format_acc"][:, :, 0] + ans_acc = raw_batch_with_reward["ans_acc"][:, :, 0] response_len = ( - (raw_batch_with_reward["response_idx"][:, :, 1] - raw_batch_with_reward["response_idx"][:, :, 0] + 1) - .type(torch.float32) - ) + raw_batch_with_reward["response_idx"][:, :, 1] + - raw_batch_with_reward["response_idx"][:, :, 0] + + 1 + ).type(torch.float32) effective_group_mask = None if self.filter_range is not None and self.grpo_config.get("dynamic_batching", True): # filter the group based on the reward and accuracy @@ -138,11 +146,15 @@ def loop(self) -> None: effective_group_mask = torch.logical_and( group_ans_acc_mean > self.filter_range[0], group_ans_acc_mean < self.filter_range[1] ) - raw_batch_with_reward = unbind_batch(raw_batch_with_reward) # List[Dict[str, torch.Tensor]] + raw_batch_with_reward = unbind_batch(raw_batch_with_reward) # List[Dict[str, torch.Tensor]] for group_idx, group_with_reward in enumerate(raw_batch_with_reward): self.buffer.append( [ - group_with_reward if effective_group_mask is None or effective_group_mask[group_idx] else None, + ( + group_with_reward + if effective_group_mask is None or effective_group_mask[group_idx] + else None + ), reward[group_idx], format_acc[group_idx], ans_acc[group_idx], @@ -160,7 +172,9 @@ def loop(self) -> None: effective_group_to_raw_group_mapping[len(effective_group_to_raw_group_mapping)] = ( buffer_idx ) - print(f"[T{dist.get_rank()}] Collect Effective Prompt: {len(effective_group_to_raw_group_mapping)}/{self.dp_size * self.minibatch_size}") + print( + f"[T{dist.get_rank()}] Collect Effective Prompt: {len(effective_group_to_raw_group_mapping)}/{self.dp_size * self.minibatch_size}" + ) while 
len(effective_group_to_raw_group_mapping) >= self.dp_size * self.minibatch_size: # on each dp_rank, we use minibatch_size effective samples to form a batch diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 451947b441c9..89fae6fc72bd 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -211,10 +211,12 @@ def step(self, step_idx: int, pbar: Any, **kwargs) -> Optional[float]: loss_mask, action_mask[:, -1] == False, ) - if self.filter_range is not None and self.grpo_config.get("dynamic_batching", False)==False: + if self.filter_range is not None and self.grpo_config.get("dynamic_batching", False) == False: # filter out samples with reward outside the range # if dynamic batching is enabled, we filter out out of range groups before training - group_ans_acc_mean = ans_acc.view(-1, self.num_generations).mean(dim=1).repeat_interleave(self.num_generations, dim=-1) + group_ans_acc_mean = ( + ans_acc.view(-1, self.num_generations).mean(dim=1).repeat_interleave(self.num_generations, dim=-1) + ) loss_mask = torch.logical_and( loss_mask, torch.logical_and( @@ -454,7 +456,9 @@ def _criterion(outputs, inputs): raw_batch_ans_acc_mean = torch.cat(self.raw_train_batch_ans_acc, dim=0).mean().cpu().item() raw_batch_response_len = torch.cat(self.raw_train_batch_response_len, dim=0) raw_batch_response_len_mean = raw_batch_response_len.mean().cpu().item() - overlength_samples_ratio = (raw_batch_response_len >= action_mask.size(-1)).to(float).mean().cpu().item() # not an exact figure, but a close estimate + overlength_samples_ratio = ( + (raw_batch_response_len >= action_mask.size(-1)).to(float).mean().cpu().item() + ) # not an exact figure, but a close estimate self.raw_train_batch_reward = [] self.raw_train_batch_format_acc = [] self.raw_train_batch_ans_acc = [] From 96faf5454245d001d06ef538a4fd033d87ca62de Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Thu, 5 Jun 2025 15:41:14 +0800 Subject: [PATCH 077/100] fix typ and parameter description --- applications/ColossalChat/rl_example.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index eed0af362741..b4f565617a26 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -126,25 +126,25 @@ "--tensor-parallel-size", type=int, default=1, - help="Tensor parallel size for the inference backend. Please check the generation arguments documentation for your backend.", + help="Tensor parallel size for the trainer (consumer). Please check the generation arguments documentation for your backend.", ) parser.add_argument( "-pp", "--pipeline-parallel-size", type=int, default=1, - help="Pipeline parallel size for the inference backend. Please check the generation arguments documentation for your backend.", + help="Pipeline parallel size for the trainer (consumer). Please check the generation arguments documentation for your backend.", ) parser.add_argument( "-zero", "--zero-stage", type=int, default=0, - help="Zero stage for the inference backend. Please check the generation arguments documentation for your backend.", + help="Zero stage for the trainer (consumer). 
Please check the generation arguments documentation for your backend.", ) parser.add_argument( "-ptp", - "--produce-tensor-parallel-size", + "--producer-tensor-parallel-size", type=int, default=1, help="Tensor parallel size for the producer. Please check the generation arguments documentation for your backend.", @@ -206,7 +206,7 @@ enforce_eager=True, enable_chunked_prefill=True, max_model_len=args.max_new_tokens + args.max_prompt_tokens, - tensor_parallel_size=args.produce_tensor_parallel_size, + tensor_parallel_size=args.producer_tensor_parallel_size, ) ) generate_config.update( @@ -276,7 +276,7 @@ launch_distributed( num_producers=args.num_inferencer, - num_proc_per_producer=inference_model_config.get("tensor_parallel_size", args.produce_tensor_parallel_size), + num_proc_per_producer=inference_model_config.get("tensor_parallel_size", args.producer_tensor_parallel_size), num_consumer_procs=args.num_trainers, num_episodes=args.num_episodes, inference_batch_size=args.inference_batch_size, From dc3033e68a927dcace02e951deda3b9882c89362 Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Thu, 5 Jun 2025 17:56:42 +0800 Subject: [PATCH 078/100] support code generation tasks --- .../ColossalChat/coati/dataset/loader.py | 35 +- .../coati/distributed/consumer.py | 21 +- .../coati/distributed/grpo_consumer.py | 51 +- .../coati/distributed/inference_backend.py | 18 +- .../ColossalChat/coati/distributed/launch.py | 5 +- .../coati/distributed/producer.py | 91 ++- .../reward/code_reward/testing_util.py | 669 ++++++++++++++++++ .../distributed/reward/code_reward/utils.py | 61 ++ .../coati/distributed/reward/reward_fn.py | 118 ++- .../distributed/reward/verifiable_reward.py | 54 +- applications/ColossalChat/requirements.txt | 3 + applications/ColossalChat/rl_example.py | 28 +- 12 files changed, 1027 insertions(+), 127 deletions(-) create mode 100644 applications/ColossalChat/coati/distributed/reward/code_reward/testing_util.py create mode 100644 applications/ColossalChat/coati/distributed/reward/code_reward/utils.py diff --git a/applications/ColossalChat/coati/dataset/loader.py b/applications/ColossalChat/coati/dataset/loader.py index 43cf78383015..16fd385bad30 100755 --- a/applications/ColossalChat/coati/dataset/loader.py +++ b/applications/ColossalChat/coati/dataset/loader.py @@ -367,9 +367,9 @@ def apply_chat_template_and_mask( } # Format for RL. 
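Two kinds of RL rows pass through this branch after the change: math samples carrying a raw `gt_answer` string and code samples carrying a `test_cases` string, neither of which is tokenized any more. A hypothetical example of each row (only the `messages`/`gt_answer`/`test_cases` keys come from the patch; the concrete values are invented for illustration):

```python
# Math-style row: the ground-truth answer stays a plain string.
math_row = {
    "messages": [{"role": "user", "content": "What is 1 + 1?"}],
    "gt_answer": "2",
}

# Code-style row: test cases travel as a JSON string and are parsed by the reward function.
code_row = {
    "messages": [{"role": "user", "content": "Implement add(a, b)."}],
    "test_cases": '{"fn_name": "add", "inputs": ["1\\n2"], "outputs": ["3"]}',
}
```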
- gt_answer = None - if "messages" in chat and "gt_answer" in chat: - gt_answer = chat["gt_answer"] + if "messages" in chat: + gt_answer = chat.get("gt_answer", None) + test_cases = chat.get("test_cases", None) chat = [chat["messages"]] tokens = [] @@ -402,12 +402,14 @@ def apply_chat_template_and_mask( labels[~torch.tensor(assistant_mask, dtype=torch.bool)] = ignore_idx if gt_answer is not None: - gt_answer = tokenizer.encode( - gt_answer, padding="max_length", truncation=True, max_length=128, return_tensors="pt" - ) - gt_answer = gt_answer.squeeze(1) return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels, "gt_answer": gt_answer} - + elif test_cases is not None: + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + "labels": labels, + "test_cases": test_cases, + } return { "input_ids": input_ids, "attention_mask": attention_mask, @@ -440,3 +442,20 @@ def __getitem__(self, index: int): tokens = apply_chat_template_and_mask(self.tokenizer, message, self.max_length, self.system_prompt) self.tokenized_texts[index] = dict(tokens) return self.tokenized_texts[index] + + +def collate_fn_grpo(batch): + input_ids = [item["input_ids"] for item in batch] + attention_mask = [item["attention_mask"] for item in batch] + labels = [item["labels"] for item in batch] + # Assume input_ids, attention_mask, labels are already of the same length, + # otherwise use pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id) + input_ids = torch.stack(input_ids) + attention_mask = torch.stack(attention_mask) + labels = torch.stack(labels) + ret = {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels} + if "test_cases" in batch[0]: + ret["test_cases"] = [item["test_cases"] for item in batch] + if "gt_answer" in batch[0]: + ret["gt_answer"] = [item["gt_answer"] for item in batch] + return ret diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index 22de6911ea29..3ae4a1796579 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -123,20 +123,17 @@ def loop(self) -> None: # calculate group reward et al. filtering. As only the filtered group will be used for training (which is incomplete), # we need to calculate the metrics before filtering here for logging # [batch_size, num_generations, ...] -> [batch_size * num_generations, ...] 
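The hunk that follows drops the consumer-side reward call (rewards now arrive precomputed from the producer) and keeps only the regrouping plus the dynamic-batching filter, which retains a prompt group only when its mean answer accuracy lies strictly inside `filter_range`. A self-contained sketch of that filter with invented accuracies:

```python
import torch

# Invented values; the real group_ans_acc_mean is derived from the received batch.
filter_range = [0.01, 0.99]
group_ans_acc_mean = torch.tensor([0.00, 0.25, 1.00, 0.75])
effective_group_mask = torch.logical_and(
    group_ans_acc_mean > filter_range[0], group_ans_acc_mean < filter_range[1]
)
print(effective_group_mask)  # tensor([False,  True, False,  True])
# Filtered-out groups are still appended to the buffer as None so raw-batch metrics can be logged.
```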
- raw_batch_with_reward = self.calculate_reward( - {k: v.view(-1, v.size(-1)) if k != "temperature" else v for k, v in raw_batch.items()} - ) - raw_batch_with_reward = { + raw_batch = { k: v.view(-1, self.num_generations, v.size(-1)) if k != "temperature" else v - for k, v in raw_batch_with_reward.items() + for k, v in raw_batch.items() } # [batch_size, num_generations] -> [batch_size] - reward = raw_batch_with_reward["reward"][:, :, 0] - format_acc = raw_batch_with_reward["format_acc"][:, :, 0] - ans_acc = raw_batch_with_reward["ans_acc"][:, :, 0] + reward = raw_batch["reward"][:, :, 0] + format_acc = raw_batch["format_acc"][:, :, 0] + ans_acc = raw_batch["ans_acc"][:, :, 0] response_len = ( - raw_batch_with_reward["response_idx"][:, :, 1] - - raw_batch_with_reward["response_idx"][:, :, 0] + raw_batch["response_idx"][:, :, 1] + - raw_batch["response_idx"][:, :, 0] + 1 ).type(torch.float32) effective_group_mask = None @@ -146,8 +143,8 @@ def loop(self) -> None: effective_group_mask = torch.logical_and( group_ans_acc_mean > self.filter_range[0], group_ans_acc_mean < self.filter_range[1] ) - raw_batch_with_reward = unbind_batch(raw_batch_with_reward) # List[Dict[str, torch.Tensor]] - for group_idx, group_with_reward in enumerate(raw_batch_with_reward): + raw_batch = unbind_batch(raw_batch) # List[Dict[str, torch.Tensor]] + for group_idx, group_with_reward in enumerate(raw_batch): self.buffer.append( [ ( diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 89fae6fc72bd..2e4d8f08cda9 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -6,8 +6,6 @@ import wandb from coati.distributed.consumer import BaseConsumer from coati.distributed.loss import PolicyLoss -from coati.distributed.reward.reward_fn import boxed_math_reward_fn, math_reward_fn -from coati.distributed.reward.verifiable_reward import VerifiableReward from coati.distributed.utils import calc_action_log_probs from coati.trainer.utils import all_reduce_mean, all_reduce_sum from transformers import AutoModelForCausalLM, AutoTokenizer @@ -119,20 +117,7 @@ def __init__( "either max_tokens (vllm) or max_new_tokens (transformers) must be set in generate_config." ) # Initialize verifiable reward. - response_format_tags = grpo_config.get("response_format_tags", None) - reward_model_kwargs = { - k: v - for k, v in grpo_config.items() - if k in ["soft_over_length_punishment", "max_new_tokens", "cache_length"] - } - self.reward_model = VerifiableReward( - reward_fns=[ - math_reward_fn if grpo_config.get("reward_fn_type") == "think_answer_tags" else boxed_math_reward_fn - ], - tokenizer=self.tokenizer, - tags=response_format_tags, - **reward_model_kwargs, - ) + grpo_config.get("response_format_tags", None) self.global_step = 0 self.lr_scheduler = CosineAnnealingWarmupLR( @@ -498,40 +483,6 @@ def _criterion(outputs, inputs): else: return None - def calculate_reward(self, rollout: Dict[str, Any]) -> Dict[str, Any]: - """ - Calculate the group reward for the given rollout group. 
- - Args: - rollout_group (Dict[str, Any]): - a group of samples generated by the model from the same prompt - contain the following keys: - "input_ids": torch.Tensor, [num_of_generation, prompt_length + response_length] - "attention_mask": torch.Tensor, [num_of_generation, prompt_length + response_length] - "action_mask": torch.Tensor, [num_of_generation, response_length] - "action_log_probs": torch.Tensor, [num_of_generation, response_length] - "response_idx": int, torch.Tensor, [num_of_generation, 2] - "gt_answer": torch.Tensor, [num_of_generation, 128] - "temperature": torch.Tensor, [] (scalar) - - Returns: - Dict[str, Any]: The new group data with calculated reward. - """ - reward_model_output = self.reward_model( - rollout["input_ids"], - gt_answer=rollout["gt_answer"], - response_idx=rollout["response_idx"], - ) - # [num_of_generation] - reward = torch.tensor([value[0] for value in reward_model_output]).to(rollout["input_ids"].device) - format_acc = torch.tensor([value[1] for value in reward_model_output]).to(rollout["input_ids"].device) - ans_acc = torch.tensor([value[2] for value in reward_model_output]).to(rollout["input_ids"].device) - - rollout["reward"] = reward.view((-1, 1)) - rollout["format_acc"] = format_acc.view((-1, 1)) - rollout["ans_acc"] = ans_acc.view((-1, 1)) - return rollout - def state_dict(self): self.policy_model._force_wait_all_gather() model = self.policy_model.unwrap() diff --git a/applications/ColossalChat/coati/distributed/inference_backend.py b/applications/ColossalChat/coati/distributed/inference_backend.py index ce2a2f28c12e..34827e4e2cf9 100644 --- a/applications/ColossalChat/coati/distributed/inference_backend.py +++ b/applications/ColossalChat/coati/distributed/inference_backend.py @@ -74,9 +74,8 @@ def generate(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwar micro_batch_size = input_ids.size(0) input_ids = input_ids.to(get_current_device()) attention_mask = attention_mask.to(get_current_device()) - gt_answer = None - if "gt_answer" in kwargs: - gt_answer = kwargs.pop("gt_answer") + gt_answer = kwargs.pop("gt_answer", None) + test_cases = kwargs.pop("test_cases", None) if self.num_generations > 1: input_ids = input_ids.repeat_interleave(self.num_generations, dim=0) attention_mask = attention_mask.repeat_interleave(self.num_generations, dim=0) @@ -116,8 +115,9 @@ def generate(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwar data = {k: v.view(micro_batch_size, self.num_generations, v.size(-1)) for k, v in data.items()} if gt_answer is not None: - # repeat gt_answer for each prompt. - data["gt_answer"] = gt_answer.repeat_interleave(self.num_generations, dim=1) + data["gt_answer"] = gt_answer + if test_cases is not None: + data["test_cases"] = test_cases data = {k: v.to(get_current_device()) for k, v in data.items()} return data @@ -269,11 +269,11 @@ def generate(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwar } data = {k: v.view(micro_batch_size, -1, v.size(-1)) for k, v in data.items()} - - if "gt_answer" in kwargs: - # repeat gt_answer for each prompt. 
- data["gt_answer"] = kwargs["gt_answer"].repeat_interleave(data["input_ids"].size(1), dim=1) data = {k: v.to(get_current_device()) for k, v in data.items()} + if "gt_answer" in kwargs: + data["gt_answer"] = kwargs["gt_answer"] + if "test_cases" in kwargs: + data["test_cases"] = kwargs["test_cases"] return data def load_state_dict(self, state_dict: Dict[str, torch.Tensor]) -> None: diff --git a/applications/ColossalChat/coati/distributed/launch.py b/applications/ColossalChat/coati/distributed/launch.py index 764325cdd78c..41ba8ea55f2c 100644 --- a/applications/ColossalChat/coati/distributed/launch.py +++ b/applications/ColossalChat/coati/distributed/launch.py @@ -37,7 +37,6 @@ def launch_distributed( train_batch_size: int, train_minibatch_size: int, train_dataset_config: Dict[str, Any], - dataloaders_config: Dict[str, Any], inference_model_config: Dict[str, Any], generate_config: Dict[str, Any], train_model_config: Dict[str, Any], @@ -89,7 +88,6 @@ def launch_distributed( num_episodes=num_episodes, batch_size=inference_batch_size, train_dataset_config=train_dataset_config, - dataloaders_config=dataloaders_config, model_config=inference_model_config, generate_config=generate_config, tokenizer_config=tokenizer_config, @@ -99,8 +97,7 @@ def launch_distributed( consumer_plugin_config=plugin_config, eval_dataset_config=eval_dataset_config, eval_interval=eval_interval, - evaluation_function_type=grpo_config["reward_fn_type"], - response_format_tags=grpo_config["response_format_tags"], + grpo_config=grpo_config, eval_save_dir=eval_save_dir, eval_generation_config=eval_generation_config, project_name=project_name, diff --git a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py index 623ed7ab90d1..135c8db0e55f 100644 --- a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -8,8 +8,9 @@ import torch import tqdm import wandb -from coati.dataset.loader import RawConversationDataset -from coati.distributed.reward.reward_fn import boxed_math_reward_fn, math_reward_fn +from coati.dataset.loader import RawConversationDataset, collate_fn_grpo +from coati.distributed.reward.reward_fn import boxed_math_reward_fn, code_reward_fn, math_reward_fn +from coati.distributed.reward.verifiable_reward import VerifiableReward from ray.util.collective import allreduce from ray.util.collective.types import Backend, ReduceOp from torch.utils.data import DataLoader, DistributedSampler @@ -36,7 +37,6 @@ def __init__( num_episodes: int, batch_size: int, train_dataset_config: Dict[str, Any], - dataloaders_config: Dict[str, Any], model_config: Dict[str, Any], generate_config: Dict[str, Any], tokenizer_config: Optional[Dict[str, Any]] = None, @@ -45,8 +45,7 @@ def __init__( consumer_plugin_config: Dict[str, Any] = None, eval_dataset_config=None, eval_interval=-1, # disable evaluation - evaluation_function_type="think_answer_tags", - response_format_tags=None, + grpo_config: Dict[str, Any] = None, eval_save_dir: str = "./eval", project_name: str = None, run_name: str = None, @@ -75,6 +74,13 @@ def __init__( self.eval_mode = False self.log_rollout_interval = log_rollout_interval self.latest_rollout_log_step = -1 + self.grpo_config = grpo_config + reward_model_kwargs = { + k: v + for k, v in grpo_config.items() + if k in ["soft_over_length_punishment", "max_new_tokens", "cache_length"] + } + self.response_format_tags = grpo_config.get("response_format_tags", None) if producer_idx == 0: if 
os.path.exists(rollout_log_file): raise ValueError( @@ -120,6 +126,7 @@ def __init__( ), num_workers=4, drop_last=True, + collate_fn=collate_fn_grpo, ) self.eval_dataset_config = eval_dataset_config @@ -142,17 +149,25 @@ def __init__( drop_last=False, seed=42, ), + collate_fn=collate_fn_grpo, ) - if evaluation_function_type == "think_answer_tags": + if grpo_config["reward_fn_type"] == "think_answer_tags": self.evaluation_function = math_reward_fn - elif evaluation_function_type == "boxed": + elif grpo_config["reward_fn_type"] == "boxed": self.evaluation_function = boxed_math_reward_fn + elif grpo_config["reward_fn_type"] == "code": + self.evaluation_function = code_reward_fn else: - raise ValueError(f"Unknown evaluation function type {evaluation_function_type}") - self.response_format_tags = response_format_tags + raise ValueError(f"Unknown evaluation function type {grpo_config['reward_fn_type']}") else: print("No eval dataset provided, skip eval") self.device = get_current_device() + self.reward_model = VerifiableReward( + reward_fns=[self.evaluation_function], # multiple reward functions can be added here + tokenizer=self.tokenizer, + tags=self.response_format_tags, + **reward_model_kwargs, + ) # init backend if backend in BACKEND_MAP: @@ -215,7 +230,13 @@ def loop(self) -> None: eval_results = eval_results + [ self.evaluation_function( eval_outputs["input_ids"][m][n], - eval_outputs["gt_answer"][m][n], + eval_outputs[ + ( + "test_cases" + if self.grpo_config["reward_fn_type"] == "code" + else "gt_answer" + ) + ][m], eval_outputs["response_idx"][m][n], tokenizer=self.tokenizer, eval_mode=True, @@ -251,8 +272,50 @@ def loop(self) -> None: outputs["temperature"] = torch.tensor( [self.model.generate_config["temperature"]] * outputs["input_ids"].size(0) ).to(outputs["input_ids"].device) + bs, num_gen = outputs["input_ids"].size(0), outputs["input_ids"].size(1) + # breakpoint() + if self.grpo_config["reward_fn_type"] == "code": + test_cases = [] + for prompt_id in range(bs): + test_cases.extend([outputs["test_cases"][prompt_id]] * num_gen) + # assert all([isinstance(test_cases[i], str) for i in range(len(test_cases))]), f"{[type(test_cases[i]) for i in range(len(test_cases))]}, {test_cases}" + reward_model_output = self.reward_model( + outputs["input_ids"].view((-1, outputs["input_ids"].size(-1))), + test_cases=test_cases, + response_idx=outputs["response_idx"].view((-1, 2)), + ) + else: + gt_answer = [] + for prompt_id in range(bs): + gt_answer.extend([outputs["gt_answer"][prompt_id]] * num_gen) + reward_model_output = self.reward_model( + outputs["input_ids"].view((-1, outputs["input_ids"].size(-1))), + gt_answer=gt_answer, + response_idx=outputs["response_idx"], + ) + outputs["reward"] = ( + torch.tensor([value[0] for value in reward_model_output]) + .to(outputs["input_ids"].device) + .view((bs, num_gen, 1)) + ) + outputs["format_acc"] = ( + torch.tensor([value[1] for value in reward_model_output]) + .to(outputs["input_ids"].device) + .view((bs, num_gen, 1)) + ) + outputs["ans_acc"] = ( + torch.tensor([value[2] for value in reward_model_output]) + .to(outputs["input_ids"].device) + .view((bs, num_gen, 1)) + ) + if "gt_answer" in outputs: + outputs.pop("gt_answer") + if "test_cases" in outputs: + outputs.pop("test_cases") + print(f"[P{self.producer_idx}] Send data {[(k, v.shape) for k, v in outputs.items()]}") outputs = pre_send(outputs) + # breakpoint() ray_broadcast_tensor_dict( outputs, src=0, device=self.device, group_name=f"sync_data_{self.producer_idx}" ) @@ -315,7 +378,6 @@ def 
__init__( num_episodes, batch_size, train_dataset_config, - dataloaders_config, model_config, generate_config, tokenizer_config=None, @@ -325,8 +387,7 @@ def __init__( consumer_plugin_config=None, eval_dataset_config=None, eval_interval=-1, # disable evaluation - evaluation_function_type="think_answer_tags", - response_format_tags=None, + grpo_config: Dict[str, Any] = None, eval_save_dir: str = "./eval", eval_generation_config={}, project_name: str = None, @@ -342,7 +403,6 @@ def __init__( num_episodes, batch_size, train_dataset_config, - dataloaders_config, model_config, generate_config, tokenizer_config, @@ -351,8 +411,7 @@ def __init__( consumer_plugin_config, eval_dataset_config=eval_dataset_config, eval_interval=eval_interval, - evaluation_function_type=evaluation_function_type, - response_format_tags=response_format_tags, + grpo_config=grpo_config, eval_save_dir=eval_save_dir, project_name=project_name, run_name=run_name, diff --git a/applications/ColossalChat/coati/distributed/reward/code_reward/testing_util.py b/applications/ColossalChat/coati/distributed/reward/code_reward/testing_util.py new file mode 100644 index 000000000000..c6d6d8fad06e --- /dev/null +++ b/applications/ColossalChat/coati/distributed/reward/code_reward/testing_util.py @@ -0,0 +1,669 @@ +# Code from the verl Project (https://github.com/agentica-project/rllm), +# which itself is adapted from Prime (https://github.com/PRIME-RL/PRIME) +# +# Copyright 2024 ByteDance Group + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import ast +import faulthandler +import json +import platform + +# to run the solution files we're using a timing based approach +import signal +import sys +import traceback + +# used for debugging to time steps +from datetime import datetime +from enum import Enum + +# for capturing the stdout +from io import StringIO + +# used for testing the code that reads from input +from unittest.mock import mock_open, patch + +import numpy as np +from pyext import RuntimeModule + + +def truncatefn(s, length=300): + assert isinstance(s, str) + if len(s) <= length: + return s + + return s[: length // 2] + "...(truncated) ..." 
+ s[-length // 2 :] + + +class CODE_TYPE(Enum): + call_based = 0 + standard_input = 1 + + +# used to capture stdout as a list +# from https://stackoverflow.com/a/16571630/6416660 +# alternative use redirect_stdout() from contextlib +class Capturing(list): + def __enter__(self): + self._stdout = sys.stdout + sys.stdout = self._stringio = StringIO() + # Make closing the StringIO a no-op + self._stringio.close = lambda x: 1 + return self + + def __exit__(self, *args): + self.append(self._stringio.getvalue()) + del self._stringio # free up some memory + sys.stdout = self._stdout + + +def only_int_check(val): + return isinstance(val, int) + + +def string_int_check(val): + return isinstance(val, str) and val.isdigit() + + +def combined_int_check(val): + return only_int_check(val) or string_int_check(val) + + +def clean_traceback(error_traceback): + file_start = error_traceback.find('File ""') + # print(file_start) + error_traceback = "Traceback (most recent call last):\n " + error_traceback[file_start:] + return error_traceback + + +def run_test(in_outs, test=None, debug=False, timeout=15): + """ + if test(generated_code) is not None it'll try to run the code. + otherwise it'll just return an input and output pair. + """ + # Disable functionalities that can make destructive changes to the test. + reliability_guard() + + if debug: + print(f"start = {datetime.now().time()}") + + if in_outs: + if in_outs.get("fn_name") is None: + which_type = CODE_TYPE.standard_input # Standard input + method_name = None + else: + which_type = CODE_TYPE.call_based # Call-based + method_name = in_outs["fn_name"] + + if debug: + print(f"loaded input_output = {datetime.now().time()}") + + if test is None: + raise AssertionError("should not happen: test code is none") + elif test is not None: + results = [] + sol = "from string import *\nfrom re import *\nfrom datetime import *\nfrom collections import *\nfrom heapq import *\nfrom bisect import *\nfrom copy import *\nfrom math import *\nfrom random import *\nfrom statistics import *\nfrom itertools import *\nfrom functools import *\nfrom operator import *\nfrom io import *\nfrom sys import *\nfrom json import *\nfrom builtins import *\nfrom typing import *\nimport string\nimport re\nimport datetime\nimport collections\nimport heapq\nimport bisect\nimport copy\nimport math\nimport random\nimport statistics\nimport itertools\nimport functools\nimport operator\nimport io\nimport sys\nimport json\nsys.setrecursionlimit(6*10**5)\n" # noqa: E501 + if debug: + print(f"loading test code = {datetime.now().time()}") + + if which_type == CODE_TYPE.call_based: + sol += test + if debug: + print(f"sol = {sol}") + signal.alarm(timeout) + try: + tmp_sol = RuntimeModule.from_string("tmp_sol", "", sol) + tmp = tmp_sol if "class Solution" not in test else tmp_sol.Solution() + signal.alarm(0) + except Exception as e: + signal.alarm(0) + error_traceback = traceback.format_exc() + if debug: + print(f"type 0 compilation error = {e}") + results.append(-2) + return results, { + "error": repr(e), + # "error_code": -1, + # "error_message": "Compilation Error", + "traceback": clean_traceback(error_traceback), + } + signal.alarm(0) + + elif which_type == CODE_TYPE.standard_input: + # sol + # if code has if __name__ == "__main__": then remove it + try: + astree = ast.parse(test) + last_block = astree.body[-1] + if isinstance(last_block, ast.If): + condition = last_block.test + if ast.unparse(condition).strip() == "__name__ == '__main__'": + test = ast.unparse(astree.body[:-1]) + "\n" + 
ast.unparse(last_block.body) + except Exception: + pass + + tmp_test = test.split("\n") + + new_test = [] + for x in tmp_test: + if (not x.startswith("from ")) and (not x.startswith("import ")): + new_test.append("\t" + x + "\n") + else: + new_test.append(x + "\n") + tmp_test = new_test + + new_test = "" + started = False + for i in tmp_test: + if i.startswith("\t") and not started: + new_test += "stdin = sys.stdin\nstdout = sys.stdout\n" + new_test += "def code():\n" + new_test += i + started = True + elif started and ((i.startswith("from ")) or (i.startswith("import "))): + new_test += "\t" + i + else: + new_test += i + tmp_test = new_test + + sol += tmp_test + if debug: + print(f"sol = {sol}") + method_name = "code" + signal.alarm(timeout) + try: + tmp_sol = RuntimeModule.from_string("tmp_sol", "", sol) + tmp = tmp_sol + signal.alarm(0) + except Exception as e: + signal.alarm(0) + error_traceback = traceback.format_exc() + if debug: + print(f"type 1 compilation error = {e}") + results.append(-2) + return results, { + "error": repr(e), + # "error_code": -1, + # "error_message": "Compilation Error", + "traceback": clean_traceback(error_traceback), + } + signal.alarm(0) + if debug: + print(f"get method = {datetime.now().time()}") + + try: + method = getattr(tmp, method_name) # get_attr second arg must be str + except Exception: + signal.alarm(0) + error_traceback = traceback.format_exc() + error_info = sys.exc_info() + print(f"unable to get function error = {error_info}") + results.append(-2) + return results, { + "error": repr(error_info), + # "error_code": -1, + # "error_message": "Unable to extract code", + "traceback": clean_traceback(error_traceback), + } + + for index, inputs in enumerate(in_outs["inputs"]): + raw_inputs = inputs + raw_outputs = in_outs["outputs"][index] + if which_type == CODE_TYPE.call_based: + inputs = [json.loads(line) for line in inputs.split("\n")] + in_outs["outputs"][index] = json.loads(in_outs["outputs"][index]) + + truncate_line_size = 300 // (raw_inputs.count("\n") + 1) + raw_inputs = "\n".join( + [truncatefn(line, truncate_line_size) for line in raw_inputs.strip().split("\n")] + ) + raw_outputs = truncatefn(raw_outputs, 200) + else: + raw_inputs = truncatefn(raw_inputs) + raw_outputs = truncatefn(raw_outputs, 200) + # JSON forces dictionaries to have string keys; this undoes this (assuming a singleton list) + try: + if isinstance(inputs[0], dict): + inputs = [{int(k): v for k, v in inputs[0].items()}] + except Exception: + pass + try: + if isinstance(in_outs["outputs"][index], dict): + in_outs["outputs"][index] = [{int(k): v for k, v in in_outs["outputs"][index].items()}] + except Exception: + pass + try: + if isinstance(in_outs["outputs"][index][0], dict): + in_outs["outputs"][index] = [{int(k): v for k, v in in_outs["outputs"][index][0].items()}] + except Exception: + pass + + if debug: + print( + f"time: {datetime.now().time()} testing index = {index} inputs = {inputs}, {type(inputs)}. 
type = {which_type}" + ) + if which_type == CODE_TYPE.call_based: # Call-based + signal.alarm(timeout) + faulthandler.enable() + try: + output = method(*inputs) + raw_true_output = output + + raw_true_output_copy = json.dumps(output) + raw_true_output_copy = truncatefn(raw_true_output_copy, 200) + + # ground truth sequences are not tuples + if isinstance(output, tuple): + output = list(output) + + tmp_result = output == in_outs["outputs"][index] + if isinstance(in_outs["outputs"][index], list) and in_outs["outputs"][index]: + tmp_result = tmp_result or (output == in_outs["outputs"][index][0]) + + # ground truth sequences are not tuples + try: + if isinstance(output[0], tuple): + tmp_result = tmp_result or ([list(x) for x in output] == in_outs["outputs"][index][0]) + except Exception: + pass + results.append(tmp_result) + if tmp_result is not True: + return results, { + "output": raw_true_output_copy, + "expected": raw_outputs, + "inputs": raw_inputs, + # "error_code": -2, + "error_message": "Wrong Answer", + } + # reset the alarm + signal.alarm(0) + except Exception as e: + signal.alarm(0) + error_traceback = traceback.format_exc() + faulthandler.disable() + if debug: + print(f"Standard input runtime error or time limit exceeded error = {e}") + results.append(-1) + return results, { + "error": repr(e), + "traceback": clean_traceback(error_traceback), + } + faulthandler.disable() + signal.alarm(0) + if debug: + print( + f"outputs = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}" + ) + elif which_type == CODE_TYPE.standard_input: # Standard input + faulthandler.enable() + passed = False + + if isinstance(inputs, list): + inputs = "\n".join(inputs) + if isinstance(in_outs["outputs"][index], list): + in_outs["outputs"][index] = "\n".join(in_outs["outputs"][index]) + + signal.alarm(timeout) + with Capturing() as output: + try: + call_method(method, inputs) + # reset the alarm + signal.alarm(0) + passed = True + except Exception as e: + # runtime error or took too long + signal.alarm(0) + error_traceback = traceback.format_exc() + print(f"Call-based runtime error or time limit exceeded error = {repr(e)}{e}") + results.append(-1) + return results, { + "error": repr(e), + "traceback": clean_traceback(error_traceback), + } + signal.alarm(0) + raw_true_output = output[0] + raw_true_output_copy = truncatefn(raw_true_output, 200) + output = raw_true_output.splitlines() + if not passed: + if debug: + nl = "\n" + if not isinstance(inputs, list): + print( + f"not passed output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl, ' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}" + ) + else: + print( + f"not passed output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}" + ) + continue + + if passed and debug: + print(f"==> output = {output}, test outputs = {in_outs['outputs'][index]}") + + if custom_compare_(output, in_outs["outputs"][index]): + tmp_result = True + results.append(tmp_result) + continue + + # ground truth sequences are expressed as lists not tuples + if isinstance(output, tuple): + output = list(output) + + tmp_result = False + try: + tmp_result = output == [in_outs["outputs"][index]] + if isinstance(in_outs["outputs"][index], list): + tmp_result = tmp_result or (output == in_outs["outputs"][index]) + if isinstance(output[0], str): + tmp_result = tmp_result or ([e.strip() 
for e in output] == in_outs["outputs"][index]) + except Exception as e: + if debug: + print(f"Failed check1 exception = {e}") + + if tmp_result is True: + results.append(tmp_result) + continue + + # try one more time without \n + if isinstance(in_outs["outputs"][index], list): + for tmp_index, i in enumerate(in_outs["outputs"][index]): + in_outs["outputs"][index][tmp_index] = i.split("\n") + in_outs["outputs"][index][tmp_index] = [ + x.strip() for x in in_outs["outputs"][index][tmp_index] if x + ] + else: + in_outs["outputs"][index] = in_outs["outputs"][index].split("\n") + in_outs["outputs"][index] = list(filter(len, in_outs["outputs"][index])) + in_outs["outputs"][index] = list(map(lambda x: x.strip(), in_outs["outputs"][index])) + + try: + tmp_result = output == [in_outs["outputs"][index]] + if isinstance(in_outs["outputs"][index], list): + tmp_result = tmp_result or (output == in_outs["outputs"][index]) + except Exception as e: + if debug: + print(f"Failed check2 exception = {e}") + + if tmp_result is True: + results.append(tmp_result) + continue + + # try by converting the output into a split up list too + if isinstance(output, list): + output = list(filter(len, output)) + + if debug: + nl = "\n" + if not isinstance(inputs, list): + print( + f"@1 output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl, ' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]} {tmp_result=}" + ) + else: + print( + f"@1 output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]} {tmp_result=}" + ) + + if debug: + print(f"{tmp_result=} @a") + + try: + tmp_result = output == [in_outs["outputs"][index]] + if isinstance(in_outs["outputs"][index], list): + tmp_result = tmp_result or (output == in_outs["outputs"][index]) + except Exception as e: + if debug: + print(f"Failed check3 exception = {e}") + + if debug: + print(f"{tmp_result=} @b") + + try: + all_ints = all( + combined_int_check(e1) and combined_int_check(e2) + for e1, e2 in zip(output, in_outs["outputs"][index]) + ) + if not all_ints: + if debug: + print( + [ + combined_int_check(e1) and combined_int_check(e2) + for e1, e2 in zip(output, in_outs["outputs"][index]) + ] + ) + output_float = [float(e) for e in output] + gt_float = [float(e) for e in in_outs["outputs"][index]] + tmp_result = tmp_result or ( + (len(output_float) == len(gt_float)) and np.allclose(output_float, gt_float) + ) + except Exception: + pass + + if debug: + print(f"{tmp_result=} @c") + + try: + if isinstance(output[0], list): + all_ints = all( + combined_int_check(e1) and combined_int_check(e2) + for e1, e2 in zip(output[0], in_outs["outputs"][index]) + ) + if not all_ints: + output_float = [float(e) for e in output[0]] + gt_float = [float(e) for e in in_outs["outputs"][index][0]] + tmp_result = tmp_result or ( + (len(output_float) == len(gt_float)) and np.allclose(output_float, gt_float) + ) + except Exception: + pass + + if tmp_result is True: + results.append(tmp_result) + continue + + if debug: + print(f"{tmp_result=} @d") + # try by converting the stuff into split up list + if isinstance(in_outs["outputs"][index], list): + for tmp_index, i in enumerate(in_outs["outputs"][index]): + in_outs["outputs"][index][tmp_index] = set(i.split()) + else: + in_outs["outputs"][index] = set(in_outs["outputs"][index].split()) + + if debug: + print(f"{tmp_result=} @e") + + try: + tmp_result = output == in_outs["outputs"][index] + except Exception as e: + if debug: + 
print(f"Failed check4 exception = {e}") + continue + + if tmp_result is True: + results.append(tmp_result) + continue + + if debug: + print(f"{tmp_result=} @f") + + # try by converting the output into a split up list too + if isinstance(output, list): + for tmp_index, i in enumerate(output): + output[tmp_index] = i.split() + output = list(filter(len, output)) + for tmp_index, i in enumerate(output): + output[tmp_index] = set(i) + else: + output = output.split() + output = list(filter(len, output)) + output = set(output) + + if debug: + print(f"{tmp_result=} @g") + + if tmp_result is True and debug: + print("PASSED") + + results.append(tmp_result) + if tmp_result is not True: + return results, { + "output": raw_true_output_copy, + "expected": raw_outputs, + "inputs": raw_inputs, + # "error_code": -2, + "error_message": "Wrong Answer", + } + + if debug: + nl = "\n" + if not isinstance(inputs, list): + print( + f"@2 output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl, ' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}" + ) + else: + print( + f"@2 output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}" + ) + + print(f"results = {results}") + + return results, {} + + +def custom_compare_(output, ground_truth): + if isinstance(output, list): + output_1 = "\n".join(output) + if stripped_string_compare(output_1, ground_truth): + return True + + if isinstance(output, list): + output_2 = [o.lstrip().rstrip() for o in output] + output_2 = "\n".join(output_2) + if stripped_string_compare(output_2, ground_truth): + return True + + return False + + +def stripped_string_compare(s1, s2): + s1 = s1.lstrip().rstrip() + s2 = s2.lstrip().rstrip() + return s1 == s2 + + +def call_method(method, inputs): + if isinstance(inputs, list): + inputs = "\n".join(inputs) + + inputs_line_iterator = iter(inputs.split("\n")) + + # sys.setrecursionlimit(10000) + + # @patch('builtins.input', side_effect=inputs.split("\n")) + @patch("builtins.open", mock_open(read_data=inputs)) + @patch("sys.stdin", StringIO(inputs)) + @patch("sys.stdin.readline", lambda *args: next(inputs_line_iterator)) + @patch("sys.stdin.readlines", lambda *args: inputs.split("\n")) + @patch("sys.stdin.read", lambda *args: inputs) + # @patch('sys.stdout.write', print) + def _inner_call_method(_method): + try: + return _method() + except SystemExit: + pass + finally: + pass + + return _inner_call_method(method) + + +def reliability_guard(maximum_memory_bytes=None): + """ + This disables various destructive functions and prevents the generated code + from interfering with the test (e.g. fork bomb, killing other processes, + removing filesystem files, etc.) + WARNING + This function is NOT a security sandbox. Untrusted code, including, model- + generated code, should not be blindly executed outside of one. See the + Codex paper for more information about OpenAI's code sandbox, and proceed + with caution. 
+ """ + + if maximum_memory_bytes is not None: + import resource + + resource.setrlimit(resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes)) + resource.setrlimit(resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes)) + if platform.uname().system != "Darwin": + resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes)) + + faulthandler.disable() + + import builtins + + builtins.exit = None + builtins.quit = None + + import os + + os.environ["OMP_NUM_THREADS"] = "1" + + os.kill = None + os.system = None # 防止干扰repl评测 + os.putenv = None + os.remove = None + os.removedirs = None + os.rmdir = None + os.fchdir = None + os.setuid = None + os.fork = None + os.forkpty = None + os.killpg = None + os.rename = None + os.renames = None + os.truncate = None + os.replace = None + os.unlink = None + os.fchmod = None + os.fchown = None + os.chmod = None + os.chown = None + os.chroot = None + os.lchflags = None + os.lchmod = None + os.lchown = None + os.getcwd = None + os.chdir = None + + import shutil + + shutil.rmtree = None + shutil.move = None + shutil.chown = None + + import subprocess + + subprocess.Popen = None # type: ignore + + __builtins__["help"] = None + + import sys + + sys.modules["ipdb"] = None + sys.modules["joblib"] = None + sys.modules["resource"] = None + sys.modules["psutil"] = None + sys.modules["tkinter"] = None diff --git a/applications/ColossalChat/coati/distributed/reward/code_reward/utils.py b/applications/ColossalChat/coati/distributed/reward/code_reward/utils.py new file mode 100644 index 000000000000..e19e9e387b71 --- /dev/null +++ b/applications/ColossalChat/coati/distributed/reward/code_reward/utils.py @@ -0,0 +1,61 @@ +# Code from the verl Project (https://github.com/agentica-project/rllm), +# which itself is adapted from Prime (https://github.com/PRIME-RL/PRIME) +# +# Copyright 2024 ByteDance Group + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import multiprocessing +import os +import sys +import traceback +from typing import Optional + +from .testing_util import run_test + + +def _temp_run(sample, generation, debug, result, metadata_list, timeout): + with open(os.devnull, "w") as devnull: + sys.stdout = devnull + sys.stderr = devnull + try: + res, metadata = run_test(in_outs=sample, test=generation, debug=debug, timeout=timeout) + result.append(res) + metadata_list.append(metadata) + except Exception: + # print(e) # some tracebacks are extremely long. + traceback.print_exc(10) + result.append([-1 for i in range(len(sample["inputs"]))]) + metadata_list.append({}) + + +def check_correctness(in_outs: Optional[dict], generation, timeout=10, debug=True): + """Check correctness of code generation with a global timeout. 
+ The global timeout is to catch some extreme/rare cases not handled by the timeouts + inside `run_test`""" + + manager = multiprocessing.Manager() + result = manager.list() + metadata_list = manager.list() + p = multiprocessing.Process(target=_temp_run, args=(in_outs, generation, debug, result, metadata_list, timeout)) + p.start() + p.join(timeout=timeout + 1) + if p.is_alive(): + p.kill() + # p.terminate() + if not result: + # consider that all tests failed + result = [[-1 for i in range(len(in_outs["inputs"]))]] + if debug: + print("global timeout") + return result[0], metadata_list diff --git a/applications/ColossalChat/coati/distributed/reward/reward_fn.py b/applications/ColossalChat/coati/distributed/reward/reward_fn.py index a4042ae97707..5c8810fdf4cc 100644 --- a/applications/ColossalChat/coati/distributed/reward/reward_fn.py +++ b/applications/ColossalChat/coati/distributed/reward/reward_fn.py @@ -1,7 +1,30 @@ +# Copyright 2024 ByteDance Group + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Some functions in this file are adapted from the verl project +under the Apache License 2.0: +https://github.com/volcengine/verl +""" + + +import json + import torch from latex2sympy2_extended import NormalizationConfig from math_verify import ExprExtractionConfig, LatexExtractionConfig, parse, verify +from .code_reward.utils import check_correctness as check_correctness_code from .reward_utils import extract_boxed_solution, extract_solution, validate_response_structure CANNOT_PARSE_GT_ANSWER = -1 @@ -98,15 +121,11 @@ def math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): raise ValueError("no gt_answer is provided, please check your training dataset.") decoded_final_answer = tokenizer.decode(input_ids[s : e + 1], skip_special_tokens=True) - gt_answer = tokenizer.decode(gt_answer.squeeze(0), skip_special_tokens=True) + final_answer, processed_str = extract_solution(decoded_final_answer) format_valid = validate_response_structure(processed_str, kwargs["tags"]) - # Check format accuracy - if format_valid: - format_acc += 1 - # Check answer accuracy, answer is considered correct if the answer is correct and the format is valid if final_answer is not None: if eval_mode or format_valid: @@ -114,6 +133,10 @@ def math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): if not eval_mode: reward = reward + length_reward + # Check format accuracy + if format_valid: + format_acc += 1 + # Check if the sequence is over length if not eval_mode and res_length >= max_new_tokens: reward *= 0.0 @@ -138,7 +161,6 @@ def boxed_math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): tokenizer = kwargs["tokenizer"] eval_mode = kwargs.get("eval_mode", False) soft_over_length_punishment = kwargs.get("soft_over_length_punishment", False) - format_score = 0.0 acc_score = 10.0 reward = torch.tensor(0.0) format_acc = torch.tensor(0.0) @@ -161,7 +183,6 @@ def boxed_math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): decoded_final_answer = tokenizer.decode(input_ids[s : e + 1], 
skip_special_tokens=True) - gt_answer = tokenizer.decode(gt_answer.squeeze(0), skip_special_tokens=True) final_answer = extract_boxed_solution(decoded_final_answer) format_valid = final_answer is not None if "tags" in kwargs and kwargs["tags"]: @@ -169,10 +190,6 @@ def boxed_math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): format_valid = format_valid and all( [decoded_final_answer.count(tags[tag]["text"]) == tags[tag]["num_occur"] for tag in tags] ) - # Check format accuracy - if format_valid: - format_acc += 1 - reward += format_score # Check answer accuracy, answer is considered correct if the answer is correct and the format is valid if final_answer is not None: @@ -181,6 +198,10 @@ def boxed_math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): if not eval_mode: reward = reward + length_reward + # Check format accuracy + if format_valid: + format_acc += 1 + # Check if the sequence is over length if not eval_mode and res_length >= max_new_tokens: reward *= 0.0 @@ -199,3 +220,78 @@ def boxed_math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): "response_length": res_length, "reward": reward.item(), } + + +def code_reward_fn(input_ids, test_cases, response_idx, **kwargs): + tokenizer = kwargs["tokenizer"] + eval_mode = kwargs.get("eval_mode", False) + soft_over_length_punishment = kwargs.get("soft_over_length_punishment", False) + acc_score = 10.0 + reward = torch.tensor(0.0) + format_acc = torch.tensor(0.0) + ans_acc = torch.tensor(0.0) + s, e = response_idx[0], response_idx[1] + + length_reward = 0.0 + res_length = e.item() - s.item() + 1 + if not eval_mode: + max_new_tokens = kwargs["max_new_tokens"] + else: + max_new_tokens = -1 # for eval mode, we don't need to check the length + if not eval_mode and soft_over_length_punishment: + cache_length = kwargs["cache_length"] + if max_new_tokens - cache_length < res_length < max_new_tokens: + length_reward = ((max_new_tokens - cache_length) - res_length) / cache_length * acc_score + + # try to get code solution from completion. if the completion is pure code, this will not take effect. + decoded_final_answer = tokenizer.decode(input_ids[s : e + 1], skip_special_tokens=True) + + solution = decoded_final_answer.split("```python")[-1].split("```")[0] + format_valid = False + if "```python" in decoded_final_answer: + format_valid = solution is not None + + # Check format accuracy + if format_valid: + format_acc += 1 + + try: + try: + if not isinstance(test_cases, dict): + test_cases = json.loads(test_cases) + except Exception as e: + print(f"Error {e}: Cannot parse test cases.") + raise e + # Complete check on all in-out pairs first. If there is no failure, per-sample test can be skipped. 
+ try: + res, metadata = check_correctness_code(in_outs=test_cases, generation=solution, timeout=10, debug=True) + metadata = dict(enumerate(metadata))[0] + success = all(map(lambda x: x is True, res)) + if success: + ans_acc += 1 + if eval_mode or format_valid: + reward += acc_score + if not eval_mode: + reward = reward + length_reward + except Exception: + pass + + # Check if the sequence is over length + if not eval_mode and res_length >= max_new_tokens: + reward *= 0.0 + except Exception: + pass + if not eval_mode: + return torch.tensor([reward, format_acc, ans_acc]).to(input_ids.device) + else: + prompt = tokenizer.decode(input_ids[:s], skip_special_tokens=True) + return { + "prompt": prompt, + "prediction": decoded_final_answer, + "gold": test_cases["outputs"], + "parsed": solution, + "format_valid": format_acc.item(), + "ans_valid": ans_acc.item(), + "response_length": res_length, + "reward": reward.item(), + } diff --git a/applications/ColossalChat/coati/distributed/reward/verifiable_reward.py b/applications/ColossalChat/coati/distributed/reward/verifiable_reward.py index ba83f7787586..2f0b3778c153 100644 --- a/applications/ColossalChat/coati/distributed/reward/verifiable_reward.py +++ b/applications/ColossalChat/coati/distributed/reward/verifiable_reward.py @@ -2,6 +2,7 @@ Function-based reward verification module. """ +import inspect from typing import Any, Dict, List import torch @@ -15,7 +16,8 @@ def __init__(self, reward_fns: List[callable], **kwargs: List[Dict[str, Any]]): def __call__( self, input_ids: torch.LongTensor, - gt_answer: List[torch.Tensor] = None, + gt_answer: List[str] = None, + test_cases: List[str] = None, response_idx: List[torch.Tensor] = None, ) -> torch.Tensor: # Get batch size @@ -26,18 +28,44 @@ def __call__( # Loop through reward functions for reward_fn in self.reward_fns: # Apply the reward function to the entire batch at once - reward_batch = torch.stack( - [ - reward_fn( - input_ids[i], - gt_answer=gt_answer[i], - response_idx=response_idx[i], - **self.kwargs, - ) - for i in range(bs) - ], - dim=0, - ) + if "gt_answer" in inspect.getfullargspec(reward_fn).args: + reward_batch = torch.stack( + [ + reward_fn( + input_ids[i], + gt_answer=gt_answer[i], + response_idx=response_idx[i], + **self.kwargs, + ) + for i in range(bs) + ], + dim=0, + ) + elif "test_cases" in inspect.getfullargspec(reward_fn).args: + reward_batch = torch.stack( + [ + reward_fn( + input_ids[i], + test_cases=test_cases[i], + response_idx=response_idx[i], + **self.kwargs, + ) + for i in range(bs) + ], + dim=0, + ) + else: + reward_batch = torch.stack( + [ + reward_fn( + input_ids[i], + response_idx=response_idx[i], + **self.kwargs, + ) + for i in range(bs) + ], + dim=0, + ) rewards += reward_batch return rewards diff --git a/applications/ColossalChat/requirements.txt b/applications/ColossalChat/requirements.txt index 472080101b9b..6b9511ad551f 100755 --- a/applications/ColossalChat/requirements.txt +++ b/applications/ColossalChat/requirements.txt @@ -22,3 +22,6 @@ sentencepiece==0.1.99 flash-attn tiktoken jsonlines +math_verify +latex2sympy2_extended +pyext diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index b4f565617a26..0536a746df70 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -101,7 +101,7 @@ "--reward-type", type=str, default="think_answer_tags", - choices=["think_answer_tags", "boxed"], + choices=["think_answer_tags", "boxed", "code"], help="Reward type for GRPO.", ) 
parser.add_argument( @@ -167,10 +167,29 @@ if args.master_address is None: # Default settings: Using single machine - ray.init(address="local", namespace="ray-example") + ray.init( + address="local", + namespace="ray-example", + runtime_env={ + "env_vars": { + # "RAY_DEBUG_POST_MORTEM": "1" # enable post-mortem debugging with ray + "TOKENIZERS_PARALLELISM": "false" + }, + }, + ) else: # For ray distributed multi-machine training, Please change _node_ip_address to your IP address of your master node - ray.init(_node_ip_address=args.master_address, namespace="ray-example", _temp_dir=args.ray_dir) + ray.init( + _node_ip_address=args.master_address, + namespace="ray-example", + _temp_dir=args.ray_dir, + runtime_env={ + "env_vars": { + # "RAY_DEBUG_POST_MORTEM": "1" # enable post-mortem debugging with ray + "TOKENIZERS_PARALLELISM": "false" + }, + }, + ) if args.top_k is None: if args.backend == "transformers": @@ -178,6 +197,8 @@ elif args.backend == "vllm": args.top_k = -1 + os.environ["TOKENIZERS_PARALLELISM"] = "false" # Disable tokenizers parallelism to avoid deadlock + inference_model_config = dict(path=args.model) train_model_config = dict(path=args.model, use_flash_attention_2=True, use_cache=False) generate_config = dict(top_k=args.top_k, top_p=args.top_p, temperature=args.temperature) @@ -288,7 +309,6 @@ "max_length": args.max_prompt_tokens, "system_prompt": args.system_prompt, }, - dataloaders_config={}, inference_model_config=inference_model_config, generate_config=generate_config, num_generations=args.num_generations, From 3bed6ae9eea18424a1305b2d7b977a3c66fd2621 Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Mon, 9 Jun 2025 09:37:28 +0800 Subject: [PATCH 079/100] fix bug, tested --- .gitignore | 4 ++++ applications/ColossalChat/coati/distributed/consumer.py | 6 ++---- applications/ColossalChat/coati/distributed/producer.py | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 0503f3c9522d..e603f5015ec7 100644 --- a/.gitignore +++ b/.gitignore @@ -167,3 +167,7 @@ applications/ColossalChat/wandb applications/ColossalChat/model applications/ColossalChat/eval applications/ColossalChat/rollouts +applications/ColossalChat/*.txt +applications/ColossalChat/*.db +applications/ColossalChat/stdin +applications/ColossalChat/*.zip diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index 3ae4a1796579..a7abb1588d55 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -132,9 +132,7 @@ def loop(self) -> None: format_acc = raw_batch["format_acc"][:, :, 0] ans_acc = raw_batch["ans_acc"][:, :, 0] response_len = ( - raw_batch["response_idx"][:, :, 1] - - raw_batch["response_idx"][:, :, 0] - + 1 + raw_batch["response_idx"][:, :, 1] - raw_batch["response_idx"][:, :, 0] + 1 ).type(torch.float32) effective_group_mask = None if self.filter_range is not None and self.grpo_config.get("dynamic_batching", True): @@ -160,7 +158,7 @@ def loop(self) -> None: ) if effective_group_mask is not None: print( - f"[T{dist.get_rank()}] Filter recv data: {len(raw_batch_with_reward)} -> {torch.sum(effective_group_mask).cpu().item()} effective groups" + f"[T{dist.get_rank()}] Filter recv data: {len(raw_batch)} -> {torch.sum(effective_group_mask).cpu().item()} effective groups" ) # mapping the effective group to the raw group for indexing effective_group_to_raw_group_mapping = {} diff --git 
a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py index 135c8db0e55f..23542f1c63d8 100644 --- a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -291,7 +291,7 @@ def loop(self) -> None: reward_model_output = self.reward_model( outputs["input_ids"].view((-1, outputs["input_ids"].size(-1))), gt_answer=gt_answer, - response_idx=outputs["response_idx"], + response_idx=outputs["response_idx"].view((-1, 2)), ) outputs["reward"] = ( torch.tensor([value[0] for value in reward_model_output]) From d0e12c508a4e72b73ed7f890f4850d16140fb756 Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Mon, 9 Jun 2025 09:42:58 +0800 Subject: [PATCH 080/100] remove debug code --- applications/ColossalChat/coati/distributed/producer.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py index 23542f1c63d8..82e4b6ac1b23 100644 --- a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -273,12 +273,10 @@ def loop(self) -> None: [self.model.generate_config["temperature"]] * outputs["input_ids"].size(0) ).to(outputs["input_ids"].device) bs, num_gen = outputs["input_ids"].size(0), outputs["input_ids"].size(1) - # breakpoint() if self.grpo_config["reward_fn_type"] == "code": test_cases = [] for prompt_id in range(bs): test_cases.extend([outputs["test_cases"][prompt_id]] * num_gen) - # assert all([isinstance(test_cases[i], str) for i in range(len(test_cases))]), f"{[type(test_cases[i]) for i in range(len(test_cases))]}, {test_cases}" reward_model_output = self.reward_model( outputs["input_ids"].view((-1, outputs["input_ids"].size(-1))), test_cases=test_cases, @@ -315,7 +313,6 @@ def loop(self) -> None: print(f"[P{self.producer_idx}] Send data {[(k, v.shape) for k, v in outputs.items()]}") outputs = pre_send(outputs) - # breakpoint() ray_broadcast_tensor_dict( outputs, src=0, device=self.device, group_name=f"sync_data_{self.producer_idx}" ) From 9ca920c1af5076ca2816710852cf0ab4410ba3d3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Jun 2025 01:48:19 +0000 Subject: [PATCH 081/100] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- applications/ColossalChat/coati/distributed/grpo_consumer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 2e4d8f08cda9..0e956ca413c2 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -1,5 +1,5 @@ from contextlib import nullcontext -from typing import Any, Dict, Optional +from typing import Any, Optional import ray import torch From bb6f5d98fc94d857284d3816591e3e248fcb5f46 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Tue, 10 Jun 2025 13:53:19 +0800 Subject: [PATCH 082/100] move out evaluation func (#6343) Co-authored-by: Tong Li --- .../ColossalChat/coati/distributed/producer.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py index 82e4b6ac1b23..854c2fcc2b95 100644 --- 
a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -128,6 +128,14 @@ def __init__( drop_last=True, collate_fn=collate_fn_grpo, ) + if grpo_config["reward_fn_type"] == "think_answer_tags": + self.evaluation_function = math_reward_fn + elif grpo_config["reward_fn_type"] == "boxed": + self.evaluation_function = boxed_math_reward_fn + elif grpo_config["reward_fn_type"] == "code": + self.evaluation_function = code_reward_fn + else: + raise ValueError(f"Unknown evaluation function type {grpo_config['reward_fn_type']}") self.eval_dataset_config = eval_dataset_config if self.eval_dataset_config is not None: @@ -151,14 +159,6 @@ def __init__( ), collate_fn=collate_fn_grpo, ) - if grpo_config["reward_fn_type"] == "think_answer_tags": - self.evaluation_function = math_reward_fn - elif grpo_config["reward_fn_type"] == "boxed": - self.evaluation_function = boxed_math_reward_fn - elif grpo_config["reward_fn_type"] == "code": - self.evaluation_function = code_reward_fn - else: - raise ValueError(f"Unknown evaluation function type {grpo_config['reward_fn_type']}") else: print("No eval dataset provided, skip eval") self.device = get_current_device() From 21d517d0fa393c526d025c7b3f3a1f1e54eda330 Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Tue, 10 Jun 2025 15:00:48 +0800 Subject: [PATCH 083/100] Manually schedule resources and support auto master address assigning --- .../ColossalChat/coati/distributed/launch.py | 48 ++++++++++++++++--- 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/launch.py b/applications/ColossalChat/coati/distributed/launch.py index 41ba8ea55f2c..30e7382ef552 100644 --- a/applications/ColossalChat/coati/distributed/launch.py +++ b/applications/ColossalChat/coati/distributed/launch.py @@ -79,8 +79,35 @@ def launch_distributed( f"{project_name.replace(' ','_')}_run_{wandb_group_name}.jsonl", ) - procs = [] + # Attention: Ray use complex schedualing method that consider various factors including load-balancing. + # when requesting resources, it is not guaranteed that the resource comes from a node with lower node it + # this go against the design principle of our implementation, and we need to manually force the schedualing, + # allocating the producer to nodes with lower node id and the consumer to the resouces from nodes with higher + # node id. 
See the reference here: https://docs.ray.io/en/latest/ray-core/scheduling/index.html#nodeaffinityschedulingstrategy + nodes = ray.nodes() + node_info = { + node["NodeID"]: { + "num_gpus": node["Resources"].get("GPU", 0), + "address": node["NodeManagerAddress"], + } # Default to 0 if no GPUs are available + for node in nodes + } + gpu_to_node_id = [] + gpu_to_ip_address = [] + for node_id in node_info: + for idx in range(int(node_info[node_id]["num_gpus"])): + gpu_to_node_id.append(node_id) + gpu_to_ip_address.append(node_info[node_id]["address"]) + print(node_info) + + producer_procs = [] for i in range(num_producers): + node_id = gpu_to_node_id[0] + producer_ip_address = gpu_to_ip_address[0] + for _ in range(num_proc_per_producer): + gpu_to_node_id.pop(0) + gpu_to_ip_address.pop(0) + print(f"Schedual Producer P[{i}] which requires {num_proc_per_producer} GPUs on node {producer_ip_address}") producer = SimpleProducer.options(num_gpus=num_proc_per_producer).remote( producer_idx=i, num_producers=num_producers, @@ -106,20 +133,29 @@ def launch_distributed( log_rollout_interval=log_rollout_interval, rollout_log_file=rollout_log_file, ) - procs.append(producer) + producer_procs.append(producer) + ray.get([p.setup.remote() for p in producer_procs]) generate_config_consumer = copy.deepcopy(generate_config) generate_config_consumer.update( dict( backend=inference_backend, ) ) + consumer_master_ip_address = gpu_to_ip_address[0] + print(f"Use {consumer_master_ip_address} as master address for torch DDP.") + consumer_procs = [] for i in range(num_consumer_procs): + node_id = gpu_to_node_id[0] + consumer_ip_address = gpu_to_ip_address[0] + gpu_to_node_id.pop(0) + gpu_to_ip_address.pop(0) + print(f"Schedual Consumer T[{i}] which requires 1 GPUs on node {consumer_ip_address}") consumer = core_consumer.options(num_gpus=1).remote( num_producers=num_producers, num_episodes=num_episodes, rank=i, world_size=num_consumer_procs, - master_addr=master_addr, + master_addr=consumer_master_ip_address, master_port=master_port, num_update_per_episode=num_update_per_episode, num_recv_per_update=num_recv_per_update, @@ -136,6 +172,6 @@ def launch_distributed( run_name=run_name, wandb_group_name=wandb_group_name, ) - procs.append(consumer) - ray.get([p.setup.remote() for p in procs]) - ray.get([p.loop.remote() for p in procs]) + consumer_procs.append(consumer) + ray.get([p.setup.remote() for p in consumer_procs]) + ray.get([p.loop.remote() for p in (producer_procs + consumer_procs)]) From 25599246c5080cdfcc03e794495aa2c075ab048d Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Tue, 10 Jun 2025 17:00:35 +0800 Subject: [PATCH 084/100] modify readme --- .../ColossalChat/coati/distributed/README.md | 312 +++++++++++++++++- applications/ColossalChat/rl_example.py | 10 + 2 files changed, 321 insertions(+), 1 deletion(-) diff --git a/applications/ColossalChat/coati/distributed/README.md b/applications/ColossalChat/coati/distributed/README.md index b7bac2b2db93..262ab6aab446 100644 --- a/applications/ColossalChat/coati/distributed/README.md +++ b/applications/ColossalChat/coati/distributed/README.md @@ -1,6 +1,316 @@ -# Requirements +# Distributed RL Framework for Language Model Fine-Tuning +This repository implements a distributed Reinforcement Learning (RL) training framework designed to fine-tune large language models using algorithms such as **GRPO** and **DAPO**. It supports multi-node and multi-GPU setups, scalable rollout generation, and policy optimization using libraries like VLLM. 
Currently, we supports two Reinforcement Learning with Verifiable Reward (RLVR) tasks: solving math problems and code generation. + +**Please note that we are still under intensive development, stay tuned.** + +--- + +## 🚀 Features + +* **Distributed Training with Ray**: Scalable to multiple machines and GPUs. +* **Support for GRPO and DAPO**: Choose your preferred policy optimization algorithm. +* **Model Backends**: Support `vllm` as inference backends. +* **Rollout and Policy Decoupling**: Efficient generation and consumption of data through parallel inferencer-trainer architecture. +* **Evaluation Integration**: Easily plug in task-specific eval datasets. +* **Checkpoints and Logging**: Configurable intervals and directories. + +--- + +## 🛠 Installation + +### Prepare Develop Environment + +Install Colossalai & ColossalChat +```bash +git clone https://github.com/hpcaitech/ColossalAI.git +git checkout grpo-latest +BUILD_EXT=1 pip install -e . + +cd ./applications/ColossalChat +pip install -e . +``` + +Install vllm +```bash +pip install vllm==0.7.3 +``` + +Install Ray. +```bash +pip install ray +``` + +Install Other Dependencies ```bash pip install cupy-cuda12x python -m cupyx.tools.install_library --cuda 12.x --library nccl ``` + +Prepare Model & dataset +```bash +huggingface-cli download --local-dir-use-symlinks False Qwen/Qwen2.5-7B --local-dir /models/Qwen/Qwen2.5-7B +``` + +## Architecture Design + +
+<!-- figure: producer-consumer architecture diagram (image) -->
+
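The sketch below is an editorial illustration of the producer-consumer architecture this section describes; it is not part of the `coati.distributed` API. It only demonstrates the shared-buffer idea with Python's standard `queue` and `threading` modules, and every name in it is invented for the example.

```python
import queue
import threading
import time

# Shared buffer between the rollout producer and the training consumer.
buffer = queue.Queue(maxsize=4)

def producer(num_batches: int) -> None:
    for i in range(num_batches):
        rollout = {"batch_id": i, "samples": [f"sample_{i}_{j}" for j in range(2)]}
        buffer.put(rollout)  # blocks when the buffer is full (backpressure)
        print(f"[producer] pushed batch {i}")

def consumer(num_batches: int) -> None:
    for _ in range(num_batches):
        rollout = buffer.get()  # blocks until a rollout batch is available
        time.sleep(0.1)         # stand-in for one policy-update step
        print(f"[consumer] trained on batch {rollout['batch_id']}")
        buffer.task_done()

p = threading.Thread(target=producer, args=(8,))
c = threading.Thread(target=consumer, args=(8,))
p.start(); c.start()
p.join(); c.join()
```

In the actual framework the buffer lives on the trainer side and rollout batches are moved between processes with Ray collective broadcasts rather than an in-process queue, but the blocking put/get relationship is the same.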
+Producer-Consumer Pattern: a classic software design pattern used for managing resources, data, or tasks between two different processes or threads. + +* Producer: inference engine which rollouts out examples and saves them into a shared buffer. +* Consumer: training framework which takes training examples from the shared buffer and train the policy model. + +Key features for Producer-Consumer Pattern: +* Buffer: Acts as a shared queue where the producer adds data and the consumer removes data. +* Concurrency: Rollout and training can work concurrently. + +## 🧠 Data Format + +Samples in the training or evaluation `.jsonl` file should the same format depends on the type of task, we currently support two RLVR tasks: solving math problems and code generation. + +### Math Data Format +```json +{ + "messages": { + "role": "user", + "content": "Simplify $\\sqrt[3]{1+8} \\cdot \\sqrt[3]{1+\\sqrt[3]{8}}$." + }, + "gt_answer": "3" +} +``` + +### Code Data Format +We support [Prime code dataset format](https://github.com/PRIME-RL/PRIME). Inputs and outputs in test cases should be two lists containing only strings and matching in the number of elements. You prompt must properly instruct the LLM to generate code to read test cases from stdin and output results to stdout. +```json +{ + "messages": { + "role": "user", + "content": "Solve the following coding problem using the programming language python:\n\nMikhail walks on a Cartesian plane. He starts at the point $(0, 0)$, and in one move he can go to any of eight adjacent points. For example, ..." + }, + "test_cases": { + "inputs": [ + "3\n2 2 3\n4 3 7\n10 1 9\n" + ], + "outputs": [ + "1\n6\n-1\n" + ] + } +} +``` + +--- + +## ⚙️ Hyperparameters & Arguments + +| Argument | Description | Example | +| ---------------- | --------------------------------------- | ----------------- | +| `--model` | Model path or identifier | `/path/to/model` | +| `--dataset` | Path to training `.jsonl` | `/path/to/train_data.jsonl` | +| `--eval-dataset` | JSON of task\:eval\_dataset\_path pairs | `{"eval_1":"/path/to/eval_1.jsonl"}` | +| `--project` | Project name | `Project1` | +| `--num-episodes` | Number of training episodes | `1` | + +### Distributed Training + +| Argument | Description | Example | +| ----------------------------- | ------------------------------------- | ------- | +| `--num-trainers` | Number of trainer processes | `4` | +| `--num-inferencer` | Number of inferencer processes | `4` | +| `--inference-batch-size` | Prompts per inference step | `8` | +| `--inference-microbatch-size` | Per-GPU batch size for inference | `8` | +| `--train-batch-size` | Prompts per trainer step per dp group | `8` | +| `--train-minibatch-size` | Mini-batch size before forward pass | `8` | +| `--train-microbatch-size` | Per-GPU batch size for training | `2` | + +### Sampling + +| Argument | Description | Example | +| --------------------- | --------------------- | -------------- | +| `--backend` | Generation backend, choose from `vllm` | `vllm` | +| `--temperature` | Sampling temperature for generation | `1.0` | +| `--top-k` | Top-K sampling parameter for generation | `None` | +| `--top-p` | Top-P sampling parameter for generation | `1.0` | +| `--system-prompt` | System prompt, Optional, default to the default system prompt for each reward types. 
For more information, refer to the [**reward type**](#-constraints-and-notes) section | `Please reason step by step, and put your final answer within \\boxed{}.` | +| `--max-new-tokens` | Max generation tokens | `3584` | +| `--max-prompt-tokens` | Max prompt tokens | `512` | + +### GRPO Specific + +| Argument | Description | Example | +| ----------------- | ---------------------------- | ------------------- | +| `--algo` | Algorithm (`GRPO` or `DAPO`), for more customization refer to [GRPO Settings](#️-grpo-settings) | `GRPO` | +| `--learning-rate` | Learning rate | `1e-6` | +| `--kl-coeff` | KL penalty coefficient, if nonzero, a reference model will be used | `0.01` | +| `--reward-type` | Reward signal type (choose from 'think_answer_tags', 'boxed', 'code') For more information, refer to the [**reward type**](#-constraints-and-notes) section | `think_answer_tags` | +| `--eval-interval` | Evaluation interval in number of training steps (positive value to enable evaluation) | `10` | + +### Logging and Checkpointing + +| Argument | Description | Example | +| -------------------- | ------------------------- | ------------ | +| `--save-interval` | Training steps between checkpoints | `20` | +| `--save-dir` | Checkpoint directory | `./model` | +| `--eval-save-dir` | Evaluation save path | `./eval` | +| `--rollout-save-dir` | Rollout logs directory | `./rollouts` | + +### Miscellaneous + +| Argument | Description | Example | +| ------------------ | --------------------------------------- | ------- | +| `--ray_dir` | Custom Ray temp dir of a running Ray cluster (optional) | `None` | +| `--master_address` | Master address of a running Ray cluster | `None` | +| `--master_port` | Master port for torch DDP | `29506` | + +--- + +## ⚙️ GRPO Settings + +In addition to the two default training settings we provided--- original `GRPO` and `DAPO`, users can customize their training by changing the following hyperparameters in `grpo_config` in `rl_example.py`. + +| Argument Name | Description | Default | +| ----------------------------- | ---------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------- | +| `filter_range` | Filters out rollout group if the success rate within that group is out of this range.| `[0.01, 0.99]` | +| `dynamic_batching` | Enables dynamic batching as described in the [DAPO paper](https://arxiv.org/abs/2503.14476). | `True` | +| `clip_eps_low` | epsilon_low in DAPO in equation in [DAPO paper](https://arxiv.org/abs/2503.14476) | `0.2` | +| `clip_eps_high` | epsilon_high in DAPO equation in [DAPO paper](https://arxiv.org/abs/2503.14476) | `0.28` | +| `skip_threshold` | If ratio is above this threshold, the sample is skipped to avoid instability. | `20.0` | +| `loss_variation` | Type of loss variation. Supports `"token_level"` for token-wise policy gradient loss and `sample_level` for original GRPO loss. | `"token_level"` | +| `soft_over_length_punishment` | Whether to use soft overlength penalty in [DAPO paper](https://arxiv.org/abs/2503.14476) or not. | `True` | +| `cache_length` | `L_cache` parameter for soft overlength penalty in e.q. 13 in [DAPO paper](https://arxiv.org/abs/2503.14476) | `min(1024, int(args.max_new_tokens / 4))` | +| `filter_truncated_response` | Mask out truncated responses in loss calculation. 
| `True` | + + + +## 🔄 Constraints and Notes + +* `num_inferencer + num_trainer == NUM_GPUs` +* `num_inferencer % num_trainer == 0` +* `(num_inferencer * inference_batch_size) % (num_trainer * train_batch_size) == 0` +* `train_batch_size >= train_minibatch_size >= train_microbatch_size` +* `inference_batch_size >= inference_microbatch_size` +* Set microbatch sizes based on **VRAM capacity** +* To use tensor parallelism on inferencer + * set backend to `vllm` + * change `tensor_parallel_size` in `inference_model_config` in rl_example.py + * set `num_inferencer = NUM_INFERENCE_GPUs / tensor_parallel_size` +* To set tensor parallelism / pipeline parallelism / zero stage + * change corresponding settings in `plugin_config` in rl_example.py +* Ensure rollout generation rate matches trainer consumption: + + ``` + num_inferencer * inference_batch_size % ( + num_trainer * train_batch_size / + train_pipeline_parallelism_size / + train_tensor_parallelism_size + ) == 0 + ``` +* Model weights sync every: + + ``` + (num_inferencer * inference_batch_size) / + (num_trainer * train_batch_size / + train_pipeline_parallelism_size / + train_tensor_parallelism_size) + ``` +* Reward Type + + We currently support three reward types--- `think_answer_tags`, `boxed`, `code`, each varies in details such as how answer is extracted and the reward calculation process. Please select one from `think_answer_tags`, `boxed` for math problem solving and use `code` for code generation. The default system prompt for each reward type is as follows. Please make sure your system prompt provides information for the answer to be correctly extracted from model responses. + + * think_answer_tags + + Answer extraction: extract the content between the last ``, `` tags. + + ``` + You are a helpful assistant. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . Now the user asks you to solve a math problem that involves reasoning. After thinking, when you finally reach a conclusion, clearly output the final answer without explanation within the tags, i.e., 123 .\n\n + ``` + * boxed + + Answer extraction: extract the last content marked by `\\boxed{}` + ``` + Please reason step by step, and put your final answer within \\boxed{}. + ``` + * code + + Answer extraction: extract code inside ` ```python\n...``` ` + ``` + You are a helpful assistant. + ``` + generated code is +--- + +## 🧪 Example: single machine 8-GPU Zero2 Strategy + +```bash +python rl_example.py \ + --dataset /path/to/train_data.jsonl \ + --model /path/to/Qwen2.5-3B/ \ + -t 4 -i 4 \ + -b vllm \ + -ibs 2 -tbs 4 -tMbs 1 -tmbs 4 -imbs 1 \ + -rt boxed \ + -g 4 \ + -ibs 1 \ + -tbs 2 \ + -tMbs 1 \ + -tmbs 2 \ + -imbs 1 \ + -s "Please reason step by step, and put your final answer within \\boxed{}." \ + -tMbs 8 \ + -p GRPO-Train-Align-Debug \ +``` + +## 🧪 Example: multi-machine TP+PP Strategy + +### Create ray cluster on multi-machine +For example, now we have 4 nodes and their IPs are 10.0.0.3, 10.0.0.4, 10.0.0.5, 10.0.0.6. +We use 10.0.0.3 as master node. 
First we start a ray cluster on 10.0.0.3: +```bash +ray start --head --node-ip-address=10.0.0.3 +``` + +Then, for each slave node (10.0.0.4/10.0.0.5/10.0.0.6), we add to the ray cluser by following code: +```bash +ray start --address='10.0.0.3:6379' +``` + +Modify plugin_config in ./applications/ColossalChat/rl_example.py +```python +plugin_config={ + "tp_size": 4, + "pp_size": 2, + "microbatch_size": max( + 1, args.train_microbatch_size // 2 + ), # microbatch size should be set to train_microbatch_size // pp_size + "zero_stage": 1, + "max_norm": 1.0, + }, # for pp, tp +``` + +```bash +# Hint1: replace /models/Qwen/Qwen2.5-7B to your model path +# replace /datasets/train-alignment.jsonl to your dataset path +python rl_example.py + -m /path/to/Qwen2.5-Math-7B/ \ + -d /path/to/train_data.jsonl \ + --master_address '10.0.0.3' + -t 16 \ + -i 16 \ + -p GRPO-Train-Align-Debug \ + -g 2 \ + -ibs 1 \ + -tbs 2 \ + -tMbs 1 \ + -tmbs 2 \ + -imbs 1 \ + -b vllm \ + -e 2 \ + -rt boxed \ + -s "Please reason step by step, and put your final answer within \\boxed{}." +``` + +## Acknowledgement +Colossal-RL is a distributed version of ColossalChat and inspired by a few awesome open-source projects. We would like to express our gratitude to the Fuyao-ray team and the vllm-ascend team for their support throughout the development of the this project. We also thank the following awesome open-source projects and algorithms: GRPO, DAPO, TRL, Verl, OpenRLHF, StreamRL, Qwen, Logic-RL. diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index 0536a746df70..c60923e00cca 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -6,6 +6,12 @@ import torch from coati.distributed.launch import launch_distributed +DEFAUT_SYSTEM_PROMPT = { + "think_answer_tags": "You are a helpful assistant. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within and tags, respectively, i.e., reasoning process here answer here . Now the user asks you to solve a math problem that involves reasoning. 
After thinking, when you finally reach a conclusion, clearly output the final answer without explanation within the tags, i.e., 123 .\n\n", + "boxed": "Please reason step by step, and put your final answer within \\boxed{}.", + "code": "You are a helpful assistant.", +} + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("-m", "--model", type=str, default="Qwen/Qwen2.5-7B") @@ -295,6 +301,10 @@ else: raise ValueError(f"Unsupported algorithm: {args.algo}") + if args.system_prompt is None: + # Default system prompt + args.system_prompt = DEFAUT_SYSTEM_PROMPT[args.reward_type] + launch_distributed( num_producers=args.num_inferencer, num_proc_per_producer=inference_model_config.get("tensor_parallel_size", args.producer_tensor_parallel_size), From dc29c746327d7c18fbabc5e26e2739f5178271a5 Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Tue, 10 Jun 2025 17:17:41 +0800 Subject: [PATCH 085/100] update readme --- .../ColossalChat/coati/distributed/README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/README.md b/applications/ColossalChat/coati/distributed/README.md index 262ab6aab446..29bd8d413a2b 100644 --- a/applications/ColossalChat/coati/distributed/README.md +++ b/applications/ColossalChat/coati/distributed/README.md @@ -1,6 +1,6 @@ # Distributed RL Framework for Language Model Fine-Tuning -This repository implements a distributed Reinforcement Learning (RL) training framework designed to fine-tune large language models using algorithms such as **GRPO** and **DAPO**. It supports multi-node and multi-GPU setups, scalable rollout generation, and policy optimization using libraries like VLLM. Currently, we supports two Reinforcement Learning with Verifiable Reward (RLVR) tasks: solving math problems and code generation. +This repository implements a distributed Reinforcement Learning (RL) training framework designed to fine-tune large language models using algorithms such as **GRPO** and **DAPO**. It supports multi-node and multi-GPU setups, scalable rollout generation, and policy optimization using libraries like VLLM. Currently, we support two Reinforcement Learning with Verifiable Reward (RLVR) tasks: solving math problems and code generation. **Please note that we are still under intensive development, stay tuned.** @@ -70,7 +70,7 @@ Key features for Producer-Consumer Pattern: ## 🧠 Data Format -Samples in the training or evaluation `.jsonl` file should the same format depends on the type of task, we currently support two RLVR tasks: solving math problems and code generation. +Samples in the training or evaluation `.jsonl` file should follow the format specific to the type of task. We currently support two RLVR tasks: solving math problems and code generation. ### Math Data Format ```json @@ -84,7 +84,7 @@ Samples in the training or evaluation `.jsonl` file should the same format depen ``` ### Code Data Format -We support [Prime code dataset format](https://github.com/PRIME-RL/PRIME). Inputs and outputs in test cases should be two lists containing only strings and matching in the number of elements. You prompt must properly instruct the LLM to generate code to read test cases from stdin and output results to stdout. +We support [Prime code dataset format](https://github.com/PRIME-RL/PRIME). Inputs and outputs in test cases should be two lists containing only strings and matching in the number of elements. 
Your prompt must properly instruct the LLM to generate code to read test cases from stdin and output results to stdout. ```json { "messages": { @@ -134,7 +134,7 @@ We support [Prime code dataset format](https://github.com/PRIME-RL/PRIME). Input | `--temperature` | Sampling temperature for generation | `1.0` | | `--top-k` | Top-K sampling parameter for generation | `None` | | `--top-p` | Top-P sampling parameter for generation | `1.0` | -| `--system-prompt` | System prompt, Optional, default to the default system prompt for each reward types. For more information, refer to the [**reward type**](#-constraints-and-notes) section | `Please reason step by step, and put your final answer within \\boxed{}.` | +| `--system-prompt` | System prompt, optional, default to the default system prompt for each reward types. For more information, refer to the [**reward type**](#-constraints-and-notes) section | `Please reason step by step, and put your final answer within \\boxed{}.` | | `--max-new-tokens` | Max generation tokens | `3584` | | `--max-prompt-tokens` | Max prompt tokens | `512` | @@ -169,7 +169,7 @@ We support [Prime code dataset format](https://github.com/PRIME-RL/PRIME). Input ## ⚙️ GRPO Settings -In addition to the two default training settings we provided--- original `GRPO` and `DAPO`, users can customize their training by changing the following hyperparameters in `grpo_config` in `rl_example.py`. +In addition to the two default training settings provided—`GRPO` and `DAPO`—users can customize their training by changing the following hyperparameters in `grpo_config` in `rl_example.py`. | Argument Name | Description | Default | | ----------------------------- | ---------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------- | @@ -272,7 +272,7 @@ We use 10.0.0.3 as master node. First we start a ray cluster on 10.0.0.3: ray start --head --node-ip-address=10.0.0.3 ``` -Then, for each slave node (10.0.0.4/10.0.0.5/10.0.0.6), we add to the ray cluser by following code: +Then, for each slave node (10.0.0.4/10.0.0.5/10.0.0.6), we add to the ray cluster by following code: ```bash ray start --address='10.0.0.3:6379' ``` @@ -313,4 +313,4 @@ python rl_example.py ``` ## Acknowledgement -Colossal-RL is a distributed version of ColossalChat and inspired by a few awesome open-source projects. We would like to express our gratitude to the Fuyao-ray team and the vllm-ascend team for their support throughout the development of the this project. We also thank the following awesome open-source projects and algorithms: GRPO, DAPO, TRL, Verl, OpenRLHF, StreamRL, Qwen, Logic-RL. +Colossal-RL is a distributed version of ColossalChat and inspired by a few awesome open-source projects. We would like to express our gratitude to the following awesome open-source projects and algorithms: GRPO, DAPO, TRL, Verl, OpenRLHF, StreamRL, Qwen, Logic-RL. 
From 1330b5753ba15e2613a6f85149cace4f366ed15e Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Tue, 10 Jun 2025 18:21:42 +0800 Subject: [PATCH 086/100] add ray timeout handling instruction --- applications/ColossalChat/coati/distributed/README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/applications/ColossalChat/coati/distributed/README.md b/applications/ColossalChat/coati/distributed/README.md index 29bd8d413a2b..85055a5f3c3b 100644 --- a/applications/ColossalChat/coati/distributed/README.md +++ b/applications/ColossalChat/coati/distributed/README.md @@ -47,6 +47,14 @@ pip install cupy-cuda12x python -m cupyx.tools.install_library --cuda 12.x --library nccl ``` +To support long input/output sequence length (e.g., 32K), you may need to manually change the default setting (180 seconds) for the `timeout_s` variable in your ray installation to a larger value as shown in the screenshot below. + +
+<!-- screenshot: location of the `timeout_s` default setting in the Ray installation (image) -->
+
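If you are unsure where that default lives in your environment, the search below is one way to locate it. This is an editorial helper only, not an official Ray configuration interface; the file that defines `timeout_s` varies across Ray versions.

```bash
# Find where `timeout_s` is defined in the locally installed Ray package.
RAY_DIR=$(python -c "import os, ray; print(os.path.dirname(ray.__file__))")
grep -rn --include="*.py" "timeout_s" "$RAY_DIR" | head
```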
+ Prepare Model & dataset ```bash huggingface-cli download --local-dir-use-symlinks False Qwen/Qwen2.5-7B --local-dir /models/Qwen/Qwen2.5-7B From 8992def757f7f27e0b6dadfaa6f66ff6caa95e89 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Wed, 11 Jun 2025 17:54:18 +0800 Subject: [PATCH 087/100] fix pp memory issue (#6344) Co-authored-by: Tong Li --- applications/ColossalChat/coati/distributed/grpo_consumer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 0e956ca413c2..51cdcf322da6 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -345,7 +345,7 @@ def _criterion(outputs, inputs): criterion=_criterion, optimizer=self.optimizer, return_loss=True, - return_outputs=True, + return_outputs=False, ) loss = policy_model_outputs["loss"] From 2f02a287772519fc6eef66f08fe3ab2e85e24053 Mon Sep 17 00:00:00 2001 From: YeAnbang <44796419+YeAnbang@users.noreply.github.com> Date: Thu, 12 Jun 2025 11:21:31 +0800 Subject: [PATCH 088/100] Update README.md --- applications/ColossalChat/coati/distributed/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/applications/ColossalChat/coati/distributed/README.md b/applications/ColossalChat/coati/distributed/README.md index 85055a5f3c3b..21647a8cc896 100644 --- a/applications/ColossalChat/coati/distributed/README.md +++ b/applications/ColossalChat/coati/distributed/README.md @@ -247,7 +247,6 @@ In addition to the two default training settings provided—`GRPO` and `DAPO`— ``` You are a helpful assistant. ``` - generated code is --- ## 🧪 Example: single machine 8-GPU Zero2 Strategy From 51b7abe9ddaee6619e18f111fc80afeb4d217304 Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Thu, 12 Jun 2025 15:06:01 +0800 Subject: [PATCH 089/100] fix num_update_per_episode --- applications/ColossalChat/coati/distributed/launch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/applications/ColossalChat/coati/distributed/launch.py b/applications/ColossalChat/coati/distributed/launch.py index 30e7382ef552..dc8bf0057217 100644 --- a/applications/ColossalChat/coati/distributed/launch.py +++ b/applications/ColossalChat/coati/distributed/launch.py @@ -16,7 +16,7 @@ def get_jsonl_size_fast(path: str) -> int: with open(path) as f: lines = f.readlines() lines = [line for line in lines if line.strip()] - return len(lines) - 1 + return len(lines) def get_dp_size_fast(n_procs: int, plugin_config: Dict[str, Any]) -> int: From 30a6859f7792a9026d388224f8fa50e4d6e711ac Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Fri, 13 Jun 2025 18:21:54 +0800 Subject: [PATCH 090/100] optimize pp log_softmax OOM --- .gitignore | 2 + .../coati/distributed/grpo_consumer.py | 37 +++++++++++++------ 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index e603f5015ec7..94d952295410 100644 --- a/.gitignore +++ b/.gitignore @@ -171,3 +171,5 @@ applications/ColossalChat/*.txt applications/ColossalChat/*.db applications/ColossalChat/stdin applications/ColossalChat/*.zip +applications/ColossalChat/*.prof +applications/ColossalChat/*.png diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 51cdcf322da6..040ca61a439b 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ 
b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -280,13 +280,21 @@ def step(self, step_idx: int, pbar: Any, **kwargs) -> Optional[float]: ) if self.booster.plugin.stage_manager.is_last_stage(): - reference_model_logits = reference_model_outputs["outputs"]["logits"] - reference_action_log_probs = calc_action_log_probs( - reference_model_logits / self.generate_config["temperature"], - input_ids_forward_micro_batch, - num_action, - self.plugin.shard_config, + reference_action_log_probs = torch.zeros( + (input_ids_forward_micro_batch.size(0), num_action), + device=input_ids_forward_micro_batch.device, ) + for i in range(reference_action_log_probs.size(0)): + # activation for log_softmax is too large if vocab size and sequence length are large + # e.g., when using 152064 vocab size with 32K seqence length and a micro batch size of 4 (for pp=4 for example), + # this activation sorely takes 152064*32000*4*4/1024/1024/1024=72.5GB + reference_action_log_probs[i, :] += calc_action_log_probs( + reference_model_outputs["outputs"]["logits"][i : i + 1] + / self.generate_config["temperature"], + input_ids_forward_micro_batch[i : i + 1], + num_action, + self.plugin.shard_config, + )[0] else: # Dummy reference logprobs for data iterator. reference_action_log_probs = None @@ -308,12 +316,19 @@ def step(self, step_idx: int, pbar: Any, **kwargs) -> Optional[float]: def _criterion(outputs, inputs): action_logits = outputs.logits - action_log_probs = calc_action_log_probs( - action_logits / self.generate_config["temperature"], - inputs["input_ids"], - num_action, - self.plugin.shard_config, + action_log_probs = torch.zeros( + (inputs["input_ids"].size(0), num_action), device=action_logits.device ) + for i in range(action_log_probs.size(0)): + # activation for log_softmax is too large if vocab size and sequence length are large + # e.g., when using 152064 vocab size with 32K seqence length and a micro batch size of 4 (for pp=4 for example), + # this activation sorely takes 152064*32000*4*4/1024/1024/1024=72.5GB + action_log_probs[i, :] += calc_action_log_probs( + action_logits[i : i + 1] / self.generate_config["temperature"], + inputs["input_ids"][i : i + 1], + num_action, + self.plugin.shard_config, + )[0] if "reference_action_log_probs" in inputs: per_token_kl = ( torch.exp(inputs["reference_action_log_probs"] - action_log_probs) From e3d56cbd860da04d1b81a7917fac10d8aaedaa9a Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Wed, 18 Jun 2025 10:24:48 +0000 Subject: [PATCH 091/100] implement memory efficient logprob --- .../coati/distributed/grpo_consumer.py | 45 ++++++------------ .../ColossalChat/coati/distributed/utils.py | 46 ++++++++++++------- 2 files changed, 43 insertions(+), 48 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 040ca61a439b..5e8f329eb3b0 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -6,7 +6,7 @@ import wandb from coati.distributed.consumer import BaseConsumer from coati.distributed.loss import PolicyLoss -from coati.distributed.utils import calc_action_log_probs +from coati.distributed.utils import memory_efficient_logprob from coati.trainer.utils import all_reduce_mean, all_reduce_sum from transformers import AutoModelForCausalLM, AutoTokenizer @@ -280,21 +280,12 @@ def step(self, step_idx: int, pbar: Any, **kwargs) -> Optional[float]: ) if 
self.booster.plugin.stage_manager.is_last_stage(): - reference_action_log_probs = torch.zeros( - (input_ids_forward_micro_batch.size(0), num_action), - device=input_ids_forward_micro_batch.device, + reference_action_log_probs = memory_efficient_logprob( + reference_model_outputs["outputs"]["logits"], + input_ids_forward_micro_batch, + num_action, + shard_config=self.plugin.shard_config, ) - for i in range(reference_action_log_probs.size(0)): - # activation for log_softmax is too large if vocab size and sequence length are large - # e.g., when using 152064 vocab size with 32K seqence length and a micro batch size of 4 (for pp=4 for example), - # this activation sorely takes 152064*32000*4*4/1024/1024/1024=72.5GB - reference_action_log_probs[i, :] += calc_action_log_probs( - reference_model_outputs["outputs"]["logits"][i : i + 1] - / self.generate_config["temperature"], - input_ids_forward_micro_batch[i : i + 1], - num_action, - self.plugin.shard_config, - )[0] else: # Dummy reference logprobs for data iterator. reference_action_log_probs = None @@ -316,19 +307,12 @@ def step(self, step_idx: int, pbar: Any, **kwargs) -> Optional[float]: def _criterion(outputs, inputs): action_logits = outputs.logits - action_log_probs = torch.zeros( - (inputs["input_ids"].size(0), num_action), device=action_logits.device + action_log_probs = memory_efficient_logprob( + action_logits, + inputs["input_ids"], + num_action, + shard_config=self.plugin.shard_config, ) - for i in range(action_log_probs.size(0)): - # activation for log_softmax is too large if vocab size and sequence length are large - # e.g., when using 152064 vocab size with 32K seqence length and a micro batch size of 4 (for pp=4 for example), - # this activation sorely takes 152064*32000*4*4/1024/1024/1024=72.5GB - action_log_probs[i, :] += calc_action_log_probs( - action_logits[i : i + 1] / self.generate_config["temperature"], - inputs["input_ids"][i : i + 1], - num_action, - self.plugin.shard_config, - )[0] if "reference_action_log_probs" in inputs: per_token_kl = ( torch.exp(inputs["reference_action_log_probs"] - action_log_probs) @@ -370,16 +354,15 @@ def _criterion(outputs, inputs): mean_kl.append(kl) mean_loss.append(all_reduce_mean(loss, self.plugin).data) else: - policy_model_logits = self.policy_model( input_ids=input_ids_forward_micro_batch, attention_mask=attention_mask_forward_micro_batch, ).logits - action_log_probs = calc_action_log_probs( + action_log_probs = memory_efficient_logprob( policy_model_logits / self.generate_config["temperature"], input_ids_forward_micro_batch, num_action, - self.plugin.shard_config, + shard_config=self.plugin.shard_config, ) if self.policy_loss_fn.beta > 0: @@ -388,7 +371,7 @@ def _criterion(outputs, inputs): input_ids=input_ids_forward_micro_batch, attention_mask=attention_mask_forward_micro_batch, ).logits - reference_action_log_probs = calc_action_log_probs( + reference_action_log_probs = memory_efficient_logprob( reference_model_logits / self.generate_config["temperature"], input_ids_forward_micro_batch, num_action, diff --git a/applications/ColossalChat/coati/distributed/utils.py b/applications/ColossalChat/coati/distributed/utils.py index a40ebbcfbe92..d46243114eea 100644 --- a/applications/ColossalChat/coati/distributed/utils.py +++ b/applications/ColossalChat/coati/distributed/utils.py @@ -71,31 +71,43 @@ def log_probs_from_logits(logits: torch.Tensor, labels: torch.Tensor) -> torch.T return per_label_logps.squeeze(-1) -def calc_action_log_probs( +def memory_efficient_logprob( logits: 
torch.Tensor, - sequences: torch.LongTensor, - num_actions: int, - shard_config, + inputs: torch.Tensor, + num_action: int, + chunk_size: int = 2048, + shard_config: Any = None, vocab_size: int = None, ) -> torch.Tensor: - """Calculate action log probs. - + """ + Calculate action log probs in a memory-efficient way by processing in chunks. Args: logits (torch.Tensor): Output tensor of Actor.forward.logits. - sequences (torch.LongTensor): Input sequences. - num_actions (int): Number of actions. - shard_config - vocab_size - - + inputs (torch.LongTensor): Input sequences. + num_action (int): Number of actions. + chunk_size (int, optional): Size of each chunk to process. Default is 2048. + shard_config: Shard configuration for distributed computation. + vocab_size (int, optional): Vocabulary size. Default is None. Returns: torch.Tensor: Action log probs. """ - # labels: torch.Tensor, # [B, S] or [B, S, Vocab_size] - # logits: torch.Tensor, # [B, S, Vocab_size] - log_probs = dist_log_prob(sequences, logits, shard_config, vocab_size, logits.dtype) - log_probs = log_probs.squeeze(-1) - return log_probs[:, -num_actions:] + action_log_probs = torch.zeros((logits.size(0), num_action), device=logits.device, dtype=logits.dtype) + context_length = logits.size(1) - num_action + for i in range(action_log_probs.size(0)): + # loop over each sample in the micro-batch + for start in range(context_length, logits.size(1), chunk_size): + end = min(start + chunk_size, logits.size(1)) + # calculate log probs in chunks to save memory + log_probs = dist_log_prob( + inputs[i : i + 1, start - 1 : end], + logits[i : i + 1, start - 1 : end], + shard_config, + vocab_size, + logits.dtype, + ) # [1, chunk_size, 1] + log_probs = log_probs.squeeze(-1) + action_log_probs[i, start - context_length : end - context_length] += log_probs[0] + return action_log_probs def masked_mean(tensor: torch.Tensor, mask: torch.Tensor, dim: int = 1) -> torch.Tensor: From 6b06430ca418715240c10a28baf14ff3e4b35ca0 Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Thu, 19 Jun 2025 01:37:52 +0000 Subject: [PATCH 092/100] fix small bug --- .../ColossalChat/coati/distributed/grpo_consumer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 5e8f329eb3b0..d7d6221a1033 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -281,7 +281,7 @@ def step(self, step_idx: int, pbar: Any, **kwargs) -> Optional[float]: if self.booster.plugin.stage_manager.is_last_stage(): reference_action_log_probs = memory_efficient_logprob( - reference_model_outputs["outputs"]["logits"], + reference_model_outputs["outputs"]["logits"] / self.generate_config["temperature"], input_ids_forward_micro_batch, num_action, shard_config=self.plugin.shard_config, @@ -308,7 +308,7 @@ def step(self, step_idx: int, pbar: Any, **kwargs) -> Optional[float]: def _criterion(outputs, inputs): action_logits = outputs.logits action_log_probs = memory_efficient_logprob( - action_logits, + action_logits / self.generate_config["temperature"], inputs["input_ids"], num_action, shard_config=self.plugin.shard_config, @@ -375,7 +375,7 @@ def _criterion(outputs, inputs): reference_model_logits / self.generate_config["temperature"], input_ids_forward_micro_batch, num_action, - self.plugin.shard_config, + shard_config=self.plugin.shard_config, ) per_token_kl = ( 
torch.exp(reference_action_log_probs - action_log_probs) From 8880b83791fa9d34d733f365236d8b7beabdb0ee Mon Sep 17 00:00:00 2001 From: Tong Li Date: Thu, 19 Jun 2025 14:02:08 +0800 Subject: [PATCH 093/100] add dp rank for multi-dp (#6351) Co-authored-by: Tong Li --- .../ColossalChat/coati/distributed/grpo_consumer.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index d7d6221a1033..8d50734a9957 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -130,7 +130,10 @@ def __init__( def setup(self): super().setup() if (not self.plugin.pp_size > 1 and self.rank == 0) or ( - self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() and self.tp_rank == 0 + self.plugin.pp_size > 1 + and self.booster.plugin.stage_manager.is_last_stage() + and self.tp_rank == 0 + and self.dp_rank == 0 ): self.wandb_run = wandb.init( project=self.project_name, @@ -222,7 +225,6 @@ def step(self, step_idx: int, pbar: Any, **kwargs) -> Optional[float]: effective_samples = all_reduce_sum(torch.sum(loss_mask), self.plugin) effective_tokens_count = torch.sum(action_mask, dim=-1) * loss_mask total_effective_tokens_count = all_reduce_sum(torch.sum(effective_tokens_count), self.plugin) - total_samples = all_reduce_sum(torch.sum(torch.ones_like(loss_mask, device=loss_mask.device)), self.plugin) self.effective_sample_count += effective_samples.item() pbar.set_postfix( { @@ -407,7 +409,10 @@ def _criterion(outputs, inputs): mean_kl.append(kl.data) mean_loss.append(loss.data) if not self.plugin.pp_size > 1 or ( - self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() and self.tp_rank == 0 + self.plugin.pp_size > 1 + and self.booster.plugin.stage_manager.is_last_stage() + and self.tp_rank == 0 + and self.dp_rank == 0 ): reward = all_reduce_mean(reward.mean(), self.plugin) format_acc = all_reduce_mean(format_acc.mean(), self.plugin) From b1f646c7e75552c8b9a87d21c55f0c97cf1f7e7e Mon Sep 17 00:00:00 2001 From: YeAnbang <44796419+YeAnbang@users.noreply.github.com> Date: Mon, 30 Jun 2025 13:21:08 +0800 Subject: [PATCH 094/100] [feat[ Support one-behind to reduce bubble time. 
Add profiling code (#6353) * support n_behind, add profiling * fix bugs * fix visualization * fix behind * fix loop issue * add profiling * fix update * update assert * remove assert --------- Co-authored-by: Tong Li --- .../coati/distributed/consumer.py | 171 +++++++++++++----- .../coati/distributed/grpo_consumer.py | 4 + .../ColossalChat/coati/distributed/launch.py | 6 + .../coati/distributed/producer.py | 27 ++- .../coati/distributed/profiling_utils.py | 37 ++++ applications/ColossalChat/profiling.sh | 13 ++ applications/ColossalChat/rl_example.py | 89 +++++---- applications/ColossalChat/visualization.py | 100 ++++++++++ 8 files changed, 365 insertions(+), 82 deletions(-) create mode 100644 applications/ColossalChat/coati/distributed/profiling_utils.py create mode 100755 applications/ColossalChat/profiling.sh create mode 100644 applications/ColossalChat/visualization.py diff --git a/applications/ColossalChat/coati/distributed/consumer.py b/applications/ColossalChat/coati/distributed/consumer.py index a7abb1588d55..e360392e74fa 100644 --- a/applications/ColossalChat/coati/distributed/consumer.py +++ b/applications/ColossalChat/coati/distributed/consumer.py @@ -6,6 +6,7 @@ import ray.util.collective as cc import torch import torch.distributed as dist +from coati.distributed.profiling_utils import CustomProfiler from tqdm import tqdm from transformers import AutoModelForCausalLM @@ -36,6 +37,8 @@ def __init__( minibatch_size: int = 1, save_interval: int = 100, save_dir: str = "./model", + enable_profiling: bool = False, + n_behind: int = 0, ): self.num_producers = num_producers self.num_episodes = num_episodes @@ -49,6 +52,7 @@ def __init__( self.minibatch_size = minibatch_size self.save_interval = save_interval self.save_dir = save_dir + self.enable_profiling = enable_profiling assert batch_size % minibatch_size == 0, "batch_size should be divisible by microbatch_size" self.num_microbatches = batch_size // minibatch_size @@ -57,6 +61,7 @@ def __init__( self.device = get_current_device() self.lr_scheduler = None + self.n_behind = n_behind def setup(self) -> None: launch(self.rank, self.world_size, self.master_addr, self.master_port, local_rank=0) @@ -94,6 +99,7 @@ def setup(self) -> None: self.buffer = [] self.recv_cnt = 0 + self.profiler = CustomProfiler(f"C{self.rank}", disabled=not self.enable_profiling) def state_dict(self) -> Dict[str, torch.Tensor]: raise NotImplementedError @@ -101,6 +107,41 @@ def state_dict(self) -> Dict[str, torch.Tensor]: def step(self, step_idx: int, **kwargs) -> Optional[float]: raise NotImplementedError + def prepare_mini_batch(self, effective_group_to_raw_group_mapping: Dict[int, int]) -> Dict[str, torch.Tensor]: + """ + Prepare a mini-batch from the effective group to raw group mapping. + This method is used to create a mini-batch for training. 
+ """ + batches = [ + self.buffer[effective_group_to_raw_group_mapping[i]] + for i in range(self.dp_rank * self.minibatch_size, (self.dp_rank + 1) * self.minibatch_size) + ] + # every dp_rank will receive a complete mini-batch, no need to sync within step() later + # each mini-batch use the first self.dp_size * minibatch_size effective samples + raw_mini_batches = self.buffer[ + : effective_group_to_raw_group_mapping[self.dp_size * self.minibatch_size - 1] + 1 + ] # include the last effective sample + raw_mini_batches_metric_dict = { + "raw_train_mini_batch_reward": [t[1] for t in raw_mini_batches], + "raw_train_mini_batch_format_acc": [t[2] for t in raw_mini_batches], + "raw_train_mini_batch_ans_acc": [t[3] for t in raw_mini_batches], + "raw_train_mini_batch_response_len": [t[4] for t in raw_mini_batches], + } + batch = bind_batch([t[0] for t in batches]) + batch = post_recv(batch) + return batch, raw_mini_batches_metric_dict + + def calculate_effective_group_to_raw_group_mapping(self, step): + effective_group_to_raw_group_mapping = {} + for buffer_idx in range(len(self.buffer)): + if self.buffer[buffer_idx][0] is not None: + if self.n_behind == 0: + effective_group_to_raw_group_mapping[len(effective_group_to_raw_group_mapping)] = buffer_idx + else: + if self.buffer[buffer_idx][-1] <= step - self.n_behind: + effective_group_to_raw_group_mapping[len(effective_group_to_raw_group_mapping)] = buffer_idx + return effective_group_to_raw_group_mapping + def loop(self) -> None: print( f"Consumer{self.rank} num_update: {self.num_update_per_episode}, num_recv: {self.num_recv_per_update}, nmb: {self.num_microbatches}" @@ -112,14 +153,53 @@ def loop(self) -> None: disable=self.rank != 0, ) as pbar: for step in pbar: + torch.cuda.reset_peak_memory_stats() i = 0 + + self.profiler.enter(f"rollout_episode_{episode}_step_{step}") for _ in range(self.num_recv_per_update): + if self.n_behind > 0: + # after sync model, do not wait for more data to arrive as rollout takes time, use buffered data + effective_group_to_raw_group_mapping = self.calculate_effective_group_to_raw_group_mapping( + step=step + ) + while len(effective_group_to_raw_group_mapping) >= self.dp_size * self.minibatch_size: + self.profiler.log( + f"Still have {len(effective_group_to_raw_group_mapping)} effective groups, greater than {self.dp_size * self.minibatch_size}, start training" + ) + batch, raw_mini_batches_metric_dict = self.prepare_mini_batch( + effective_group_to_raw_group_mapping + ) + self.profiler.enter("step") + loss = self.step(i, pbar, **batch, **raw_mini_batches_metric_dict) + self.profiler.exit("step") + self.buffer = self.buffer[ + effective_group_to_raw_group_mapping[self.dp_size * self.minibatch_size - 1] + 1 : + ] + # recalculate the effective group to raw group mapping + effective_group_to_raw_group_mapping_size_before = len( + effective_group_to_raw_group_mapping + ) + effective_group_to_raw_group_mapping = ( + self.calculate_effective_group_to_raw_group_mapping(step=step) + ) + assert ( + len(effective_group_to_raw_group_mapping) + == effective_group_to_raw_group_mapping_size_before + - self.dp_size * self.minibatch_size + ) + if loss is not None: + pbar.set_postfix({"loss": loss}) + i += 1 + # receive data from producers for r in range(self.num_producers): print(f"[T{dist.get_rank()}] Recv data episode {episode} step {step} from {r}") + self.profiler.enter(f"recv_broadcast_data_P{r}") raw_batch = ray_broadcast_tensor_dict( None, src=0, device=self.device, group_name=f"sync_data_{r}" ) + 
self.profiler.exit(f"recv_broadcast_data_P{r}") # calculate group reward et al. filtering. As only the filtered group will be used for training (which is incomplete), # we need to calculate the metrics before filtering here for logging # [batch_size, num_generations, ...] -> [batch_size * num_generations, ...] @@ -154,6 +234,7 @@ def loop(self) -> None: format_acc[group_idx], ans_acc[group_idx], response_len[group_idx], + step, ] ) if effective_group_mask is not None: @@ -161,56 +242,44 @@ def loop(self) -> None: f"[T{dist.get_rank()}] Filter recv data: {len(raw_batch)} -> {torch.sum(effective_group_mask).cpu().item()} effective groups" ) # mapping the effective group to the raw group for indexing - effective_group_to_raw_group_mapping = {} - for buffer_idx in range(len(self.buffer)): - if self.buffer[buffer_idx][0] is not None: - effective_group_to_raw_group_mapping[len(effective_group_to_raw_group_mapping)] = ( - buffer_idx - ) + effective_group_to_raw_group_mapping = self.calculate_effective_group_to_raw_group_mapping( + step=step + ) print( f"[T{dist.get_rank()}] Collect Effective Prompt: {len(effective_group_to_raw_group_mapping)}/{self.dp_size * self.minibatch_size}" ) - while len(effective_group_to_raw_group_mapping) >= self.dp_size * self.minibatch_size: - # on each dp_rank, we use minibatch_size effective samples to form a batch - batches = [ - self.buffer[effective_group_to_raw_group_mapping[i]] - for i in range( - self.dp_rank * self.minibatch_size, (self.dp_rank + 1) * self.minibatch_size + if self.n_behind == 0: + # If n_behind is 0, we start training after receiving data from producers. + while len(effective_group_to_raw_group_mapping) >= self.dp_size * self.minibatch_size: + self.profiler.log( + f"Collect {len(effective_group_to_raw_group_mapping)} effective groups, greater than {self.dp_size * self.minibatch_size}, start training" ) - ] - # every dp_rank will receive a complete mini-batch, no need to sync within step() later - # each mini-batch use the first self.dp_size * minibatch_size effective samples - raw_mini_batches = self.buffer[ - : effective_group_to_raw_group_mapping[self.dp_size * self.minibatch_size - 1] + 1 - ] # include the last effective sample - raw_mini_batches_metric_dict = { - "raw_train_mini_batch_reward": [t[1] for t in raw_mini_batches], - "raw_train_mini_batch_format_acc": [t[2] for t in raw_mini_batches], - "raw_train_mini_batch_ans_acc": [t[3] for t in raw_mini_batches], - "raw_train_mini_batch_response_len": [t[4] for t in raw_mini_batches], - } - batch = bind_batch([t[0] for t in batches]) - batch = post_recv(batch) - loss = self.step(i, pbar, **batch, **raw_mini_batches_metric_dict) - self.buffer = self.buffer[ - effective_group_to_raw_group_mapping[self.dp_size * self.minibatch_size - 1] + 1 : - ] - # recalculate the effective group to raw group mapping - effective_group_to_raw_group_mapping_size_before = len(effective_group_to_raw_group_mapping) - effective_group_to_raw_group_mapping = {} - for buffer_idx in range(len(self.buffer)): - if self.buffer[buffer_idx][0] is not None: - effective_group_to_raw_group_mapping[len(effective_group_to_raw_group_mapping)] = ( - buffer_idx - ) - assert ( - len(effective_group_to_raw_group_mapping) - == effective_group_to_raw_group_mapping_size_before - self.dp_size * self.minibatch_size - ) - if loss is not None: - pbar.set_postfix({"loss": loss}) - i += 1 + batch, raw_mini_batches_metric_dict = self.prepare_mini_batch( + effective_group_to_raw_group_mapping + ) + self.profiler.enter("step") + loss = 
self.step(i, pbar, **batch, **raw_mini_batches_metric_dict) + self.profiler.exit("step") + self.buffer = self.buffer[ + effective_group_to_raw_group_mapping[self.dp_size * self.minibatch_size - 1] + 1 : + ] + # recalculate the effective group to raw group mapping + effective_group_to_raw_group_mapping_size_before = len( + effective_group_to_raw_group_mapping + ) + effective_group_to_raw_group_mapping = ( + self.calculate_effective_group_to_raw_group_mapping(step=step) + ) + assert ( + len(effective_group_to_raw_group_mapping) + == effective_group_to_raw_group_mapping_size_before + - self.dp_size * self.minibatch_size + ) + if loss is not None: + pbar.set_postfix({"loss": loss}) + i += 1 + if self.lr_scheduler is not None: self.lr_scheduler.step() if (step + 1) % self.save_interval == 0 or (step + 1) == self.num_update_per_episode: @@ -221,13 +290,16 @@ def loop(self) -> None: if self.rank == 0: print(f"Saved model checkpoint at step {step + 1} in folder {save_path}") - if episode != self.num_episodes - 1 or step != self.num_update_per_episode - 1: + if (episode != self.num_episodes - 1 or step != self.num_update_per_episode - 1) and ( + episode != 0 or step >= self.n_behind + ): if self.pp_size > 1: print( f"[T{dist.get_rank()}] Sync model PP stage {self.pp_rank} episode {episode} step {step}" ) else: print(f"[T{dist.get_rank()}] Sync model episode {episode} step {step}") + self.profiler.enter("sync_model") torch.cuda.empty_cache() state_dict = self.state_dict() if self.pp_size > 1: @@ -245,6 +317,13 @@ def loop(self) -> None: ) del state_dict torch.cuda.empty_cache() + self.profiler.exit("sync_model") + self.profiler.log(f"Peak memory usage: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB") + self.profiler.exit(f"rollout_episode_{episode}_step_{step}") + + def __del__(self): + if hasattr(self, "profiler"): + self.profiler.close() @ray.remote diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 8d50734a9957..f8ce1afde4b4 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -38,6 +38,8 @@ def __init__( project_name: str = None, run_name: str = None, wandb_group_name: str = None, + enable_profiling: bool = False, + n_behind: int = 0, ): print(f"Using GRPO config: {grpo_config}") if ( @@ -63,6 +65,8 @@ def __init__( minibatch_size, save_interval=save_interval, save_dir=save_dir, + enable_profiling=enable_profiling, + n_behind=n_behind, ) path = model_config.pop("path") self.policy_model = AutoModelForCausalLM.from_pretrained(path, **model_config) diff --git a/applications/ColossalChat/coati/distributed/launch.py b/applications/ColossalChat/coati/distributed/launch.py index dc8bf0057217..a48246c87526 100644 --- a/applications/ColossalChat/coati/distributed/launch.py +++ b/applications/ColossalChat/coati/distributed/launch.py @@ -57,6 +57,8 @@ def launch_distributed( eval_generation_config: Optional[Dict[str, Any]] = None, log_rollout_interval: int = 20, rollout_save_dir: str = "./rollout", + enable_profiling: bool = False, + n_behind: int = 0, ): if core_algo not in ALGO_MAP: raise NotImplementedError(f"{core_algo} is not supported yet.") @@ -132,6 +134,8 @@ def launch_distributed( wandb_group_name=wandb_group_name, log_rollout_interval=log_rollout_interval, rollout_log_file=rollout_log_file, + enable_profiling=enable_profiling, + n_behind=n_behind, ) producer_procs.append(producer) ray.get([p.setup.remote() 
for p in producer_procs]) @@ -171,6 +175,8 @@ def launch_distributed( project_name=project_name, run_name=run_name, wandb_group_name=wandb_group_name, + enable_profiling=enable_profiling, + n_behind=n_behind, ) consumer_procs.append(consumer) ray.get([p.setup.remote() for p in consumer_procs]) diff --git a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py index 854c2fcc2b95..2a37463913ee 100644 --- a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -9,6 +9,7 @@ import tqdm import wandb from coati.dataset.loader import RawConversationDataset, collate_fn_grpo +from coati.distributed.profiling_utils import CustomProfiler from coati.distributed.reward.reward_fn import boxed_math_reward_fn, code_reward_fn, math_reward_fn from coati.distributed.reward.verifiable_reward import VerifiableReward from ray.util.collective import allreduce @@ -52,6 +53,8 @@ def __init__( wandb_group_name: str = None, log_rollout_interval: int = 20, rollout_log_file: str = "./rollout_log.jsonl", + enable_profiling: bool = False, + n_behind: int = 0, ): self.producer_idx = producer_idx self.num_producers = num_producers @@ -62,6 +65,7 @@ def __init__( assert batch_size % microbatch_size == 0 self.num_microbatches = batch_size // microbatch_size self.latest_eval_step = -1 + self.profiler = CustomProfiler(f"P{self.producer_idx}", disabled=not enable_profiling) self.train_dataset_config = train_dataset_config self.model_config = model_config @@ -75,6 +79,7 @@ def __init__( self.log_rollout_interval = log_rollout_interval self.latest_rollout_log_step = -1 self.grpo_config = grpo_config + self.n_behind = n_behind reward_model_kwargs = { k: v for k, v in grpo_config.items() @@ -268,11 +273,14 @@ def loop(self) -> None: self.wandb_run.log(to_log_msg, step=self.consumer_global_step) self.eval_mode = False self.latest_eval_step = self.consumer_global_step + self.profiler.enter("rollout") outputs = self.rollout(**batch) + self.profiler.exit("rollout") outputs["temperature"] = torch.tensor( [self.model.generate_config["temperature"]] * outputs["input_ids"].size(0) ).to(outputs["input_ids"].device) bs, num_gen = outputs["input_ids"].size(0), outputs["input_ids"].size(1) + self.profiler.enter("calculate_reward") if self.grpo_config["reward_fn_type"] == "code": test_cases = [] for prompt_id in range(bs): @@ -310,14 +318,19 @@ def loop(self) -> None: outputs.pop("gt_answer") if "test_cases" in outputs: outputs.pop("test_cases") + self.profiler.exit("calculate_reward") print(f"[P{self.producer_idx}] Send data {[(k, v.shape) for k, v in outputs.items()]}") outputs = pre_send(outputs) + self.profiler.enter("send_broadcast_data") ray_broadcast_tensor_dict( outputs, src=0, device=self.device, group_name=f"sync_data_{self.producer_idx}" ) - if (i + 1) % self.num_microbatches == 0 and ( - episode != self.num_episodes - 1 or i != num_valid_microbatches - 1 + self.profiler.exit("send_broadcast_data") + if ( + (i + 1) % self.num_microbatches == 0 + and (episode != self.num_episodes - 1 or i != num_valid_microbatches - 1) + and (episode != 0 or (i + 1) > self.n_behind * self.num_microbatches) ): if isinstance(self.model, BACKEND_MAP["vllm"]) and self.model.model_config.get( "enable_sleep_mode", False @@ -325,7 +338,7 @@ def loop(self) -> None: self.model.llm.sleep() # revict KV_cache to avoid OOM # don't sync model for last iteration torch.cuda.empty_cache() - + self.profiler.enter("sync_model") if 
self.consumer_pp_size > 1: for pp_idx in range(self.consumer_pp_size): print( @@ -347,6 +360,7 @@ def loop(self) -> None: if "consumer_global_step" in state_dict: self.consumer_global_step = state_dict.pop("consumer_global_step").item() self.load_state_dict(state_dict) + self.profiler.exit("sync_model") del state_dict torch.cuda.empty_cache() if isinstance(self.model, BACKEND_MAP["vllm"]) and self.model.model_config.get( @@ -364,6 +378,9 @@ def loop(self) -> None: "temperature" ] + ratio * 0.9 + def __del__(self): + self.profiler.close() + @ray.remote class SimpleProducer(BaseProducer): @@ -392,6 +409,8 @@ def __init__( wandb_group_name: str = None, log_rollout_interval: int = 20, rollout_log_file: str = "./rollout_log.jsonl", + enable_profiling: bool = False, + n_behind: int = 0, ): super().__init__( producer_idx, @@ -415,6 +434,8 @@ def __init__( wandb_group_name=wandb_group_name, log_rollout_interval=log_rollout_interval, rollout_log_file=rollout_log_file, + enable_profiling=enable_profiling, + n_behind=n_behind, ) self.model = self.backend_cls(model_config, generate_config, self.tokenizer, num_generations) self.eval_generation_config = copy.deepcopy(self.model.generate_config) diff --git a/applications/ColossalChat/coati/distributed/profiling_utils.py b/applications/ColossalChat/coati/distributed/profiling_utils.py new file mode 100644 index 000000000000..1c1169b50ada --- /dev/null +++ b/applications/ColossalChat/coati/distributed/profiling_utils.py @@ -0,0 +1,37 @@ +import os +import time + + +class CustomProfiler: + def __init__(self, name, disabled=True): + self.disabled = disabled + if not disabled: + self.name = name + self.pid = os.getpid() + self.file = open(f"{name}.prof", "w") + + def _log(self, message): + if self.disabled: + return + current_time = time.time() + self.file.write(f"{current_time} {self.name} {self.pid}:: {message}\n") + self.file.flush() + + def log(self, message): + if self.disabled: + return + current_time = time.time() + self.file.write(f"[Log]: {current_time} {self.name} {self.pid}:: {message}\n") + self.file.flush() + + def enter(self, event_name): + self._log(f"Enter {event_name}") + + def exit(self, event_name): + self._log(f"Exit {event_name}") + + def close(self): + if self.disabled: + return + self.file.close() + print(f"Profiler data written to {self.name}.prof") diff --git a/applications/ColossalChat/profiling.sh b/applications/ColossalChat/profiling.sh new file mode 100755 index 000000000000..d9f3d9a931df --- /dev/null +++ b/applications/ColossalChat/profiling.sh @@ -0,0 +1,13 @@ +export VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 + +# 8K context length +# rm -rf *.prof +# MAX_NEW_TOKENS=$((8192-512)) +# python rl_example.py --dataset /mnt/nfs/yeanbang/experiments/RLHF/grpo/train-alignment-samll.jsonl --model /home/share/data/model/Qwen2.5-Math-7B/ -t 4 -i 4 -b vllm -a GRPO -ibs 16 -tbs 16 -e 1 -rt boxed -si 100 -s "Please reason step by step, and put your final answer within \\boxed{}." 
-tmbs 4 -p GRPO-Math-Profile -ei -5 -zero 1 -pp 2 -mnt $MAX_NEW_TOKENS -nb 1 --enable_profiling 2>&1| tee ibs_64_tbs_32_tmbs_2_pp_2_ptp_2_8192_GRPO_profile.txt +# python profile_grpo.py --visualization actor_timelines_ibs_64_tbs_32_tmbs_2_pp_2_ptp_2_8192_GRPO_profile.png + +# 4K context length +rm -rf *.prof +MAX_NEW_TOKENS=$((4096-512)) +python rl_example.py --dataset /mnt/nfs/yeanbang/experiments/RLHF/grpo/train-alignment-samll.jsonl --model /home/share/data/model/Qwen2.5-Math-7B/ -t 4 -i 4 -b vllm -a GRPO -ibs 8 -tbs 8 -e 1 -rt boxed -si 100 -s "Please reason step by step, and put your final answer within \\boxed{}." -tMbs 4 -tmbs 4 -p GRPO-Math-Profile -ei -5 -zero 2 -mnt $MAX_NEW_TOKENS -nb 1 --enable_profiling 2>&1| tee ibs_32_tbs_32_tmbs_4_zero_2_4096_GRPO_profile.txt +python visualization.py --visualization actor_timelines_ibs_32_tbs_32_tmbs_4_zero_2_4096_GRPO_profile.png diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index c60923e00cca..d7b7c2a5d0fc 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -67,6 +67,27 @@ default=2, help="Effective batch size per dp group for forwarding and backwarding. Please select based on the availiable memory.", ) + parser.add_argument( + "-tp", + "--tensor-parallel-size", + type=int, + default=1, + help="Tensor parallel size for the trainer (consumer). Please check the generation arguments documentation for your backend.", + ) + parser.add_argument( + "-pp", + "--pipeline-parallel-size", + type=int, + default=1, + help="Pipeline parallel size for the trainer (consumer). Please check the generation arguments documentation for your backend.", + ) + parser.add_argument( + "-zero", + "--zero-stage", + type=int, + default=0, + help="Zero stage for the trainer (consumer). Please check the generation arguments documentation for your backend.", + ) parser.add_argument( "--ray_dir", type=str, default=None, help="Custom temperary directory for storing ray cluster data, Optional" ) @@ -97,6 +118,13 @@ parser.add_argument("-s", "--system-prompt", type=str, default=None, help="System prompt for data construction.") parser.add_argument("-mnt", "--max-new-tokens", type=int, default=1024 * 4 - 512, help="Max length for generation.") parser.add_argument("-mpt", "--max-prompt-tokens", type=int, default=512, help="Max length for prompt.") + parser.add_argument( + "-ptp", + "--producer-tensor-parallel-size", + type=int, + default=1, + help="Tensor parallel size for the producer. Please check the generation arguments documentation for your backend.", + ) # GRPO parameters parser.add_argument("-a", "--algo", type=str, default="GRPO", choices=["DAPO", "GRPO"]) @@ -117,6 +145,13 @@ default=100, help="Interval for evaluation. Evaluate every ei training steps.", ) + parser.add_argument( + "-nb", + "--n-behind", + type=int, + default=0, + help="Number of producer batches to rollout to fill the data buffer before trainer starts to decrease bubble time", + ) # Logging/Checkpointing parameters parser.add_argument("-si", "--save-interval", type=int, default=100, help="Interval for saving checkpoints.") @@ -128,32 +163,7 @@ "-rsd", "--rollout-save-dir", type=str, default="./rollouts", help="Directory for saving rollout loggings." ) parser.add_argument( - "-tp", - "--tensor-parallel-size", - type=int, - default=1, - help="Tensor parallel size for the trainer (consumer). 
Please check the generation arguments documentation for your backend.", - ) - parser.add_argument( - "-pp", - "--pipeline-parallel-size", - type=int, - default=1, - help="Pipeline parallel size for the trainer (consumer). Please check the generation arguments documentation for your backend.", - ) - parser.add_argument( - "-zero", - "--zero-stage", - type=int, - default=0, - help="Zero stage for the trainer (consumer). Please check the generation arguments documentation for your backend.", - ) - parser.add_argument( - "-ptp", - "--producer-tensor-parallel-size", - type=int, - default=1, - help="Tensor parallel size for the producer. Please check the generation arguments documentation for your backend.", + "--enable_profiling", action="store_true", default=False, help="Enable profiling for the training process." ) args = parser.parse_args() @@ -236,14 +246,25 @@ tensor_parallel_size=args.producer_tensor_parallel_size, ) ) - generate_config.update( - dict( - max_tokens=args.max_new_tokens, # max new tokens - ignore_eos=True if args.reward_type == "think_answer_tags" else False, - include_stop_str_in_output=True, - stop=[""] if args.reward_type == "think_answer_tags" else None, + if args.enable_profiling: + # If profiling is enabled, we force model to generate to max_new_tokens + generate_config.update( + dict( + max_tokens=args.max_new_tokens, # max new tokens + ignore_eos=True, + include_stop_str_in_output=True, + stop=None, + ) + ) + else: + generate_config.update( + dict( + max_tokens=args.max_new_tokens, # max new tokens + ignore_eos=True if args.reward_type == "think_answer_tags" else False, + include_stop_str_in_output=True, + stop=[""] if args.reward_type == "think_answer_tags" else None, + ) ) - ) eval_generation_config = {"temperature": 0.6} # used to update generation config for evaluation else: raise ValueError(f"Unsupported backend: {args.backend}") @@ -353,4 +374,6 @@ eval_generation_config=eval_generation_config, log_rollout_interval=20, rollout_save_dir=args.rollout_save_dir, + enable_profiling=args.enable_profiling, + n_behind=args.n_behind, ) diff --git a/applications/ColossalChat/visualization.py b/applications/ColossalChat/visualization.py new file mode 100644 index 000000000000..afb729884ad3 --- /dev/null +++ b/applications/ColossalChat/visualization.py @@ -0,0 +1,100 @@ +# Re-import required libraries due to kernel reset +import argparse +from collections import defaultdict + +import matplotlib.cm as cm +import matplotlib.pyplot as plt + +# Argument parser for command line arguments +parser = argparse.ArgumentParser(description="Process profiling logs and generate a timeline plot.") +parser.add_argument("--visualization", type=str, default="actor_timelines.png", help="Path to the visualization file.") +args = parser.parse_args() + +# Raw log lines +log_lines = [] + +import glob + +files = glob.glob("*.prof") +for file in files: + with open(file, "r") as f: + log_lines += f.readlines() + +# Parse logs and collect function intervals grouped by actor +actors = defaultdict(lambda: defaultdict(list)) +current_entries = {} + +# First, collect all timestamps to find the minimum +all_timestamps = [] +parsed_lines = [] + +for line in log_lines: + if line.startswith("[Log]"): + continue + parts = line.split() + timestamp = float(parts[0]) + actor = parts[1] + action = parts[3] + func_name = parts[4] + parsed_lines.append((timestamp, actor, action, func_name)) + all_timestamps.append(timestamp) + +if not all_timestamps: + raise ValueError("No valid log entries found.") + 
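+# Use the earliest timestamp as time zero so every event is plotted as a relative offset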
+min_timestamp = min(all_timestamps) + +for timestamp, actor, action, func_name in parsed_lines: + rel_timestamp = timestamp - min_timestamp + key = (actor, func_name) + if action == "Enter": + current_entries[key] = rel_timestamp + elif action == "Exit": + start_time = current_entries.pop(key, None) + if start_time is not None: + actors[actor][func_name].append((start_time, rel_timestamp)) + +# Plotting setup +fig, ax = plt.subplots(figsize=(12, 6)) +colors = cm.get_cmap("tab10", len(actors)) + +actor_offsets = {} +base_offset = 0 +function_spacing = 0.9 + +yticks = [] +yticklabels = [] + +for idx, (actor, func_dict) in enumerate(actors.items()): + actor_offsets[actor] = base_offset + color = colors(idx) + for j, (func, intervals) in enumerate(func_dict.items()): + print(actor, func, intervals) + y_val = base_offset + j * function_spacing + yticks.append(y_val) + yticklabels.append(f"{actor}:{func}") + for start, end in intervals: + if end - start < 1: + end = start + 1 # Ensure all lines are at least 3 units long + ax.plot( + [start, end], + [y_val, y_val], + color=color, + linewidth=2, + label=actor if j == 0 else "", + ) + base_offset += len(func_dict) * function_spacing + 1 + +# Formatting +ax.set_yticks(yticks) +ax.set_yticklabels(yticklabels) +ax.set_xlabel("Time") +ax.set_title("Timeline per Actor") +# Remove duplicate labels in legend +handles, labels = ax.get_legend_handles_labels() +unique = dict(zip(labels, handles)) +ax.legend(unique.values(), unique.keys()) +plt.tight_layout() +plt.grid(True) +plt.savefig(args.visualization, dpi=600) # Increase dpi for higher resolution +print(f"Plot saved as {args.visualization}") From a992da9f0f7adc9c185e3bfb07d937fee3272a03 Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Mon, 14 Jul 2025 16:25:03 +0800 Subject: [PATCH 095/100] fix code evaluation --- .../coati/distributed/producer.py | 4 +- .../reward/code_reward/testing_util.py | 19 ++++++--- .../distributed/reward/code_reward/utils.py | 40 ++++++++++++------- .../coati/distributed/reward/reward_fn.py | 17 ++++++-- applications/ColossalChat/rl_example.py | 19 +++++++-- .../ColossalChat/start_code_verifier.py | 35 ++++++++++++++++ 6 files changed, 104 insertions(+), 30 deletions(-) create mode 100644 applications/ColossalChat/start_code_verifier.py diff --git a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py index 2a37463913ee..11fb5d3aa3a9 100644 --- a/applications/ColossalChat/coati/distributed/producer.py +++ b/applications/ColossalChat/coati/distributed/producer.py @@ -83,7 +83,7 @@ def __init__( reward_model_kwargs = { k: v for k, v in grpo_config.items() - if k in ["soft_over_length_punishment", "max_new_tokens", "cache_length"] + if k in ["soft_over_length_punishment", "max_new_tokens", "cache_length", "code_verifier_api_url"] } self.response_format_tags = grpo_config.get("response_format_tags", None) if producer_idx == 0: @@ -250,7 +250,7 @@ def loop(self) -> None: for m in range(eval_outputs["input_ids"].size(0)) for n in range(eval_outputs["input_ids"].size(1)) ] - eval_statistics_tensor[0] += len([res for res in eval_results if res["ans_valid"] == 1]) + eval_statistics_tensor[0] += sum([max(0, res["ans_valid"]) for res in eval_results]) eval_statistics_tensor[1] += len(eval_results) allreduce(eval_statistics_tensor, op=ReduceOp.SUM, group_name="producer_group") to_log_msg[f"eval/{eval_task_name}"] = ( diff --git a/applications/ColossalChat/coati/distributed/reward/code_reward/testing_util.py 
b/applications/ColossalChat/coati/distributed/reward/code_reward/testing_util.py index c6d6d8fad06e..12973337e2a6 100644 --- a/applications/ColossalChat/coati/distributed/reward/code_reward/testing_util.py +++ b/applications/ColossalChat/coati/distributed/reward/code_reward/testing_util.py @@ -89,7 +89,7 @@ def clean_traceback(error_traceback): return error_traceback -def run_test(in_outs, test=None, debug=False, timeout=15): +def run_test(in_outs, test=None, debug=False, timeout=15, run_all_tests=False): """ if test(generated_code) is not None it'll try to run the code. otherwise it'll just return an input and output pair. @@ -180,8 +180,8 @@ def run_test(in_outs, test=None, debug=False, timeout=15): tmp_test = new_test sol += tmp_test - if debug: - print(f"sol = {sol}") + # if debug: + # print(f"sol = {sol}") method_name = "code" signal.alarm(timeout) try: @@ -202,8 +202,7 @@ def run_test(in_outs, test=None, debug=False, timeout=15): } signal.alarm(0) if debug: - print(f"get method = {datetime.now().time()}") - + print(f"get method {method_name} = {datetime.now().time()}") try: method = getattr(tmp, method_name) # get_attr second arg must be str except Exception: @@ -329,6 +328,9 @@ def run_test(in_outs, test=None, debug=False, timeout=15): error_traceback = traceback.format_exc() print(f"Call-based runtime error or time limit exceeded error = {repr(e)}{e}") results.append(-1) + signal.alarm(0) + if run_all_tests: + continue return results, { "error": repr(e), "traceback": clean_traceback(error_traceback), @@ -519,6 +521,10 @@ def run_test(in_outs, test=None, debug=False, timeout=15): results.append(tmp_result) if tmp_result is not True: + if debug: + print("final result:", results) + if run_all_tests: + continue return results, { "output": raw_true_output_copy, "expected": raw_outputs, @@ -539,7 +545,8 @@ def run_test(in_outs, test=None, debug=False, timeout=15): ) print(f"results = {results}") - + if debug: + print("final results", results) return results, {} diff --git a/applications/ColossalChat/coati/distributed/reward/code_reward/utils.py b/applications/ColossalChat/coati/distributed/reward/code_reward/utils.py index e19e9e387b71..dbb3b2149780 100644 --- a/applications/ColossalChat/coati/distributed/reward/code_reward/utils.py +++ b/applications/ColossalChat/coati/distributed/reward/code_reward/utils.py @@ -16,27 +16,24 @@ # limitations under the License. import multiprocessing -import os -import sys import traceback from typing import Optional +import requests + from .testing_util import run_test def _temp_run(sample, generation, debug, result, metadata_list, timeout): - with open(os.devnull, "w") as devnull: - sys.stdout = devnull - sys.stderr = devnull - try: - res, metadata = run_test(in_outs=sample, test=generation, debug=debug, timeout=timeout) - result.append(res) - metadata_list.append(metadata) - except Exception: - # print(e) # some tracebacks are extremely long. - traceback.print_exc(10) - result.append([-1 for i in range(len(sample["inputs"]))]) - metadata_list.append({}) + try: + res, metadata = run_test(in_outs=sample, test=generation, debug=debug, timeout=timeout) + result.append(res) + metadata_list.append(metadata) + except Exception: + # print(e) # some tracebacks are extremely long. 
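+        # Fall back to marking every test case in this sample as failed (-1)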
+ traceback.print_exc(10) + result.append([-1 for i in range(len(sample["inputs"]))]) + metadata_list.append({}) def check_correctness(in_outs: Optional[dict], generation, timeout=10, debug=True): @@ -49,7 +46,7 @@ def check_correctness(in_outs: Optional[dict], generation, timeout=10, debug=Tru metadata_list = manager.list() p = multiprocessing.Process(target=_temp_run, args=(in_outs, generation, debug, result, metadata_list, timeout)) p.start() - p.join(timeout=timeout + 1) + p.join(timeout=600) # Global timeout of 10 minutes that's for all test cases combined if p.is_alive(): p.kill() # p.terminate() @@ -59,3 +56,16 @@ def check_correctness(in_outs: Optional[dict], generation, timeout=10, debug=Tru if debug: print("global timeout") return result[0], metadata_list + + +def check_correctness_code_api( + in_outs: Optional[dict], generation, timeout=10, debug=True, url="http://localhost:8000/check_correctness" +): + payload = {"in_outs": in_outs, "generation": generation, "timeout": timeout, "debug": debug} + response = requests.post(url, json=payload) + if response.status_code == 200: + results = response.json() + return results["result"], results["metadata"] + else: + print(f"Error: {response.status_code} - {response.text}") + return [-1 for i in range(len(in_outs["inputs"]))], {} diff --git a/applications/ColossalChat/coati/distributed/reward/reward_fn.py b/applications/ColossalChat/coati/distributed/reward/reward_fn.py index 5c8810fdf4cc..f7a2fb89cadb 100644 --- a/applications/ColossalChat/coati/distributed/reward/reward_fn.py +++ b/applications/ColossalChat/coati/distributed/reward/reward_fn.py @@ -24,7 +24,7 @@ from latex2sympy2_extended import NormalizationConfig from math_verify import ExprExtractionConfig, LatexExtractionConfig, parse, verify -from .code_reward.utils import check_correctness as check_correctness_code +from .code_reward.utils import check_correctness_code_api as check_correctness_code from .reward_utils import extract_boxed_solution, extract_solution, validate_response_structure CANNOT_PARSE_GT_ANSWER = -1 @@ -223,6 +223,7 @@ def boxed_math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): def code_reward_fn(input_ids, test_cases, response_idx, **kwargs): + url = kwargs.get("url", "http://localhost:8000/check_correctness") tokenizer = kwargs["tokenizer"] eval_mode = kwargs.get("eval_mode", False) soft_over_length_punishment = kwargs.get("soft_over_length_punishment", False) @@ -255,6 +256,9 @@ def code_reward_fn(input_ids, test_cases, response_idx, **kwargs): if format_valid: format_acc += 1 + res = [] + metadata = [] + try: try: if not isinstance(test_cases, dict): @@ -264,15 +268,18 @@ def code_reward_fn(input_ids, test_cases, response_idx, **kwargs): raise e # Complete check on all in-out pairs first. If there is no failure, per-sample test can be skipped. 
try: - res, metadata = check_correctness_code(in_outs=test_cases, generation=solution, timeout=10, debug=True) + res, metadata = check_correctness_code( + in_outs=test_cases, generation=solution, timeout=10, debug=False, url=url + ) metadata = dict(enumerate(metadata))[0] - success = all(map(lambda x: x is True, res)) + success = all(map(lambda x: x == 1, res)) if success: ans_acc += 1 if eval_mode or format_valid: reward += acc_score if not eval_mode: reward = reward + length_reward + except Exception: pass @@ -288,7 +295,9 @@ def code_reward_fn(input_ids, test_cases, response_idx, **kwargs): return { "prompt": prompt, "prediction": decoded_final_answer, - "gold": test_cases["outputs"], + "test_cases": test_cases, + "test_results": res, + "test_metadata": metadata, "parsed": solution, "format_valid": format_acc.item(), "ans_valid": ans_acc.item(), diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index d7b7c2a5d0fc..08814f9f1e61 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -12,6 +12,9 @@ "code": "You are a helpful assistant.", } +# bypass the proxy for local addresses +os.environ["no_proxy"] = "127.0.0.1,localhost" + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("-m", "--model", type=str, default="Qwen/Qwen2.5-7B") @@ -138,6 +141,13 @@ choices=["think_answer_tags", "boxed", "code"], help="Reward type for GRPO.", ) + parser.add_argument( + "-cv", + "--code-verifier-api-url", + type=str, + default=None, + help="API URL for code verifier. If not provided, the code verifier will be disabled.", + ) parser.add_argument( "-ei", "--eval-interval", @@ -165,6 +175,7 @@ parser.add_argument( "--enable_profiling", action="store_true", default=False, help="Enable profiling for the training process." ) + args = parser.parse_args() if args.train_minibatch_size is None: @@ -188,7 +199,7 @@ namespace="ray-example", runtime_env={ "env_vars": { - # "RAY_DEBUG_POST_MORTEM": "1" # enable post-mortem debugging with ray + # "RAY_DEBUG_POST_MORTEM": "1", # enable post-mortem debugging with ray "TOKENIZERS_PARALLELISM": "false" }, }, @@ -201,7 +212,7 @@ _temp_dir=args.ray_dir, runtime_env={ "env_vars": { - # "RAY_DEBUG_POST_MORTEM": "1" # enable post-mortem debugging with ray + # "RAY_DEBUG_POST_MORTEM": "1", # enable post-mortem debugging with ray "TOKENIZERS_PARALLELISM": "false" }, }, @@ -321,7 +332,9 @@ } else: raise ValueError(f"Unsupported algorithm: {args.algo}") - + if args.reward_type == "code": + assert args.code_verifier_api_url is not None, "Please provide a code verifier API URL for code reward type." 
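+        # Forward the verifier endpoint via grpo_config so the producer-side code reward function can reach it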
+ grpo_config.update({"code_verifier_api_url": args.code_verifier_api_url}) if args.system_prompt is None: # Default system prompt args.system_prompt = DEFAUT_SYSTEM_PROMPT[args.reward_type] diff --git a/applications/ColossalChat/start_code_verifier.py b/applications/ColossalChat/start_code_verifier.py new file mode 100644 index 000000000000..d1924f610698 --- /dev/null +++ b/applications/ColossalChat/start_code_verifier.py @@ -0,0 +1,35 @@ +from typing import List, Optional + +from coati.distributed.reward.code_reward.utils import check_correctness # Assuming utils.py is in the same directory +from fastapi import FastAPI, HTTPException +from pydantic import BaseModel + +app = FastAPI() + + +class CheckCorrectnessRequest(BaseModel): + in_outs: Optional[dict] + generation: str + timeout: int = 10 + debug: bool = True + eval_mode: bool = False + + +class CheckCorrectnessResponse(BaseModel): + result: List[int] + metadata: List[dict] + + +@app.post("/check_correctness", response_model=CheckCorrectnessResponse) +def check_correctness_api(request: CheckCorrectnessRequest): + try: + result, metadata = check_correctness( + in_outs=request.in_outs, + generation=request.generation, + timeout=request.timeout, + debug=request.debug, + eval_mode=request.eval_mode, + ) + return CheckCorrectnessResponse(result=result, metadata=metadata) + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) From d8504752083f53dbfab03c82acd470f5df74f8fa Mon Sep 17 00:00:00 2001 From: YeAnbang Date: Mon, 14 Jul 2025 18:23:39 +0800 Subject: [PATCH 096/100] fix style --- .../coati/distributed/reward/code_reward/testing_util.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/reward/code_reward/testing_util.py b/applications/ColossalChat/coati/distributed/reward/code_reward/testing_util.py index 12973337e2a6..19be101f66b7 100644 --- a/applications/ColossalChat/coati/distributed/reward/code_reward/testing_util.py +++ b/applications/ColossalChat/coati/distributed/reward/code_reward/testing_util.py @@ -180,8 +180,6 @@ def run_test(in_outs, test=None, debug=False, timeout=15, run_all_tests=False): tmp_test = new_test sol += tmp_test - # if debug: - # print(f"sol = {sol}") method_name = "code" signal.alarm(timeout) try: From 4cf5ce20bfad4a5a27b6fbac6cae5194b9916e66 Mon Sep 17 00:00:00 2001 From: YeAnbang <44796419+YeAnbang@users.noreply.github.com> Date: Thu, 17 Jul 2025 15:05:10 +0800 Subject: [PATCH 097/100] add entropy (#6363) --- .../coati/distributed/grpo_consumer.py | 39 ++++++++++++++++++- .../ColossalChat/coati/distributed/utils.py | 10 +++++ 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index f8ce1afde4b4..754f780979a9 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -6,7 +6,7 @@ import wandb from coati.distributed.consumer import BaseConsumer from coati.distributed.loss import PolicyLoss -from coati.distributed.utils import memory_efficient_logprob +from coati.distributed.utils import entropy_from_logits, memory_efficient_logprob from coati.trainer.utils import all_reduce_mean, all_reduce_sum from transformers import AutoModelForCausalLM, AutoTokenizer @@ -75,6 +75,7 @@ def __init__( self.optimizer = HybridAdam(self.policy_model.parameters(), lr=grpo_config.get("lr", 1e-6)) self.accum_loss = torch.zeros(1, 
device=self.device) self.accum_kl = torch.zeros(1, device=self.device) + self.accum_entropy = torch.zeros(1, device=self.device) self.accum_advantages = torch.zeros(1, device=self.device) self.raw_train_batch_reward = [] self.raw_train_batch_format_acc = [] @@ -244,6 +245,7 @@ def step(self, step_idx: int, pbar: Any, **kwargs) -> Optional[float]: else self.booster.no_sync(self.policy_model, self.optimizer) ) with ctx: + mini_batch_entropies = [] for forward_micro_batch_start in range(0, data["input_ids"].size(0), train_microbatch_size): input_ids_forward_micro_batch = data["input_ids"][ forward_micro_batch_start : forward_micro_batch_start + train_microbatch_size @@ -310,9 +312,11 @@ def step(self, step_idx: int, pbar: Any, **kwargs) -> Optional[float]: data_policy_forward["reference_action_log_probs"] = reference_action_log_probs kl = [] + policy_model_logits = torch.empty_like(input_ids_forward_micro_batch, device=self.device) def _criterion(outputs, inputs): action_logits = outputs.logits + policy_model_logits.copy_(action_logits) action_log_probs = memory_efficient_logprob( action_logits / self.generate_config["temperature"], inputs["input_ids"], @@ -359,6 +363,20 @@ def _criterion(outputs, inputs): kl = all_reduce_mean(torch.mean(torch.stack(kl)).to(loss.device), self.plugin).data mean_kl.append(kl) mean_loss.append(all_reduce_mean(loss, self.plugin).data) + mini_batch_entropies.append( + all_reduce_mean( + ( + ( + ( + entropy_from_logits(policy_model_logits[:, -num_action:]) + * action_mask_forward_micro_batch + ).sum(-1) + ) + / action_mask_forward_micro_batch.sum(-1) + ).detach(), + self.plugin, + ) + ) else: policy_model_logits = self.policy_model( input_ids=input_ids_forward_micro_batch, @@ -412,6 +430,20 @@ def _criterion(outputs, inputs): kl = all_reduce_mean(kl.mean(), self.plugin) mean_kl.append(kl.data) mean_loss.append(loss.data) + mini_batch_entropies.append( + all_reduce_mean( + ( + ( + ( + entropy_from_logits(policy_model_logits[:, -num_action:]) + * action_mask_forward_micro_batch + ).sum(-1) + ) + / action_mask_forward_micro_batch.sum(-1) + ).detach(), + self.plugin, + ) + ) if not self.plugin.pp_size > 1 or ( self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() @@ -423,7 +455,9 @@ def _criterion(outputs, inputs): ans_acc = all_reduce_mean(ans_acc.mean(), self.plugin) advantages = all_reduce_mean(advantages.mean(), self.plugin) response_length = all_reduce_mean(response_length.mean(), self.plugin) + entropy = torch.cat(mini_batch_entropies, dim=0).mean() self.accum_loss.add_(sum(mean_loss) / len(mean_loss)) + self.accum_entropy.add_(entropy.data) if self.policy_loss_fn.beta > 0: self.accum_kl.add_(sum(mean_kl) / len(mean_kl)) self.accum_advantages.add_(advantages.data) @@ -464,6 +498,7 @@ def _criterion(outputs, inputs): f"Response Length: {raw_batch_response_len_mean:.4f}", f"Sample_utilization: {sample_utilization:.4f}", f"Overlength samples ratio: {overlength_samples_ratio:.4f}", + f"Entropy: {self.accum_entropy.item() / self.accum_count:.4f}", ] + ([f"KL: {self.accum_kl.item() / self.accum_count:.4f}"] if self.policy_loss_fn.beta > 0 else []) print("\n".join(to_log_msg)) metrics = { @@ -475,6 +510,7 @@ def _criterion(outputs, inputs): "train/advantages": self.accum_advantages.item() / self.accum_count, "train/learning_rate": self.lr_scheduler.get_last_lr()[0], "train/sample_utilization": sample_utilization, + "train/entropy": self.accum_entropy.item() / self.accum_count, "train/overlength_samples_ratio": overlength_samples_ratio, 
"rollout/temperature": data["temperature"].cpu().numpy()[0][0], } @@ -484,6 +520,7 @@ def _criterion(outputs, inputs): self.wandb_run.log(metrics) self.accum_loss.zero_() self.accum_kl.zero_() + self.accum_entropy.zero_() self.accum_advantages.zero_() self.accum_count = 0 return loss_scalar diff --git a/applications/ColossalChat/coati/distributed/utils.py b/applications/ColossalChat/coati/distributed/utils.py index d46243114eea..466914cc0d4d 100644 --- a/applications/ColossalChat/coati/distributed/utils.py +++ b/applications/ColossalChat/coati/distributed/utils.py @@ -110,6 +110,16 @@ def memory_efficient_logprob( return action_log_probs +def entropy_from_logits(logits: torch.Tensor) -> torch.Tensor: + """ + Calculate entropy + Reference: https://github.com/volcengine/verl/blob/96b730bbed80292a439f0c0057d3920ab8b28d52/verl/utils/torch_functional.py#L145 + """ + p = torch.nn.functional.softmax(logits, dim=-1) + entropy = torch.logsumexp(logits, dim=-1) - torch.sum(p * logits, dim=-1) + return entropy + + def masked_mean(tensor: torch.Tensor, mask: torch.Tensor, dim: int = 1) -> torch.Tensor: """ Compute the masked mean of a tensor along a specified dimension. From 57e92104a23e2e9085c71d70e14a1d288018926f Mon Sep 17 00:00:00 2001 From: YeAnbang <44796419+YeAnbang@users.noreply.github.com> Date: Tue, 22 Jul 2025 10:02:02 +0800 Subject: [PATCH 098/100] hotfix entropy calculation (#6364) --- .../coati/distributed/grpo_consumer.py | 32 ++++++++----------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index 754f780979a9..a3f1a1cbbbb2 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -250,6 +250,9 @@ def step(self, step_idx: int, pbar: Any, **kwargs) -> Optional[float]: input_ids_forward_micro_batch = data["input_ids"][ forward_micro_batch_start : forward_micro_batch_start + train_microbatch_size ] + old_action_log_probs_micro_batch = old_action_log_probs[ + forward_micro_batch_start : forward_micro_batch_start + train_microbatch_size + ] attention_mask_forward_micro_batch = data["attention_mask"][ forward_micro_batch_start : forward_micro_batch_start + train_microbatch_size ] @@ -306,17 +309,22 @@ def step(self, step_idx: int, pbar: Any, **kwargs) -> Optional[float]: "action_mask": action_mask_forward_micro_batch, "advantages": advantages_forward_micro_batch, "loss_mask": loss_mask_forward_micro_batch, + "old_action_log_probs": old_action_log_probs_micro_batch, "source": self.rank, } if reference_action_log_probs is not None: data_policy_forward["reference_action_log_probs"] = reference_action_log_probs kl = [] - policy_model_logits = torch.empty_like(input_ids_forward_micro_batch, device=self.device) def _criterion(outputs, inputs): action_logits = outputs.logits - policy_model_logits.copy_(action_logits) + mini_batch_entropies.append( + ( + ((entropy_from_logits(action_logits[:, -num_action:]) * inputs["action_mask"]).sum(-1)) + / inputs["action_mask"].sum(-1) + ).detach() + ) action_log_probs = memory_efficient_logprob( action_logits / self.generate_config["temperature"], inputs["input_ids"], @@ -339,7 +347,7 @@ def _criterion(outputs, inputs): loss, _ = self.policy_loss_fn( action_log_probs, - action_log_probs, + inputs["old_action_log_probs"], inputs["advantages"].repeat_interleave(action_log_probs.size(-1), dim=-1), per_token_kl, inputs["action_mask"], @@ 
-363,20 +371,6 @@ def _criterion(outputs, inputs): kl = all_reduce_mean(torch.mean(torch.stack(kl)).to(loss.device), self.plugin).data mean_kl.append(kl) mean_loss.append(all_reduce_mean(loss, self.plugin).data) - mini_batch_entropies.append( - all_reduce_mean( - ( - ( - ( - entropy_from_logits(policy_model_logits[:, -num_action:]) - * action_mask_forward_micro_batch - ).sum(-1) - ) - / action_mask_forward_micro_batch.sum(-1) - ).detach(), - self.plugin, - ) - ) else: policy_model_logits = self.policy_model( input_ids=input_ids_forward_micro_batch, @@ -415,7 +409,7 @@ def _criterion(outputs, inputs): loss, _ = self.policy_loss_fn( action_log_probs, - old_action_log_probs, + old_action_log_probs_micro_batch, advantages_forward_micro_batch.repeat_interleave(action_log_probs.size(-1), dim=-1), per_token_kl, action_mask_forward_micro_batch, @@ -455,7 +449,7 @@ def _criterion(outputs, inputs): ans_acc = all_reduce_mean(ans_acc.mean(), self.plugin) advantages = all_reduce_mean(advantages.mean(), self.plugin) response_length = all_reduce_mean(response_length.mean(), self.plugin) - entropy = torch.cat(mini_batch_entropies, dim=0).mean() + entropy = all_reduce_mean(torch.cat(mini_batch_entropies, dim=0).mean(), self.plugin) self.accum_loss.add_(sum(mean_loss) / len(mean_loss)) self.accum_entropy.add_(entropy.data) if self.policy_loss_fn.beta > 0: From d7b140df7a8173b5677ea6df80920595a23cfbb9 Mon Sep 17 00:00:00 2001 From: liuqh16 Date: Wed, 30 Jul 2025 18:40:08 +0800 Subject: [PATCH 099/100] fix: wrong dp-rank condition when enable pp --- applications/ColossalChat/coati/distributed/grpo_consumer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index a3f1a1cbbbb2..d1c8f2a7a0e5 100644 --- a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -442,7 +442,6 @@ def _criterion(outputs, inputs): self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() and self.tp_rank == 0 - and self.dp_rank == 0 ): reward = all_reduce_mean(reward.mean(), self.plugin) format_acc = all_reduce_mean(format_acc.mean(), self.plugin) @@ -469,7 +468,7 @@ def _criterion(outputs, inputs): self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() and self.tp_rank == 0 ): if (not self.plugin.pp_size > 1 and self.rank == 0) or ( - self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() and self.tp_rank == 0 + self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() and self.tp_rank == 0 and self.dp_rank == 0 ): raw_batch_reward_mean = torch.cat(self.raw_train_batch_reward, dim=0).mean().cpu().item() raw_batch_format_acc_mean = torch.cat(self.raw_train_batch_format_acc, dim=0).mean().cpu().item() From d0bc901bcbd745d56030aae467afe382d288592f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 30 Jul 2025 11:50:36 +0000 Subject: [PATCH 100/100] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../ColossalChat/coati/distributed/grpo_consumer.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/applications/ColossalChat/coati/distributed/grpo_consumer.py b/applications/ColossalChat/coati/distributed/grpo_consumer.py index d1c8f2a7a0e5..423a7afab71a 100644 --- 
a/applications/ColossalChat/coati/distributed/grpo_consumer.py +++ b/applications/ColossalChat/coati/distributed/grpo_consumer.py @@ -439,9 +439,7 @@ def _criterion(outputs, inputs): ) ) if not self.plugin.pp_size > 1 or ( - self.plugin.pp_size > 1 - and self.booster.plugin.stage_manager.is_last_stage() - and self.tp_rank == 0 + self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() and self.tp_rank == 0 ): reward = all_reduce_mean(reward.mean(), self.plugin) format_acc = all_reduce_mean(format_acc.mean(), self.plugin) @@ -468,7 +466,10 @@ def _criterion(outputs, inputs): self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() and self.tp_rank == 0 ): if (not self.plugin.pp_size > 1 and self.rank == 0) or ( - self.plugin.pp_size > 1 and self.booster.plugin.stage_manager.is_last_stage() and self.tp_rank == 0 and self.dp_rank == 0 + self.plugin.pp_size > 1 + and self.booster.plugin.stage_manager.is_last_stage() + and self.tp_rank == 0 + and self.dp_rank == 0 ): raw_batch_reward_mean = torch.cat(self.raw_train_batch_reward, dim=0).mean().cpu().item() raw_batch_format_acc_mean = torch.cat(self.raw_train_batch_format_acc, dim=0).mean().cpu().item()
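
As a quick, standalone sanity check for the `entropy_from_logits` helper introduced in the "add entropy" patch above — a minimal sketch, not part of the patch series, which only assumes `torch` and re-states the helper exactly as defined in `coati/distributed/utils.py`: uniform logits should yield an entropy of `log(vocab_size)`, while a near-deterministic distribution should yield an entropy close to zero.

```python
# Minimal sketch (not part of the patch): numerical sanity check for entropy_from_logits.
import math

import torch


def entropy_from_logits(logits: torch.Tensor) -> torch.Tensor:
    # Same definition as in coati/distributed/utils.py above.
    p = torch.nn.functional.softmax(logits, dim=-1)
    return torch.logsumexp(logits, dim=-1) - torch.sum(p * logits, dim=-1)


vocab_size = 8
uniform_logits = torch.zeros(1, vocab_size)        # equal logits -> uniform distribution
peaked_logits = torch.full((1, vocab_size), -20.0)
peaked_logits[0, 0] = 20.0                         # one token dominates

print(entropy_from_logits(uniform_logits).item(), math.log(vocab_size))  # both ~2.079
print(entropy_from_logits(peaked_logits).item())                         # ~0.0
```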