From 44f5a1d4e378e9286f974d3ca29ac2bf388840e2 Mon Sep 17 00:00:00 2001
From: Yan Bai <bayan@nvidia.com>
Date: Mon, 23 Feb 2026 22:47:51 -0800
Subject: [PATCH 01/11] add qwen3.5 megatron sft example

---
 examples/sft/gsm8k/run_qwen3_5_megatron.sh  | 143 ++++++++++++++++++++
 verl/models/mcore/model_forward.py          |   2 +-
 verl/utils/dataset/multiturn_sft_dataset.py |  63 +++++++--
 verl/utils/megatron_utils.py                |  11 +-
 verl/utils/model.py                         |  13 +-
 verl/utils/tensordict_utils.py              |  41 +++++-
 verl/workers/engine_workers.py              |  11 +-
 verl/workers/fsdp_workers.py                |  11 +-
 8 files changed, 264 insertions(+), 31 deletions(-)
 create mode 100644 examples/sft/gsm8k/run_qwen3_5_megatron.sh

diff --git a/examples/sft/gsm8k/run_qwen3_5_megatron.sh b/examples/sft/gsm8k/run_qwen3_5_megatron.sh
new file mode 100644
index 00000000000..43ab11ec213
--- /dev/null
+++ b/examples/sft/gsm8k/run_qwen3_5_megatron.sh
@@ -0,0 +1,143 @@
+#!/usr/bin/env bash
+# Qwen3.5-397B-A17B SFT with Megatron backend + mbridge
+#
+# Requirements:
+#   - 128+ GPUs (80GB each, e.g. 16x8 H100/H200)
+#   - Docker: verlai/verl:vllm015 (or equivalent)
+#   - Additional packages on top of the base image:
+#       pip install --upgrade transformers
+#       pip install flash-linear-attention
+#       pip install -U git+https://github.com/ISEEKYAN/mbridge.git
+#   - Megatron-LM dev branch with Qwen3.5 GDN support
+#
+# Qwen3.5 architecture notes:
+#   Qwen3.5 uses Gated Delta Net (GDN) linear attention which currently does
+#   NOT support packed sequences (THD format) in Megatron-LM. Therefore:
+#     - engine.use_remove_padding=False  (forces bshd compute format)
+#     - model.use_remove_padding=True    (keeps NestedTensor in data pipeline)
+#     - data.use_dynamic_bsz=False       (required for bshd mode)
+#
+#   Once https://github.com/NVIDIA/Megatron-LM/pull/2644 is merged, THD
+#   format will be supported and engine.use_remove_padding can be set to True
+#   for better performance.
+#
+# Tested parallelism config (128 GPUs / 16 nodes):
+#   TP=2 PP=4 EP=32 CP=1
+
+set -xeuo pipefail
+
+# ============================================================
+# Distributed
+# ============================================================
+NUM_GPUS=${NUM_GPUS:-8}
+MASTER_ADDR=${MASTER_ADDR:-localhost}
+MASTER_PORT=${MASTER_PORT:-29500}
+NNODES=${NNODES:-16}
+NODE_RANK=${NODE_RANK:-0}
+
+# ============================================================
+# Data
+# ============================================================
+DATASET_DIR=${DATASET_DIR:-~/dataset}
+TRAIN_FILES=${TRAIN_FILES:-${DATASET_DIR}/train.parquet}
+
+# ============================================================
+# Model
+# ============================================================
+MODEL_PATH=${MODEL_PATH:-Qwen/Qwen3.5-397B-A17B}
+
+# ============================================================
+# Parallelism
+# ============================================================
+TP_SIZE=${TP_SIZE:-2}
+PP_SIZE=${PP_SIZE:-4}
+VPP_SIZE=${VPP_SIZE:-null}
+CP_SIZE=${CP_SIZE:-1}
+EP_SIZE=${EP_SIZE:-32}
+ETP_SIZE=${ETP_SIZE:-1}
+
+# ============================================================
+# Training
+# ============================================================
+TRAIN_BATCH_SIZE=${TRAIN_BATCH_SIZE:-128}
+MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-2}
+MAX_LENGTH=${MAX_LENGTH:-2048}
+LR=${LR:-2e-5}
+MIN_LR=${MIN_LR:-2e-6}
+DTYPE=${DTYPE:-bfloat16}
+
+BACKEND=megatron
+RESUME_MODE=${RESUME_MODE:-disable}
+
+project_name=verl_sft_qwen3_5
+exp_name=qwen3_5-${BACKEND}-tp${TP_SIZE}-pp${PP_SIZE}-cp${CP_SIZE}-ep${EP_SIZE}
+ckpts_home=${ckpts_home:-~/verl/checkpoints/${project_name}/${exp_name}}
+mkdir -p "${ckpts_home}"
+
+# ============================================================
+# Engine config
+# ============================================================
+# Key Qwen3.5 settings:
+#   engine.use_remove_padding=False   - GDN requires bshd format (no THD)
+#   engine.vanilla_mbridge=True       - use mbridge (not megatron-bridge)
+ENGINE_CONFIG="\
+    engine=${BACKEND} \
+    optim=${BACKEND} \
+    optim.lr=${LR} \
+    optim.min_lr=${MIN_LR} \
+    optim.lr_warmup_steps=10 \
+    optim.weight_decay=0.1 \
+    optim.betas='[0.9,0.95]' \
+    optim.clip_grad=1.0 \
+    optim.lr_warmup_init=0 \
+    optim.lr_decay_style=cosine \
+    +optim.override_optimizer_config.optimizer_offload_fraction=1 \
+    +optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=True \
+    +optim.override_optimizer_config.use_precision_aware_optimizer=True \
+    +optim.override_optimizer_config.optimizer_cpu_offload=True \
+    engine.tensor_model_parallel_size=${TP_SIZE} \
+    engine.pipeline_model_parallel_size=${PP_SIZE} \
+    engine.virtual_pipeline_model_parallel_size=${VPP_SIZE} \
+    engine.context_parallel_size=${CP_SIZE} \
+    engine.expert_model_parallel_size=${EP_SIZE} \
+    engine.expert_tensor_parallel_size=${ETP_SIZE} \
+    engine.use_mbridge=True \
+    engine.vanilla_mbridge=True \
+    engine.dtype=${DTYPE} \
+    engine.use_remove_padding=False \
+    engine.override_transformer_config.attention_backend=auto \
+    +engine.override_transformer_config.recompute_method=uniform \
+    +engine.override_transformer_config.recompute_granularity=full \
+    +engine.override_transformer_config.recompute_num_layers=1"
+
+# ============================================================
+# Launch
+# ============================================================
+torchrun \
+    --nproc_per_node=${NUM_GPUS} \
+    --nnodes=${NNODES} \
+    --node_rank=${NODE_RANK} \
+    --master_addr=${MASTER_ADDR} \
+    --master_port=${MASTER_PORT} \
+    -m verl.trainer.sft_trainer \
+    data.train_files="${TRAIN_FILES}" \
+    data.train_batch_size=${TRAIN_BATCH_SIZE} \
+    data.micro_batch_size_per_gpu=${MICRO_BATCH_SIZE} \
+    data.max_length=${MAX_LENGTH} \
+    data.pad_mode=no_padding \
+    data.truncation=error \
+    data.use_dynamic_bsz=False \
+    data.max_token_len_per_gpu=${MAX_LENGTH} \
+    data.messages_key=messages \
+    model.path=${MODEL_PATH} \
+    model.use_remove_padding=True \
+    model.trust_remote_code=True \
+    ${ENGINE_CONFIG} \
+    trainer.test_freq=-1 \
+    trainer.save_freq=500 \
+    trainer.logger="['console']" \
+    trainer.project_name="${project_name}" \
+    trainer.experiment_name="${exp_name}" \
+    trainer.total_epochs=1 \
+    trainer.default_local_dir="${ckpts_home}" \
+    trainer.resume_mode=${RESUME_MODE}
diff --git a/verl/models/mcore/model_forward.py b/verl/models/mcore/model_forward.py
index fd160fa86c9..4c82a56f09f 100644
--- a/verl/models/mcore/model_forward.py
+++ b/verl/models/mcore/model_forward.py
@@ -258,7 +258,7 @@ def gptmodel_forward_no_padding(
         output_orig = model(
             input_ids=input_ids_bshd,
             attention_mask=attention_mask_bshd,
-            position_ids=position_ids_bshd,
+            position_ids=None if vision_model else position_ids_bshd,
             **model_kwargs,
         )
         if post_process and logits_processor is not None:
diff --git a/verl/utils/dataset/multiturn_sft_dataset.py b/verl/utils/dataset/multiturn_sft_dataset.py
index 9da33228e21..e8fb4b65e37 100644
--- a/verl/utils/dataset/multiturn_sft_dataset.py
+++ b/verl/utils/dataset/multiturn_sft_dataset.py
@@ -22,6 +22,7 @@
 from functools import wraps
 from typing import Any, Optional
 
+import jinja2
 import numpy as np
 import pandas as pd
 import torch
@@ -215,20 +216,52 @@ def _process_single_message(
         Returns:
             Tuple of (input_ids, loss_mask, attention_mask, dict[str, torch.Tensor])
         """
-        processor = self.processor if self.processor is not None else self.tokenizer
+        has_visual_content = isinstance(message.get("content"), list) and any(
+            isinstance(c, dict) and c.get("type") in ("image", "video") for c in message["content"]
+        )
+        processor = self.processor if self.processor is not None and has_visual_content else self.tokenizer
         apply_chat_template_kwargs = {**self.apply_chat_template_kwargs}
         if enable_thinking is not None:
             apply_chat_template_kwargs["enable_thinking"] = enable_thinking
 
-        inputs = processor.apply_chat_template(
-            [message],
-            tools=tools,
-            add_generation_prompt=False,
-            tokenize=True,
-            return_dict=True,
-            return_tensors="pt",
-            **apply_chat_template_kwargs,
-        )
+        try:
+            inputs = processor.apply_chat_template(
+                [message],
+                tools=tools,
+                add_generation_prompt=False,
+                tokenize=True,
+                return_dict=True,
+                return_tensors="pt",
+                **apply_chat_template_kwargs,
+            )
+        except (jinja2.exceptions.TemplateError, Exception) as e:
+            if "No user query" not in str(e):
+                raise
+            # Chat templates that require a user message (e.g. Qwen3.5) fail
+            # when tokenising a single non-user message. Fallback: tokenise the
+            # conversation up to this turn and subtract the prefix.
+            inputs_full = processor.apply_chat_template(
+                full_message[: index + 1],
+                tools=tools,
+                add_generation_prompt=False,
+                tokenize=True,
+                return_dict=True,
+                return_tensors="pt",
+                **apply_chat_template_kwargs,
+            )
+            prefix_len = 0
+            if index > 0:
+                inputs_prev = processor.apply_chat_template(
+                    full_message[:index],
+                    tools=tools if index == 1 else None,
+                    add_generation_prompt=False,
+                    tokenize=True,
+                    return_dict=True,
+                    return_tensors="pt",
+                    **apply_chat_template_kwargs,
+                )
+                prefix_len = inputs_prev["input_ids"].shape[-1]
+            inputs = {k: v[..., prefix_len:] for k, v in inputs_full.items()}
 
         inputs = dict(inputs)
         input_ids = inputs.pop("input_ids")[0]
@@ -266,14 +299,16 @@ def _build_messages(self, example: dict):
 
         image_offset, video_offset = 0, 0
         for message in messages:
-            if self.image_key not in example and self.video_key not in example:
-                continue
-            assert self.processor is not None, "processor is needed to process image and video"
-
             content = message["content"]
             if not isinstance(content, str):
                 continue
 
+            if self.image_key not in example and self.video_key not in example:
+                if self.processor is not None:
+                    message["content"] = [{"type": "text", "text": content}]
+                continue
+            assert self.processor is not None, "processor is needed to process image and video"
+
             content_list = []
             segments = re.split("(<image>|<video>)", content)
             segments = [item for item in segments if item != ""]
diff --git a/verl/utils/megatron_utils.py b/verl/utils/megatron_utils.py
index 9572fb91962..a17b3de9734 100644
--- a/verl/utils/megatron_utils.py
+++ b/verl/utils/megatron_utils.py
@@ -459,8 +459,15 @@ def load_megatron_model_to_gpu(models, load_grad=True):
                 for buffer in buffers:
                     # sometimes, we don't want to load grad for pure inference
                     if load_grad and hasattr(buffer, "grad_data_size"):
-                        buffer.grad_data.storage().resize_(buffer.grad_data_size)
-                        buffer.grad_data.zero_()
+                        current_storage_size = buffer.grad_data.storage().size()
+                        if current_storage_size == 0 or current_storage_size == buffer.grad_data_size:
+                            buffer.grad_data.storage().resize_(buffer.grad_data_size)
+                            buffer.grad_data.zero_()
+                        else:
+                            # Non-standard layers (e.g. GatedDeltaNet) may have grad
+                            # buffers with mismatched storage size; skip resize and
+                            # zero in-place with current storage.
+                            buffer.grad_data.zero_()
 
                     if buffer.param_data.storage().size() == 0:
                         buffer.param_data.storage().resize_(buffer.param_data_size)
diff --git a/verl/utils/model.py b/verl/utils/model.py
index a59c4c32962..77eabaed2a1 100644
--- a/verl/utils/model.py
+++ b/verl/utils/model.py
@@ -30,15 +30,24 @@
     AutoConfig,
     AutoModel,
     AutoModelForCausalLM,
-    AutoModelForImageTextToText,
     AutoModelForSequenceClassification,
     AutoModelForTokenClassification,
-    AutoModelForVision2Seq,
     GenerationConfig,
     MistralForSequenceClassification,
     PretrainedConfig,
     PreTrainedModel,
 )
+
+try:
+    from transformers import AutoModelForVision2Seq
+except ImportError:
+    AutoModelForVision2Seq = None
+
+try:
+    from transformers import AutoModelForImageTextToText
+except ImportError:
+    AutoModelForImageTextToText = AutoModelForVision2Seq
+
 from transformers.modeling_outputs import CausalLMOutputWithPast
 
 from verl.models.registry import ModelRegistry
diff --git a/verl/utils/tensordict_utils.py b/verl/utils/tensordict_utils.py
index 4946d18eddb..68167671dff 100644
--- a/verl/utils/tensordict_utils.py
+++ b/verl/utils/tensordict_utils.py
@@ -292,20 +292,47 @@ def chunk_tensordict(td: TensorDict, chunks: int) -> list[TensorDict]:
             evenly divisible by chunks.
 
     Note:
-        This is a workaround for PyTorch issue #153238 where torch.chunk()
-        doesn't support 3D jagged tensors (e.g., MRoPE position_ids).
-        See: https://github.com/pytorch/pytorch/issues/153238
+        PyTorch NestedTensor has issues with unbind/indexing on 2D and 3D
+        jagged tensors: unbind() internally calls split_with_sizes() using the
+        ragged lengths, but the underlying storage may be padded to a different
+        size, causing a RuntimeError.
+        - 3D+: https://github.com/pytorch/pytorch/issues/153238
+        - 2D:  select_int -> unbind -> split_with_sizes mismatch
+
+        For NestedTensors that can be chunked directly (regular batch dim with
+        no ragged interaction), we use the standard TensorDict.chunk(). For
+        those that cannot, we pad -> chunk -> unpad as a workaround.
     """
     assert isinstance(td, TensorDict) and len(td) % chunks == 0, (
         f"expecting td with length divisible by chunks, but got {len(td)} and {chunks}"
     )
     chunk_size = len(td) // chunks
-    keys = {key for key, val in td.items() if isinstance(val, torch.Tensor) and val.is_nested and val.dim() >= 3}
-    new_td = TensorDict({k: v for k, v in td.items() if k not in keys}, batch_size=td.batch_size, device=td.device)
+    nested_keys = {key for key, val in td.items() if isinstance(val, torch.Tensor) and val.is_nested}
+    new_td = TensorDict(
+        {k: v for k, v in td.items() if k not in nested_keys}, batch_size=td.batch_size, device=td.device
+    )
 
     tds = new_td.chunk(chunks=chunks)
-    for key in keys:
-        tensors = td[key].unbind(dim=0)
+    for key in nested_keys:
+        nt = td[key]
+        # Try the fast path first: direct unbind works for some NestedTensor
+        # layouts where the batch dim is not entangled with the ragged dim.
+        try:
+            tensors = nt.unbind(dim=0)
+        except RuntimeError:
+            # Fallback: pad -> chunk -> unpad.  This avoids the PyTorch bug
+            # where unbind/split_with_sizes fails because ragged lengths don't
+            # match the (padded) storage size.
+            padded = nt.to_padded_tensor(0)
+            padded_chunks = padded.chunk(chunks, dim=0)
+            offsets = nt.offsets()
+            lengths = offsets.diff().tolist()
+            for i, chunk_td in enumerate(tds):
+                chunk_lengths = lengths[i * chunk_size : (i + 1) * chunk_size]
+                chunk_tensors = [padded_chunks[i][j, :seq_len] for j, seq_len in enumerate(chunk_lengths)]
+                chunk_td[key] = torch.nested.as_nested_tensor(chunk_tensors, layout=torch.jagged)
+            continue
+
         for i, chunk_td in enumerate(tds):
             chunk_td[key] = torch.nested.as_nested_tensor(
                 tensors[i * chunk_size : (i + 1) * chunk_size], layout=torch.jagged
diff --git a/verl/workers/engine_workers.py b/verl/workers/engine_workers.py
index 6f8029600ea..6ea08dea5cd 100644
--- a/verl/workers/engine_workers.py
+++ b/verl/workers/engine_workers.py
@@ -24,6 +24,8 @@
 from tensordict import NonTensorData, TensorDict
 from torch.distributed.device_mesh import init_device_mesh
 
+from verl.workers.config.engine import McoreEngineConfig
+
 try:
     from verl.workers.engine.mindspeed.transformer_impl import repatch
 except ImportError:
@@ -98,9 +100,12 @@ def __init__(self, config: TrainingWorkerConfig):
                 self.model_config, self.device_name
             )
 
-        # we use the one defined in model
-        # TODO: this is not elegant and should refactor later
-        self.engine_config.use_remove_padding = self.model_config.use_remove_padding
+        # For Megatron engine, model.use_remove_padding (data pipeline) and
+        # engine.use_remove_padding (compute format: thd vs bshd) may differ
+        # (e.g. Qwen3.5 GDN requires bshd but still uses NestedTensor in data).
+        # For other engines, keep the original behavior of syncing them.
+        if not isinstance(self.engine_config, McoreEngineConfig):
+            self.engine_config.use_remove_padding = self.model_config.use_remove_padding
         self.engine_config.use_fused_kernels = self.model_config.use_fused_kernels
 
         if repatch is not None:
diff --git a/verl/workers/fsdp_workers.py b/verl/workers/fsdp_workers.py
index 66f02cbb235..ad644363e2c 100644
--- a/verl/workers/fsdp_workers.py
+++ b/verl/workers/fsdp_workers.py
@@ -292,10 +292,17 @@ def _build_model_optimizer(
             AutoConfig,
             AutoModel,
             AutoModelForCausalLM,
-            AutoModelForImageTextToText,
-            AutoModelForVision2Seq,
         )
 
+        try:
+            from transformers import AutoModelForVision2Seq
+        except ImportError:
+            AutoModelForVision2Seq = None
+        try:
+            from transformers import AutoModelForImageTextToText
+        except ImportError:
+            AutoModelForImageTextToText = AutoModelForVision2Seq
+
         from verl.utils.model import get_generation_config, print_model_size, update_model_config
         from verl.utils.torch_dtypes import PrecisionType
 

From d9611d9f5c23ec57a0e4499b5859bb957381f824 Mon Sep 17 00:00:00 2001
From: Yan Bai <bayan@nvidia.com>
Date: Wed, 25 Feb 2026 19:36:18 -0800
Subject: [PATCH 02/11] submit an RL script

---
 .../grpo_trainer/run_qwen3_5-35b-megatron.sh  | 166 ++++++++++++++++++
 verl/models/mcore/model_forward.py            |  19 +-
 verl/models/mcore/registry.py                 |   1 +
 3 files changed, 182 insertions(+), 4 deletions(-)
 create mode 100644 examples/grpo_trainer/run_qwen3_5-35b-megatron.sh

diff --git a/examples/grpo_trainer/run_qwen3_5-35b-megatron.sh b/examples/grpo_trainer/run_qwen3_5-35b-megatron.sh
new file mode 100644
index 00000000000..8da18446c83
--- /dev/null
+++ b/examples/grpo_trainer/run_qwen3_5-35b-megatron.sh
@@ -0,0 +1,166 @@
+#!/usr/bin/env bash
+# Qwen3.5-35B-A3B MoE GRPO RL with Megatron (single node, 8 GPUs, geo3k dataset)
+#
+# notes on vllm:
+#     As of 20260225, the latest vllm nightly does not support qwen3.5 rollout. To use this script, you need to either:
+#         1. wait until vllm supports qwen3.5 officially, and build a verl docker image with that version of vllm, or
+#         2. build a verl docker image yourself with vllm from source with qwen3.5 support (main branch 20260225 is OK)
+#     I succeeded in running this script with the main branch of vllm on 20260225, yet there are still some minor issues
+#     with vllm qwen3.5 during initialization that need to be fixed. Also, cuda_graph is somehow not working and needs to
+#     be fixed, either by the verl team with support for vllm 0.16, or by the vllm team.
+# Requirements:
+#   - 8 GPUs (80GB each, e.g. 1x8 H100/H200)
+#   - Additional packages on top of the base image:
+#       pip install --upgrade transformers
+#       pip install flash-linear-attention
+#       pip install -U git+https://github.com/ISEEKYAN/mbridge.git
+#   - Megatron-LM dev branch with Qwen3.5 GDN support
+#
+# Qwen3.5 architecture notes:
+#   Qwen3.5 uses Gated Delta Net (GDN) linear attention which currently does
+#   NOT support packed sequences (THD format) in Megatron-LM. Therefore:
+#     - actor.megatron.use_remove_padding=False  (forces bshd compute format)
+#     - model.use_remove_padding=True           (keeps NestedTensor in data pipeline)
+#     - actor.use_dynamic_bsz=False              (required for bshd mode)
+#
+#   Once Megatron-LM adds THD support for Qwen3.5 GDN, use_remove_padding
+#   can be set to True for better performance.
+#
+# Tested parallelism config (8 GPUs / 1 node):
+#   TP=2 PP=1 CP=1 EP=8 ETP=1 GEN_TP=8
+#
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export VLLM_USE_V1=1
+export VLLM_ALLREDUCE_USE_SYMM_MEM=0
+
+set -xeuo pipefail
+
+########################### Quick Config ###########################
+
+TP=${TP:-2}
+PP=${PP:-1}
+CP=${CP:-1}
+EP=${EP:-8}
+ETP=${ETP:-1}
+GEN_TP=${GEN_TP:-8}
+
+ALL_OFFLOAD=${ALL_OFFLOAD:-True}
+
+rollout_name="vllm"
+project_name='verl_grpo_qwen3_5_35b_geo3k'
+exp_name='qwen3_5_35b_megatron'
+adv_estimator=grpo
+
+HF_MODEL_PATH=${HF_MODEL_PATH:-"Qwen3.5-35B-A3B"}
+train_path=${train_path:-$HOME/data/geo3k/train.parquet}
+test_path=${test_path:-$HOME/data/geo3k/test.parquet}
+
+########################### Parameter Arrays ###########################
+
+DATA=(
+    data.train_files=${train_path}
+    data.val_files=${test_path}
+    data.train_batch_size=32
+    data.max_prompt_length=1024
+    data.max_response_length=2048
+    data.truncation='error'
+    data.filter_overlong_prompts=True
+)
+
+MODEL=(
+    actor_rollout_ref.model.path=${HF_MODEL_PATH}
+    actor_rollout_ref.model.trust_remote_code=True
+    actor_rollout_ref.model.use_remove_padding=True
+)
+
+ACTOR=(
+    actor_rollout_ref.actor.optim.lr=1e-6
+    actor_rollout_ref.actor.ppo_mini_batch_size=32
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1
+    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=4096
+    actor_rollout_ref.actor.use_dynamic_bsz=False
+    actor_rollout_ref.actor.use_kl_loss=True
+    actor_rollout_ref.actor.kl_loss_coef=0.01
+    actor_rollout_ref.actor.kl_loss_type=low_var_kl
+    actor_rollout_ref.actor.entropy_coeff=0
+    actor_rollout_ref.actor.megatron.use_mbridge=True
+    actor_rollout_ref.actor.megatron.vanilla_mbridge=True
+    actor_rollout_ref.actor.megatron.use_remove_padding=False
+    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${TP}
+    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${PP}
+    actor_rollout_ref.actor.megatron.context_parallel_size=${CP}
+    actor_rollout_ref.actor.megatron.expert_model_parallel_size=${EP}
+    actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${ETP}
+    actor_rollout_ref.actor.megatron.param_offload=${ALL_OFFLOAD}
+    actor_rollout_ref.actor.megatron.optimizer_offload=${ALL_OFFLOAD}
+    actor_rollout_ref.actor.megatron.grad_offload=${ALL_OFFLOAD}
+    actor_rollout_ref.actor.megatron.dtype=bfloat16
+    ++actor_rollout_ref.actor.megatron.override_transformer_config.attention_backend=auto
+    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform
+    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full
+    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1
+    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_aux_loss_coeff=0.01
+    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_z_loss_coeff=0.001
+    +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=1
+    +actor_rollout_ref.actor.optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=True
+    +actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True
+    +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True
+)
+
+ROLLOUT=(
+    actor_rollout_ref.rollout.name=${rollout_name}
+    actor_rollout_ref.rollout.tensor_model_parallel_size=${GEN_TP}
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.6
+    actor_rollout_ref.rollout.n=5
+    actor_rollout_ref.rollout.mode=async
+    actor_rollout_ref.rollout.enforce_eager=True
+    actor_rollout_ref.rollout.dtype=bfloat16
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1
+    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=False
+    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=4096
+)
+
+REF=(
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1
+    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=False
+    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=4096
+    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${TP}
+    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${PP}
+    actor_rollout_ref.ref.megatron.context_parallel_size=${CP}
+    actor_rollout_ref.ref.megatron.expert_model_parallel_size=${EP}
+    actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${ETP}
+    actor_rollout_ref.ref.megatron.param_offload=${ALL_OFFLOAD}
+)
+
+ALGORITHM=(
+    algorithm.adv_estimator=${adv_estimator}
+    algorithm.use_kl_in_reward=False
+)
+
+TRAINER=(
+    trainer.critic_warmup=0
+    trainer.logger='["console","wandb"]'
+    trainer.project_name=${project_name}
+    trainer.experiment_name=${exp_name}
+    trainer.n_gpus_per_node=8
+    trainer.nnodes=1
+    trainer.save_freq=20
+    trainer.val_before_train=False
+    trainer.test_freq=5
+    trainer.total_epochs=15
+)
+
+########################### Launch ###########################
+
+python3 -m verl.trainer.main_ppo \
+    --config-path=config \
+    --config-name='ppo_megatron_trainer.yaml' \
+    "${DATA[@]}" \
+    "${ALGORITHM[@]}" \
+    "${MODEL[@]}" \
+    "${ROLLOUT[@]}" \
+    "${ACTOR[@]}" \
+    "${REF[@]}" \
+    "${TRAINER[@]}" \
+    "$@"
diff --git a/verl/models/mcore/model_forward.py b/verl/models/mcore/model_forward.py
index 4c82a56f09f..9955107f304 100644
--- a/verl/models/mcore/model_forward.py
+++ b/verl/models/mcore/model_forward.py
@@ -122,22 +122,33 @@ def model_forward(
             When using the bshd format, we have to add paddings to the input_ids to meet the longest sequence length, 
             so it is recommended to disable dynamic batch size and set batch size to 1
             """
-            assert not vision_model, "vision model does not support bshd format"
             assert fp8 is None, "fp8 is not supported for bshd format yet"
 
             batch_size, sequence_length = attention_mask.shape[:2]
+            position_ids_for_preprocess = (
+                torch.arange(sequence_length, device=input_ids.device).unsqueeze(0).expand(batch_size, -1)
+                if vision_model
+                else position_ids
+            )
+            pre_process_for_bshd = True if vision_model else pre_process
             new_input_ids, new_attention_mask, new_position_ids = preprocess_bshd(
-                input_ids, attention_mask, position_ids, sequence_parallel=sp, pre_process=pre_process
+                input_ids,
+                attention_mask,
+                position_ids_for_preprocess,
+                sequence_parallel=sp,
+                pre_process=pre_process_for_bshd,
             )
             output_orig = model(
                 input_ids=new_input_ids,
-                position_ids=new_position_ids,
+                position_ids=None if vision_model else new_position_ids,
                 attention_mask=new_attention_mask,
                 **model_kwargs,
             )
             if post_process and logits_processor is not None:
                 args = {
-                    k: preprocess_bshd(v, attention_mask, position_ids, sequence_parallel=sp, pre_process=True)[0]
+                    k: preprocess_bshd(
+                        v, attention_mask, position_ids_for_preprocess, sequence_parallel=sp, pre_process=True
+                    )[0]
                     for k, v in logits_processor_args.items()
                 }
                 output_dict = logits_processor(output_orig, **args)
diff --git a/verl/models/mcore/registry.py b/verl/models/mcore/registry.py
index b1b5c03406b..2feb4ed7019 100644
--- a/verl/models/mcore/registry.py
+++ b/verl/models/mcore/registry.py
@@ -30,6 +30,7 @@ class SupportedVLM(Enum):
     QWEN2_5_VL = "Qwen2_5_VLForConditionalGeneration"
     QWEN3_MOE_VL = "Qwen3VLMoeForConditionalGeneration"
     QWEN3_VL = "Qwen3VLForConditionalGeneration"
+    QWEN3_5_MOE_VL = "Qwen3_5MoeForConditionalGeneration"
 
 
 supported_vlm = [member.value for member in SupportedVLM]

From a41867251a0567d751b5052921bd11670fde56c7 Mon Sep 17 00:00:00 2001
From: Yan Bai <bayan@nvidia.com>
Date: Tue, 3 Mar 2026 09:03:25 -0800
Subject: [PATCH 03/11] update doc of chunk_tensordict

---
 verl/utils/tensordict_utils.py | 38 +++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 17 deletions(-)

diff --git a/verl/utils/tensordict_utils.py b/verl/utils/tensordict_utils.py
index 68167671dff..9e6afb78f67 100644
--- a/verl/utils/tensordict_utils.py
+++ b/verl/utils/tensordict_utils.py
@@ -277,8 +277,8 @@ def chunk_tensordict(td: TensorDict, chunks: int) -> list[TensorDict]:
     """Split a TensorDict into equal-sized chunks with special nested tensor handling.
 
     Divides a TensorDict into the specified number of chunks along the batch
-    dimension. Handles 3D+ nested tensors specially since torch.chunk() doesn't
-    support jagged tensors with 3 or more dimensions.
+    dimension. Handles NestedTensors specially since TensorDict.chunk() doesn't
+    support jagged tensors.
 
     Args:
         td: The TensorDict to split.
@@ -292,16 +292,25 @@ def chunk_tensordict(td: TensorDict, chunks: int) -> list[TensorDict]:
             evenly divisible by chunks.
 
     Note:
-        PyTorch NestedTensor has issues with unbind/indexing on 2D and 3D
-        jagged tensors: unbind() internally calls split_with_sizes() using the
-        ragged lengths, but the underlying storage may be padded to a different
-        size, causing a RuntimeError.
-        - 3D+: https://github.com/pytorch/pytorch/issues/153238
-        - 2D:  select_int -> unbind -> split_with_sizes mismatch
-
-        For NestedTensors that can be chunked directly (regular batch dim with
-        no ragged interaction), we use the standard TensorDict.chunk(). For
-        those that cannot, we pad -> chunk -> unpad as a workaround.
+        PyTorch ``unbind(dim=0)`` on 3D+ jagged NestedTensors has a bug where
+        ``split_with_sizes`` is applied to the wrong dimension of the internal
+        ``_values`` tensor.  For example, mRoPE ``position_ids`` with per-sample
+        shape ``(4, seq_len)`` becomes a 3D jagged NestedTensor
+        ``[B, *(ragged=4), seq_len]``; ``_values`` is ``[B*4, seq_len]`` and
+        ``unbind`` erroneously splits dimension 1 (``seq_len``) instead of
+        dimension 0, causing::
+
+            RuntimeError: split_with_sizes expects split_sizes to sum exactly
+            to <seq_len>, but got split_sizes=[4, 4, ...]
+
+        2D jagged NestedTensors (e.g. ``input_ids``, ``loss_mask``) are
+        unaffected — ``unbind(dim=0)`` works correctly for them.
+
+        The workaround: try ``unbind`` first (fast path for 2D); on failure,
+        fall back to ``to_padded_tensor`` → ``chunk`` → reconstruct per-chunk
+        NestedTensors using the original ragged lengths from ``offsets``.
+
+        See https://github.com/pytorch/pytorch/issues/153238
     """
     assert isinstance(td, TensorDict) and len(td) % chunks == 0, (
         f"expecting td with length divisible by chunks, but got {len(td)} and {chunks}"
@@ -315,14 +324,9 @@ def chunk_tensordict(td: TensorDict, chunks: int) -> list[TensorDict]:
     tds = new_td.chunk(chunks=chunks)
     for key in nested_keys:
         nt = td[key]
-        # Try the fast path first: direct unbind works for some NestedTensor
-        # layouts where the batch dim is not entangled with the ragged dim.
         try:
             tensors = nt.unbind(dim=0)
         except RuntimeError:
-            # Fallback: pad -> chunk -> unpad.  This avoids the PyTorch bug
-            # where unbind/split_with_sizes fails because ragged lengths don't
-            # match the (padded) storage size.
             padded = nt.to_padded_tensor(0)
             padded_chunks = padded.chunk(chunks, dim=0)
             offsets = nt.offsets()

From 80c0f876622aae05e2df93c8bdb073b8797f7f80 Mon Sep 17 00:00:00 2001
From: Yan Bai <bayan@nvidia.com>
Date: Tue, 3 Mar 2026 09:46:37 -0800
Subject: [PATCH 04/11] update apply_chat_template_single_turn

---
 verl/experimental/agent_loop/agent_loop.py  |  7 +++
 verl/utils/chat_template.py                 | 59 +++++++++++++++++++++
 verl/utils/dataset/multiturn_sft_dataset.py | 53 +++++-------------
 3 files changed, 79 insertions(+), 40 deletions(-)

diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py
index 228d2248b7e..334556aaf21 100644
--- a/verl/experimental/agent_loop/agent_loop.py
+++ b/verl/experimental/agent_loop/agent_loop.py
@@ -262,6 +262,13 @@ async def apply_chat_template(
 
         Returns:
             list[int]: Prompt token ids.
+
+        .. todo::
+            Templates that require a user message (e.g. Qwen 3.5) will fail
+            when *messages* contains only tool/assistant turns.  Migrate to
+            :func:`verl.utils.chat_template.apply_chat_template_single_turn`
+            with ``full_conversation`` fallback — see the SFT dataset for the
+            reference pattern.
         """
         if self.processor is not None:
             raw_prompt = await self.loop.run_in_executor(
diff --git a/verl/utils/chat_template.py b/verl/utils/chat_template.py
index 64300601c58..4d1f24ee99b 100644
--- a/verl/utils/chat_template.py
+++ b/verl/utils/chat_template.py
@@ -42,3 +42,62 @@ def extract_system_prompt_and_generation(tokenizer):
     generate_prompt = token3[len(token1) :]
 
     return system_prompt, generate_prompt
+
+
+def apply_chat_template_single_turn(
+    processor,
+    messages: list[dict],
+    full_conversation: list[dict],
+    turn_index: int,
+    tools=None,
+    **kwargs,
+):
+    """Apply chat_template to a single turn's ``messages``, with automatic
+    fallback for templates that require a user message (e.g. Qwen 3.5's
+    "No user query found" error).
+
+    When the direct call fails, the function tokenises the full conversation up
+    to *turn_index* and subtracts the prefix produced by everything before
+    *turn_index*, yielding only this turn's tokens.
+
+    Args:
+        processor: tokenizer or processor that has ``apply_chat_template``.
+        messages: the message(s) to tokenise (typically ``[single_msg]``).
+        full_conversation: the complete conversation list for fallback context.
+        turn_index: 0-based position of the **last** message of ``messages``
+            inside ``full_conversation``.
+        tools: tool schemas forwarded to ``apply_chat_template``.
+        **kwargs: extra keyword arguments forwarded to ``apply_chat_template``
+            (e.g. ``add_generation_prompt``, ``return_tensors``, ``tokenize``).
+
+    Returns:
+        Same type as ``processor.apply_chat_template`` — typically a ``dict``
+        (when ``return_dict=True``) or a ``list[int]``.
+    """
+    try:
+        return processor.apply_chat_template(messages, tools=tools, **kwargs)
+    except Exception as e:
+        if "No user query" not in str(e):
+            raise
+
+        inputs_full = processor.apply_chat_template(
+            full_conversation[: turn_index + 1],
+            tools=tools,
+            **kwargs,
+        )
+        prefix_len = 0
+        if turn_index > 0:
+            prefix_tools = tools if turn_index == 1 else None
+            inputs_prev = processor.apply_chat_template(
+                full_conversation[:turn_index],
+                tools=prefix_tools,
+                **kwargs,
+            )
+            if isinstance(inputs_prev, dict):
+                prefix_len = inputs_prev["input_ids"].shape[-1]
+            else:
+                prefix_len = len(inputs_prev)
+
+        if isinstance(inputs_full, dict):
+            return {k: v[..., prefix_len:] for k, v in inputs_full.items()}
+        return inputs_full[prefix_len:]
diff --git a/verl/utils/dataset/multiturn_sft_dataset.py b/verl/utils/dataset/multiturn_sft_dataset.py
index e8fb4b65e37..d229eabeb21 100644
--- a/verl/utils/dataset/multiturn_sft_dataset.py
+++ b/verl/utils/dataset/multiturn_sft_dataset.py
@@ -22,7 +22,6 @@
 from functools import wraps
 from typing import Any, Optional
 
-import jinja2
 import numpy as np
 import pandas as pd
 import torch
@@ -33,7 +32,7 @@
 
 from verl.models.transformers.qwen2_vl import get_rope_index
 from verl.utils import hf_tokenizer
-from verl.utils.chat_template import extract_system_prompt_and_generation
+from verl.utils.chat_template import apply_chat_template_single_turn, extract_system_prompt_and_generation
 from verl.utils.dataset.dataset_utils import DatasetPadMode
 from verl.utils.dataset.vision_utils import process_image, process_video
 from verl.utils.fs import copy_local_path_from_hdfs
@@ -224,44 +223,18 @@ def _process_single_message(
         if enable_thinking is not None:
             apply_chat_template_kwargs["enable_thinking"] = enable_thinking
 
-        try:
-            inputs = processor.apply_chat_template(
-                [message],
-                tools=tools,
-                add_generation_prompt=False,
-                tokenize=True,
-                return_dict=True,
-                return_tensors="pt",
-                **apply_chat_template_kwargs,
-            )
-        except (jinja2.exceptions.TemplateError, Exception) as e:
-            if "No user query" not in str(e):
-                raise
-            # Chat templates that require a user message (e.g. Qwen3.5) fail
-            # when tokenising a single non-user message. Fallback: tokenise the
-            # conversation up to this turn and subtract the prefix.
-            inputs_full = processor.apply_chat_template(
-                full_message[: index + 1],
-                tools=tools,
-                add_generation_prompt=False,
-                tokenize=True,
-                return_dict=True,
-                return_tensors="pt",
-                **apply_chat_template_kwargs,
-            )
-            prefix_len = 0
-            if index > 0:
-                inputs_prev = processor.apply_chat_template(
-                    full_message[:index],
-                    tools=tools if index == 1 else None,
-                    add_generation_prompt=False,
-                    tokenize=True,
-                    return_dict=True,
-                    return_tensors="pt",
-                    **apply_chat_template_kwargs,
-                )
-                prefix_len = inputs_prev["input_ids"].shape[-1]
-            inputs = {k: v[..., prefix_len:] for k, v in inputs_full.items()}
+        inputs = apply_chat_template_single_turn(
+            processor,
+            messages=[message],
+            full_conversation=full_message,
+            turn_index=index,
+            tools=tools,
+            add_generation_prompt=False,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+            **apply_chat_template_kwargs,
+        )
 
         inputs = dict(inputs)
         input_ids = inputs.pop("input_ids")[0]

From 1bf133af4f87ab8bffa93404a7043c04a8590447 Mon Sep 17 00:00:00 2001
From: Yan Bai <bayan@nvidia.com>
Date: Wed, 4 Mar 2026 00:06:29 -0800
Subject: [PATCH 05/11] clean

---
 examples/grpo_trainer/run_qwen3_5-35b-megatron.sh |  4 ++--
 examples/sft/gsm8k/run_qwen3_5_megatron.sh        |  2 +-
 verl/models/mcore/registry.py                     |  1 +
 verl/workers/engine_workers.py                    | 11 +++--------
 4 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/examples/grpo_trainer/run_qwen3_5-35b-megatron.sh b/examples/grpo_trainer/run_qwen3_5-35b-megatron.sh
index 8da18446c83..43563c20c59 100644
--- a/examples/grpo_trainer/run_qwen3_5-35b-megatron.sh
+++ b/examples/grpo_trainer/run_qwen3_5-35b-megatron.sh
@@ -19,8 +19,8 @@
 # Qwen3.5 architecture notes:
 #   Qwen3.5 uses Gated Delta Net (GDN) linear attention which currently does
 #   NOT support packed sequences (THD format) in Megatron-LM. Therefore:
+#     - model.use_remove_padding=False           (deprecated option, to be removed in the future; forces bshd compute format)
 #     - actor.megatron.use_remove_padding=False  (forces bshd compute format)
-#     - model.use_remove_padding=True           (keeps NestedTensor in data pipeline)
 #     - actor.use_dynamic_bsz=False              (required for bshd mode)
 #
 #   Once Megatron-LM adds THD support for Qwen3.5 GDN, use_remove_padding
@@ -71,7 +71,7 @@ DATA=(
 MODEL=(
     actor_rollout_ref.model.path=${HF_MODEL_PATH}
     actor_rollout_ref.model.trust_remote_code=True
-    actor_rollout_ref.model.use_remove_padding=True
+    actor_rollout_ref.model.use_remove_padding=False
 )
 
 ACTOR=(
diff --git a/examples/sft/gsm8k/run_qwen3_5_megatron.sh b/examples/sft/gsm8k/run_qwen3_5_megatron.sh
index 43ab11ec213..fef3fdc9cf7 100644
--- a/examples/sft/gsm8k/run_qwen3_5_megatron.sh
+++ b/examples/sft/gsm8k/run_qwen3_5_megatron.sh
@@ -130,7 +130,7 @@ torchrun \
     data.max_token_len_per_gpu=${MAX_LENGTH} \
     data.messages_key=messages \
     model.path=${MODEL_PATH} \
-    model.use_remove_padding=True \
+    model.use_remove_padding=False \
     model.trust_remote_code=True \
     ${ENGINE_CONFIG} \
     trainer.test_freq=-1 \
diff --git a/verl/models/mcore/registry.py b/verl/models/mcore/registry.py
index 2feb4ed7019..5776b13fcf4 100644
--- a/verl/models/mcore/registry.py
+++ b/verl/models/mcore/registry.py
@@ -31,6 +31,7 @@ class SupportedVLM(Enum):
     QWEN3_MOE_VL = "Qwen3VLMoeForConditionalGeneration"
     QWEN3_VL = "Qwen3VLForConditionalGeneration"
     QWEN3_5_MOE_VL = "Qwen3_5MoeForConditionalGeneration"
+    QWEN3_5_VL = "Qwen3_5ForConditionalGeneration"
 
 
 supported_vlm = [member.value for member in SupportedVLM]
diff --git a/verl/workers/engine_workers.py b/verl/workers/engine_workers.py
index fad1e9a0ca4..3d129d1fe2f 100644
--- a/verl/workers/engine_workers.py
+++ b/verl/workers/engine_workers.py
@@ -24,8 +24,6 @@
 from tensordict import NonTensorData, TensorDict
 from torch.distributed.device_mesh import init_device_mesh
 
-from verl.workers.config.engine import McoreEngineConfig
-
 try:
     from verl.workers.engine.mindspeed.transformer_impl import repatch
 except ImportError:
@@ -100,12 +98,9 @@ def __init__(self, config: TrainingWorkerConfig):
                 self.model_config, self.device_name
             )
 
-        # For Megatron engine, model.use_remove_padding (data pipeline) and
-        # engine.use_remove_padding (compute format: thd vs bshd) may differ
-        # (e.g. Qwen3.5 GDN requires bshd but still uses NestedTensor in data).
-        # For other engines, keep the original behavior of syncing them.
-        if not isinstance(self.engine_config, McoreEngineConfig):
-            self.engine_config.use_remove_padding = self.model_config.use_remove_padding
+        # The engine inherits use_remove_padding from the model config.
+        # TODO: this coupling is not elegant and should be refactored later.
+        self.engine_config.use_remove_padding = self.model_config.use_remove_padding
         self.engine_config.use_fused_kernels = self.model_config.use_fused_kernels
 
         if repatch is not None:

From 98571a95213eb42b116ae58b9292f495883f6fb0 Mon Sep 17 00:00:00 2001
From: Yan Bai <bayan@nvidia.com>
Date: Wed, 4 Mar 2026 00:14:04 -0800
Subject: [PATCH 06/11] clean

---
 examples/sft/gsm8k/run_qwen3_5_megatron.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/sft/gsm8k/run_qwen3_5_megatron.sh b/examples/sft/gsm8k/run_qwen3_5_megatron.sh
index fef3fdc9cf7..102392e65c8 100644
--- a/examples/sft/gsm8k/run_qwen3_5_megatron.sh
+++ b/examples/sft/gsm8k/run_qwen3_5_megatron.sh
@@ -14,7 +14,6 @@
 #   Qwen3.5 uses Gated Delta Net (GDN) linear attention which currently does
 #   NOT support packed sequences (THD format) in Megatron-LM. Therefore:
 #     - engine.use_remove_padding=False  (forces bshd compute format)
-#     - model.use_remove_padding=True    (keeps NestedTensor in data pipeline)
 #     - data.use_dynamic_bsz=False       (required for bshd mode)
 #
 #   Once https://github.com/NVIDIA/Megatron-LM/pull/2644 is merged, THD

From 28b106e646d3898739aa4a5f983ba43cd0e1510e Mon Sep 17 00:00:00 2001
From: Yan Bai <bayan@nvidia.com>
Date: Wed, 4 Mar 2026 05:04:29 -0800
Subject: [PATCH 07/11] small fix

---
 verl/utils/chat_template.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/verl/utils/chat_template.py b/verl/utils/chat_template.py
index f527b07acd6..93faa4cdbce 100644
--- a/verl/utils/chat_template.py
+++ b/verl/utils/chat_template.py
@@ -97,11 +97,11 @@ def apply_chat_template_single_turn(
                 tools=prefix_tools,
                 **kwargs,
             )
-            if isinstance(inputs_prev, dict):
+            if hasattr(inputs_prev, "items"):
                 prefix_len = inputs_prev["input_ids"].shape[-1]
             else:
                 prefix_len = len(inputs_prev)
 
-        if isinstance(inputs_full, dict):
+        if hasattr(inputs_full, "items"):
             return {k: v[..., prefix_len:] for k, v in inputs_full.items()}
         return inputs_full[prefix_len:]

From 41dbe7ae84ee414bc7dd32a2953782a6c7713b3c Mon Sep 17 00:00:00 2001
From: wuxibin <wuxibin@bytedance.com>
Date: Tue, 10 Mar 2026 00:57:56 +0800
Subject: [PATCH 08/11] fix apply_chat_template_single_turn

---
 .github/workflows/cpu_unit_tests.yml          |   2 +-
 .../test_multiturn_sft_dataset_on_cpu.py      |  82 ++++++++------
 verl/utils/chat_template.py                   | 105 +++++++++++-------
 verl/utils/dataset/multiturn_sft_dataset.py   |   9 +-
 4 files changed, 117 insertions(+), 81 deletions(-)

diff --git a/.github/workflows/cpu_unit_tests.yml b/.github/workflows/cpu_unit_tests.yml
index c0145d6d8a1..62e63cbe613 100644
--- a/.github/workflows/cpu_unit_tests.yml
+++ b/.github/workflows/cpu_unit_tests.yml
@@ -95,7 +95,7 @@ jobs:
         run: |
           pip3 install -r requirements-test.txt
           pip3 install --no-deps -e .
-          pip3 install --upgrade "transformers<5.0.0"
+          pip3 install --upgrade "transformers>=5.0.0"
       - name: Download datasets
         run: |
           python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
diff --git a/tests/utils/dataset/test_multiturn_sft_dataset_on_cpu.py b/tests/utils/dataset/test_multiturn_sft_dataset_on_cpu.py
index a55417ce839..02381b972d2 100644
--- a/tests/utils/dataset/test_multiturn_sft_dataset_on_cpu.py
+++ b/tests/utils/dataset/test_multiturn_sft_dataset_on_cpu.py
@@ -26,9 +26,9 @@
 from tensordict import TensorDict
 from torch.utils.data import DistributedSampler
 from torchdata.stateful_dataloader import StatefulDataLoader
-from transformers import AutoProcessor, AutoTokenizer
 from transformers.utils import get_json_schema
 
+from verl.utils import hf_processor, hf_tokenizer
 from verl.utils.dataset.dataset_utils import DatasetPadMode, SFTTensorCollator
 from verl.utils.dataset.multiturn_sft_dataset import MultiTurnSFTDataset
 from verl.utils.model import extract_multi_modal_inputs
@@ -37,31 +37,29 @@
 
 
 @pytest.mark.parametrize(
-    "model_path",
+    "model_path, ignore_input_ids_mismatch",
     [
-        f"{custom_model_prefix}/Qwen/Qwen2.5-0.5B",
-        f"{custom_model_prefix}/Qwen/Qwen2.5-Coder-7B-Instruct",
-        f"{custom_model_prefix}/Qwen/Qwen3-30B-A3B-Instruct-2507",
-        # "Qwen/Qwen3-30B-A3B-Thinking-2507" # Thinking series models add <think></think> tags to last turn.
+        ("{custom_model_prefix}/Qwen/Qwen2.5-0.5B", False),
+        ("{custom_model_prefix}/Qwen/Qwen3-0.6B", True),
+        ("{custom_model_prefix}/Qwen/Qwen3.5-0.8B", False),
     ],
 )
-@pytest.mark.parametrize("enable_thinking", [False, True])
-def test_multiturn_sft_dataset(model_path: str, enable_thinking: bool):
-    print(f"Starting test... model_path={model_path}, enable_thinking={enable_thinking}")
+def test_multiturn_sft_dataset(model_path: str, ignore_input_ids_mismatch: bool):
+    print(f"Starting test... model_path={model_path}, ignore_input_ids_mismatch={ignore_input_ids_mismatch}")
     # Create a temporary parquet file with test data
     test_data = {
         "messages": [
             [
                 {"role": "user", "content": "What is 2+2?"},
                 {"role": "assistant", "content": "2+2 equals 4."},
-                {"role": "user", "content": "And what is 4+4?"},
+                {"role": "tool", "content": "And what is 4+4?"},
                 {"role": "assistant", "content": "4+4 equals 8."},
             ],
             [
-                {"role": "system", "content": "You are a powerful assistant."},
+                # {"role": "system", "content": "You are a powerful assistant."},
                 {"role": "user", "content": "Tell me a joke."},
                 {"role": "assistant", "content": "Why did the chicken cross the road?"},
-                {"role": "user", "content": "Why?"},
+                {"role": "tool", "content": "Why?"},
                 {"role": "assistant", "content": "To get to the other side!"},
             ],
         ]
@@ -76,14 +74,16 @@ def test_multiturn_sft_dataset(model_path: str, enable_thinking: bool):
     df.to_parquet(test_file)
 
     # Initialize tokenizer and dataset
-    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    tokenizer = hf_tokenizer(model_path)
+    # processor = hf_processor(model_path)
+    processor = None
     config = {
         "max_length": 512,
         "truncation": "error",
         "multiturn": {"messages_key": "messages"},
-        "apply_chat_template_kwargs": {"enable_thinking": enable_thinking},
+        "ignore_input_ids_mismatch": ignore_input_ids_mismatch,
     }
-    dataset = MultiTurnSFTDataset(parquet_files=test_file, tokenizer=tokenizer, config=config)
+    dataset = MultiTurnSFTDataset(parquet_files=test_file, tokenizer=tokenizer, processor=processor, config=config)
 
     # Test 1: Dataset Length
     assert len(dataset) == 2, f"Expected dataset length 2, got {len(dataset)}"
@@ -189,8 +189,15 @@ def test_multiturn_sft_dataset(model_path: str, enable_thinking: bool):
             )
 
     # Test 10: Verify padding behavior
-    padding_config = {"max_length": 1024, "truncation": "error", "multiturn": {"messages_key": "messages"}}
-    small_dataset = MultiTurnSFTDataset(parquet_files=test_file, tokenizer=tokenizer, config=padding_config)
+    padding_config = {
+        "max_length": 1024,
+        "truncation": "error",
+        "multiturn": {"messages_key": "messages"},
+        "ignore_input_ids_mismatch": ignore_input_ids_mismatch,
+    }
+    small_dataset = MultiTurnSFTDataset(
+        parquet_files=test_file, tokenizer=tokenizer, processor=processor, config=padding_config
+    )
     padded_item = small_dataset[0]
 
     # Get actual sequence length (before padding)
@@ -209,8 +216,9 @@ def test_multiturn_sft_dataset(model_path: str, enable_thinking: bool):
         "truncation": "error",
         "multiturn": {"messages_key": "messages"},
         "pad_mode": "no_padding",
+        "ignore_input_ids_mismatch": ignore_input_ids_mismatch,
     }
-    dataset = MultiTurnSFTDataset(parquet_files=test_file, tokenizer=tokenizer, config=config)
+    dataset = MultiTurnSFTDataset(parquet_files=test_file, tokenizer=tokenizer, processor=processor, config=config)
 
     item0 = dataset[0]
 
@@ -286,7 +294,7 @@ def vlm_data_file():
                     "content": "Let's generate a zoom-in image.",
                     "tool_calls": [
                         {
-                            "function": {"arguments": '{"bbox_2d": "[0, 1, 2, 4]"}', "name": "image_zoom_in_tool"},
+                            "function": {"arguments": {"bbox_2d": "[0, 1, 2, 4]"}, "name": "image_zoom_in_tool"},
                             "type": "function",
                         }
                     ],
@@ -331,13 +339,19 @@ def serialize_image(img):
     return test_file
 
 
-def test_multiturn_sft_vlm_dataset_on_cpu(vlm_data_file):
+@pytest.mark.parametrize(
+    "model_path",
+    [
+        "{custom_model_prefix}/Qwen/Qwen3-VL-2B-Instruct",
+        "{custom_model_prefix}/Qwen/Qwen3.5-0.8B",
+    ],
+)
+def test_multiturn_sft_vlm_dataset_on_cpu(model_path, vlm_data_file):
     df = pd.read_parquet(vlm_data_file)
-    model_path = f"{custom_model_prefix}/Qwen/Qwen3-VL-2B-Instruct"
-    tokenizer = AutoTokenizer.from_pretrained(model_path)
-    processor = AutoProcessor.from_pretrained(model_path)
-    config = {"max_length": 512, "pad_mode": "no_padding", "truncation": "error", "messages_key": "messages"}
-    dataset = MultiTurnSFTDataset(parquet_files=vlm_data_file, tokenizer=tokenizer, config=config, processor=processor)
+    tokenizer = hf_tokenizer(model_path)
+    processor = hf_processor(model_path)
+    config = {"max_length": 1024, "pad_mode": "no_padding", "truncation": "error", "messages_key": "messages"}
+    dataset = MultiTurnSFTDataset(parquet_files=vlm_data_file, tokenizer=tokenizer, processor=processor, config=config)
     assert dataset.pad_mode == DatasetPadMode.NO_PADDING
 
     for i in range(len(dataset)):
@@ -387,13 +401,19 @@ def test_multiturn_sft_vlm_dataset_on_cpu(vlm_data_file):
             assert image_grid_thw is None, "image_grid_thw should be None when no image is provided"
 
 
-def test_multiturn_sft_vlm_dataloader_on_cpu(vlm_data_file):
+@pytest.mark.parametrize(
+    "model_path",
+    [
+        "{custom_model_prefix}/Qwen/Qwen3-VL-2B-Instruct",
+        "{custom_model_prefix}/Qwen/Qwen3.5-0.8B",
+    ],
+)
+def test_multiturn_sft_vlm_dataloader_on_cpu(model_path, vlm_data_file):
     df = pd.read_parquet(vlm_data_file)
-    model_path = f"{custom_model_prefix}/Qwen/Qwen3-VL-2B-Instruct"
-    tokenizer = AutoTokenizer.from_pretrained(model_path)
-    processor = AutoProcessor.from_pretrained(model_path)
-    config = {"max_length": 512, "pad_mode": "no_padding", "truncation": "error", "messages_key": "messages"}
-    dataset = MultiTurnSFTDataset(parquet_files=vlm_data_file, tokenizer=tokenizer, config=config, processor=processor)
+    tokenizer = hf_tokenizer(model_path)
+    processor = hf_processor(model_path)
+    config = {"max_length": 1024, "pad_mode": "no_padding", "truncation": "error", "messages_key": "messages"}
+    dataset = MultiTurnSFTDataset(parquet_files=vlm_data_file, tokenizer=tokenizer, processor=processor, config=config)
     assert dataset.pad_mode == DatasetPadMode.NO_PADDING
 
     collate_fn = SFTTensorCollator(DatasetPadMode.NO_PADDING)
diff --git a/verl/utils/chat_template.py b/verl/utils/chat_template.py
index 93faa4cdbce..5dcef56afa1 100644
--- a/verl/utils/chat_template.py
+++ b/verl/utils/chat_template.py
@@ -2,6 +2,8 @@
 import logging
 import os
 
+from transformers import PreTrainedTokenizerBase, ProcessorMixin
+
 from verl.utils.tokenizer import normalize_token_ids
 
 logger = logging.getLogger(__name__)
@@ -49,59 +51,76 @@ def extract_system_prompt_and_generation(tokenizer):
 
 
 def apply_chat_template_single_turn(
-    processor,
+    processor: PreTrainedTokenizerBase | ProcessorMixin,
     messages: list[dict],
-    full_conversation: list[dict],
-    turn_index: int,
+    *,
+    tokenize: bool = True,
+    add_generation_prompt: bool = True,
     tools=None,
+    return_dict: bool = False,
+    return_mm_token_type_ids: bool = False,
     **kwargs,
-):
-    """Apply chat_template to a single turn's ``messages``, with automatic
-    fallback for templates that require a user message (e.g. Qwen 3.5's
-    "No user query found" error).
-
-    When the direct call fails, the function tokenises the full conversation up
-    to *turn_index* and subtracts the prefix produced by everything before
-    *turn_index*, yielding only this turn's tokens.
+) -> list[int] | str:
+    """Apply the chat template to a single turn's messages.
 
     Args:
-        processor: tokenizer or processor that has ``apply_chat_template``.
-        messages: the message(s) to tokenise (typically ``[single_msg]``).
-        full_conversation: the complete conversation list for fallback context.
-        turn_index: 0-based position of the **last** message of ``messages``
-            inside ``full_conversation``.
-        tools: tool schemas forwarded to ``apply_chat_template``.
-        **kwargs: extra keyword arguments forwarded to ``apply_chat_template``
-            (e.g. ``add_generation_prompt``, ``return_tensors``, ``tokenize``).
+        processor: tokenizer or processor.
+        messages: list[dict], single turn messages.
+        tokenize: bool, whether to tokenize the output.
+        add_generation_prompt: bool, whether to add generation prompt.
+        tools: list[dict], tools schema.
+        return_dict: bool, whether to return a dict.
+        return_mm_token_type_ids: bool, whether to return multimodal token type ids.
+        **kwargs: additional arguments for apply_chat_template.
 
     Returns:
-        Same type as ``processor.apply_chat_template`` — typically a ``dict``
-        (when ``return_dict=True``) or a ``list[int]``.
+        list[int] | str | dict: token ids, rendered text, or a dict-like encoding when ``return_dict=True``.
     """
+    assert isinstance(messages, list) and len(messages) == 1, f"messages must be a single turn, got {messages}"
     try:
-        return processor.apply_chat_template(messages, tools=tools, **kwargs)
-    except Exception as e:
-        if "No user query" not in str(e):
-            raise
-
-        inputs_full = processor.apply_chat_template(
-            full_conversation[: turn_index + 1],
+        return processor.apply_chat_template(
+            messages,
+            tokenize=tokenize,
+            add_generation_prompt=add_generation_prompt,
+            tools=tools,
+            return_dict=return_dict,
+            return_mm_token_type_ids=return_mm_token_type_ids,
+            **kwargs,
+        )
+    except Exception:
+        # Qwen3.5's chat template requires at least one user message; prepend a dummy one and strip its prefix
+        dummy_user_message = [{"role": "user", "content": [{"type": "text", "text": ""}]}]
+        dummy_user_prefix = processor.apply_chat_template(
+            dummy_user_message,
+            tokenize=tokenize,
+            add_generation_prompt=False,
+            tools=tools,
+            return_dict=return_dict,
+            return_mm_token_type_ids=return_mm_token_type_ids,
+            **kwargs,
+        )
+        output = processor.apply_chat_template(
+            dummy_user_message + messages,
+            tokenize=tokenize,
+            add_generation_prompt=add_generation_prompt,
             tools=tools,
+            return_dict=return_dict,
+            return_mm_token_type_ids=return_mm_token_type_ids,
             **kwargs,
         )
-        prefix_len = 0
-        if turn_index > 0:
-            prefix_tools = tools if turn_index == 1 else None
-            inputs_prev = processor.apply_chat_template(
-                full_conversation[:turn_index],
-                tools=prefix_tools,
-                **kwargs,
-            )
-            if hasattr(inputs_prev, "items"):
-                prefix_len = inputs_prev["input_ids"].shape[-1]
-            else:
-                prefix_len = len(inputs_prev)
 
-        if hasattr(inputs_full, "items"):
-            return {k: v[..., prefix_len:] for k, v in inputs_full.items()}
-        return inputs_full[prefix_len:]
+        if not tokenize:  # tokenize=False
+            return output[len(dummy_user_prefix) :]
+        elif not return_dict:  # tokenize=True and return_dict=False
+            if isinstance(output[0], list):  # transformers>=5
+                assert len(output) == 1, "output must be a list[int] or list[list[int]]"
+                dummy_user_prefix = dummy_user_prefix[0]
+                output = output[0]
+            return output[len(dummy_user_prefix) :]
+        else:  # tokenize=True and return_dict=True and return_tensors="pt"
+            dummy_user_prefix = dict(dummy_user_prefix)
+            output = dict(output)
+            prefix_len = dummy_user_prefix["input_ids"].shape[1]
+            output["input_ids"] = output["input_ids"][:, prefix_len:]
+            output["attention_mask"] = output["attention_mask"][:, prefix_len:]
+            return output
diff --git a/verl/utils/dataset/multiturn_sft_dataset.py b/verl/utils/dataset/multiturn_sft_dataset.py
index 47793e84b4b..8f3ae1e166a 100644
--- a/verl/utils/dataset/multiturn_sft_dataset.py
+++ b/verl/utils/dataset/multiturn_sft_dataset.py
@@ -203,10 +203,7 @@ def _process_single_message(
         Returns:
             Tuple of (input_ids, loss_mask, attention_mask, dict[str, torch.Tensor])
         """
-        has_visual_content = isinstance(message.get("content"), list) and any(
-            isinstance(c, dict) and c.get("type") in ("image", "video") for c in message["content"]
-        )
-        processor = self.processor if self.processor is not None and has_visual_content else self.tokenizer
+        processor = self.processor if self.processor is not None else self.tokenizer
         apply_chat_template_kwargs = {**self.apply_chat_template_kwargs}
         if enable_thinking is not None:
             apply_chat_template_kwargs["enable_thinking"] = enable_thinking
@@ -214,8 +211,6 @@ def _process_single_message(
         inputs = apply_chat_template_single_turn(
             processor,
             messages=[message],
-            full_conversation=full_message,
-            turn_index=index,
             tools=tools,
             add_generation_prompt=False,
             tokenize=True,
@@ -399,6 +394,8 @@ def __getitem__(self, item):
                 res["multi_modal_inputs"] = multi_modal_inputs
             return res
         elif self.pad_mode == DatasetPadMode.NO_PADDING:
+            if sequence_length > self.max_length and self.truncation == "error":
+                raise ValueError(f"{sequence_length=} is larger than {self.max_length=}")
             # truncate input_ids if it is longer than max_length
             if len(input_ids) > self.max_length:
                 input_ids = input_ids[: self.max_length]

From 7f7eece429c362ddd85c8d681cc51ab80f7c6fd7 Mon Sep 17 00:00:00 2001
From: wuxibin <wuxibin@bytedance.com>
Date: Tue, 10 Mar 2026 11:53:55 +0800
Subject: [PATCH 09/11] add qwen3_coder tool parser

---
 verl/experimental/agent_loop/agent_loop.py    |  15 +-
 .../agent_loop/tool_agent_loop.py             |   2 +-
 verl/experimental/agent_loop/tool_parser.py   | 186 +++++++++++++++++-
 3 files changed, 189 insertions(+), 14 deletions(-)

diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py
index e0c77966dc0..7c11f2ca6cf 100644
--- a/verl/experimental/agent_loop/agent_loop.py
+++ b/verl/experimental/agent_loop/agent_loop.py
@@ -35,7 +35,7 @@
 from verl.experimental.agent_loop.utils import resolve_config_path
 from verl.protocol import DataProto
 from verl.single_controller.ray.base import RayResourcePool, RayWorkerGroup
-from verl.utils.chat_template import initialize_system_prompt
+from verl.utils.chat_template import apply_chat_template_single_turn, initialize_system_prompt
 from verl.utils.config import omega_conf_to_dataclass
 from verl.utils.dataset.rl_dataset import RLHFDataset, get_dataset_class
 from verl.utils.model import compute_position_id_with_mask
@@ -270,18 +270,12 @@ async def apply_chat_template(
 
         Returns:
             list[int]: Prompt token ids.
-
-        .. todo::
-            Templates that require a user message (e.g. Qwen 3.5) will fail
-            when *messages* contains only tool/assistant turns.  Migrate to
-            :func:`verl.utils.chat_template.apply_chat_template_single_turn`
-            with ``full_conversation`` fallback — see the SFT dataset for the
-            reference pattern.
         """
         if self.processor is not None:
             raw_prompt = await self.loop.run_in_executor(
                 None,
-                lambda: self.processor.apply_chat_template(
+                lambda: apply_chat_template_single_turn(
+                    self.processor,
                     messages,
                     tools=tools,
                     add_generation_prompt=True,
@@ -309,7 +303,8 @@ async def apply_chat_template(
         else:
             tokenized_prompt = await self.loop.run_in_executor(
                 None,
-                lambda: self.tokenizer.apply_chat_template(
+                lambda: apply_chat_template_single_turn(
+                    self.tokenizer,
                     messages,
                     tools=tools,
                     add_generation_prompt=True,
diff --git a/verl/experimental/agent_loop/tool_agent_loop.py b/verl/experimental/agent_loop/tool_agent_loop.py
index c649a2fc3fd..cd330d53d61 100644
--- a/verl/experimental/agent_loop/tool_agent_loop.py
+++ b/verl/experimental/agent_loop/tool_agent_loop.py
@@ -249,7 +249,7 @@ async def _handle_generating_state(
             return AgentState.TERMINATED
 
         # Extract tool calls
-        _, agent_data.tool_calls = await self.tool_parser.extract_tool_calls(agent_data.response_ids)
+        _, agent_data.tool_calls = await self.tool_parser.extract_tool_calls(agent_data.response_ids, self.tool_schemas)
 
         # Handle interaction if needed
         if self.interaction_config_file:
diff --git a/verl/experimental/agent_loop/tool_parser.py b/verl/experimental/agent_loop/tool_parser.py
index 67ad75e2bb8..b035bd16115 100644
--- a/verl/experimental/agent_loop/tool_parser.py
+++ b/verl/experimental/agent_loop/tool_parser.py
@@ -15,10 +15,12 @@
 import logging
 import os
 from abc import ABC, abstractmethod
+from typing import Any, Optional
 
 import regex
 from pydantic import BaseModel
 
+from verl.tools.schemas import OpenAIFunctionToolSchema
 from verl.utils.ray_utils import get_event_loop
 from verl.utils.rollout_trace import rollout_trace_op
 
@@ -46,11 +48,14 @@ def __init__(self, tokenizer) -> None:
         self.tokenizer = tokenizer
 
     @abstractmethod
-    async def extract_tool_calls(self, responses_ids: list[int]) -> tuple[str, list[FunctionCall]]:
+    async def extract_tool_calls(
+        self, responses_ids: list[int], tools: list[OpenAIFunctionToolSchema] = None
+    ) -> tuple[str, list[FunctionCall]]:
         """Extract tool calls from the responses.
 
         Args:
             responses_ids (List[int]): The ids of the responses.
+            tools (List[OpenAIFunctionToolSchema], optional): OpenAI function tool schema.
 
         Returns:
             Tuple[str, List[FunctionCall]]: Content and extracted tool calls.
@@ -84,7 +89,9 @@ def __init__(self, tokenizer) -> None:
         self.tool_call_regex = regex.compile(r"<tool_call>(.*?)</tool_call>", regex.DOTALL)
 
     @rollout_trace_op
-    async def extract_tool_calls(self, responses_ids: list[int]) -> tuple[str, list[FunctionCall]]:
+    async def extract_tool_calls(
+        self, responses_ids: list[int], tools: list[OpenAIFunctionToolSchema] = None
+    ) -> tuple[str, list[FunctionCall]]:
         loop = get_event_loop()
         text = await loop.run_in_executor(None, self.tokenizer.decode, responses_ids)
         if self.tool_call_start_token not in text or self.tool_call_end_token not in text:
@@ -131,7 +138,9 @@ def __init__(self, tokenizer) -> None:
         )
 
     @rollout_trace_op
-    async def extract_tool_calls(self, responses_ids: list[int]) -> tuple[str, list[FunctionCall]]:
+    async def extract_tool_calls(
+        self, responses_ids: list[int], tools: list[OpenAIFunctionToolSchema] = None
+    ) -> tuple[str, list[FunctionCall]]:
         loop = get_event_loop()
         # We need to keep special tokens for gpt-oss model for better tool call extraction.
         text = await loop.run_in_executor(None, lambda: self.tokenizer.decode(responses_ids, skip_special_tokens=False))
@@ -159,3 +168,204 @@ async def extract_tool_calls(self, responses_ids: list[int]) -> tuple[str, list[
         content = regex.sub(self.tool_call_pattern, "", text)
 
         return content, function_calls
+
+
+@ToolParser.register("qwen3_coder")
+class Qwen3XMLToolParser(ToolParser):
+    """Tool parser for the XML-style tool calls of qwen3_coder/qwen3.5 models.
+
+    Adapted from https://huggingface.co/Qwen/Qwen3-Coder-30B-A3B-Instruct/blob/main/qwen3coder_tool_parser.py
+
+    Expected layout: ``<tool_call>`` blocks containing ``<function=NAME>`` elements whose
+    arguments are ``<parameter=KEY>VALUE</parameter>`` elements. A trailing element that is
+    missing its closing tag is still matched, up to the end of the text.
+
+    Args:
+        tokenizer: The tokenizer to use.
+    """
+
+    def __init__(self, tokenizer):
+        super().__init__(tokenizer)
+
+        self.tool_call_start_token: str = "<tool_call>"
+        self.tool_call_end_token: str = "</tool_call>"
+        self.tool_call_prefix: str = "<function="
+
+        self.tool_call_complete_regex = regex.compile(r"<tool_call>(.*?)</tool_call>", regex.DOTALL)
+        # The second alternative of each pattern matches an unterminated element up to end of text.
+        self.tool_call_regex = regex.compile(r"<tool_call>(.*?)</tool_call>|<tool_call>(.*?)$", regex.DOTALL)
+        self.tool_call_function_regex = regex.compile(r"<function=(.*?)</function>|<function=(.*)$", regex.DOTALL)
+        self.tool_call_parameter_regex = regex.compile(r"<parameter=(.*?)</parameter>|<parameter=(.*?)$", regex.DOTALL)
+
+    def _parse_xml_function_call(
+        self, function_call_str: str, tools: Optional[list[OpenAIFunctionToolSchema]]
+    ) -> FunctionCall:
+        """Convert the text following ``<function=`` into a ``FunctionCall``.
+
+        Args:
+            function_call_str: Function name, ``>``, then the ``<parameter=...>`` elements.
+            tools: Tool schemas used to look up declared parameter types; may be ``None``,
+                in which case values are not type-converted (only ``null`` becomes ``None``).
+
+        Returns:
+            FunctionCall: The function name plus its arguments serialized as a JSON string.
+
+        Raises:
+            ValueError: If the function name or a parameter name is not terminated by ``>``.
+        """
+
+        def get_arguments_config(func_name: str) -> dict:
+            # ``tools`` may be None (the default of extract_tool_calls). Iterating None would raise a
+            # TypeError that extract_tool_calls swallows, silently discarding every parsed tool call.
+            if tools is None:
+                return {}
+            for config in tools:
+                if config.type == "function" and config.function.name == func_name:
+                    properties = config.function.parameters.properties
+                    return {k: v.model_dump() for k, v in properties.items()}
+            logger.warning(f"Tool '{func_name}' is not defined in the tools list.")
+            return {}
+
+        def convert_param_value(param_value: str, param_name: str, param_config: dict, func_name: str) -> Any:
+            # Handle null value for any type
+            if param_value.lower() == "null":
+                return None
+
+            if param_name not in param_config:
+                if param_config != {}:
+                    logger.warning(
+                        f"Parsed parameter '{param_name}' is not defined in the tool "
+                        f"parameters for tool '{func_name}', directly returning the string value."
+                    )
+                return param_value
+
+            if isinstance(param_config[param_name], dict) and "type" in param_config[param_name]:
+                param_type = str(param_config[param_name]["type"]).strip().lower()
+            else:
+                param_type = "string"
+            if param_type in ["string", "str", "text", "varchar", "char", "enum"]:
+                return param_value
+            elif param_type.startswith(("int", "uint", "long", "short", "unsigned")):
+                try:
+                    param_value = int(param_value)
+                except Exception:
+                    logger.warning(
+                        f"Parsed value '{param_value}' of parameter '{param_name}' is not an integer in tool "
+                        f"'{func_name}', degenerating to string."
+                    )
+                return param_value
+            elif param_type.startswith(("num", "float")):
+                try:
+                    float_param_value = float(param_value)
+                    param_value = (
+                        float_param_value if float_param_value - int(float_param_value) != 0 else int(float_param_value)
+                    )
+                except Exception:
+                    logger.warning(
+                        f"Parsed value '{param_value}' of parameter '{param_name}' is not a float in tool "
+                        f"'{func_name}', degenerating to string."
+                    )
+                return param_value
+            elif param_type in ["boolean", "bool", "binary"]:
+                param_value = param_value.lower()
+                if param_value not in ["true", "false"]:
+                    logger.warning(
+                        f"Parsed value '{param_value}' of parameter '{param_name}' is not a "
+                        f"boolean (`true` or `false`) in tool '{func_name}', degenerating to false."
+                    )
+                return param_value == "true"
+            else:
+                if param_type == "object" or param_type.startswith("dict"):
+                    try:
+                        param_value = json.loads(param_value)
+                        return param_value
+                    except Exception:
+                        logger.warning(
+                            f"Parsed value '{param_value}' of parameter '{param_name}' is not a valid "
+                            f"JSON object in tool '{func_name}', will try other methods to parse it."
+                        )
+                # NOTE(review): eval() runs arbitrary Python taken from model output. Consider
+                # ast.literal_eval if only literal values are expected for these parameter types.
+                try:
+                    param_value = eval(param_value)
+                except Exception:
+                    logger.warning(
+                        f"Parsed value '{param_value}' of parameter '{param_name}' cannot be converted "
+                        f"via Python `eval()` in tool '{func_name}', degenerating to string."
+                    )
+                return param_value
+
+        # Extract function name
+        end_index = function_call_str.index(">")
+        function_name = function_call_str[:end_index]
+        param_config = get_arguments_config(function_name)
+        parameters = function_call_str[end_index + 1 :]
+        param_dict = {}
+        for match in self.tool_call_parameter_regex.findall(parameters):
+            match_text = match[0] if match[0] else match[1]
+            idx = match_text.index(">")
+            param_name = match_text[:idx]
+            param_value = str(match_text[idx + 1 :])
+            # Remove prefix and trailing \n
+            if param_value.startswith("\n"):
+                param_value = param_value[1:]
+            if param_value.endswith("\n"):
+                param_value = param_value[:-1]
+
+            param_dict[param_name] = convert_param_value(param_value, param_name, param_config, function_name)
+        return FunctionCall(name=function_name, arguments=json.dumps(param_dict, ensure_ascii=False))
+
+    def _get_function_calls(self, model_output: str) -> list[str]:
+        """Return the captured text of every ``<function=...`` element found in ``model_output``."""
+        # Find all tool calls
+        matched_ranges = self.tool_call_regex.findall(model_output)
+        raw_tool_calls = [match[0] if match[0] else match[1] for match in matched_ranges]
+
+        # Back-off strategy if no tool_call tags found
+        if not raw_tool_calls:
+            raw_tool_calls = [model_output]
+
+        raw_function_calls = []
+        for tool_call in raw_tool_calls:
+            raw_function_calls.extend(self.tool_call_function_regex.findall(tool_call))
+
+        function_calls = [match[0] if match[0] else match[1] for match in raw_function_calls]
+        return function_calls
+
+    @rollout_trace_op
+    async def extract_tool_calls(
+        self, responses_ids: list[int], tools: Optional[list[OpenAIFunctionToolSchema]] = None
+    ) -> tuple[str, list[FunctionCall]]:
+        """Decode ``responses_ids`` and extract the tool calls it contains.
+
+        Args:
+            responses_ids: The ids of the responses.
+            tools: Optional tool schemas used to convert parameter values to their declared types.
+
+        Returns:
+            ``(content, tool_calls)`` where ``content`` is the text before the first ``<tool_call>``.
+            When nothing is found or parsing fails, the full decoded text and an empty list.
+        """
+        loop = get_event_loop()
+        text = await loop.run_in_executor(None, self.tokenizer.decode, responses_ids)
+        if self.tool_call_start_token not in text:
+            return text, []
+
+        try:
+            function_calls = self._get_function_calls(text)
+            if not function_calls:
+                return text, []
+
+            tool_calls = [
+                self._parse_xml_function_call(function_call_str, tools) for function_call_str in function_calls
+            ]
+
+            # Extract content before tool calls
+            content_index = text.find(self.tool_call_start_token)
+            content_index = content_index if content_index >= 0 else text.find(self.tool_call_prefix)
+            content = text[:content_index]  # .rstrip()
+
+            return content, tool_calls
+        except Exception as e:
+            logger.exception(f"Error in extracting tool call from response: {e}")
+            return text, []

From a6976314ce3b4368b4ea9241f1529b7bdb9c963f Mon Sep 17 00:00:00 2001
From: wuxibin <wuxibin@bytedance.com>
Date: Tue, 10 Mar 2026 20:36:47 +0800
Subject: [PATCH 10/11] fix apply_chat_template

---
 verl/experimental/agent_loop/agent_loop.py     | 18 +++++++++++-------
 .../experimental/agent_loop/tool_agent_loop.py |  3 ++-
 verl/utils/chat_template.py                    | 15 ++++++---------
 verl/utils/dataset/multiturn_sft_dataset.py    |  4 ++--
 4 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py
index 7c11f2ca6cf..0bd4ca42d8f 100644
--- a/verl/experimental/agent_loop/agent_loop.py
+++ b/verl/experimental/agent_loop/agent_loop.py
@@ -35,7 +35,7 @@
 from verl.experimental.agent_loop.utils import resolve_config_path
 from verl.protocol import DataProto
 from verl.single_controller.ray.base import RayResourcePool, RayWorkerGroup
-from verl.utils.chat_template import apply_chat_template_single_turn, initialize_system_prompt
+from verl.utils.chat_template import apply_chat_template, initialize_system_prompt
 from verl.utils.config import omega_conf_to_dataclass
 from verl.utils.dataset.rl_dataset import RLHFDataset, get_dataset_class
 from verl.utils.model import compute_position_id_with_mask
@@ -274,7 +274,7 @@ async def apply_chat_template(
         if self.processor is not None:
             raw_prompt = await self.loop.run_in_executor(
                 None,
-                lambda: apply_chat_template_single_turn(
+                lambda: apply_chat_template(
                     self.processor,
                     messages,
                     tools=tools,
@@ -303,7 +303,7 @@ async def apply_chat_template(
         else:
             tokenized_prompt = await self.loop.run_in_executor(
                 None,
-                lambda: apply_chat_template_single_turn(
+                lambda: apply_chat_template(
                     self.tokenizer,
                     messages,
                     tools=tools,
@@ -678,15 +678,19 @@ def _compute_position_ids(self, input_ids, attention_mask, multi_modal_inputs) -
         if self.processor is None:
             return compute_position_id_with_mask(attention_mask)  # (1, seq_len)
 
-        image_grid_thw = multi_modal_inputs.get("image_grid_thw")
-        video_grid_thw = multi_modal_inputs.get("video_grid_thw")
+        # For transformers>=5.0.0, mm_token_type_ids is only used to calculate position ids.
+        if multi_modal_inputs.pop("mm_token_type_ids", None) is not None:
+            multi_modal_inputs = multi_modal_inputs.copy()
+            mm_token_type_ids = torch.zeros_like(input_ids)
+            mm_token_type_ids[0][input_ids[0] == self.processor.image_token_id] = 1
+            mm_token_type_ids[0][input_ids[0] == self.processor.video_token_id] = 2
+            multi_modal_inputs["mm_token_type_ids"] = mm_token_type_ids
 
         # Model's get_rope_index has been dynamically bind to the processor.
         vision_position_ids, _ = self.processor.get_rope_index(
             input_ids=input_ids,
-            image_grid_thw=image_grid_thw,
-            video_grid_thw=video_grid_thw,
             attention_mask=attention_mask,
+            **multi_modal_inputs,
         )
         vision_position_ids = vision_position_ids.transpose(0, 1)  # (3, 1, seq_len) => (1, 3, seq_len)
 
diff --git a/verl/experimental/agent_loop/tool_agent_loop.py b/verl/experimental/agent_loop/tool_agent_loop.py
index cd330d53d61..36ac11c498d 100644
--- a/verl/experimental/agent_loop/tool_agent_loop.py
+++ b/verl/experimental/agent_loop/tool_agent_loop.py
@@ -249,7 +249,8 @@ async def _handle_generating_state(
             return AgentState.TERMINATED
 
         # Extract tool calls
-        _, agent_data.tool_calls = await self.tool_parser.extract_tool_calls(agent_data.response_ids, self.tool_schemas)
+        tools = [tool.tool_schema for tool in self.tools.values()]
+        _, agent_data.tool_calls = await self.tool_parser.extract_tool_calls(agent_data.response_ids, tools)
 
         # Handle interaction if needed
         if self.interaction_config_file:
diff --git a/verl/utils/chat_template.py b/verl/utils/chat_template.py
index 5dcef56afa1..39a1dd9cdf8 100644
--- a/verl/utils/chat_template.py
+++ b/verl/utils/chat_template.py
@@ -50,7 +50,7 @@ def extract_system_prompt_and_generation(tokenizer):
     return system_prompt, generate_prompt
 
 
-def apply_chat_template_single_turn(
+def apply_chat_template(
     processor: PreTrainedTokenizerBase | ProcessorMixin,
     messages: list[dict],
     *,
@@ -58,25 +58,23 @@ def apply_chat_template_single_turn(
     add_generation_prompt: bool = True,
     tools=None,
     return_dict: bool = False,
-    return_mm_token_type_ids: bool = False,
     **kwargs,
 ) -> list[int] | str:
-    """apply_chat_template to a single turn's messages.
+    """apply_chat_template to messages with special attention to template requiring
+    at least one user message, e.g. Qwen3.5.
 
     Args:
         processor: tokenizer or processor.
-        messages: list[dict], single turn messages.
+        messages: list[dict], messages.
         tokenize: bool, whether to tokenize the output.
         add_generation_prompt: bool, whether to add generation prompt.
         tools: list[dict], tools schema.
         return_dict: bool, whether to return a dict.
-        return_mm_token_type_ids: bool, whether to return multimodal token type ids.
         **kwargs: additional arguments for apply_chat_template.
 
     Returns:
         list[int] | str: tokenized ids or text string.
     """
-    assert isinstance(messages, list) and len(messages) == 1, f"messages must be a single turn, got {messages}"
     try:
         return processor.apply_chat_template(
             messages,
@@ -84,7 +82,6 @@ def apply_chat_template_single_turn(
             add_generation_prompt=add_generation_prompt,
             tools=tools,
             return_dict=return_dict,
-            return_mm_token_type_ids=return_mm_token_type_ids,
             **kwargs,
         )
     except Exception:
@@ -96,7 +93,6 @@ def apply_chat_template_single_turn(
             add_generation_prompt=False,
             tools=tools,
             return_dict=return_dict,
-            return_mm_token_type_ids=return_mm_token_type_ids,
             **kwargs,
         )
         output = processor.apply_chat_template(
@@ -105,7 +101,6 @@ def apply_chat_template_single_turn(
             add_generation_prompt=add_generation_prompt,
             tools=tools,
             return_dict=return_dict,
-            return_mm_token_type_ids=return_mm_token_type_ids,
             **kwargs,
         )
 
@@ -123,4 +118,6 @@ def apply_chat_template_single_turn(
             prefix_len = dummy_user_prefix["input_ids"].shape[1]
             output["input_ids"] = output["input_ids"][:, prefix_len:]
             output["attention_mask"] = output["attention_mask"][:, prefix_len:]
+            if "mm_token_type_ids" in output:
+                output["mm_token_type_ids"] = output["mm_token_type_ids"][:, prefix_len:]
             return output
diff --git a/verl/utils/dataset/multiturn_sft_dataset.py b/verl/utils/dataset/multiturn_sft_dataset.py
index 8f3ae1e166a..846e84c431e 100644
--- a/verl/utils/dataset/multiturn_sft_dataset.py
+++ b/verl/utils/dataset/multiturn_sft_dataset.py
@@ -32,7 +32,7 @@
 
 from verl.models.transformers.qwen2_vl import get_rope_index
 from verl.utils import hf_tokenizer
-from verl.utils.chat_template import apply_chat_template_single_turn, extract_system_prompt_and_generation
+from verl.utils.chat_template import apply_chat_template, extract_system_prompt_and_generation
 from verl.utils.dataset.dataset_utils import DatasetPadMode
 from verl.utils.dataset.vision_utils import process_image, process_video
 from verl.utils.fs import copy_local_path_from_hdfs
@@ -208,7 +208,7 @@ def _process_single_message(
         if enable_thinking is not None:
             apply_chat_template_kwargs["enable_thinking"] = enable_thinking
 
-        inputs = apply_chat_template_single_turn(
+        inputs = apply_chat_template(
             processor,
             messages=[message],
             tools=tools,

From 7c0b499cb4fecf5976e79bdc109cf669604e67e4 Mon Sep 17 00:00:00 2001
From: wuxibin <wuxibin@bytedance.com>
Date: Tue, 10 Mar 2026 22:55:34 +0800
Subject: [PATCH 11/11] fix processor

---
 verl/experimental/agent_loop/agent_loop.py |  2 +-
 verl/utils/tokenizer.py                    | 14 ++++++++++----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py
index 0bd4ca42d8f..706656eb402 100644
--- a/verl/experimental/agent_loop/agent_loop.py
+++ b/verl/experimental/agent_loop/agent_loop.py
@@ -678,7 +678,7 @@ def _compute_position_ids(self, input_ids, attention_mask, multi_modal_inputs) -
         if self.processor is None:
             return compute_position_id_with_mask(attention_mask)  # (1, seq_len)
 
-        # For transformers>=5.0.0, mm_token_type_ids is only used to calculate position ids.
+        # For transformers>=5.3.0, mm_token_type_ids is only used to calculate position ids.
         if multi_modal_inputs.pop("mm_token_type_ids", None) is not None:
             multi_modal_inputs = multi_modal_inputs.copy()
             mm_token_type_ids = torch.zeros_like(input_ids)
diff --git a/verl/utils/tokenizer.py b/verl/utils/tokenizer.py
index 5a99f1e695c..0cd521f9e96 100644
--- a/verl/utils/tokenizer.py
+++ b/verl/utils/tokenizer.py
@@ -126,27 +126,33 @@ def hf_processor(name_or_path, **kwargs):
 
         # Bind vlm model's get_rope_index method to processor
         processor.config = config
+        model_class = None
         match processor.__class__.__name__:
             case "Qwen2VLProcessor":
                 from transformers.models.qwen2_vl import Qwen2VLModel
 
-                processor.get_rope_index = types.MethodType(Qwen2VLModel.get_rope_index, processor)
+                model_class = Qwen2VLModel
             case "Qwen2_5_VLProcessor":
                 from transformers.models.qwen2_5_vl import Qwen2_5_VLModel
 
-                processor.get_rope_index = types.MethodType(Qwen2_5_VLModel.get_rope_index, processor)
+                model_class = Qwen2_5_VLModel
             case "Qwen3VLProcessor":
                 from transformers.models.qwen3_vl import Qwen3VLModel
 
-                processor.get_rope_index = types.MethodType(Qwen3VLModel.get_rope_index, processor)
+                model_class = Qwen3VLModel
             case "Glm4vImageProcessor":
                 from transformers.models.glm4v import Glm4vModel
 
-                processor.get_rope_index = types.MethodType(Glm4vModel.get_rope_index, processor)
+                model_class = Glm4vModel
             case "MllamaProcessor":
                 pass  # MllamaProcessor and MllamaModel doesn't have get_rope_index property
             case _:
                 raise ValueError(f"Unsupported processor type: {processor.__class__.__name__}")
+
+        if model_class is not None:
+            processor.get_rope_index = types.MethodType(model_class.get_rope_index, processor)
+            if hasattr(model_class, "get_vision_position_ids"):
+                processor.get_vision_position_ids = types.MethodType(model_class.get_vision_position_ids, processor)
     except Exception as e:
         processor = None
         # TODO(haibin.lin): try-catch should be removed after adding transformer version req to setup.py to avoid