Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
166 changes: 166 additions & 0 deletions examples/grpo_trainer/run_qwen3_5-35b-megatron.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
#!/usr/bin/env bash
# Qwen3.5-35B-A3B MoE GRPO RL with Megatron (single node, 8 GPUs, geo3k dataset)
#
# Notes on vllm:
# As of 20260225, the latest vllm nightly does not support qwen3.5 rollout. To use this script, you need to
# 1. wait until vllm supports qwen3.5 officially, and build a verl docker with that version of vllm, or
# 2. self-build a verl docker image with vllm built from source with qwen3.5 support (main branch 20260225 is OK)
# I succeeded in running this script with the main branch of vllm on 20260225, yet there are still some minor issues
# in the vllm qwen3.5 initialization that need to be fixed. Also, cuda_graph is somehow not working and needs to be
# fixed, either by the verl team with support for vllm 0.16, or by the vllm team.
# Requirements:
# - 8 GPUs (80GB each, e.g. 1x8 H100/H200)
# - Additional packages on top of the base image:
# pip install --upgrade transformers
# pip install flash-linear-attention
# pip install -U git+https://github.com/ISEEKYAN/mbridge.git
# - Megatron-LM dev branch with Qwen3.5 GDN support
#
# Qwen3.5 architecture notes:
# Qwen3.5 uses Gated Delta Net (GDN) linear attention which currently does
# NOT support packed sequences (THD format) in Megatron-LM. Therefore:
# - model.use_remove_padding=False (deprecated option that will be removed in the future; forces bshd compute format)
# - actor.megatron.use_remove_padding=False (forces bshd compute format)
# - actor.use_dynamic_bsz=False (required for bshd mode)
#
# Once Megatron-LM adds THD support for Qwen3.5 GDN, use_remove_padding
# can be set to True for better performance.
#
# Tested parallelism config (8 GPUs / 1 node):
# TP=2 PP=1 CP=1 EP=8 ETP=1 GEN_TP=8
#

export CUDA_DEVICE_MAX_CONNECTIONS=1
export VLLM_USE_V1=1
export VLLM_ALLREDUCE_USE_SYMM_MEM=0

set -xeuo pipefail

########################### Quick Config ###########################

# Training parallelism: tensor / pipeline / context / expert / expert-tensor
# parallel sizes. GEN_TP is the tensor-parallel size used by the vllm rollout.
# All are overridable from the environment.
TP=${TP:-2}
PP=${PP:-1}
CP=${CP:-1}
EP=${EP:-8}
ETP=${ETP:-1}
GEN_TP=${GEN_TP:-8}

# Offload params / grads / optimizer state to host memory (applied to the
# actor and ref megatron configs below) to fit the 35B MoE on a single node.
ALL_OFFLOAD=${ALL_OFFLOAD:-True}

rollout_name="vllm"
project_name='verl_grpo_qwen3_5_35b_geo3k'
exp_name='qwen3_5_35b_megatron'
adv_estimator=grpo

# NOTE(review): this default is a bare local directory name; presumably it
# should be a HF repo id or an absolute checkpoint path — confirm before use.
HF_MODEL_PATH=${HF_MODEL_PATH:-"Qwen3.5-35B-A3B"}
train_path=${train_path:-$HOME/data/geo3k/train.parquet}
test_path=${test_path:-$HOME/data/geo3k/test.parquet}

########################### Parameter Arrays ###########################
# Each array is a group of Hydra overrides, expanded onto the launch command.

DATA=(
data.train_files=${train_path}
data.val_files=${test_path}
data.train_batch_size=32
data.max_prompt_length=1024
data.max_response_length=2048
data.truncation='error'
data.filter_overlong_prompts=True
)

MODEL=(
actor_rollout_ref.model.path=${HF_MODEL_PATH}
actor_rollout_ref.model.trust_remote_code=True
# GDN does not support packed (THD) sequences yet; keep bshd format.
actor_rollout_ref.model.use_remove_padding=False
)

ACTOR=(
actor_rollout_ref.actor.optim.lr=1e-6
actor_rollout_ref.actor.ppo_mini_batch_size=32
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=4096
# Dynamic batch size is incompatible with the bshd compute format.
actor_rollout_ref.actor.use_dynamic_bsz=False
actor_rollout_ref.actor.use_kl_loss=True
actor_rollout_ref.actor.kl_loss_coef=0.01
actor_rollout_ref.actor.kl_loss_type=low_var_kl
actor_rollout_ref.actor.entropy_coeff=0
actor_rollout_ref.actor.megatron.use_mbridge=True
actor_rollout_ref.actor.megatron.vanilla_mbridge=True
actor_rollout_ref.actor.megatron.use_remove_padding=False
actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${TP}
actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${PP}
actor_rollout_ref.actor.megatron.context_parallel_size=${CP}
actor_rollout_ref.actor.megatron.expert_model_parallel_size=${EP}
actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${ETP}
actor_rollout_ref.actor.megatron.param_offload=${ALL_OFFLOAD}
actor_rollout_ref.actor.megatron.optimizer_offload=${ALL_OFFLOAD}
actor_rollout_ref.actor.megatron.grad_offload=${ALL_OFFLOAD}
actor_rollout_ref.actor.megatron.dtype=bfloat16
# Hydra override prefixes: '+' appends a key absent from the base config,
# '++' appends-or-overrides regardless of whether the key already exists.
++actor_rollout_ref.actor.megatron.override_transformer_config.attention_backend=auto
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_aux_loss_coeff=0.01
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_z_loss_coeff=0.001
+actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=1
+actor_rollout_ref.actor.optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=True
+actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True
+actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True
)

ROLLOUT=(
actor_rollout_ref.rollout.name=${rollout_name}
actor_rollout_ref.rollout.tensor_model_parallel_size=${GEN_TP}
actor_rollout_ref.rollout.gpu_memory_utilization=0.6
actor_rollout_ref.rollout.n=5
actor_rollout_ref.rollout.mode=async
# cuda_graph is broken for qwen3.5 in vllm as of 20260225 (see header note),
# so force eager execution during rollout.
actor_rollout_ref.rollout.enforce_eager=True
actor_rollout_ref.rollout.dtype=bfloat16
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1
actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=False
actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=4096
)

REF=(
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1
actor_rollout_ref.ref.log_prob_use_dynamic_bsz=False
actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=4096
actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${TP}
actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${PP}
actor_rollout_ref.ref.megatron.context_parallel_size=${CP}
actor_rollout_ref.ref.megatron.expert_model_parallel_size=${EP}
actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${ETP}
actor_rollout_ref.ref.megatron.param_offload=${ALL_OFFLOAD}
)

ALGORITHM=(
algorithm.adv_estimator=${adv_estimator}
algorithm.use_kl_in_reward=False
)

TRAINER=(
trainer.critic_warmup=0
trainer.logger='["console","wandb"]'
trainer.project_name=${project_name}
trainer.experiment_name=${exp_name}
trainer.n_gpus_per_node=8
trainer.nnodes=1
trainer.save_freq=20
trainer.val_before_train=False
trainer.test_freq=5
trainer.total_epochs=15
)

########################### Launch ###########################

# Trailing "$@" lets callers append/override any Hydra option on the CLI.
python3 -m verl.trainer.main_ppo \
--config-path=config \
--config-name='ppo_megatron_trainer.yaml' \
"${DATA[@]}" \
"${ALGORITHM[@]}" \
"${MODEL[@]}" \
"${ROLLOUT[@]}" \
"${ACTOR[@]}" \
"${REF[@]}" \
"${TRAINER[@]}" \
"$@"
142 changes: 142 additions & 0 deletions examples/sft/gsm8k/run_qwen3_5_megatron.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
#!/usr/bin/env bash
# Qwen3.5-397B-A17B SFT with Megatron backend + mbridge
#
# Requirements:
# - 128+ GPUs (80GB each, e.g. 16x8 H100/H200)
# - Docker: verlai/verl:vllm015 (or equivalent)
# - Additional packages on top of the base image:
# pip install --upgrade transformers
# pip install flash-linear-attention
# pip install -U git+https://github.com/ISEEKYAN/mbridge.git
# - Megatron-LM dev branch with Qwen3.5 GDN support
#
# Qwen3.5 architecture notes:
# Qwen3.5 uses Gated Delta Net (GDN) linear attention which currently does
# NOT support packed sequences (THD format) in Megatron-LM. Therefore:
# - engine.use_remove_padding=False (forces bshd compute format)
# - data.use_dynamic_bsz=False (required for bshd mode)
#
# Once https://github.com/NVIDIA/Megatron-LM/pull/2644 is merged, THD
# format will be supported and engine.use_remove_padding can be set to True
# for better performance.
#
# Tested parallelism config (128 GPUs / 16 nodes):
# TP=2 PP=4 EP=32 CP=1

set -xeuo pipefail

# ============================================================
# Distributed
# ============================================================
# torchrun rendezvous settings. NODE_RANK must be set per node
# (0..NNODES-1); MASTER_ADDR/MASTER_PORT must agree across all nodes.
NUM_GPUS=${NUM_GPUS:-8}
MASTER_ADDR=${MASTER_ADDR:-localhost}
MASTER_PORT=${MASTER_PORT:-29500}
NNODES=${NNODES:-16}
NODE_RANK=${NODE_RANK:-0}

# ============================================================
# Data
# ============================================================
DATASET_DIR=${DATASET_DIR:-~/dataset}
TRAIN_FILES=${TRAIN_FILES:-${DATASET_DIR}/train.parquet}

# ============================================================
# Model
# ============================================================
MODEL_PATH=${MODEL_PATH:-Qwen/Qwen3.5-397B-A17B}

# ============================================================
# Parallelism
# ============================================================
# TP/PP/CP/EP/ETP: tensor / pipeline / context / expert / expert-tensor
# parallel sizes. VPP_SIZE defaults to the literal string "null", which
# Hydra/OmegaConf parses as None, i.e. virtual pipeline parallelism disabled.
TP_SIZE=${TP_SIZE:-2}
PP_SIZE=${PP_SIZE:-4}
VPP_SIZE=${VPP_SIZE:-null}
CP_SIZE=${CP_SIZE:-1}
EP_SIZE=${EP_SIZE:-32}
ETP_SIZE=${ETP_SIZE:-1}

# ============================================================
# Training
# ============================================================
TRAIN_BATCH_SIZE=${TRAIN_BATCH_SIZE:-128}
MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-2}
MAX_LENGTH=${MAX_LENGTH:-2048}
LR=${LR:-2e-5}
MIN_LR=${MIN_LR:-2e-6}
DTYPE=${DTYPE:-bfloat16}

BACKEND=megatron
RESUME_MODE=${RESUME_MODE:-disable}

project_name=verl_sft_qwen3_5
exp_name=qwen3_5-${BACKEND}-tp${TP_SIZE}-pp${PP_SIZE}-cp${CP_SIZE}-ep${EP_SIZE}
ckpts_home=${ckpts_home:-~/verl/checkpoints/${project_name}/${exp_name}}
mkdir -p "${ckpts_home}"

# ============================================================
# Engine config
# ============================================================
# Key Qwen3.5 settings:
# engine.use_remove_padding=False - GDN requires bshd format (no THD)
# engine.vanilla_mbridge=True - use mbridge (not megatron-bridge)
# The '+' prefixes below are Hydra syntax for appending keys that are not
# present in the base config. Note ENGINE_CONFIG is expanded UNQUOTED at the
# launch site on purpose, so it word-splits into individual overrides.
ENGINE_CONFIG="\
engine=${BACKEND} \
optim=${BACKEND} \
optim.lr=${LR} \
optim.min_lr=${MIN_LR} \
optim.lr_warmup_steps=10 \
optim.weight_decay=0.1 \
optim.betas='[0.9,0.95]' \
optim.clip_grad=1.0 \
optim.lr_warmup_init=0 \
optim.lr_decay_style=cosine \
+optim.override_optimizer_config.optimizer_offload_fraction=1 \
+optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=True \
+optim.override_optimizer_config.use_precision_aware_optimizer=True \
+optim.override_optimizer_config.optimizer_cpu_offload=True \
engine.tensor_model_parallel_size=${TP_SIZE} \
engine.pipeline_model_parallel_size=${PP_SIZE} \
engine.virtual_pipeline_model_parallel_size=${VPP_SIZE} \
engine.context_parallel_size=${CP_SIZE} \
engine.expert_model_parallel_size=${EP_SIZE} \
engine.expert_tensor_parallel_size=${ETP_SIZE} \
engine.use_mbridge=True \
engine.vanilla_mbridge=True \
engine.dtype=${DTYPE} \
engine.use_remove_padding=False \
engine.override_transformer_config.attention_backend=auto \
+engine.override_transformer_config.recompute_method=uniform \
+engine.override_transformer_config.recompute_granularity=full \
+engine.override_transformer_config.recompute_num_layers=1"

# ============================================================
# Launch
# ============================================================
torchrun \
--nproc_per_node=${NUM_GPUS} \
--nnodes=${NNODES} \
--node_rank=${NODE_RANK} \
--master_addr=${MASTER_ADDR} \
--master_port=${MASTER_PORT} \
-m verl.trainer.sft_trainer \
data.train_files="${TRAIN_FILES}" \
data.train_batch_size=${TRAIN_BATCH_SIZE} \
data.micro_batch_size_per_gpu=${MICRO_BATCH_SIZE} \
data.max_length=${MAX_LENGTH} \
data.pad_mode=no_padding \
data.truncation=error \
data.use_dynamic_bsz=False \
data.max_token_len_per_gpu=${MAX_LENGTH} \
data.messages_key=messages \
model.path=${MODEL_PATH} \
model.use_remove_padding=False \
model.trust_remote_code=True \
${ENGINE_CONFIG} \
trainer.test_freq=-1 \
trainer.save_freq=500 \
trainer.logger="['console']" \
trainer.project_name="${project_name}" \
trainer.experiment_name="${exp_name}" \
trainer.total_epochs=1 \
trainer.default_local_dir="${ckpts_home}" \
trainer.resume_mode=${RESUME_MODE}
7 changes: 7 additions & 0 deletions verl/experimental/agent_loop/agent_loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,13 @@ async def apply_chat_template(
Returns:
list[int]: Prompt token ids.
.. todo::
Templates that require a user message (e.g. Qwen 3.5) will fail
when *messages* contains only tool/assistant turns. Migrate to
:func:`verl.utils.chat_template.apply_chat_template_single_turn`
with ``full_conversation`` fallback — see the SFT dataset for the
reference pattern.
"""
if self.processor is not None:
raw_prompt = await self.loop.run_in_executor(
Expand Down
21 changes: 16 additions & 5 deletions verl/models/mcore/model_forward.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,22 +122,33 @@ def model_forward(
When using the bshd format, we have to add paddings to the input_ids to meet the longest sequence length,
so it is recommended to disable dynamic batch size and set batch size to 1
"""
assert not vision_model, "vision model does not support bshd format"
assert fp8 is None, "fp8 is not supported for bshd format yet"

batch_size, sequence_length = attention_mask.shape[:2]
position_ids_for_preprocess = (
torch.arange(sequence_length, device=input_ids.device).unsqueeze(0).expand(batch_size, -1)
if vision_model
else position_ids
)
pre_process_for_bshd = True if vision_model else pre_process
new_input_ids, new_attention_mask, new_position_ids = preprocess_bshd(
input_ids, attention_mask, position_ids, sequence_parallel=sp, pre_process=pre_process
input_ids,
attention_mask,
position_ids_for_preprocess,
sequence_parallel=sp,
pre_process=pre_process_for_bshd,
)
output_orig = model(
input_ids=new_input_ids,
position_ids=new_position_ids,
position_ids=None if vision_model else new_position_ids,
attention_mask=new_attention_mask,
**model_kwargs,
)
if post_process and logits_processor is not None:
args = {
k: preprocess_bshd(v, attention_mask, position_ids, sequence_parallel=sp, pre_process=True)[0]
k: preprocess_bshd(
v, attention_mask, position_ids_for_preprocess, sequence_parallel=sp, pre_process=True
)[0]
for k, v in logits_processor_args.items()
}
output_dict = logits_processor(output_orig, **args)
Expand Down Expand Up @@ -258,7 +269,7 @@ def gptmodel_forward_no_padding(
output_orig = model(
input_ids=input_ids_bshd,
attention_mask=attention_mask_bshd,
position_ids=position_ids_bshd,
position_ids=None if vision_model else position_ids_bshd,
**model_kwargs,
)
if post_process and logits_processor is not None:
Expand Down
2 changes: 2 additions & 0 deletions verl/models/mcore/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ class SupportedVLM(Enum):
QWEN2_5_VL = "Qwen2_5_VLForConditionalGeneration"
QWEN3_MOE_VL = "Qwen3VLMoeForConditionalGeneration"
QWEN3_VL = "Qwen3VLForConditionalGeneration"
QWEN3_5_MOE_VL = "Qwen3_5MoeForConditionalGeneration"
QWEN3_5_VL = "Qwen3_5ForConditionalGeneration"


supported_vlm = [member.value for member in SupportedVLM]
Expand Down
Loading
Loading