Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
166 changes: 166 additions & 0 deletions examples/grpo_trainer/run_qwen3_5-35b-megatron.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
#!/usr/bin/env bash
# Qwen3.5-35B-A3B MoE GRPO RL with Megatron (single node, 8 GPUs, geo3k dataset)
#
# Notes on vllm:
# As of 20260225, the latest vllm nightly does not support qwen3.5 rollout. To use this script, you need to
# 1. wait until vllm supports qwen3.5 officially, and build a verl docker with that version of vllm, or
# 2. self-build a verl docker image with vllm built from source with qwen3.5 support (main branch 20260225 is OK)
# I succeeded in running this script with the main branch of vllm on 20260225, yet there are still some minor issues
# in the vllm qwen3.5 initialization that need to be fixed. Also, cuda_graph is somehow not working and needs to be
# fixed, either by the verl team with support for vllm 0.16, or by the vllm team.
# Requirements:
# - 8 GPUs (80GB each, e.g. 1x8 H100/H200)
# - Additional packages on top of the base image:
# pip install --upgrade transformers
# pip install flash-linear-attention
# pip install -U git+https://github.com/ISEEKYAN/mbridge.git
# - Megatron-LM dev branch with Qwen3.5 GDN support
#
# Qwen3.5 architecture notes:
# Qwen3.5 uses Gated Delta Net (GDN) linear attention which currently does
# NOT support packed sequences (THD format) in Megatron-LM. Therefore:
# - model.use_remove_padding=False (deprecated option that will be removed in the future; forces bshd compute format)
# - actor.megatron.use_remove_padding=False (forces bshd compute format)
# - actor.use_dynamic_bsz=False (required for bshd mode)
#
# Once Megatron-LM adds THD support for Qwen3.5 GDN, use_remove_padding
# can be set to True for better performance.
#
# Tested parallelism config (8 GPUs / 1 node):
# TP=2 PP=1 CP=1 EP=8 ETP=1 GEN_TP=8
#

export CUDA_DEVICE_MAX_CONNECTIONS=1
export VLLM_USE_V1=1
export VLLM_ALLREDUCE_USE_SYMM_MEM=0

set -xeuo pipefail

########################### Quick Config ###########################

# Training parallelism: tensor / pipeline / context / expert / expert-tensor
# parallel sizes. GEN_TP is the tensor-parallel size used by the vllm rollout.
# All are overridable from the environment.
TP=${TP:-2}
PP=${PP:-1}
CP=${CP:-1}
EP=${EP:-8}
ETP=${ETP:-1}
GEN_TP=${GEN_TP:-8}

# Offload params / grads / optimizer state to host memory (applied to the
# actor and ref megatron configs below) to fit the 35B MoE on a single node.
ALL_OFFLOAD=${ALL_OFFLOAD:-True}

rollout_name="vllm"
project_name='verl_grpo_qwen3_5_35b_geo3k'
exp_name='qwen3_5_35b_megatron'
adv_estimator=grpo

# NOTE(review): this default is a bare local directory name; presumably it
# should be a HF repo id or an absolute checkpoint path — confirm before use.
HF_MODEL_PATH=${HF_MODEL_PATH:-"Qwen3.5-35B-A3B"}
train_path=${train_path:-$HOME/data/geo3k/train.parquet}
test_path=${test_path:-$HOME/data/geo3k/test.parquet}

########################### Parameter Arrays ###########################
# Each array is a group of Hydra overrides, expanded onto the launch command.

DATA=(
data.train_files=${train_path}
data.val_files=${test_path}
data.train_batch_size=32
data.max_prompt_length=1024
data.max_response_length=2048
data.truncation='error'
data.filter_overlong_prompts=True
)

MODEL=(
actor_rollout_ref.model.path=${HF_MODEL_PATH}
actor_rollout_ref.model.trust_remote_code=True
# GDN does not support packed (THD) sequences yet; keep bshd format.
actor_rollout_ref.model.use_remove_padding=False
)

ACTOR=(
actor_rollout_ref.actor.optim.lr=1e-6
actor_rollout_ref.actor.ppo_mini_batch_size=32
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=4096
# Dynamic batch size is incompatible with the bshd compute format.
actor_rollout_ref.actor.use_dynamic_bsz=False
actor_rollout_ref.actor.use_kl_loss=True
actor_rollout_ref.actor.kl_loss_coef=0.01
actor_rollout_ref.actor.kl_loss_type=low_var_kl
actor_rollout_ref.actor.entropy_coeff=0
actor_rollout_ref.actor.megatron.use_mbridge=True
actor_rollout_ref.actor.megatron.vanilla_mbridge=True
actor_rollout_ref.actor.megatron.use_remove_padding=False
actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${TP}
actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${PP}
actor_rollout_ref.actor.megatron.context_parallel_size=${CP}
actor_rollout_ref.actor.megatron.expert_model_parallel_size=${EP}
actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${ETP}
actor_rollout_ref.actor.megatron.param_offload=${ALL_OFFLOAD}
actor_rollout_ref.actor.megatron.optimizer_offload=${ALL_OFFLOAD}
actor_rollout_ref.actor.megatron.grad_offload=${ALL_OFFLOAD}
actor_rollout_ref.actor.megatron.dtype=bfloat16
# Hydra override prefixes: '+' appends a key absent from the base config,
# '++' appends-or-overrides regardless of whether the key already exists.
++actor_rollout_ref.actor.megatron.override_transformer_config.attention_backend=auto
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_aux_loss_coeff=0.01
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_z_loss_coeff=0.001
+actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=1
+actor_rollout_ref.actor.optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=True
+actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True
+actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True
)

ROLLOUT=(
actor_rollout_ref.rollout.name=${rollout_name}
actor_rollout_ref.rollout.tensor_model_parallel_size=${GEN_TP}
actor_rollout_ref.rollout.gpu_memory_utilization=0.6
actor_rollout_ref.rollout.n=5
actor_rollout_ref.rollout.mode=async
# cuda_graph is broken for qwen3.5 in vllm as of 20260225 (see header note),
# so force eager execution during rollout.
actor_rollout_ref.rollout.enforce_eager=True
actor_rollout_ref.rollout.dtype=bfloat16
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1
actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=False
actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=4096
)

REF=(
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1
actor_rollout_ref.ref.log_prob_use_dynamic_bsz=False
actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=4096
actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${TP}
actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${PP}
actor_rollout_ref.ref.megatron.context_parallel_size=${CP}
actor_rollout_ref.ref.megatron.expert_model_parallel_size=${EP}
actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${ETP}
actor_rollout_ref.ref.megatron.param_offload=${ALL_OFFLOAD}
)

ALGORITHM=(
algorithm.adv_estimator=${adv_estimator}
algorithm.use_kl_in_reward=False
)

TRAINER=(
trainer.critic_warmup=0
trainer.logger='["console","wandb"]'
trainer.project_name=${project_name}
trainer.experiment_name=${exp_name}
trainer.n_gpus_per_node=8
trainer.nnodes=1
trainer.save_freq=20
trainer.val_before_train=False
trainer.test_freq=5
trainer.total_epochs=15
)

########################### Launch ###########################

# Trailing "$@" lets callers append/override any Hydra option on the CLI.
python3 -m verl.trainer.main_ppo \
--config-path=config \
--config-name='ppo_megatron_trainer.yaml' \
"${DATA[@]}" \
"${ALGORITHM[@]}" \
"${MODEL[@]}" \
"${ROLLOUT[@]}" \
"${ACTOR[@]}" \
"${REF[@]}" \
"${TRAINER[@]}" \
"$@"
142 changes: 142 additions & 0 deletions examples/sft/gsm8k/run_qwen3_5_megatron.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
#!/usr/bin/env bash
# Qwen3.5-397B-A17B SFT with Megatron backend + mbridge
#
# Requirements:
# - 128+ GPUs (80GB each, e.g. 16x8 H100/H200)
# - Docker: verlai/verl:vllm015 (or equivalent)
# - Additional packages on top of the base image:
# pip install --upgrade transformers
# pip install flash-linear-attention
# pip install -U git+https://github.com/ISEEKYAN/mbridge.git
# - Megatron-LM dev branch with Qwen3.5 GDN support
#
# Qwen3.5 architecture notes:
# Qwen3.5 uses Gated Delta Net (GDN) linear attention which currently does
# NOT support packed sequences (THD format) in Megatron-LM. Therefore:
# - engine.use_remove_padding=False (forces bshd compute format)
# - data.use_dynamic_bsz=False (required for bshd mode)
#
# Once https://github.com/NVIDIA/Megatron-LM/pull/2644 is merged, THD
# format will be supported and engine.use_remove_padding can be set to True
# for better performance.
#
# Tested parallelism config (128 GPUs / 16 nodes):
# TP=2 PP=4 EP=32 CP=1

set -xeuo pipefail

# ============================================================
# Distributed
# ============================================================
# torchrun rendezvous settings. NODE_RANK must be set per node
# (0..NNODES-1); MASTER_ADDR/MASTER_PORT must agree across all nodes.
NUM_GPUS=${NUM_GPUS:-8}
MASTER_ADDR=${MASTER_ADDR:-localhost}
MASTER_PORT=${MASTER_PORT:-29500}
NNODES=${NNODES:-16}
NODE_RANK=${NODE_RANK:-0}

# ============================================================
# Data
# ============================================================
DATASET_DIR=${DATASET_DIR:-~/dataset}
TRAIN_FILES=${TRAIN_FILES:-${DATASET_DIR}/train.parquet}

# ============================================================
# Model
# ============================================================
MODEL_PATH=${MODEL_PATH:-Qwen/Qwen3.5-397B-A17B}

# ============================================================
# Parallelism
# ============================================================
# TP/PP/CP/EP/ETP: tensor / pipeline / context / expert / expert-tensor
# parallel sizes. VPP_SIZE defaults to the literal string "null", which
# Hydra/OmegaConf parses as None, i.e. virtual pipeline parallelism disabled.
TP_SIZE=${TP_SIZE:-2}
PP_SIZE=${PP_SIZE:-4}
VPP_SIZE=${VPP_SIZE:-null}
CP_SIZE=${CP_SIZE:-1}
EP_SIZE=${EP_SIZE:-32}
ETP_SIZE=${ETP_SIZE:-1}

# ============================================================
# Training
# ============================================================
TRAIN_BATCH_SIZE=${TRAIN_BATCH_SIZE:-128}
MICRO_BATCH_SIZE=${MICRO_BATCH_SIZE:-2}
MAX_LENGTH=${MAX_LENGTH:-2048}
LR=${LR:-2e-5}
MIN_LR=${MIN_LR:-2e-6}
DTYPE=${DTYPE:-bfloat16}

BACKEND=megatron
RESUME_MODE=${RESUME_MODE:-disable}

project_name=verl_sft_qwen3_5
exp_name=qwen3_5-${BACKEND}-tp${TP_SIZE}-pp${PP_SIZE}-cp${CP_SIZE}-ep${EP_SIZE}
ckpts_home=${ckpts_home:-~/verl/checkpoints/${project_name}/${exp_name}}
mkdir -p "${ckpts_home}"

# ============================================================
# Engine config
# ============================================================
# Key Qwen3.5 settings:
# engine.use_remove_padding=False - GDN requires bshd format (no THD)
# engine.vanilla_mbridge=True - use mbridge (not megatron-bridge)
# The '+' prefixes below are Hydra syntax for appending keys that are not
# present in the base config. Note ENGINE_CONFIG is expanded UNQUOTED at the
# launch site on purpose, so it word-splits into individual overrides.
ENGINE_CONFIG="\
engine=${BACKEND} \
optim=${BACKEND} \
optim.lr=${LR} \
optim.min_lr=${MIN_LR} \
optim.lr_warmup_steps=10 \
optim.weight_decay=0.1 \
optim.betas='[0.9,0.95]' \
optim.clip_grad=1.0 \
optim.lr_warmup_init=0 \
optim.lr_decay_style=cosine \
+optim.override_optimizer_config.optimizer_offload_fraction=1 \
+optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=True \
+optim.override_optimizer_config.use_precision_aware_optimizer=True \
+optim.override_optimizer_config.optimizer_cpu_offload=True \
engine.tensor_model_parallel_size=${TP_SIZE} \
engine.pipeline_model_parallel_size=${PP_SIZE} \
engine.virtual_pipeline_model_parallel_size=${VPP_SIZE} \
engine.context_parallel_size=${CP_SIZE} \
engine.expert_model_parallel_size=${EP_SIZE} \
engine.expert_tensor_parallel_size=${ETP_SIZE} \
engine.use_mbridge=True \
engine.vanilla_mbridge=True \
engine.dtype=${DTYPE} \
engine.use_remove_padding=False \
engine.override_transformer_config.attention_backend=auto \
+engine.override_transformer_config.recompute_method=uniform \
+engine.override_transformer_config.recompute_granularity=full \
+engine.override_transformer_config.recompute_num_layers=1"

# ============================================================
# Launch
# ============================================================
torchrun \
--nproc_per_node=${NUM_GPUS} \
--nnodes=${NNODES} \
--node_rank=${NODE_RANK} \
--master_addr=${MASTER_ADDR} \
--master_port=${MASTER_PORT} \
-m verl.trainer.sft_trainer \
data.train_files="${TRAIN_FILES}" \
data.train_batch_size=${TRAIN_BATCH_SIZE} \
data.micro_batch_size_per_gpu=${MICRO_BATCH_SIZE} \
data.max_length=${MAX_LENGTH} \
data.pad_mode=no_padding \
data.truncation=error \
data.use_dynamic_bsz=False \
data.max_token_len_per_gpu=${MAX_LENGTH} \
data.messages_key=messages \
model.path=${MODEL_PATH} \
model.use_remove_padding=False \
model.trust_remote_code=True \
${ENGINE_CONFIG} \
trainer.test_freq=-1 \
trainer.save_freq=500 \
trainer.logger="['console']" \
trainer.project_name="${project_name}" \
trainer.experiment_name="${exp_name}" \
trainer.total_epochs=1 \
trainer.default_local_dir="${ckpts_home}" \
trainer.resume_mode=${RESUME_MODE}
7 changes: 7 additions & 0 deletions verl/experimental/agent_loop/agent_loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,13 @@ async def apply_chat_template(
Returns:
list[int]: Prompt token ids.
.. todo::
Templates that require a user message (e.g. Qwen 3.5) will fail
when *messages* contains only tool/assistant turns. Migrate to
:func:`verl.utils.chat_template.apply_chat_template_single_turn`
with ``full_conversation`` fallback — see the SFT dataset for the
reference pattern.
"""
if self.processor is not None:
raw_prompt = await self.loop.run_in_executor(
Expand Down
21 changes: 16 additions & 5 deletions verl/models/mcore/model_forward.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,22 +122,33 @@ def model_forward(
When using the bshd format, we have to add paddings to the input_ids to meet the longest sequence length,
so it is recommended to disable dynamic batch size and set batch size to 1
"""
assert not vision_model, "vision model does not support bshd format"
assert fp8 is None, "fp8 is not supported for bshd format yet"

batch_size, sequence_length = attention_mask.shape[:2]
position_ids_for_preprocess = (
torch.arange(sequence_length, device=input_ids.device).unsqueeze(0).expand(batch_size, -1)
if vision_model
else position_ids
)
pre_process_for_bshd = True if vision_model else pre_process
new_input_ids, new_attention_mask, new_position_ids = preprocess_bshd(
input_ids, attention_mask, position_ids, sequence_parallel=sp, pre_process=pre_process
input_ids,
attention_mask,
position_ids_for_preprocess,
sequence_parallel=sp,
pre_process=pre_process_for_bshd,
)
output_orig = model(
input_ids=new_input_ids,
position_ids=new_position_ids,
position_ids=None if vision_model else new_position_ids,
attention_mask=new_attention_mask,
**model_kwargs,
)
if post_process and logits_processor is not None:
args = {
k: preprocess_bshd(v, attention_mask, position_ids, sequence_parallel=sp, pre_process=True)[0]
k: preprocess_bshd(
v, attention_mask, position_ids_for_preprocess, sequence_parallel=sp, pre_process=True
)[0]
for k, v in logits_processor_args.items()
}
output_dict = logits_processor(output_orig, **args)
Expand Down Expand Up @@ -258,7 +269,7 @@ def gptmodel_forward_no_padding(
output_orig = model(
input_ids=input_ids_bshd,
attention_mask=attention_mask_bshd,
position_ids=position_ids_bshd,
position_ids=None if vision_model else position_ids_bshd,
**model_kwargs,
)
if post_process and logits_processor is not None:
Expand Down
2 changes: 2 additions & 0 deletions verl/models/mcore/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ class SupportedVLM(Enum):
QWEN2_5_VL = "Qwen2_5_VLForConditionalGeneration"
QWEN3_MOE_VL = "Qwen3VLMoeForConditionalGeneration"
QWEN3_VL = "Qwen3VLForConditionalGeneration"
QWEN3_5_MOE_VL = "Qwen3_5MoeForConditionalGeneration"
QWEN3_5_VL = "Qwen3_5ForConditionalGeneration"


supported_vlm = [member.value for member in SupportedVLM]
Expand Down
Loading
Loading