Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
155 changes: 155 additions & 0 deletions .github/workflows/e2e_ppo_trainer_megatron_fsdp_vllm.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
# # Tests layout

# Each folder under tests/ corresponds to a test category for a sub-namespace in verl. For instance:
# - `tests/trainer` for testing functionality related to `verl/trainer`
# - `tests/models` for testing functionality related to `verl/models`
# - ...

# There are a few folders with `special_` prefix, created for special purposes:
# - `special_distributed`: unit tests that must run with multiple GPUs
# - `special_e2e`: end-to-end tests with training/generation scripts
# - `special_npu`: tests for NPUs
# - `special_sanity`: a suite of quick sanity tests
# - `special_standalone`: a set of test that are designed to run in dedicated environments

# Accelerators for tests
# - By default tests are run with GPU available, except for the ones under `special_npu`, and any test script whose name ends with `on_cpu.py`.
# - For test scripts with `on_cpu.py` name suffix would be tested on CPU resources in linux environment.

# # Workflow layout

# All CI tests are configured by yaml files in `.github/workflows/`. Here's an overview of all test configs:
# 1. A list of always triggered CPU sanity tests: `check-pr-title.yml`, `secrets_scan.yml`, `pre-commit.yml`, `doc.yml`
# 2. Some heavy multi-GPU unit tests, such as `model.yml`, `vllm.yml`, `sgl.yml`
# 3. End-to-end tests: `e2e_*.yml`
# 4. Unit tests
#   - `cpu_unit_tests.yml`, run pytest on all scripts with file name pattern `tests/**/test_*_on_cpu.py`
#   - `gpu_unit_tests.yml`, run pytest on all test scripts whose file name does not end with the `on_cpu.py` suffix.
#   - Since cpu/gpu unit tests by default runs all tests under `tests`, please make sure tests are manually excluded in them when
#     - new workflow yaml is added to `.github/workflows`
#     - new tests are added to workflow mentioned in 2.

name: e2e_ppo_trainer_megatron_fsdp_vllm

on:
  push:
    branches:
      - main
      - v0.*
    paths:
      - "**/*.py"
      # NOTE(review): the excluded files below are the torch-FSDP backend
      # (distinct from the Megatron-FSDP path this workflow exercises) —
      # confirm the exclusions are intentional for this workflow.
      - "!verl/trainer/fsdp_sft_trainer.py"
      - "!verl/workers/**/*dp_*.py"
      - "!verl/utils/fsdp_utils.py"
      - "!verl/utils/checkpoint/fsdp_checkpoint_manager.py"
      - "!verl/model_merger/fsdp_model_merger.py"
  pull_request:
    branches:
      - main
      - v0.*
    paths:
      - "**/*.py"
      - "!docker/**"
      - "!**/*.md"
      - "!docs/**"
      - "!examples/**"
      - "!tests/**"
      - "!verl/trainer/main_*.py"
      - "!verl/trainer/fsdp_sft_trainer.py"
      - "!verl/workers/**/*dp_*.py"
      - "!verl/utils/fsdp_utils.py"
      - "!verl/utils/checkpoint/fsdp_checkpoint_manager.py"
      - "!verl/model_merger/fsdp_model_merger.py"
      # Entrypoints
      - ".github/workflows/e2e_ppo_trainer_megatron_fsdp_vllm.yml"
      - "examples/data_preprocess/gsm8k.py"
      - "tests/special_e2e/run_ppo_trainer_megatron.sh"
      - "verl/trainer/main_ppo.py"
      - "verl/trainer/config/ppo_megatron_trainer.yaml"
      - "verl/utils/megatron_utils.py"
      - "verl/workers/engine/megatron/transformer_impl.py"
      - "verl/workers/config/engine.py"

# Cancel jobs on the same ref if a new one is triggered
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

# Declare permissions just read content.
permissions:
  contents: read

env:
  # CI container image used to create the dynamic GPU runner.
  IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:vllm012.dev3"
  # Endpoint of the service that provisions/destroys the dynamic runner.
  DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner"

jobs:
  # Provision a dynamic GPU runner; its label and task id are consumed by the
  # training job and the cleanup job below.
  setup:
    if: github.repository_owner == 'verl-project'
    runs-on: ubuntu-latest
    outputs:
      runner-label: ${{ steps.create-runner.outputs.runner-label }}
      mlp-task-id: ${{ steps.create-runner.outputs.mlp-task-id }}
    steps:
      - uses: actions/checkout@v4
      - id: create-runner
        uses: volcengine/vemlp-github-runner@v1
        with:
          mode: "create"
          faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
          mlp-image: "${{ env.IMAGE }}"

  e2e_ppo_trainer_megatron_fsdp:
    needs: setup
    # Fall back to the static 'L20x8' label if the dynamic runner was not created.
    runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
    timeout-minutes: 60
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
      HF_ENDPOINT: "https://hf-mirror.com"
      HF_HUB_ENABLE_HF_TRANSFER: "0"
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
      - name: Install the current repository
        run: |
          pip3 install -r requirements-test.txt
          pip3 install --no-deps -e .
          pip3 install math-verify
      - name: Prepare GSM8K dataset
        run: |
          python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k
      - name: Running GSM8K E2E PPO training with Megatron-FSDP on 8 GPUs (Qwen2.5-0.5B)
        run: |
          ray stop --force
          ALL_OFFLOAD=True USE_MBRIDGE=True VANILLA_MBRIDGE=True USE_MEGATRON_FSDP=True \
          COMMON_TP=1 COMMON_PP=1 COMMON_VPP=null COMMON_CP=1 \
          ROLLOUT_TP=1 TOTAL_TRAIN_STEPS=2 \
          MODEL_ID=Qwen/Qwen2.5-0.5B \
          bash tests/special_e2e/run_ppo_trainer_megatron.sh
      - name: Running GSM8K E2E GRPO training with Megatron-FSDP on 8 GPUs (Qwen2.5-0.5B)
        run: |
          ray stop --force
          ALL_OFFLOAD=True USE_MBRIDGE=True VANILLA_MBRIDGE=True USE_MEGATRON_FSDP=True \
          COMMON_TP=1 COMMON_PP=1 COMMON_VPP=null COMMON_CP=1 \
          ROLLOUT_TP=1 TOTAL_TRAIN_STEPS=2 \
          ADV_ESTIMATOR=grpo \
          MODEL_ID=Qwen/Qwen2.5-0.5B \
          bash tests/special_e2e/run_ppo_trainer_megatron.sh
      - name: clean up
        run: |
          rm -rf checkpoints

  # Always destroy the dynamic runner, even when the training job fails.
  cleanup:
    runs-on: ubuntu-latest
    needs: [setup, e2e_ppo_trainer_megatron_fsdp]
    if: always()
    steps:
      - id: destroy-runner
        uses: volcengine/vemlp-github-runner@v1
        with:
          mode: "destroy"
          faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
          mlp-task-id: "${{ needs.setup.outputs.mlp-task-id }}"
105 changes: 105 additions & 0 deletions docs/examples/megatron_fsdp_example.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
Megatron-FSDP Example
========================

Last updated: 04/08/2026.

Introduction
------------

In this example, we run SFT and RL training with Megatron-FSDP:

- Runtime image: ``verlai/verl:vllm011.dev7``

Step 1: Prepare code
--------------------

Use the tested PR branches for ``verl``, ``Megatron-LM``, and ``Megatron-Bridge``:

.. code:: bash

cd /root

# 1) verl
git clone https://github.com/verl-project/verl.git
cd /root/verl
git fetch origin pull/5423/head:pr-5423
git checkout pr-5423

# 2) Megatron-LM
cd /root
git clone https://github.com/NVIDIA/Megatron-LM.git
cd /root/Megatron-LM
git fetch origin pull/3191/head:pr-3191
git checkout pr-3191

# 3) Megatron-Bridge
cd /root
git clone https://github.com/NVIDIA-NeMo/Megatron-Bridge.git
cd /root/Megatron-Bridge
git fetch origin pull/1910/head:pr-1910
git checkout pr-1910


Step 2: Install dependencies and set environment
------------------------------------------------

.. code:: bash

cd /root/verl
pip3 install --no-deps -e .[test]
pip3 install "nvidia-modelopt[torch]>=0.37.0" math-verify transformers==4.57.1
export PYTHONPATH=/root/Megatron-LM:/root/Megatron-Bridge/src:$PYTHONPATH

unset CUDA_DEVICE_MAX_CONNECTIONS
ray stop --force

Step 3: Prepare datasets
------------------------

.. code:: bash

cd /root/verl

# GSM8K
python3 examples/data_preprocess/gsm8k.py \
--local_save_dir ~/data/gsm8k

# MATH
python3 examples/data_preprocess/math_dataset.py \
--local_save_dir ~/data/math

# Check generated parquet files
ls -lh ~/data/gsm8k/train.parquet ~/data/gsm8k/test.parquet
ls -lh ~/data/math/train.parquet ~/data/math/test.parquet

Step 4: Run Megatron-FSDP SFT
-----------------------------

Before launch, check and update the key fields ``MODEL_PATH`` and ``SAVE_PATH`` in the script.

.. code:: bash

bash examples/sft/gsm8k/run_qwen_megatron_fsdp.sh

Step 5: Run Megatron-FSDP RL
----------------------------

Before launch, check and update key fields in
``examples/grpo_trainer/run_qwen2-7b_math_megatron_fsdp.sh``:

- ``actor_rollout_ref.model.path``: model name or local model path.
- ``train_files`` / ``test_files``: parquet paths for GSM8K and MATH.
- ``trainer.n_gpus_per_node`` and ``trainer.nnodes``: hardware topology.
- ``trainer.project_name`` and ``trainer.experiment_name``: experiment identifiers.

Then run:

.. code:: bash

bash examples/grpo_trainer/run_qwen2-7b_math_megatron_fsdp.sh

The script launches RL training and enables Megatron-FSDP with:

- ``actor_rollout_ref.actor.megatron.use_mbridge=True``
- ``actor_rollout_ref.actor.megatron.vanilla_mbridge=False``
- ``actor_rollout_ref.actor.megatron.use_megatron_fsdp=True``
1 change: 1 addition & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ verl is fast with:

examples/ppo_code_architecture
examples/gsm8k_example
examples/megatron_fsdp_example
examples/multi_modal_example
examples/skypilot_examples

Expand Down
65 changes: 65 additions & 0 deletions examples/grpo_trainer/run_qwen2-7b_math_megatron_fsdp.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Launch GRPO training on GSM8K + MATH with Megatron-FSDP (Qwen2-7B-Instruct, 8 GPUs).
set -x

export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping
unset ROCR_VISIBLE_DEVICES
# Use the vLLM v1 engine for rollout.
export VLLM_USE_V1=1
export VLLM_ALLREDUCE_USE_SYMM_MEM=0

rollout_mode="async"
return_raw_chat="True"

# Parquet datasets produced by examples/data_preprocess/{gsm8k,math_dataset}.py.
gsm8k_train_path=$HOME/data/gsm8k/train.parquet
gsm8k_test_path=$HOME/data/gsm8k/test.parquet
math_train_path=$HOME/data/math/train.parquet
math_test_path=$HOME/data/math/test.parquet

train_files="['$gsm8k_train_path', '$math_train_path']"
test_files="['$gsm8k_test_path', '$math_test_path']"

USE_FUSED_KERNELS=False

# Megatron-FSDP is enabled via use_mbridge=True + vanilla_mbridge=False +
# use_megatron_fsdp=True below. Extra hydra overrides can be appended on the
# command line; they are forwarded via "$@".
python3 -m verl.trainer.main_ppo --config-path=config \
    --config-name='ppo_megatron_trainer.yaml'\
    algorithm.adv_estimator=grpo \
    data.train_files="$train_files" \
    data.val_files="$test_files" \
    data.return_raw_chat=$return_raw_chat \
    data.train_batch_size=32 \
    data.max_prompt_length=512 \
    data.max_response_length=512 \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
    actor_rollout_ref.model.use_fused_kernels=$USE_FUSED_KERNELS \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.actor.ppo_mini_batch_size=16 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=1 \
    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=4 \
    actor_rollout_ref.actor.megatron.use_mbridge=True \
    actor_rollout_ref.actor.megatron.vanilla_mbridge=False \
    actor_rollout_ref.actor.megatron.use_megatron_fsdp=True \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.actor.entropy_coeff=0 \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.mode=$rollout_mode \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
    actor_rollout_ref.rollout.n=2 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=2 \
    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=1 \
    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=4 \
    algorithm.use_kl_in_reward=False \
    trainer.critic_warmup=0 \
    trainer.logger='["console","wandb"]' \
    trainer.project_name='verl_grpo_example_gsm8k_math' \
    trainer.experiment_name='qwen2_7b_megatron_fsdp' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=20 \
    trainer.test_freq=5 \
    trainer.total_epochs=15 "$@"
46 changes: 46 additions & 0 deletions examples/sft/gsm8k/run_qwen_megatron_fsdp.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Launch Megatron-FSDP SFT on GSM8K (Qwen2.5-Math-7B, single node, 8 GPUs).
set -x

# NOTE(review): WORKSPACE_DIR is never used below, and since this script lives
# in examples/sft/gsm8k, "../.." resolves to <repo>/examples rather than the
# repo root — confirm whether it should be "../../.." or be removed.
WORKSPACE_DIR="$(cd "$(dirname "$0")/../.." && pwd)"
# Update these two paths before launching.
MODEL_PATH="/root/models/Qwen2.5-Math-7B"
SAVE_PATH="/root/checkpoints/Qwen2.5-Math-7B"
NPROC=8

export CUDA_DEVICE_MAX_CONNECTIONS=1
export HYDRA_FULL_ERROR=1
unset ROCR_VISIBLE_DEVICES

# Megatron-FSDP is enabled via engine.use_mbridge=True +
# engine.vanilla_mbridge=False + engine.use_megatron_fsdp=True.
# Extra hydra overrides can be appended on the command line via "$@".
torchrun --standalone --nnodes=1 --nproc_per_node=$NPROC \
    -m verl.trainer.sft_trainer \
    data.train_files=$HOME/data/gsm8k/train.parquet \
    data.val_files=$HOME/data/gsm8k/test.parquet \
    data.messages_key=messages \
    data.train_batch_size=8 \
    data.use_dynamic_bsz=True \
    data.max_token_len_per_gpu=1024 \
    data.pad_mode=no_padding \
    data.truncation=error \
    model=hf_model \
    model.trust_remote_code=True \
    engine=megatron \
    optim=megatron \
    optim.lr=1e-5 \
    optim.lr_warmup_steps_ratio=0.2 \
    optim.weight_decay=0.1 \
    optim.betas="[0.9,0.95]" \
    optim.clip_grad=1.0 \
    optim.lr_warmup_init=0 \
    optim.lr_decay_style=cosine \
    optim.min_lr=1e-6 \
    engine.tensor_model_parallel_size=4 \
    engine.pipeline_model_parallel_size=1 \
    engine.expert_model_parallel_size=1 \
    engine.use_mbridge=True \
    engine.vanilla_mbridge=False \
    engine.use_megatron_fsdp=True \
    model.path=$MODEL_PATH \
    model.use_remove_padding=true \
    trainer.default_local_dir=$SAVE_PATH \
    trainer.project_name=gsm8k-sft \
    trainer.experiment_name=SFT-qwen2.5-7b-mfsdp \
    trainer.logger='["console","wandb","file"]' \
    trainer.total_epochs=4 "$@"
Loading
Loading