diff --git a/examples/train/megatron/run_dapo_glm_flash.sh b/examples/train/megatron/run_dapo_glm_flash.sh new file mode 100644 index 0000000000..2d9b627491 --- /dev/null +++ b/examples/train/megatron/run_dapo_glm_flash.sh @@ -0,0 +1,169 @@ +set -x + +# Colocated DAPO training+generation for GLM-4.7-Flash on DAPO with Megatron. +# GLM-4.7-Flash (zai-org/GLM-4.7-Flash) is a DeepSeek-V3 architecture clone +# with MLA + MoE (64 routed experts, 4 active per token, ~3B active parameters). +# +# Runs on 2 nodes of 8 GPUs each (TP=4 EP=8 for Megatron, 4x TP=4 vLLM engines). +# GLM-4.7-Flash has 20 attention heads, so vLLM TP must divide 20 (use TP=4). +# +# Setup: +# 1. Install deps: +# uv sync --extra megatron +# 2. GLM-4.7-Flash needs transformers>=5.0.0 (for Glm4MoeLiteConfig). +# If not yet available via uv sync, install manually: +# uv pip install "transformers>=5.0.0" +# 3. Prepare data: +# bash examples/algorithms/dapo/prepare_dapo_data.sh +# 4. Run: +# export WANDB_API_KEY= # or set LOGGER=console below +# bash examples/train/megatron/run_dapo_glm_flash.sh + +MODEL_NAME="zai-org/GLM-4.7-Flash" +DATA_DIR="$HOME/data/dapo" +TRAIN_FILE="$DATA_DIR/dapo-math-17k-cleaned.parquet" +TEST_FILE="$DATA_DIR/aime-2024-cleaned.parquet" +NUM_NODES=2 +NUM_GPUS_PER_NODE=8 +NUM_INFERENCE_ENGINES=4 +INFERENCE_ENGINE_TENSOR_PARALLEL_SIZE=4 +LOGGER="wandb" # change to "console" to print to stdout + +INFERENCE_ENGINE_MAX_MODEL_LEN=32000 + +CLIP_RATIO_LOW=0.2 +CLIP_RATIO_HIGH=0.28 +# use token mean loss reduction +LOSS_REDUCTION="token_mean" +# applies overlong filtering (but not soft overlong punishment) +APPLY_OVERLONG_FILTERING=true +# apply soft overlong punishment with custom trainer impl in main_dapo.py +OVERLONG_BUFFER_LEN=$((1024 * 3)) +OVERLONG_BUFFER_PENALTY_FACTOR=0 + +# other DAPO parameters +USE_KL_LOSS=false +TEMPERATURE=1.0 +TOP_P=1.0 +EVAL_TOP_P=0.7 +CLIP_RATIO_C=10.0 +MAX_PROMPT_LENGTH=$((1024 * 2)) +MAX_RESPONSE_LENGTH=$((1024 * 20)) + +# repro run parameters 
+TRAIN_BATCH_SIZE=128 +MINI_BATCH_SIZE=32 +N_SAMPLES_PER_PROMPT=8 +EVAL_N_SAMPLES_PER_PROMPT=32 +ENFORCE_EAGER=true # cuda graphs can cause some instability +LR=1e-6 + +# GLM-4.7-Flash supports flash attention (v_head_dim == qk_head_dim + qk_rope_head_dim == 256). +FLASH_ATTN=true + +# Megatron parallelism: TP=1, EP=8 fits 64 MoE experts across 8 GPUs (8 experts/GPU) +MEGATRON_TP=4 +MEGATRON_PP=1 +MEGATRON_CP=1 +MEGATRON_EP=8 +MEGATRON_ETP=1 +# MEGATRON_LAST_PIPELINE_STAGE_LAYER=23 + +# MoE routing flags (DeepSeek-V3 style: sigmoid scoring with expert bias) +MOE_TOKEN_DISPATCHER="alltoall" +MOE_ROUTER_LB="none" +MOE_GROUPED_GEMM=true +MOE_ROUTER_SCORE_FN="sigmoid" +MOE_ROUTER_EXPERT_BIAS=true + +# CPU optimizer offload to fit in 80GB GPUs +OPTIMIZER_CPU_OFFLOAD=true +OPTIMIZER_OFFLOAD_FRACTION=1.0 + +# TIS parameters +TIS_IMP_RATIO_CAP=2.0 +USE_TIS=true + +# EFA +SKYRL_LD_LIBRARY_PATH_EXPORT=1 +LD_LIBRARY_PATH=/opt/amazon/efa/lib:$LD_LIBRARY_PATH +FI_PROVIDER=efa + +SKYRL_RAY_PG_TIMEOUT_IN_S=450 uv run --isolated --extra megatron -m examples.train.algorithms.dapo.main_dapo \ + data.train_data="['$TRAIN_FILE']" \ + data.val_data="['$TEST_FILE']" \ + trainer.algorithm.advantage_estimator="grpo" \ + trainer.algorithm.policy_loss_type="dual_clip" \ + trainer.algorithm.overlong_buffer_len=$OVERLONG_BUFFER_LEN \ + trainer.algorithm.overlong_buffer_penalty_factor=$OVERLONG_BUFFER_PENALTY_FACTOR \ + trainer.algorithm.loss_reduction=$LOSS_REDUCTION \ + generator.inference_engine.enforce_eager=$ENFORCE_EAGER \ + generator.apply_overlong_filtering=$APPLY_OVERLONG_FILTERING \ + generator.sampling_params.temperature=$TEMPERATURE \ + generator.sampling_params.top_p=$TOP_P \ + generator.eval_sampling_params.top_p=$EVAL_TOP_P \ + generator.eval_sampling_params.temperature=$TEMPERATURE \ + generator.eval_sampling_params.max_generate_length=$MAX_RESPONSE_LENGTH \ + trainer.algorithm.use_kl_loss=$USE_KL_LOSS \ + trainer.algorithm.clip_ratio_c=$CLIP_RATIO_C \ + 
trainer.policy.model.path="$MODEL_NAME" \ + trainer.placement.colocate_all=true \ + trainer.strategy=megatron \ + trainer.placement.policy_num_nodes=$NUM_NODES \ + trainer.placement.policy_num_gpus_per_node=$NUM_GPUS_PER_NODE \ + generator.inference_engine.num_engines=$NUM_INFERENCE_ENGINES \ + generator.inference_engine.tensor_parallel_size=$INFERENCE_ENGINE_TENSOR_PARALLEL_SIZE \ + trainer.policy.megatron_config.tensor_model_parallel_size=$MEGATRON_TP \ + trainer.policy.megatron_config.pipeline_model_parallel_size=$MEGATRON_PP \ + trainer.policy.megatron_config.context_parallel_size=$MEGATRON_CP \ + trainer.policy.megatron_config.expert_model_parallel_size=$MEGATRON_EP \ + trainer.policy.megatron_config.expert_tensor_parallel_size=$MEGATRON_ETP \ + trainer.policy.megatron_config.moe_token_dispatcher_type=$MOE_TOKEN_DISPATCHER \ + trainer.policy.megatron_config.moe_router_load_balancing_type=$MOE_ROUTER_LB \ + trainer.policy.megatron_config.moe_grouped_gemm=$MOE_GROUPED_GEMM \ + trainer.policy.megatron_config.moe_router_score_function=$MOE_ROUTER_SCORE_FN \ + trainer.policy.megatron_config.moe_router_enable_expert_bias=$MOE_ROUTER_EXPERT_BIAS \ + trainer.policy.megatron_config.optimizer_config_kwargs.optimizer_cpu_offload=$OPTIMIZER_CPU_OFFLOAD \ + trainer.policy.megatron_config.optimizer_config_kwargs.optimizer_offload_fraction=$OPTIMIZER_OFFLOAD_FRACTION \ + trainer.policy.megatron_config.empty_cuda_cache=true \ + trainer.use_sample_packing=true \ + trainer.flash_attn=$FLASH_ATTN \ + generator.inference_engine.engine_init_kwargs.max_model_len=$INFERENCE_ENGINE_MAX_MODEL_LEN \ + trainer.algorithm.use_tis=$USE_TIS \ + trainer.algorithm.tis_imp_ratio_cap=$TIS_IMP_RATIO_CAP \ + trainer.epochs=20 \ + trainer.algorithm.eps_clip_low=$CLIP_RATIO_LOW \ + trainer.algorithm.eps_clip_high=$CLIP_RATIO_HIGH \ + trainer.eval_batch_size=1024 \ + trainer.eval_before_train=false \ + trainer.eval_interval=5 \ + trainer.update_epochs_per_batch=1 \ + 
trainer.train_batch_size=$TRAIN_BATCH_SIZE \ + trainer.policy_mini_batch_size=$MINI_BATCH_SIZE \ + trainer.micro_forward_batch_size_per_gpu=2 \ + trainer.micro_train_batch_size_per_gpu=2 \ + trainer.ckpt_interval=1 \ + trainer.max_prompt_length=$MAX_PROMPT_LENGTH \ + generator.sampling_params.max_generate_length=$MAX_RESPONSE_LENGTH \ + trainer.policy.optimizer_config.lr=$LR \ + trainer.policy.optimizer_config.num_warmup_steps=40 \ + trainer.policy.optimizer_config.weight_decay=0.1 \ + trainer.policy.optimizer_config.max_grad_norm=1.0 \ + generator.inference_engine.backend=vllm \ + generator.inference_engine.run_engines_locally=true \ + generator.inference_engine.weight_sync_backend=nccl \ + generator.inference_engine.async_engine=false \ + generator.batched=true \ + environment.env_class=aime \ + generator.n_samples_per_prompt=$N_SAMPLES_PER_PROMPT \ + generator.eval_n_samples_per_prompt=$EVAL_N_SAMPLES_PER_PROMPT \ + generator.inference_engine.gpu_memory_utilization=0.7 \ + trainer.logger="$LOGGER" \ + trainer.project_name="dapo_glm_flash" \ + trainer.run_name="dapo_glm4_7_flash_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}_ep${MEGATRON_EP}_etp${MEGATRON_ETP}" \ + trainer.export_path="$HOME/exports/dapo_glm4_7_flash_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}_ep${MEGATRON_EP}_etp${MEGATRON_ETP}" \ + trainer.hf_save_interval=300 \ + trainer.resume_mode=latest \ + trainer.max_ckpts_to_keep=3 \ + trainer.ckpt_path="s3://skyrl-anyscale/org_vz1ufrqstecz2uet1xkwdzrm9b/cld_cntqf5nf645kv8esukgcy9yveg/artifact_storage/ckpts/dapo_glm4_7_flash_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}_ep${MEGATRON_EP}_etp${MEGATRON_ETP}" \ + $@ \ No newline at end of file diff --git a/examples/train/megatron/run_dapo_glm_flash_lora.sh b/examples/train/megatron/run_dapo_glm_flash_lora.sh new file mode 100644 index 0000000000..08c9b0154e --- /dev/null +++ b/examples/train/megatron/run_dapo_glm_flash_lora.sh @@ -0,0 +1,175 @@ +set -x + +# 
Colocated DAPO training+generation for GLM-4.7-Flash on DAPO with Megatron. +# GLM-4.7-Flash (zai-org/GLM-4.7-Flash) is a DeepSeek-V3 architecture clone +# with MLA + MoE (64 routed experts, 4 active per token, ~3B active parameters). +# +# Runs on 2 nodes of 8 GPUs each (TP=4 EP=8 for Megatron, 4x TP=4 vLLM engines). +# GLM-4.7-Flash has 20 attention heads, so vLLM TP must divide 20 (use TP=4). +# +# Setup: +# 1. Install deps: +# uv sync --extra megatron +# 2. GLM-4.7-Flash needs transformers>=5.0.0 (for Glm4MoeLiteConfig). +# If not yet available via uv sync, install manually: +# uv pip install "transformers>=5.0.0" +# 3. Prepare data: +# bash examples/algorithms/dapo/prepare_dapo_data.sh +# 4. Run: +# export WANDB_API_KEY= # or set LOGGER=console below +# bash examples/train/megatron/run_dapo_glm_flash_lora.sh + +MODEL_NAME="zai-org/GLM-4.7-Flash" +DATA_DIR="$HOME/data/dapo" +TRAIN_FILE="$DATA_DIR/dapo-math-17k-cleaned.parquet" +TEST_FILE="$DATA_DIR/aime-2024-cleaned.parquet" +NUM_NODES=2 +NUM_GPUS_PER_NODE=8 +NUM_INFERENCE_ENGINES=4 +INFERENCE_ENGINE_TENSOR_PARALLEL_SIZE=4 +LOGGER="wandb" # change to "console" to print to stdout + +INFERENCE_ENGINE_MAX_MODEL_LEN=32000 + +CLIP_RATIO_LOW=0.2 +CLIP_RATIO_HIGH=0.28 +# use token mean loss reduction +LOSS_REDUCTION="token_mean" +# applies overlong filtering (but not soft overlong punishment) +APPLY_OVERLONG_FILTERING=true +# apply soft overlong punishment with custom trainer impl in main_dapo.py +OVERLONG_BUFFER_LEN=$((1024 * 3)) +OVERLONG_BUFFER_PENALTY_FACTOR=0 + +# other DAPO parameters +USE_KL_LOSS=false +TEMPERATURE=1.0 +TOP_P=1.0 +EVAL_TOP_P=0.7 +CLIP_RATIO_C=10.0 +MAX_PROMPT_LENGTH=$((1024 * 2)) +MAX_RESPONSE_LENGTH=$((1024 * 20)) + +# repro run parameters +TRAIN_BATCH_SIZE=128 +MINI_BATCH_SIZE=32 +N_SAMPLES_PER_PROMPT=8 +EVAL_N_SAMPLES_PER_PROMPT=32 +ENFORCE_EAGER=true # cuda graphs can cause some instability + +# GLM-4.7-Flash supports flash attention (v_head_dim == qk_head_dim + qk_rope_head_dim == 256). 
+FLASH_ATTN=true + +# Megatron parallelism: TP=1, EP=8 fits 64 MoE experts across 8 GPUs (8 experts/GPU) +MEGATRON_TP=4 +MEGATRON_PP=1 +MEGATRON_CP=1 +MEGATRON_EP=8 +MEGATRON_ETP=1 +# MEGATRON_LAST_PIPELINE_STAGE_LAYER=23 + +# MoE routing flags (DeepSeek-V3 style: sigmoid scoring with expert bias) +MOE_TOKEN_DISPATCHER="alltoall" +MOE_ROUTER_LB="none" +MOE_GROUPED_GEMM=true +MOE_ROUTER_SCORE_FN="sigmoid" +MOE_ROUTER_EXPERT_BIAS=true + +# CPU optimizer offload to fit in 80GB GPUs +OPTIMIZER_CPU_OFFLOAD=true +OPTIMIZER_OFFLOAD_FRACTION=1.0 + +# TIS parameters +TIS_IMP_RATIO_CAP=2.0 +USE_TIS=true + +# EFA +SKYRL_LD_LIBRARY_PATH_EXPORT=1 +LD_LIBRARY_PATH=/opt/amazon/efa/lib:$LD_LIBRARY_PATH +FI_PROVIDER=efa + +# LoRA +LR=1e-5 +LORA_RANK=128 +LORA_ALPHA=128 + +SKYRL_RAY_PG_TIMEOUT_IN_S=450 uv run --isolated --extra megatron -m examples.train.algorithms.dapo.main_dapo \ + data.train_data="['$TRAIN_FILE']" \ + data.val_data="['$TEST_FILE']" \ + trainer.algorithm.advantage_estimator="grpo" \ + trainer.algorithm.policy_loss_type="dual_clip" \ + trainer.algorithm.overlong_buffer_len=$OVERLONG_BUFFER_LEN \ + trainer.algorithm.overlong_buffer_penalty_factor=$OVERLONG_BUFFER_PENALTY_FACTOR \ + trainer.algorithm.loss_reduction=$LOSS_REDUCTION \ + generator.inference_engine.enforce_eager=$ENFORCE_EAGER \ + generator.apply_overlong_filtering=$APPLY_OVERLONG_FILTERING \ + generator.sampling_params.temperature=$TEMPERATURE \ + generator.sampling_params.top_p=$TOP_P \ + generator.eval_sampling_params.top_p=$EVAL_TOP_P \ + generator.eval_sampling_params.temperature=$TEMPERATURE \ + generator.eval_sampling_params.max_generate_length=$MAX_RESPONSE_LENGTH \ + trainer.algorithm.use_kl_loss=$USE_KL_LOSS \ + trainer.algorithm.clip_ratio_c=$CLIP_RATIO_C \ + trainer.policy.model.path="$MODEL_NAME" \ + trainer.policy.model.lora.rank=$LORA_RANK \ + trainer.policy.model.lora.alpha=$LORA_ALPHA \ + trainer.placement.colocate_all=true \ + trainer.strategy=megatron \ + 
trainer.placement.policy_num_nodes=$NUM_NODES \ + trainer.placement.policy_num_gpus_per_node=$NUM_GPUS_PER_NODE \ + generator.inference_engine.num_engines=$NUM_INFERENCE_ENGINES \ + generator.inference_engine.tensor_parallel_size=$INFERENCE_ENGINE_TENSOR_PARALLEL_SIZE \ + trainer.policy.megatron_config.tensor_model_parallel_size=$MEGATRON_TP \ + trainer.policy.megatron_config.pipeline_model_parallel_size=$MEGATRON_PP \ + trainer.policy.megatron_config.context_parallel_size=$MEGATRON_CP \ + trainer.policy.megatron_config.expert_model_parallel_size=$MEGATRON_EP \ + trainer.policy.megatron_config.expert_tensor_parallel_size=$MEGATRON_ETP \ + trainer.policy.megatron_config.moe_token_dispatcher_type=$MOE_TOKEN_DISPATCHER \ + trainer.policy.megatron_config.moe_router_load_balancing_type=$MOE_ROUTER_LB \ + trainer.policy.megatron_config.moe_grouped_gemm=$MOE_GROUPED_GEMM \ + trainer.policy.megatron_config.moe_router_score_function=$MOE_ROUTER_SCORE_FN \ + trainer.policy.megatron_config.moe_router_enable_expert_bias=$MOE_ROUTER_EXPERT_BIAS \ + trainer.policy.megatron_config.optimizer_config_kwargs.optimizer_cpu_offload=$OPTIMIZER_CPU_OFFLOAD \ + trainer.policy.megatron_config.optimizer_config_kwargs.optimizer_offload_fraction=$OPTIMIZER_OFFLOAD_FRACTION \ + trainer.policy.megatron_config.empty_cuda_cache=true \ + trainer.use_sample_packing=true \ + trainer.flash_attn=$FLASH_ATTN \ + generator.inference_engine.engine_init_kwargs.max_model_len=$INFERENCE_ENGINE_MAX_MODEL_LEN \ + trainer.algorithm.use_tis=$USE_TIS \ + trainer.algorithm.tis_imp_ratio_cap=$TIS_IMP_RATIO_CAP \ + trainer.epochs=20 \ + trainer.algorithm.eps_clip_low=$CLIP_RATIO_LOW \ + trainer.algorithm.eps_clip_high=$CLIP_RATIO_HIGH \ + trainer.eval_batch_size=1024 \ + trainer.eval_before_train=false \ + trainer.eval_interval=5 \ + trainer.update_epochs_per_batch=1 \ + trainer.train_batch_size=$TRAIN_BATCH_SIZE \ + trainer.policy_mini_batch_size=$MINI_BATCH_SIZE \ + trainer.micro_forward_batch_size_per_gpu=2 \ + 
trainer.micro_train_batch_size_per_gpu=2 \ + trainer.ckpt_interval=1 \ + trainer.max_prompt_length=$MAX_PROMPT_LENGTH \ + generator.sampling_params.max_generate_length=$MAX_RESPONSE_LENGTH \ + trainer.policy.optimizer_config.lr=$LR \ + trainer.policy.optimizer_config.num_warmup_steps=40 \ + trainer.policy.optimizer_config.weight_decay=0.1 \ + trainer.policy.optimizer_config.max_grad_norm=1.0 \ + generator.inference_engine.backend=vllm \ + generator.inference_engine.run_engines_locally=true \ + generator.inference_engine.weight_sync_backend=nccl \ + generator.inference_engine.async_engine=false \ + generator.batched=true \ + environment.env_class=aime \ + generator.n_samples_per_prompt=$N_SAMPLES_PER_PROMPT \ + generator.eval_n_samples_per_prompt=$EVAL_N_SAMPLES_PER_PROMPT \ + generator.inference_engine.gpu_memory_utilization=0.7 \ + trainer.logger="$LOGGER" \ + trainer.project_name="dapo_glm_flash" \ + trainer.run_name="dapo_glm4_7_flash_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}_ep${MEGATRON_EP}_etp${MEGATRON_ETP}" \ + trainer.export_path="$HOME/exports/dapo_glm4_7_flash_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}_ep${MEGATRON_EP}_etp${MEGATRON_ETP}" \ + trainer.hf_save_interval=300 \ + trainer.resume_mode=latest \ + trainer.max_ckpts_to_keep=3 \ + trainer.ckpt_path="/mnt/local_storage/ckpts/dapo_glm4_7_flash_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}_ep${MEGATRON_EP}_etp${MEGATRON_ETP}" \ + $@ \ No newline at end of file diff --git a/examples/train/megatron/run_dapo_glm_flash_lora_r3.sh b/examples/train/megatron/run_dapo_glm_flash_lora_r3.sh new file mode 100644 index 0000000000..70d70ef519 --- /dev/null +++ b/examples/train/megatron/run_dapo_glm_flash_lora_r3.sh @@ -0,0 +1,192 @@ +set -x + +# Colocated DAPO training+generation for GLM-4.7-Flash on DAPO with Megatron. 
+# GLM-4.7-Flash (zai-org/GLM-4.7-Flash) is a DeepSeek-V3 architecture clone +# with MLA + MoE (64 routed experts, 4 active per token, ~3B active parameters). +# +# Runs on 2 nodes of 8 GPUs each (TP=4 EP=8 for Megatron, 4x TP=4 vLLM engines). +# GLM-4.7-Flash has 20 attention heads, so vLLM TP must divide 20 (use TP=4). +# +# Setup: +# 1. Install deps: +# uv sync --extra megatron +# 2. GLM-4.7-Flash needs transformers>=5.0.0 (for Glm4MoeLiteConfig). +# If not yet available via uv sync, install manually: +# uv pip install "transformers>=5.0.0" +# 3. Prepare data: +# bash examples/algorithms/dapo/prepare_dapo_data.sh +# 4. Run: +# export WANDB_API_KEY= # or set LOGGER=console below +# bash examples/train/megatron/run_dapo_glm_flash_lora_r3.sh + +MODEL_NAME="zai-org/GLM-4.7-Flash" +DATA_DIR="/mnt/local_storage/data/dapo" +TRAIN_FILE="$DATA_DIR/dapo-math-17k-cleaned.parquet" +TEST_FILE="$DATA_DIR/aime-2024-cleaned.parquet" +NUM_NODES=2 +NUM_GPUS_PER_NODE=8 +NUM_INFERENCE_ENGINES=4 +INFERENCE_ENGINE_TENSOR_PARALLEL_SIZE=4 +LOGGER="wandb" # change to "console" to print to stdout + +INFERENCE_ENGINE_MAX_MODEL_LEN=32000 + +CLIP_RATIO_LOW=0.2 +CLIP_RATIO_HIGH=0.28 +# use token mean loss reduction +LOSS_REDUCTION="token_mean" +# applies overlong filtering (but not soft overlong punishment) +APPLY_OVERLONG_FILTERING=true +# apply soft overlong punishment with custom trainer impl in main_dapo.py +OVERLONG_BUFFER_LEN=$((1024 * 3)) +OVERLONG_BUFFER_PENALTY_FACTOR=0 + +# other DAPO parameters +USE_KL_LOSS=false +TEMPERATURE=1.0 +TOP_P=1.0 +EVAL_TOP_P=0.7 +CLIP_RATIO_C=10.0 +MAX_PROMPT_LENGTH=$((1024 * 2)) +MAX_RESPONSE_LENGTH=$((1024 * 20)) + +# repro run parameters +TRAIN_BATCH_SIZE=128 +MINI_BATCH_SIZE=32 +N_SAMPLES_PER_PROMPT=8 +EVAL_N_SAMPLES_PER_PROMPT=32 +ENFORCE_EAGER=true # cuda graphs can cause some instability + +# GLM-4.7-Flash supports flash attention (v_head_dim == qk_head_dim + qk_rope_head_dim == 256). 
+FLASH_ATTN=true + +# Megatron parallelism: TP=1, EP=8 fits 64 MoE experts across 8 GPUs (8 experts/GPU) +MEGATRON_TP=4 +MEGATRON_PP=1 +MEGATRON_CP=1 +MEGATRON_EP=8 +MEGATRON_ETP=1 +# MEGATRON_LAST_PIPELINE_STAGE_LAYER=23 +# trainer.policy.megatron_config.transformer_config_kwargs.num_layers_in_last_pipeline_stage=$MEGATRON_LAST_PIPELINE_STAGE_LAYER \ + + +# MoE routing flags (DeepSeek-V3 style: sigmoid scoring with expert bias) +MOE_TOKEN_DISPATCHER="alltoall" +MOE_ROUTER_LB="none" +MOE_GROUPED_GEMM=true +MOE_ROUTER_SCORE_FN="sigmoid" +MOE_ROUTER_EXPERT_BIAS=true + + +# CPU optimizer offload to fit in 80GB GPUs +OPTIMIZER_CPU_OFFLOAD=true +OPTIMIZER_OFFLOAD_FRACTION=1.0 + +# TIS parameters +TIS_IMP_RATIO_CAP=2.0 +USE_TIS=true + +# EFA +SKYRL_LD_LIBRARY_PATH_EXPORT=1 +LD_LIBRARY_PATH=/opt/amazon/efa/lib:$LD_LIBRARY_PATH +FI_PROVIDER=efa + +#r3 +ROUTER_REPLAY=True +DISTRIBUTED_EXECUTOR_BACKEND="mp" + +# LoRA +LR=1e-5 +LORA_RANK=128 +LORA_ALPHA=128 + +# export UV_CACHE_DIR=/mnt/local_storage/uv_cache +# export HF_HOME=/mnt/local_storage/hf_cache +# export TRANSFORMERS_CACHE=/mnt/local_storage/hf_cache +# export HF_DATASETS_CACHE=/mnt/local_storage/hf_cache + +UV_HTTP_TIMEOUT=100 SKYRL_RAY_PG_TIMEOUT_IN_S=600 uv run --isolated --extra megatron -m examples.train.algorithms.dapo.main_dapo \ + data.train_data="['$TRAIN_FILE']" \ + data.val_data="['$TEST_FILE']" \ + trainer.algorithm.advantage_estimator="grpo" \ + trainer.algorithm.policy_loss_type="dual_clip" \ + trainer.algorithm.overlong_buffer_len=$OVERLONG_BUFFER_LEN \ + trainer.algorithm.overlong_buffer_penalty_factor=$OVERLONG_BUFFER_PENALTY_FACTOR \ + trainer.algorithm.loss_reduction=$LOSS_REDUCTION \ + generator.inference_engine.enforce_eager=$ENFORCE_EAGER \ + generator.apply_overlong_filtering=$APPLY_OVERLONG_FILTERING \ + generator.sampling_params.temperature=$TEMPERATURE \ + generator.sampling_params.top_p=$TOP_P \ + generator.eval_sampling_params.top_p=$EVAL_TOP_P \ + 
generator.eval_sampling_params.temperature=$TEMPERATURE \ + generator.eval_sampling_params.max_generate_length=$MAX_RESPONSE_LENGTH \ + trainer.algorithm.use_kl_loss=$USE_KL_LOSS \ + trainer.algorithm.clip_ratio_c=$CLIP_RATIO_C \ + trainer.policy.model.path="$MODEL_NAME" \ + trainer.policy.model.lora.rank=$LORA_RANK \ + trainer.policy.model.lora.alpha=$LORA_ALPHA \ + trainer.policy.megatron_config.moe_enable_routing_replay=$ROUTER_REPLAY \ + generator.inference_engine.enable_return_routed_experts=$ROUTER_REPLAY \ + generator.inference_engine.distributed_executor_backend=$DISTRIBUTED_EXECUTOR_BACKEND \ + trainer.placement.colocate_all=true \ + trainer.strategy=megatron \ + trainer.placement.policy_num_nodes=$NUM_NODES \ + trainer.placement.policy_num_gpus_per_node=$NUM_GPUS_PER_NODE \ + generator.inference_engine.num_engines=$NUM_INFERENCE_ENGINES \ + generator.inference_engine.tensor_parallel_size=$INFERENCE_ENGINE_TENSOR_PARALLEL_SIZE \ + trainer.policy.megatron_config.tensor_model_parallel_size=$MEGATRON_TP \ + trainer.policy.megatron_config.pipeline_model_parallel_size=$MEGATRON_PP \ + trainer.policy.megatron_config.context_parallel_size=$MEGATRON_CP \ + trainer.policy.megatron_config.expert_model_parallel_size=$MEGATRON_EP \ + trainer.policy.megatron_config.expert_tensor_parallel_size=$MEGATRON_ETP \ + trainer.policy.megatron_config.moe_token_dispatcher_type=$MOE_TOKEN_DISPATCHER \ + trainer.policy.megatron_config.moe_router_load_balancing_type=$MOE_ROUTER_LB \ + trainer.policy.megatron_config.moe_grouped_gemm=$MOE_GROUPED_GEMM \ + trainer.policy.megatron_config.moe_router_score_function=$MOE_ROUTER_SCORE_FN \ + trainer.policy.megatron_config.moe_router_enable_expert_bias=$MOE_ROUTER_EXPERT_BIAS \ + trainer.policy.megatron_config.optimizer_config_kwargs.overlap_cpu_optimizer_d2h_h2d=$OPTIMIZER_CPU_OFFLOAD \ + trainer.policy.megatron_config.optimizer_config_kwargs.use_precision_aware_optimizer=$OPTIMIZER_CPU_OFFLOAD \ + 
trainer.policy.megatron_config.optimizer_config_kwargs.optimizer_cpu_offload=$OPTIMIZER_CPU_OFFLOAD \ + trainer.policy.megatron_config.optimizer_config_kwargs.optimizer_offload_fraction=$OPTIMIZER_OFFLOAD_FRACTION \ + trainer.policy.megatron_config.empty_cuda_cache=true \ + trainer.use_sample_packing=true \ + trainer.flash_attn=$FLASH_ATTN \ + generator.inference_engine.engine_init_kwargs.max_model_len=$INFERENCE_ENGINE_MAX_MODEL_LEN \ + trainer.algorithm.use_tis=$USE_TIS \ + trainer.algorithm.tis_imp_ratio_cap=$TIS_IMP_RATIO_CAP \ + trainer.epochs=20 \ + trainer.algorithm.eps_clip_low=$CLIP_RATIO_LOW \ + trainer.algorithm.eps_clip_high=$CLIP_RATIO_HIGH \ + trainer.eval_batch_size=1024 \ + trainer.eval_before_train=false \ + trainer.eval_interval=5 \ + trainer.update_epochs_per_batch=1 \ + trainer.train_batch_size=$TRAIN_BATCH_SIZE \ + trainer.policy_mini_batch_size=$MINI_BATCH_SIZE \ + trainer.micro_forward_batch_size_per_gpu=2 \ + trainer.micro_train_batch_size_per_gpu=2 \ + trainer.ckpt_interval=10 \ + trainer.max_prompt_length=$MAX_PROMPT_LENGTH \ + generator.sampling_params.max_generate_length=$MAX_RESPONSE_LENGTH \ + trainer.policy.optimizer_config.lr=$LR \ + trainer.policy.optimizer_config.num_warmup_steps=40 \ + trainer.policy.optimizer_config.weight_decay=0.1 \ + trainer.policy.optimizer_config.max_grad_norm=1.0 \ + generator.inference_engine.backend=vllm \ + generator.inference_engine.run_engines_locally=true \ + generator.inference_engine.weight_sync_backend=nccl \ + generator.inference_engine.async_engine=false \ + generator.batched=true \ + environment.env_class=aime \ + generator.n_samples_per_prompt=$N_SAMPLES_PER_PROMPT \ + generator.eval_n_samples_per_prompt=$EVAL_N_SAMPLES_PER_PROMPT \ + generator.inference_engine.gpu_memory_utilization=0.7 \ + trainer.logger="$LOGGER" \ + trainer.project_name="dapo_glm_flash" \ + 
trainer.run_name="dapo_glm4_7_flash_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}_ep${MEGATRON_EP}_etp${MEGATRON_ETP}_tis_r3_lora" \ + trainer.export_path="/mnt/local_storage/exports/dapo_glm4_7_flash_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}_ep${MEGATRON_EP}_etp${MEGATRON_ETP}" \ + trainer.hf_save_interval=300 \ + trainer.resume_mode=null \ + trainer.max_ckpts_to_keep=3 \ + trainer.ckpt_path="/mnt/local_storage/ckpts/dapo_glm4_7_flash_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}_ep${MEGATRON_EP}_etp${MEGATRON_ETP}" \ + $@ \ No newline at end of file diff --git a/examples/train/megatron/run_dapo_glm_flash_r3.sh b/examples/train/megatron/run_dapo_glm_flash_r3.sh new file mode 100644 index 0000000000..e1ba49b2e4 --- /dev/null +++ b/examples/train/megatron/run_dapo_glm_flash_r3.sh @@ -0,0 +1,178 @@ +set -x + +# Colocated DAPO training+generation for GLM-4.7-Flash on DAPO with Megatron. +# GLM-4.7-Flash (zai-org/GLM-4.7-Flash) is a DeepSeek-V3 architecture clone +# with MLA + MoE (64 routed experts, 4 active per token, ~3B active parameters). +# +# Runs on 4 nodes of 8 GPUs each (TP=4 EP=8 for Megatron, 8x TP=4 vLLM engines). +# GLM-4.7-Flash has 20 attention heads, so vLLM TP must divide 20 (use TP=4). +# +# Setup: +# 1. Install deps: +# uv sync --extra megatron +# 2. GLM-4.7-Flash needs transformers>=5.0.0 (for Glm4MoeLiteConfig). +# If not yet available via uv sync, install manually: +# uv pip install "transformers>=5.0.0" +# 3. Prepare data: +# bash examples/algorithms/dapo/prepare_dapo_data.sh +# 4. 
Run: +# export WANDB_API_KEY= # or set LOGGER=console below +# bash examples/train/megatron/run_dapo_glm_flash_r3.sh + +MODEL_NAME="zai-org/GLM-4.7-Flash" +DATA_DIR="/mnt/local_storage/data/dapo" +TRAIN_FILE="$DATA_DIR/dapo-math-17k-cleaned.parquet" +TEST_FILE="$DATA_DIR/aime-2024-cleaned.parquet" +NUM_NODES=4 +NUM_GPUS_PER_NODE=8 +NUM_INFERENCE_ENGINES=8 +INFERENCE_ENGINE_TENSOR_PARALLEL_SIZE=4 +LOGGER="wandb" # change to "console" to print to stdout + +INFERENCE_ENGINE_MAX_MODEL_LEN=32000 + +CLIP_RATIO_LOW=0.2 +CLIP_RATIO_HIGH=0.28 +# use token mean loss reduction +LOSS_REDUCTION="token_mean" +# applies overlong filtering (but not soft overlong punishment) +APPLY_OVERLONG_FILTERING=true +# apply soft overlong punishment with custom trainer impl in main_dapo.py +OVERLONG_BUFFER_LEN=$((1024 * 3)) +OVERLONG_BUFFER_PENALTY_FACTOR=0 + +# other DAPO parameters +USE_KL_LOSS=false +TEMPERATURE=1.0 +TOP_P=1.0 +EVAL_TOP_P=0.7 +CLIP_RATIO_C=10.0 +MAX_PROMPT_LENGTH=$((1024 * 2)) +MAX_RESPONSE_LENGTH=$((1024 * 20)) + +# repro run parameters +TRAIN_BATCH_SIZE=128 +MINI_BATCH_SIZE=32 +N_SAMPLES_PER_PROMPT=8 +EVAL_N_SAMPLES_PER_PROMPT=32 +ENFORCE_EAGER=true # cuda graphs can cause some instability +LR=1e-6 + +# GLM-4.7-Flash supports flash attention (v_head_dim == qk_head_dim + qk_rope_head_dim == 256). 
+FLASH_ATTN=true + +# Megatron parallelism: TP=1, EP=8 fits 64 MoE experts across 8 GPUs (8 experts/GPU) +MEGATRON_TP=4 +MEGATRON_PP=1 +MEGATRON_CP=1 +MEGATRON_EP=8 +MEGATRON_ETP=1 +# MEGATRON_LAST_PIPELINE_STAGE_LAYER=23 + +# MoE routing flags (DeepSeek-V3 style: sigmoid scoring with expert bias) +MOE_TOKEN_DISPATCHER="alltoall" +MOE_ROUTER_LB="none" +MOE_GROUPED_GEMM=true +MOE_ROUTER_SCORE_FN="sigmoid" +MOE_ROUTER_EXPERT_BIAS=true + +# CPU optimizer offload to fit in 80GB GPUs +OPTIMIZER_CPU_OFFLOAD=true +OPTIMIZER_OFFLOAD_FRACTION=1.0 + +# TIS parameters +TIS_IMP_RATIO_CAP=2.0 +USE_TIS=false + +# EFA +SKYRL_LD_LIBRARY_PATH_EXPORT=1 +LD_LIBRARY_PATH=/opt/amazon/efa/lib:$LD_LIBRARY_PATH +FI_PROVIDER=efa + +#r3 +ROUTER_REPLAY=false +DISTRIBUTED_EXECUTOR_BACKEND=mp + +UV_HTTP_TIMEOUT=100 + +SKYRL_RAY_PG_TIMEOUT_IN_S=450 uv run --isolated --extra megatron -m examples.train.algorithms.dapo.main_dapo \ + data.train_data="['$TRAIN_FILE']" \ + data.val_data="['$TEST_FILE']" \ + trainer.algorithm.advantage_estimator="grpo" \ + trainer.algorithm.policy_loss_type="dual_clip" \ + trainer.algorithm.overlong_buffer_len=$OVERLONG_BUFFER_LEN \ + trainer.algorithm.overlong_buffer_penalty_factor=$OVERLONG_BUFFER_PENALTY_FACTOR \ + trainer.algorithm.loss_reduction=$LOSS_REDUCTION \ + generator.inference_engine.enforce_eager=$ENFORCE_EAGER \ + generator.apply_overlong_filtering=$APPLY_OVERLONG_FILTERING \ + generator.sampling_params.temperature=$TEMPERATURE \ + generator.sampling_params.top_p=$TOP_P \ + generator.eval_sampling_params.top_p=$EVAL_TOP_P \ + generator.eval_sampling_params.temperature=$TEMPERATURE \ + generator.eval_sampling_params.max_generate_length=$MAX_RESPONSE_LENGTH \ + generator.inference_engine.enable_return_routed_experts=$ROUTER_REPLAY \ + trainer.policy.megatron_config.moe_enable_routing_replay=$ROUTER_REPLAY \ + generator.inference_engine.distributed_executor_backend="mp" \ + trainer.algorithm.use_kl_loss=$USE_KL_LOSS \ + 
trainer.algorithm.clip_ratio_c=$CLIP_RATIO_C \ + trainer.policy.model.path="$MODEL_NAME" \ + trainer.placement.colocate_all=true \ + trainer.strategy=megatron \ + trainer.placement.policy_num_nodes=$NUM_NODES \ + trainer.placement.policy_num_gpus_per_node=$NUM_GPUS_PER_NODE \ + generator.inference_engine.num_engines=$NUM_INFERENCE_ENGINES \ + generator.inference_engine.tensor_parallel_size=$INFERENCE_ENGINE_TENSOR_PARALLEL_SIZE \ + trainer.policy.megatron_config.tensor_model_parallel_size=$MEGATRON_TP \ + trainer.policy.megatron_config.pipeline_model_parallel_size=$MEGATRON_PP \ + trainer.policy.megatron_config.context_parallel_size=$MEGATRON_CP \ + trainer.policy.megatron_config.expert_model_parallel_size=$MEGATRON_EP \ + trainer.policy.megatron_config.expert_tensor_parallel_size=$MEGATRON_ETP \ + trainer.policy.megatron_config.moe_token_dispatcher_type=$MOE_TOKEN_DISPATCHER \ + trainer.policy.megatron_config.moe_router_load_balancing_type=$MOE_ROUTER_LB \ + trainer.policy.megatron_config.moe_grouped_gemm=$MOE_GROUPED_GEMM \ + trainer.policy.megatron_config.moe_router_score_function=$MOE_ROUTER_SCORE_FN \ + trainer.policy.megatron_config.moe_router_enable_expert_bias=$MOE_ROUTER_EXPERT_BIAS \ + trainer.policy.megatron_config.optimizer_config_kwargs.optimizer_cpu_offload=$OPTIMIZER_CPU_OFFLOAD \ + trainer.policy.megatron_config.optimizer_config_kwargs.optimizer_offload_fraction=$OPTIMIZER_OFFLOAD_FRACTION \ + trainer.policy.megatron_config.empty_cuda_cache=true \ + trainer.use_sample_packing=true \ + trainer.flash_attn=$FLASH_ATTN \ + generator.inference_engine.engine_init_kwargs.max_model_len=$INFERENCE_ENGINE_MAX_MODEL_LEN \ + trainer.algorithm.use_tis=$USE_TIS \ + trainer.algorithm.tis_imp_ratio_cap=$TIS_IMP_RATIO_CAP \ + trainer.epochs=20 \ + trainer.algorithm.eps_clip_low=$CLIP_RATIO_LOW \ + trainer.algorithm.eps_clip_high=$CLIP_RATIO_HIGH \ + trainer.eval_batch_size=1024 \ + trainer.eval_before_train=false \ + trainer.eval_interval=5 \ + 
trainer.update_epochs_per_batch=1 \ + trainer.train_batch_size=$TRAIN_BATCH_SIZE \ + trainer.policy_mini_batch_size=$MINI_BATCH_SIZE \ + trainer.micro_forward_batch_size_per_gpu=2 \ + trainer.micro_train_batch_size_per_gpu=2 \ + trainer.ckpt_interval=10 \ + trainer.max_prompt_length=$MAX_PROMPT_LENGTH \ + generator.sampling_params.max_generate_length=$MAX_RESPONSE_LENGTH \ + trainer.policy.optimizer_config.lr=$LR \ + trainer.policy.optimizer_config.num_warmup_steps=40 \ + trainer.policy.optimizer_config.weight_decay=0.1 \ + trainer.policy.optimizer_config.max_grad_norm=1.0 \ + generator.inference_engine.backend=vllm \ + generator.inference_engine.run_engines_locally=true \ + generator.inference_engine.weight_sync_backend=nccl \ + generator.inference_engine.async_engine=false \ + generator.batched=true \ + environment.env_class=aime \ + generator.n_samples_per_prompt=$N_SAMPLES_PER_PROMPT \ + generator.eval_n_samples_per_prompt=$EVAL_N_SAMPLES_PER_PROMPT \ + generator.inference_engine.gpu_memory_utilization=0.7 \ + trainer.logger="$LOGGER" \ + trainer.project_name="dapo_glm_flash" \ + trainer.run_name="dapo_glm4_7_flash_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}_ep${MEGATRON_EP}_etp${MEGATRON_ETP}_test_full_no_r3" \ + trainer.export_path="/mnt/local_storage/exports/dapo_glm4_7_flash_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}_ep${MEGATRON_EP}_etp${MEGATRON_ETP}" \ + trainer.hf_save_interval=300 \ + trainer.resume_mode=null \ + trainer.max_ckpts_to_keep=3 \ + trainer.ckpt_path="/mnt/local_storage/ckpts/dapo_glm4_7_flash_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}_ep${MEGATRON_EP}_etp${MEGATRON_ETP}" \ + $@ \ No newline at end of file diff --git a/examples/train/megatron/run_megatron.sh b/examples/train/megatron/run_megatron.sh index f5474824fb..8497088143 100644 --- a/examples/train/megatron/run_megatron.sh +++ b/examples/train/megatron/run_megatron.sh @@ -6,8 +6,8 @@ set -x # export WANDB_API_KEY= # bash 
examples/train/megatron/run_megatron.sh -DATA_DIR="$HOME/data/gsm8k" -NUM_GPUS=4 +DATA_DIR="/mnt/local_storage/data/gsm8k" +NUM_GPUS=8 LOGGER="wandb" # change to "console" to print to stdout MODEL_NAME="Qwen/Qwen3-0.6B" @@ -31,7 +31,9 @@ uv run --isolated --extra megatron -m skyrl.train.entrypoints.main_base \ trainer.strategy=megatron \ trainer.placement.policy_num_gpus_per_node=$NUM_GPUS \ trainer.placement.ref_num_gpus_per_node=$NUM_GPUS \ - generator.inference_engine.num_engines=$NUM_GPUS \ + trainer.placement.policy_num_nodes=2 \ + trainer.placement.ref_num_nodes=2 \ + generator.inference_engine.num_engines=16 \ generator.inference_engine.tensor_parallel_size=1 \ trainer.policy.megatron_config.torch_profiler_config.enable=$ENABLE_TORCH_PROFILER \ trainer.policy.megatron_config.torch_profiler_config.ranks=$RANKS_TO_PROFILE \ @@ -48,11 +50,12 @@ uv run --isolated --extra megatron -m skyrl.train.entrypoints.main_base \ trainer.eval_before_train=false \ trainer.eval_interval=5 \ trainer.update_epochs_per_batch=1 \ - trainer.train_batch_size=128 \ + trainer.train_batch_size=64 \ trainer.policy_mini_batch_size=64 \ trainer.micro_forward_batch_size_per_gpu=4 \ trainer.micro_train_batch_size_per_gpu=4 \ - trainer.ckpt_interval=10 \ + trainer.ckpt_interval=1 \ + trainer.max_ckpts_to_keep=-1 \ trainer.max_prompt_length=512 \ generator.sampling_params.max_generate_length=1024 \ trainer.policy.optimizer_config.lr=1.0e-6 \ @@ -66,8 +69,8 @@ uv run --isolated --extra megatron -m skyrl.train.entrypoints.main_base \ generator.n_samples_per_prompt=5 \ generator.inference_engine.gpu_memory_utilization=0.7 \ trainer.logger="$LOGGER" \ - trainer.project_name="gsm8k_megatron" \ - trainer.run_name="gsm8k_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}_${MODEL_NAME}" \ + trainer.project_name="gsm8k_megatron_test" \ + trainer.run_name="gsm8k_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}_${MODEL_NAME}_multinode_s3" \ trainer.resume_mode=null \ - 
trainer.ckpt_path="$HOME/ckpts/gsm8k_megatron_ckpt" \ + trainer.ckpt_path="/mnt/local_storage/gsm8k_ckpt" \ $@ \ No newline at end of file diff --git a/examples/train/megatron/run_megatron_grpo_glm4_7_30b.sh b/examples/train/megatron/run_megatron_grpo_glm4_7_30b.sh index 36b2980740..c365170842 100644 --- a/examples/train/megatron/run_megatron_grpo_glm4_7_30b.sh +++ b/examples/train/megatron/run_megatron_grpo_glm4_7_30b.sh @@ -21,7 +21,7 @@ set -x MODEL_NAME="zai-org/GLM-4.7-Flash" DATA_DIR=${DATA_DIR:-"$HOME/data/gsm8k"} -CKPT_DIR=${CKPT_DIR:-"$HOME/ckpts/glm4_7_30b_a3b_grpo_megatron"} +CKPT_DIR=${CKPT_DIR:-"/mnt/local_storage/ckpts/glm4_7_30b_a3b_grpo_megatron"} LOGGER="wandb" # change to "console" to print to stdout INFERENCE_BACKEND="vllm" @@ -30,11 +30,14 @@ NUM_NODES=1 NUM_GPUS=8 # Megatron parallelism: TP=1, EP=8 fits 64 MoE experts across 8 GPUs (8 experts/GPU) -MEGATRON_TP=1 +MEGATRON_TP=4 MEGATRON_PP=1 MEGATRON_CP=1 MEGATRON_EP=8 MEGATRON_ETP=1 +# MEGATRON_LAST_PIPELINE_STAGE_LAYER=23 + # trainer.policy.megatron_config.transformer_config_kwargs.num_layers_in_last_pipeline_stage=$MEGATRON_LAST_PIPELINE_STAGE_LAYER \ + # vLLM inference: 2 engines x TP=4 = 8 GPUs (20 heads / 4 = 5 heads per GPU) NUM_INFERENCE_ENGINES=2 @@ -57,7 +60,11 @@ MOE_ROUTER_EXPERT_BIAS=true OPTIMIZER_CPU_OFFLOAD=true OPTIMIZER_OFFLOAD_FRACTION=1.0 -uv run --isolated --extra megatron -m skyrl.train.entrypoints.main_base \ +# Routing replay params +ROUTER_REPLAY=True +DISTRIBUTED_EXECUTION_BACKEND="mp" + +SKYRL_RAY_PG_TIMEOUT_IN_S=300 uv run --isolated --extra megatron -m skyrl.train.entrypoints.main_base \ data.train_data="['$DATA_DIR/train.parquet']" \ data.val_data="['$DATA_DIR/validation.parquet']" \ trainer.algorithm.advantage_estimator="grpo" \ @@ -82,6 +89,9 @@ uv run --isolated --extra megatron -m skyrl.train.entrypoints.main_base \ trainer.policy.megatron_config.moe_router_enable_expert_bias=$MOE_ROUTER_EXPERT_BIAS \ 
trainer.policy.megatron_config.optimizer_config_kwargs.optimizer_cpu_offload=$OPTIMIZER_CPU_OFFLOAD \ trainer.policy.megatron_config.optimizer_config_kwargs.optimizer_offload_fraction=$OPTIMIZER_OFFLOAD_FRACTION \ + trainer.policy.megatron_config.moe_enable_routing_replay=$ROUTER_REPLAY \ + generator.inference_engine.enable_return_routed_experts=$ROUTER_REPLAY \ + generator.inference_engine.distributed_executor_backend=$DISTRIBUTED_EXECUTION_BACKEND \ trainer.policy.megatron_config.empty_cuda_cache=true \ trainer.use_sample_packing=true \ trainer.flash_attn=$FLASH_ATTN \ @@ -92,9 +102,9 @@ uv run --isolated --extra megatron -m skyrl.train.entrypoints.main_base \ trainer.update_epochs_per_batch=1 \ trainer.train_batch_size=128 \ trainer.policy_mini_batch_size=64 \ - trainer.micro_forward_batch_size_per_gpu=4 \ - trainer.micro_train_batch_size_per_gpu=4 \ - trainer.ckpt_interval=10 \ + trainer.micro_forward_batch_size_per_gpu=2 \ + trainer.micro_train_batch_size_per_gpu=2 \ + trainer.ckpt_interval=100 \ trainer.max_prompt_length=512 \ generator.sampling_params.max_generate_length=1024 \ trainer.policy.optimizer_config.lr=1.0e-6 \ @@ -108,10 +118,10 @@ uv run --isolated --extra megatron -m skyrl.train.entrypoints.main_base \ generator.batched=true \ environment.env_class=gsm8k \ generator.n_samples_per_prompt=5 \ - generator.inference_engine.gpu_memory_utilization=0.5 \ + generator.inference_engine.gpu_memory_utilization=0.7 \ trainer.logger="$LOGGER" \ trainer.project_name="glm4_7_30b_grpo" \ - trainer.run_name="glm4_7_30b_a3b_grpo_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}_ep${MEGATRON_EP}_etp${MEGATRON_ETP}" \ + trainer.run_name="glm4_7_30b_a3b_grpo_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}_ep${MEGATRON_EP}_etp${MEGATRON_ETP}_fixed_weight_sync_r3" \ trainer.resume_mode=null \ trainer.ckpt_path="$CKPT_DIR" \ $@ diff --git a/examples/train/megatron/run_megatron_grpo_glm4_7_30b_lora.sh 
b/examples/train/megatron/run_megatron_grpo_glm4_7_30b_lora.sh new file mode 100644 index 0000000000..5ec2347c75 --- /dev/null +++ b/examples/train/megatron/run_megatron_grpo_glm4_7_30b_lora.sh @@ -0,0 +1,156 @@ +set -x + +# Colocated GRPO training+generation for GLM-4.7-Flash on GSM8K with Megatron. +# GLM-4.7-Flash (zai-org/GLM-4.7-Flash) is a DeepSeek-V3 architecture clone +# with MLA + MoE (64 routed experts, 4 active per token, ~3B active parameters). +# +# Runs on 1 node of 8 GPUs (TP=4 EP=8 for Megatron, 2x TP=4 vLLM engines). +# GLM-4.7-Flash has 20 attention heads, so vLLM TP must divide 20 (use TP=4). +# +# Setup: +# 1. Install deps: +# uv sync --extra megatron +# 2. GLM-4.7-Flash needs transformers>=5.0.0 (for Glm4MoeLiteConfig). +# If not yet available via uv sync, install manually: +# uv pip install "transformers>=5.0.0" +# 3. Prepare data: +# uv run examples/gsm8k/gsm8k_dataset.py --output_dir $HOME/data/gsm8k +# 4. Run: +# export WANDB_API_KEY= # or set LOGGER=console below +# bash examples/train/megatron/run_megatron_grpo_glm4_7_30b_lora.sh + +MODEL_NAME="zai-org/GLM-4.7-Flash" +DATA_DIR=${DATA_DIR:-"/mnt/local_storage/data/gsm8k"} +CKPT_DIR=${CKPT_DIR:-"/mnt/local_storage/ckpts/glm4_7_30b_a3b_grpo_megatron_lora"} +EXPORT_DIR=${EXPORT_DIR:-"/mnt/local_storage/exports"} +LOGGER="wandb" # change to "console" to print to stdout + +INFERENCE_BACKEND="vllm" + +NUM_NODES=1 +NUM_GPUS=8 + +# Megatron parallelism: TP=4, EP=8 fits 64 MoE experts across 8 GPUs (8 experts/GPU) +MEGATRON_TP=4 +MEGATRON_PP=1 +MEGATRON_CP=1 +MEGATRON_EP=8 +MEGATRON_ETP=1 +# MEGATRON_LAST_PIPELINE_STAGE_LAYER=23 +# trainer.policy.megatron_config.transformer_config_kwargs.num_layers_in_last_pipeline_stage=$MEGATRON_LAST_PIPELINE_STAGE_LAYER \ + + +# vLLM inference: 2 engines x TP=4 = 8 GPUs (20 heads / 4 = 5 heads per GPU) +NUM_INFERENCE_ENGINES=2 +INFERENCE_ENGINE_TP=4 +INFERENCE_ENGINE_MAX_MODEL_LEN=2048 + +# GLM-4.7-Flash supports flash attention (v_head_dim == qk_head_dim + 
qk_rope_head_dim == 256). +# Most other MLA models (DeepSeek-V3, Moonlight) do NOT support flash attention due to +# mismatched Q/V head dimensions. Use flash_attn=false for those models. +FLASH_ATTN=true + +# MoE routing flags (DeepSeek-V3 style: sigmoid scoring with expert bias) +MOE_TOKEN_DISPATCHER="alltoall" +MOE_ROUTER_LB="none" +MOE_GROUPED_GEMM=true +MOE_ROUTER_SCORE_FN="sigmoid" +MOE_ROUTER_EXPERT_BIAS=true +MOE_ROUTER_EXPERT_BIAS_UPDATE_RATE=0 + +# CPU optimizer offload to fit in 80GB GPUs +OPTIMIZER_CPU_OFFLOAD=true +OPTIMIZER_OFFLOAD_FRACTION=1.0 + +# Routing replay params +ROUTER_REPLAY=True +DISTRIBUTED_EXECUTION_BACKEND="mp" + +# LoRA +LORA_RANK=128 +LORA_ALPHA=128 + +# TIS parameters +TIS_IMP_RATIO_CAP=2.0 +USE_TIS=false + +# MIS parameters +SEQ_MASK=null # null to turn off +SEQ_MASK_HIGH=1.05 +SEQ_MASK_LOW=0.95 + +# Policy loss +POLICY_LOSS=regular + + +SKYRL_RAY_PG_TIMEOUT_IN_S=600 uv run --isolated --extra megatron -m skyrl.train.entrypoints.main_base \ + data.train_data="['$DATA_DIR/train.parquet']" \ + data.val_data="['$DATA_DIR/validation.parquet']" \ + trainer.algorithm.advantage_estimator="grpo" \ + trainer.algorithm.policy_loss_type=$POLICY_LOSS \ + trainer.policy.model.path=$MODEL_NAME \ + trainer.placement.colocate_all=true \ + trainer.strategy=megatron \ + trainer.placement.policy_num_nodes=$NUM_NODES \ + trainer.placement.policy_num_gpus_per_node=$NUM_GPUS \ + generator.inference_engine.num_engines=$NUM_INFERENCE_ENGINES \ + generator.inference_engine.tensor_parallel_size=$INFERENCE_ENGINE_TP \ + generator.inference_engine.enforce_eager=true \ + generator.inference_engine.engine_init_kwargs.max_model_len=$INFERENCE_ENGINE_MAX_MODEL_LEN \ + trainer.policy.model.lora.rank=$LORA_RANK \ + trainer.policy.model.lora.alpha=$LORA_ALPHA \ + trainer.policy.megatron_config.tensor_model_parallel_size=$MEGATRON_TP \ + trainer.policy.megatron_config.pipeline_model_parallel_size=$MEGATRON_PP \ + 
trainer.policy.megatron_config.context_parallel_size=$MEGATRON_CP \ + trainer.policy.megatron_config.expert_model_parallel_size=$MEGATRON_EP \ + trainer.policy.megatron_config.expert_tensor_parallel_size=$MEGATRON_ETP \ + trainer.policy.megatron_config.moe_token_dispatcher_type=$MOE_TOKEN_DISPATCHER \ + trainer.policy.megatron_config.moe_router_load_balancing_type=$MOE_ROUTER_LB \ + trainer.policy.megatron_config.moe_grouped_gemm=$MOE_GROUPED_GEMM \ + trainer.policy.megatron_config.moe_router_score_function=$MOE_ROUTER_SCORE_FN \ + trainer.policy.megatron_config.moe_router_enable_expert_bias=$MOE_ROUTER_EXPERT_BIAS \ + trainer.policy.megatron_config.transformer_config_kwargs.moe_router_bias_update_rate=$MOE_ROUTER_EXPERT_BIAS_UPDATE_RATE \ + trainer.policy.megatron_config.optimizer_config_kwargs.optimizer_cpu_offload=$OPTIMIZER_CPU_OFFLOAD \ + trainer.policy.megatron_config.optimizer_config_kwargs.optimizer_offload_fraction=$OPTIMIZER_OFFLOAD_FRACTION \ + trainer.algorithm.use_tis=$USE_TIS \ + trainer.algorithm.tis_imp_ratio_cap=$TIS_IMP_RATIO_CAP \ + trainer.algorithm.off_policy_correction.sequence_mask_metric=$SEQ_MASK \ + trainer.algorithm.off_policy_correction.geo_mask_high=$SEQ_MASK_HIGH \ + trainer.algorithm.off_policy_correction.geo_mask_low=$SEQ_MASK_LOW \ + trainer.policy.megatron_config.moe_enable_routing_replay=$ROUTER_REPLAY \ + generator.inference_engine.enable_return_routed_experts=$ROUTER_REPLAY \ + generator.inference_engine.distributed_executor_backend=$DISTRIBUTED_EXECUTION_BACKEND \ + trainer.policy.megatron_config.empty_cuda_cache=true \ + trainer.use_sample_packing=true \ + trainer.flash_attn=$FLASH_ATTN \ + trainer.epochs=20 \ + trainer.eval_batch_size=1024 \ + trainer.eval_before_train=false \ + trainer.eval_interval=5 \ + trainer.update_epochs_per_batch=1 \ + trainer.train_batch_size=128 \ + trainer.policy_mini_batch_size=64 \ + trainer.micro_forward_batch_size_per_gpu=2 \ + trainer.micro_train_batch_size_per_gpu=2 \ + 
trainer.ckpt_interval=10 \ + trainer.max_prompt_length=512 \ + generator.sampling_params.max_generate_length=1024 \ + trainer.policy.optimizer_config.lr=1.0e-5 \ + trainer.policy.optimizer_config.weight_decay=0.1 \ + trainer.policy.optimizer_config.max_grad_norm=1.0 \ + trainer.algorithm.use_kl_loss=false \ + generator.inference_engine.backend=$INFERENCE_BACKEND \ + generator.inference_engine.run_engines_locally=true \ + generator.inference_engine.weight_sync_backend=nccl \ + generator.inference_engine.async_engine=true \ + generator.batched=true \ + environment.env_class=gsm8k \ + generator.n_samples_per_prompt=5 \ + generator.inference_engine.gpu_memory_utilization=0.7 \ + trainer.logger="$LOGGER" \ + trainer.project_name="glm4_7_30b_grpo" \ + trainer.run_name="glm4_7_30b_a3b_grpo_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}_ep${MEGATRON_EP}_etp${MEGATRON_ETP}_lora_${LORA_RANK}_${LORA_ALPHA}_lr_1e-5_r3_moe_router_bias_update_rate_0" \ + trainer.resume_mode=null \ + trainer.ckpt_path="$CKPT_DIR" \ + trainer.export_path="$EXPORT_DIR" \ + $@ diff --git a/pyproject.toml b/pyproject.toml index 4012edbc74..e8c670730c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,6 @@ dependencies = [ "rich>=14.1.0", "safetensors>=0.6.2", "tokenizers>=0.21.2", - "transformers>=4.56.1,<5", "typer>=0.17.4", # "wandb>=0.22.0", "peft", @@ -209,6 +208,7 @@ override-dependencies = [ "transformer-engine[pytorch]==2.10.0; sys_platform == 'linux'", "megatron-core==0.16.0; sys_platform == 'linux'", "ml_dtypes>=0.5.0; sys_platform == 'linux'", + "transformers>=5.0.0", ] [tool.uv.extra-build-dependencies] @@ -253,6 +253,7 @@ torchvision = [ ] # pin megatron bridge commit to fix for MoE + LoRA merging. 
Update this when an official release is cut megatron-bridge = {git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge", rev = "02b5fccab5e5b21856d36c2e357839e0123b4b8f", marker = "sys_platform == 'linux'"} +# megatron-bridge = { path = "./Megatron-Bridge", editable = true, marker = "sys_platform == 'linux'" } harbor = { git = "https://github.com/laude-institute/harbor", rev = "8c040e1bb010201fd3c75bee3dede2407b9f57cd" } [tool.black] diff --git a/skyrl/backends/skyrl_train/workers/megatron/megatron_worker.py b/skyrl/backends/skyrl_train/workers/megatron/megatron_worker.py index 636c518154..5f4f1f12bd 100644 --- a/skyrl/backends/skyrl_train/workers/megatron/megatron_worker.py +++ b/skyrl/backends/skyrl_train/workers/megatron/megatron_worker.py @@ -477,7 +477,7 @@ def forward(self, data: TrainingInputBatch): "position_ids": position_ids, "num_actions": num_actions, "rollout_expert_indices": ( - micro.get("rollout_expert_indices") if self.enable_router_replay else None + micro.get("rollout_expert_indices").to(torch.int32) if self.enable_router_replay else None ), } ) @@ -672,6 +672,9 @@ def forward_backward( # Move data to GPU data.to(torch.cuda.current_device()) + # make sure everyone starts the forward pass after data is materialized + torch.distributed.barrier() + # Build micro-batch dicts expected by forward_backward_mini_batch micro_buffer = [] for experience in BatchIterator(data, micro_batch_size, drop_last=False): @@ -692,7 +695,7 @@ def forward_backward( "loss_mask": experience.loss_mask, "rollout_action_logprobs": experience.rollout_logprobs, "action_mask": experience.action_mask, - "rollout_expert_indices": experience.rollout_expert_indices if self.enable_router_replay else None, + "rollout_expert_indices": experience.rollout_expert_indices.to(torch.int32) if self.enable_router_replay else None, } ) diff --git a/skyrl/train/dataset/preprocess.py b/skyrl/train/dataset/preprocess.py index 7b083bc06f..7e6cd9460d 100644 --- a/skyrl/train/dataset/preprocess.py +++ 
b/skyrl/train/dataset/preprocess.py @@ -172,6 +172,11 @@ def convert_prompts_responses_to_batch_tensors( n = min(len(sample_indices), max_total - left_pad) padded[i, left_pad : left_pad + n] = torch.tensor(sample_indices[:n], dtype=torch.int32) rollout_expert_indices_tensor = padded + if rollout_expert_indices_tensor.max().item() < 2**8: + rollout_expert_indices_tensor = rollout_expert_indices_tensor.to(torch.uint8) + else: + # this should handle num_experts <= 2**15, which seems like a safe limit for number of experts in an MoE layer (god willing) + rollout_expert_indices_tensor = rollout_expert_indices_tensor.to(torch.int16) return ( sequences, diff --git a/skyrl/train/utils/utils.py b/skyrl/train/utils/utils.py index b2dd2a2b13..9f83c6caf6 100644 --- a/skyrl/train/utils/utils.py +++ b/skyrl/train/utils/utils.py @@ -582,6 +582,11 @@ def prepare_runtime_environment(cfg: SkyRLTrainConfig) -> dict[str, str]: if cfg.generator.inference_engine.weight_sync_backend == "nccl": env_vars["NCCL_CUMEM_ENABLE"] = "0" + # env_vars["UV_CACHE_DIR"] = "/mnt/local_storage/uv_cache" + env_vars["HF_HOME"] = "/mnt/local_storage/hf_cache" + env_vars["TRANSFORMERS_CACHE"] = "/mnt/local_storage/hf_cache" + env_vars["HF_DATASETS_CACHE"] = "/mnt/local_storage/hf_cache" + if cfg.trainer.strategy == "megatron": # this is needed for megatron-core >= 0.15.0, which requires devices to be visible while importing megatron.core env_vars["RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO"] = "0" diff --git a/uv.lock b/uv.lock index 75c4a8802c..3668a187b3 100644 --- a/uv.lock +++ b/uv.lock @@ -299,6 +299,7 @@ overrides = [ { name = "ml-dtypes", marker = "sys_platform == 'linux'", specifier = ">=0.5.0" }, { name = "nvidia-resiliency-ext", marker = "sys_platform == 'never'" }, { name = "transformer-engine", extras = ["pytorch"], marker = "sys_platform == 'linux'", specifier = "==2.10.0" }, + { name = "transformers", specifier = ">=5.0.0" }, ] [[package]] @@ -2684,31 +2685,34 @@ wheels = [ [[package]] name = 
"hf-xet" -version = "1.3.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/4f/3a/9aa61729228fb03e946409c51963f0cd2fd7c109f4ab93edc5f04a10be86/hf_xet-1.3.0.tar.gz", hash = "sha256:9c154ad63e17aca970987b2cf17dbd8a0c09bb18aeb246f637647a8058e4522b", size = 641390, upload-time = "2026-02-24T00:16:19.935Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/b7/18/16954a87cfdfdc04792f1ffc9a29c0a48253ab10ec0f4856f39c7f7bf7cd/hf_xet-1.3.0-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:95bdeab4747cb45f855601e39b9e86ae92b4a114978ada6e0401961fcc5d2958", size = 3759481, upload-time = "2026-02-24T00:16:03.387Z" }, - { url = "https://files.pythonhosted.org/packages/d8/6f/a55752047e9b0e69517775531c14680331f00c9cd4dc07f5e9b7f7f68a12/hf_xet-1.3.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:f99992583f27b139392601fe99e88df155dc4de7feba98ed27ce2d3e6b4a65bb", size = 3517927, upload-time = "2026-02-24T00:16:02.108Z" }, - { url = "https://files.pythonhosted.org/packages/ef/71/a909dbf9c8b166aa3f15db2bcf5d8afbe9d53170922edde2b919cf0bc455/hf_xet-1.3.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:687a71fc6d2eaa79d864da3aa13e5d887e124d357f5f306bfff6c385eea9d990", size = 4174328, upload-time = "2026-02-24T00:15:55.056Z" }, - { url = "https://files.pythonhosted.org/packages/21/cc/dec0d971bb5872345b8d64363a0b78ed6a147eea5b4281575ce5a8150f42/hf_xet-1.3.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:75d19813ed0e24525409bc22566282ae9bc93e5d764b185565e863dc28280a45", size = 3953184, upload-time = "2026-02-24T00:15:53.43Z" }, - { url = "https://files.pythonhosted.org/packages/3d/d8/d4259146e7c7089dd3f22cd62676d665bcfbc27428a070abee8985e0ab33/hf_xet-1.3.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:078af43569c2e05233137a93a33d2293f95c272745eaf030a9bb5f27bb0c9e9c", size = 4152800, upload-time = "2026-02-24T00:16:10.391Z" }, - { url = 
"https://files.pythonhosted.org/packages/c9/0d/39d9d32e4cde689da618739197e264bba5a55d870377d5d32cdd5c03fad8/hf_xet-1.3.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:be8731e1620cc8549025c39ed3917c8fd125efaeae54ae679214a3d573e6c109", size = 4390499, upload-time = "2026-02-24T00:16:11.671Z" }, - { url = "https://files.pythonhosted.org/packages/d9/27/5b9c323bf5513e8971702eeac43ba5cb554921e0f292ad52f20ed6028131/hf_xet-1.3.0-cp313-cp313t-win_amd64.whl", hash = "sha256:1552616c0e0fa728a4ffdffa106e91faa0fd4edb44868e79b464fad00b2758ee", size = 3634124, upload-time = "2026-02-24T00:16:20.964Z" }, - { url = "https://files.pythonhosted.org/packages/85/32/76949adb65b7ca54c1e2b0519a98f7c88221b9091ae8780fc76d7d1bae70/hf_xet-1.3.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:a61496eccf412d7c51a5613c31a2051d357ddea6be53a0672c7644cf39bfefe9", size = 3759780, upload-time = "2026-02-24T00:16:09.037Z" }, - { url = "https://files.pythonhosted.org/packages/63/c4/ad6fa712611711c129fa49eb17baaf0665647eb0abce32d94ccd44b69c6d/hf_xet-1.3.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:aba35218871cc438826076778958f7ab2a1f4f8d654e91c307073a815360558f", size = 3517640, upload-time = "2026-02-24T00:16:07.536Z" }, - { url = "https://files.pythonhosted.org/packages/15/6b/b44659c5261cde6320a579d0acc949f19283a13d32fc9389fc49639f435e/hf_xet-1.3.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c444d8f657dedd7a72aa0ef0178fe01fe92b04b58014ee49e2b3b4985aea1529", size = 4174285, upload-time = "2026-02-24T00:16:00.848Z" }, - { url = "https://files.pythonhosted.org/packages/61/cf/16ef1b366482fa4e71d1642b019158d7ac891bcb961477102ceadfe69436/hf_xet-1.3.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:6d1bbda7900d72bc591cd39a64e35ad07f89a24f90e3d7b7c692cb93a1926cde", size = 3952705, upload-time = "2026-02-24T00:15:59.355Z" }, - { url = 
"https://files.pythonhosted.org/packages/d5/5a/d03453902ab9373715f50f3969979782a355df94329ea958ae78304ca06b/hf_xet-1.3.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:588f5df302e7dba5c3b60d4e5c683f95678526c29b9f64cbeb23e9f1889c6b83", size = 4152353, upload-time = "2026-02-24T00:16:15.857Z" }, - { url = "https://files.pythonhosted.org/packages/ab/98/d3cd8cdd8d771bee9a03bd52faed6fa114a68a107a0e337aaf0b4c52bf0c/hf_xet-1.3.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:944ae454b296c42b18219c37f245c78d0e64a734057423e9309f4938faa85d7f", size = 4390010, upload-time = "2026-02-24T00:16:18.713Z" }, - { url = "https://files.pythonhosted.org/packages/1f/10/3c58501d44d7a148d749ffa6046cbd14aa75a7ab07c9e7a984f86294cc53/hf_xet-1.3.0-cp314-cp314t-win_amd64.whl", hash = "sha256:34cdd5f10e61b7a1a7542672d20887c85debcfeb70a471ff1506f5a4c9441e42", size = 3634277, upload-time = "2026-02-24T00:16:23.718Z" }, - { url = "https://files.pythonhosted.org/packages/a1/00/22d3d896466ded4c46ef6465b85fa434fa97d79f8f61cea322afde1d6157/hf_xet-1.3.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:df4447f69086dcc6418583315eda6ed09033ac1fbbc784fedcbbbdf67bea1680", size = 3761293, upload-time = "2026-02-24T00:16:06.012Z" }, - { url = "https://files.pythonhosted.org/packages/97/fd/ebb0ea49e9bd9eb9f52844e417e0e6e9c8a59a1e84790691873fa910adc5/hf_xet-1.3.0-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:39f4fe714628adc2214ab4a67391182ee751bc4db581868cb3204900817758a8", size = 3523345, upload-time = "2026-02-24T00:16:04.615Z" }, - { url = "https://files.pythonhosted.org/packages/8a/bb/72ceaaf619cad23d151a281d52e15456bae72f52c3795e820c0b64a5f637/hf_xet-1.3.0-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9b16e53ed6b5c8197cefb3fd12047a430b7034428effed463c03cec68de7e9a3", size = 4178623, upload-time = "2026-02-24T00:15:57.857Z" }, - { url = 
"https://files.pythonhosted.org/packages/19/30/3280f4b5e407b442923a80ac0b2d96a65be7494457c55695e63f9a2b33dd/hf_xet-1.3.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:92051a1f73019489be77f6837671024ec785a3d1b888466b09d3a9ea15c4a1b5", size = 3958884, upload-time = "2026-02-24T00:15:56.326Z" }, - { url = "https://files.pythonhosted.org/packages/8f/13/5174c6d52583e54a761c88570ca657d621ac684747613f47846debfd6d4d/hf_xet-1.3.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:943046b160e7804a85e68a659d2eee1a83ce3661f72d1294d3cc5ece0f45a355", size = 4158146, upload-time = "2026-02-24T00:16:13.158Z" }, - { url = "https://files.pythonhosted.org/packages/12/13/ea8619021b119e19efdcaeec72f762b5be923cf79b5d4434f2cbbff39829/hf_xet-1.3.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:9b798a95d41b4f33b0b455c8aa76ff1fd26a587a4dd3bdec29f0a37c60b78a2f", size = 4395565, upload-time = "2026-02-24T00:16:14.574Z" }, - { url = "https://files.pythonhosted.org/packages/64/cd/b81d922118a171bfbbecffd60a477e79188ab876260412fac47226a685bf/hf_xet-1.3.0-cp37-abi3-win_amd64.whl", hash = "sha256:227eee5b99d19b9f20c31d901a0c2373af610a24a34e6c2701072c9de48d6d95", size = 3637830, upload-time = "2026-02-24T00:16:22.474Z" }, +version = "1.4.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/09/08/23c84a26716382c89151b5b447b4beb19e3345f3a93d3b73009a71a57ad3/hf_xet-1.4.2.tar.gz", hash = "sha256:b7457b6b482d9e0743bd116363239b1fa904a5e65deede350fbc0c4ea67c71ea", size = 672357, upload-time = "2026-03-13T06:58:51.077Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/18/06/e8cf74c3c48e5485c7acc5a990d0d8516cdfb5fdf80f799174f1287cc1b5/hf_xet-1.4.2-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:ac8202ae1e664b2c15cdfc7298cbb25e80301ae596d602ef7870099a126fcad4", size = 3796125, upload-time = "2026-03-13T06:58:33.177Z" }, + { url = 
"https://files.pythonhosted.org/packages/66/d4/b73ebab01cbf60777323b7de9ef05550790451eb5172a220d6b9845385ec/hf_xet-1.4.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:6d2f8ee39fa9fba9af929f8c0d0482f8ee6e209179ad14a909b6ad78ffcb7c81", size = 3555985, upload-time = "2026-03-13T06:58:31.797Z" }, + { url = "https://files.pythonhosted.org/packages/ff/e7/ded6d1bd041c3f2bca9e913a0091adfe32371988e047dd3a68a2463c15a2/hf_xet-1.4.2-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4642a6cf249c09da8c1f87fe50b24b2a3450b235bf8adb55700b52f0ea6e2eb6", size = 4212085, upload-time = "2026-03-13T06:58:24.323Z" }, + { url = "https://files.pythonhosted.org/packages/97/c1/a0a44d1f98934f7bdf17f7a915b934f9fca44bb826628c553589900f6df8/hf_xet-1.4.2-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:769431385e746c92dc05492dde6f687d304584b89c33d79def8367ace06cb555", size = 3988266, upload-time = "2026-03-13T06:58:22.887Z" }, + { url = "https://files.pythonhosted.org/packages/7a/82/be713b439060e7d1f1d93543c8053d4ef2fe7e6922c5b31642eaa26f3c4b/hf_xet-1.4.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:c9dd1c1bc4cc56168f81939b0e05b4c36dd2d28c13dc1364b17af89aa0082496", size = 4188513, upload-time = "2026-03-13T06:58:40.858Z" }, + { url = "https://files.pythonhosted.org/packages/21/a6/cbd4188b22abd80ebd0edbb2b3e87f2633e958983519980815fb8314eae5/hf_xet-1.4.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:fca58a2ae4e6f6755cc971ac6fcdf777ea9284d7e540e350bb000813b9a3008d", size = 4428287, upload-time = "2026-03-13T06:58:42.601Z" }, + { url = "https://files.pythonhosted.org/packages/b2/4e/84e45b25e2e3e903ed3db68d7eafa96dae9a1d1f6d0e7fc85120347a852f/hf_xet-1.4.2-cp313-cp313t-win_amd64.whl", hash = "sha256:163aab46854ccae0ab6a786f8edecbbfbaa38fcaa0184db6feceebf7000c93c0", size = 3665574, upload-time = "2026-03-13T06:58:53.881Z" }, + { url = 
"https://files.pythonhosted.org/packages/ee/71/c5ac2b9a7ae39c14e91973035286e73911c31980fe44e7b1d03730c00adc/hf_xet-1.4.2-cp313-cp313t-win_arm64.whl", hash = "sha256:09b138422ecbe50fd0c84d4da5ff537d27d487d3607183cd10e3e53f05188e82", size = 3528760, upload-time = "2026-03-13T06:58:52.187Z" }, + { url = "https://files.pythonhosted.org/packages/1e/0f/fcd2504015eab26358d8f0f232a1aed6b8d363a011adef83fe130bff88f7/hf_xet-1.4.2-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:949dcf88b484bb9d9276ca83f6599e4aa03d493c08fc168c124ad10b2e6f75d7", size = 3796493, upload-time = "2026-03-13T06:58:39.267Z" }, + { url = "https://files.pythonhosted.org/packages/82/56/19c25105ff81731ca6d55a188b5de2aa99d7a2644c7aa9de1810d5d3b726/hf_xet-1.4.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:41659966020d59eb9559c57de2cde8128b706a26a64c60f0531fa2318f409418", size = 3555797, upload-time = "2026-03-13T06:58:37.546Z" }, + { url = "https://files.pythonhosted.org/packages/bf/e3/8933c073186849b5e06762aa89847991d913d10a95d1603eb7f2c3834086/hf_xet-1.4.2-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5c588e21d80010119458dd5d02a69093f0d115d84e3467efe71ffb2c67c19146", size = 4212127, upload-time = "2026-03-13T06:58:30.539Z" }, + { url = "https://files.pythonhosted.org/packages/eb/01/f89ebba4e369b4ed699dcb60d3152753870996f41c6d22d3d7cac01310e1/hf_xet-1.4.2-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:a296744d771a8621ad1d50c098d7ab975d599800dae6d48528ba3944e5001ba0", size = 3987788, upload-time = "2026-03-13T06:58:29.139Z" }, + { url = "https://files.pythonhosted.org/packages/84/4d/8a53e5ffbc2cc33bbf755382ac1552c6d9af13f623ed125fe67cc3e6772f/hf_xet-1.4.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:f563f7efe49588b7d0629d18d36f46d1658fe7e08dce3fa3d6526e1c98315e2d", size = 4188315, upload-time = "2026-03-13T06:58:48.017Z" }, + { url = 
"https://files.pythonhosted.org/packages/d1/b8/b7a1c1b5592254bd67050632ebbc1b42cc48588bf4757cb03c2ef87e704a/hf_xet-1.4.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:5b2e0132c56d7ee1bf55bdb638c4b62e7106f6ac74f0b786fed499d5548c5570", size = 4428306, upload-time = "2026-03-13T06:58:49.502Z" }, + { url = "https://files.pythonhosted.org/packages/a0/0c/40779e45b20e11c7c5821a94135e0207080d6b3d76e7b78ccb413c6f839b/hf_xet-1.4.2-cp314-cp314t-win_amd64.whl", hash = "sha256:2f45c712c2fa1215713db10df6ac84b49d0e1c393465440e9cb1de73ecf7bbf6", size = 3665826, upload-time = "2026-03-13T06:58:59.88Z" }, + { url = "https://files.pythonhosted.org/packages/51/4c/e2688c8ad1760d7c30f7c429c79f35f825932581bc7c9ec811436d2f21a0/hf_xet-1.4.2-cp314-cp314t-win_arm64.whl", hash = "sha256:6d53df40616f7168abfccff100d232e9d460583b9d86fa4912c24845f192f2b8", size = 3529113, upload-time = "2026-03-13T06:58:58.491Z" }, + { url = "https://files.pythonhosted.org/packages/b4/86/b40b83a2ff03ef05c4478d2672b1fc2b9683ff870e2b25f4f3af240f2e7b/hf_xet-1.4.2-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:71f02d6e4cdd07f344f6844845d78518cc7186bd2bc52d37c3b73dc26a3b0bc5", size = 3800339, upload-time = "2026-03-13T06:58:36.245Z" }, + { url = "https://files.pythonhosted.org/packages/64/2e/af4475c32b4378b0e92a587adb1aa3ec53e3450fd3e5fe0372a874531c00/hf_xet-1.4.2-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:e9b38d876e94d4bdcf650778d6ebbaa791dd28de08db9736c43faff06ede1b5a", size = 3559664, upload-time = "2026-03-13T06:58:34.787Z" }, + { url = "https://files.pythonhosted.org/packages/3c/4c/781267da3188db679e601de18112021a5cb16506fe86b246e22c5401a9c4/hf_xet-1.4.2-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:77e8c180b7ef12d8a96739a4e1e558847002afe9ea63b6f6358b2271a8bdda1c", size = 4217422, upload-time = "2026-03-13T06:58:27.472Z" }, + { url = 
"https://files.pythonhosted.org/packages/68/47/d6cf4a39ecf6c7705f887a46f6ef5c8455b44ad9eb0d391aa7e8a2ff7fea/hf_xet-1.4.2-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:c3b3c6a882016b94b6c210957502ff7877802d0dbda8ad142c8595db8b944271", size = 3992847, upload-time = "2026-03-13T06:58:25.989Z" }, + { url = "https://files.pythonhosted.org/packages/2d/ef/e80815061abff54697239803948abc665c6b1d237102c174f4f7a9a5ffc5/hf_xet-1.4.2-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:9d9a634cc929cfbaf2e1a50c0e532ae8c78fa98618426769480c58501e8c8ac2", size = 4193843, upload-time = "2026-03-13T06:58:44.59Z" }, + { url = "https://files.pythonhosted.org/packages/54/75/07f6aa680575d9646c4167db6407c41340cbe2357f5654c4e72a1b01ca14/hf_xet-1.4.2-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:6b0932eb8b10317ea78b7da6bab172b17be03bbcd7809383d8d5abd6a2233e04", size = 4432751, upload-time = "2026-03-13T06:58:46.533Z" }, + { url = "https://files.pythonhosted.org/packages/cd/71/193eabd7e7d4b903c4aa983a215509c6114915a5a237525ec562baddb868/hf_xet-1.4.2-cp37-abi3-win_amd64.whl", hash = "sha256:ad185719fb2e8ac26f88c8100562dbf9dbdcc3d9d2add00faa94b5f106aea53f", size = 3671149, upload-time = "2026-03-13T06:58:57.07Z" }, + { url = "https://files.pythonhosted.org/packages/b4/7e/ccf239da366b37ba7f0b36095450efae4a64980bdc7ec2f51354205fdf39/hf_xet-1.4.2-cp37-abi3-win_arm64.whl", hash = "sha256:32c012286b581f783653e718c1862aea5b9eb140631685bb0c5e7012c8719a87", size = 3533426, upload-time = "2026-03-13T06:58:55.46Z" }, ] [[package]] @@ -2800,21 +2804,22 @@ wheels = [ [[package]] name = "huggingface-hub" -version = "0.36.2" +version = "1.7.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "filelock" }, { name = "fsspec" }, - { name = "hf-xet", marker = "platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64' or (extra == 'extra-5-skyrl-flashrl' and extra == 'extra-5-skyrl-fsdp') or (extra == 
'extra-5-skyrl-flashrl' and extra == 'extra-5-skyrl-gpu') or (extra == 'extra-5-skyrl-flashrl' and extra == 'extra-5-skyrl-jax') or (extra == 'extra-5-skyrl-flashrl' and extra == 'extra-5-skyrl-megatron') or (extra == 'extra-5-skyrl-flashrl' and extra == 'extra-5-skyrl-miniswe') or (extra == 'extra-5-skyrl-flashrl' and extra == 'extra-5-skyrl-tpu') or (extra == 'extra-5-skyrl-fsdp' and extra == 'extra-5-skyrl-jax') or (extra == 'extra-5-skyrl-fsdp' and extra == 'extra-5-skyrl-megatron') or (extra == 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-megatron') or (extra == 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-miniswe') or (extra == 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-tpu') or (extra == 'extra-5-skyrl-jax' and extra == 'extra-5-skyrl-megatron') or (extra == 'extra-5-skyrl-megatron' and extra == 'extra-5-skyrl-miniswe') or (extra == 'extra-5-skyrl-megatron' and extra == 'extra-5-skyrl-tpu') or (extra == 'extra-5-skyrl-miniswe' and extra == 'extra-5-skyrl-tpu')" }, + { name = "hf-xet", marker = "platform_machine == 'AMD64' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64' or (extra == 'extra-5-skyrl-flashrl' and extra == 'extra-5-skyrl-fsdp') or (extra == 'extra-5-skyrl-flashrl' and extra == 'extra-5-skyrl-gpu') or (extra == 'extra-5-skyrl-flashrl' and extra == 'extra-5-skyrl-jax') or (extra == 'extra-5-skyrl-flashrl' and extra == 'extra-5-skyrl-megatron') or (extra == 'extra-5-skyrl-flashrl' and extra == 'extra-5-skyrl-miniswe') or (extra == 'extra-5-skyrl-flashrl' and extra == 'extra-5-skyrl-tpu') or (extra == 'extra-5-skyrl-fsdp' and extra == 'extra-5-skyrl-jax') or (extra == 'extra-5-skyrl-fsdp' and extra == 'extra-5-skyrl-megatron') or (extra == 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-megatron') or (extra == 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-miniswe') or (extra == 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-tpu') or (extra == 
'extra-5-skyrl-jax' and extra == 'extra-5-skyrl-megatron') or (extra == 'extra-5-skyrl-megatron' and extra == 'extra-5-skyrl-miniswe') or (extra == 'extra-5-skyrl-megatron' and extra == 'extra-5-skyrl-tpu') or (extra == 'extra-5-skyrl-miniswe' and extra == 'extra-5-skyrl-tpu')" }, + { name = "httpx" }, { name = "packaging" }, { name = "pyyaml" }, - { name = "requests" }, { name = "tqdm" }, + { name = "typer" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/7c/b7/8cb61d2eece5fb05a83271da168186721c450eb74e3c31f7ef3169fa475b/huggingface_hub-0.36.2.tar.gz", hash = "sha256:1934304d2fb224f8afa3b87007d58501acfda9215b334eed53072dd5e815ff7a", size = 649782, upload-time = "2026-02-06T09:24:13.098Z" } +sdist = { url = "https://files.pythonhosted.org/packages/19/15/eafc1c57bf0f8afffb243dcd4c0cceb785e956acc17bba4d9bf2ae21fc9c/huggingface_hub-1.7.2.tar.gz", hash = "sha256:7f7e294e9bbb822e025bdb2ada025fa4344d978175a7f78e824d86e35f7ab43b", size = 724684, upload-time = "2026-03-20T10:36:08.767Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/a8/af/48ac8483240de756d2438c380746e7130d1c6f75802ef22f3c6d49982787/huggingface_hub-0.36.2-py3-none-any.whl", hash = "sha256:48f0c8eac16145dfce371e9d2d7772854a4f591bcb56c9cf548accf531d54270", size = 566395, upload-time = "2026-02-06T09:24:11.133Z" }, + { url = "https://files.pythonhosted.org/packages/08/de/3ad061a05f74728927ded48c90b73521b9a9328c85d841bdefb30e01fb85/huggingface_hub-1.7.2-py3-none-any.whl", hash = "sha256:288f33a0a17b2a73a1359e2a5fd28d1becb2c121748c6173ab8643fb342c850e", size = 618036, upload-time = "2026-03-20T10:36:06.824Z" }, ] [[package]] @@ -9445,7 +9450,6 @@ dependencies = [ { name = "rich" }, { name = "safetensors" }, { name = "tokenizers" }, - { name = "transformers" }, { name = "typer" }, ] @@ -9754,7 +9758,6 @@ requires-dist = [ { name = "torchvision", marker = "sys_platform == 'linux' and extra == 'megatron'", index = 
"https://download.pytorch.org/whl/cu128" }, { name = "tqdm", marker = "extra == 'skyrl-train'" }, { name = "transformer-engine", extras = ["pytorch"], marker = "sys_platform == 'linux' and extra == 'megatron'", specifier = "==2.10.0" }, - { name = "transformers", specifier = ">=4.56.1,<5" }, { name = "transformers", marker = "extra == 'skyrl-train'", specifier = ">=4.51.0" }, { name = "ty", marker = "extra == 'dev'" }, { name = "typer", specifier = ">=0.17.4" }, @@ -9767,7 +9770,7 @@ provides-extras = ["gpu", "tpu", "tinker", "aws", "gcp", "azure", "jax", "skyrl- [[package]] name = "skyrl-gym" -version = "0.1.1" +version = "0.2.0" source = { editable = "skyrl-gym" } dependencies = [ { name = "func-timeout" }, @@ -10554,6 +10557,12 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0f/8b/4b61d6e13f7108f36910df9ab4b58fd389cc2520d54d81b88660804aad99/torch-2.10.0-2-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:418997cb02d0a0f1497cf6a09f63166f9f5df9f3e16c8a716ab76a72127c714f", size = 79423467, upload-time = "2026-02-10T21:44:48.711Z" }, { url = "https://files.pythonhosted.org/packages/d3/54/a2ba279afcca44bbd320d4e73675b282fcee3d81400ea1b53934efca6462/torch-2.10.0-2-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:13ec4add8c3faaed8d13e0574f5cd4a323c11655546f91fbe6afa77b57423574", size = 79498202, upload-time = "2026-02-10T21:44:52.603Z" }, { url = "https://files.pythonhosted.org/packages/ec/23/2c9fe0c9c27f7f6cb865abcea8a4568f29f00acaeadfc6a37f6801f84cb4/torch-2.10.0-2-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:e521c9f030a3774ed770a9c011751fb47c4d12029a3d6522116e48431f2ff89e", size = 79498254, upload-time = "2026-02-10T21:44:44.095Z" }, + { url = "https://files.pythonhosted.org/packages/36/ab/7b562f1808d3f65414cd80a4f7d4bb00979d9355616c034c171249e1a303/torch-2.10.0-3-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:ac5bdcbb074384c66fa160c15b1ead77839e3fe7ed117d667249afce0acabfac", size = 915518691, upload-time = "2026-03-11T14:15:43.147Z" }, 
+ { url = "https://files.pythonhosted.org/packages/b3/7a/abada41517ce0011775f0f4eacc79659bc9bc6c361e6bfe6f7052a6b9363/torch-2.10.0-3-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:98c01b8bb5e3240426dcde1446eed6f40c778091c8544767ef1168fc663a05a6", size = 915622781, upload-time = "2026-03-11T14:17:11.354Z" }, + { url = "https://files.pythonhosted.org/packages/ab/c6/4dfe238342ffdcec5aef1c96c457548762d33c40b45a1ab7033bb26d2ff2/torch-2.10.0-3-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:80b1b5bfe38eb0e9f5ff09f206dcac0a87aadd084230d4a36eea5ec5232c115b", size = 915627275, upload-time = "2026-03-11T14:16:11.325Z" }, + { url = "https://files.pythonhosted.org/packages/d8/f0/72bf18847f58f877a6a8acf60614b14935e2f156d942483af1ffc081aea0/torch-2.10.0-3-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:46b3574d93a2a8134b3f5475cfb98e2eb46771794c57015f6ad1fb795ec25e49", size = 915523474, upload-time = "2026-03-11T14:17:44.422Z" }, + { url = "https://files.pythonhosted.org/packages/f4/39/590742415c3030551944edc2ddc273ea1fdfe8ffb2780992e824f1ebee98/torch-2.10.0-3-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:b1d5e2aba4eb7f8e87fbe04f86442887f9167a35f092afe4c237dfcaaef6e328", size = 915632474, upload-time = "2026-03-11T14:15:13.666Z" }, + { url = "https://files.pythonhosted.org/packages/b6/8e/34949484f764dde5b222b7fe3fede43e4a6f0da9d7f8c370bb617d629ee2/torch-2.10.0-3-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:0228d20b06701c05a8f978357f657817a4a63984b0c90745def81c18aedfa591", size = 915523882, upload-time = "2026-03-11T14:14:46.311Z" }, { url = "https://files.pythonhosted.org/packages/78/89/f5554b13ebd71e05c0b002f95148033e730d3f7067f67423026cc9c69410/torch-2.10.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:3282d9febd1e4e476630a099692b44fdc214ee9bf8ee5377732d9d9dfe5712e4", size = 145992610, upload-time = "2026-01-21T16:25:26.327Z" }, { url = 
"https://files.pythonhosted.org/packages/ae/30/a3a2120621bf9c17779b169fc17e3dc29b230c29d0f8222f499f5e159aa8/torch-2.10.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:a2f9edd8dbc99f62bc4dfb78af7bf89499bca3d753423ac1b4e06592e467b763", size = 915607863, upload-time = "2026-01-21T16:25:06.696Z" }, { url = "https://files.pythonhosted.org/packages/6f/3d/c87b33c5f260a2a8ad68da7147e105f05868c281c63d65ed85aa4da98c66/torch-2.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:29b7009dba4b7a1c960260fc8ac85022c784250af43af9fb0ebafc9883782ebd", size = 113723116, upload-time = "2026-01-21T16:25:21.916Z" }, @@ -10701,10 +10710,10 @@ dependencies = [ { name = "torch", version = "2.7.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "python_full_version < '3.14' and platform_machine == 'aarch64' and platform_python_implementation == 'CPython' and sys_platform == 'linux'" }, ] wheels = [ - { url = "https://download.pytorch.org/whl/cu128/torchvision-0.22.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:6be714bcdd8849549571f6acfaa2dfa9e00676f042bda517432745fb116f7904" }, - { url = "https://download.pytorch.org/whl/cu128/torchvision-0.22.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:6e9752b48c1cdd7f6428bcd30c3d198b30ecea348d16afb651f95035e5252506" }, - { url = "https://download.pytorch.org/whl/cu128/torchvision-0.22.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:e4d4d5a14225875d9bf8c5221d43d8be97786adc498659493799bdeff52c54cf" }, - { url = "https://download.pytorch.org/whl/cu128/torchvision-0.22.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:e50ff5bbae11f57fd3af8e6f2185c136f32e8b94324613428228dd27eba6a4f6" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.22.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:6be714bcdd8849549571f6acfaa2dfa9e00676f042bda517432745fb116f7904" }, + { url = 
"https://download-r2.pytorch.org/whl/cu128/torchvision-0.22.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:6e9752b48c1cdd7f6428bcd30c3d198b30ecea348d16afb651f95035e5252506" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.22.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:e4d4d5a14225875d9bf8c5221d43d8be97786adc498659493799bdeff52c54cf" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.22.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:e50ff5bbae11f57fd3af8e6f2185c136f32e8b94324613428228dd27eba6a4f6" }, ] [[package]] @@ -10724,14 +10733,14 @@ dependencies = [ { name = "torch", version = "2.7.0+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "(python_full_version >= '3.14' and sys_platform == 'linux') or (platform_machine != 'aarch64' and sys_platform == 'linux') or (platform_python_implementation != 'CPython' and sys_platform == 'linux')" }, ] wheels = [ - { url = "https://download.pytorch.org/whl/cu128/torchvision-0.22.0%2Bcu128-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:f3ac527d58b4c2043eb8d9e29fc56cd1751f36f2aaa6dc75e34ec54c951bcb9c" }, - { url = "https://download.pytorch.org/whl/cu128/torchvision-0.22.0%2Bcu128-cp311-cp311-win_amd64.whl", hash = "sha256:f5dae1307c34813425c0b753530c035e1cc72af0bded395d1ba64dcb2872889f" }, - { url = "https://download.pytorch.org/whl/cu128/torchvision-0.22.0%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:06c101f40e1ff94869be14487c91fd5352e376f202fdeafb8f53c58cee2fbeb5" }, - { url = "https://download.pytorch.org/whl/cu128/torchvision-0.22.0%2Bcu128-cp312-cp312-win_amd64.whl", hash = "sha256:a87393c86649b7e56b4bf859fe95922ee6ec1c1f3b430246fb1a5b51f8aee37a" }, - { url = "https://download.pytorch.org/whl/cu128/torchvision-0.22.0%2Bcu128-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:ee4fa6d4052d9ae25c1233289947fbfa4b88d23710254ab1772b108c1fc5fb4d" }, - { url = 
"https://download.pytorch.org/whl/cu128/torchvision-0.22.0%2Bcu128-cp313-cp313-win_amd64.whl", hash = "sha256:17d50ffb1df6320da16b85395f1078bf369250ea144f3bb405088aca3d5f030f" }, - { url = "https://download.pytorch.org/whl/cu128/torchvision-0.22.0%2Bcu128-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:209c29d78cf2003cf4e22c9b651790f57171334998ee3125594d130526aeaa50" }, - { url = "https://download.pytorch.org/whl/cu128/torchvision-0.22.0%2Bcu128-cp313-cp313t-win_amd64.whl", hash = "sha256:03b454b867f7a0aa9861a463042141448c4f15bec784def19eed39a57fac217b" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.22.0%2Bcu128-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:f3ac527d58b4c2043eb8d9e29fc56cd1751f36f2aaa6dc75e34ec54c951bcb9c" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.22.0%2Bcu128-cp311-cp311-win_amd64.whl", hash = "sha256:f5dae1307c34813425c0b753530c035e1cc72af0bded395d1ba64dcb2872889f" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.22.0%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:06c101f40e1ff94869be14487c91fd5352e376f202fdeafb8f53c58cee2fbeb5" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.22.0%2Bcu128-cp312-cp312-win_amd64.whl", hash = "sha256:a87393c86649b7e56b4bf859fe95922ee6ec1c1f3b430246fb1a5b51f8aee37a" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.22.0%2Bcu128-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:ee4fa6d4052d9ae25c1233289947fbfa4b88d23710254ab1772b108c1fc5fb4d" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.22.0%2Bcu128-cp313-cp313-win_amd64.whl", hash = "sha256:17d50ffb1df6320da16b85395f1078bf369250ea144f3bb405088aca3d5f030f" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.22.0%2Bcu128-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:209c29d78cf2003cf4e22c9b651790f57171334998ee3125594d130526aeaa50" }, + { url = 
"https://download-r2.pytorch.org/whl/cu128/torchvision-0.22.0%2Bcu128-cp313-cp313t-win_amd64.whl", hash = "sha256:03b454b867f7a0aa9861a463042141448c4f15bec784def19eed39a57fac217b" }, ] [[package]] @@ -10750,12 +10759,12 @@ dependencies = [ { name = "torch", version = "2.9.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "(python_full_version < '3.15' and platform_machine == 'aarch64' and platform_python_implementation == 'CPython' and sys_platform == 'linux' and extra == 'extra-5-skyrl-fsdp') or (python_full_version < '3.15' and platform_machine == 'aarch64' and platform_python_implementation == 'CPython' and sys_platform == 'linux' and extra != 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-megatron') or (python_full_version >= '3.15' and extra == 'extra-5-skyrl-fsdp' and extra == 'extra-5-skyrl-jax') or (python_full_version >= '3.15' and extra == 'extra-5-skyrl-fsdp' and extra == 'extra-5-skyrl-megatron') or (python_full_version >= '3.15' and extra == 'extra-5-skyrl-fsdp' and extra == 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-miniswe') or (python_full_version >= '3.15' and extra == 'extra-5-skyrl-fsdp' and extra == 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-tpu') or (python_full_version >= '3.15' and extra == 'extra-5-skyrl-fsdp' and extra == 'extra-5-skyrl-miniswe' and extra == 'extra-5-skyrl-tpu') or (python_full_version >= '3.15' and extra != 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-jax' and extra == 'extra-5-skyrl-megatron') or (python_full_version >= '3.15' and extra != 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-megatron' and extra == 'extra-5-skyrl-miniswe') or (python_full_version >= '3.15' and extra != 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-megatron' and extra == 'extra-5-skyrl-tpu') or (platform_machine != 'aarch64' and extra == 'extra-5-skyrl-fsdp' and extra == 'extra-5-skyrl-jax') or (platform_machine != 'aarch64' and extra == 'extra-5-skyrl-fsdp' and extra == 
'extra-5-skyrl-megatron') or (platform_machine != 'aarch64' and extra == 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-megatron') or (platform_machine != 'aarch64' and extra == 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-miniswe') or (platform_machine != 'aarch64' and extra == 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-tpu') or (platform_machine != 'aarch64' and extra == 'extra-5-skyrl-jax' and extra == 'extra-5-skyrl-megatron') or (platform_machine != 'aarch64' and extra == 'extra-5-skyrl-megatron' and extra == 'extra-5-skyrl-miniswe') or (platform_machine != 'aarch64' and extra == 'extra-5-skyrl-megatron' and extra == 'extra-5-skyrl-tpu') or (platform_machine != 'aarch64' and extra == 'extra-5-skyrl-miniswe' and extra == 'extra-5-skyrl-tpu') or (platform_python_implementation != 'CPython' and extra == 'extra-5-skyrl-fsdp' and extra == 'extra-5-skyrl-jax') or (platform_python_implementation != 'CPython' and extra == 'extra-5-skyrl-fsdp' and extra == 'extra-5-skyrl-megatron') or (platform_python_implementation != 'CPython' and extra == 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-megatron') or (platform_python_implementation != 'CPython' and extra == 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-miniswe') or (platform_python_implementation != 'CPython' and extra == 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-tpu') or (platform_python_implementation != 'CPython' and extra == 'extra-5-skyrl-jax' and extra == 'extra-5-skyrl-megatron') or (platform_python_implementation != 'CPython' and extra == 'extra-5-skyrl-megatron' and extra == 'extra-5-skyrl-miniswe') or (platform_python_implementation != 'CPython' and extra == 'extra-5-skyrl-megatron' and extra == 'extra-5-skyrl-tpu') or (platform_python_implementation != 'CPython' and extra == 'extra-5-skyrl-miniswe' and extra == 'extra-5-skyrl-tpu') or (sys_platform != 'linux' and extra == 'extra-5-skyrl-fsdp' and extra == 'extra-5-skyrl-jax') or (sys_platform != 'linux' and extra == 'extra-5-skyrl-fsdp' 
and extra == 'extra-5-skyrl-megatron') or (sys_platform != 'linux' and extra == 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-megatron') or (sys_platform != 'linux' and extra == 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-miniswe') or (sys_platform != 'linux' and extra == 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-tpu') or (sys_platform != 'linux' and extra == 'extra-5-skyrl-jax' and extra == 'extra-5-skyrl-megatron') or (sys_platform != 'linux' and extra == 'extra-5-skyrl-megatron' and extra == 'extra-5-skyrl-miniswe') or (sys_platform != 'linux' and extra == 'extra-5-skyrl-megatron' and extra == 'extra-5-skyrl-tpu') or (sys_platform != 'linux' and extra == 'extra-5-skyrl-miniswe' and extra == 'extra-5-skyrl-tpu') or (extra == 'extra-5-skyrl-flashrl' and extra == 'extra-5-skyrl-fsdp') or (extra == 'extra-5-skyrl-flashrl' and extra == 'extra-5-skyrl-gpu') or (extra == 'extra-5-skyrl-flashrl' and extra == 'extra-5-skyrl-jax') or (extra == 'extra-5-skyrl-flashrl' and extra == 'extra-5-skyrl-megatron') or (extra == 'extra-5-skyrl-flashrl' and extra == 'extra-5-skyrl-miniswe') or (extra == 'extra-5-skyrl-flashrl' and extra == 'extra-5-skyrl-tpu') or (extra != 'extra-5-skyrl-fsdp' and extra == 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-megatron') or (extra != 'extra-5-skyrl-fsdp' and extra == 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-miniswe') or (extra != 'extra-5-skyrl-fsdp' and extra == 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-tpu') or (extra != 'extra-5-skyrl-fsdp' and extra != 'extra-5-skyrl-megatron' and extra == 'extra-5-skyrl-miniswe' and extra == 'extra-5-skyrl-tpu')" }, ] wheels = [ - { url = "https://download.pytorch.org/whl/cu128/torchvision-0.24.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:61a2d857a2be441cbd5a80b807b70b5d3b580c95166b3a19d4e433e7a85aeb76" }, - { url = "https://download.pytorch.org/whl/cu128/torchvision-0.24.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = 
"sha256:bd33a7cc32122bc92919f95ea0e7bf73588e71be0ca2c5cad8fb7eebd333e8dd" }, - { url = "https://download.pytorch.org/whl/cu128/torchvision-0.24.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:7695d95e4e4c25fe1af3b880ffcd2dbcaa43cce7fd7edbe0157305b837c1dcf8" }, - { url = "https://download.pytorch.org/whl/cu128/torchvision-0.24.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:9db0306f8eec7dc11745044c78dc49a80b84cc0935e36575677cdc2bce9be23c" }, - { url = "https://download.pytorch.org/whl/cu128/torchvision-0.24.1-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:68c8c884e7730146b7915d863526e8f32194532629ecc64da865242d35f417c0" }, - { url = "https://download.pytorch.org/whl/cu128/torchvision-0.24.1-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:932dcfe6718f1306b6844477939d18c9102e678cdaffc13da9c3a1841d57ddde" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.24.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:61a2d857a2be441cbd5a80b807b70b5d3b580c95166b3a19d4e433e7a85aeb76" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.24.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:bd33a7cc32122bc92919f95ea0e7bf73588e71be0ca2c5cad8fb7eebd333e8dd" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.24.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:7695d95e4e4c25fe1af3b880ffcd2dbcaa43cce7fd7edbe0157305b837c1dcf8" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.24.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:9db0306f8eec7dc11745044c78dc49a80b84cc0935e36575677cdc2bce9be23c" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.24.1-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:68c8c884e7730146b7915d863526e8f32194532629ecc64da865242d35f417c0" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.24.1-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = 
"sha256:932dcfe6718f1306b6844477939d18c9102e678cdaffc13da9c3a1841d57ddde" }, ] [[package]] @@ -10809,18 +10818,18 @@ dependencies = [ { name = "torch", version = "2.9.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "(python_full_version < '3.15' and platform_machine == 'aarch64' and platform_python_implementation == 'CPython' and extra == 'extra-5-skyrl-fsdp' and extra == 'extra-5-skyrl-jax') or (python_full_version < '3.15' and platform_machine == 'aarch64' and platform_python_implementation == 'CPython' and extra == 'extra-5-skyrl-fsdp' and extra == 'extra-5-skyrl-megatron') or (python_full_version < '3.15' and platform_machine == 'aarch64' and platform_python_implementation == 'CPython' and extra == 'extra-5-skyrl-fsdp' and extra == 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-miniswe') or (python_full_version < '3.15' and platform_machine == 'aarch64' and platform_python_implementation == 'CPython' and extra == 'extra-5-skyrl-fsdp' and extra == 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-tpu') or (python_full_version < '3.15' and platform_machine == 'aarch64' and platform_python_implementation == 'CPython' and extra == 'extra-5-skyrl-fsdp' and extra == 'extra-5-skyrl-miniswe' and extra == 'extra-5-skyrl-tpu') or (python_full_version < '3.15' and platform_machine == 'aarch64' and platform_python_implementation == 'CPython' and extra != 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-jax' and extra == 'extra-5-skyrl-megatron') or (python_full_version < '3.15' and platform_machine == 'aarch64' and platform_python_implementation == 'CPython' and extra != 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-megatron' and extra == 'extra-5-skyrl-miniswe') or (python_full_version < '3.15' and platform_machine == 'aarch64' and platform_python_implementation == 'CPython' and extra != 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-megatron' and extra == 'extra-5-skyrl-tpu') or (python_full_version >= '3.15' and sys_platform == 
'linux' and extra == 'extra-5-skyrl-fsdp') or (python_full_version >= '3.15' and sys_platform == 'linux' and extra != 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-megatron') or (platform_machine != 'aarch64' and sys_platform == 'linux' and extra == 'extra-5-skyrl-fsdp') or (platform_machine != 'aarch64' and sys_platform == 'linux' and extra != 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-megatron') or (platform_python_implementation != 'CPython' and sys_platform == 'linux' and extra == 'extra-5-skyrl-fsdp') or (platform_python_implementation != 'CPython' and sys_platform == 'linux' and extra != 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-megatron') or (sys_platform != 'linux' and extra == 'extra-5-skyrl-fsdp' and extra == 'extra-5-skyrl-jax') or (sys_platform != 'linux' and extra == 'extra-5-skyrl-fsdp' and extra == 'extra-5-skyrl-megatron') or (sys_platform != 'linux' and extra == 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-megatron') or (sys_platform != 'linux' and extra == 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-miniswe') or (sys_platform != 'linux' and extra == 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-tpu') or (sys_platform != 'linux' and extra == 'extra-5-skyrl-jax' and extra == 'extra-5-skyrl-megatron') or (sys_platform != 'linux' and extra == 'extra-5-skyrl-megatron' and extra == 'extra-5-skyrl-miniswe') or (sys_platform != 'linux' and extra == 'extra-5-skyrl-megatron' and extra == 'extra-5-skyrl-tpu') or (sys_platform != 'linux' and extra == 'extra-5-skyrl-miniswe' and extra == 'extra-5-skyrl-tpu') or (extra == 'extra-5-skyrl-flashrl' and extra == 'extra-5-skyrl-fsdp') or (extra == 'extra-5-skyrl-flashrl' and extra == 'extra-5-skyrl-gpu') or (extra == 'extra-5-skyrl-flashrl' and extra == 'extra-5-skyrl-jax') or (extra == 'extra-5-skyrl-flashrl' and extra == 'extra-5-skyrl-megatron') or (extra == 'extra-5-skyrl-flashrl' and extra == 'extra-5-skyrl-miniswe') or (extra == 'extra-5-skyrl-flashrl' and extra == 'extra-5-skyrl-tpu') or 
(extra != 'extra-5-skyrl-fsdp' and extra == 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-megatron') or (extra != 'extra-5-skyrl-fsdp' and extra == 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-miniswe') or (extra != 'extra-5-skyrl-fsdp' and extra == 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-tpu') or (extra != 'extra-5-skyrl-fsdp' and extra != 'extra-5-skyrl-megatron' and extra == 'extra-5-skyrl-miniswe' and extra == 'extra-5-skyrl-tpu')" }, ] wheels = [ - { url = "https://download.pytorch.org/whl/cu128/torchvision-0.24.1%2Bcu128-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:1b44f67cbd8f36e2a58bfaa3176d35b37df55604adf5929e89006e531f849faa" }, - { url = "https://download.pytorch.org/whl/cu128/torchvision-0.24.1%2Bcu128-cp311-cp311-win_amd64.whl", hash = "sha256:6d836745bd3130ef8f3569c9f0d9d70103b5e2e9fa058310bcac5f63bcf2d043" }, - { url = "https://download.pytorch.org/whl/cu128/torchvision-0.24.1%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:cf84eae1d2d12a7d261a7496eca00dd927b71792011b1e84d4162c950eb3201d" }, - { url = "https://download.pytorch.org/whl/cu128/torchvision-0.24.1%2Bcu128-cp312-cp312-win_amd64.whl", hash = "sha256:33ecea57afa1daeedfed443a8a0cb8e4b0b403fdf18c2a328ba6f9069d403384" }, - { url = "https://download.pytorch.org/whl/cu128/torchvision-0.24.1%2Bcu128-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:5f7c5e0fa08d2cbee93b6e04bbedd59b5e11462cff6cefd07949217265df2370" }, - { url = "https://download.pytorch.org/whl/cu128/torchvision-0.24.1%2Bcu128-cp313-cp313-win_amd64.whl", hash = "sha256:c38b0ece839de439de81ed0e81e915c200975972c0b9419608fa9568aa74ecec" }, - { url = "https://download.pytorch.org/whl/cu128/torchvision-0.24.1%2Bcu128-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:5ae2dc0f582215b078d7fd52410fe51f79b801770c53e7cfb8ad04316283017d" }, - { url = "https://download.pytorch.org/whl/cu128/torchvision-0.24.1%2Bcu128-cp313-cp313t-win_amd64.whl", hash = 
"sha256:b50d48f4074039e6067230f123f55404014b849d7c4fe1dac3a1924ea02bbd78" }, - { url = "https://download.pytorch.org/whl/cu128/torchvision-0.24.1%2Bcu128-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:3b72e32377e5e91398ddc4579c77784b269652a5795f4b20a5a1d4c80e9bd3dd" }, - { url = "https://download.pytorch.org/whl/cu128/torchvision-0.24.1%2Bcu128-cp314-cp314-win_amd64.whl", hash = "sha256:b5528b460d65c64e87301e942f6450d0ae958d919386e01fa682ba5eb77e5c9d" }, - { url = "https://download.pytorch.org/whl/cu128/torchvision-0.24.1%2Bcu128-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:45792b58c2a9761da4e1d9d12c4bf5140b6250ef9210f42f716f284cff5566ea" }, - { url = "https://download.pytorch.org/whl/cu128/torchvision-0.24.1%2Bcu128-cp314-cp314t-win_amd64.whl", hash = "sha256:c129e153561be8992c998f87d099ff74203ac19f8b2aadeb8edfbfd30036f81c" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.24.1%2Bcu128-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:1b44f67cbd8f36e2a58bfaa3176d35b37df55604adf5929e89006e531f849faa" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.24.1%2Bcu128-cp311-cp311-win_amd64.whl", hash = "sha256:6d836745bd3130ef8f3569c9f0d9d70103b5e2e9fa058310bcac5f63bcf2d043" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.24.1%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:cf84eae1d2d12a7d261a7496eca00dd927b71792011b1e84d4162c950eb3201d" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.24.1%2Bcu128-cp312-cp312-win_amd64.whl", hash = "sha256:33ecea57afa1daeedfed443a8a0cb8e4b0b403fdf18c2a328ba6f9069d403384" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.24.1%2Bcu128-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:5f7c5e0fa08d2cbee93b6e04bbedd59b5e11462cff6cefd07949217265df2370" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.24.1%2Bcu128-cp313-cp313-win_amd64.whl", hash = 
"sha256:c38b0ece839de439de81ed0e81e915c200975972c0b9419608fa9568aa74ecec" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.24.1%2Bcu128-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:5ae2dc0f582215b078d7fd52410fe51f79b801770c53e7cfb8ad04316283017d" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.24.1%2Bcu128-cp313-cp313t-win_amd64.whl", hash = "sha256:b50d48f4074039e6067230f123f55404014b849d7c4fe1dac3a1924ea02bbd78" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.24.1%2Bcu128-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:3b72e32377e5e91398ddc4579c77784b269652a5795f4b20a5a1d4c80e9bd3dd" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.24.1%2Bcu128-cp314-cp314-win_amd64.whl", hash = "sha256:b5528b460d65c64e87301e942f6450d0ae958d919386e01fa682ba5eb77e5c9d" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.24.1%2Bcu128-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:45792b58c2a9761da4e1d9d12c4bf5140b6250ef9210f42f716f284cff5566ea" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torchvision-0.24.1%2Bcu128-cp314-cp314t-win_amd64.whl", hash = "sha256:c129e153561be8992c998f87d099ff74203ac19f8b2aadeb8edfbfd30036f81c" }, ] [[package]] @@ -10879,24 +10888,23 @@ sdist = { url = "https://files.pythonhosted.org/packages/18/94/609a7772569d3acdb [[package]] name = "transformers" -version = "4.57.1" +version = "5.3.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "filelock" }, { name = "huggingface-hub" }, { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(sys_platform == 'linux' and extra == 'extra-5-skyrl-fsdp') or (sys_platform == 'linux' and extra == 'extra-5-skyrl-megatron') or (extra == 'extra-5-skyrl-fsdp' and extra == 'extra-5-skyrl-jax') or (extra == 'extra-5-skyrl-fsdp' and extra == 'extra-5-skyrl-megatron') or (extra == 'extra-5-skyrl-jax' and extra == 
'extra-5-skyrl-megatron') or (extra == 'extra-5-skyrl-megatron' and extra == 'extra-5-skyrl-miniswe') or (extra == 'extra-5-skyrl-megatron' and extra == 'extra-5-skyrl-tpu') or (extra == 'extra-5-skyrl-flashrl' and extra == 'extra-5-skyrl-fsdp') or (extra == 'extra-5-skyrl-flashrl' and extra == 'extra-5-skyrl-gpu') or (extra == 'extra-5-skyrl-flashrl' and extra == 'extra-5-skyrl-jax') or (extra == 'extra-5-skyrl-flashrl' and extra == 'extra-5-skyrl-megatron') or (extra == 'extra-5-skyrl-flashrl' and extra == 'extra-5-skyrl-miniswe') or (extra == 'extra-5-skyrl-flashrl' and extra == 'extra-5-skyrl-tpu') or (extra == 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-megatron') or (extra == 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-miniswe') or (extra == 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-tpu') or (extra == 'extra-5-skyrl-miniswe' and extra == 'extra-5-skyrl-tpu')" }, { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform != 'linux' or extra == 'extra-5-skyrl-flashrl' or (extra == 'extra-5-skyrl-fsdp' and extra == 'extra-5-skyrl-jax') or (extra == 'extra-5-skyrl-fsdp' and extra == 'extra-5-skyrl-megatron') or (extra != 'extra-5-skyrl-fsdp' and extra == 'extra-5-skyrl-gpu') or (extra != 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-jax') or (extra == 'extra-5-skyrl-fsdp' and extra == 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-miniswe') or (extra == 'extra-5-skyrl-fsdp' and extra == 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-tpu') or (extra == 'extra-5-skyrl-fsdp' and extra == 'extra-5-skyrl-miniswe' and extra == 'extra-5-skyrl-tpu') or (extra != 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-megatron' and extra == 'extra-5-skyrl-miniswe') or (extra != 'extra-5-skyrl-gpu' and extra == 'extra-5-skyrl-megatron' and extra == 'extra-5-skyrl-tpu') or (extra != 'extra-5-skyrl-fsdp' and extra != 'extra-5-skyrl-jax' and extra != 'extra-5-skyrl-megatron')" }, { name = "packaging" }, { name = 
"pyyaml" }, { name = "regex" }, - { name = "requests" }, { name = "safetensors" }, { name = "tokenizers" }, { name = "tqdm" }, + { name = "typer" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/d6/68/a39307bcc4116a30b2106f2e689130a48de8bd8a1e635b5e1030e46fcd9e/transformers-4.57.1.tar.gz", hash = "sha256:f06c837959196c75039809636cd964b959f6604b75b8eeec6fdfc0440b89cc55", size = 10142511, upload-time = "2025-10-14T15:39:26.18Z" } +sdist = { url = "https://files.pythonhosted.org/packages/fc/1a/70e830d53ecc96ce69cfa8de38f163712d2b43ac52fbd743f39f56025c31/transformers-5.3.0.tar.gz", hash = "sha256:009555b364029da9e2946d41f1c5de9f15e6b1df46b189b7293f33a161b9c557", size = 8830831, upload-time = "2026-03-04T17:41:46.119Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/71/d3/c16c3b3cf7655a67db1144da94b021c200ac1303f82428f2beef6c2e72bb/transformers-4.57.1-py3-none-any.whl", hash = "sha256:b10d05da8fa67dc41644dbbf9bc45a44cb86ae33da6f9295f5fbf5b7890bd267", size = 11990925, upload-time = "2025-10-14T15:39:23.085Z" }, + { url = "https://files.pythonhosted.org/packages/b8/88/ae8320064e32679a5429a2c9ebbc05c2bf32cefb6e076f9b07f6d685a9b4/transformers-5.3.0-py3-none-any.whl", hash = "sha256:50ac8c89c3c7033444fb3f9f53138096b997ebb70d4b5e50a2e810bf12d3d29a", size = 10661827, upload-time = "2026-03-04T17:41:42.722Z" }, ] [[package]]