-
Notifications
You must be signed in to change notification settings - Fork 285
Open
Labels
Description
Summary
Distributed checkpointing with Megatron is broken again after upgrading to megatron-core 0.16.0.
Traceback (most recent call last):
File "<frozen runpy>", line 198, in _run_module_as_main
File "<frozen runpy>", line 88, in _run_code
File "/home/ray/default/SkyRL/skyrl/train/entrypoints/main_base.py", line 466, in <module>
main()
File "/home/ray/default/SkyRL/skyrl/train/entrypoints/main_base.py", line 462, in main
ray.get(skyrl_entrypoint.remote(cfg))
File "/home/ray/.cache/uv/builds-v0/.tmpgTA40j/lib/python3.12/site-packages/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper
return fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^
File "/home/ray/.cache/uv/builds-v0/.tmpgTA40j/lib/python3.12/site-packages/ray/_private/client_mode_hook.py", line 104, in wrapper
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/ray/.cache/uv/builds-v0/.tmpgTA40j/lib/python3.12/site-packages/ray/_private/worker.py", line 2961, in get
values, debugger_breakpoint = worker.get_objects(
^^^^^^^^^^^^^^^^^^^
File "/home/ray/.cache/uv/builds-v0/.tmpgTA40j/lib/python3.12/site-packages/ray/_private/worker.py", line 1026, in get_objects
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(OSError): ray::skyrl_entrypoint() (pid=100139, ip=172.25.105.18)
File "/home/ray/default/SkyRL/skyrl/train/entrypoints/main_base.py", line 451, in skyrl_entrypoint
File "/home/ray/default/SkyRL/skyrl/train/entrypoints/main_base.py", line 444, in run
File "/home/ray/anaconda3/lib/python3.12/asyncio/runners.py", line 195, in run
return runner.run(main)
^^^^^^^^^^^^^^^^
File "/home/ray/anaconda3/lib/python3.12/asyncio/runners.py", line 118, in run
return self._loop.run_until_complete(task)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
File "/tmp/ray/session_2026-03-23_04-19-31_463883_2941/runtime_resources/working_dir_files/_ray_pkg_2121ff5350732964/skyrl/train/trainer.py", line 297, in train
self.save_checkpoints()
File "/tmp/ray/session_2026-03-23_04-19-31_463883_2941/runtime_resources/working_dir_files/_ray_pkg_2121ff5350732964/skyrl/train/trainer.py", line 1225, in save_checkpoints
self.dispatch.save_checkpoint("policy", policy_save_dir, self.tokenizer)
File "/tmp/ray/session_2026-03-23_04-19-31_463883_2941/runtime_resources/working_dir_files/_ray_pkg_2121ff5350732964/skyrl/backends/skyrl_train/workers/worker_dispatch.py", line 301, in save_checkpoint
ray.get(
^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^
ray.exceptions.RayTaskError(OSError): ray::MegatronPolicyWorkerBase.save_checkpoint() (pid=108782, ip=172.25.105.18, actor_id=ca5fc047ee2ab5c53354937304000000, repr=<skyrl.backends.skyrl_train.workers.megatron.megatron_worker.MegatronPolicyWorkerBase object at 0x7242648edd00>)
File "/home/ray/anaconda3/lib/python3.12/concurrent/futures/_base.py", line 456, in result
return self.__get_result()
^^^^^^^^^^^^^^^^^^^
File "/home/ray/anaconda3/lib/python3.12/concurrent/futures/_base.py", line 401, in __get_result
raise self._exception
^^^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/tmp/ray/session_2026-03-23_04-19-31_463883_2941/runtime_resources/working_dir_files/_ray_pkg_2121ff5350732964/skyrl/backends/skyrl_train/workers/worker.py", line 991, in save_checkpoint
self.strategy.save_checkpoint(
File "/tmp/ray/session_2026-03-23_04-19-31_463883_2941/runtime_resources/working_dir_files/_ray_pkg_2121ff5350732964/skyrl/backends/skyrl_train/distributed/megatron/megatron_strategy.py", line 218, in save_checkpoint
async_save_request = dist_checkpointing.save(
^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ray/.cache/uv/builds-v0/.tmpk3I241/lib/python3.12/site-packages/megatron/core/dist_checkpointing/serialization.py", line 429, in save
sharded_strategy.save(sharded_state_dict, checkpoint_dir)
File "/home/ray/.cache/uv/builds-v0/.tmpk3I241/lib/python3.12/site-packages/megatron/core/dist_checkpointing/strategies/fully_parallel.py", line 98, in save
return self.base_strategy.save(sharded_state_dict, checkpoint_dir)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ray/.cache/uv/builds-v0/.tmpk3I241/lib/python3.12/site-packages/megatron/core/dist_checkpointing/strategies/base.py", line 215, in save
async_request.execute_sync()
File "/home/ray/.cache/uv/builds-v0/.tmpk3I241/lib/python3.12/site-packages/megatron/core/dist_checkpointing/strategies/async_utils.py", line 96, in execute_sync
self.async_fn(*async_fn_args, **self.async_fn_kwargs)
File "/home/ray/anaconda3/lib/python3.12/contextlib.py", line 81, in inner
return func(*args, **kwds)
^^^^^^^^^^^^^^^^^^^
File "/home/ray/.cache/uv/builds-v0/.tmpk3I241/lib/python3.12/site-packages/megatron/core/dist_checkpointing/strategies/filesystem_async.py", line 303, in write_preloaded_data_multiproc
p.start()
File "/home/ray/anaconda3/lib/python3.12/multiprocessing/process.py", line 121, in start
self._popen = self._Popen(self)
^^^^^^^^^^^^^^^^^
File "/home/ray/anaconda3/lib/python3.12/multiprocessing/context.py", line 282, in _Popen
return Popen(process_obj)
^^^^^^^^^^^^^^^^^^
File "/home/ray/anaconda3/lib/python3.12/multiprocessing/popen_fork.py", line 19, in __init__
self._launch(process_obj)
File "/home/ray/anaconda3/lib/python3.12/multiprocessing/popen_fork.py", line 66, in _launch
self.pid = os.fork()
^^^^^^^^^
OSError: [Errno 12] Cannot allocate memory
(skyrl_entrypoint pid=100139, ip=172.25.105.18) 2026-03-23 16:43:33.793 | INFO | skyrl.train.trainer:train:296 - Finished: 'save_checkpoints', time cost: 2.83s
(skyrl_entrypoint pid=100139, ip=172.25.105.18) 2026-03-23 16:43:33.794 | INFO | skyrl.train.trainer:train:210 - Finished: 'step', time cost: 37.55s

This is the same issue we encountered with megatron-core 0.13.0: Ray and fork() interact badly with the default checkpointing method in Megatron dist-checkpointing. The previous fix, #357, no longer works.
Reactions are currently unavailable