From 689d94afa5a249b00bb66905af025721af028011 Mon Sep 17 00:00:00 2001 From: mishaschwartz <4380924+mishaschwartz@users.noreply.github.com> Date: Thu, 5 Feb 2026 09:50:29 -0500 Subject: [PATCH 1/9] tmp --- .../nvidia-multi-process-service/default.env | 9 +++++++ .../docker-compose-extra.yml | 24 +++++++++++++++++++ .../pre-docker-compose-up.include | 3 +++ 3 files changed, 36 insertions(+) create mode 100644 birdhouse/optional-components/nvidia-multi-process-service/default.env create mode 100644 birdhouse/optional-components/nvidia-multi-process-service/docker-compose-extra.yml create mode 100644 birdhouse/optional-components/nvidia-multi-process-service/pre-docker-compose-up.include diff --git a/birdhouse/optional-components/nvidia-multi-process-service/default.env b/birdhouse/optional-components/nvidia-multi-process-service/default.env new file mode 100644 index 000000000..3416fc3e6 --- /dev/null +++ b/birdhouse/optional-components/nvidia-multi-process-service/default.env @@ -0,0 +1,9 @@ +export NVIDIA_MULTIPROCESS_SERVICE_DOCKER=debian +export NVIDIA_MULTIPROCESS_SERVICE_VERSION=bookworm-slim +export NVIDIA_MULTIPROCESS_SERVICE_IMAGE='${NVIDIA_MULTIPROCESS_SERVICE_DOCKER}:${NVIDIA_MULTIPROCESS_SERVICE_VERSION}' + +export DELAYED_EVAL=" + $DELAYED_EVAL + NVIDIA_MULTIPROCESS_SERVICE_IMAGE +" + \ No newline at end of file diff --git a/birdhouse/optional-components/nvidia-multi-process-service/docker-compose-extra.yml b/birdhouse/optional-components/nvidia-multi-process-service/docker-compose-extra.yml new file mode 100644 index 000000000..7a37598ba --- /dev/null +++ b/birdhouse/optional-components/nvidia-multi-process-service/docker-compose-extra.yml @@ -0,0 +1,24 @@ +services: + mps: + image: ${NVIDIA_MULTIPROCESS_SERVICE_IMAGE} + container_name: mps + restart: always + ipc: shareable + volumes: + - nvidia_mps:/tmp/nvidia-mps + init: true + command: ["nvidia-cuda-mps-control", "-f"] + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all 
+ capabilities: [gpu] + +volumes: + nvidia_mps: + driver: local + driver_opts: + type: tmpfs + device: tmpfs diff --git a/birdhouse/optional-components/nvidia-multi-process-service/pre-docker-compose-up.include b/birdhouse/optional-components/nvidia-multi-process-service/pre-docker-compose-up.include new file mode 100644 index 000000000..f0e57b6cd --- /dev/null +++ b/birdhouse/optional-components/nvidia-multi-process-service/pre-docker-compose-up.include @@ -0,0 +1,3 @@ +if [ "$(nvidia-smi --query-gpu=compute_mode --format=csv,noheader | grep -vc 'Exclusive_Process')" -ne 0 ]; then + log WARN "Nvidia GPUs with compute mode set to something other than EXCLUSIVE_PROCESS detected. We recommend you set the compute mode to EXCLUSIVE_PROCESS when enabling nvidia's Multi Process Service (MPS)." +fi From 8a8d64be429f2c7e7ee34f0c8ec0fdc1bfd5196f Mon Sep 17 00:00:00 2001 From: mishaschwartz <4380924+mishaschwartz@users.noreply.github.com> Date: Mon, 9 Feb 2026 21:31:40 -0500 Subject: [PATCH 2/9] add limits --- birdhouse/components/jupyterhub/default.env | 6 +++ .../jupyterhub/jupyterhub_config.py.template | 2 + .../jupyterhub_custom/custom_dockerspawner.py | 42 +++++++++++++++---- .../nvidia-multi-process-service/default.env | 37 +++++++++++++++- tests/unit/test_jupyterhub_custom.py | 22 ++++++++++ 5 files changed, 100 insertions(+), 9 deletions(-) diff --git a/birdhouse/components/jupyterhub/default.env b/birdhouse/components/jupyterhub/default.env index d7b2fe6f3..b24637de5 100644 --- a/birdhouse/components/jupyterhub/default.env +++ b/birdhouse/components/jupyterhub/default.env @@ -55,6 +55,11 @@ export JUPYTER_IDLE_KERNEL_CULL_INTERVAL=0 # config/jupyterhub/jupyterhub_config.py.template. export JUPYTERHUB_CONFIG_OVERRIDE="" +# Allows adding new configuration or overriding existing configurations +# This variable should only be set by other components not directly by the user. +# Users should set JUPYTERHUB_CONFIG_OVERRIDE instead. 
+export JUPYTERHUB_CONFIG_OVERRIDE_INTERNAL="" + # URL used to verify that a logged in user has permission to access Jupyterhub # To disable this feature, unset this variable. However, disabling this feature is NOT # recommended as it may permit unauthorized users from accessing jupyterhub. @@ -104,6 +109,7 @@ OPTIONAL_VARS=" \$JUPYTER_LOGIN_BANNER_BOTTOM_SECTION \$JUPYTER_LOGIN_TERMS_URL \$JUPYTERHUB_CONFIG_OVERRIDE + \$JUPYTERHUB_CONFIG_OVERRIDE_INTERNAL \$JUPYTERHUB_DOCKER \$JUPYTERHUB_VERSION \$JUPYTERHUB_IMAGE diff --git a/birdhouse/components/jupyterhub/jupyterhub_config.py.template b/birdhouse/components/jupyterhub/jupyterhub_config.py.template index 3e60d7ab9..20d404bc2 100644 --- a/birdhouse/components/jupyterhub/jupyterhub_config.py.template +++ b/birdhouse/components/jupyterhub/jupyterhub_config.py.template @@ -104,4 +104,6 @@ if """${JUPYTERHUB_ADMIN_USERS}""": # Configuration overrides # ------------------------------------------------------------------------------ +${JUPYTERHUB_CONFIG_OVERRIDE_INTERNAL} # noqa + ${JUPYTERHUB_CONFIG_OVERRIDE} # noqa diff --git a/birdhouse/components/jupyterhub/jupyterhub_custom/jupyterhub_custom/custom_dockerspawner.py b/birdhouse/components/jupyterhub/jupyterhub_custom/jupyterhub_custom/custom_dockerspawner.py index f79e25faf..9d8789f69 100644 --- a/birdhouse/components/jupyterhub/jupyterhub_custom/jupyterhub_custom/custom_dockerspawner.py +++ b/birdhouse/components/jupyterhub/jupyterhub_custom/jupyterhub_custom/custom_dockerspawner.py @@ -5,7 +5,7 @@ import docker from dockerspawner import DockerSpawner -from traitlets import default +from traitlets import Callable, Dict, List, Unicode, default from . 
import constants @@ -227,6 +227,35 @@ def _default_start_timeout(self) -> int: """Timeout (in seconds) before giving up on starting of single-user server.""" return 120 + resource_limit_callbacks = Dict( + value_trait=Callable(), + key_trait=Unicode(), + config=True, + help=( + "Dictionary mapping limit names to a callable that takes two arguments: " + "the spawner instance and the value for that limit. These can be used to " + "add additional resource limits that are enforced by optional components." + ), + ) + + pre_spawn_hooks = List( + Callable(), + config=True, + help=( + "List of pre spawn hooks to run as well as the pre_spawn_hook function. " + "This is intended to be set by internal tools, users should set the " + "pre_spawn_hook directly." + ), + ) + + @default("pre_spawn_hooks") + def _default_pre_spawn_hooks(self) -> list: + return [ + CustomDockerSpawner.__create_dir_hook, + CustomDockerSpawner.__limit_resource_hook, + CustomDockerSpawner.__create_tutorial_notebook_hook, + ] + @property def escaped_name(self) -> str: """ @@ -321,6 +350,8 @@ def __limit_resource_hook(self) -> None: gpu_ids = value elif limit == "gpu_count": gpu_count = value + elif limit in self.resource_limit_callbacks: + self.resource_limit_callbacks[limit](self, value) if gpu_ids: # randomly assign GPUs in an attempt to evenly distribute GPU resources random.shuffle(gpu_ids) @@ -331,12 +362,7 @@ def __limit_resource_hook(self) -> None: def run_pre_spawn_hook(self) -> None: """Run the builtin pre-spawn hooks as well as any set by pre_spawn_hook if defined.""" - self._custom_pre_spawn_hook() + for hook in self.pre_spawn_hooks: + hook(self) if self.pre_spawn_hook: self.pre_spawn_hook(self) - - def _custom_pre_spawn_hook(self) -> None: - """Run before spawning a singleuser jupyterlab server.""" - self.__create_dir_hook() - self.__limit_resource_hook() - self.__create_tutorial_notebook_hook() diff --git a/birdhouse/optional-components/nvidia-multi-process-service/default.env 
b/birdhouse/optional-components/nvidia-multi-process-service/default.env index 3416fc3e6..56f9427a4 100644 --- a/birdhouse/optional-components/nvidia-multi-process-service/default.env +++ b/birdhouse/optional-components/nvidia-multi-process-service/default.env @@ -6,4 +6,39 @@ export DELAYED_EVAL=" $DELAYED_EVAL NVIDIA_MULTIPROCESS_SERVICE_IMAGE " - \ No newline at end of file + +export JUPYTERHUB_CONFIG_OVERRIDE_INTERNAL=" +${JUPYTERHUB_CONFIG_OVERRIDE_INTERNAL} + +def _gpu_device_mem_limit(spawner, value): + ''' + Set memory limits for GPUs allocated to this user. + + See: https://docs.nvidia.com/deploy/mps/appendix-tools-and-interface-reference.html#cuda-mps-pinned-device-mem-limit + ''' + spawner.environment['CUDA_MPS_PINNED_DEVICE_MEM_LIMIT'] = value + +def _gpu_active_thread_percentage(spawner, value): + ''' + Set active thread percentage for GPUs allocated to this user + + See: https://docs.nvidia.com/deploy/mps/appendix-tools-and-interface-reference.html#cuda-mps-active-thread-percentage + ''' + spawner.environment['CUDA_MPS_ACTIVE_THREAD_PERCENTAGE'] = value + +c.CustomDockerSpawner.resource_limit_callbacks.update({ + 'gpu_device_mem_limit': _gpu_device_mem_limit, + 'gpu_active_thread_percentage': _gpu_active_thread_percentage, +}) + +def _gpu_set_mps_configs(spawner): + ''' + Set configurations so this container uses the multi-process service running in the container named mps + + See: https://gitlab.com/nvidia/container-images/samples/-/blob/master/mps/docker-compose.yml + ''' + spawner.extra_host_config['ipc_mode'] = 'container:mps' + spawner.volumes['nvidia_mps'] = '/tmp/nvidia-mps' + +c.CustomDockerSpawner.pre_spawn_hooks.append(_gpu_set_mps_configs) +" diff --git a/tests/unit/test_jupyterhub_custom.py b/tests/unit/test_jupyterhub_custom.py index 80d251bc9..dec9d5be1 100644 --- a/tests/unit/test_jupyterhub_custom.py +++ b/tests/unit/test_jupyterhub_custom.py @@ -598,6 +598,28 @@ def test_user_name_matches_gpu_ids_with_count(self, spawner, constants, 
generate assert len(device_ids) == 2 assert set(device_ids) < {1, 2, 3} + def test_additional_resource_limits(self, spawner, constants, generate_spawner_inst): + mock = Mock() + spawner_inst = generate_spawner_inst(spawner) + spawner_inst.resource_limit_callbacks["test_limit"] = mock + constants.RESOURCE_LIMITS = [ + { + "type": "user", + "name": spawner_inst.user.name, + "limits": {"test_limit": 22}, + } + ] + spawner_inst.run_pre_spawn_hook() + assert mock.call_args == ((spawner_inst, 22),) + + class TestAdditionalPreSpawnHooks: + + def test_custom_pre_spawn_hook(self, spawner, generate_spawner_inst): + mock = Mock() + spawner_inst = generate_spawner_inst(spawner) + spawner_inst.pre_spawn_hooks.append(mock) + spawner_inst.run_pre_spawn_hook() + assert mock.call_args == ((spawner_inst,),) # @pytest.mark.asyncio class TestMagpieAuthenticator: From ae177fb6684112d645b2173e9c5df69150a0623f Mon Sep 17 00:00:00 2001 From: mishaschwartz <4380924+mishaschwartz@users.noreply.github.com> Date: Tue, 10 Feb 2026 10:50:59 -0500 Subject: [PATCH 3/9] set ids and environment variables as strings --- .../jupyterhub_custom/custom_dockerspawner.py | 2 +- birdhouse/env.local.example | 4 ++-- .../nvidia-multi-process-service/default.env | 2 +- tests/unit/test_jupyterhub_custom.py | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/birdhouse/components/jupyterhub/jupyterhub_custom/jupyterhub_custom/custom_dockerspawner.py b/birdhouse/components/jupyterhub/jupyterhub_custom/jupyterhub_custom/custom_dockerspawner.py index 9d8789f69..dd1e76b16 100644 --- a/birdhouse/components/jupyterhub/jupyterhub_custom/jupyterhub_custom/custom_dockerspawner.py +++ b/birdhouse/components/jupyterhub/jupyterhub_custom/jupyterhub_custom/custom_dockerspawner.py @@ -357,7 +357,7 @@ def __limit_resource_hook(self) -> None: random.shuffle(gpu_ids) gpu_ids = gpu_ids[:gpu_count] self.extra_host_config["device_requests"] = [ - docker.types.DeviceRequest(device_ids=gpu_ids, 
capabilities=[["gpu"]]) + docker.types.DeviceRequest(device_ids=[str(i) for i in gpu_ids], capabilities=[["gpu"]]) ] def run_pre_spawn_hook(self) -> None: diff --git a/birdhouse/env.local.example b/birdhouse/env.local.example index ce993894e..3c3b2f536 100644 --- a/birdhouse/env.local.example +++ b/birdhouse/env.local.example @@ -462,7 +462,7 @@ export GEOSERVER_ADMIN_PASSWORD="${__DEFAULT__GEOSERVER_ADMIN_PASSWORD}" # {"type": "user", "name": "user1", "limits": {"mem_limit": "30G"}}, # {"type": "group", "name": "group1", "limits": {"mem_limit": "10G", "cpu_limit": 1}}, # {"type": "group", "name": "group2", "limits": {"cpu_limit": 3, "gpu_ids": [0, 3, 4]}}, -# {"type": "user", "name": "user2", "limits": {"gpu_ids": [1, 2, 3], "gpu_count": 2}} +# {"type": "user", "name": "user2", "limits": {"gpu_ids": ["1", "2", "3"], "gpu_count": 2}} # ] #' # Supported limits are: "mem_limit", "cpu_limit", "gpu_count", "gpu_ids". @@ -478,7 +478,7 @@ export GEOSERVER_ADMIN_PASSWORD="${__DEFAULT__GEOSERVER_ADMIN_PASSWORD}" # is possible but discouraged since it makes it possible to select the same GPU multiple times. # If gpu_count is also specified, this is an integer indicating how many GPUs to make available to that user or group. # If gpu_count is not specified, then exactly one GPU will be randomly selected. -# For example, if {"gpu_ids": [1,2,6], "gpu_count": 2} then two GPUs will be randomly selected from the gpu_ids list. +# For example, if {"gpu_ids": ["1","2","6"], "gpu_count": 2} then two GPUs will be randomly selected from the gpu_ids list. # Note that this will not create the groups in Magpie, that must be done manually. # Note that if a user belongs to multiple groups, later values in `JUPYTERHUB_RESOURCE_LIMITS` will take # precedence. 
For example, if a user named user1 belongs to group1 and group2 then the following limits will apply: diff --git a/birdhouse/optional-components/nvidia-multi-process-service/default.env b/birdhouse/optional-components/nvidia-multi-process-service/default.env index 56f9427a4..b91779c58 100644 --- a/birdhouse/optional-components/nvidia-multi-process-service/default.env +++ b/birdhouse/optional-components/nvidia-multi-process-service/default.env @@ -24,7 +24,7 @@ def _gpu_active_thread_percentage(spawner, value): See: https://docs.nvidia.com/deploy/mps/appendix-tools-and-interface-reference.html#cuda-mps-active-thread-percentage ''' - spawner.environment['CUDA_MPS_ACTIVE_THREAD_PERCENTAGE'] = value + spawner.environment['CUDA_MPS_ACTIVE_THREAD_PERCENTAGE'] = str(value) c.CustomDockerSpawner.resource_limit_callbacks.update({ 'gpu_device_mem_limit': _gpu_device_mem_limit, diff --git a/tests/unit/test_jupyterhub_custom.py b/tests/unit/test_jupyterhub_custom.py index dec9d5be1..b35c26878 100644 --- a/tests/unit/test_jupyterhub_custom.py +++ b/tests/unit/test_jupyterhub_custom.py @@ -582,7 +582,7 @@ def test_user_name_matches_gpu_ids_no_count(self, spawner, constants, generate_s spawner_inst.run_pre_spawn_hook() device_ids = spawner_inst.extra_host_config["device_requests"][0].device_ids assert len(device_ids) == 1 - assert device_ids[0] in [1, 2, 3] + assert device_ids[0] in ["1", "2", "3"] def test_user_name_matches_gpu_ids_with_count(self, spawner, constants, generate_spawner_inst): spawner_inst = generate_spawner_inst(spawner) @@ -596,7 +596,7 @@ def test_user_name_matches_gpu_ids_with_count(self, spawner, constants, generate spawner_inst.run_pre_spawn_hook() device_ids = spawner_inst.extra_host_config["device_requests"][0].device_ids assert len(device_ids) == 2 - assert set(device_ids) < {1, 2, 3} + assert set(device_ids) < {"1", "2", "3"} def test_additional_resource_limits(self, spawner, constants, generate_spawner_inst): mock = Mock() From 
7a07628071162cb4591667084e772fefb6824d4d Mon Sep 17 00:00:00 2001 From: mishaschwartz <4380924+mishaschwartz@users.noreply.github.com> Date: Tue, 10 Feb 2026 14:39:08 -0500 Subject: [PATCH 4/9] make limit vars mostly readonly --- .../nvidia-multi-process-service/02-readonly-cuda-vars.sh | 2 ++ .../config/jupyterhub/docker-compose-extra.yml | 4 ++++ .../nvidia-multi-process-service/default.env | 4 ++++ 3 files changed, 10 insertions(+) create mode 100644 birdhouse/optional-components/nvidia-multi-process-service/02-readonly-cuda-vars.sh create mode 100644 birdhouse/optional-components/nvidia-multi-process-service/config/jupyterhub/docker-compose-extra.yml diff --git a/birdhouse/optional-components/nvidia-multi-process-service/02-readonly-cuda-vars.sh b/birdhouse/optional-components/nvidia-multi-process-service/02-readonly-cuda-vars.sh new file mode 100644 index 000000000..01ced9b85 --- /dev/null +++ b/birdhouse/optional-components/nvidia-multi-process-service/02-readonly-cuda-vars.sh @@ -0,0 +1,2 @@ +readonly CUDA_MPS_PINNED_DEVICE_MEM_LIMIT +readonly CUDA_MPS_ACTIVE_THREAD_PERCENTAGE diff --git a/birdhouse/optional-components/nvidia-multi-process-service/config/jupyterhub/docker-compose-extra.yml b/birdhouse/optional-components/nvidia-multi-process-service/config/jupyterhub/docker-compose-extra.yml new file mode 100644 index 000000000..5233d0bbf --- /dev/null +++ b/birdhouse/optional-components/nvidia-multi-process-service/config/jupyterhub/docker-compose-extra.yml @@ -0,0 +1,4 @@ +services: + jupyterhub: + environment: + - NVIDIA_MPS_PROFILE_SCRIPT=${COMPOSE_DIR}/optional-components/nvidia-multi-process-service/02-readonly-cuda-vars.sh diff --git a/birdhouse/optional-components/nvidia-multi-process-service/default.env b/birdhouse/optional-components/nvidia-multi-process-service/default.env index b91779c58..d7661a789 100644 --- a/birdhouse/optional-components/nvidia-multi-process-service/default.env +++ 
b/birdhouse/optional-components/nvidia-multi-process-service/default.env @@ -41,4 +41,8 @@ def _gpu_set_mps_configs(spawner): spawner.volumes['nvidia_mps'] = '/tmp/nvidia-mps' c.CustomDockerSpawner.pre_spawn_hooks.append(_gpu_set_mps_configs) + +c.CustomDockerSpawner.volumes.update({ + os.environ['NVIDIA_MPS_PROFILE_SCRIPT']: '/etc/profile.d/02-readonly-cuda-vars.sh' +}) " From bea17f9fa0c6dc40c75ce2aa617180315d15d2b1 Mon Sep 17 00:00:00 2001 From: mishaschwartz <4380924+mishaschwartz@users.noreply.github.com> Date: Tue, 10 Feb 2026 15:38:42 -0500 Subject: [PATCH 5/9] documentation --- CHANGES.md | 35 ++++++++++++++++++- birdhouse/optional-components/README.rst | 35 +++++++++++++++++++ .../nvidia-multi-process-service/default.env | 2 ++ 3 files changed, 71 insertions(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index 0db75dcad..7ae7ce8eb 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -15,7 +15,40 @@ [Unreleased](https://github.com/bird-house/birdhouse-deploy/tree/master) (latest) ------------------------------------------------------------------------------------------------------------------ -[//]: # (list changes here, using '-' for each new entry, remove this when items are added) +## Changes + +- Add Nvidia MPS component for managing Nvidia GPU resources + +This creates a container running Nvidia's Multi Process Service ([MPS](https://docs.nvidia.com/deploy/mps/index.html)) +which helps manage multi-user GPU access. +It runs an alternative CUDA interface which manages resource allocation when multiple processes are running simultaneously +on the same GPU. 
+It also allows the node admin to set additional per-user limits through the `JUPYTERHUB_RESOURCE_LIMITS` variable +which configures Jupyterlab containers: + +- `"gpu_device_mem_limit"`: sets the `CUDA_MPS_PINNED_DEVICE_MEM_LIMIT` environment variable +- `"gpu_active_thread_percentage"`: sets the `CUDA_MPS_ACTIVE_THREAD_PERCENTAGE` environment variable + +For example, the following will give all users in the group named `"users"` access to three GPUs in their Jupyterlab +container. On the first one (id = 0) only 1GB of memory is available, on the second (id = 1) only 5GB, and on the third +(id = 2) only 10GB. Additionally, the container will be able to use 10% of available threads on the GPUs. + +```shell +export JUPYTERHUB_RESOURCE_LIMITS=' +[{ + "type": "group", + "name": "users", + "limits": { + "gpu_ids": ["0", "1", "2"], + "gpu_count": 3, + "gpu_device_mem_limit": "0=1G,1=5G,2=10G", + "gpu_active_thread_percentage": "10" + } +}] +' +``` + +Note that leaving any of these limits unset will default to allowing the user full access to the given resource. [2.22.0](https://github.com/bird-house/birdhouse-deploy/tree/2.22.0) (2026-02-09) ------------------------------------------------------------------------------------------------------------------ diff --git a/birdhouse/optional-components/README.rst b/birdhouse/optional-components/README.rst index 81ec452af..1aae0f27f 100644 --- a/birdhouse/optional-components/README.rst +++ b/birdhouse/optional-components/README.rst @@ -730,3 +730,38 @@ that your custom component creates and read the `proxy` access logs at a file de For example, if `PROXY_LOG_FILE` is set to ``access_file.log`` (the default) and you mount the `proxy-logs` volume to the ``/logs`` directory in your container, the `proxy` access logs can be read at ``/logs/access_file.log`` in your container. 
+ +Nvidia multi process service +---------------------------- + +This creates a container running Nvidia's Multi Process Service (MPS_) which helps manage multi-user GPU access. +It runs an alternative CUDA interface which manages resource allocation when multiple processes are running simultaneously +on the same GPU. +It also allows the node admin to set additional per-user limits through the ``JUPYTERHUB_RESOURCE_LIMITS`` variable +which configures Jupyterlab containers: + +* ``"gpu_device_mem_limit"``: sets the ``CUDA_MPS_PINNED_DEVICE_MEM_LIMIT`` environment variable +* ``"gpu_active_thread_percentage"``: sets the ``CUDA_MPS_ACTIVE_THREAD_PERCENTAGE`` environment variable + +For example, the following will give all users in the group named ``"users"`` access to three GPUs in their Jupyterlab +container. On the first one (id = 0) only 1GB of memory is available, on the second (id = 1) only 5GB, and on the third +(id = 2) only 10GB. Additionally, the container will be able to use 10% of available threads on the GPUs. + +.. code:: shell + + export JUPYTERHUB_RESOURCE_LIMITS=' + [{ + "type": "group", + "name": "users", + "limits": { + "gpu_ids": ["0", "1", "2"], + "gpu_count": 3, + "gpu_device_mem_limit": "0=1G,1=5G,2=10G", + "gpu_active_thread_percentage": "10" + } + }] + ' + +Note that leaving any of these limits unset will default to allowing the user full access to the given resource. + +.. 
_MPS: https://docs.nvidia.com/deploy/mps/index.html diff --git a/birdhouse/optional-components/nvidia-multi-process-service/default.env b/birdhouse/optional-components/nvidia-multi-process-service/default.env index d7661a789..0f932bd3e 100644 --- a/birdhouse/optional-components/nvidia-multi-process-service/default.env +++ b/birdhouse/optional-components/nvidia-multi-process-service/default.env @@ -42,6 +42,8 @@ def _gpu_set_mps_configs(spawner): c.CustomDockerSpawner.pre_spawn_hooks.append(_gpu_set_mps_configs) +# This sets the variables as readonly so that users can't unset/update the environment variables +# that set these limits in the jupyterlab docker container. c.CustomDockerSpawner.volumes.update({ os.environ['NVIDIA_MPS_PROFILE_SCRIPT']: '/etc/profile.d/02-readonly-cuda-vars.sh' }) From 9e9fa8a9d9b5839bab48049b0cb28579b8259e7c Mon Sep 17 00:00:00 2001 From: mishaschwartz <4380924+mishaschwartz@users.noreply.github.com> Date: Tue, 10 Feb 2026 15:47:40 -0500 Subject: [PATCH 6/9] more coments --- CHANGES.md | 71 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 42 insertions(+), 29 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 7ae7ce8eb..a1f69a3e2 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -19,36 +19,49 @@ - Add Nvidia MPS component for managing Nvidia GPU resources -This creates a container running Nvidia's Multi Process Service ([MPS](https://docs.nvidia.com/deploy/mps/index.html)) -which helps manage multi-user GPU access. -It runs an alternative CUDA interface which manages resource allocation when multiple processes are running simultaneously -on the same GPU. 
-It also allows the node admin to set additional per-user limits through the `JUPYTERHUB_RESOURCE_LIMITS` variable -which configures Jupyterlab containers: - -- `"gpu_device_mem_limit"`: sets the `CUDA_MPS_PINNED_DEVICE_MEM_LIMIT` environment variable -- `"gpu_active_thread_percentage"`: sets the `CUDA_MPS_ACTIVE_THREAD_PERCENTAGE` environment variable - -For example, the following will give all users in the group named `"users"` access to three GPUs in their Jupyterlab -container. On the first one (id = 0) only 1GB of memory is available, on the second (id = 1) only 5GB, and on the third -(id = 2) only 10GB. Additionally, the container will be able to use 10% of available threads on the GPUs. - -```shell -export JUPYTERHUB_RESOURCE_LIMITS=' -[{ - "type": "group", - "name": "users", - "limits": { - "gpu_ids": ["0", "1", "2"], - "gpu_count": 3, - "gpu_device_mem_limit": "0=1G,1=5G,2=10G", - "gpu_active_thread_percentage": "10" - } -}] -' -``` + This creates a container running Nvidia's Multi Process Service ([MPS](https://docs.nvidia.com/deploy/mps/index.html)) + which helps manage multi-user GPU access. + It runs an alternative CUDA interface which manages resource allocation when multiple processes are running simultaneously + on the same GPU. + It also allows the node admin to set additional per-user limits through the `JUPYTERHUB_RESOURCE_LIMITS` variable + which configures Jupyterlab containers: + + - `"gpu_device_mem_limit"`: sets the `CUDA_MPS_PINNED_DEVICE_MEM_LIMIT` environment variable + - `"gpu_active_thread_percentage"`: sets the `CUDA_MPS_ACTIVE_THREAD_PERCENTAGE` environment variable + + For example, the following will give all users in the group named `"users"` access to three GPUs in their Jupyterlab + container. On the first one (id = 0) only 1GB of memory is available, on the second (id = 1) only 5GB, and on the third + (id = 2) only 10GB. Additionally, the container will be able to use 10% of available threads on the GPUs. 
+ + ```shell + export JUPYTERHUB_RESOURCE_LIMITS=' + [{ + "type": "group", + "name": "users", + "limits": { + "gpu_ids": ["0", "1", "2"], + "gpu_count": 3, + "gpu_device_mem_limit": "0=1G,1=5G,2=10G", + "gpu_active_thread_percentage": "10" + } + }] + ' + ``` + + Note that leaving any of these limits unset will default to allowing the user full access to the given resource. + +- Update `CustomDockerSpawner` to make pre spawn hooks and resource limits more configurable + + Introduce `pre_spawn_hooks` and `resource_limit_callbacks` attributes to the `CustomDockerSpawner` class which + can be used to further customize the `CustomDockerSpawner` from optional components. This gives us a way to + add additional functionality without having to directly modify existing functions which may be overwritten by the + user when they configure the spawner in `JUPYTERHUB_CONFIG_OVERRIDE`. -Note that leaving any of these limits unset will default to allowing the user full access to the given resource. + This also introduces the `JUPYTERHUB_CONFIG_OVERRIDE_INTERNAL` variable which is identical to the + `JUPYTERHUB_CONFIG_OVERRIDE` variable except that it is intended to only be set by other components (not be the + user in the local environment file). This allows components to customize Jupyterhub deployments without interfering + with custom settings created by the user. Note that `JUPYTERHUB_CONFIG_OVERRIDE` has precedence over + `JUPYTERHUB_CONFIG_OVERRIDE_INTERNAL`. 
[2.22.0](https://github.com/bird-house/birdhouse-deploy/tree/2.22.0) (2026-02-09) ------------------------------------------------------------------------------------------------------------------ From 022b325805b05604fc7d4072ac9ddfe2d783d403 Mon Sep 17 00:00:00 2001 From: mishaschwartz <4380924+mishaschwartz@users.noreply.github.com> Date: Tue, 10 Feb 2026 15:53:21 -0500 Subject: [PATCH 7/9] additional fixes in changes --- CHANGES.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index a1f69a3e2..8b33dac51 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -63,6 +63,15 @@ with custom settings created by the user. Note that `JUPYTERHUB_CONFIG_OVERRIDE` has precedence over `JUPYTERHUB_CONFIG_OVERRIDE_INTERNAL`. +## Fixes + +- Update GPU limit examples to show expected syntax + + Fixes some examples that showed that `gpu_ids` could be given as integers if they were meant to be indexes. However, + due to a limitation of docker they must be strings. This modifies examples so that it is clear that strings must be + used and also updates the code to ensure that string values are only ever passed to docker when spawning a new + jupyterlab server. 
+ [2.22.0](https://github.com/bird-house/birdhouse-deploy/tree/2.22.0) (2026-02-09) ------------------------------------------------------------------------------------------------------------------ From 75a3771a461151322066ff743c5e3e110d96cf74 Mon Sep 17 00:00:00 2001 From: mishaschwartz <4380924+mishaschwartz@users.noreply.github.com> Date: Wed, 11 Feb 2026 10:39:59 -0500 Subject: [PATCH 8/9] review suggestions --- CHANGES.md | 9 ++++++--- birdhouse/components/jupyterhub/default.env | 4 +++- birdhouse/optional-components/README.rst | 20 +++++++++++++++++++ .../02-readonly-cuda-vars.sh | 2 ++ .../nvidia-multi-process-service/default.env | 6 +++--- .../pre-docker-compose-up.include | 5 +++++ 6 files changed, 39 insertions(+), 7 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 8b33dac51..5eb71b005 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -58,10 +58,13 @@ user when they configure the spawner in `JUPYTERHUB_CONFIG_OVERRIDE`. This also introduces the `JUPYTERHUB_CONFIG_OVERRIDE_INTERNAL` variable which is identical to the - `JUPYTERHUB_CONFIG_OVERRIDE` variable except that it is intended to only be set by other components (not be the + `JUPYTERHUB_CONFIG_OVERRIDE` variable except that it is intended to only be set by other components (not by the user in the local environment file). This allows components to customize Jupyterhub deployments without interfering - with custom settings created by the user. Note that `JUPYTERHUB_CONFIG_OVERRIDE` has precedence over - `JUPYTERHUB_CONFIG_OVERRIDE_INTERNAL`. + with custom settings created by the user. + + Note that the contents of `JUPYTERHUB_CONFIG_OVERRIDE` have precedence over the contents of + `JUPYTERHUB_CONFIG_OVERRIDE_INTERNAL`. So for example if you create a volume mount named `my_volume` in both, only + the one defined in `JUPYTERHUB_CONFIG_OVERRIDE` will be applied. 
## Fixes diff --git a/birdhouse/components/jupyterhub/default.env b/birdhouse/components/jupyterhub/default.env index b24637de5..8a53b520a 100644 --- a/birdhouse/components/jupyterhub/default.env +++ b/birdhouse/components/jupyterhub/default.env @@ -58,7 +58,9 @@ export JUPYTERHUB_CONFIG_OVERRIDE="" # Allows adding new configuration or overriding existing configurations # This variable should only be set by other components not directly by the user. # Users should set JUPYTERHUB_CONFIG_OVERRIDE instead. -export JUPYTERHUB_CONFIG_OVERRIDE_INTERNAL="" +# Note that this references itself in case another component has previously set this +# variable (before this file is processed). +export JUPYTERHUB_CONFIG_OVERRIDE_INTERNAL="${JUPYTERHUB_CONFIG_OVERRIDE_INTERNAL}" # URL used to verify that a logged in user has permission to access Jupyterhub # To disable this feature, unset this variable. However, disabling this feature is NOT diff --git a/birdhouse/optional-components/README.rst b/birdhouse/optional-components/README.rst index 1aae0f27f..82b4e228b 100644 --- a/birdhouse/optional-components/README.rst +++ b/birdhouse/optional-components/README.rst @@ -764,4 +764,24 @@ container. On the first one (id = 0) only 1GB of memory is available, on the sec Note that leaving any of these limits unset will default to allowing the user full access to the given resource. +.. note:: + + The ``mps`` docker container currently applies the MPS server to all GPUs. If you want to only apply the MPS server + to a subset of the GPUs available on your machine, you will need to create an additional component with a + ``docker-compose-extra.yml`` file that specifically overrides the container device settings for the ``mps`` container. + + For example, the docker compose configuration below would set the MPS server to only apply to GPUs with ids `"0"` and `"1"`. + +.. 
code-block:: yaml + + services: + mps: + deploy: + resources: + reservations: + devices: !override + - capabilities: [gpu] + driver: nvidia + device_ids: ["0", "1"] + .. _MPS: https://docs.nvidia.com/deploy/mps/index.html diff --git a/birdhouse/optional-components/nvidia-multi-process-service/02-readonly-cuda-vars.sh b/birdhouse/optional-components/nvidia-multi-process-service/02-readonly-cuda-vars.sh index 01ced9b85..c616041a6 100644 --- a/birdhouse/optional-components/nvidia-multi-process-service/02-readonly-cuda-vars.sh +++ b/birdhouse/optional-components/nvidia-multi-process-service/02-readonly-cuda-vars.sh @@ -1,2 +1,4 @@ +# Make these two variables that set limits readonly so that users cannot overwrite +# these variables from inside their jupyterlab container. readonly CUDA_MPS_PINNED_DEVICE_MEM_LIMIT readonly CUDA_MPS_ACTIVE_THREAD_PERCENTAGE diff --git a/birdhouse/optional-components/nvidia-multi-process-service/default.env b/birdhouse/optional-components/nvidia-multi-process-service/default.env index 0f932bd3e..433b22b9b 100644 --- a/birdhouse/optional-components/nvidia-multi-process-service/default.env +++ b/birdhouse/optional-components/nvidia-multi-process-service/default.env @@ -10,7 +10,7 @@ export DELAYED_EVAL=" export JUPYTERHUB_CONFIG_OVERRIDE_INTERNAL=" ${JUPYTERHUB_CONFIG_OVERRIDE_INTERNAL} -def _gpu_device_mem_limit(spawner, value): +def _gpu_device_mem_limit(spawner: CustomDockerSpawner, value: str) -> None: ''' Set memory limits for GPUs allocated to this user. 
@@ -18,7 +18,7 @@ def _gpu_device_mem_limit(spawner, value): ''' spawner.environment['CUDA_MPS_PINNED_DEVICE_MEM_LIMIT'] = value -def _gpu_active_thread_percentage(spawner, value): +def _gpu_active_thread_percentage(spawner: CustomDockerSpawner, value: str | int) -> None: ''' Set active thread percentage for GPUs allocated to this user @@ -31,7 +31,7 @@ c.CustomDockerSpawner.resource_limit_callbacks.update({ 'gpu_active_thread_percentage': _gpu_active_thread_percentage, }) -def _gpu_set_mps_configs(spawner): +def _gpu_set_mps_configs(spawner: CustomDockerSpawner) -> None: ''' Set configurations so this container uses the multi-process service running in the container named mps diff --git a/birdhouse/optional-components/nvidia-multi-process-service/pre-docker-compose-up.include b/birdhouse/optional-components/nvidia-multi-process-service/pre-docker-compose-up.include index f0e57b6cd..a105ddef5 100644 --- a/birdhouse/optional-components/nvidia-multi-process-service/pre-docker-compose-up.include +++ b/birdhouse/optional-components/nvidia-multi-process-service/pre-docker-compose-up.include @@ -1,3 +1,8 @@ +if ! command -v nvidia-smi >/dev/null; then + log ERROR "The optional-components/nvidia-multi-process-service component is enabled but no Nvidia GPUs or drivers can be detected on this system (no nvidia-smi command exists). Please ensure that GPUs are installed properly or disable this component." + expect_exit 1 +fi + if [ "$(nvidia-smi --query-gpu=compute_mode --format=csv,noheader | grep -vc 'Exclusive_Process')" -ne 0 ]; then log WARN "Nvidia GPUs with compute mode set to something other than EXCLUSIVE_PROCESS detected. We recommend you set the compute mode to EXCLUSIVE_PROCESS when enabling nvidia's Multi Process Service (MPS)." 
fi From d616dcf6e111995e9db02ff4f2d0fc89f181391c Mon Sep 17 00:00:00 2001 From: mishaschwartz <4380924+mishaschwartz@users.noreply.github.com> Date: Wed, 11 Mar 2026 11:59:18 -0400 Subject: [PATCH 9/9] more review suggestions --- CHANGES.md | 2 +- .../jupyterhub_custom/custom_dockerspawner.py | 8 ++++---- .../nvidia-multi-process-service/default.env | 2 +- tests/unit/test_jupyterhub_custom.py | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 5eb71b005..bda478821 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -52,7 +52,7 @@ - Update `CustomDockerSpawner` to make pre spawn hooks and resource limits more configurable - Introduce `pre_spawn_hooks` and `resource_limit_callbacks` attributes to the `CustomDockerSpawner` class which + Introduce `builtin_pre_spawn_hooks` and `resource_limit_callbacks` attributes to the `CustomDockerSpawner` class which can be used to further customize the `CustomDockerSpawner` from optional components. This gives us a way to add additional functionality without having to directly modify existing functions which may be overwritten by the user when they configure the spawner in `JUPYTERHUB_CONFIG_OVERRIDE`. 
diff --git a/birdhouse/components/jupyterhub/jupyterhub_custom/jupyterhub_custom/custom_dockerspawner.py b/birdhouse/components/jupyterhub/jupyterhub_custom/jupyterhub_custom/custom_dockerspawner.py index dd1e76b16..b2bf1d80c 100644 --- a/birdhouse/components/jupyterhub/jupyterhub_custom/jupyterhub_custom/custom_dockerspawner.py +++ b/birdhouse/components/jupyterhub/jupyterhub_custom/jupyterhub_custom/custom_dockerspawner.py @@ -238,7 +238,7 @@ def _default_start_timeout(self) -> int: ), ) - pre_spawn_hooks = List( + builtin_pre_spawn_hooks = List( Callable(), config=True, help=( @@ -248,8 +248,8 @@ def _default_start_timeout(self) -> int: ), ) - @default("pre_spawn_hooks") - def _default_pre_spawn_hooks(self) -> list: + @default("builtin_pre_spawn_hooks") + def _default_builtin_pre_spawn_hooks(self) -> list: return [ CustomDockerSpawner.__create_dir_hook, CustomDockerSpawner.__limit_resource_hook, @@ -362,7 +362,7 @@ def __limit_resource_hook(self) -> None: def run_pre_spawn_hook(self) -> None: """Run the builtin pre-spawn hooks as well as any set by pre_spawn_hook if defined.""" - for hook in self.pre_spawn_hooks: + for hook in self.builtin_pre_spawn_hooks: hook(self) if self.pre_spawn_hook: self.pre_spawn_hook(self) diff --git a/birdhouse/optional-components/nvidia-multi-process-service/default.env b/birdhouse/optional-components/nvidia-multi-process-service/default.env index 433b22b9b..6c0b0d28e 100644 --- a/birdhouse/optional-components/nvidia-multi-process-service/default.env +++ b/birdhouse/optional-components/nvidia-multi-process-service/default.env @@ -40,7 +40,7 @@ def _gpu_set_mps_configs(spawner: CustomDockerSpawner) -> None: spawner.extra_host_config['ipc_mode'] = 'container:mps' spawner.volumes['nvidia_mps'] = '/tmp/nvidia-mps' -c.CustomDockerSpawner.pre_spawn_hooks.append(_gpu_set_mps_configs) +c.CustomDockerSpawner.builtin_pre_spawn_hooks.append(_gpu_set_mps_configs) # This sets the variables as readonly so that users can't unset/update the 
environment variables # that set these limits in the jupyterlab docker container. diff --git a/tests/unit/test_jupyterhub_custom.py b/tests/unit/test_jupyterhub_custom.py index b35c26878..848957adc 100644 --- a/tests/unit/test_jupyterhub_custom.py +++ b/tests/unit/test_jupyterhub_custom.py @@ -617,7 +617,7 @@ class TestAdditionalPreSpawnHooks: def test_custom_pre_spawn_hook(self, spawner, generate_spawner_inst): mock = Mock() spawner_inst = generate_spawner_inst(spawner) - spawner_inst.pre_spawn_hooks.append(mock) + spawner_inst.builtin_pre_spawn_hooks.append(mock) spawner_inst.run_pre_spawn_hook() assert mock.call_args == ((spawner_inst,),)