From dba5d7d9e3a8a06cbfc6bbd0c495e7cb0ac8aeab Mon Sep 17 00:00:00 2001 From: suhasthegame <44828963+suhasthegame@users.noreply.github.com> Date: Wed, 3 Jan 2024 10:54:47 -0500 Subject: [PATCH 01/28] Added code to support Singularity on HiperGator --- girder_worker/docker/tasks/__init__.py | 120 ++++++++++++++++++++++++- girder_worker/docker/utils.py | 14 +++ girder_worker/entrypoint.py | 5 +- tests/integration/requirements.txt | 1 + 4 files changed, 136 insertions(+), 4 deletions(-) diff --git a/girder_worker/docker/tasks/__init__.py b/girder_worker/docker/tasks/__init__.py index b07be509..a6535f17 100644 --- a/girder_worker/docker/tasks/__init__.py +++ b/girder_worker/docker/tasks/__init__.py @@ -4,6 +4,9 @@ import socket import sys import threading +import time +import drmaa + try: import docker from docker.errors import DockerException, APIError, InvalidVersion @@ -33,6 +36,7 @@ BLACKLISTED_DOCKER_RUN_ARGS = ['tty', 'detach'] +JOB_STATUS = utils.job_status_codes() def _pull_image(image): @@ -359,6 +363,43 @@ def _cleanup_temp_volumes(self, temp_volumes, default_temp_volume): shutil.rmtree(v.host_path) +#Class for SingularityTask similar to DockerTask +class SingularityTask(Task): + def __call__(self, image,*args,container_args,bind_paths,**kwargs): + image = image or kwargs.pop('image',None) + container_args = container_args or kwargs.pop('container_args',[]) + bind_paths = bind_paths or kwargs.pop('bind_paths',{}) + temporary_directory = os.getenv('TEMPORARY_DIRECTORY','/tmp') + log_file_path = os.getenv('SINGULARITY_LOG_FILE','/log') + + #check if the user has provided the image of the plugin. + if not image: + raise ValueError('Plugin Image required for Singularity') + + #Commnad to be called for executing Singularity Job + bind_paths[temporary_directory] = '/output' + super().__call__(*args,**kwargs) + + qos = kwargs.pop('qos', 'pinaki.sarder') + cpus = kwargs.pop('cpus', 4) + gpus = kwargs.pop('gpus', 1) + memory = kwargs.pop('memory', '4GB') + other_slurm_options = kwargs.pop('other_slurm_options', '') + + slurm_script = _generate_slurm_script(image, container_args, bind_paths, qos, cpus, gpus, memory, other_slurm_options) + + exit_status = _monitor_singularity_job(self,slurm_script=slurm_script, log_file_path=log_file_path,temp_directory=temporary_directory) + #Handling exit status based on the DRM package's expected exit status codes + + if exit_status == JOB_STATUS.SUCCESS: + logger.info(f"Singularity job completed Successfully.") + elif exit_status == JOB_STATUS.FAILURE: + logger.error(f"Singularity Job exited with error") + elif exit_status == JOB_STATUS.CANCELLED: + logger.info('Singularity Job cancelled by the user') + + + def _docker_run(task, image, pull_image=True, entrypoint=None, container_args=None, volumes=None, remove_container=True, stream_connectors=None, **kwargs): volumes = volumes or {} @@ -419,10 +460,8 @@ def _docker_run(task, image, pull_image=True, entrypoint=None, container_args=No results = [] if hasattr(task.request, 'girder_result_hooks'): results = (None,) * len(task.request.girder_result_hooks) - return results - @app.task(base=DockerTask, bind=True) def docker_run(task, image, pull_image=True, entrypoint=None, container_args=None, volumes=None, remove_container=True, **kwargs): @@ -450,3 +489,80 @@ def docker_run(task, image, pull_image=True, entrypoint=None, container_args=Non return _docker_run( task, image, pull_image, entrypoint, container_args, volumes, remove_container, **kwargs) + +@app.task(base=SingularityTask, bind=True) +def 
singularity_run(task, image, *args, container_args=None, bind_paths=None, **kwargs): + return task(image,*args,container_args,bind_paths,**kwargs) + +#This function is used to check whether we need to switch to singularity or not. +def use_singularity(): + runtime = os.environ.get('RUNTIME') + if runtime == 'SINGULARITY': + return True + if runtime == 'DOCKER': + return False + try: + #Check whether we are connected to a docker socket. + with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as s: + return s.connect_ex('/var/run/docker.sock') != 0 + except socket.error: + return True + +def _generate_slurm_script(image, container_args, bind_paths, qos, cpus, gpus, memory, other_slurm_options): + # Construct the bind option for the Singularity command + bind_option = ','.join([f"{host}:{container}" for host, container in bind_paths.items()]) + + # Construct the Singularity command + singularity_command = f'singularity exec --bind {bind_option} docker://{image} {" ".join(container_args)}' + + # Generate the SLURM job script with the specified parameters + slurm_script = f"""#!/bin/bash +#SBATCH --qos={qos} +#SBATCH --cpus-per-task={cpus} +#SBATCH --gres=gpu:{gpus} +#SBATCH --mem={memory} +# Add any other SLURM options here +{other_slurm_options} + +{singularity_command} +""" + return slurm_script + +def _monitor_singularity_job(task,slurm_script,log_file_path,temp_directory): + """Create a drmaa session and monitor the job accordingly""" + def job_monitor(): + s = drmaa.Session() + s.initialize() + jt = s.createJobTemplate() + jt.remoteCommand = '/bin/bash' + jt.args = ['-c', slurm_script] + jt.workingDirectory = temp_directory + jt.outputPath = ':' + log_file_path + jt.errorPath = ':' + log_file_path + jobid = s.runJob(jt) + logger.log((f'Submitted singularity job with jobid {jobid}')) + while True: + job_info = s.jobStatus(jobid) + if job_info in [drmaa.JobState.DONE, drmaa.JobState.FAILED]: + break + + # Check if the task has been aborted by the user + if task.is_aborted: + s.control(jobid, drmaa.JobControl.TERMINATE) + logger.info(f'Job {jobid} was cancelled by user.') + return JOB_STATUS.CANCELLED + + time.sleep(5) # Sleep to avoid busy waiting + + exit_status = s.wait(jobid, drmaa.Session.TIMEOUT_WAIT_FOREVER).exitStatus + logger.info(f'Job {jobid} finished with exit status {exit_status}') + + s.deleteJobTemplate(jt) + return JOB_STATUS.SUCCESS if exit_status == 0 else JOB_STATUS.FAILURE + + # Start the job monitor in a new thread + monitor_thread = threading.Thread(target=job_monitor) + monitor_thread.start() + monitor_thread.join() + + return job_monitor() \ No newline at end of file diff --git a/girder_worker/docker/utils.py b/girder_worker/docker/utils.py index 4f87b56f..c8e55a14 100644 --- a/girder_worker/docker/utils.py +++ b/girder_worker/docker/utils.py @@ -1,3 +1,4 @@ +from types import SimpleNamespace import select import uuid import os @@ -103,3 +104,16 @@ def chmod_writable(host_paths): except DockerException: logger.exception('Error setting perms on docker volumes %s.' 
% host_paths) raise + + +JOB_STATUS = { + 'SUCCESS': 'Success', + 'FAILURE': "Failure", + 'CANCELLED': 'Cancelled' + } + +def job_status_codes(): + statusCodes = SimpleNamespace(JOB_STATUS) + return statusCodes + + diff --git a/girder_worker/entrypoint.py b/girder_worker/entrypoint.py index c6cd67ae..703c1bd4 100644 --- a/girder_worker/entrypoint.py +++ b/girder_worker/entrypoint.py @@ -1,7 +1,7 @@ from importlib import import_module import celery from girder_worker_utils import decorators - +from girder_worker.docker.tasks import use_singularity from stevedore import extension @@ -61,7 +61,8 @@ def get_module_tasks(module_name): if not hasattr(func, '__call__'): # filter out objects that are not callable continue - + if (use_singularity() and name == 'docker_run') or (not use_singularity() and name == 'singularity_run'): + continue try: decorators.get_description_attribute(func) tasks[full_name] = func diff --git a/tests/integration/requirements.txt b/tests/integration/requirements.txt index ef6bb370..e08d59e0 100644 --- a/tests/integration/requirements.txt +++ b/tests/integration/requirements.txt @@ -5,3 +5,4 @@ requests-toolbelt girder_client==2.3.0 girder-worker-utils>=0.8.0 celery>=4.0.0 +drmaa From 75b815628d90283789438b362343c9c91808f293 Mon Sep 17 00:00:00 2001 From: suhasthegame <44828963+suhasthegame@users.noreply.github.com> Date: Tue, 9 Jan 2024 10:49:02 -0500 Subject: [PATCH 02/28] Testing for docker_entrypoint --- girder_worker/docker/tasks/__init__.py | 57 +++++++++++++------------- girder_worker/entrypoint.py | 4 +- 2 files changed, 31 insertions(+), 30 deletions(-) diff --git a/girder_worker/docker/tasks/__init__.py b/girder_worker/docker/tasks/__init__.py index a6535f17..40ca596f 100644 --- a/girder_worker/docker/tasks/__init__.py +++ b/girder_worker/docker/tasks/__init__.py @@ -374,7 +374,8 @@ def __call__(self, image,*args,container_args,bind_paths,**kwargs): #check if the user has provided the image of the plugin. if not image: - raise ValueError('Plugin Image required for Singularity') + image = "sarderlab/histo-cloud:latest" + # raise ValueError('Plugin Image required for Singularity') #Commnad to be called for executing Singularity Job bind_paths[temporary_directory] = '/output' @@ -462,33 +463,33 @@ def _docker_run(task, image, pull_image=True, entrypoint=None, container_args=No results = (None,) * len(task.request.girder_result_hooks) return results -@app.task(base=DockerTask, bind=True) -def docker_run(task, image, pull_image=True, entrypoint=None, container_args=None, - volumes=None, remove_container=True, **kwargs): - """ - This task runs a docker container. For details on how to use this task, see the - :ref:`docker-run` guide. - - :param task: The bound task reference. - :type task: :py:class:`girder_worker.task.Task` - :param image: The docker image identifier. - :type image: str - :param pull_image: Whether to explicitly pull the image prior to running the container. - :type pull_image: bool - :param entrypoint: Alternative entrypoint to use when running the container. - :type entrypoint: str - :param container_args: Arguments to pass to the container. - :type container_args: list - :param volumes: Volumes to expose to the container. - :type volumes: dict - :param remove_container: Whether to delete the container after the task is done. - :type remove_container: bool - :return: Fulfilled result hooks. 
- :rtype: list - """ - return _docker_run( - task, image, pull_image, entrypoint, container_args, volumes, - remove_container, **kwargs) +# @app.task(base=DockerTask, bind=True) +# def docker_run(task, image, pull_image=True, entrypoint=None, container_args=None, +# volumes=None, remove_container=True, **kwargs): +# """ +# This task runs a docker container. For details on how to use this task, see the +# :ref:`docker-run` guide. + +# :param task: The bound task reference. +# :type task: :py:class:`girder_worker.task.Task` +# :param image: The docker image identifier. +# :type image: str +# :param pull_image: Whether to explicitly pull the image prior to running the container. +# :type pull_image: bool +# :param entrypoint: Alternative entrypoint to use when running the container. +# :type entrypoint: str +# :param container_args: Arguments to pass to the container. +# :type container_args: list +# :param volumes: Volumes to expose to the container. +# :type volumes: dict +# :param remove_container: Whether to delete the container after the task is done. +# :type remove_container: bool +# :return: Fulfilled result hooks. +# :rtype: list +# """ +# return _docker_run( +# task, image, pull_image, entrypoint, container_args, volumes, +# remove_container, **kwargs) @app.task(base=SingularityTask, bind=True) def singularity_run(task, image, *args, container_args=None, bind_paths=None, **kwargs): diff --git a/girder_worker/entrypoint.py b/girder_worker/entrypoint.py index 703c1bd4..287a5a03 100644 --- a/girder_worker/entrypoint.py +++ b/girder_worker/entrypoint.py @@ -61,8 +61,8 @@ def get_module_tasks(module_name): if not hasattr(func, '__call__'): # filter out objects that are not callable continue - if (use_singularity() and name == 'docker_run') or (not use_singularity() and name == 'singularity_run'): - continue + # if (use_singularity() and name == 'docker_run') or (not use_singularity() and name == 'singularity_run'): + # continue try: decorators.get_description_attribute(func) tasks[full_name] = func From 2a9b1958021bcf36fc0f11066f430c3900ee3227 Mon Sep 17 00:00:00 2001 From: suhasthegame <44828963+suhasthegame@users.noreply.github.com> Date: Wed, 14 Feb 2024 14:58:56 -0500 Subject: [PATCH 03/28] Intermediate Commit --ignore --- girder_worker/docker/tasks/__init__.py | 667 +++++++++++++------------ 1 file changed, 335 insertions(+), 332 deletions(-) diff --git a/girder_worker/docker/tasks/__init__.py b/girder_worker/docker/tasks/__init__.py index 40ca596f..4862760d 100644 --- a/girder_worker/docker/tasks/__init__.py +++ b/girder_worker/docker/tasks/__init__.py @@ -39,264 +39,264 @@ JOB_STATUS = utils.job_status_codes() -def _pull_image(image): - """ - Pulls the specified Docker image onto this worker. 
- """ - client = docker.from_env(version='auto') - try: - client.images.pull(image) - except DockerException: - logger.exception('Error pulling Docker image %s:' % image) - raise - - -def _get_docker_network(): - try: - ip = socket.gethostbyname(socket.gethostname()) - if 'DOCKER_CLIENT_TIMEOUT' in os.environ: - timeout = int(os.environ['DOCKER_CLIENT_TIMEOUT']) - client = docker.from_env(version='auto', timeout=timeout) - else: - client = docker.from_env(version='auto') - for container in client.containers.list(all=True, filters={'status': 'running'}): - for nw in container.attrs['NetworkSettings']['Networks'].values(): - if nw['IPAddress'] == ip: - return 'container:%s' % container.id - except Exception: - logger.exception('Failed to get docker network') - - -def _remove_stopped_container(client, name): - if name is None: - return - for container in client.containers.list(all=True, filters={'name': name}): - try: - logger.info('Removing container %s ' % (name)) - container.remove() - except Exception: - pass - - -def _run_container(image, container_args, **kwargs): - # TODO we could allow configuration of non default socket - if 'DOCKER_CLIENT_TIMEOUT' in os.environ: - timeout = int(os.environ['DOCKER_CLIENT_TIMEOUT']) - client = docker.from_env(version='auto', timeout=timeout) - else: - client = docker.from_env(version='auto') - - runtime = kwargs.pop('runtime', None) - origRuntime = runtime - if runtime is None and nvidia.is_nvidia_image(client.api, image): - runtime = 'nvidia' - - container_args = [str(arg) for arg in container_args] - - if 'network' not in kwargs and 'network_mode' not in kwargs: - docker_network = _get_docker_network() - if docker_network: - kwargs = kwargs.copy() - kwargs['network'] = docker_network - - logger.info('Running container: image: %s args: %s runtime: %s kwargs: %s' - % (image, container_args, runtime, kwargs)) - try: - name = None - try: - if runtime == 'nvidia' and kwargs.get('device_requests') is None: - # Docker < 19.03 required the runtime='nvidia' argument. - # Newer versions require a device request for some number of - # GPUs. This should handle either version of the docker - # daemon. - try: - device_requests_kwargs = kwargs.copy() - device_requests_kwargs['device_requests'] = [ - docker.types.DeviceRequest(count=-1, capabilities=[['gpu']])] - name = device_requests_kwargs.setdefault( - 'name', - 'girder_worker_' + datetime.datetime.now().strftime('%Y%m%d_%H%M%S_%f')) - return client.containers.run( - image, container_args, **device_requests_kwargs) - except (APIError, InvalidVersion): - _remove_stopped_container(client, name) - pass - kwargs = kwargs.copy() - name = kwargs.setdefault( - 'name', - 'girder_worker_' + datetime.datetime.now().strftime('%Y%m%d_%H%M%S_%f')) - return client.containers.run( - image, container_args, runtime=runtime, **kwargs) - except APIError: - _remove_stopped_container(client, name) - if origRuntime is None and runtime is not None: - kwargs = kwargs.copy() - name = kwargs.setdefault( - 'name', - 'girder_worker_' + datetime.datetime.now().strftime('%Y%m%d_%H%M%S_%f')) - return client.containers.run(image, container_args, **kwargs) - else: - raise - except DockerException: - logger.exception('Exception when running docker container') - raise - - -class _SocketReader(FileDescriptorReader): - """ - Used to mediate the difference between the python 2/3 implementation of docker-py - with python 2 attach_socket(...) returns a socket like object, with python 3 - it returns an instance of SocketIO. 
- """ - def __init__(self, socket): - self._socket = socket - - def read(self, n): - # socket - if hasattr(self._socket, 'recv'): - return self._socket.recv(n) - - # SocketIO - return self._socket.read(n) - - def fileno(self): - return self._socket.fileno() - - def close(self): - self._socket.close() - - -def _run_select_loop( # noqa: C901 - task, container, read_stream_connectors, write_stream_connectors): - stdout = None - stderr = None - try: - # attach to standard streams - stdout = container.attach_socket(params={ - 'stdout': True, - 'logs': True, - 'stream': True - }) - - stderr = container.attach_socket(params={ - 'stderr': True, - 'logs': True, - 'stream': True - }) - - def exit_condition(): - container.reload() - return container.status in {'exited', 'dead'} or task.canceled - - # Look for ContainerStdOut and ContainerStdErr instances that need - # to be replace with the real container streams. - stdout_connected = False - for read_stream_connector in read_stream_connectors: - if isinstance(read_stream_connector.input, ContainerStdOut): - stdout_reader = _SocketReader(stdout) - read_stream_connector.output = DockerStreamPushAdapter(read_stream_connector.output) - read_stream_connector.input = stdout_reader - stdout_connected = True - break - - stderr_connected = False - for read_stream_connector in read_stream_connectors: - if isinstance(read_stream_connector.input, ContainerStdErr): - stderr_reader = _SocketReader(stderr) - read_stream_connector.output = DockerStreamPushAdapter(read_stream_connector.output) - read_stream_connector.input = stderr_reader - stderr_connected = True - break - - # If not stdout and stderr connection has been provided just use - # sys.stdXXX - if not stdout_connected: - stdout_reader = _SocketReader(stdout) - connector = FDReadStreamConnector( - stdout_reader, - DockerStreamPushAdapter(StdStreamWriter(sys.stdout))) - read_stream_connectors.append(connector) - - if not stderr_connected: - stderr_reader = _SocketReader(stderr) - connector = FDReadStreamConnector( - stderr_reader, - DockerStreamPushAdapter(StdStreamWriter(sys.stderr))) - read_stream_connectors.append(connector) - - # Run select loop - utils.select_loop(exit_condition=exit_condition, - readers=read_stream_connectors, - writers=write_stream_connectors) - - if task.canceled: - try: - msg = 'Asking to stop container: %s' % container.id - logger.info(msg) - container.stop() - # Catch the ReadTimeout from requests and wait for container to - # exit. See https://github.com/docker/docker-py/issues/1374 for - # more details. - except ReadTimeout: - tries = 10 - while tries > 0: - container.reload() - if container.status == 'exited': - break - - if container.status != 'exited': - msg = 'Unable to stop container: %s' % container.id - logger.error(msg) - except DockerException as dex: - logger.error(dex) - raise - - container.reload() - exit_code = container.attrs['State']['ExitCode'] - if not task.canceled and exit_code != 0: - raise DockerException('Non-zero exit code from docker container (%d).' 
% exit_code) - finally: - # Close our stdout and stderr sockets - if stdout: - stdout.close() - if stderr: - stderr.close() - - -def _handle_streaming_args(args): - processed_args = [] - write_streams = [] - read_streams = [] - - def _maybe_path(arg): - if hasattr(arg, 'path'): - return arg.path() - - # Don't pass anything - return '' - - for arg in args: - if isinstance(arg, FDStreamConnector): - if isinstance(arg, FDWriteStreamConnector): - write_streams.append(arg) - arg = _maybe_path(arg.output) - - elif isinstance(arg, FDReadStreamConnector): - read_streams.append(arg) - arg = _maybe_path(arg.input) - - processed_args.append(arg) - - return (processed_args, read_streams, write_streams) - - -class _RequestDefaultTemporaryVolume(_TemporaryVolumeBase): - def __init__(self): - super().__init__(None, None) - self._make_paths() - - def transform(self, **kwargs): - self._transformed = True +# def _pull_image(image): +# """ +# Pulls the specified Docker image onto this worker. +# """ +# client = docker.from_env(version='auto') +# try: +# client.images.pull(image) +# except DockerException: +# logger.exception('Error pulling Docker image %s:' % image) +# raise + + +# def _get_docker_network(): +# try: +# ip = socket.gethostbyname(socket.gethostname()) +# if 'DOCKER_CLIENT_TIMEOUT' in os.environ: +# timeout = int(os.environ['DOCKER_CLIENT_TIMEOUT']) +# client = docker.from_env(version='auto', timeout=timeout) +# else: +# client = docker.from_env(version='auto') +# for container in client.containers.list(all=True, filters={'status': 'running'}): +# for nw in container.attrs['NetworkSettings']['Networks'].values(): +# if nw['IPAddress'] == ip: +# return 'container:%s' % container.id +# except Exception: +# logger.exception('Failed to get docker network') + + +# def _remove_stopped_container(client, name): +# if name is None: +# return +# for container in client.containers.list(all=True, filters={'name': name}): +# try: +# logger.info('Removing container %s ' % (name)) +# container.remove() +# except Exception: +# pass + + +# def _run_container(image, container_args, **kwargs): +# # TODO we could allow configuration of non default socket +# if 'DOCKER_CLIENT_TIMEOUT' in os.environ: +# timeout = int(os.environ['DOCKER_CLIENT_TIMEOUT']) +# client = docker.from_env(version='auto', timeout=timeout) +# else: +# client = docker.from_env(version='auto') + +# runtime = kwargs.pop('runtime', None) +# origRuntime = runtime +# if runtime is None and nvidia.is_nvidia_image(client.api, image): +# runtime = 'nvidia' + +# container_args = [str(arg) for arg in container_args] + +# if 'network' not in kwargs and 'network_mode' not in kwargs: +# docker_network = _get_docker_network() +# if docker_network: +# kwargs = kwargs.copy() +# kwargs['network'] = docker_network + +# logger.info('Running container: image: %s args: %s runtime: %s kwargs: %s' +# % (image, container_args, runtime, kwargs)) +# try: +# name = None +# try: +# if runtime == 'nvidia' and kwargs.get('device_requests') is None: +# # Docker < 19.03 required the runtime='nvidia' argument. +# # Newer versions require a device request for some number of +# # GPUs. This should handle either version of the docker +# # daemon. 
+# try: +# device_requests_kwargs = kwargs.copy() +# device_requests_kwargs['device_requests'] = [ +# docker.types.DeviceRequest(count=-1, capabilities=[['gpu']])] +# name = device_requests_kwargs.setdefault( +# 'name', +# 'girder_worker_' + datetime.datetime.now().strftime('%Y%m%d_%H%M%S_%f')) +# return client.containers.run( +# image, container_args, **device_requests_kwargs) +# except (APIError, InvalidVersion): +# _remove_stopped_container(client, name) +# pass +# kwargs = kwargs.copy() +# name = kwargs.setdefault( +# 'name', +# 'girder_worker_' + datetime.datetime.now().strftime('%Y%m%d_%H%M%S_%f')) +# return client.containers.run( +# image, container_args, runtime=runtime, **kwargs) +# except APIError: +# _remove_stopped_container(client, name) +# if origRuntime is None and runtime is not None: +# kwargs = kwargs.copy() +# name = kwargs.setdefault( +# 'name', +# 'girder_worker_' + datetime.datetime.now().strftime('%Y%m%d_%H%M%S_%f')) +# return client.containers.run(image, container_args, **kwargs) +# else: +# raise +# except DockerException: +# logger.exception('Exception when running docker container') +# raise + + +# class _SocketReader(FileDescriptorReader): +# """ +# Used to mediate the difference between the python 2/3 implementation of docker-py +# with python 2 attach_socket(...) returns a socket like object, with python 3 +# it returns an instance of SocketIO. +# """ +# def __init__(self, socket): +# self._socket = socket + +# def read(self, n): +# # socket +# if hasattr(self._socket, 'recv'): +# return self._socket.recv(n) + +# # SocketIO +# return self._socket.read(n) + +# def fileno(self): +# return self._socket.fileno() + +# def close(self): +# self._socket.close() + + +# def _run_select_loop( # noqa: C901 +# task, container, read_stream_connectors, write_stream_connectors): +# stdout = None +# stderr = None +# try: +# # attach to standard streams +# stdout = container.attach_socket(params={ +# 'stdout': True, +# 'logs': True, +# 'stream': True +# }) + +# stderr = container.attach_socket(params={ +# 'stderr': True, +# 'logs': True, +# 'stream': True +# }) + +# def exit_condition(): +# container.reload() +# return container.status in {'exited', 'dead'} or task.canceled + +# # Look for ContainerStdOut and ContainerStdErr instances that need +# # to be replace with the real container streams. 
+# stdout_connected = False +# for read_stream_connector in read_stream_connectors: +# if isinstance(read_stream_connector.input, ContainerStdOut): +# stdout_reader = _SocketReader(stdout) +# read_stream_connector.output = DockerStreamPushAdapter(read_stream_connector.output) +# read_stream_connector.input = stdout_reader +# stdout_connected = True +# break + +# stderr_connected = False +# for read_stream_connector in read_stream_connectors: +# if isinstance(read_stream_connector.input, ContainerStdErr): +# stderr_reader = _SocketReader(stderr) +# read_stream_connector.output = DockerStreamPushAdapter(read_stream_connector.output) +# read_stream_connector.input = stderr_reader +# stderr_connected = True +# break + +# # If not stdout and stderr connection has been provided just use +# # sys.stdXXX +# if not stdout_connected: +# stdout_reader = _SocketReader(stdout) +# connector = FDReadStreamConnector( +# stdout_reader, +# DockerStreamPushAdapter(StdStreamWriter(sys.stdout))) +# read_stream_connectors.append(connector) + +# if not stderr_connected: +# stderr_reader = _SocketReader(stderr) +# connector = FDReadStreamConnector( +# stderr_reader, +# DockerStreamPushAdapter(StdStreamWriter(sys.stderr))) +# read_stream_connectors.append(connector) + +# # Run select loop +# utils.select_loop(exit_condition=exit_condition, +# readers=read_stream_connectors, +# writers=write_stream_connectors) + +# if task.canceled: +# try: +# msg = 'Asking to stop container: %s' % container.id +# logger.info(msg) +# container.stop() +# # Catch the ReadTimeout from requests and wait for container to +# # exit. See https://github.com/docker/docker-py/issues/1374 for +# # more details. +# except ReadTimeout: +# tries = 10 +# while tries > 0: +# container.reload() +# if container.status == 'exited': +# break + +# if container.status != 'exited': +# msg = 'Unable to stop container: %s' % container.id +# logger.error(msg) +# except DockerException as dex: +# logger.error(dex) +# raise + +# container.reload() +# exit_code = container.attrs['State']['ExitCode'] +# if not task.canceled and exit_code != 0: +# raise DockerException('Non-zero exit code from docker container (%d).' % exit_code) +# finally: +# # Close our stdout and stderr sockets +# if stdout: +# stdout.close() +# if stderr: +# stderr.close() + + +# def _handle_streaming_args(args): +# processed_args = [] +# write_streams = [] +# read_streams = [] + +# def _maybe_path(arg): +# if hasattr(arg, 'path'): +# return arg.path() + +# # Don't pass anything +# return '' + +# for arg in args: +# if isinstance(arg, FDStreamConnector): +# if isinstance(arg, FDWriteStreamConnector): +# write_streams.append(arg) +# arg = _maybe_path(arg.output) + +# elif isinstance(arg, FDReadStreamConnector): +# read_streams.append(arg) +# arg = _maybe_path(arg.input) + +# processed_args.append(arg) + +# return (processed_args, read_streams, write_streams) + + +# class _RequestDefaultTemporaryVolume(_TemporaryVolumeBase): +# def __init__(self): +# super().__init__(None, None) +# self._make_paths() + +# def transform(self, **kwargs): +# self._transformed = True class DockerTask(Task): @@ -374,9 +374,8 @@ def __call__(self, image,*args,container_args,bind_paths,**kwargs): #check if the user has provided the image of the plugin. 
if not image: - image = "sarderlab/histo-cloud:latest" - # raise ValueError('Plugin Image required for Singularity') - + raise ValueError('Plugin Image required for Singularity') + logger.write(f'Image {image}') #Commnad to be called for executing Singularity Job bind_paths[temporary_directory] = '/output' super().__call__(*args,**kwargs) @@ -401,67 +400,67 @@ def __call__(self, image,*args,container_args,bind_paths,**kwargs): -def _docker_run(task, image, pull_image=True, entrypoint=None, container_args=None, - volumes=None, remove_container=True, stream_connectors=None, **kwargs): - volumes = volumes or {} - stream_connectors = stream_connectors or [] - container_args = container_args or [] - - if pull_image: - logger.info('Pulling Docker image: %s', image) - _pull_image(image) - - if entrypoint is not None and not isinstance(entrypoint, (list, tuple)): - entrypoint = [entrypoint] - - run_kwargs = { - 'tty': False, - 'volumes': volumes, - 'detach': True - } - - # Allow run args to be overridden,filter out any we don't want to override - extra_run_kwargs = {k: v for k, v in kwargs.items() if k not in BLACKLISTED_DOCKER_RUN_ARGS} - run_kwargs.update(extra_run_kwargs) - - if entrypoint is not None: - run_kwargs['entrypoint'] = entrypoint - - container_args, read_streams, write_streams = _handle_streaming_args(container_args) - - for connector in stream_connectors: - if isinstance(connector, FDReadStreamConnector): - read_streams.append(connector) - elif isinstance(connector, FDWriteStreamConnector): - write_streams.append(connector) - else: - raise TypeError( - "Expected 'FDReadStreamConnector' or 'FDWriterStreamConnector', received '%s'" - % type(connector)) - - # We need to open any read streams before starting the container, so the - # underling named pipes are opened for read. - for stream in read_streams: - stream.open() - - container = _run_container(image, container_args, **run_kwargs) - try: - _run_select_loop(task, container, read_streams, write_streams) - finally: - if container and remove_container: - container.reload() - # If the container is still running issue a warning - if container.status == 'running': - logger.warning('Container is still running, unable to remove.') - else: - container.remove() - - # return an array of None's equal to number of entries in the girder_result_hooks - # header, in order to trigger processing of the container outputs. 
- results = [] - if hasattr(task.request, 'girder_result_hooks'): - results = (None,) * len(task.request.girder_result_hooks) - return results +# def _docker_run(task, image, pull_image=True, entrypoint=None, container_args=None, +# volumes=None, remove_container=True, stream_connectors=None, **kwargs): +# volumes = volumes or {} +# stream_connectors = stream_connectors or [] +# container_args = container_args or [] + +# if pull_image: +# logger.info('Pulling Docker image: %s', image) +# _pull_image(image) + +# if entrypoint is not None and not isinstance(entrypoint, (list, tuple)): +# entrypoint = [entrypoint] + +# run_kwargs = { +# 'tty': False, +# 'volumes': volumes, +# 'detach': True +# } + +# # Allow run args to be overridden,filter out any we don't want to override +# extra_run_kwargs = {k: v for k, v in kwargs.items() if k not in BLACKLISTED_DOCKER_RUN_ARGS} +# run_kwargs.update(extra_run_kwargs) + +# if entrypoint is not None: +# run_kwargs['entrypoint'] = entrypoint + +# container_args, read_streams, write_streams = _handle_streaming_args(container_args) + +# for connector in stream_connectors: +# if isinstance(connector, FDReadStreamConnector): +# read_streams.append(connector) +# elif isinstance(connector, FDWriteStreamConnector): +# write_streams.append(connector) +# else: +# raise TypeError( +# "Expected 'FDReadStreamConnector' or 'FDWriterStreamConnector', received '%s'" +# % type(connector)) + +# # We need to open any read streams before starting the container, so the +# # underling named pipes are opened for read. +# for stream in read_streams: +# stream.open() + +# container = _run_container(image, container_args, **run_kwargs) +# try: +# _run_select_loop(task, container, read_streams, write_streams) +# finally: +# if container and remove_container: +# container.reload() +# # If the container is still running issue a warning +# if container.status == 'running': +# logger.warning('Container is still running, unable to remove.') +# else: +# container.remove() + +# # return an array of None's equal to number of entries in the girder_result_hooks +# # header, in order to trigger processing of the container outputs. +# results = [] +# if hasattr(task.request, 'girder_result_hooks'): +# results = (None,) * len(task.request.girder_result_hooks) +# return results # @app.task(base=DockerTask, bind=True) # def docker_run(task, image, pull_image=True, entrypoint=None, container_args=None, @@ -491,22 +490,26 @@ def _docker_run(task, image, pull_image=True, entrypoint=None, container_args=No # task, image, pull_image, entrypoint, container_args, volumes, # remove_container, **kwargs) +#Chaging base temorarily to @app.task(base=SingularityTask, bind=True) def singularity_run(task, image, *args, container_args=None, bind_paths=None, **kwargs): return task(image,*args,container_args,bind_paths,**kwargs) #This function is used to check whether we need to switch to singularity or not. def use_singularity(): - runtime = os.environ.get('RUNTIME') - if runtime == 'SINGULARITY': - return True - if runtime == 'DOCKER': - return False - try: - #Check whether we are connected to a docker socket. - with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as s: - return s.connect_ex('/var/run/docker.sock') != 0 - except socket.error: + ''' + #This needs to be uncommented. Only for testing purposes. + ''' + # runtime = os.environ.get('RUNTIME') + # if runtime == 'SINGULARITY': + # return True + # if runtime == 'DOCKER': + # return False + # try: + # #Check whether we are connected to a docker socket. 
+ # with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as s: + # return s.connect_ex('/var/run/docker.sock') != 0 + # except socket.error: return True def _generate_slurm_script(image, container_args, bind_paths, qos, cpus, gpus, memory, other_slurm_options): From 0705809fa0275556ec0e1ea7029c2ff9fa6f446f Mon Sep 17 00:00:00 2001 From: suhasthegame <44828963+suhasthegame@users.noreply.github.com> Date: Mon, 19 Feb 2024 13:18:40 -0500 Subject: [PATCH 04/28] Added Dependencies --ignore --- requirements-dev.in | 1 + requirements.in | 2 ++ requirements.txt | 1 + 3 files changed, 4 insertions(+) diff --git a/requirements-dev.in b/requirements-dev.in index f27839a5..cbaa6de1 100644 --- a/requirements-dev.in +++ b/requirements-dev.in @@ -16,3 +16,4 @@ pytest-cov Sphinx sphinx-rtd-theme tox +drmaa diff --git a/requirements.in b/requirements.in index 4e7d178b..758468bc 100644 --- a/requirements.in +++ b/requirements.in @@ -11,3 +11,5 @@ stevedore jsonpickle girder_worker_utils>=0.8.4 docker>=2.6.0 +drmaa + diff --git a/requirements.txt b/requirements.txt index 2e9de961..4ec9fa12 100644 --- a/requirements.txt +++ b/requirements.txt @@ -381,6 +381,7 @@ zipp==3.17.0 # via # importlib-metadata # importlib-resources +drmaa # The following packages are considered to be unsafe in a requirements file: # setuptools From 9957574fd3f67a257c1fd4999263d38c28703cd9 Mon Sep 17 00:00:00 2001 From: suhasthegame <44828963+suhasthegame@users.noreply.github.com> Date: Wed, 21 Feb 2024 13:31:02 -0500 Subject: [PATCH 05/28] Ignore --all --- girder_worker/docker/tasks/__init__.py | 74 +++++++++++++++----------- 1 file changed, 43 insertions(+), 31 deletions(-) diff --git a/girder_worker/docker/tasks/__init__.py b/girder_worker/docker/tasks/__init__.py index 4862760d..c0069373 100644 --- a/girder_worker/docker/tasks/__init__.py +++ b/girder_worker/docker/tasks/__init__.py @@ -5,7 +5,7 @@ import sys import threading import time -import drmaa +import subprocess try: import docker @@ -517,7 +517,7 @@ def _generate_slurm_script(image, container_args, bind_paths, qos, cpus, gpus, m bind_option = ','.join([f"{host}:{container}" for host, container in bind_paths.items()]) # Construct the Singularity command - singularity_command = f'singularity exec --bind {bind_option} docker://{image} {" ".join(container_args)}' + singularity_command = f'singularity run --bind {bind_option} docker://{image} {" ".join(container_args)}' # Generate the SLURM job script with the specified parameters slurm_script = f"""#!/bin/bash @@ -530,39 +530,51 @@ def _generate_slurm_script(image, container_args, bind_paths, qos, cpus, gpus, m {singularity_command} """ - return slurm_script + return singularity_command def _monitor_singularity_job(task,slurm_script,log_file_path,temp_directory): """Create a drmaa session and monitor the job accordingly""" def job_monitor(): - s = drmaa.Session() - s.initialize() - jt = s.createJobTemplate() - jt.remoteCommand = '/bin/bash' - jt.args = ['-c', slurm_script] - jt.workingDirectory = temp_directory - jt.outputPath = ':' + log_file_path - jt.errorPath = ':' + log_file_path - jobid = s.runJob(jt) - logger.log((f'Submitted singularity job with jobid {jobid}')) - while True: - job_info = s.jobStatus(jobid) - if job_info in [drmaa.JobState.DONE, drmaa.JobState.FAILED]: - break - - # Check if the task has been aborted by the user - if task.is_aborted: - s.control(jobid, drmaa.JobControl.TERMINATE) - logger.info(f'Job {jobid} was cancelled by user.') - return JOB_STATUS.CANCELLED - - time.sleep(5) # Sleep to avoid busy 
waiting - - exit_status = s.wait(jobid, drmaa.Session.TIMEOUT_WAIT_FOREVER).exitStatus - logger.info(f'Job {jobid} finished with exit status {exit_status}') - - s.deleteJobTemplate(jt) - return JOB_STATUS.SUCCESS if exit_status == 0 else JOB_STATUS.FAILURE + # s = drmaa.Session() + # s.initialize() + # jt = s.createJobTemplate() + # jt.remoteCommand = '/bin/bash' + # jt.args = ['-c', slurm_script] + # jt.workingDirectory = temp_directory + # jt.outputPath = ':' + log_file_path + # jt.errorPath = ':' + log_file_path + # jobid = s.runJob(jt) + # logger.log((f'Submitted singularity job with jobid {jobid}')) + # while True: + # job_info = s.jobStatus(jobid) + # if job_info in [drmaa.JobState.DONE, drmaa.JobState.FAILED]: + # break + + # # Check if the task has been aborted by the user + # if task.is_aborted: + # s.control(jobid, drmaa.JobControl.TERMINATE) + # logger.info(f'Job {jobid} was cancelled by user.') + # return JOB_STATUS.CANCELLED + + # time.sleep(5) # Sleep to avoid busy waiting + + # exit_status = s.wait(jobid, drmaa.Session.TIMEOUT_WAIT_FOREVER).exitStatus + # logger.info(f'Job {jobid} finished with exit status {exit_status}') + + # s.deleteJobTemplate(jt) + # return JOB_STATUS.SUCCESS if exit_status == 0 else JOB_STATUS.FAILURE + ''' + THis is just for testing. Need to repalce with DRMAA + ''' + try: + res = subprocess.run(slurm_script,stdout=subprocess.PIPE,stderr=subprocess.PIPE, check=True) + if isinstance(res.stdout, bytes): + res = res.stdout.decode('utf-8') + logger.info(res) + except Exception as e: + logger.exception(f"Exception occured {e}") + + # Start the job monitor in a new thread monitor_thread = threading.Thread(target=job_monitor) From 5f2aac3f4dbf5dbdf3780af2d2a1d177de67714c Mon Sep 17 00:00:00 2001 From: suhasthegame <44828963+suhasthegame@users.noreply.github.com> Date: Tue, 27 Feb 2024 14:02:03 -0500 Subject: [PATCH 06/28] Added version number to girder worker --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 1628f3b8..f09b68ec 100644 --- a/setup.py +++ b/setup.py @@ -75,6 +75,7 @@ def run(self, *args, **kwargs): setuptools.setup( name='girder-worker', use_scm_version={'local_scheme': prerelease_local_scheme}, + version='0.12.1', setup_requires=['setuptools_scm'], description='Batch execution engine built on celery.', long_description=readme, From 03b371786416262c5f015c6a2cd221ec648c6c3f Mon Sep 17 00:00:00 2001 From: suhasthegame <44828963+suhasthegame@users.noreply.github.com> Date: Tue, 27 Feb 2024 14:28:37 -0500 Subject: [PATCH 07/28] Changed version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index f09b68ec..c124eb1a 100644 --- a/setup.py +++ b/setup.py @@ -74,7 +74,7 @@ def run(self, *args, **kwargs): # perform the install setuptools.setup( name='girder-worker', - use_scm_version={'local_scheme': prerelease_local_scheme}, + # use_scm_version={'local_scheme': prerelease_local_scheme}, version='0.12.1', setup_requires=['setuptools_scm'], description='Batch execution engine built on celery.', From ba0c9c37a33e49b04c029877356cb5e4bb473001 Mon Sep 17 00:00:00 2001 From: suhasthegame <44828963+suhasthegame@users.noreply.github.com> Date: Wed, 28 Feb 2024 14:42:57 -0500 Subject: [PATCH 08/28] Reverted version back to original --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index c124eb1a..75a63dde 100644 --- a/setup.py +++ b/setup.py @@ -74,8 +74,8 @@ def run(self, *args, **kwargs): # perform the install 
setuptools.setup( name='girder-worker', - # use_scm_version={'local_scheme': prerelease_local_scheme}, - version='0.12.1', + use_scm_version={'local_scheme': prerelease_local_scheme}, + # version='0.12.1', setup_requires=['setuptools_scm'], description='Batch execution engine built on celery.', long_description=readme, From 10a47404b2c779af8f13cdb59d56b35de14d020a Mon Sep 17 00:00:00 2001 From: "User pinaki.sarder-web" Date: Tue, 16 Apr 2024 16:52:42 -0400 Subject: [PATCH 09/28] Intermediate Changes Please ignore --- .gitignore | 3 + .../gwexample/analyses/tasks.py | 29 + girder_worker/docker/tasks/__init__.py | 825 ++++++++++-------- girder_worker/entrypoint.py | 21 +- setup.py | 2 +- 5 files changed, 508 insertions(+), 372 deletions(-) diff --git a/.gitignore b/.gitignore index 92a71b09..3d781a34 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,6 @@ __pycache__/ .tox/ .cache/ htmlcov/ +build/ +.eggs + diff --git a/examples/plugin_example/gwexample/analyses/tasks.py b/examples/plugin_example/gwexample/analyses/tasks.py index 4c7bfd86..486801f7 100644 --- a/examples/plugin_example/gwexample/analyses/tasks.py +++ b/examples/plugin_example/gwexample/analyses/tasks.py @@ -10,3 +10,32 @@ def fibonacci(n): if n == 1 or n == 2: return 1 return fibonacci(n-1) + fibonacci(n-2) + +@app.task +# @argument('image_name', 'slide_name', 'path') +def nuclei(image_name, slide_name, path): + "running nuclei" + print(app, '++++++++++') + if path: + print('using arg path !!') + os.chdir(path) + else: + print('using default path !!') + os.chdir('/home/rc-svc-pinaki.sarder-web/digital_slide_archive/devops/singularity-minimal') + print('Current Path => ', os.getcwd()) + path = os.getcwd() + flags = os.O_RDWR | os.O_CREAT + sif_image = os.open('sarderlab_histomicstk_latest.sif', flags) + sif_image_path = path + image_name if image_name else '/sarderlab_histomicstk_latest.sif' + slide_image = os.open(slide_name, flags) + slide_image_path = path + slide_name if slide_name else '18-142_PAS_1of6.svs' + output = os.open('Nuclei-outputNucleiAnnotationFile.anot', flags) + output_path = path + '/Nuclei-outputNucleiAnnotationFile.anot' + run_container = f'apptainer run --pwd /HistomicsTK/histomicstk/cli {sif_image} NucleiDetection {slide_image} {output}' + try: + res = subprocess.call(f'apptainer run --pwd /HistomicsTK/histomicstk/cli {sif_image_path} NucleiDetection {slide_image_path} {output_path}', shell=True, bufsize=0,stdin=subprocess.PIPE,stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding="UTF8") + print(res, '----1') + + except Exception as e: + print(f"Exception occured {e}") + diff --git a/girder_worker/docker/tasks/__init__.py b/girder_worker/docker/tasks/__init__.py index c0069373..f60fe469 100644 --- a/girder_worker/docker/tasks/__init__.py +++ b/girder_worker/docker/tasks/__init__.py @@ -17,7 +17,7 @@ pass from girder_worker.app import app, Task from girder_worker import logger -from girder_worker.docker import utils +from girder_worker import utils from girder_worker.docker.stream_adapter import DockerStreamPushAdapter from girder_worker.docker.io import ( FileDescriptorReader, @@ -26,6 +26,8 @@ FDStreamConnector, StdStreamWriter ) +from slicer_cli_web.singularity.utils import switch_to_sif_image_folder +from slicer_cli_web.singularity.job import _get_last_workdir from girder_worker.docker.transforms import ( ContainerStdErr, ContainerStdOut, @@ -36,267 +38,276 @@ BLACKLISTED_DOCKER_RUN_ARGS = ['tty', 'detach'] -JOB_STATUS = utils.job_status_codes() - - -# def _pull_image(image): -# """ -# Pulls 
the specified Docker image onto this worker. -# """ -# client = docker.from_env(version='auto') -# try: -# client.images.pull(image) -# except DockerException: -# logger.exception('Error pulling Docker image %s:' % image) -# raise - - -# def _get_docker_network(): -# try: -# ip = socket.gethostbyname(socket.gethostname()) -# if 'DOCKER_CLIENT_TIMEOUT' in os.environ: -# timeout = int(os.environ['DOCKER_CLIENT_TIMEOUT']) -# client = docker.from_env(version='auto', timeout=timeout) -# else: -# client = docker.from_env(version='auto') -# for container in client.containers.list(all=True, filters={'status': 'running'}): -# for nw in container.attrs['NetworkSettings']['Networks'].values(): -# if nw['IPAddress'] == ip: -# return 'container:%s' % container.id -# except Exception: -# logger.exception('Failed to get docker network') - - -# def _remove_stopped_container(client, name): -# if name is None: -# return -# for container in client.containers.list(all=True, filters={'name': name}): -# try: -# logger.info('Removing container %s ' % (name)) -# container.remove() -# except Exception: -# pass - - -# def _run_container(image, container_args, **kwargs): -# # TODO we could allow configuration of non default socket -# if 'DOCKER_CLIENT_TIMEOUT' in os.environ: -# timeout = int(os.environ['DOCKER_CLIENT_TIMEOUT']) -# client = docker.from_env(version='auto', timeout=timeout) -# else: -# client = docker.from_env(version='auto') - -# runtime = kwargs.pop('runtime', None) -# origRuntime = runtime -# if runtime is None and nvidia.is_nvidia_image(client.api, image): -# runtime = 'nvidia' - -# container_args = [str(arg) for arg in container_args] - -# if 'network' not in kwargs and 'network_mode' not in kwargs: -# docker_network = _get_docker_network() -# if docker_network: -# kwargs = kwargs.copy() -# kwargs['network'] = docker_network - -# logger.info('Running container: image: %s args: %s runtime: %s kwargs: %s' -# % (image, container_args, runtime, kwargs)) -# try: -# name = None -# try: -# if runtime == 'nvidia' and kwargs.get('device_requests') is None: -# # Docker < 19.03 required the runtime='nvidia' argument. -# # Newer versions require a device request for some number of -# # GPUs. This should handle either version of the docker -# # daemon. 
-# try: -# device_requests_kwargs = kwargs.copy() -# device_requests_kwargs['device_requests'] = [ -# docker.types.DeviceRequest(count=-1, capabilities=[['gpu']])] -# name = device_requests_kwargs.setdefault( -# 'name', -# 'girder_worker_' + datetime.datetime.now().strftime('%Y%m%d_%H%M%S_%f')) -# return client.containers.run( -# image, container_args, **device_requests_kwargs) -# except (APIError, InvalidVersion): -# _remove_stopped_container(client, name) -# pass -# kwargs = kwargs.copy() -# name = kwargs.setdefault( -# 'name', -# 'girder_worker_' + datetime.datetime.now().strftime('%Y%m%d_%H%M%S_%f')) -# return client.containers.run( -# image, container_args, runtime=runtime, **kwargs) -# except APIError: -# _remove_stopped_container(client, name) -# if origRuntime is None and runtime is not None: -# kwargs = kwargs.copy() -# name = kwargs.setdefault( -# 'name', -# 'girder_worker_' + datetime.datetime.now().strftime('%Y%m%d_%H%M%S_%f')) -# return client.containers.run(image, container_args, **kwargs) -# else: -# raise -# except DockerException: -# logger.exception('Exception when running docker container') -# raise - - -# class _SocketReader(FileDescriptorReader): -# """ -# Used to mediate the difference between the python 2/3 implementation of docker-py -# with python 2 attach_socket(...) returns a socket like object, with python 3 -# it returns an instance of SocketIO. -# """ -# def __init__(self, socket): -# self._socket = socket - -# def read(self, n): -# # socket -# if hasattr(self._socket, 'recv'): -# return self._socket.recv(n) - -# # SocketIO -# return self._socket.read(n) - -# def fileno(self): -# return self._socket.fileno() - -# def close(self): -# self._socket.close() - - -# def _run_select_loop( # noqa: C901 -# task, container, read_stream_connectors, write_stream_connectors): -# stdout = None -# stderr = None -# try: -# # attach to standard streams -# stdout = container.attach_socket(params={ -# 'stdout': True, -# 'logs': True, -# 'stream': True -# }) - -# stderr = container.attach_socket(params={ -# 'stderr': True, -# 'logs': True, -# 'stream': True -# }) - -# def exit_condition(): -# container.reload() -# return container.status in {'exited', 'dead'} or task.canceled - -# # Look for ContainerStdOut and ContainerStdErr instances that need -# # to be replace with the real container streams. 
-# stdout_connected = False -# for read_stream_connector in read_stream_connectors: -# if isinstance(read_stream_connector.input, ContainerStdOut): -# stdout_reader = _SocketReader(stdout) -# read_stream_connector.output = DockerStreamPushAdapter(read_stream_connector.output) -# read_stream_connector.input = stdout_reader -# stdout_connected = True -# break - -# stderr_connected = False -# for read_stream_connector in read_stream_connectors: -# if isinstance(read_stream_connector.input, ContainerStdErr): -# stderr_reader = _SocketReader(stderr) -# read_stream_connector.output = DockerStreamPushAdapter(read_stream_connector.output) -# read_stream_connector.input = stderr_reader -# stderr_connected = True -# break - -# # If not stdout and stderr connection has been provided just use -# # sys.stdXXX -# if not stdout_connected: -# stdout_reader = _SocketReader(stdout) -# connector = FDReadStreamConnector( -# stdout_reader, -# DockerStreamPushAdapter(StdStreamWriter(sys.stdout))) -# read_stream_connectors.append(connector) - -# if not stderr_connected: -# stderr_reader = _SocketReader(stderr) -# connector = FDReadStreamConnector( -# stderr_reader, -# DockerStreamPushAdapter(StdStreamWriter(sys.stderr))) -# read_stream_connectors.append(connector) - -# # Run select loop -# utils.select_loop(exit_condition=exit_condition, -# readers=read_stream_connectors, -# writers=write_stream_connectors) - -# if task.canceled: -# try: -# msg = 'Asking to stop container: %s' % container.id -# logger.info(msg) -# container.stop() -# # Catch the ReadTimeout from requests and wait for container to -# # exit. See https://github.com/docker/docker-py/issues/1374 for -# # more details. -# except ReadTimeout: -# tries = 10 -# while tries > 0: -# container.reload() -# if container.status == 'exited': -# break - -# if container.status != 'exited': -# msg = 'Unable to stop container: %s' % container.id -# logger.error(msg) -# except DockerException as dex: -# logger.error(dex) -# raise - -# container.reload() -# exit_code = container.attrs['State']['ExitCode'] -# if not task.canceled and exit_code != 0: -# raise DockerException('Non-zero exit code from docker container (%d).' % exit_code) -# finally: -# # Close our stdout and stderr sockets -# if stdout: -# stdout.close() -# if stderr: -# stderr.close() - - -# def _handle_streaming_args(args): -# processed_args = [] -# write_streams = [] -# read_streams = [] - -# def _maybe_path(arg): -# if hasattr(arg, 'path'): -# return arg.path() - -# # Don't pass anything -# return '' - -# for arg in args: -# if isinstance(arg, FDStreamConnector): -# if isinstance(arg, FDWriteStreamConnector): -# write_streams.append(arg) -# arg = _maybe_path(arg.output) - -# elif isinstance(arg, FDReadStreamConnector): -# read_streams.append(arg) -# arg = _maybe_path(arg.input) - -# processed_args.append(arg) - -# return (processed_args, read_streams, write_streams) - - -# class _RequestDefaultTemporaryVolume(_TemporaryVolumeBase): -# def __init__(self): -# super().__init__(None, None) -# self._make_paths() - -# def transform(self, **kwargs): -# self._transformed = True + + +# JOB_STATUS = utils.job_status_codes() + + +JOB_STATUS = { + 'SUCCESS': 'Success', + 'FAILURE': "Failure", + 'CANCELLED': 'Cancelled' + } + + +def _pull_image(image): + """ + Pulls the specified Docker image onto this worker. 
+ """ + client = docker.from_env(version='auto') + try: + client.images.pull(image) + except DockerException: + logger.exception('Error pulling Docker image %s:' % image) + raise + + +def _get_docker_network(): + try: + ip = socket.gethostbyname(socket.gethostname()) + if 'DOCKER_CLIENT_TIMEOUT' in os.environ: + timeout = int(os.environ['DOCKER_CLIENT_TIMEOUT']) + client = docker.from_env(version='auto', timeout=timeout) + else: + client = docker.from_env(version='auto') + for container in client.containers.list(all=True, filters={'status': 'running'}): + for nw in container.attrs['NetworkSettings']['Networks'].values(): + if nw['IPAddress'] == ip: + return 'container:%s' % container.id + except Exception: + logger.exception('Failed to get docker network') + + +def _remove_stopped_container(client, name): + if name is None: + return + for container in client.containers.list(all=True, filters={'name': name}): + try: + logger.info('Removing container %s ' % (name)) + container.remove() + except Exception: + pass + + +def _run_container(image, container_args, **kwargs): + # TODO we could allow configuration of non default socket + if 'DOCKER_CLIENT_TIMEOUT' in os.environ: + timeout = int(os.environ['DOCKER_CLIENT_TIMEOUT']) + client = docker.from_env(version='auto', timeout=timeout) + else: + client = docker.from_env(version='auto') + + runtime = kwargs.pop('runtime', None) + origRuntime = runtime + if runtime is None and nvidia.is_nvidia_image(client.api, image): + runtime = 'nvidia' + + container_args = [str(arg) for arg in container_args] + + if 'network' not in kwargs and 'network_mode' not in kwargs: + docker_network = _get_docker_network() + if docker_network: + kwargs = kwargs.copy() + kwargs['network'] = docker_network + + logger.info('Running container: image: %s args: %s runtime: %s kwargs: %s' + % (image, container_args, runtime, kwargs)) + try: + name = None + try: + if runtime == 'nvidia' and kwargs.get('device_requests') is None: + # Docker < 19.03 required the runtime='nvidia' argument. + # Newer versions require a device request for some number of + # GPUs. This should handle either version of the docker + # daemon. + try: + device_requests_kwargs = kwargs.copy() + device_requests_kwargs['device_requests'] = [ + docker.types.DeviceRequest(count=-1, capabilities=[['gpu']])] + name = device_requests_kwargs.setdefault( + 'name', + 'girder_worker_' + datetime.datetime.now().strftime('%Y%m%d_%H%M%S_%f')) + return client.containers.run( + image, container_args, **device_requests_kwargs) + except (APIError, InvalidVersion): + _remove_stopped_container(client, name) + pass + kwargs = kwargs.copy() + name = kwargs.setdefault( + 'name', + 'girder_worker_' + datetime.datetime.now().strftime('%Y%m%d_%H%M%S_%f')) + return client.containers.run( + image, container_args, runtime=runtime, **kwargs) + except APIError: + _remove_stopped_container(client, name) + if origRuntime is None and runtime is not None: + kwargs = kwargs.copy() + name = kwargs.setdefault( + 'name', + 'girder_worker_' + datetime.datetime.now().strftime('%Y%m%d_%H%M%S_%f')) + return client.containers.run(image, container_args, **kwargs) + else: + raise + except DockerException: + logger.exception('Exception when running docker container') + raise + + +class _SocketReader(FileDescriptorReader): + """ + Used to mediate the difference between the python 2/3 implementation of docker-py + with python 2 attach_socket(...) returns a socket like object, with python 3 + it returns an instance of SocketIO. 
+ """ + def __init__(self, socket): + self._socket = socket + + def read(self, n): + # socket + if hasattr(self._socket, 'recv'): + return self._socket.recv(n) + + # SocketIO + return self._socket.read(n) + + def fileno(self): + return self._socket.fileno() + + def close(self): + self._socket.close() + + +def _run_select_loop( # noqa: C901 + task, container, read_stream_connectors, write_stream_connectors): + stdout = None + stderr = None + try: + # attach to standard streams + stdout = container.attach_socket(params={ + 'stdout': True, + 'logs': True, + 'stream': True + }) + + stderr = container.attach_socket(params={ + 'stderr': True, + 'logs': True, + 'stream': True + }) + + def exit_condition(): + container.reload() + return container.status in {'exited', 'dead'} or task.canceled + + # Look for ContainerStdOut and ContainerStdErr instances that need + # to be replace with the real container streams. + stdout_connected = False + for read_stream_connector in read_stream_connectors: + if isinstance(read_stream_connector.input, ContainerStdOut): + stdout_reader = _SocketReader(stdout) + read_stream_connector.output = DockerStreamPushAdapter(read_stream_connector.output) + read_stream_connector.input = stdout_reader + stdout_connected = True + break + + stderr_connected = False + for read_stream_connector in read_stream_connectors: + if isinstance(read_stream_connector.input, ContainerStdErr): + stderr_reader = _SocketReader(stderr) + read_stream_connector.output = DockerStreamPushAdapter(read_stream_connector.output) + read_stream_connector.input = stderr_reader + stderr_connected = True + break + + # If not stdout and stderr connection has been provided just use + # sys.stdXXX + if not stdout_connected: + stdout_reader = _SocketReader(stdout) + connector = FDReadStreamConnector( + stdout_reader, + DockerStreamPushAdapter(StdStreamWriter(sys.stdout))) + read_stream_connectors.append(connector) + + if not stderr_connected: + stderr_reader = _SocketReader(stderr) + connector = FDReadStreamConnector( + stderr_reader, + DockerStreamPushAdapter(StdStreamWriter(sys.stderr))) + read_stream_connectors.append(connector) + + # Run select loop + utils.select_loop(exit_condition=exit_condition, + readers=read_stream_connectors, + writers=write_stream_connectors) + + if task.canceled: + try: + msg = 'Asking to stop container: %s' % container.id + logger.info(msg) + container.stop() + # Catch the ReadTimeout from requests and wait for container to + # exit. See https://github.com/docker/docker-py/issues/1374 for + # more details. + except ReadTimeout: + tries = 10 + while tries > 0: + container.reload() + if container.status == 'exited': + break + + if container.status != 'exited': + msg = 'Unable to stop container: %s' % container.id + logger.error(msg) + except DockerException as dex: + logger.error(dex) + raise + + container.reload() + exit_code = container.attrs['State']['ExitCode'] + if not task.canceled and exit_code != 0: + raise DockerException('Non-zero exit code from docker container (%d).' 
% exit_code) + finally: + # Close our stdout and stderr sockets + if stdout: + stdout.close() + if stderr: + stderr.close() + + +def _handle_streaming_args(args): + processed_args = [] + write_streams = [] + read_streams = [] + + def _maybe_path(arg): + if hasattr(arg, 'path'): + return arg.path() + + # Don't pass anything + return '' + + for arg in args: + if isinstance(arg, FDStreamConnector): + if isinstance(arg, FDWriteStreamConnector): + write_streams.append(arg) + arg = _maybe_path(arg.output) + + elif isinstance(arg, FDReadStreamConnector): + read_streams.append(arg) + arg = _maybe_path(arg.input) + + processed_args.append(arg) + + return (processed_args, read_streams, write_streams) + + +class _RequestDefaultTemporaryVolume(_TemporaryVolumeBase): + def __init__(self): + super().__init__(None, None) + self._make_paths() + + def transform(self, **kwargs): + self._transformed = True class DockerTask(Task): @@ -363,22 +374,158 @@ def _cleanup_temp_volumes(self, temp_volumes, default_temp_volume): shutil.rmtree(v.host_path) + + + +def _docker_run(task, image, pull_image=True, entrypoint=None, container_args=None, + volumes=None, remove_container=True, stream_connectors=None, **kwargs): + volumes = volumes or {} + stream_connectors = stream_connectors or [] + container_args = container_args or [] + + if pull_image: + logger.info('Pulling Docker image: %s', image) + _pull_image(image) + + if entrypoint is not None and not isinstance(entrypoint, (list, tuple)): + entrypoint = [entrypoint] + + run_kwargs = { + 'tty': False, + 'volumes': volumes, + 'detach': True + } + + # Allow run args to be overridden,filter out any we don't want to override + extra_run_kwargs = {k: v for k, v in kwargs.items() if k not in BLACKLISTED_DOCKER_RUN_ARGS} + run_kwargs.update(extra_run_kwargs) + + if entrypoint is not None: + run_kwargs['entrypoint'] = entrypoint + + container_args, read_streams, write_streams = _handle_streaming_args(container_args) + + for connector in stream_connectors: + if isinstance(connector, FDReadStreamConnector): + read_streams.append(connector) + elif isinstance(connector, FDWriteStreamConnector): + write_streams.append(connector) + else: + raise TypeError( + "Expected 'FDReadStreamConnector' or 'FDWriterStreamConnector', received '%s'" + % type(connector)) + + # We need to open any read streams before starting the container, so the + # underling named pipes are opened for read. + for stream in read_streams: + stream.open() + + container = _run_container(image, container_args, **run_kwargs) + try: + _run_select_loop(task, container, read_streams, write_streams) + finally: + if container and remove_container: + container.reload() + # If the container is still running issue a warning + if container.status == 'running': + logger.warning('Container is still running, unable to remove.') + else: + container.remove() + + # return an array of None's equal to number of entries in the girder_result_hooks + # header, in order to trigger processing of the container outputs. + results = [] + if hasattr(task.request, 'girder_result_hooks'): + results = (None,) * len(task.request.girder_result_hooks) + return results + +@app.task(base=DockerTask, bind=True) +def docker_run(task, image, pull_image=True, entrypoint=None, container_args=None, + volumes=None, remove_container=True, **kwargs): + """ + This task runs a docker container. For details on how to use this task, see the + :ref:`docker-run` guide. + + :param task: The bound task reference. 
+ :type task: :py:class:`girder_worker.task.Task` + :param image: The docker image identifier. + :type image: str + :param pull_image: Whether to explicitly pull the image prior to running the container. + :type pull_image: bool + :param entrypoint: Alternative entrypoint to use when running the container. + :type entrypoint: str + :param container_args: Arguments to pass to the container. + :type container_args: list + :param volumes: Volumes to expose to the container. + :type volumes: dict + :param remove_container: Whether to delete the container after the task is done. + :type remove_container: bool + :return: Fulfilled result hooks. + :rtype: list + """ + return _docker_run( + task, image, pull_image, entrypoint, container_args, volumes, + remove_container, **kwargs) + + #Class for SingularityTask similar to DockerTask class SingularityTask(Task): - def __call__(self, image,*args,container_args,bind_paths,**kwargs): - image = image or kwargs.pop('image',None) + def _maybe_transform_argument(self, arg): + return super()._maybe_transform_argument( + arg, task=self, _default_temp_volume=self.request._default_temp_volume) + + def _maybe_transform_result(self, idx, result): + return super()._maybe_transform_result( + idx, result, _default_temp_volume=self.request._default_temp_volume) + + def __call__(self,container_args=None,bind_paths=None,**kwargs): + #image = image or kwargs.pop('image',None) + default_temp_volume = _RequestDefaultTemporaryVolume() + self.request._default_temp_volume = default_temp_volume + + volumes = kwargs.setdefault('volumes', {}) + # If we have a list of volumes, the user provide a list of Volume objects, + # we need to transform them. + temp_volumes = [] + if isinstance(volumes, list): + # See if we have been passed any TemporaryVolume instances. + for v in volumes: + if isinstance(v, TemporaryVolume): + temp_volumes.append(v) + + # First call the transform method, this we replace default temp volumes + # with the instance associated with this task create above. That is any + # reference to TemporaryVolume.default + _walk_obj(volumes, self._maybe_transform_argument) + + # Now convert them to JSON + def _json(volume): + return volume._repr_json_() + + volumes = _walk_obj(volumes, _json) + # We then need to merge them into a single dict and it will be ready + # for docker-py. + volumes = {k: v for volume in volumes for k, v in volume.items()} + kwargs['volumes'] = volumes + + volumes.update(default_temp_volume._repr_json_()) + + + + #image = kwargs.pop('image','suhaskc_histo-cloud_segmentation.sif') + image = 'suhaskc_histo-cloud_segmentation.sif' container_args = container_args or kwargs.pop('container_args',[]) bind_paths = bind_paths or kwargs.pop('bind_paths',{}) - temporary_directory = os.getenv('TEMPORARY_DIRECTORY','/tmp') + temporary_directory = os.getenv('TEMPORARY_DIRECTORY','./tmp') log_file_path = os.getenv('SINGULARITY_LOG_FILE','/log') #check if the user has provided the image of the plugin. 
if not image: raise ValueError('Plugin Image required for Singularity') - logger.write(f'Image {image}') + logger.info(f'Image {image}') #Commnad to be called for executing Singularity Job bind_paths[temporary_directory] = '/output' - super().__call__(*args,**kwargs) + #super().__call__(*args,**kwargs) qos = kwargs.pop('qos', 'pinaki.sarder') cpus = kwargs.pop('cpus', 4) @@ -387,113 +534,22 @@ def __call__(self, image,*args,container_args,bind_paths,**kwargs): other_slurm_options = kwargs.pop('other_slurm_options', '') slurm_script = _generate_slurm_script(image, container_args, bind_paths, qos, cpus, gpus, memory, other_slurm_options) - + print(f"{slurm_script}") exit_status = _monitor_singularity_job(self,slurm_script=slurm_script, log_file_path=log_file_path,temp_directory=temporary_directory) #Handling exit status based on the DRM package's expected exit status codes - if exit_status == JOB_STATUS.SUCCESS: + if exit_status == JOB_STATUS['SUCCESS']: logger.info(f"Singularity job completed Successfully.") - elif exit_status == JOB_STATUS.FAILURE: + return JOB_STATUS['SUCCESS'] + elif exit_status == JOB_STATUS['FAILURE']: logger.error(f"Singularity Job exited with error") - elif exit_status == JOB_STATUS.CANCELLED: + elif exit_status == JOB_STATUS['CANCELLED']: logger.info('Singularity Job cancelled by the user') - - - -# def _docker_run(task, image, pull_image=True, entrypoint=None, container_args=None, -# volumes=None, remove_container=True, stream_connectors=None, **kwargs): -# volumes = volumes or {} -# stream_connectors = stream_connectors or [] -# container_args = container_args or [] - -# if pull_image: -# logger.info('Pulling Docker image: %s', image) -# _pull_image(image) - -# if entrypoint is not None and not isinstance(entrypoint, (list, tuple)): -# entrypoint = [entrypoint] - -# run_kwargs = { -# 'tty': False, -# 'volumes': volumes, -# 'detach': True -# } - -# # Allow run args to be overridden,filter out any we don't want to override -# extra_run_kwargs = {k: v for k, v in kwargs.items() if k not in BLACKLISTED_DOCKER_RUN_ARGS} -# run_kwargs.update(extra_run_kwargs) - -# if entrypoint is not None: -# run_kwargs['entrypoint'] = entrypoint - -# container_args, read_streams, write_streams = _handle_streaming_args(container_args) - -# for connector in stream_connectors: -# if isinstance(connector, FDReadStreamConnector): -# read_streams.append(connector) -# elif isinstance(connector, FDWriteStreamConnector): -# write_streams.append(connector) -# else: -# raise TypeError( -# "Expected 'FDReadStreamConnector' or 'FDWriterStreamConnector', received '%s'" -# % type(connector)) - -# # We need to open any read streams before starting the container, so the -# # underling named pipes are opened for read. -# for stream in read_streams: -# stream.open() - -# container = _run_container(image, container_args, **run_kwargs) -# try: -# _run_select_loop(task, container, read_streams, write_streams) -# finally: -# if container and remove_container: -# container.reload() -# # If the container is still running issue a warning -# if container.status == 'running': -# logger.warning('Container is still running, unable to remove.') -# else: -# container.remove() - -# # return an array of None's equal to number of entries in the girder_result_hooks -# # header, in order to trigger processing of the container outputs. 
-# results = [] -# if hasattr(task.request, 'girder_result_hooks'): -# results = (None,) * len(task.request.girder_result_hooks) -# return results - -# @app.task(base=DockerTask, bind=True) -# def docker_run(task, image, pull_image=True, entrypoint=None, container_args=None, -# volumes=None, remove_container=True, **kwargs): -# """ -# This task runs a docker container. For details on how to use this task, see the -# :ref:`docker-run` guide. - -# :param task: The bound task reference. -# :type task: :py:class:`girder_worker.task.Task` -# :param image: The docker image identifier. -# :type image: str -# :param pull_image: Whether to explicitly pull the image prior to running the container. -# :type pull_image: bool -# :param entrypoint: Alternative entrypoint to use when running the container. -# :type entrypoint: str -# :param container_args: Arguments to pass to the container. -# :type container_args: list -# :param volumes: Volumes to expose to the container. -# :type volumes: dict -# :param remove_container: Whether to delete the container after the task is done. -# :type remove_container: bool -# :return: Fulfilled result hooks. -# :rtype: list -# """ -# return _docker_run( -# task, image, pull_image, entrypoint, container_args, volumes, -# remove_container, **kwargs) #Chaging base temorarily to @app.task(base=SingularityTask, bind=True) -def singularity_run(task, image, *args, container_args=None, bind_paths=None, **kwargs): - return task(image,*args,container_args,bind_paths,**kwargs) +def singularity_run(task, container_args=None, bind_paths=None, **kwargs): + return task(container_args,bind_paths,**kwargs) #This function is used to check whether we need to switch to singularity or not. def use_singularity(): @@ -515,10 +571,37 @@ def use_singularity(): def _generate_slurm_script(image, container_args, bind_paths, qos, cpus, gpus, memory, other_slurm_options): # Construct the bind option for the Singularity command bind_option = ','.join([f"{host}:{container}" for host, container in bind_paths.items()]) - + logger.info(f'{container_args}') # Construct the Singularity command - singularity_command = f'singularity run --bind {bind_option} docker://{image} {" ".join(container_args)}' - + pwd = _get_last_workdir('suhaskc/histo-cloud:segmentation') + #container_args = str(container_args).replace('[','').replace(']','').replace(',', '').replace('<','').replace('>','') + container_args = ["SegmentWSI", + "--batch_size", + "1", + "--gpu", + "0", + "--heatmap_stride", + "2", + "--min_size", + "2000", + "--patch_size", + "2000", + "--remove_border", + "100", + "--save_heatmap", + "false", + "--simplify_contours", + "0.005", + "--tile_stride", + "1000", + "--wsi_downsample", + "2", + "/home/rc-svc-pinaki.sarder-web/digital_slide_archive/devops/singularity-minimal/tmp/18-142_PAS_1of6.svs", + "/home/rc-svc-pinaki.sarder-web/digital_slide_archive/devops/singularity-minimal/tmp/model-Glomeruli-11-13-20.zip", + "/home/rc-svc-pinaki.sarder-web/digital_slide_archive/devops/singularity-minimal/tmp/outputAnnotationFile.anot"] + #singularity_command = f'apptainer run --pwd {pwd} ./{image} {" ".join(container_args)}' + singularity_command = ['apptainer','run','--pwd',pwd,'--no-mount', '/cmsuf', image] + singularity_command.extend(container_args) # Generate the SLURM job script with the specified parameters slurm_script = f"""#!/bin/bash #SBATCH --qos={qos} @@ -567,10 +650,14 @@ def job_monitor(): THis is just for testing. 
Need to repalce with DRMAA ''' try: + switch_to_sif_image_folder() + logger.info(f"The current working direcrtory is {os.getcwd()}") + logger.info(f"Directory contents {os.listdir()}") res = subprocess.run(slurm_script,stdout=subprocess.PIPE,stderr=subprocess.PIPE, check=True) if isinstance(res.stdout, bytes): res = res.stdout.decode('utf-8') logger.info(res) + return JOB_STATUS['SUCCESS'] except Exception as e: logger.exception(f"Exception occured {e}") @@ -581,4 +668,4 @@ def job_monitor(): monitor_thread.start() monitor_thread.join() - return job_monitor() \ No newline at end of file + return job_monitor() diff --git a/girder_worker/entrypoint.py b/girder_worker/entrypoint.py index 287a5a03..7ea59b98 100644 --- a/girder_worker/entrypoint.py +++ b/girder_worker/entrypoint.py @@ -1,9 +1,12 @@ from importlib import import_module import celery from girder_worker_utils import decorators -from girder_worker.docker.tasks import use_singularity +# from girder_worker.docker.tasks import use_singularity from stevedore import extension +#Delete after testing +from girder_jobs.models.job import Job + #: Defines the namespace used for plugin entrypoints NAMESPACE = 'girder_worker_plugins' @@ -58,16 +61,30 @@ def get_module_tasks(module_name): for name, func in vars(module).items(): full_name = '%s.%s' % (module_name, name) + #Just for debugging + job = Job().updateJob( + job, + log=f"The fullname of function is {full_name} and func is {func}", + status="Error", + ) if not hasattr(func, '__call__'): # filter out objects that are not callable continue + # if name != 'singularity_run' or name != 'run': + # continue # if (use_singularity() and name == 'docker_run') or (not use_singularity() and name == 'singularity_run'): # continue try: decorators.get_description_attribute(func) tasks[full_name] = func except decorators.MissingDescriptionException: - pass + #Just for testing + job = Job().updateJob( + job, + log=f"The fullname of function is {full_name} and func is {func}", + status="Error", + ) + #pass return tasks diff --git a/setup.py b/setup.py index 75a63dde..f09b68ec 100644 --- a/setup.py +++ b/setup.py @@ -75,7 +75,7 @@ def run(self, *args, **kwargs): setuptools.setup( name='girder-worker', use_scm_version={'local_scheme': prerelease_local_scheme}, - # version='0.12.1', + version='0.12.1', setup_requires=['setuptools_scm'], description='Batch execution engine built on celery.', long_description=readme, From b1889c17eaff66291f1d1bcdfe5fe97766d867f5 Mon Sep 17 00:00:00 2001 From: "User pinaki.sarder-web" Date: Tue, 30 Apr 2024 14:45:30 -0400 Subject: [PATCH 10/28] Temporary commit --- girder_worker/docker/tasks/__init__.py | 206 ++++++++++--------------- setup.py | 21 +-- 2 files changed, 95 insertions(+), 132 deletions(-) diff --git a/girder_worker/docker/tasks/__init__.py b/girder_worker/docker/tasks/__init__.py index f60fe469..cc8fbb76 100644 --- a/girder_worker/docker/tasks/__init__.py +++ b/girder_worker/docker/tasks/__init__.py @@ -6,6 +6,7 @@ import threading import time import subprocess +import drmaa try: import docker @@ -478,8 +479,12 @@ def _maybe_transform_result(self, idx, result): return super()._maybe_transform_result( idx, result, _default_temp_volume=self.request._default_temp_volume) - def __call__(self,container_args=None,bind_paths=None,**kwargs): + def __call__(self,*args,container_args=None,bind_paths=None,**kwargs): #image = image or kwargs.pop('image',None) + container_args = container_args or kwargs.pop('container_args') or [] + container_args, read_streams, write_streams 
= _handle_streaming_args(container_args) + + logger.info(f'Container Args = {container_args}' ) default_temp_volume = _RequestDefaultTemporaryVolume() self.request._default_temp_volume = default_temp_volume @@ -510,46 +515,34 @@ def _json(volume): volumes.update(default_temp_volume._repr_json_()) - - - #image = kwargs.pop('image','suhaskc_histo-cloud_segmentation.sif') - image = 'suhaskc_histo-cloud_segmentation.sif' - container_args = container_args or kwargs.pop('container_args',[]) - bind_paths = bind_paths or kwargs.pop('bind_paths',{}) - temporary_directory = os.getenv('TEMPORARY_DIRECTORY','./tmp') - log_file_path = os.getenv('SINGULARITY_LOG_FILE','/log') - - #check if the user has provided the image of the plugin. - if not image: - raise ValueError('Plugin Image required for Singularity') - logger.info(f'Image {image}') - #Commnad to be called for executing Singularity Job - bind_paths[temporary_directory] = '/output' - #super().__call__(*args,**kwargs) + #Add Image checking later qos = kwargs.pop('qos', 'pinaki.sarder') cpus = kwargs.pop('cpus', 4) gpus = kwargs.pop('gpus', 1) memory = kwargs.pop('memory', '4GB') other_slurm_options = kwargs.pop('other_slurm_options', '') - - slurm_script = _generate_slurm_script(image, container_args, bind_paths, qos, cpus, gpus, memory, other_slurm_options) - print(f"{slurm_script}") - exit_status = _monitor_singularity_job(self,slurm_script=slurm_script, log_file_path=log_file_path,temp_directory=temporary_directory) + temp_directory = f"/blue/pinaki.sarder/rc-svc-pinaki.sarder-web/tmp" + sif_directory = f"/blue/pinaki.sarder/rc-svc-pinaki.sarder-web/SIF" + log_file_path = f"/blue/pinaki.sarder/rc-svc-pinaki.sarder-web/tmp" + # slurm_script = _generate_slurm_script(container_args, bind_paths, qos, cpus, gpus, memory, other_slurm_options) + # exit_status = _monitor_singularity_job(self,slurm_script=slurm_script, log_file_path=log_file_path,temp_directory=temp_directory,sif_directory=sif_directory) #Handling exit status based on the DRM package's expected exit status codes - - if exit_status == JOB_STATUS['SUCCESS']: - logger.info(f"Singularity job completed Successfully.") - return JOB_STATUS['SUCCESS'] - elif exit_status == JOB_STATUS['FAILURE']: - logger.error(f"Singularity Job exited with error") - elif exit_status == JOB_STATUS['CANCELLED']: - logger.info('Singularity Job cancelled by the user') + # exit_status = JOB_STATUS['SUCCESS'] + # if exit_status == JOB_STATUS['SUCCESS']: + # logger.info(f"Singularity job completed Successfully.") + # elif exit_status == JOB_STATUS['FAILURE']: + # logger.error(f"Singularity Job exited with error") + # elif exit_status == JOB_STATUS['CANCELLED']: + # logger.info('Singularity Job cancelled by the user') #Chaging base temorarily to @app.task(base=SingularityTask, bind=True) -def singularity_run(task, container_args=None, bind_paths=None, **kwargs): - return task(container_args,bind_paths,**kwargs) +def singularity_run(task,*args, container_args=None, bind_paths=None, **kwargs): + logger.info(f'KWARGS = {kwargs}') + logger.info(f'ARGS = {args}') + logger.info(f'task = {task}') + return task(*args,container_args,bind_paths,**kwargs) #This function is used to check whether we need to switch to singularity or not. 
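# Editor's note (illustrative sketch, not part of the patch): the intent of the
# helper below appears to be runtime selection at dispatch time, e.g.
#
#   task_impl = singularity_run if use_singularity() else docker_run
#   result = task_impl.delay(**job_kwargs)   # job_kwargs is hypothetical
#
# with both tasks registered against the same Celery app.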
def use_singularity(): @@ -568,104 +561,73 @@ def use_singularity(): # except socket.error: return True -def _generate_slurm_script(image, container_args, bind_paths, qos, cpus, gpus, memory, other_slurm_options): - # Construct the bind option for the Singularity command - bind_option = ','.join([f"{host}:{container}" for host, container in bind_paths.items()]) - logger.info(f'{container_args}') - # Construct the Singularity command - pwd = _get_last_workdir('suhaskc/histo-cloud:segmentation') - #container_args = str(container_args).replace('[','').replace(']','').replace(',', '').replace('<','').replace('>','') - container_args = ["SegmentWSI", - "--batch_size", - "1", - "--gpu", - "0", - "--heatmap_stride", - "2", - "--min_size", - "2000", - "--patch_size", - "2000", - "--remove_border", - "100", - "--save_heatmap", - "false", - "--simplify_contours", - "0.005", - "--tile_stride", - "1000", - "--wsi_downsample", - "2", - "/home/rc-svc-pinaki.sarder-web/digital_slide_archive/devops/singularity-minimal/tmp/18-142_PAS_1of6.svs", - "/home/rc-svc-pinaki.sarder-web/digital_slide_archive/devops/singularity-minimal/tmp/model-Glomeruli-11-13-20.zip", - "/home/rc-svc-pinaki.sarder-web/digital_slide_archive/devops/singularity-minimal/tmp/outputAnnotationFile.anot"] - #singularity_command = f'apptainer run --pwd {pwd} ./{image} {" ".join(container_args)}' - singularity_command = ['apptainer','run','--pwd',pwd,'--no-mount', '/cmsuf', image] - singularity_command.extend(container_args) - # Generate the SLURM job script with the specified parameters - slurm_script = f"""#!/bin/bash -#SBATCH --qos={qos} -#SBATCH --cpus-per-task={cpus} -#SBATCH --gres=gpu:{gpus} -#SBATCH --mem={memory} -# Add any other SLURM options here -{other_slurm_options} - -{singularity_command} -""" - return singularity_command - -def _monitor_singularity_job(task,slurm_script,log_file_path,temp_directory): +def _generate_slurm_script(container_args, bind_paths, qos, cpus, gpus, memory, other_slurm_options): + + #IMPLEMENT LATER - TESTING WITH DEFAULT + + + # # Construct the bind option for the Singularity command + # bind_option = ','.join([f"{host}:{container}" for host, container in bind_paths.items()]) + # logger.info(f'{container_args}') + # # Construct the Singularity command + # pwd = _get_last_workdir('suhaskc/histo-cloud:segmentation') + # #container_args = str(container_args).replace('[','').replace(']','').replace(',', '').replace('<','').replace('>','') + + # #singularity_command = f'apptainer run --pwd {pwd} ./{image} {" ".join(container_args)}' + # singularity_command = ['apptainer','run','--pwd',pwd,'--no-mount', '/cmsuf', image] + # singularity_command.extend(container_args) + # # Generate the SLURM job script with the specified parameters + + slurm_script = "singularity run --nv --pwd HistomicsTK/histomicstk/cli /blue/pinaki.sarder/rc-svc-pinaki.sarder-web/SIF/suhaskc.sif SegmentWSI --batch_size 1 --gpu 0 --heatmap_stride 2 --min_size 2000 --patch_size 2000 --remove_border 100 --save_heatmap false --simplify_contours 0.005 --tile_stride 1000 --wsi_downsample 2 18-142_PAS_1of6.svs model-Glomeruli-11-13-20.zip outputFile.anot" + logger.info(f"USER - {os.getenv('USER')}, {os.getuid()}") + return slurm_script + +def _monitor_singularity_job(task,slurm_script,sif_directory,log_file_path,temp_directory): """Create a drmaa session and monitor the job accordingly""" + decodestatus = {drmaa.JobState.UNDETERMINED: 'process status cannot be determined', + drmaa.JobState.QUEUED_ACTIVE: 'job is queued and active', + 
drmaa.JobState.SYSTEM_ON_HOLD: 'job is queued and in system hold', + drmaa.JobState.USER_ON_HOLD: 'job is queued and in user hold', + drmaa.JobState.USER_SYSTEM_ON_HOLD: 'job is queued and in user and system hold', + drmaa.JobState.RUNNING: 'job is running', + drmaa.JobState.SYSTEM_SUSPENDED: 'job is system suspended', + drmaa.JobState.USER_SUSPENDED: 'job is user suspended', + drmaa.JobState.DONE: 'job finished normally', + drmaa.JobState.FAILED: 'job finished, but failed'} def job_monitor(): - # s = drmaa.Session() - # s.initialize() - # jt = s.createJobTemplate() - # jt.remoteCommand = '/bin/bash' - # jt.args = ['-c', slurm_script] - # jt.workingDirectory = temp_directory - # jt.outputPath = ':' + log_file_path - # jt.errorPath = ':' + log_file_path - # jobid = s.runJob(jt) - # logger.log((f'Submitted singularity job with jobid {jobid}')) - # while True: - # job_info = s.jobStatus(jobid) - # if job_info in [drmaa.JobState.DONE, drmaa.JobState.FAILED]: - # break - - # # Check if the task has been aborted by the user - # if task.is_aborted: - # s.control(jobid, drmaa.JobControl.TERMINATE) - # logger.info(f'Job {jobid} was cancelled by user.') - # return JOB_STATUS.CANCELLED - - # time.sleep(5) # Sleep to avoid busy waiting - - # exit_status = s.wait(jobid, drmaa.Session.TIMEOUT_WAIT_FOREVER).exitStatus - # logger.info(f'Job {jobid} finished with exit status {exit_status}') - - # s.deleteJobTemplate(jt) - # return JOB_STATUS.SUCCESS if exit_status == 0 else JOB_STATUS.FAILURE - ''' - THis is just for testing. Need to repalce with DRMAA - ''' + s = drmaa.Session() + s.initialize() + jt = s.createJobTemplate() + jt.workingDirectory = temp_directory + jt.remoteCommand = os.path.join(temp_directory , 'submit.sh') + jt.nativeSpecification = "--mem=16000 --ntasks=1 --time=00:15 --mincpus=4 --partition=gpu --gres=gres:gpu:1 --comment=SegmentWSI" + jt.args = [sif_directory,temp_directory] + jt.outputPath = ':' + log_file_path + '/output.log' + jt.errorPath = ':' + log_file_path + '/error.log' try: - switch_to_sif_image_folder() - logger.info(f"The current working direcrtory is {os.getcwd()}") - logger.info(f"Directory contents {os.listdir()}") - res = subprocess.run(slurm_script,stdout=subprocess.PIPE,stderr=subprocess.PIPE, check=True) - if isinstance(res.stdout, bytes): - res = res.stdout.decode('utf-8') - logger.info(res) - return JOB_STATUS['SUCCESS'] - except Exception as e: - logger.exception(f"Exception occured {e}") + jobid = s.runJob(jt) + print((f'Submitted singularity job with jobid {jobid}')) + while True: + job_info = s.jobStatus(jobid) + print(f"Job is Still running") + if job_info in [drmaa.JobState.DONE, drmaa.JobState.FAILED]: + s.deleteJobTemplate(jt) + break + time.sleep(5) # Sleep to avoid busy waiting + exit_status = s.jobStatus(jobid) + logger.info(decodestatus[exit_status]) + s.exit() + return exit_status + + + except Exception as e: + s.deleteJobTemplate(jt) + print(f"Error Occured {e}") # Start the job monitor in a new thread monitor_thread = threading.Thread(target=job_monitor) monitor_thread.start() - monitor_thread.join() - return job_monitor() + return job_monitor \ No newline at end of file diff --git a/setup.py b/setup.py index f09b68ec..519917c6 100644 --- a/setup.py +++ b/setup.py @@ -108,24 +108,25 @@ def run(self, *args, **kwargs): zip_safe=False, entry_points={ 'console_scripts': [ - 'girder-worker = girder_worker.__main__:main', - 'girder-worker-config = girder_worker.configure:main' + # 'girder-worker = girder_worker.__main__:main', + # 'girder-worker-config = 
girder_worker.configure:main' ], 'girder_worker_plugins': [ - 'docker = girder_worker.docker:DockerPlugin [docker]' + # 'docker = girder_worker.docker:DockerPlugin [docker]', + # 'gwexample = girder_worker.examples.plugin_example.gwexample:GWExamplePlugin' ], 'girder_worker._test_plugins.valid_plugins': [ - 'plugin1 = girder_worker._test_plugins.plugins:TestPlugin1', - 'plugin2 = girder_worker._test_plugins.plugins:TestPlugin2' + # 'plugin1 = girder_worker._test_plugins.plugins:TestPlugin1', + # 'plugin2 = girder_worker._test_plugins.plugins:TestPlugin2' ], 'girder_worker._test_plugins.invalid_plugins': [ - 'exception1 = girder_worker._test_plugins.plugins:TestPluginException1', # noqa - 'exception2 = girder_worker._test_plugins.plugins:TestPluginException2', # noqa - 'import = girder_worker._test_plugins.plugins:TestPluginInvalidModule', # noqa - 'invalid = girder_worker._test_plugins.plugins:NotAValidClass' + # 'exception1 = girder_worker._test_plugins.plugins:TestPluginException1', # noqa + # 'exception2 = girder_worker._test_plugins.plugins:TestPluginException2', # noqa + # 'import = girder_worker._test_plugins.plugins:TestPluginInvalidModule', # noqa + # 'invalid = girder_worker._test_plugins.plugins:NotAValidClass' ], 'girder.plugin': [ - 'worker = girder_worker.girder_plugin:WorkerPlugin' + # 'worker = girder_worker.girder_plugin:WorkerPlugin' ] } ) From b3429fed12b78ec7ac045633f16a401dcd19440f Mon Sep 17 00:00:00 2001 From: "User pinaki.sarder-web" Date: Tue, 7 May 2024 18:36:36 -0400 Subject: [PATCH 11/28] Almost working --- girder_worker/app.py | 3 +- girder_worker/docker/tasks/__init__.py | 302 +++++++++++++++++++------ girder_worker/utils.py | 2 +- 3 files changed, 242 insertions(+), 65 deletions(-) diff --git a/girder_worker/app.py b/girder_worker/app.py index 264c8460..4ed5d175 100644 --- a/girder_worker/app.py +++ b/girder_worker/app.py @@ -132,7 +132,8 @@ def gw_task_prerun(task=None, sender=None, task_id=None, raise try: - task.girder_client = GirderClient(apiUrl=task.request.girder_api_url) + #task.girder_client = GirderClient(apiUrl=task.request.girder_api_url) + task.girder_client = GirderClient(apiUrl='http://0.0.0.0:8101/api/v1') task.girder_client.token = task.request.girder_client_token except AttributeError: task.girder_client = None diff --git a/girder_worker/docker/tasks/__init__.py b/girder_worker/docker/tasks/__init__.py index cc8fbb76..a270e233 100644 --- a/girder_worker/docker/tasks/__init__.py +++ b/girder_worker/docker/tasks/__init__.py @@ -18,7 +18,7 @@ pass from girder_worker.app import app, Task from girder_worker import logger -from girder_worker import utils +from girder_worker.docker import utils from girder_worker.docker.stream_adapter import DockerStreamPushAdapter from girder_worker.docker.io import ( FileDescriptorReader, @@ -28,7 +28,6 @@ StdStreamWriter ) from slicer_cli_web.singularity.utils import switch_to_sif_image_folder -from slicer_cli_web.singularity.job import _get_last_workdir from girder_worker.docker.transforms import ( ContainerStdErr, ContainerStdOut, @@ -479,12 +478,7 @@ def _maybe_transform_result(self, idx, result): return super()._maybe_transform_result( idx, result, _default_temp_volume=self.request._default_temp_volume) - def __call__(self,*args,container_args=None,bind_paths=None,**kwargs): - #image = image or kwargs.pop('image',None) - container_args = container_args or kwargs.pop('container_args') or [] - container_args, read_streams, write_streams = _handle_streaming_args(container_args) - - logger.info(f'Container Args = 
{container_args}' ) + def __call__(self, *args, **kwargs): default_temp_volume = _RequestDefaultTemporaryVolume() self.request._default_temp_volume = default_temp_volume @@ -515,35 +509,158 @@ def _json(volume): volumes.update(default_temp_volume._repr_json_()) - #Add Image checking later + try: + super().__call__(*args, **kwargs) + finally: + threading.Thread( + target=self._cleanup_temp_volumes, + args=(temp_volumes, default_temp_volume), + daemon=True).start() + + def _cleanup_temp_volumes(self, temp_volumes, default_temp_volume): + # Set the permission to allow cleanup of temp directories + temp_volumes = [v for v in temp_volumes if os.path.exists(v.host_path)] + to_chmod = temp_volumes[:] + # If our default_temp_volume instance has been transformed then we + # know it has been used and we have to clean it up. + if default_temp_volume._transformed: + to_chmod.append(default_temp_volume) + temp_volumes.append(default_temp_volume) + + # if len(to_chmod) > 0: + # utils.chmod_writable([v.host_path for v in to_chmod]) + + # for v in temp_volumes: + # shutil.rmtree(v.host_path) + + # def __call__(self,*args,container_args=None,bind_paths=None,**kwargs): + # #image = image or kwargs.pop('image',None) + # logger.info(f"KWARGS {kwargs}") + # container_args = container_args or kwargs.pop('container_args',[]) + # container_args, read_streams, write_streams = _handle_streaming_args(container_args) + # default_temp_volume = _RequestDefaultTemporaryVolume() + # self.request._default_temp_volume = default_temp_volume + + # volumes = kwargs.setdefault('volumes', {}) + # # If we have a list of volumes, the user provide a list of Volume objects, + # # we need to transform them. + # temp_volumes = [] + # if isinstance(volumes, list): + # # See if we have been passed any TemporaryVolume instances. + # for v in volumes: + # if isinstance(v, TemporaryVolume): + # temp_volumes.append(v) + + # # First call the transform method, this we replace default temp volumes + # # with the instance associated with this task create above. That is any + # # reference to TemporaryVolume.default + # _walk_obj(volumes, self._maybe_transform_argument) + + # # Now convert them to JSON + # def _json(volume): + # return volume._repr_json_() + + # volumes = _walk_obj(volumes, _json) + # # We then need to merge them into a single dict and it will be ready + # # for docker-py. 
+ # volumes = {k: v for volume in volumes for k, v in volume.items()} + # kwargs['volumes'] = volumes + + # volumes.update(default_temp_volume._repr_json_()) + # #Add Image checking later + # logger.info(f"KWARGS After {container_args}") + # qos = kwargs.pop('qos', 'pinaki.sarder') + # cpus = kwargs.pop('cpus', 4) + # gpus = kwargs.pop('gpus', 1) + # memory = kwargs.pop('memory', '4GB') + # other_slurm_options = kwargs.pop('other_slurm_options', '') + # temp_directory = f"/blue/pinaki.sarder/rc-svc-pinaki.sarder-web/tmp" + # sif_directory = f"/blue/pinaki.sarder/rc-svc-pinaki.sarder-web/SIF" + # log_file_path = f"/blue/pinaki.sarder/rc-svc-pinaki.sarder-web/tmp" + # # slurm_script = _generate_slurm_script(container_args, bind_paths, qos, cpus, gpus, memory, other_slurm_options) + # # exit_status = _monitor_singularity_job(self,slurm_script=slurm_script, log_file_path=log_file_path,temp_directory=temp_directory,sif_directory=sif_directory) + # #Handling exit status based on the DRM package's expected exit status codes + # # exit_status = JOB_STATUS['SUCCESS'] + # # if exit_status == JOB_STATUS['SUCCESS']: + # # logger.info(f"Singularity job completed Successfully.") + # # elif exit_status == JOB_STATUS['FAILURE']: + # # logger.error(f"Singularity Job exited with error") + # # elif exit_status == JOB_STATUS['CANCELLED']: + # # logger.info('Singularity Job cancelled by the user') + + +def _run_singularity_container(container_args=None,**kwargs): + image = kwargs['image'] + container_args = container_args or kwargs['container_args'] or [] + try: + container_args = _process_container_args(container_args, kwargs) + + logger.info('Running container: image: %s args: %s kwargs: %s' + % (image, container_args, kwargs)) - qos = kwargs.pop('qos', 'pinaki.sarder') - cpus = kwargs.pop('cpus', 4) - gpus = kwargs.pop('gpus', 1) - memory = kwargs.pop('memory', '4GB') - other_slurm_options = kwargs.pop('other_slurm_options', '') - temp_directory = f"/blue/pinaki.sarder/rc-svc-pinaki.sarder-web/tmp" - sif_directory = f"/blue/pinaki.sarder/rc-svc-pinaki.sarder-web/SIF" - log_file_path = f"/blue/pinaki.sarder/rc-svc-pinaki.sarder-web/tmp" - # slurm_script = _generate_slurm_script(container_args, bind_paths, qos, cpus, gpus, memory, other_slurm_options) - # exit_status = _monitor_singularity_job(self,slurm_script=slurm_script, log_file_path=log_file_path,temp_directory=temp_directory,sif_directory=sif_directory) - #Handling exit status based on the DRM package's expected exit status codes - # exit_status = JOB_STATUS['SUCCESS'] - # if exit_status == JOB_STATUS['SUCCESS']: - # logger.info(f"Singularity job completed Successfully.") - # elif exit_status == JOB_STATUS['FAILURE']: - # logger.error(f"Singularity Job exited with error") - # elif exit_status == JOB_STATUS['CANCELLED']: - # logger.info('Singularity Job cancelled by the user') - -#Chaging base temorarily to -@app.task(base=SingularityTask, bind=True) -def singularity_run(task,*args, container_args=None, bind_paths=None, **kwargs): - logger.info(f'KWARGS = {kwargs}') - logger.info(f'ARGS = {args}') - logger.info(f'task = {task}') - return task(*args,container_args,bind_paths,**kwargs) + slurm_run_command = _generate_slurm_script(container_args,kwargs) + + slurm_config = _get_slurm_config(kwargs) + + return [slurm_run_command,slurm_config] + except Exception as e: + logger.exception(e) + raise Exception(e) + + + + +def singularity_run(task,**kwargs): + volumes = kwargs.pop('volumes',{}) + container_args = kwargs.pop('container_args',[]) + pull_image = 
kwargs['pull_image'] or False + stream_connectors = kwargs['stream_connectors'] or [] + image = kwargs['image'] or '' + entrypoint = None + if pull_image: + pass + #ADD some code + + run_kwargs = { + 'tty': False, + 'volumes': volumes + } + + # Allow run args to be overridden,filter out any we don't want to override + extra_run_kwargs = {k: v for k, v in kwargs.items() if k not in BLACKLISTED_DOCKER_RUN_ARGS} + run_kwargs.update(extra_run_kwargs) + + #Make entrypoint as pwd + if entrypoint is not None: + run_kwargs['entrypoint'] = entrypoint + + container_args,read_streams,write_streams = _handle_streaming_args(container_args) + #MODIFIED FOR SINGULARITY (CHANGE CODE OF SINGULARITY CONTAINER) + slurm_run_command,slurm_config = _run_singularity_container(container_args,**run_kwargs) + for connector in stream_connectors: + if isinstance(connector, FDReadStreamConnector): + read_streams.append(connector) + elif isinstance(connector, FDWriteStreamConnector): + write_streams.append(connector) + else: + raise TypeError( + "Expected 'FDReadStreamConnector' or 'FDWriterStreamConnector', received '%s'" + % type(connector)) + try: + monitor_thread = _monitor_singularity_job(task,slurm_run_command,slurm_config,read_streams,write_streams) + def singularity_exit_condition(): + return not monitor_thread.is_alive() + utils.select_loop(exit_condition = singularity_exit_condition, + readers= read_streams, + writers = write_streams ) + finally: + logger.info('DONE') + results = [] + if hasattr(task.request,'girder_result_hooks'): + results = (None,) * len(task.request.girder_result_hooks) + + return results #This function is used to check whether we need to switch to singularity or not. def use_singularity(): ''' @@ -561,28 +678,38 @@ def use_singularity(): # except socket.error: return True -def _generate_slurm_script(container_args, bind_paths, qos, cpus, gpus, memory, other_slurm_options): - - #IMPLEMENT LATER - TESTING WITH DEFAULT - - - # # Construct the bind option for the Singularity command - # bind_option = ','.join([f"{host}:{container}" for host, container in bind_paths.items()]) - # logger.info(f'{container_args}') - # # Construct the Singularity command - # pwd = _get_last_workdir('suhaskc/histo-cloud:segmentation') - # #container_args = str(container_args).replace('[','').replace(']','').replace(',', '').replace('<','').replace('>','') - - # #singularity_command = f'apptainer run --pwd {pwd} ./{image} {" ".join(container_args)}' - # singularity_command = ['apptainer','run','--pwd',pwd,'--no-mount', '/cmsuf', image] - # singularity_command.extend(container_args) - # # Generate the SLURM job script with the specified parameters - - slurm_script = "singularity run --nv --pwd HistomicsTK/histomicstk/cli /blue/pinaki.sarder/rc-svc-pinaki.sarder-web/SIF/suhaskc.sif SegmentWSI --batch_size 1 --gpu 0 --heatmap_stride 2 --min_size 2000 --patch_size 2000 --remove_border 100 --save_heatmap false --simplify_contours 0.005 --tile_stride 1000 --wsi_downsample 2 18-142_PAS_1of6.svs model-Glomeruli-11-13-20.zip outputFile.anot" - logger.info(f"USER - {os.getenv('USER')}, {os.getuid()}") - return slurm_script - -def _monitor_singularity_job(task,slurm_script,sif_directory,log_file_path,temp_directory): +def _generate_slurm_script(container_args,kwargs): + container_args = container_args or [] + image = kwargs.pop('image',None) + singularity_command = [] + if not image: + raise Exception(' Issue with Slicer_Cli_Plugin_Image. 
Plugin Not available') + SIF_DIRECTORY = os.getenv('SIF_IMAGE_PATH') + image_full_path = os.path.join(SIF_DIRECTORY,image) + #Code to check for allocating multiple gpus. + try: + gpu_index = container_args.index('--gpu') + gpus = int(container_args[gpu_index+1]) + kwargs['--gres'] = f"gres:gpu:{gpus}" if gpus > 1 else f"gres:gpu:1" + kwargs['--partition'] = 'gpu' + singularity_command.append('--nv') + except ValueError as e: + kwargs['gpu'] = None + try: + pwd = kwargs['pwd'] + if not pwd: + raise Exception("PWD cannot be empty") + singularity_command.extend(['--pwd',pwd]) + singularity_command.append(image_full_path) + singularity_command.extend(container_args) + singularity_command = ' '.join(singularity_command) + logger.info(f'Singularity Command = \n{singularity_command}') + except Exception as e: + logger.info(f"Error occured - {e}") + raise Exception(f"Error Occured - {e}") + return singularity_command + +def _monitor_singularity_job(task,slurm_run_command,slurm_config,read_streams=None,write_streams=None): """Create a drmaa session and monitor the job accordingly""" decodestatus = {drmaa.JobState.UNDETERMINED: 'process status cannot be determined', drmaa.JobState.QUEUED_ACTIVE: 'job is queued and active', @@ -594,25 +721,31 @@ def _monitor_singularity_job(task,slurm_script,sif_directory,log_file_path,temp_ drmaa.JobState.USER_SUSPENDED: 'job is user suspended', drmaa.JobState.DONE: 'job finished normally', drmaa.JobState.FAILED: 'job finished, but failed'} + temp_directory = os.getenv('TMPDIR') + logger.info(f"TEMP_DIR = {temp_directory}") def job_monitor(): s = drmaa.Session() s.initialize() jt = s.createJobTemplate() jt.workingDirectory = temp_directory jt.remoteCommand = os.path.join(temp_directory , 'submit.sh') - jt.nativeSpecification = "--mem=16000 --ntasks=1 --time=00:15 --mincpus=4 --partition=gpu --gres=gres:gpu:1 --comment=SegmentWSI" - jt.args = [sif_directory,temp_directory] - jt.outputPath = ':' + log_file_path + '/output.log' - jt.errorPath = ':' + log_file_path + '/error.log' + jt.nativeSpecification = slurm_config + jt.args = [slurm_run_command] + jt.outputPath = ':' + temp_directory + '/logs.log' + jt.errorPath = ':' + temp_directory + '/logs.log' try: jobid = s.runJob(jt) - print((f'Submitted singularity job with jobid {jobid}')) + logger.info((f'Submitted singularity job with jobid {jobid}')) while True: job_info = s.jobStatus(jobid) print(f"Job is Still running") if job_info in [drmaa.JobState.DONE, drmaa.JobState.FAILED]: s.deleteJobTemplate(jt) break + elif task.canceled: + s.control(jobid,drmaa.JobControlAction.TERMINATE) + s.deleteJobTemplate(jt) + break time.sleep(5) # Sleep to avoid busy waiting exit_status = s.jobStatus(jobid) logger.info(decodestatus[exit_status]) @@ -630,4 +763,47 @@ def job_monitor(): monitor_thread = threading.Thread(target=job_monitor) monitor_thread.start() - return job_monitor \ No newline at end of file + return monitor_thread + + +def _process_container_args(container_args,kwargs): + volumes = kwargs['volumes'] or {} + def find_matching_volume_key(path): + for key, value in volumes.items(): + if path.startswith(value['bind']): + # Append the suffix from the original path that isn't part of the 'bind' path + suffix = path[len(value['bind']):] if value['bind'] != path else '' + if 'assetstore' in key: + key = '/blue/pinaki.sarder/rc-svc-pinaki.sarder-web' + key + new_key = key + suffix.replace(" ", "_") # Replace spaces in suffix with underscores + return new_key + return path # Replace spaces in paths that don't match any volume 
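# Editor's note (illustrative, not part of the patch): a worked example of the
# mapping performed by find_matching_volume_key above, assuming a hypothetical
# docker-py style volumes dict of the form {host_path: {'bind': container_path}}:
#
#   volumes = {'/srv/assetstore': {'bind': '/mnt/girder_worker/assetstore', 'mode': 'rw'}}
#   find_matching_volume_key('/mnt/girder_worker/assetstore/ab/my file.svs')
#   # -> '/blue/pinaki.sarder/rc-svc-pinaki.sarder-web/srv/assetstore/ab/my_file.svs'
#   find_matching_volume_key('--batch_size')
#   # -> '--batch_size'  (arguments matching no bind path pass through unchanged)
#
# i.e. container-side paths in container_args are rewritten to host-side paths
# that the Slurm/Singularity job can reach, with spaces in the appended suffix
# replaced by underscores.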
+ try: + # Replace paths in container_args with their corresponding volume keys + updated_container_args = [str(find_matching_volume_key(arg)) for arg in container_args] + except Exception as e: + logger.info(f"error {e}") + # Print the updated container arguments + logger.info(f"updated container args = {updated_container_args}") + return updated_container_args + +def _get_slurm_config(kwargs): + #Use this function to add or modify any configuration parameters for the SLURM job + config_defaults = { + '--qos': os.getenv('SLURM_QOS'), + '--account': os.getenv('SLURM_ACCOUNT'), + '--mem':os.getenv('SLURM_MEMORY','16000'), + '--ntasks': os.getenv("SLURM_NTASKS",'1'), + '--time': os.getenv("SLURM_TIME",'00:30'), + '--partition':os.getenv('SLURM_PARTITION'), + '--gres':os.getenv('SLURM_GRES_CONFIG'), + '--mincpus':os.getenv('SLURM_MIN_CPUS','4') + } + + config = {k:kwargs.get(k,config_defaults[k]) for k in config_defaults} + + slurm_config = ' '.join(f"{k}={v}" for k,v in config.items() if v is not None) + + logger.info(f"SLURM CONFIG = {slurm_config}") + return slurm_config + diff --git a/girder_worker/utils.py b/girder_worker/utils.py index 34c503d7..d7fff6dd 100644 --- a/girder_worker/utils.py +++ b/girder_worker/utils.py @@ -79,7 +79,7 @@ def _job_manager(request=None, headers=None, kwargs=None): if hasattr(request, 'jobInfoSpec'): jobSpec = request.jobInfoSpec - + jobSpec['url'] = jobSpec['url'].replace('girder:8080','0.0.0.0:8101') # We are being called from revoked signal elif headers is not None and \ 'jobInfoSpec' in headers: From 0ff3fecd4711526bfe9bb1fa6321525e125c08c3 Mon Sep 17 00:00:00 2001 From: "User pinaki.sarder-web" Date: Tue, 14 May 2024 06:45:53 -0400 Subject: [PATCH 12/28] Bug Fixes --- girder_worker/docker/tasks/__init__.py | 115 ++++++-------------- girder_worker/docker/transforms/__init__.py | 5 +- girder_worker/utils.py | 7 ++ 3 files changed, 42 insertions(+), 85 deletions(-) diff --git a/girder_worker/docker/tasks/__init__.py b/girder_worker/docker/tasks/__init__.py index a270e233..c0933423 100644 --- a/girder_worker/docker/tasks/__init__.py +++ b/girder_worker/docker/tasks/__init__.py @@ -515,7 +515,7 @@ def _json(volume): threading.Thread( target=self._cleanup_temp_volumes, args=(temp_volumes, default_temp_volume), - daemon=True).start() + daemon=False).start() def _cleanup_temp_volumes(self, temp_volumes, default_temp_volume): # Set the permission to allow cleanup of temp directories @@ -532,61 +532,6 @@ def _cleanup_temp_volumes(self, temp_volumes, default_temp_volume): # for v in temp_volumes: # shutil.rmtree(v.host_path) - - # def __call__(self,*args,container_args=None,bind_paths=None,**kwargs): - # #image = image or kwargs.pop('image',None) - # logger.info(f"KWARGS {kwargs}") - # container_args = container_args or kwargs.pop('container_args',[]) - # container_args, read_streams, write_streams = _handle_streaming_args(container_args) - # default_temp_volume = _RequestDefaultTemporaryVolume() - # self.request._default_temp_volume = default_temp_volume - - # volumes = kwargs.setdefault('volumes', {}) - # # If we have a list of volumes, the user provide a list of Volume objects, - # # we need to transform them. - # temp_volumes = [] - # if isinstance(volumes, list): - # # See if we have been passed any TemporaryVolume instances. - # for v in volumes: - # if isinstance(v, TemporaryVolume): - # temp_volumes.append(v) - - # # First call the transform method, this we replace default temp volumes - # # with the instance associated with this task create above. 
That is any - # # reference to TemporaryVolume.default - # _walk_obj(volumes, self._maybe_transform_argument) - - # # Now convert them to JSON - # def _json(volume): - # return volume._repr_json_() - - # volumes = _walk_obj(volumes, _json) - # # We then need to merge them into a single dict and it will be ready - # # for docker-py. - # volumes = {k: v for volume in volumes for k, v in volume.items()} - # kwargs['volumes'] = volumes - - # volumes.update(default_temp_volume._repr_json_()) - # #Add Image checking later - # logger.info(f"KWARGS After {container_args}") - # qos = kwargs.pop('qos', 'pinaki.sarder') - # cpus = kwargs.pop('cpus', 4) - # gpus = kwargs.pop('gpus', 1) - # memory = kwargs.pop('memory', '4GB') - # other_slurm_options = kwargs.pop('other_slurm_options', '') - # temp_directory = f"/blue/pinaki.sarder/rc-svc-pinaki.sarder-web/tmp" - # sif_directory = f"/blue/pinaki.sarder/rc-svc-pinaki.sarder-web/SIF" - # log_file_path = f"/blue/pinaki.sarder/rc-svc-pinaki.sarder-web/tmp" - # # slurm_script = _generate_slurm_script(container_args, bind_paths, qos, cpus, gpus, memory, other_slurm_options) - # # exit_status = _monitor_singularity_job(self,slurm_script=slurm_script, log_file_path=log_file_path,temp_directory=temp_directory,sif_directory=sif_directory) - # #Handling exit status based on the DRM package's expected exit status codes - # # exit_status = JOB_STATUS['SUCCESS'] - # # if exit_status == JOB_STATUS['SUCCESS']: - # # logger.info(f"Singularity job completed Successfully.") - # # elif exit_status == JOB_STATUS['FAILURE']: - # # logger.error(f"Singularity Job exited with error") - # # elif exit_status == JOB_STATUS['CANCELLED']: - # # logger.info('Singularity Job cancelled by the user') def _run_singularity_container(container_args=None,**kwargs): @@ -634,6 +579,8 @@ def singularity_run(task,**kwargs): if entrypoint is not None: run_kwargs['entrypoint'] = entrypoint + log_file_name = kwargs['log_file'] + container_args,read_streams,write_streams = _handle_streaming_args(container_args) #MODIFIED FOR SINGULARITY (CHANGE CODE OF SINGULARITY CONTAINER) slurm_run_command,slurm_config = _run_singularity_container(container_args,**run_kwargs) @@ -647,7 +594,7 @@ def singularity_run(task,**kwargs): "Expected 'FDReadStreamConnector' or 'FDWriterStreamConnector', received '%s'" % type(connector)) try: - monitor_thread = _monitor_singularity_job(task,slurm_run_command,slurm_config,read_streams,write_streams) + monitor_thread = _monitor_singularity_job(task,slurm_run_command,slurm_config,log_file_name) def singularity_exit_condition(): return not monitor_thread.is_alive() utils.select_loop(exit_condition = singularity_exit_condition, @@ -702,14 +649,12 @@ def _generate_slurm_script(container_args,kwargs): singularity_command.extend(['--pwd',pwd]) singularity_command.append(image_full_path) singularity_command.extend(container_args) - singularity_command = ' '.join(singularity_command) - logger.info(f'Singularity Command = \n{singularity_command}') except Exception as e: logger.info(f"Error occured - {e}") raise Exception(f"Error Occured - {e}") return singularity_command -def _monitor_singularity_job(task,slurm_run_command,slurm_config,read_streams=None,write_streams=None): +def _monitor_singularity_job(task,slurm_run_command,slurm_config,log_file_name): """Create a drmaa session and monitor the job accordingly""" decodestatus = {drmaa.JobState.UNDETERMINED: 'process status cannot be determined', drmaa.JobState.QUEUED_ACTIVE: 'job is queued and active', @@ -722,31 +667,37 @@ def 
_monitor_singularity_job(task,slurm_run_command,slurm_config,read_streams=No drmaa.JobState.DONE: 'job finished normally', drmaa.JobState.FAILED: 'job finished, but failed'} temp_directory = os.getenv('TMPDIR') - logger.info(f"TEMP_DIR = {temp_directory}") + submit_dir = '/blue/pinaki.sarder/rc-svc-pinaki.sarder-web/submission' def job_monitor(): s = drmaa.Session() s.initialize() jt = s.createJobTemplate() jt.workingDirectory = temp_directory - jt.remoteCommand = os.path.join(temp_directory , 'submit.sh') + jt.remoteCommand = os.path.join(submit_dir , 'submit.sh') jt.nativeSpecification = slurm_config - jt.args = [slurm_run_command] - jt.outputPath = ':' + temp_directory + '/logs.log' - jt.errorPath = ':' + temp_directory + '/logs.log' + jt.args = slurm_run_command + jt.outputPath = ':' + log_file_name + jt.errorPath = ':' + log_file_name try: jobid = s.runJob(jt) logger.info((f'Submitted singularity job with jobid {jobid}')) - while True: - job_info = s.jobStatus(jobid) - print(f"Job is Still running") - if job_info in [drmaa.JobState.DONE, drmaa.JobState.FAILED]: - s.deleteJobTemplate(jt) - break - elif task.canceled: - s.control(jobid,drmaa.JobControlAction.TERMINATE) - s.deleteJobTemplate(jt) - break - time.sleep(5) # Sleep to avoid busy waiting + with open(log_file_name,'r') as f: + while True: + job_info = s.jobStatus(jobid) + where = f.tell() + line = f.readlines() + if line: + print(''.join(line),end='') + else: + f.seek(where) + if job_info in [drmaa.JobState.DONE, drmaa.JobState.FAILED]: + s.deleteJobTemplate(jt) + break + elif task.canceled: + s.control(jobid,drmaa.JobControlAction.TERMINATE) + s.deleteJobTemplate(jt) + break + time.sleep(5) # Sleep to avoid busy waiting exit_status = s.jobStatus(jobid) logger.info(decodestatus[exit_status]) s.exit() @@ -760,7 +711,7 @@ def job_monitor(): # Start the job monitor in a new thread - monitor_thread = threading.Thread(target=job_monitor) + monitor_thread = threading.Thread(target=job_monitor,daemon=True) monitor_thread.start() return monitor_thread @@ -771,7 +722,7 @@ def _process_container_args(container_args,kwargs): def find_matching_volume_key(path): for key, value in volumes.items(): if path.startswith(value['bind']): - # Append the suffix from the original path that isn't part of the 'bind' path + # Append the suffix from the original path that isn't part of the 'bind' path #Replace bind path later suffix = path[len(value['bind']):] if value['bind'] != path else '' if 'assetstore' in key: key = '/blue/pinaki.sarder/rc-svc-pinaki.sarder-web' + key @@ -783,8 +734,6 @@ def find_matching_volume_key(path): updated_container_args = [str(find_matching_volume_key(arg)) for arg in container_args] except Exception as e: logger.info(f"error {e}") - # Print the updated container arguments - logger.info(f"updated container args = {updated_container_args}") return updated_container_args def _get_slurm_config(kwargs): @@ -793,11 +742,11 @@ def _get_slurm_config(kwargs): '--qos': os.getenv('SLURM_QOS'), '--account': os.getenv('SLURM_ACCOUNT'), '--mem':os.getenv('SLURM_MEMORY','16000'), - '--ntasks': os.getenv("SLURM_NTASKS",'1'), + '--ntasks': os.getenv("SLURM_NTASKS",'2'), '--time': os.getenv("SLURM_TIME",'00:30'), - '--partition':os.getenv('SLURM_PARTITION'), + '--partition':os.getenv('SLURM_PARTITION','hpg2-compute'), '--gres':os.getenv('SLURM_GRES_CONFIG'), - '--mincpus':os.getenv('SLURM_MIN_CPUS','4') + '--cpus-per-task':os.getenv('SLURM_CPUS','2') } config = {k:kwargs.get(k,config_defaults[k]) for k in config_defaults} diff --git 
a/girder_worker/docker/transforms/__init__.py b/girder_worker/docker/transforms/__init__.py index 05dd1751..f710b220 100644 --- a/girder_worker/docker/transforms/__init__.py +++ b/girder_worker/docker/transforms/__init__.py @@ -305,8 +305,9 @@ class VolumePath(Transform): def __init__(self, filename, volume=TemporaryVolume.default): if os.path.isabs(filename): raise Exception('VolumePath paths must be relative to a volume (%s).' % filename) - - self.filename = filename + #Modify filename for cli_run + #self.filename = filename + self.filename = filename.replace(' ','_') self._volume = volume def transform(self, *pargs, **kwargs): diff --git a/girder_worker/utils.py b/girder_worker/utils.py index d7fff6dd..fca1ef54 100644 --- a/girder_worker/utils.py +++ b/girder_worker/utils.py @@ -76,6 +76,13 @@ def _job_manager(request=None, headers=None, kwargs=None): girder_client_session_kwargs = {} if hasattr(request, 'girder_client_session_kwargs'): girder_client_session_kwargs = request.girder_client_session_kwargs + + #Modify better later + if hasattr(request,'girder_result_hooks'): + girder_result_hooks = request.girder_result_hooks + grh_object = girder_result_hooks[0] + grh_object['gc']['urlBase'] = grh_object['gc']['urlBase'].replace('girder:8080','0.0.0.0:8101') + if hasattr(request, 'jobInfoSpec'): jobSpec = request.jobInfoSpec From 27f790bdd2f2d1c24ed4fb92d85bac2c651ef6ec Mon Sep 17 00:00:00 2001 From: "User pinaki.sarder-web" Date: Thu, 16 May 2024 01:58:38 -0400 Subject: [PATCH 13/28] Addressed GPU config issue for plugins --- girder_worker/docker/nvidia.py | 20 ++++++++++++++++++++ girder_worker/docker/tasks/__init__.py | 19 +++++++++---------- girder_worker/utils.py | 8 -------- 3 files changed, 29 insertions(+), 18 deletions(-) diff --git a/girder_worker/docker/nvidia.py b/girder_worker/docker/nvidia.py index e66d2f48..27272a86 100644 --- a/girder_worker/docker/nvidia.py +++ b/girder_worker/docker/nvidia.py @@ -1,3 +1,23 @@ def is_nvidia_image(api, image): labels = api.inspect_image(image).get('Config', {}).get('Labels') return bool(labels and labels.get('com.nvidia.volumes.needed') == 'nvidia_driver') + +def set_nvidia_params(kwargs:dict,singularity_command:list,gpus:int=1): + ''' + This function is used to set the gpu parameters based on the user input and plugin job. + + Parameters: + kwargs (dict, required): The keyword arguments dictionary sent to the celery task as an input, part of the request + + singularity_command (list, required): A list that container all the arguments to construct a singularity command that will be sent to the HPC job + + gps (int, optional): If the plugin doesn't have a --gpu parameter in contianer_args, then a default of 1 gpu is allocated, else the user specified number of gpus is allocated. 
+ + Returns: + None + ''' + kwargs['--gres'] = f"gres:gpu:{gpus}" if gpus > 1 else f"gres:gpu:1" + kwargs['--partition'] = 'gpu' + #Reducing CPU count for gpu-based job for resource conservation + kwargs['--cpus-per-task'] = '2' + singularity_command.append('--nv') \ No newline at end of file diff --git a/girder_worker/docker/tasks/__init__.py b/girder_worker/docker/tasks/__init__.py index c0933423..0bf7abd1 100644 --- a/girder_worker/docker/tasks/__init__.py +++ b/girder_worker/docker/tasks/__init__.py @@ -27,7 +27,7 @@ FDStreamConnector, StdStreamWriter ) -from slicer_cli_web.singularity.utils import switch_to_sif_image_folder + from girder_worker.docker.transforms import ( ContainerStdErr, ContainerStdOut, @@ -560,11 +560,11 @@ def singularity_run(task,**kwargs): container_args = kwargs.pop('container_args',[]) pull_image = kwargs['pull_image'] or False stream_connectors = kwargs['stream_connectors'] or [] - image = kwargs['image'] or '' + image = kwargs.get('image') or '' entrypoint = None - if pull_image: - pass - #ADD some code + if not image: + logger.exception(f"Image name cannot be emptu") + raise Exception(f"Image name cannot be empty") run_kwargs = { 'tty': False, @@ -637,11 +637,10 @@ def _generate_slurm_script(container_args,kwargs): try: gpu_index = container_args.index('--gpu') gpus = int(container_args[gpu_index+1]) - kwargs['--gres'] = f"gres:gpu:{gpus}" if gpus > 1 else f"gres:gpu:1" - kwargs['--partition'] = 'gpu' - singularity_command.append('--nv') + nvidia.set_nvidia_params(kwargs,singularity_command,gpus) except ValueError as e: - kwargs['gpu'] = None + if kwargs['nvidia']: + nvidia.set_nvidia_params(kwargs,singularity_command) try: pwd = kwargs['pwd'] if not pwd: @@ -746,7 +745,7 @@ def _get_slurm_config(kwargs): '--time': os.getenv("SLURM_TIME",'00:30'), '--partition':os.getenv('SLURM_PARTITION','hpg2-compute'), '--gres':os.getenv('SLURM_GRES_CONFIG'), - '--cpus-per-task':os.getenv('SLURM_CPUS','2') + '--cpus-per-task':os.getenv('SLURM_CPUS','4') } config = {k:kwargs.get(k,config_defaults[k]) for k in config_defaults} diff --git a/girder_worker/utils.py b/girder_worker/utils.py index fca1ef54..987a25e2 100644 --- a/girder_worker/utils.py +++ b/girder_worker/utils.py @@ -77,16 +77,8 @@ def _job_manager(request=None, headers=None, kwargs=None): if hasattr(request, 'girder_client_session_kwargs'): girder_client_session_kwargs = request.girder_client_session_kwargs - #Modify better later - if hasattr(request,'girder_result_hooks'): - girder_result_hooks = request.girder_result_hooks - grh_object = girder_result_hooks[0] - grh_object['gc']['urlBase'] = grh_object['gc']['urlBase'].replace('girder:8080','0.0.0.0:8101') - - if hasattr(request, 'jobInfoSpec'): jobSpec = request.jobInfoSpec - jobSpec['url'] = jobSpec['url'].replace('girder:8080','0.0.0.0:8101') # We are being called from revoked signal elif headers is not None and \ 'jobInfoSpec' in headers: From a0f1b7268f8c154ad0fe967f08bff62e600e3c34 Mon Sep 17 00:00:00 2001 From: willdunklin Date: Tue, 21 May 2024 13:25:40 -0400 Subject: [PATCH 14/28] UF Progress --- .gitignore | 3 + .../gwexample/analyses/tasks.py | 34 +- girder_worker/__main__.py | 1 + girder_worker/docker/tasks/__init__.py | 342 +++++++++++++++++- girder_worker/docker/utils.py | 12 + girder_worker/entrypoint.py | 26 +- girder_worker/utils.py | 16 +- requirements-dev.in | 1 + requirements.in | 2 + requirements.txt | 1 + setup.py | 6 +- tests/integration/requirements.txt | 1 + tox.ini | 12 + 13 files changed, 426 insertions(+), 31 deletions(-) diff 
--git a/.gitignore b/.gitignore index 92a71b09..3d781a34 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,6 @@ __pycache__/ .tox/ .cache/ htmlcov/ +build/ +.eggs + diff --git a/examples/plugin_example/gwexample/analyses/tasks.py b/examples/plugin_example/gwexample/analyses/tasks.py index 4c7bfd86..5fbf3bd2 100644 --- a/examples/plugin_example/gwexample/analyses/tasks.py +++ b/examples/plugin_example/gwexample/analyses/tasks.py @@ -1,7 +1,11 @@ -from girder_worker.app import app +import os +import subprocess + from girder_worker_utils import types from girder_worker_utils.decorators import argument +from girder_worker.app import app + @app.task @argument('n', types.Integer, min=1) @@ -10,3 +14,31 @@ def fibonacci(n): if n == 1 or n == 2: return 1 return fibonacci(n-1) + fibonacci(n-2) + +@app.task +# @argument('image_name', 'slide_name', 'path') +def nuclei(image_name, slide_name, path): + print(app, '++++++++++') + if path: + print('using arg path !!') + os.chdir(path) + else: + print('using default path !!') + os.chdir('/home/rc-svc-pinaki.sarder-web/digital_slide_archive/devops/singularity-minimal') + print('Current Path => ', os.getcwd()) + path = os.getcwd() + flags = os.O_RDWR | os.O_CREAT + sif_image = os.open('sarderlab_histomicstk_latest.sif', flags) + sif_image_path = path + image_name if image_name else '/sarderlab_histomicstk_latest.sif' + slide_image = os.open(slide_name, flags) + slide_image_path = path + slide_name if slide_name else '18-142_PAS_1of6.svs' + output = os.open('Nuclei-outputNucleiAnnotationFile.anot', flags) + output_path = path + '/Nuclei-outputNucleiAnnotationFile.anot' + run_container = f'apptainer run --pwd /HistomicsTK/histomicstk/cli {sif_image} NucleiDetection {slide_image} {output}' + try: + res = subprocess.call(f'apptainer run --pwd /HistomicsTK/histomicstk/cli {sif_image_path} NucleiDetection {slide_image_path} {output_path}', shell=True, bufsize=0,stdin=subprocess.PIPE,stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding="UTF8") + print(res, '----1') + + except Exception as e: + print(f"Exception occured {e}") + diff --git a/girder_worker/__main__.py b/girder_worker/__main__.py index a7ac7190..fb074a69 100644 --- a/girder_worker/__main__.py +++ b/girder_worker/__main__.py @@ -8,4 +8,5 @@ def main(): if __name__ == '__main__': + print('main!') main() diff --git a/girder_worker/docker/tasks/__init__.py b/girder_worker/docker/tasks/__init__.py index f661f805..3ef7d820 100644 --- a/girder_worker/docker/tasks/__init__.py +++ b/girder_worker/docker/tasks/__init__.py @@ -6,33 +6,31 @@ import sys import threading import time + +# import drmaa + try: import docker - from docker.errors import DockerException, APIError, InvalidVersion - from girder_worker.docker import nvidia + from docker.errors import APIError, DockerException, InvalidVersion from requests.exceptions import ReadTimeout + + from girder_worker.docker import nvidia except ImportError: # These imports will not be available on the girder side. 
pass -from girder_worker.app import app, Task +from girder_worker_utils import _walk_obj +# from slicer_cli_web.singularity.utils import switch_to_sif_image_folder + from girder_worker import logger +from girder_worker.app import Task, app from girder_worker.docker import utils +from girder_worker.docker.io import (FDReadStreamConnector, FDStreamConnector, + FDWriteStreamConnector, + FileDescriptorReader, StdStreamWriter) from girder_worker.docker.stream_adapter import DockerStreamPushAdapter -from girder_worker.docker.io import ( - FileDescriptorReader, - FDWriteStreamConnector, - FDReadStreamConnector, - FDStreamConnector, - StdStreamWriter -) -from girder_worker.docker.transforms import ( - ContainerStdErr, - ContainerStdOut, - _TemporaryVolumeBase, - TemporaryVolume -) -from girder_worker_utils import _walk_obj - +from girder_worker.docker.transforms import (ContainerStdErr, ContainerStdOut, + TemporaryVolume, + _TemporaryVolumeBase) BLACKLISTED_DOCKER_RUN_ARGS = ['tty', 'detach'] @@ -42,6 +40,11 @@ docker.transport.basehttpadapter.BaseHTTPAdapter._get_connection = ( lambda self, request, *args, proxies=None, **kwargs: self.get_connection( request.url, proxies)) +# JOB_STATUS = { +# 'SUCCESS': 'Success', +# 'FAILURE': "Failure", +# 'CANCELLED': 'Cancelled' +# } def _pull_image(image): @@ -153,6 +156,7 @@ class _SocketReader(FileDescriptorReader): with python 2 attach_socket(...) returns a socket like object, with python 3 it returns an instance of SocketIO. """ + def __init__(self, socket): self._socket = socket @@ -432,7 +436,6 @@ def _docker_run(task, image, pull_image=True, entrypoint=None, container_args=No results = [] if hasattr(task.request, 'girder_result_hooks'): results = (None,) * len(task.request.girder_result_hooks) - return results @@ -463,3 +466,304 @@ def docker_run(task, image, pull_image=True, entrypoint=None, container_args=Non return _docker_run( task, image, pull_image, entrypoint, container_args, volumes, remove_container, **kwargs) + + +# Class for SingularityTask similar to DockerTask +class SingularityTask(Task): + def _maybe_transform_argument(self, arg): + return super()._maybe_transform_argument( + arg, task=self, _default_temp_volume=self.request._default_temp_volume) + + def _maybe_transform_result(self, idx, result): + return super()._maybe_transform_result( + idx, result, _default_temp_volume=self.request._default_temp_volume) + + def __call__(self, *args, **kwargs): + default_temp_volume = _RequestDefaultTemporaryVolume() + self.request._default_temp_volume = default_temp_volume + + volumes = kwargs.setdefault('volumes', {}) + # If we have a list of volumes, the user provide a list of Volume objects, + # we need to transform them. + temp_volumes = [] + if isinstance(volumes, list): + # See if we have been passed any TemporaryVolume instances. + for v in volumes: + if isinstance(v, TemporaryVolume): + temp_volumes.append(v) + + # First call the transform method, this we replace default temp volumes + # with the instance associated with this task create above. That is any + # reference to TemporaryVolume.default + _walk_obj(volumes, self._maybe_transform_argument) + + # Now convert them to JSON + def _json(volume): + return volume._repr_json_() + + volumes = _walk_obj(volumes, _json) + # We then need to merge them into a single dict and it will be ready + # for docker-py. 
+ volumes = {k: v for volume in volumes for k, v in volume.items()} + kwargs['volumes'] = volumes + + volumes.update(default_temp_volume._repr_json_()) + + try: + super().__call__(*args, **kwargs) + finally: + threading.Thread( + target=self._cleanup_temp_volumes, + args=(temp_volumes, default_temp_volume), + daemon=False).start() + + def _cleanup_temp_volumes(self, temp_volumes, default_temp_volume): + # Set the permission to allow cleanup of temp directories + temp_volumes = [v for v in temp_volumes if os.path.exists(v.host_path)] + to_chmod = temp_volumes[:] + # If our default_temp_volume instance has been transformed then we + # know it has been used and we have to clean it up. + if default_temp_volume._transformed: + to_chmod.append(default_temp_volume) + temp_volumes.append(default_temp_volume) + + # if len(to_chmod) > 0: + # utils.chmod_writable([v.host_path for v in to_chmod]) + + # for v in temp_volumes: + # shutil.rmtree(v.host_path) + + +def _run_singularity_container(container_args=None, **kwargs): + image = kwargs['image'] + container_args = container_args or kwargs['container_args'] or [] + try: + container_args = _process_container_args(container_args, kwargs) + + logger.info('Running container: image: %s args: %s kwargs: %s' + % (image, container_args, kwargs)) + + slurm_run_command = _generate_slurm_script(container_args, kwargs) + + slurm_config = _get_slurm_config(kwargs) + + return [slurm_run_command, slurm_config] + except Exception as e: + logger.exception(e) + raise Exception(e) + + +def singularity_run(task, **kwargs): + volumes = kwargs.pop('volumes', {}) + container_args = kwargs.pop('container_args', []) + pull_image = kwargs['pull_image'] or False + stream_connectors = kwargs['stream_connectors'] or [] + image = kwargs['image'] or '' + entrypoint = None + if pull_image: + pass + # ADD some code + + run_kwargs = { + 'tty': False, + 'volumes': volumes + } + + # Allow run args to be overridden,filter out any we don't want to override + extra_run_kwargs = {k: v for k, v in kwargs.items() if k not in BLACKLISTED_DOCKER_RUN_ARGS} + run_kwargs.update(extra_run_kwargs) + + # Make entrypoint as pwd + if entrypoint is not None: + run_kwargs['entrypoint'] = entrypoint + + log_file_name = kwargs['log_file'] + + container_args, read_streams, write_streams = _handle_streaming_args(container_args) + # MODIFIED FOR SINGULARITY (CHANGE CODE OF SINGULARITY CONTAINER) + slurm_run_command, slurm_config = _run_singularity_container(container_args, **run_kwargs) + for connector in stream_connectors: + if isinstance(connector, FDReadStreamConnector): + read_streams.append(connector) + elif isinstance(connector, FDWriteStreamConnector): + write_streams.append(connector) + else: + raise TypeError( + "Expected 'FDReadStreamConnector' or 'FDWriterStreamConnector', received '%s'" + % type(connector)) + try: + monitor_thread = _monitor_singularity_job( + task, slurm_run_command, slurm_config, log_file_name) + + def singularity_exit_condition(): + return not monitor_thread.is_alive() + utils.select_loop(exit_condition=singularity_exit_condition, + readers=read_streams, + writers=write_streams) + finally: + logger.info('DONE') + + results = [] + if hasattr(task.request, 'girder_result_hooks'): + results = (None,) * len(task.request.girder_result_hooks) + + return results + + +# This function is used to check whether we need to switch to singularity or not. +def use_singularity(): + ''' + #This needs to be uncommented. Only for testing purposes. 
+ ''' + # runtime = os.environ.get('RUNTIME') + # if runtime == 'SINGULARITY': + # return True + # if runtime == 'DOCKER': + # return False + # try: + # #Check whether we are connected to a docker socket. + # with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as s: + # return s.connect_ex('/var/run/docker.sock') != 0 + # except socket.error: + return False + # return True + + +@app.task +def container_backend(**kwargs): + return use_singularity() + + +def _generate_slurm_script(container_args, kwargs): + container_args = container_args or [] + image = kwargs.pop('image', None) + singularity_command = [] + if not image: + raise Exception(' Issue with Slicer_Cli_Plugin_Image. Plugin Not available') + SIF_DIRECTORY = os.getenv('SIF_IMAGE_PATH') + image_full_path = os.path.join(SIF_DIRECTORY, image) + # Code to check for allocating multiple gpus. + try: + gpu_index = container_args.index('--gpu') + gpus = int(container_args[gpu_index+1]) + kwargs['--gres'] = f'gres:gpu:{gpus}' if gpus > 1 else 'gres:gpu:1' + kwargs['--partition'] = 'gpu' + singularity_command.append('--nv') + except ValueError: + kwargs['gpu'] = None + try: + pwd = kwargs['pwd'] + if not pwd: + raise Exception('PWD cannot be empty') + singularity_command.extend(['--pwd', pwd]) + singularity_command.append(image_full_path) + singularity_command.extend(container_args) + except Exception as e: + logger.info(f'Error occured - {e}') + raise Exception(f'Error Occured - {e}') + return singularity_command + + +def _monitor_singularity_job(task, slurm_run_command, slurm_config, log_file_name): + """Create a drmaa session and monitor the job accordingly""" + decodestatus = {drmaa.JobState.UNDETERMINED: 'process status cannot be determined', + drmaa.JobState.QUEUED_ACTIVE: 'job is queued and active', + drmaa.JobState.SYSTEM_ON_HOLD: 'job is queued and in system hold', + drmaa.JobState.USER_ON_HOLD: 'job is queued and in user hold', + drmaa.JobState.USER_SYSTEM_ON_HOLD: 'job is queued and in user and system hold', + drmaa.JobState.RUNNING: 'job is running', + drmaa.JobState.SYSTEM_SUSPENDED: 'job is system suspended', + drmaa.JobState.USER_SUSPENDED: 'job is user suspended', + drmaa.JobState.DONE: 'job finished normally', + drmaa.JobState.FAILED: 'job finished, but failed'} + temp_directory = os.getenv('TMPDIR') + submit_dir = '/blue/pinaki.sarder/rc-svc-pinaki.sarder-web/submission' + + def job_monitor(): + s = drmaa.Session() + s.initialize() + jt = s.createJobTemplate() + jt.workingDirectory = temp_directory + jt.remoteCommand = os.path.join(submit_dir, 'submit.sh') + jt.nativeSpecification = slurm_config + jt.args = slurm_run_command + jt.outputPath = ':' + log_file_name + jt.errorPath = ':' + log_file_name + try: + jobid = s.runJob(jt) + logger.info((f'Submitted singularity job with jobid {jobid}')) + with open(log_file_name, 'r') as f: + while True: + job_info = s.jobStatus(jobid) + where = f.tell() + line = f.readlines() + if line: + print(''.join(line), end='') + else: + f.seek(where) + if job_info in [drmaa.JobState.DONE, drmaa.JobState.FAILED]: + s.deleteJobTemplate(jt) + break + elif task.canceled: + s.control(jobid, drmaa.JobControlAction.TERMINATE) + s.deleteJobTemplate(jt) + break + time.sleep(5) # Sleep to avoid busy waiting + exit_status = s.jobStatus(jobid) + logger.info(decodestatus[exit_status]) + s.exit() + return exit_status + + except Exception as e: + s.deleteJobTemplate(jt) + print(f'Error Occured {e}') + + # Start the job monitor in a new thread + monitor_thread = threading.Thread(target=job_monitor, 
daemon=True) + monitor_thread.start() + + return monitor_thread + + +def _process_container_args(container_args, kwargs): + volumes = kwargs['volumes'] or {} + + def find_matching_volume_key(path): + for key, value in volumes.items(): + if path.startswith(value['bind']): + # Append the suffix from the original path that isn't part of the 'bind' + # path #Replace bind path later + suffix = path[len(value['bind']):] if value['bind'] != path else '' + if 'assetstore' in key: + key = '/blue/pinaki.sarder/rc-svc-pinaki.sarder-web' + key + # Replace spaces in suffix with underscores + new_key = key + suffix.replace(' ', '_') + return new_key + return path # Replace spaces in paths that don't match any volume + try: + # Replace paths in container_args with their corresponding volume keys + updated_container_args = [str(find_matching_volume_key(arg)) for arg in container_args] + except Exception as e: + logger.info(f'error {e}') + return updated_container_args + + +def _get_slurm_config(kwargs): + # Use this function to add or modify any configuration parameters for the SLURM job + config_defaults = { + '--qos': os.getenv('SLURM_QOS'), + '--account': os.getenv('SLURM_ACCOUNT'), + '--mem': os.getenv('SLURM_MEMORY', '16000'), + '--ntasks': os.getenv('SLURM_NTASKS', '2'), + '--time': os.getenv('SLURM_TIME', '00:30'), + '--partition': os.getenv('SLURM_PARTITION', 'hpg2-compute'), + '--gres': os.getenv('SLURM_GRES_CONFIG'), + '--cpus-per-task': os.getenv('SLURM_CPUS', '2') + } + + config = {k: kwargs.get(k, config_defaults[k]) for k in config_defaults} + + slurm_config = ' '.join(f'{k}={v}' for k, v in config.items() if v is not None) + + logger.info(f'SLURM CONFIG = {slurm_config}') + return slurm_config diff --git a/girder_worker/docker/utils.py b/girder_worker/docker/utils.py index 21c89084..0cf0daf8 100644 --- a/girder_worker/docker/utils.py +++ b/girder_worker/docker/utils.py @@ -2,6 +2,7 @@ import os import select import uuid +from types import SimpleNamespace import docker from docker.errors import DockerException @@ -112,3 +113,14 @@ def chmod_writable(host_paths): except DockerException: logger.exception('Error setting perms on docker volumes %s.' % host_paths) raise + + +# JOB_STATUS = { +# 'SUCCESS': 'Success', +# 'FAILURE': "Failure", +# 'CANCELLED': 'Cancelled' +# } + +# def job_status_codes(): +# statusCodes = SimpleNamespace(JOB_STATUS) +# return statusCodes diff --git a/girder_worker/entrypoint.py b/girder_worker/entrypoint.py index c6cd67ae..c1881d83 100644 --- a/girder_worker/entrypoint.py +++ b/girder_worker/entrypoint.py @@ -1,10 +1,12 @@ from importlib import import_module + import celery +# Delete after testing +from girder_jobs.models.job import Job from girder_worker_utils import decorators - +# from girder_worker.docker.tasks import use_singularity from stevedore import extension - #: Defines the namespace used for plugin entrypoints NAMESPACE = 'girder_worker_plugins' @@ -58,15 +60,31 @@ def get_module_tasks(module_name): for name, func in vars(module).items(): full_name = '%s.%s' % (module_name, name) + # Just for debugging + # job = Job(). 
+ job = Job().updateJob( + job, + log=f'The fullname of function is {full_name} and func is {func}', + status='Error', + ) if not hasattr(func, '__call__'): # filter out objects that are not callable continue - + # if name != 'singularity_run' or name != 'run': + # continue + # if (use_singularity() and name == 'docker_run') or (not use_singularity() and name == 'singularity_run'): + # continue try: decorators.get_description_attribute(func) tasks[full_name] = func except decorators.MissingDescriptionException: - pass + # Just for testing + job = Job().updateJob( + job, + log=f'The fullname of function is {full_name} and func is {func}', + status='Error', + ) + # pass return tasks diff --git a/girder_worker/utils.py b/girder_worker/utils.py index 34c503d7..660d5a63 100644 --- a/girder_worker/utils.py +++ b/girder_worker/utils.py @@ -1,13 +1,12 @@ import time -from girder_worker_utils.tee import Tee, tee_stderr, tee_stdout - import requests -from requests import HTTPError - # Disable urllib3 warnings about certificate validation. As they are printed in the console, the # messages are sent to Girder, creating an infinite loop. import urllib3 +from girder_worker_utils.tee import Tee, tee_stderr, tee_stdout +from requests import HTTPError + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) @@ -77,9 +76,16 @@ def _job_manager(request=None, headers=None, kwargs=None): if hasattr(request, 'girder_client_session_kwargs'): girder_client_session_kwargs = request.girder_client_session_kwargs + # # Modify better later + # if hasattr(request, 'girder_result_hooks'): + # girder_result_hooks = request.girder_result_hooks + # grh_object = girder_result_hooks[0] + # grh_object['gc']['urlBase'] = grh_object['gc']['urlBase'].replace( + # 'girder:8080', '0.0.0.0:8101') + if hasattr(request, 'jobInfoSpec'): jobSpec = request.jobInfoSpec - + # jobSpec['url'] = jobSpec['url'].replace('girder:8080', '0.0.0.0:8101') # We are being called from revoked signal elif headers is not None and \ 'jobInfoSpec' in headers: diff --git a/requirements-dev.in b/requirements-dev.in index f27839a5..cbaa6de1 100644 --- a/requirements-dev.in +++ b/requirements-dev.in @@ -16,3 +16,4 @@ pytest-cov Sphinx sphinx-rtd-theme tox +drmaa diff --git a/requirements.in b/requirements.in index 713fcb41..ab518b5b 100644 --- a/requirements.in +++ b/requirements.in @@ -10,3 +10,5 @@ stevedore jsonpickle girder_worker_utils>=0.8.4 docker>=2.6.0 +drmaa + diff --git a/requirements.txt b/requirements.txt index 26bcce9c..caa75f59 100644 --- a/requirements.txt +++ b/requirements.txt @@ -375,6 +375,7 @@ zipp==3.18.2 # via # importlib-metadata # importlib-resources +drmaa # The following packages are considered to be unsafe in a requirements file: # setuptools diff --git a/setup.py b/setup.py index 106c631d..9295716c 100644 --- a/setup.py +++ b/setup.py @@ -19,8 +19,8 @@ import os import shutil -import setuptools +import setuptools from setuptools.command.install import install @@ -73,6 +73,7 @@ def run(self, *args, **kwargs): setuptools.setup( name='girder-worker', use_scm_version={'local_scheme': prerelease_local_scheme}, + # version='0.12.1', setup_requires=['setuptools_scm'], description='Batch execution engine built on celery.', long_description=readme, @@ -109,7 +110,8 @@ def run(self, *args, **kwargs): 'girder-worker-config = girder_worker.configure:main' ], 'girder_worker_plugins': [ - 'docker = girder_worker.docker:DockerPlugin [docker]' + 'docker = girder_worker.docker:DockerPlugin [docker]', + # 'gwexample = 
girder_worker.examples.plugin_example.gwexample:GWExamplePlugin' ], 'girder_worker._test_plugins.valid_plugins': [ 'plugin1 = girder_worker._test_plugins.plugins:TestPlugin1', diff --git a/tests/integration/requirements.txt b/tests/integration/requirements.txt index ef6bb370..e08d59e0 100644 --- a/tests/integration/requirements.txt +++ b/tests/integration/requirements.txt @@ -5,3 +5,4 @@ requests-toolbelt girder_client==2.3.0 girder-worker-utils>=0.8.0 celery>=4.0.0 +drmaa diff --git a/tox.ini b/tox.ini index 84f17d64..6237090d 100644 --- a/tox.ini +++ b/tox.ini @@ -51,6 +51,18 @@ deps = flake8-quotes commands = flake8 {posargs} girder_worker tests +[testenv:format] +skipsdist = true +skip_install = true +deps = + autopep8 + isort + unify +commands = + isort {posargs:.} + autopep8 -ria girder_worker + unify --in-place --recursive girder_worker + [testenv:release] skip_install = true skipsdist = true From 9524112221b2b1733be70e88734cbf19edde5723 Mon Sep 17 00:00:00 2001 From: "User pinaki.sarder-web" Date: Mon, 24 Jun 2024 13:05:43 -0400 Subject: [PATCH 15/28] Intermediate commit --- girder_worker/docker/tasks/__init__.py | 9 --------- girder_worker/docker/utils.py | 10 ---------- 2 files changed, 19 deletions(-) diff --git a/girder_worker/docker/tasks/__init__.py b/girder_worker/docker/tasks/__init__.py index 0bf7abd1..fe3d17ae 100644 --- a/girder_worker/docker/tasks/__init__.py +++ b/girder_worker/docker/tasks/__init__.py @@ -40,15 +40,6 @@ BLACKLISTED_DOCKER_RUN_ARGS = ['tty', 'detach'] -# JOB_STATUS = utils.job_status_codes() - - -JOB_STATUS = { - 'SUCCESS': 'Success', - 'FAILURE': "Failure", - 'CANCELLED': 'Cancelled' - } - def _pull_image(image): """ diff --git a/girder_worker/docker/utils.py b/girder_worker/docker/utils.py index c8e55a14..f18cfc91 100644 --- a/girder_worker/docker/utils.py +++ b/girder_worker/docker/utils.py @@ -106,14 +106,4 @@ def chmod_writable(host_paths): raise -JOB_STATUS = { - 'SUCCESS': 'Success', - 'FAILURE': "Failure", - 'CANCELLED': 'Cancelled' - } - -def job_status_codes(): - statusCodes = SimpleNamespace(JOB_STATUS) - return statusCodes - From 31367c6f612518974195e6dfabcf3fbe516e3715 Mon Sep 17 00:00:00 2001 From: "User pinaki.sarder-web" Date: Mon, 1 Jul 2024 11:47:24 -0400 Subject: [PATCH 16/28] Fixed GPU and CPU allocation --- girder_worker/docker/tasks/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/girder_worker/docker/tasks/__init__.py b/girder_worker/docker/tasks/__init__.py index fe3d17ae..9faef5c0 100644 --- a/girder_worker/docker/tasks/__init__.py +++ b/girder_worker/docker/tasks/__init__.py @@ -631,7 +631,7 @@ def _generate_slurm_script(container_args,kwargs): nvidia.set_nvidia_params(kwargs,singularity_command,gpus) except ValueError as e: if kwargs['nvidia']: - nvidia.set_nvidia_params(kwargs,singularity_command) + nvidia.set_nvidia_params(kwargs,singularity_command) try: pwd = kwargs['pwd'] if not pwd: @@ -701,7 +701,7 @@ def job_monitor(): # Start the job monitor in a new thread - monitor_thread = threading.Thread(target=job_monitor,daemon=True) + monitor_thread = threading.Thread(target=job_monitor,daemon=False) monitor_thread.start() return monitor_thread @@ -712,7 +712,7 @@ def _process_container_args(container_args,kwargs): def find_matching_volume_key(path): for key, value in volumes.items(): if path.startswith(value['bind']): - # Append the suffix from the original path that isn't part of the 'bind' path #Replace bind path later + # Append the suffix from the original path that isn't part of the 
'bind' path suffix = path[len(value['bind']):] if value['bind'] != path else '' if 'assetstore' in key: key = '/blue/pinaki.sarder/rc-svc-pinaki.sarder-web' + key From 3b08406312a559ad748c4ff1b0f865a7e0541b2a Mon Sep 17 00:00:00 2001 From: "User pinaki.sarder-web" Date: Wed, 10 Jul 2024 09:55:05 -0400 Subject: [PATCH 17/28] Added code to clean up tmp folders after job --- girder_worker/docker/nvidia.py | 5 +++-- girder_worker/docker/tasks/__init__.py | 18 ++++++------------ girder_worker/docker/utils.py | 15 +++++++++++++++ 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/girder_worker/docker/nvidia.py b/girder_worker/docker/nvidia.py index 27272a86..1edc2144 100644 --- a/girder_worker/docker/nvidia.py +++ b/girder_worker/docker/nvidia.py @@ -16,8 +16,9 @@ def set_nvidia_params(kwargs:dict,singularity_command:list,gpus:int=1): Returns: None ''' - kwargs['--gres'] = f"gres:gpu:{gpus}" if gpus > 1 else f"gres:gpu:1" + kwargs['--gres'] = f"gres:gpu:a100:{gpus}" if gpus > 1 else f"gres:gpu:a100:1" kwargs['--partition'] = 'gpu' + kwargs['--mem'] = '32000' #Reducing CPU count for gpu-based job for resource conservation - kwargs['--cpus-per-task'] = '2' + kwargs['--cpus-per-task'] = '8' singularity_command.append('--nv') \ No newline at end of file diff --git a/girder_worker/docker/tasks/__init__.py b/girder_worker/docker/tasks/__init__.py index 9faef5c0..09a0c34c 100644 --- a/girder_worker/docker/tasks/__init__.py +++ b/girder_worker/docker/tasks/__init__.py @@ -511,18 +511,12 @@ def _json(volume): def _cleanup_temp_volumes(self, temp_volumes, default_temp_volume): # Set the permission to allow cleanup of temp directories temp_volumes = [v for v in temp_volumes if os.path.exists(v.host_path)] - to_chmod = temp_volumes[:] - # If our default_temp_volume instance has been transformed then we - # know it has been used and we have to clean it up. if default_temp_volume._transformed: - to_chmod.append(default_temp_volume) temp_volumes.append(default_temp_volume) - - # if len(to_chmod) > 0: - # utils.chmod_writable([v.host_path for v in to_chmod]) - - # for v in temp_volumes: - # shutil.rmtree(v.host_path) + + for v in temp_volumes: + utils.remove_tmp_folder_apptainer(v.host_path) + def _run_singularity_container(container_args=None,**kwargs): @@ -732,8 +726,8 @@ def _get_slurm_config(kwargs): '--qos': os.getenv('SLURM_QOS'), '--account': os.getenv('SLURM_ACCOUNT'), '--mem':os.getenv('SLURM_MEMORY','16000'), - '--ntasks': os.getenv("SLURM_NTASKS",'2'), - '--time': os.getenv("SLURM_TIME",'00:30'), + '--ntasks': os.getenv("SLURM_NTASKS",'1'), + '--time': os.getenv("SLURM_TIME",'72:00'), '--partition':os.getenv('SLURM_PARTITION','hpg2-compute'), '--gres':os.getenv('SLURM_GRES_CONFIG'), '--cpus-per-task':os.getenv('SLURM_CPUS','4') diff --git a/girder_worker/docker/utils.py b/girder_worker/docker/utils.py index f18cfc91..11e44d7d 100644 --- a/girder_worker/docker/utils.py +++ b/girder_worker/docker/utils.py @@ -5,6 +5,8 @@ import docker from docker.errors import DockerException from girder_worker import logger +import re +import subprocess def select_loop(exit_condition=lambda: True, readers=None, writers=None): @@ -106,4 +108,17 @@ def chmod_writable(host_paths): raise +def remove_tmp_folder_apptainer(host_path=None): + ''' + This function will run after the slurm job completes and returns. If a temp folder is created in the temp directory to + do file I/O operations before/while the job was run, we need to clean up by removing the folder. 
+ ''' + if not host_path: + return + temp_path = os.getenv("TMPDIR") + #Cautiously check the host path before removing it from the filesystem. + if temp_path in host_path: + if os.path.exists(host_path): + subprocess.call(['rm','-rf',host_path]) + From c7a9d3b2d8e048be7efdfcda3997457b3385789f Mon Sep 17 00:00:00 2001 From: willdunklin Date: Mon, 22 Jul 2024 16:29:06 -0400 Subject: [PATCH 18/28] Split girder-worker-singularity into a separate package --- .gitignore | 2 +- girder_worker/docker/tasks/__init__.py | 39 ++-- girder_worker/singularity/__init__.py | 0 .../girder_worker_singularity/__init__.py | 10 + .../tasks/__init__.py | 156 ++++++++++++++++ .../girder_worker_singularity/tasks/uf.py | 173 ++++++++++++++++++ girder_worker/singularity/pyproject.toml | 3 + girder_worker/singularity/setup.py | 29 +++ setup.py | 11 +- 9 files changed, 397 insertions(+), 26 deletions(-) create mode 100644 girder_worker/singularity/__init__.py create mode 100644 girder_worker/singularity/girder_worker_singularity/__init__.py create mode 100644 girder_worker/singularity/girder_worker_singularity/tasks/__init__.py create mode 100644 girder_worker/singularity/girder_worker_singularity/tasks/uf.py create mode 100644 girder_worker/singularity/pyproject.toml create mode 100644 girder_worker/singularity/setup.py diff --git a/.gitignore b/.gitignore index 3d781a34..437f39ca 100644 --- a/.gitignore +++ b/.gitignore @@ -4,7 +4,7 @@ /docs/_build/ /tmp/ __pycache__/ -/girder_worker.egg-info/ +*.egg-info/ /girder-worker-*.tar.gz *.retry .vagrant diff --git a/girder_worker/docker/tasks/__init__.py b/girder_worker/docker/tasks/__init__.py index fad79577..9a54834f 100644 --- a/girder_worker/docker/tasks/__init__.py +++ b/girder_worker/docker/tasks/__init__.py @@ -390,9 +390,6 @@ def _cleanup_temp_volumes(self, temp_volumes, default_temp_volume): shutil.rmtree(v.host_path) - - - def _docker_run(task, image, pull_image=True, entrypoint=None, container_args=None, volumes=None, remove_container=True, stream_connectors=None, **kwargs): volumes = volumes or {} @@ -578,7 +575,7 @@ def singularity_run(task,**kwargs): 'volumes': volumes } - # Allow run args to be overridden,filter out any we don't want to override + # Allow run args to be overridden,filter out any we don't want to override extra_run_kwargs = {k: v for k, v in kwargs.items() if k not in BLACKLISTED_DOCKER_RUN_ARGS} run_kwargs.update(extra_run_kwargs) @@ -617,9 +614,9 @@ def singularity_exit_condition(): return results #This function is used to check whether we need to switch to singularity or not. def use_singularity(): - ''' - #This needs to be uncommented. Only for testing purposes. - ''' + ''' + #This needs to be uncommented. Only for testing purposes. + ''' # runtime = os.environ.get('RUNTIME') # if runtime == 'SINGULARITY': # return True @@ -630,8 +627,8 @@ def use_singularity(): # with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as s: # return s.connect_ex('/var/run/docker.sock') != 0 # except socket.error: - return False - # return True + # return False + return True @app.task @@ -647,7 +644,7 @@ def _generate_slurm_script(container_args, kwargs): raise Exception(' Issue with Slicer_Cli_Plugin_Image. Plugin Not available') SIF_DIRECTORY = os.getenv('SIF_IMAGE_PATH') image_full_path = os.path.join(SIF_DIRECTORY,image) - #Code to check for allocating multiple gpus. + #Code to check for allocating multiple gpus.
try: gpu_index = container_args.index('--gpu') gpus = int(container_args[gpu_index+1]) @@ -655,8 +652,8 @@ def _generate_slurm_script(container_args, kwargs): except ValueError as e: if kwargs['nvidia']: nvidia.set_nvidia_params(kwargs,singularity_command) - try: - pwd = kwargs['pwd'] + try: + pwd = kwargs['pwd'] if not pwd: raise Exception("PWD cannot be empty") singularity_command.extend(['--pwd',pwd]) @@ -670,15 +667,15 @@ def _generate_slurm_script(container_args, kwargs): def _monitor_singularity_job(task,slurm_run_command,slurm_config,log_file_name): """Create a drmaa session and monitor the job accordingly""" decodestatus = {drmaa.JobState.UNDETERMINED: 'process status cannot be determined', - drmaa.JobState.QUEUED_ACTIVE: 'job is queued and active', - drmaa.JobState.SYSTEM_ON_HOLD: 'job is queued and in system hold', - drmaa.JobState.USER_ON_HOLD: 'job is queued and in user hold', - drmaa.JobState.USER_SYSTEM_ON_HOLD: 'job is queued and in user and system hold', - drmaa.JobState.RUNNING: 'job is running', - drmaa.JobState.SYSTEM_SUSPENDED: 'job is system suspended', - drmaa.JobState.USER_SUSPENDED: 'job is user suspended', - drmaa.JobState.DONE: 'job finished normally', - drmaa.JobState.FAILED: 'job finished, but failed'} + drmaa.JobState.QUEUED_ACTIVE: 'job is queued and active', + drmaa.JobState.SYSTEM_ON_HOLD: 'job is queued and in system hold', + drmaa.JobState.USER_ON_HOLD: 'job is queued and in user hold', + drmaa.JobState.USER_SYSTEM_ON_HOLD: 'job is queued and in user and system hold', + drmaa.JobState.RUNNING: 'job is running', + drmaa.JobState.SYSTEM_SUSPENDED: 'job is system suspended', + drmaa.JobState.USER_SUSPENDED: 'job is user suspended', + drmaa.JobState.DONE: 'job finished normally', + drmaa.JobState.FAILED: 'job finished, but failed'} temp_directory = os.getenv('TMPDIR') submit_dir = '/blue/pinaki.sarder/rc-svc-pinaki.sarder-web/submission' def job_monitor(): diff --git a/girder_worker/singularity/__init__.py b/girder_worker/singularity/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/girder_worker/singularity/girder_worker_singularity/__init__.py b/girder_worker/singularity/girder_worker_singularity/__init__.py new file mode 100644 index 00000000..03195665 --- /dev/null +++ b/girder_worker/singularity/girder_worker_singularity/__init__.py @@ -0,0 +1,10 @@ +from girder_worker import GirderWorkerPluginABC + + +class SingularityPlugin(GirderWorkerPluginABC): + + def __init__(self, app, *args, **kwargs): + self.app = app + + def task_imports(self): + return ['girder_worker_singularity.tasks'] diff --git a/girder_worker/singularity/girder_worker_singularity/tasks/__init__.py b/girder_worker/singularity/girder_worker_singularity/tasks/__init__.py new file mode 100644 index 00000000..4e659a18 --- /dev/null +++ b/girder_worker/singularity/girder_worker_singularity/tasks/__init__.py @@ -0,0 +1,156 @@ +import os +import threading + +from girder_worker_utils import _walk_obj +# from slicer_cli_web.singularity.utils import switch_to_sif_image_folder + +from girder_worker import logger +from girder_worker.app import Task, app +from girder_worker.docker import utils +from girder_worker.docker.io import FDReadStreamConnector, FDWriteStreamConnector, FDWriteStreamConnector, FDReadStreamConnector +from girder_worker.docker.tasks import _RequestDefaultTemporaryVolume, _handle_streaming_args +from girder_worker.docker.transforms import TemporaryVolume + +BLACKLISTED_DOCKER_RUN_ARGS = ['tty', 'detach'] + + +print('!!!!!!!!!!!!!!') + +# Class for 
SingularityTask similar to DockerTask +class SingularityTask(Task): + def _maybe_transform_argument(self, arg): + return super()._maybe_transform_argument( + arg, task=self, _default_temp_volume=self.request._default_temp_volume) + + def _maybe_transform_result(self, idx, result): + return super()._maybe_transform_result( + idx, result, _default_temp_volume=self.request._default_temp_volume) + + def __call__(self, *args, **kwargs): + default_temp_volume = _RequestDefaultTemporaryVolume() + self.request._default_temp_volume = default_temp_volume + + volumes = kwargs.setdefault('volumes', {}) + # If we have a list of volumes, the user provide a list of Volume objects, + # we need to transform them. + temp_volumes = [] + if isinstance(volumes, list): + # See if we have been passed any TemporaryVolume instances. + for v in volumes: + if isinstance(v, TemporaryVolume): + temp_volumes.append(v) + + # First call the transform method, this we replace default temp volumes + # with the instance associated with this task create above. That is any + # reference to TemporaryVolume.default + _walk_obj(volumes, self._maybe_transform_argument) + + # Now convert them to JSON + def _json(volume): + return volume._repr_json_() + + volumes = _walk_obj(volumes, _json) + # We then need to merge them into a single dict and it will be ready + # for docker-py. + volumes = {k: v for volume in volumes for k, v in volume.items()} + kwargs['volumes'] = volumes + + volumes.update(default_temp_volume._repr_json_()) + + try: + super().__call__(*args, **kwargs) + finally: + threading.Thread( + target=self._cleanup_temp_volumes, + args=(temp_volumes, default_temp_volume), + daemon=False).start() + + def _cleanup_temp_volumes(self, temp_volumes, default_temp_volume): + # Set the permission to allow cleanup of temp directories + temp_volumes = [v for v in temp_volumes if os.path.exists(v.host_path)] + if default_temp_volume._transformed: + temp_volumes.append(default_temp_volume) + + for v in temp_volumes: + utils.remove_tmp_folder_apptainer(v.host_path) + + +def singularity_run(task,**kwargs): + volumes = kwargs.pop('volumes',{}) + container_args = kwargs.pop('container_args',[]) + pull_image = kwargs['pull_image'] or False + stream_connectors = kwargs['stream_connectors'] or [] + image = kwargs.get('image') or '' + entrypoint = None + if not image: + logger.exception(f"Image name cannot be emptu") + raise Exception(f"Image name cannot be empty") + + run_kwargs = { + 'tty': False, + 'volumes': volumes + } + + # Allow run args to be overridden,filter out any we don't want to override + extra_run_kwargs = {k: v for k, v in kwargs.items() if k not in BLACKLISTED_DOCKER_RUN_ARGS} + run_kwargs.update(extra_run_kwargs) + + #Make entrypoint as pwd + if entrypoint is not None: + run_kwargs['entrypoint'] = entrypoint + + log_file_name = kwargs['log_file'] + + container_args,read_streams,write_streams = _handle_streaming_args(container_args) + #MODIFIED FOR SINGULARITY (CHANGE CODE OF SINGULARITY CONTAINER) + for connector in stream_connectors: + if isinstance(connector, FDReadStreamConnector): + read_streams.append(connector) + elif isinstance(connector, FDWriteStreamConnector): + write_streams.append(connector) + else: + raise TypeError( + "Expected 'FDReadStreamConnector' or 'FDWriterStreamConnector', received '%s'" + % type(connector)) + + from uf import slurm_dispatch + slurm_dispatch(task, container_args, run_kwargs, read_streams, write_streams, log_file_name) + # slurm_run_command,slurm_config = 
_run_singularity_container(container_args,**run_kwargs) + # try: + # monitor_thread = _monitor_singularity_job(task,slurm_run_command,slurm_config,log_file_name) + # def singularity_exit_condition(): + # return not monitor_thread.is_alive() + # utils.select_loop(exit_condition = singularity_exit_condition, + # readers= read_streams, + # writers = write_streams ) + # finally: + # logger.info('DONE') + + results = [] + if hasattr(task.request,'girder_result_hooks'): + results = (None,) * len(task.request.girder_result_hooks) + + return results + +#This function is used to check whether we need to switch to singularity or not. +def use_singularity(): + ''' + #This needs to be uncommented. Only for testing purposes. + ''' + # runtime = os.environ.get('RUNTIME') + # if runtime == 'SINGULARITY': + # return True + # if runtime == 'DOCKER': + # return False + # try: + # #Check whether we are connected to a docker socket. + # with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as s: + # return s.connect_ex('/var/run/docker.sock') != 0 + # except socket.error: + # return False + return True + + +@app.task +def container_backend(**kwargs): + return use_singularity() \ No newline at end of file diff --git a/girder_worker/singularity/girder_worker_singularity/tasks/uf.py b/girder_worker/singularity/girder_worker_singularity/tasks/uf.py new file mode 100644 index 00000000..f7d3f9f7 --- /dev/null +++ b/girder_worker/singularity/girder_worker_singularity/tasks/uf.py @@ -0,0 +1,173 @@ +import drmaa +import time +import threading +import os +from girder_worker.docker import utils +from girder_worker import logger + +try: + from girder_worker.docker import nvidia +except ImportError: + pass + + +def slurm_dispatch(task, container_args, run_kwargs, read_streams, write_streams, log_file_name): + slurm_run_command,slurm_config = _run_singularity_container(container_args,**run_kwargs) + try: + monitor_thread = _monitor_singularity_job(task,slurm_run_command,slurm_config,log_file_name) + def singularity_exit_condition(): + return not monitor_thread.is_alive() + utils.select_loop(exit_condition=singularity_exit_condition, + readers=read_streams, + writers=write_streams) + finally: + logger.info('DONE') + + +def _monitor_singularity_job(task,slurm_run_command,slurm_config,log_file_name): + """Create a drmaa session and monitor the job accordingly""" + decodestatus = {drmaa.JobState.UNDETERMINED: 'process status cannot be determined', + drmaa.JobState.QUEUED_ACTIVE: 'job is queued and active', + drmaa.JobState.SYSTEM_ON_HOLD: 'job is queued and in system hold', + drmaa.JobState.USER_ON_HOLD: 'job is queued and in user hold', + drmaa.JobState.USER_SYSTEM_ON_HOLD: 'job is queued and in user and system hold', + drmaa.JobState.RUNNING: 'job is running', + drmaa.JobState.SYSTEM_SUSPENDED: 'job is system suspended', + drmaa.JobState.USER_SUSPENDED: 'job is user suspended', + drmaa.JobState.DONE: 'job finished normally', + drmaa.JobState.FAILED: 'job finished, but failed'} + temp_directory = os.getenv('TMPDIR') + submit_dir = '/blue/pinaki.sarder/rc-svc-pinaki.sarder-web/submission' + def job_monitor(): + s = drmaa.Session() + s.initialize() + jt = s.createJobTemplate() + jt.workingDirectory = temp_directory + jt.remoteCommand = os.path.join(submit_dir, 'submit.sh') + jt.nativeSpecification = slurm_config + jt.args = slurm_run_command + jt.outputPath = ':' + log_file_name + jt.errorPath = ':' + log_file_name + try: + jobid = s.runJob(jt) + logger.info((f'Submitted singularity job with jobid {jobid}')) + with 
open(log_file_name, 'r') as f: + while True: + job_info = s.jobStatus(jobid) + where = f.tell() + line = f.readlines() + if line: + print(''.join(line), end='') + else: + f.seek(where) + if job_info in [drmaa.JobState.DONE, drmaa.JobState.FAILED]: + s.deleteJobTemplate(jt) + break + elif task.canceled: + s.control(jobid, drmaa.JobControlAction.TERMINATE) + s.deleteJobTemplate(jt) + break + time.sleep(5) # Sleep to avoid busy waiting + exit_status = s.jobStatus(jobid) + logger.info(decodestatus[exit_status]) + s.exit() + return exit_status + + except Exception as e: + s.deleteJobTemplate(jt) + print(f'Error Occured {e}') + + # Start the job monitor in a new thread + monitor_thread = threading.Thread(target=job_monitor, daemon=True) + monitor_thread.start() + + return monitor_thread + + +def _run_singularity_container(container_args=None,**kwargs): + image = kwargs['image'] + container_args = container_args or kwargs['container_args'] or [] + try: + container_args = _process_container_args(container_args, kwargs) + + logger.info('Running container: image: %s args: %s kwargs: %s' + % (image, container_args, kwargs)) + + slurm_run_command = _generate_slurm_script(container_args,kwargs) + + slurm_config = _get_slurm_config(kwargs) + + return [slurm_run_command, slurm_config] + except Exception as e: + logger.exception(e) + raise Exception(e) + + +def _process_container_args(container_args,kwargs): + volumes = kwargs['volumes'] or {} + def find_matching_volume_key(path): + for key, value in volumes.items(): + if path.startswith(value['bind']): + # Append the suffix from the original path that isn't part of the 'bind' path + suffix = path[len(value['bind']):] if value['bind'] != path else '' + if 'assetstore' in key: + key = '/blue/pinaki.sarder/rc-svc-pinaki.sarder-web' + key + new_key = key + suffix.replace(" ", "_") # Replace spaces in suffix with underscores + return new_key + return path # Replace spaces in paths that don't match any volume + try: + # Replace paths in container_args with their corresponding volume keys + updated_container_args = [str(find_matching_volume_key(arg)) for arg in container_args] + except Exception as e: + logger.info(f"error {e}") + return updated_container_args + + +def _generate_slurm_script(container_args, kwargs): + container_args = container_args or [] + image = kwargs.pop('image', None) + singularity_command = [] + if not image: + raise Exception(' Issue with Slicer_Cli_Plugin_Image. Plugin Not available') + SIF_DIRECTORY = os.getenv('SIF_IMAGE_PATH') + image_full_path = os.path.join(SIF_DIRECTORY, image) + #Code to check for allocating multiple gpus. 
+ try: + gpu_index = container_args.index('--gpu') + gpus = int(container_args[gpu_index+1]) + nvidia.set_nvidia_params(kwargs, singularity_command, gpus) + except ValueError as e: + if kwargs['nvidia']: + nvidia.set_nvidia_params(kwargs, singularity_command) + try: + pwd = kwargs['pwd'] + if not pwd: + raise Exception("PWD cannot be empty") + singularity_command.extend(['--pwd', pwd]) + singularity_command.append(image_full_path) + singularity_command.extend(container_args) + except Exception as e: + logger.info(f"Error occured - {e}") + raise Exception(f"Error Occured - {e}") + return singularity_command + + +def _get_slurm_config(kwargs): + #Use this function to add or modify any configuration parameters for the SLURM job + config_defaults = { + '--qos': os.getenv('SLURM_QOS'), + '--account': os.getenv('SLURM_ACCOUNT'), + '--mem':os.getenv('SLURM_MEMORY','16000'), + '--ntasks': os.getenv("SLURM_NTASKS",'1'), + '--time': os.getenv("SLURM_TIME",'72:00'), + '--partition':os.getenv('SLURM_PARTITION','hpg2-compute'), + '--gres':os.getenv('SLURM_GRES_CONFIG'), + '--cpus-per-task':os.getenv('SLURM_CPUS','4') + } + + config = {k:kwargs.get(k,config_defaults[k]) for k in config_defaults} + + slurm_config = ' '.join(f"{k}={v}" for k,v in config.items() if v is not None) + + logger.info(f"SLURM CONFIG = {slurm_config}") + return slurm_config diff --git a/girder_worker/singularity/pyproject.toml b/girder_worker/singularity/pyproject.toml new file mode 100644 index 00000000..8fd8d67e --- /dev/null +++ b/girder_worker/singularity/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +requires = ["setuptools", "setuptools-scm"] +build-backend = "setuptools.build_meta" diff --git a/girder_worker/singularity/setup.py b/girder_worker/singularity/setup.py new file mode 100644 index 00000000..bfbc580e --- /dev/null +++ b/girder_worker/singularity/setup.py @@ -0,0 +1,29 @@ +from setuptools import setup, find_packages + +setup( + name='girder-worker-singularity', + version='0.0.0', + description='An example girder worker extension', + author='Kitware, Inc.', + author_email='kitware@kitware.com', + license='Apache Software License 2.0', + classifiers=[ + 'Development Status :: 4 - Beta', + 'Environment :: Console', + 'License :: OSI Approved :: Apache Software License', + 'Operating System :: OS Independent', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12', + ], + entry_points={ + 'girder_worker_plugins': [ + 'singularity = girder_worker_singularity:SingularityPlugin', + ] + }, + packages=find_packages(), + zip_safe=False +) diff --git a/setup.py b/setup.py index 6a6ba4bf..5b6c9a07 100644 --- a/setup.py +++ b/setup.py @@ -68,6 +68,9 @@ def run(self, *args, **kwargs): extras_require = {} extras_require['girder'] = ['girder>=3.0.1,<5', 'girder-jobs>=3.0.1,<5'] +# TODO: handle package discovery for singularity +extras_require['singularity'] = ['girder-worker-singularity'] + # perform the install setuptools.setup( @@ -106,8 +109,8 @@ def run(self, *args, **kwargs): zip_safe=False, entry_points={ 'console_scripts': [ - # 'girder-worker = girder_worker.__main__:main', - # 'girder-worker-config = girder_worker.configure:main' + 'girder-worker = girder_worker.__main__:main', + 'girder-worker-config = girder_worker.configure:main' ], 'girder_worker_plugins': [ 'docker = girder_worker.docker:DockerPlugin [docker]', @@ 
-124,7 +127,7 @@ def run(self, *args, **kwargs): # 'invalid = girder_worker._test_plugins.plugins:NotAValidClass' ], 'girder.plugin': [ - # 'worker = girder_worker.girder_plugin:WorkerPlugin' + 'worker = girder_worker.girder_plugin:WorkerPlugin' ] - } + }, ) From 8baadbc70d941e684b9756159ea5794f935d5a73 Mon Sep 17 00:00:00 2001 From: willdunklin Date: Thu, 1 Aug 2024 12:41:52 -0400 Subject: [PATCH 19/28] Update singularity package dependencies --- girder_worker/__main__.py | 1 - girder_worker/docker/tasks/__init__.py | 225 --------------------- girder_worker/singularity/__init__.py | 0 girder_worker/singularity/requirements.txt | 1 + girder_worker/singularity/setup.py | 5 + requirements-dev.in | 1 + requirements.txt | 1 - 7 files changed, 7 insertions(+), 227 deletions(-) delete mode 100644 girder_worker/singularity/__init__.py create mode 100644 girder_worker/singularity/requirements.txt diff --git a/girder_worker/__main__.py b/girder_worker/__main__.py index fb074a69..a7ac7190 100644 --- a/girder_worker/__main__.py +++ b/girder_worker/__main__.py @@ -8,5 +8,4 @@ def main(): if __name__ == '__main__': - print('main!') main() diff --git a/girder_worker/docker/tasks/__init__.py b/girder_worker/docker/tasks/__init__.py index 9a54834f..604932bf 100644 --- a/girder_worker/docker/tasks/__init__.py +++ b/girder_worker/docker/tasks/__init__.py @@ -7,7 +7,6 @@ import threading import time import subprocess -import drmaa try: import docker @@ -538,227 +537,3 @@ def _cleanup_temp_volumes(self, temp_volumes, default_temp_volume): for v in temp_volumes: utils.remove_tmp_folder_apptainer(v.host_path) - - -def _run_singularity_container(container_args=None,**kwargs): - image = kwargs['image'] - container_args = container_args or kwargs['container_args'] or [] - try: - container_args = _process_container_args(container_args, kwargs) - - logger.info('Running container: image: %s args: %s kwargs: %s' - % (image, container_args, kwargs)) - - slurm_run_command = _generate_slurm_script(container_args,kwargs) - - slurm_config = _get_slurm_config(kwargs) - - return [slurm_run_command,slurm_config] - except Exception as e: - logger.exception(e) - raise Exception(e) - - -def singularity_run(task,**kwargs): - volumes = kwargs.pop('volumes',{}) - container_args = kwargs.pop('container_args',[]) - pull_image = kwargs['pull_image'] or False - stream_connectors = kwargs['stream_connectors'] or [] - image = kwargs.get('image') or '' - entrypoint = None - if not image: - logger.exception(f"Image name cannot be emptu") - raise Exception(f"Image name cannot be empty") - - run_kwargs = { - 'tty': False, - 'volumes': volumes - } - - # Allow run args to be overridden,filter out any we don't want to override - extra_run_kwargs = {k: v for k, v in kwargs.items() if k not in BLACKLISTED_DOCKER_RUN_ARGS} - run_kwargs.update(extra_run_kwargs) - - #Make entrypoint as pwd - if entrypoint is not None: - run_kwargs['entrypoint'] = entrypoint - - log_file_name = kwargs['log_file'] - - container_args,read_streams,write_streams = _handle_streaming_args(container_args) - #MODIFIED FOR SINGULARITY (CHANGE CODE OF SINGULARITY CONTAINER) - slurm_run_command,slurm_config = _run_singularity_container(container_args,**run_kwargs) - for connector in stream_connectors: - if isinstance(connector, FDReadStreamConnector): - read_streams.append(connector) - elif isinstance(connector, FDWriteStreamConnector): - write_streams.append(connector) - else: - raise TypeError( - "Expected 'FDReadStreamConnector' or 'FDWriterStreamConnector', received '%s'" - 
% type(connector)) - try: - monitor_thread = _monitor_singularity_job(task,slurm_run_command,slurm_config,log_file_name) - def singularity_exit_condition(): - return not monitor_thread.is_alive() - utils.select_loop(exit_condition = singularity_exit_condition, - readers= read_streams, - writers = write_streams ) - finally: - logger.info('DONE') - - results = [] - if hasattr(task.request,'girder_result_hooks'): - results = (None,) * len(task.request.girder_result_hooks) - - return results -#This function is used to check whether we need to switch to singularity or not. -def use_singularity(): - ''' - #This needs to be uncommented. Only for testing purposes. - ''' - # runtime = os.environ.get('RUNTIME') - # if runtime == 'SINGULARITY': - # return True - # if runtime == 'DOCKER': - # return False - # try: - # #Check whether we are connected to a docker socket. - # with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as s: - # return s.connect_ex('/var/run/docker.sock') != 0 - # except socket.error: - # return False - return True - - -@app.task -def container_backend(**kwargs): - return use_singularity() - - -def _generate_slurm_script(container_args, kwargs): - container_args = container_args or [] - image = kwargs.pop('image', None) - singularity_command = [] - if not image: - raise Exception(' Issue with Slicer_Cli_Plugin_Image. Plugin Not available') - SIF_DIRECTORY = os.getenv('SIF_IMAGE_PATH') - image_full_path = os.path.join(SIF_DIRECTORY,image) - #Code to check for allocating multiple gpus. - try: - gpu_index = container_args.index('--gpu') - gpus = int(container_args[gpu_index+1]) - nvidia.set_nvidia_params(kwargs,singularity_command,gpus) - except ValueError as e: - if kwargs['nvidia']: - nvidia.set_nvidia_params(kwargs,singularity_command) - try: - pwd = kwargs['pwd'] - if not pwd: - raise Exception("PWD cannot be empty") - singularity_command.extend(['--pwd',pwd]) - singularity_command.append(image_full_path) - singularity_command.extend(container_args) - except Exception as e: - logger.info(f"Error occured - {e}") - raise Exception(f"Error Occured - {e}") - return singularity_command - -def _monitor_singularity_job(task,slurm_run_command,slurm_config,log_file_name): - """Create a drmaa session and monitor the job accordingly""" - decodestatus = {drmaa.JobState.UNDETERMINED: 'process status cannot be determined', - drmaa.JobState.QUEUED_ACTIVE: 'job is queued and active', - drmaa.JobState.SYSTEM_ON_HOLD: 'job is queued and in system hold', - drmaa.JobState.USER_ON_HOLD: 'job is queued and in user hold', - drmaa.JobState.USER_SYSTEM_ON_HOLD: 'job is queued and in user and system hold', - drmaa.JobState.RUNNING: 'job is running', - drmaa.JobState.SYSTEM_SUSPENDED: 'job is system suspended', - drmaa.JobState.USER_SUSPENDED: 'job is user suspended', - drmaa.JobState.DONE: 'job finished normally', - drmaa.JobState.FAILED: 'job finished, but failed'} - temp_directory = os.getenv('TMPDIR') - submit_dir = '/blue/pinaki.sarder/rc-svc-pinaki.sarder-web/submission' - def job_monitor(): - s = drmaa.Session() - s.initialize() - jt = s.createJobTemplate() - jt.workingDirectory = temp_directory - jt.remoteCommand = os.path.join(submit_dir, 'submit.sh') - jt.nativeSpecification = slurm_config - jt.args = slurm_run_command - jt.outputPath = ':' + log_file_name - jt.errorPath = ':' + log_file_name - try: - jobid = s.runJob(jt) - logger.info((f'Submitted singularity job with jobid {jobid}')) - with open(log_file_name, 'r') as f: - while True: - job_info = s.jobStatus(jobid) - where = f.tell() - 
line = f.readlines() - if line: - print(''.join(line), end='') - else: - f.seek(where) - if job_info in [drmaa.JobState.DONE, drmaa.JobState.FAILED]: - s.deleteJobTemplate(jt) - break - elif task.canceled: - s.control(jobid, drmaa.JobControlAction.TERMINATE) - s.deleteJobTemplate(jt) - break - time.sleep(5) # Sleep to avoid busy waiting - exit_status = s.jobStatus(jobid) - logger.info(decodestatus[exit_status]) - s.exit() - return exit_status - - except Exception as e: - s.deleteJobTemplate(jt) - print(f'Error Occured {e}') - - # Start the job monitor in a new thread - monitor_thread = threading.Thread(target=job_monitor, daemon=True) - monitor_thread.start() - - return monitor_thread - - -def _process_container_args(container_args,kwargs): - volumes = kwargs['volumes'] or {} - def find_matching_volume_key(path): - for key, value in volumes.items(): - if path.startswith(value['bind']): - # Append the suffix from the original path that isn't part of the 'bind' path - suffix = path[len(value['bind']):] if value['bind'] != path else '' - if 'assetstore' in key: - key = '/blue/pinaki.sarder/rc-svc-pinaki.sarder-web' + key - new_key = key + suffix.replace(" ", "_") # Replace spaces in suffix with underscores - return new_key - return path # Replace spaces in paths that don't match any volume - try: - # Replace paths in container_args with their corresponding volume keys - updated_container_args = [str(find_matching_volume_key(arg)) for arg in container_args] - except Exception as e: - logger.info(f"error {e}") - return updated_container_args - -def _get_slurm_config(kwargs): - #Use this function to add or modify any configuration parameters for the SLURM job - config_defaults = { - '--qos': os.getenv('SLURM_QOS'), - '--account': os.getenv('SLURM_ACCOUNT'), - '--mem':os.getenv('SLURM_MEMORY','16000'), - '--ntasks': os.getenv("SLURM_NTASKS",'1'), - '--time': os.getenv("SLURM_TIME",'72:00'), - '--partition':os.getenv('SLURM_PARTITION','hpg2-compute'), - '--gres':os.getenv('SLURM_GRES_CONFIG'), - '--cpus-per-task':os.getenv('SLURM_CPUS','4') - } - - config = {k:kwargs.get(k,config_defaults[k]) for k in config_defaults} - - slurm_config = ' '.join(f"{k}={v}" for k,v in config.items() if v is not None) - - logger.info(f"SLURM CONFIG = {slurm_config}") - return slurm_config diff --git a/girder_worker/singularity/__init__.py b/girder_worker/singularity/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/girder_worker/singularity/requirements.txt b/girder_worker/singularity/requirements.txt new file mode 100644 index 00000000..91ffcad8 --- /dev/null +++ b/girder_worker/singularity/requirements.txt @@ -0,0 +1 @@ +drmaa diff --git a/girder_worker/singularity/setup.py b/girder_worker/singularity/setup.py index bfbc580e..23a34644 100644 --- a/girder_worker/singularity/setup.py +++ b/girder_worker/singularity/setup.py @@ -1,5 +1,9 @@ from setuptools import setup, find_packages + +with open('requirements.txt') as f: + install_reqs = f.readlines() + setup( name='girder-worker-singularity', version='0.0.0', @@ -19,6 +23,7 @@ 'Programming Language :: Python :: 3.11', 'Programming Language :: Python :: 3.12', ], + install_requires=install_reqs, entry_points={ 'girder_worker_plugins': [ 'singularity = girder_worker_singularity:SingularityPlugin', diff --git a/requirements-dev.in b/requirements-dev.in index cbaa6de1..e4ca145e 100644 --- a/requirements-dev.in +++ b/requirements-dev.in @@ -1,3 +1,4 @@ +-e girder_worker/singularity -e git+https://github.com/girder/girder.git@master#egg=girder -e 
git+https://github.com/girder/girder.git@master#egg=girder-jobs&subdirectory=plugins/jobs -e git+https://github.com/girder/girder.git@master#egg=pytest-girder&subdirectory=pytest_girder diff --git a/requirements.txt b/requirements.txt index caa75f59..26bcce9c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -375,7 +375,6 @@ zipp==3.18.2 # via # importlib-metadata # importlib-resources -drmaa # The following packages are considered to be unsafe in a requirements file: # setuptools From 66e99b3bc5481f9ad8c4497590011744e00579f5 Mon Sep 17 00:00:00 2001 From: willdunklin Date: Wed, 21 Aug 2024 14:39:00 -0400 Subject: [PATCH 20/28] Update singularity threads to track jobId --- girder_worker/docker/tasks/__init__.py | 60 ------------------- girder_worker/docker/utils.py | 19 ------ .../tasks/__init__.py | 24 ++------ .../girder_worker_singularity/tasks/uf.py | 46 +++++++++++++- .../girder_worker_singularity/tasks/utils.py | 20 +++++++ girder_worker/singularity/setup.py | 2 +- 6 files changed, 71 insertions(+), 100 deletions(-) create mode 100644 girder_worker/singularity/girder_worker_singularity/tasks/utils.py diff --git a/girder_worker/docker/tasks/__init__.py b/girder_worker/docker/tasks/__init__.py index 604932bf..f39753b4 100644 --- a/girder_worker/docker/tasks/__init__.py +++ b/girder_worker/docker/tasks/__init__.py @@ -6,7 +6,6 @@ import sys import threading import time -import subprocess try: import docker @@ -478,62 +477,3 @@ def docker_run(task, image, pull_image=True, entrypoint=None, container_args=Non return _docker_run( task, image, pull_image, entrypoint, container_args, volumes, remove_container, **kwargs) - - -# Class for SingularityTask similar to DockerTask -class SingularityTask(Task): - def _maybe_transform_argument(self, arg): - return super()._maybe_transform_argument( - arg, task=self, _default_temp_volume=self.request._default_temp_volume) - - def _maybe_transform_result(self, idx, result): - return super()._maybe_transform_result( - idx, result, _default_temp_volume=self.request._default_temp_volume) - - def __call__(self, *args, **kwargs): - default_temp_volume = _RequestDefaultTemporaryVolume() - self.request._default_temp_volume = default_temp_volume - - volumes = kwargs.setdefault('volumes', {}) - # If we have a list of volumes, the user provide a list of Volume objects, - # we need to transform them. - temp_volumes = [] - if isinstance(volumes, list): - # See if we have been passed any TemporaryVolume instances. - for v in volumes: - if isinstance(v, TemporaryVolume): - temp_volumes.append(v) - - # First call the transform method, this we replace default temp volumes - # with the instance associated with this task create above. That is any - # reference to TemporaryVolume.default - _walk_obj(volumes, self._maybe_transform_argument) - - # Now convert them to JSON - def _json(volume): - return volume._repr_json_() - - volumes = _walk_obj(volumes, _json) - # We then need to merge them into a single dict and it will be ready - # for docker-py. 
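# (for illustration: _walk_obj above maps each Volume object to its docker-py
#  style dict, producing a list such as
#  [{'/host/in': {'bind': '/mnt/in', 'mode': 'rw'}}, {'/host/out': {'bind': '/mnt/out', 'mode': 'rw'}}],
#  and the comprehension below flattens that list into one mapping keyed by host path)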
- volumes = {k: v for volume in volumes for k, v in volume.items()} - kwargs['volumes'] = volumes - - volumes.update(default_temp_volume._repr_json_()) - - try: - super().__call__(*args, **kwargs) - finally: - threading.Thread( - target=self._cleanup_temp_volumes, - args=(temp_volumes, default_temp_volume), - daemon=False).start() - - def _cleanup_temp_volumes(self, temp_volumes, default_temp_volume): - # Set the permission to allow cleanup of temp directories - temp_volumes = [v for v in temp_volumes if os.path.exists(v.host_path)] - if default_temp_volume._transformed: - temp_volumes.append(default_temp_volume) - - for v in temp_volumes: - utils.remove_tmp_folder_apptainer(v.host_path) diff --git a/girder_worker/docker/utils.py b/girder_worker/docker/utils.py index ca2de659..21c89084 100644 --- a/girder_worker/docker/utils.py +++ b/girder_worker/docker/utils.py @@ -2,14 +2,11 @@ import os import select import uuid -from types import SimpleNamespace import docker from docker.errors import DockerException from girder_worker import logger -import re -import subprocess if (importlib.metadata.version('docker') == '7.0.0' and not hasattr(docker.transport.basehttpadapter.BaseHTTPAdapter, '_get_connection')): @@ -115,19 +112,3 @@ def chmod_writable(host_paths): except DockerException: logger.exception('Error setting perms on docker volumes %s.' % host_paths) raise - - -def remove_tmp_folder_apptainer(host_path=None): - ''' - This function will run after the slurm job completes and returns. If a temp folder is created in the temp directory to - do file I/O operations before/while the job was run, we need to clean up by removing the folder. - ''' - if not host_path: - return - temp_path = os.getenv("TMPIR") - #Cautious checking host path before removing it from the filesystem. 
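# (note: os.getenv("TMPIR") above is presumably a typo for "TMPDIR"; when that
#  variable is unset, temp_path is None and the membership test below raises a
#  TypeError instead of guarding the rm -rf)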
- if temp_path in host_path: - if os.path.exists(host_path): - subprocess.call(['rm','-rf',host_path]) - - diff --git a/girder_worker/singularity/girder_worker_singularity/tasks/__init__.py b/girder_worker/singularity/girder_worker_singularity/tasks/__init__.py index 4e659a18..c910a5c2 100644 --- a/girder_worker/singularity/girder_worker_singularity/tasks/__init__.py +++ b/girder_worker/singularity/girder_worker_singularity/tasks/__init__.py @@ -3,10 +3,10 @@ from girder_worker_utils import _walk_obj # from slicer_cli_web.singularity.utils import switch_to_sif_image_folder +from .utils import remove_tmp_folder_apptainer from girder_worker import logger from girder_worker.app import Task, app -from girder_worker.docker import utils from girder_worker.docker.io import FDReadStreamConnector, FDWriteStreamConnector, FDWriteStreamConnector, FDReadStreamConnector from girder_worker.docker.tasks import _RequestDefaultTemporaryVolume, _handle_streaming_args from girder_worker.docker.transforms import TemporaryVolume @@ -14,8 +14,6 @@ BLACKLISTED_DOCKER_RUN_ARGS = ['tty', 'detach'] -print('!!!!!!!!!!!!!!') - # Class for SingularityTask similar to DockerTask class SingularityTask(Task): def _maybe_transform_argument(self, arg): @@ -67,23 +65,21 @@ def _json(volume): def _cleanup_temp_volumes(self, temp_volumes, default_temp_volume): # Set the permission to allow cleanup of temp directories - temp_volumes = [v for v in temp_volumes if os.path.exists(v.host_path)] + temp_volumes = [v.host_path for v in temp_volumes if os.path.exists(v.host_path)] if default_temp_volume._transformed: - temp_volumes.append(default_temp_volume) + temp_volumes.append(default_temp_volume.host_path) - for v in temp_volumes: - utils.remove_tmp_folder_apptainer(v.host_path) + remove_tmp_folder_apptainer(temp_volumes) def singularity_run(task,**kwargs): volumes = kwargs.pop('volumes',{}) container_args = kwargs.pop('container_args',[]) - pull_image = kwargs['pull_image'] or False stream_connectors = kwargs['stream_connectors'] or [] image = kwargs.get('image') or '' entrypoint = None if not image: - logger.exception(f"Image name cannot be emptu") + logger.exception(f"Image name cannot be empty") raise Exception(f"Image name cannot be empty") run_kwargs = { @@ -115,16 +111,6 @@ def singularity_run(task,**kwargs): from uf import slurm_dispatch slurm_dispatch(task, container_args, run_kwargs, read_streams, write_streams, log_file_name) - # slurm_run_command,slurm_config = _run_singularity_container(container_args,**run_kwargs) - # try: - # monitor_thread = _monitor_singularity_job(task,slurm_run_command,slurm_config,log_file_name) - # def singularity_exit_condition(): - # return not monitor_thread.is_alive() - # utils.select_loop(exit_condition = singularity_exit_condition, - # readers= read_streams, - # writers = write_streams ) - # finally: - # logger.info('DONE') results = [] if hasattr(task.request,'girder_result_hooks'): diff --git a/girder_worker/singularity/girder_worker_singularity/tasks/uf.py b/girder_worker/singularity/girder_worker_singularity/tasks/uf.py index f7d3f9f7..f19bafd1 100644 --- a/girder_worker/singularity/girder_worker_singularity/tasks/uf.py +++ b/girder_worker/singularity/girder_worker_singularity/tasks/uf.py @@ -2,8 +2,10 @@ import time import threading import os +import subprocess from girder_worker.docker import utils from girder_worker import logger +from .utils import remove_tmp_folder_apptainer try: from girder_worker.docker import nvidia @@ -16,12 +18,24 @@ def slurm_dispatch(task, container_args, 
run_kwargs, read_streams, write_streams try: monitor_thread = _monitor_singularity_job(task,slurm_run_command,slurm_config,log_file_name) def singularity_exit_condition(): + ''' + This function is used to handle task cancellation and also enable exit condition to stop logging. + ''' + #Check if the cancel event is called and the jobId is set for the current job thread we are intending to cancel. + if task.canceled and monitor_thread.jobId: + try: + returnCode = subprocess.call(apptainer_cancel_cmd(monitor_thread.jobId)) + if returnCode != 0: + raise Exception(f"Failed to Cancel job with jobID {monitor_thread.jobId}") + except Exception as e: + logger.info(f'Error Occured {e}') return not monitor_thread.is_alive() utils.select_loop(exit_condition=singularity_exit_condition, readers=read_streams, writers=write_streams) finally: logger.info('DONE') + remove_tmp_folder_apptainer(container_args) def _monitor_singularity_job(task,slurm_run_command,slurm_config,log_file_name): @@ -50,6 +64,8 @@ def job_monitor(): jt.errorPath = ':' + log_file_name try: jobid = s.runJob(jt) + #Set the jobID for the current thread so we can access it outside this thread incase we need to cancel the job. + threading.current_thread().jobId = jobid logger.info((f'Submitted singularity job with jobid {jobid}')) with open(log_file_name, 'r') as f: while True: @@ -78,7 +94,7 @@ def job_monitor(): print(f'Error Occured {e}') # Start the job monitor in a new thread - monitor_thread = threading.Thread(target=job_monitor, daemon=True) + monitor_thread = SingularityThread(target=job_monitor, daemon=True) monitor_thread.start() return monitor_thread @@ -171,3 +187,31 @@ def _get_slurm_config(kwargs): logger.info(f"SLURM CONFIG = {slurm_config}") return slurm_config + + +class SingularityThread(threading.Thread): + ''' + This is a custom Thread class in order to handle cancelling a slurm job outside of the thread since the task context object is not available inside the thread. + Methods: + __init__(self,target, daemon) - Initialize the thread similar to threading.Thread class, requires a jobId param to keep track of the jobId + run(self) - This method is used to run the target function. This is essentially called when you do thread.start() + ''' + def __init__(self, target, daemon=False): + super().__init__(daemon=daemon) + self.target = target + self.jobId = None + + def run(self): + if self.target: + self.target() + + +def apptainer_cancel_cmd(jobID, slurm=True): + if not jobID: + raise Exception("Please provide jobID for the job that needs to be cancelled") + cmd = [] + #If any other type of mechanism is used to interact with HPG, use that. + if slurm: + cmd.append('scancel') + cmd.append(jobID) + return cmd diff --git a/girder_worker/singularity/girder_worker_singularity/tasks/utils.py b/girder_worker/singularity/girder_worker_singularity/tasks/utils.py new file mode 100644 index 00000000..88e0dac0 --- /dev/null +++ b/girder_worker/singularity/girder_worker_singularity/tasks/utils.py @@ -0,0 +1,20 @@ +import os +import re +import subprocess + +from girder_worker import logger + + +def remove_tmp_folder_apptainer(container_args=[]): + ''' + This function will run after the slurm job completes and returns. If a temp folder is created in the temp directory to + do file I/O operations before/while the job was run, we need to clean up by removing the folder. + ''' + if not container_args: + logger.info("Host path not found.") + #Cautious checking host path before removing it from the filesystem. 
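# (the pattern below matches mkdtemp-style prefixes such as '/tmp/tmpa1b2c3'
#  anywhere in an argument, so an arg like '/tmp/tmpa1b2c3/output' is treated
#  as a temporary path and removed from disk if it still exists)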
+ pattern = r"\/tmp\/tmp[^/]+" + for arg in container_args: + if re.search(pattern, arg): + if os.path.exists(arg): + subprocess.call(['rm', '-rf', arg]) diff --git a/girder_worker/singularity/setup.py b/girder_worker/singularity/setup.py index 23a34644..2c8e1a81 100644 --- a/girder_worker/singularity/setup.py +++ b/girder_worker/singularity/setup.py @@ -23,7 +23,7 @@ 'Programming Language :: Python :: 3.11', 'Programming Language :: Python :: 3.12', ], - install_requires=install_reqs, + install_requires=['girder-worker', *install_reqs], entry_points={ 'girder_worker_plugins': [ 'singularity = girder_worker_singularity:SingularityPlugin', From 6c326eeb3cc2cbf95c0fd2e1abcefd862caed64b Mon Sep 17 00:00:00 2001 From: willdunklin Date: Tue, 27 Aug 2024 14:38:09 -0400 Subject: [PATCH 21/28] Refactor singularity-slurm into separate package --- .../tasks/__init__.py | 2 +- girder_worker/singularity/setup.py | 6 +--- .../girder_worker_slurm/__init__.py} | 32 +++++++++-------- girder_worker/slurm/pyproject.toml | 3 ++ .../{singularity => slurm}/requirements.txt | 0 girder_worker/slurm/setup.py | 34 +++++++++++++++++++ setup.py | 2 +- 7 files changed, 57 insertions(+), 22 deletions(-) rename girder_worker/{singularity/girder_worker_singularity/tasks/uf.py => slurm/girder_worker_slurm/__init__.py} (87%) create mode 100644 girder_worker/slurm/pyproject.toml rename girder_worker/{singularity => slurm}/requirements.txt (100%) create mode 100644 girder_worker/slurm/setup.py diff --git a/girder_worker/singularity/girder_worker_singularity/tasks/__init__.py b/girder_worker/singularity/girder_worker_singularity/tasks/__init__.py index c910a5c2..6c2dc685 100644 --- a/girder_worker/singularity/girder_worker_singularity/tasks/__init__.py +++ b/girder_worker/singularity/girder_worker_singularity/tasks/__init__.py @@ -109,7 +109,7 @@ def singularity_run(task,**kwargs): "Expected 'FDReadStreamConnector' or 'FDWriterStreamConnector', received '%s'" % type(connector)) - from uf import slurm_dispatch + from girder_worker_slurm import slurm_dispatch slurm_dispatch(task, container_args, run_kwargs, read_streams, write_streams, log_file_name) results = [] diff --git a/girder_worker/singularity/setup.py b/girder_worker/singularity/setup.py index 2c8e1a81..a0239703 100644 --- a/girder_worker/singularity/setup.py +++ b/girder_worker/singularity/setup.py @@ -1,9 +1,5 @@ from setuptools import setup, find_packages - -with open('requirements.txt') as f: - install_reqs = f.readlines() - setup( name='girder-worker-singularity', version='0.0.0', @@ -23,7 +19,7 @@ 'Programming Language :: Python :: 3.11', 'Programming Language :: Python :: 3.12', ], - install_requires=['girder-worker', *install_reqs], + install_requires=['girder-worker'], entry_points={ 'girder_worker_plugins': [ 'singularity = girder_worker_singularity:SingularityPlugin', diff --git a/girder_worker/singularity/girder_worker_singularity/tasks/uf.py b/girder_worker/slurm/girder_worker_slurm/__init__.py similarity index 87% rename from girder_worker/singularity/girder_worker_singularity/tasks/uf.py rename to girder_worker/slurm/girder_worker_slurm/__init__.py index f19bafd1..2327a5c7 100644 --- a/girder_worker/singularity/girder_worker_singularity/tasks/uf.py +++ b/girder_worker/slurm/girder_worker_slurm/__init__.py @@ -5,7 +5,7 @@ import subprocess from girder_worker.docker import utils from girder_worker import logger -from .utils import remove_tmp_folder_apptainer +from girder_worker_singularity.tasks.utils import remove_tmp_folder_apptainer try: from 
girder_worker.docker import nvidia @@ -14,9 +14,9 @@ def slurm_dispatch(task, container_args, run_kwargs, read_streams, write_streams, log_file_name): - slurm_run_command,slurm_config = _run_singularity_container(container_args,**run_kwargs) + singularity_run_command, slurm_config = _slurm_singularity_config(container_args, **run_kwargs) try: - monitor_thread = _monitor_singularity_job(task,slurm_run_command,slurm_config,log_file_name) + monitor_thread = _monitor_singularity_job(task, singularity_run_command, slurm_config, log_file_name) def singularity_exit_condition(): ''' This function is used to handle task cancellation and also enable exit condition to stop logging. @@ -30,6 +30,7 @@ def singularity_exit_condition(): except Exception as e: logger.info(f'Error Occured {e}') return not monitor_thread.is_alive() + utils.select_loop(exit_condition=singularity_exit_condition, readers=read_streams, writers=write_streams) @@ -38,7 +39,7 @@ def singularity_exit_condition(): remove_tmp_folder_apptainer(container_args) -def _monitor_singularity_job(task,slurm_run_command,slurm_config,log_file_name): +def _monitor_singularity_job(task, slurm_command, slurm_config, log_file_name): """Create a drmaa session and monitor the job accordingly""" decodestatus = {drmaa.JobState.UNDETERMINED: 'process status cannot be determined', drmaa.JobState.QUEUED_ACTIVE: 'job is queued and active', @@ -51,15 +52,16 @@ def _monitor_singularity_job(task,slurm_run_command,slurm_config,log_file_name): drmaa.JobState.DONE: 'job finished normally', drmaa.JobState.FAILED: 'job finished, but failed'} temp_directory = os.getenv('TMPDIR') - submit_dir = '/blue/pinaki.sarder/rc-svc-pinaki.sarder-web/submission' + submit_script = os.getenv('GIRDER_WORKER_SLURM_SUBMIT_SCRIPT') # '/blue/pinaki.sarder/rc-svc-pinaki.sarder-web/submission/submit.sh' + # TODO: check for validity ^ def job_monitor(): s = drmaa.Session() s.initialize() jt = s.createJobTemplate() jt.workingDirectory = temp_directory - jt.remoteCommand = os.path.join(submit_dir, 'submit.sh') + jt.remoteCommand = submit_script jt.nativeSpecification = slurm_config - jt.args = slurm_run_command + jt.args = slurm_command jt.outputPath = ':' + log_file_name jt.errorPath = ':' + log_file_name try: @@ -100,7 +102,7 @@ def job_monitor(): return monitor_thread -def _run_singularity_container(container_args=None,**kwargs): +def _slurm_singularity_config(container_args=None, **kwargs): image = kwargs['image'] container_args = container_args or kwargs['container_args'] or [] try: @@ -109,25 +111,25 @@ def _run_singularity_container(container_args=None,**kwargs): logger.info('Running container: image: %s args: %s kwargs: %s' % (image, container_args, kwargs)) - slurm_run_command = _generate_slurm_script(container_args,kwargs) - + singularity_run_command = _generate_singularity_command(container_args, kwargs) slurm_config = _get_slurm_config(kwargs) - return [slurm_run_command, slurm_config] + return singularity_run_command, slurm_config except Exception as e: logger.exception(e) raise Exception(e) -def _process_container_args(container_args,kwargs): +def _process_container_args(container_args, kwargs): volumes = kwargs['volumes'] or {} + prefix = os.getenv('GIRDER_WORKER_SLURM_MOUNT_PREFIX') # '/blue/pinaki.sarder/rc-svc-pinaki.sarder-web' def find_matching_volume_key(path): for key, value in volumes.items(): if path.startswith(value['bind']): # Append the suffix from the original path that isn't part of the 'bind' path suffix = path[len(value['bind']):] if value['bind'] != path 
else '' if 'assetstore' in key: - key = '/blue/pinaki.sarder/rc-svc-pinaki.sarder-web' + key + key = prefix + key new_key = key + suffix.replace(" ", "_") # Replace spaces in suffix with underscores return new_key return path # Replace spaces in paths that don't match any volume @@ -139,7 +141,7 @@ def find_matching_volume_key(path): return updated_container_args -def _generate_slurm_script(container_args, kwargs): +def _generate_singularity_command(container_args, kwargs): container_args = container_args or [] image = kwargs.pop('image', None) singularity_command = [] @@ -150,7 +152,7 @@ def _generate_slurm_script(container_args, kwargs): #Code to check for allocating multiple gpus. try: gpu_index = container_args.index('--gpu') - gpus = int(container_args[gpu_index+1]) + gpus = int(container_args[gpu_index + 1]) nvidia.set_nvidia_params(kwargs, singularity_command, gpus) except ValueError as e: if kwargs['nvidia']: diff --git a/girder_worker/slurm/pyproject.toml b/girder_worker/slurm/pyproject.toml new file mode 100644 index 00000000..8fd8d67e --- /dev/null +++ b/girder_worker/slurm/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +requires = ["setuptools", "setuptools-scm"] +build-backend = "setuptools.build_meta" diff --git a/girder_worker/singularity/requirements.txt b/girder_worker/slurm/requirements.txt similarity index 100% rename from girder_worker/singularity/requirements.txt rename to girder_worker/slurm/requirements.txt diff --git a/girder_worker/slurm/setup.py b/girder_worker/slurm/setup.py new file mode 100644 index 00000000..bfddd68b --- /dev/null +++ b/girder_worker/slurm/setup.py @@ -0,0 +1,34 @@ +from setuptools import setup, find_packages + + +with open('requirements.txt') as f: + install_reqs = f.readlines() + +setup( + name='girder-worker-slurm', + version='0.0.0', + description='An example girder worker extension', + author='Kitware, Inc.', + author_email='kitware@kitware.com', + license='Apache Software License 2.0', + classifiers=[ + 'Development Status :: 4 - Beta', + 'Environment :: Console', + 'License :: OSI Approved :: Apache Software License', + 'Operating System :: OS Independent', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12', + ], + install_requires=['girder-worker', 'girder-worker-singularity', *install_reqs], + entry_points={ + 'girder_worker_plugins': [ + 'singularity = girder_worker_singularity:SingularityPlugin', + ] + }, + packages=find_packages(), + zip_safe=False +) diff --git a/setup.py b/setup.py index 5b6c9a07..eb035902 100644 --- a/setup.py +++ b/setup.py @@ -68,8 +68,8 @@ def run(self, *args, **kwargs): extras_require = {} extras_require['girder'] = ['girder>=3.0.1,<5', 'girder-jobs>=3.0.1,<5'] -# TODO: handle package discovery for singularity extras_require['singularity'] = ['girder-worker-singularity'] +extras_require['slurm'] = ['girder-worker-slurm'] # perform the install From 32c8e77edde6ca4a8e9d8297d040af8907bd219b Mon Sep 17 00:00:00 2001 From: willdunklin Date: Mon, 16 Sep 2024 12:09:08 -0400 Subject: [PATCH 22/28] Format code --- girder_worker/docker/io/__init__.py | 6 ++++ girder_worker/docker/io/girder.py | 1 + girder_worker/docker/nvidia.py | 32 +++++++++-------- girder_worker/docker/tasks/__init__.py | 2 +- girder_worker/docker/transforms/__init__.py | 18 ++++++++-- girder_worker/docker/transforms/girder.py | 8 +++++ 
.../tasks/__init__.py | 35 +++++++++++-------- .../girder_worker_singularity/tasks/utils.py | 6 ++-- girder_worker/singularity/setup.py | 2 +- girder_worker/slurm/setup.py | 3 +- 10 files changed, 74 insertions(+), 39 deletions(-) diff --git a/girder_worker/docker/io/__init__.py b/girder_worker/docker/io/__init__.py index 1cd4fd54..ad262d6b 100644 --- a/girder_worker/docker/io/__init__.py +++ b/girder_worker/docker/io/__init__.py @@ -191,6 +191,7 @@ class FileDescriptorReader(StreamReader): """ Reader to read from a file descriptor. """ + def __init__(self, fd): self._fd = fd @@ -208,6 +209,7 @@ class FileDescriptorWriter(StreamWriter): """ Writer to write to a file descriptor. """ + def __init__(self, fd): self._fd = fd @@ -225,6 +227,7 @@ class StdStreamWriter(StreamWriter): """ Writer for write to stdout and stderr. """ + def __init__(self, stream): self._stream = stream @@ -240,6 +243,7 @@ class NamedPipe: """ A named pipe. """ + def __init__(self, path): self.path = path self._fd = None @@ -267,6 +271,7 @@ class NamedPipeReader(FileDescriptorReader): """ Reader to read from a named pipe. """ + def __init__(self, pipe, container_path=None): super().__init__(None) self._pipe = pipe @@ -290,6 +295,7 @@ class NamedPipeWriter(FileDescriptorWriter): """ Write to write to a named pipe. """ + def __init__(self, pipe, container_path=None): super().__init__(None) self._pipe = pipe diff --git a/girder_worker/docker/io/girder.py b/girder_worker/docker/io/girder.py index 91703578..eabaf14d 100644 --- a/girder_worker/docker/io/girder.py +++ b/girder_worker/docker/io/girder.py @@ -5,6 +5,7 @@ class GirderFileStreamReader(StreamReader): """ Stream a file from Girder. """ + def __init__(self, client, file_id): """ :param client: The GirderClient instance to use. diff --git a/girder_worker/docker/nvidia.py b/girder_worker/docker/nvidia.py index 1edc2144..da99f6b7 100644 --- a/girder_worker/docker/nvidia.py +++ b/girder_worker/docker/nvidia.py @@ -2,23 +2,27 @@ def is_nvidia_image(api, image): labels = api.inspect_image(image).get('Config', {}).get('Labels') return bool(labels and labels.get('com.nvidia.volumes.needed') == 'nvidia_driver') -def set_nvidia_params(kwargs:dict,singularity_command:list,gpus:int=1): - ''' - This function is used to set the gpu parameters based on the user input and plugin job. - + +def set_nvidia_params(kwargs: dict, singularity_command: list, gpus: int = 1): + """ + This function is used to set the gpu parameters based on the user input and plugin job. + Parameters: - kwargs (dict, required): The keyword arguments dictionary sent to the celery task as an input, part of the request - - singularity_command (list, required): A list that container all the arguments to construct a singularity command that will be sent to the HPC job - - gps (int, optional): If the plugin doesn't have a --gpu parameter in contianer_args, then a default of 1 gpu is allocated, else the user specified number of gpus is allocated. - + kwargs (dict, required): The keyword arguments dictionary sent to the celery task as an input, + part of the request + + singularity_command (list, required): A list that container all the arguments to construct a + singularity command that will be sent to the HPC job + + gps (int, optional): If the plugin doesn't have a --gpu parameter in contianer_args, then a + default of 1 gpu is allocated, else the user specified number of gpus is allocated. 
+ Returns: None - ''' - kwargs['--gres'] = f"gres:gpu:a100:{gpus}" if gpus > 1 else f"gres:gpu:a100:1" + """ + kwargs['--gres'] = f'gres:gpu:a100:{gpus}' if gpus > 1 else f'gres:gpu:a100:1' kwargs['--partition'] = 'gpu' kwargs['--mem'] = '32000' - #Reducing CPU count for gpu-based job for resource conservation + # Reducing CPU count for gpu-based job for resource conservation kwargs['--cpus-per-task'] = '8' - singularity_command.append('--nv') \ No newline at end of file + singularity_command.append('--nv') diff --git a/girder_worker/docker/tasks/__init__.py b/girder_worker/docker/tasks/__init__.py index f39753b4..557fc956 100644 --- a/girder_worker/docker/tasks/__init__.py +++ b/girder_worker/docker/tasks/__init__.py @@ -58,7 +58,6 @@ # } - def _pull_image(image): """ Pulls the specified Docker image onto this worker. @@ -450,6 +449,7 @@ def _docker_run(task, image, pull_image=True, entrypoint=None, container_args=No results = (None,) * len(task.request.girder_result_hooks) return results + @app.task(base=DockerTask, bind=True) def docker_run(task, image, pull_image=True, entrypoint=None, container_args=None, volumes=None, remove_container=True, **kwargs): diff --git a/girder_worker/docker/transforms/__init__.py b/girder_worker/docker/transforms/__init__.py index 0c23db81..d00c0bb7 100644 --- a/girder_worker/docker/transforms/__init__.py +++ b/girder_worker/docker/transforms/__init__.py @@ -22,6 +22,7 @@ class HostStdOut(Transform): Represents the standard output stream on the host machine. Can be used with :py:class:`girder_worker.docker.transforms.Connect` to write text to stdout. """ + def transform(self, **kwargs): from girder_worker.docker.io import ( StdStreamWriter @@ -34,6 +35,7 @@ class HostStdErr(Transform): Represents the standard error stream on the host machine. Can be used with :py:class:`girder_worker.docker.transforms.Connect` to write text to stderr. """ + def transform(self, **kwargs): from girder_worker.docker.io import ( StdStreamWriter @@ -47,6 +49,7 @@ class ContainerStdOut(Transform): :py:class:`girder_worker.docker.transforms.Connect` to redirect the containers standard output to another stream. """ + def transform(self, **kwargs): return self @@ -61,6 +64,7 @@ class ContainerStdErr(Transform): :py:class:`girder_worker.docker.transforms.Connect` to redirect the containers standard error to another stream. """ + def transform(self, **kwargs): return self @@ -81,6 +85,7 @@ class BindMountVolume(Transform): :param mode: The mounting mode :type mode: str """ + def __init__(self, host_path, container_path, mode='rw'): self._host_path = host_path self._container_path = container_path @@ -156,6 +161,7 @@ class TemporaryVolume(_TemporaryVolumeBase, metaclass=_TemporaryVolumeMetaClass) """ # Note that this mode is explicitly set with os.chmod. What you # set, is what you get - no os.makedirs umask shenanigans. + def __init__(self, host_dir=None, mode=0o777): super().__init__(None, None) self.host_dir = host_dir @@ -179,6 +185,7 @@ class _DefaultTemporaryVolume(TemporaryVolume): containing information about the actual default temporary volume associated with the task. The place holder then delegates all functionality to this instance. """ + def transform(self, _default_temp_volume=None, **kwargs): self._instance = _default_temp_volume self._transformed = True @@ -253,6 +260,7 @@ class NamedInputPipe(NamedPipeBase): the volume will be used when creating the pipe. 
The default location is :py:obj:`girder_worker.docker.transforms.TemporaryVolume.default` """ + def __init__(self, name, container_path=None, host_path=None, volume=TemporaryVolume.default): super().__init__(name, container_path, host_path, volume) @@ -283,6 +291,7 @@ class NamedOutputPipe(NamedPipeBase): the volume will be use when creating the pipe. The default location is :py:attr:`girder_worker.docker.transforms.TemporaryVolume.default` """ + def __init__(self, name, container_path=None, host_path=None, volume=TemporaryVolume.default): super().__init__(name, container_path, host_path, volume) @@ -308,12 +317,13 @@ class VolumePath(Transform): :py:attr:`girder_worker.docker.transforms.TemporaryVolume.default` :type volume: :py:class:`girder_worker.docker.transforms.BindMountVolume` """ + def __init__(self, filename, volume=TemporaryVolume.default): if os.path.isabs(filename): raise Exception('VolumePath paths must be relative to a volume (%s).' % filename) - #Modify filename for cli_run - #self.filename = filename - self.filename = filename.replace(' ','_') + # Modify filename for cli_run + # self.filename = filename + self.filename = filename.replace(' ', '_') self._volume = volume def transform(self, *pargs, **kwargs): @@ -345,6 +355,7 @@ class Connect(Transform): :py:class:`girder_worker.docker.transforms.HostStdOut` or :py:class:`girder_worker.docker.transforms.HostStdErr` """ + def __init__(self, input, output): super().__init__() self._input = input @@ -381,6 +392,7 @@ class ChunkedTransferEncodingStream(Transform): :param headers: HTTP headers to send. :type header: dict """ + def __init__(self, url, headers={}, **kwargs): self.url = url self.headers = headers diff --git a/girder_worker/docker/transforms/girder.py b/girder_worker/docker/transforms/girder.py index 08756299..f44509be 100644 --- a/girder_worker/docker/transforms/girder.py +++ b/girder_worker/docker/transforms/girder.py @@ -45,6 +45,7 @@ class ProgressPipe(Transform): :param volume: The bind mount volume where the underlying named pipe will reside. :type volume: :py:class:`girder_worker.docker.transforms.BindMountVolume` """ + def __init__(self, name='.girder_progress', volume=TemporaryVolume.default): self.name = name self._volume = volume @@ -70,6 +71,7 @@ class GirderFileIdToStream(GirderClientTransform): :param _id: The Girder file ID. :type _id: str or ObjectId """ + def __init__(self, _id, **kwargs): super().__init__(**kwargs) self.file_id = _id @@ -93,6 +95,7 @@ class GirderFileIdToVolume(GirderClientTransform): :param filename: Alternate name for the file. Default is to use the name from Girder. :type filename: str """ + def __init__(self, _id, volume=TemporaryVolume.default, filename=None, **kwargs): super().__init__(**kwargs) self._file_id = str(_id) @@ -153,6 +156,7 @@ class GirderFolderIdToVolume(GirderClientTransform): :param folder_name: Alternate name for the directory. Default is to use the name from Girder. :type folder_name: str """ + def __init__(self, _id, volume=TemporaryVolume.default, folder_name=None, **kwargs): super().__init__(**kwargs) self._folder_id = str(_id) @@ -206,6 +210,7 @@ class GirderItemIdToVolume(GirderClientTransform): :param item_name: Alternate name for the file. Default is to use the name from Girder. 
:type item_name: str """ + def __init__(self, _id, volume=TemporaryVolume.default, **kwargs): super().__init__(**kwargs) self._item_id = str(_id) @@ -257,6 +262,7 @@ class GirderUploadVolumePathToItem(GirderUploadToItem): :param delete_file: Whether to delete the file afterward. :type delete_file: bool """ + def __init__(self, volumepath, item_id, delete_file=False, **kwargs): item_id = str(item_id) super().__init__(item_id, delete_file, **kwargs) @@ -281,6 +287,7 @@ class GirderUploadVolumePathToFolder(GirderUploadToFolder): :param delete_file: Whether to delete the data afterward. :type delete_file: bool """ + def __init__(self, volumepath, folder_id, delete_file=False, **kwargs): super().__init__(str(folder_id), delete_file, **kwargs) self._volumepath = volumepath @@ -311,6 +318,7 @@ class GirderUploadVolumePathJobArtifact(GirderUploadJobArtifact): fails. This can be used to debug failed ``docker_run`` tasks. :type upload_on_exception: bool """ + def __init__(self, volumepath, job_id=None, name=None, upload_on_exception=False, **kwargs): if job_id is not None: job_id = str(job_id) diff --git a/girder_worker/singularity/girder_worker_singularity/tasks/__init__.py b/girder_worker/singularity/girder_worker_singularity/tasks/__init__.py index 6c2dc685..d20dc970 100644 --- a/girder_worker/singularity/girder_worker_singularity/tasks/__init__.py +++ b/girder_worker/singularity/girder_worker_singularity/tasks/__init__.py @@ -2,15 +2,18 @@ import threading from girder_worker_utils import _walk_obj -# from slicer_cli_web.singularity.utils import switch_to_sif_image_folder -from .utils import remove_tmp_folder_apptainer from girder_worker import logger from girder_worker.app import Task, app -from girder_worker.docker.io import FDReadStreamConnector, FDWriteStreamConnector, FDWriteStreamConnector, FDReadStreamConnector -from girder_worker.docker.tasks import _RequestDefaultTemporaryVolume, _handle_streaming_args +from girder_worker.docker.io import (FDReadStreamConnector, + FDWriteStreamConnector) +from girder_worker.docker.tasks import (_handle_streaming_args, + _RequestDefaultTemporaryVolume) from girder_worker.docker.transforms import TemporaryVolume +# from slicer_cli_web.singularity.utils import switch_to_sif_image_folder +from .utils import remove_tmp_folder_apptainer + BLACKLISTED_DOCKER_RUN_ARGS = ['tty', 'detach'] @@ -72,15 +75,15 @@ def _cleanup_temp_volumes(self, temp_volumes, default_temp_volume): remove_tmp_folder_apptainer(temp_volumes) -def singularity_run(task,**kwargs): - volumes = kwargs.pop('volumes',{}) - container_args = kwargs.pop('container_args',[]) +def singularity_run(task, **kwargs): + volumes = kwargs.pop('volumes', {}) + container_args = kwargs.pop('container_args', []) stream_connectors = kwargs['stream_connectors'] or [] image = kwargs.get('image') or '' entrypoint = None if not image: - logger.exception(f"Image name cannot be empty") - raise Exception(f"Image name cannot be empty") + logger.exception(f'Image name cannot be empty') + raise Exception(f'Image name cannot be empty') run_kwargs = { 'tty': False, @@ -91,14 +94,14 @@ def singularity_run(task,**kwargs): extra_run_kwargs = {k: v for k, v in kwargs.items() if k not in BLACKLISTED_DOCKER_RUN_ARGS} run_kwargs.update(extra_run_kwargs) - #Make entrypoint as pwd + # Make entrypoint as pwd if entrypoint is not None: run_kwargs['entrypoint'] = entrypoint log_file_name = kwargs['log_file'] - container_args,read_streams,write_streams = _handle_streaming_args(container_args) - #MODIFIED FOR SINGULARITY (CHANGE CODE OF 
SINGULARITY CONTAINER) + container_args, read_streams, write_streams = _handle_streaming_args(container_args) + # MODIFIED FOR SINGULARITY (CHANGE CODE OF SINGULARITY CONTAINER) for connector in stream_connectors: if isinstance(connector, FDReadStreamConnector): read_streams.append(connector) @@ -113,12 +116,14 @@ def singularity_run(task,**kwargs): slurm_dispatch(task, container_args, run_kwargs, read_streams, write_streams, log_file_name) results = [] - if hasattr(task.request,'girder_result_hooks'): + if hasattr(task.request, 'girder_result_hooks'): results = (None,) * len(task.request.girder_result_hooks) return results -#This function is used to check whether we need to switch to singularity or not. +# This function is used to check whether we need to switch to singularity or not. + + def use_singularity(): ''' #This needs to be uncommented. Only for testing purposes. @@ -139,4 +144,4 @@ def use_singularity(): @app.task def container_backend(**kwargs): - return use_singularity() \ No newline at end of file + return use_singularity() diff --git a/girder_worker/singularity/girder_worker_singularity/tasks/utils.py b/girder_worker/singularity/girder_worker_singularity/tasks/utils.py index 88e0dac0..d08bde9e 100644 --- a/girder_worker/singularity/girder_worker_singularity/tasks/utils.py +++ b/girder_worker/singularity/girder_worker_singularity/tasks/utils.py @@ -11,9 +11,9 @@ def remove_tmp_folder_apptainer(container_args=[]): do file I/O operations before/while the job was run, we need to clean up by removing the folder. ''' if not container_args: - logger.info("Host path not found.") - #Cautious checking host path before removing it from the filesystem. - pattern = r"\/tmp\/tmp[^/]+" + logger.info('Host path not found.') + # Cautious checking host path before removing it from the filesystem. 
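# (illustrative call with a made-up argument list:
#  remove_tmp_folder_apptainer(['--outputDir', '/tmp/tmpa1b2c3/out'])
#  deletes '/tmp/tmpa1b2c3/out' if it exists, while arguments without a
#  /tmp/tmp... prefix are left untouched)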
+ pattern = r'\/tmp\/tmp[^/]+' for arg in container_args: if re.search(pattern, arg): if os.path.exists(arg): diff --git a/girder_worker/singularity/setup.py b/girder_worker/singularity/setup.py index a0239703..33cb6cdb 100644 --- a/girder_worker/singularity/setup.py +++ b/girder_worker/singularity/setup.py @@ -1,4 +1,4 @@ -from setuptools import setup, find_packages +from setuptools import find_packages, setup setup( name='girder-worker-singularity', diff --git a/girder_worker/slurm/setup.py b/girder_worker/slurm/setup.py index bfddd68b..78f32799 100644 --- a/girder_worker/slurm/setup.py +++ b/girder_worker/slurm/setup.py @@ -1,5 +1,4 @@ -from setuptools import setup, find_packages - +from setuptools import find_packages, setup with open('requirements.txt') as f: install_reqs = f.readlines() From 2399313837c932985a97ab38ad1b8f78ca29ace0 Mon Sep 17 00:00:00 2001 From: willdunklin Date: Mon, 16 Sep 2024 14:22:06 -0400 Subject: [PATCH 23/28] Add slurm configuration settings --- girder_worker/docker/nvidia.py | 25 ---- girder_worker/girder_plugin/constants.py | 13 ++ girder_worker/girder_plugin/event_handlers.py | 21 ++++ .../web_client/templates/configView.pug | 52 +++++++- .../web_client/views/ConfigView.js | 57 ++++++++- .../slurm/girder_worker_slurm/__init__.py | 118 +++++++++++------- 6 files changed, 217 insertions(+), 69 deletions(-) diff --git a/girder_worker/docker/nvidia.py b/girder_worker/docker/nvidia.py index da99f6b7..e66d2f48 100644 --- a/girder_worker/docker/nvidia.py +++ b/girder_worker/docker/nvidia.py @@ -1,28 +1,3 @@ def is_nvidia_image(api, image): labels = api.inspect_image(image).get('Config', {}).get('Labels') return bool(labels and labels.get('com.nvidia.volumes.needed') == 'nvidia_driver') - - -def set_nvidia_params(kwargs: dict, singularity_command: list, gpus: int = 1): - """ - This function is used to set the gpu parameters based on the user input and plugin job. - - Parameters: - kwargs (dict, required): The keyword arguments dictionary sent to the celery task as an input, - part of the request - - singularity_command (list, required): A list that container all the arguments to construct a - singularity command that will be sent to the HPC job - - gps (int, optional): If the plugin doesn't have a --gpu parameter in contianer_args, then a - default of 1 gpu is allocated, else the user specified number of gpus is allocated. 
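The settings added in this patch (new constants, setting validators, and admin UI fields below) replace the environment-variable based SLURM configuration, and the slurm package reads them through Setting().get. As a minimal sketch of how those settings end up as the '--key=value' string assigned to jt.nativeSpecification further down, assuming Girder is importable and using an invented helper name, the mapping looks roughly like this:

from girder.models.setting import Setting
from girder_worker.girder_plugin.constants import PluginSettings

def build_native_specification(overrides=None):
    # Pull each SLURM option from the plugin settings; unset values are dropped.
    options = {
        '--account': Setting().get(PluginSettings.SLURM_ACCOUNT),
        '--qos': Setting().get(PluginSettings.SLURM_QOS),
        '--mem': Setting().get(PluginSettings.SLURM_MEM),
        '--cpus-per-task': Setting().get(PluginSettings.SLURM_CPUS),
        '--time': Setting().get(PluginSettings.SLURM_TIME),
        '--partition': Setting().get(PluginSettings.SLURM_PARTITION),
        '--gres': Setting().get(PluginSettings.SLURM_GRES_CONFIG),
    }
    # Allow per-job values (e.g. from task kwargs) to take precedence.
    options.update(overrides or {})
    # Render as '--key=value' pairs, e.g. '--account=acct --mem=16000 --time=72:00'.
    return ' '.join(f'{k}={v}' for k, v in options.items() if v)

This mirrors _get_slurm_config in the diff that follows; the GPU settings (worker.slurm_gpu_partition, worker.slurm_gpu_mem) only override --partition and --mem when set_nvidia_params runs for a GPU job. One caveat: SLURM_NTASKS as declared in constants.py below carries a trailing comma, which makes it a tuple rather than a string key, so it is left out of this sketch.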
- - Returns: - None - """ - kwargs['--gres'] = f'gres:gpu:a100:{gpus}' if gpus > 1 else f'gres:gpu:a100:1' - kwargs['--partition'] = 'gpu' - kwargs['--mem'] = '32000' - # Reducing CPU count for gpu-based job for resource conservation - kwargs['--cpus-per-task'] = '8' - singularity_command.append('--nv') diff --git a/girder_worker/girder_plugin/constants.py b/girder_worker/girder_plugin/constants.py index 8bc616d0..8817507e 100644 --- a/girder_worker/girder_plugin/constants.py +++ b/girder_worker/girder_plugin/constants.py @@ -29,3 +29,16 @@ class PluginSettings: BACKEND = 'worker.backend' API_URL = 'worker.api_url' DIRECT_PATH = 'worker.direct_path' + # Slurm Settings + SLURM_ACCOUNT = 'worker.slurm_account' + SLURM_QOS = 'worker.slurm_qos' + SLURM_MEM = 'worker.slurm_mem' + SLURM_CPUS = 'worker.slurm_cpus' + SLURM_NTASKS = 'worker.slurm_ntasks', + SLURM_PARTITION = 'worker.slurm_partition' + SLURM_TIME = 'worker.slurm_time' + SLURM_GRES_CONFIG = 'worker.slurm_gres_config' + # GPU Settings + SLURM_GPU = 'worker.slurm_gpu' + SLURM_GPU_PARTITION = 'worker.slurm_gpu_partition' + SLURM_GPU_MEM = 'worker.slurm_gpu_mem' diff --git a/girder_worker/girder_plugin/event_handlers.py b/girder_worker/girder_plugin/event_handlers.py index 5cd712e5..808a1e28 100644 --- a/girder_worker/girder_plugin/event_handlers.py +++ b/girder_worker/girder_plugin/event_handlers.py @@ -58,6 +58,27 @@ def _validateAutoCompute(doc): raise ValidationException('The direct path setting must be true or false.') +@setting_utilities.validator({ + PluginSettings.SLURM_ACCOUNT, + PluginSettings.SLURM_QOS, + PluginSettings.SLURM_MEM, + PluginSettings.SLURM_CPUS, + PluginSettings.SLURM_NTASKS, + PluginSettings.SLURM_PARTITION, + PluginSettings.SLURM_TIME, + PluginSettings.SLURM_GRES_CONFIG, + PluginSettings.SLURM_GPU, + PluginSettings.SLURM_GPU_PARTITION, + PluginSettings.SLURM_GPU_MEM +}) +def validateSlurmSettings(doc): + """ + Validate the SLURM settings. + """ + # TODO: add validation + pass + + def validateJobStatus(event): """Allow our custom job status values.""" if CustomJobStatus.isValid(event.info): diff --git a/girder_worker/girder_plugin/web_client/templates/configView.pug b/girder_worker/girder_plugin/web_client/templates/configView.pug index ad15842f..196a763f 100644 --- a/girder_worker/girder_plugin/web_client/templates/configView.pug +++ b/girder_worker/girder_plugin/web_client/templates/configView.pug @@ -19,7 +19,57 @@ form#g-worker-settings-form(role="form") .checkbox label.control-label(for="g-worker-direct-path") input#g-worker-direct-path(type="checkbox") - span When possible, send local file paths to the worker to avoid downloading files + span When possible, send local file paths to the worker to avoid downloading file + + h3. 
+ Slurm + + .form-group + label.control-label(for="g-worker-slurm-account") Slurm Account + input#g-worker-slurm-account.input-sm.form-control( + type="text", placeholder="Slurm Account") + .form-group + label.control-label(for="g-worker-slurm-qos") Slurm QOS + input#g-worker-slurm-qos.input-sm.form-control( + type="text", placeholder="Slurm QOS") + .form-group + label.control-label(for="g-worker-slurm-mem") Slurm Memory Allocation (MB) + input#g-worker-slurm-mem.input-sm.form-control( + type="text", placeholder="Slurm Memory (default: 16,000)") + .form-group + label.control-label(for="g-worker-slurm-cpu") Slurm CPU Count + input#g-worker-slurm-cpu.input-sm.form-control( + type="text", placeholder="Slurm CPU Count (default: 4)") + .form-group + label.control-label(for="g-worker-slurm-ntasks") Slurm Number of Tasks + input#g-worker-slurm-ntasks.input-sm.form-control( + type="text", placeholder="Slurm Number of Tasks") + .form-group + label.control-label(for="g-worker-slurm-partition") Slurm Partition + input#g-worker-slurm-partition.input-sm.form-control( + type="text", placeholder="Slurm Partition") + .form-group + label.control-label(for="g-worker-slurm-time") Slurm Time + input#g-worker-slurm-time.input-sm.form-control( + type="text", placeholder="Slurm Time (default: 72:00)") + .form-group + label.control-label(for="g-worker-slurm-gres-config") Slurm GRES Configuration + input#g-worker-slurm-gres-config.input-sm.form-control( + type="text", placeholder="Slurm GRES Configuration") + + h4 Slurm GPU Settings + .form-group + label.control-label(for="g-worker-slurm-gpu") Slurm GPU + input#g-worker-slurm-gpu.input-sm.form-control( + type="text", placeholder="Slurm GPU") + .form-group + label.control-label(for="g-worker-slurm-gpu-partition") Slurm GPU Partition + input#g-worker-slurm-gpu-partition.input-sm.form-control( + type="text", placeholder="Slurm GPU Partition (default: gpu)") + .form-group + label.control-label(for="g-worker-slurm-gpu-mem") Slurm GPU Memory Allocation (MB) + input#g-worker-slurm-gpu-mem.input-sm.form-control( + type="text", placeholder="Slurm GPU Memory (MB)") p#g-worker-settings-error-message.g-validation-failed-message input.btn.btn-sm.btn-primary(type="submit", value="Save") diff --git a/girder_worker/girder_plugin/web_client/views/ConfigView.js b/girder_worker/girder_plugin/web_client/views/ConfigView.js index 49bde80a..ccedb491 100644 --- a/girder_worker/girder_plugin/web_client/views/ConfigView.js +++ b/girder_worker/girder_plugin/web_client/views/ConfigView.js @@ -24,6 +24,39 @@ var ConfigView = View.extend({ }, { key: 'worker.direct_path', value: this.$('#g-worker-direct-path').is(':checked') + }, { + key: 'worker.slurm_account', + value: this.$('#g-worker-slurm-account').val().trim() + }, { + key: 'worker.slurm_qos', + value: this.$('#g-worker-slurm-qos').val().trim() + }, { + key: 'worker.slurm_mem', + value: this.$('#g-worker-slurm-mem').val().trim() + }, { + key: 'worker.slurm_cpus', + value: this.$('#g-worker-slurm-cpu').val().trim() + }, { + key: 'worker.slurm_ntasks', + value: this.$('#g-worker-slurm-ntasks').val().trim() + }, { + key: 'worker.slurm_partition', + value: this.$('#g-worker-slurm-partition').val().trim() + }, { + key: 'worker.slurm_time', + value: this.$('#g-worker-slurm-time').val().trim() + }, { + key: 'worker.slurm_gres_config', + value: this.$('#g-worker-slurm-gres-config').val().trim() + }, { + key: 'worker.slurm_gpu', + value: this.$('#g-worker-slurm-gpu').val().trim() + }, { + key: 'worker.slurm_gpu_partition', + value: 
this.$('#g-worker-slurm-gpu-partition').val().trim() + }, { + key: 'worker.slurm_gpu_mem', + value: this.$('#g-worker-slurm-gpu-mem').val().trim() }]); }, @@ -41,7 +74,18 @@ var ConfigView = View.extend({ 'worker.api_url', 'worker.broker', 'worker.backend', - 'worker.direct_path' + 'worker.direct_path', + 'worker.slurm_account', + 'worker.slurm_qos', + 'worker.slurm_mem', + 'worker.slurm_cpus', + 'worker.slurm_ntasks', + 'worker.slurm_partition', + 'worker.slurm_time', + 'worker.slurm_gres_config', + 'worker.slurm_gpu', + 'worker.slurm_gpu_partition', + 'worker.slurm_gpu_mem' ]) } }).done((resp) => { @@ -50,6 +94,17 @@ var ConfigView = View.extend({ this.$('#g-worker-broker').val(resp['worker.broker']); this.$('#g-worker-backend').val(resp['worker.backend']); this.$('#g-worker-direct-path').prop('checked', resp['worker.direct_path']); + this.$('#g-worker-slurm-account').val(resp['worker.slurm_account']); + this.$('#g-worker-slurm-qos').val(resp['worker.slurm_qos']); + this.$('#g-worker-slurm-mem').val(resp['worker.slurm_mem']); + this.$('#g-worker-slurm-cpu').val(resp['worker.slurm_cpus']); + this.$('#g-worker-slurm-ntasks').val(resp['worker.slurm_ntasks']); + this.$('#g-worker-slurm-partition').val(resp['worker.slurm_partition']); + this.$('#g-worker-slurm-time').val(resp['worker.slurm_time']); + this.$('#g-worker-slurm-gres-config').val(resp['worker.slurm_gres_config']); + this.$('#g-worker-slurm-gpu').val(resp['worker.slurm_gpu']); + this.$('#g-worker-slurm-gpu-partition').val(resp['worker.slurm_gpu_partition']); + this.$('#g-worker-slurm-gpu-mem').val(resp['worker.slurm_gpu_mem']); }); }, diff --git a/girder_worker/slurm/girder_worker_slurm/__init__.py b/girder_worker/slurm/girder_worker_slurm/__init__.py index 2327a5c7..0f9f1538 100644 --- a/girder_worker/slurm/girder_worker_slurm/__init__.py +++ b/girder_worker/slurm/girder_worker_slurm/__init__.py @@ -1,32 +1,34 @@ -import drmaa -import time -import threading import os import subprocess -from girder_worker.docker import utils -from girder_worker import logger +import threading +import time + +import drmaa from girder_worker_singularity.tasks.utils import remove_tmp_folder_apptainer -try: - from girder_worker.docker import nvidia -except ImportError: - pass +from girder.models.setting import Setting +from girder_worker import logger +from girder_worker.docker import utils +from girder_worker.girder_plugin.constants import PluginSettings def slurm_dispatch(task, container_args, run_kwargs, read_streams, write_streams, log_file_name): singularity_run_command, slurm_config = _slurm_singularity_config(container_args, **run_kwargs) try: - monitor_thread = _monitor_singularity_job(task, singularity_run_command, slurm_config, log_file_name) + monitor_thread = _monitor_singularity_job( + task, singularity_run_command, slurm_config, log_file_name) + def singularity_exit_condition(): - ''' + """ This function is used to handle task cancellation and also enable exit condition to stop logging. - ''' - #Check if the cancel event is called and the jobId is set for the current job thread we are intending to cancel. + """ + # Check if the cancel event is called and the jobId is set for the current + # job thread we are intending to cancel. 
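# (apptainer_cancel_cmd(jobId) simply builds ['scancel', <jobId>], so a non-zero
#  return code from the subprocess.call below means SLURM rejected the cancel request)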
if task.canceled and monitor_thread.jobId: try: returnCode = subprocess.call(apptainer_cancel_cmd(monitor_thread.jobId)) if returnCode != 0: - raise Exception(f"Failed to Cancel job with jobID {monitor_thread.jobId}") + raise Exception(f'Failed to Cancel job with jobID {monitor_thread.jobId}') except Exception as e: logger.info(f'Error Occured {e}') return not monitor_thread.is_alive() @@ -52,8 +54,10 @@ def _monitor_singularity_job(task, slurm_command, slurm_config, log_file_name): drmaa.JobState.DONE: 'job finished normally', drmaa.JobState.FAILED: 'job finished, but failed'} temp_directory = os.getenv('TMPDIR') - submit_script = os.getenv('GIRDER_WORKER_SLURM_SUBMIT_SCRIPT') # '/blue/pinaki.sarder/rc-svc-pinaki.sarder-web/submission/submit.sh' + # '/blue/pinaki.sarder/rc-svc-pinaki.sarder-web/submission/submit.sh' + submit_script = os.getenv('GIRDER_WORKER_SLURM_SUBMIT_SCRIPT') # TODO: check for validity ^ + def job_monitor(): s = drmaa.Session() s.initialize() @@ -66,7 +70,8 @@ def job_monitor(): jt.errorPath = ':' + log_file_name try: jobid = s.runJob(jt) - #Set the jobID for the current thread so we can access it outside this thread incase we need to cancel the job. + # Set the jobID for the current thread so we can access it outside this + # thread incase we need to cancel the job. threading.current_thread().jobId = jobid logger.info((f'Submitted singularity job with jobid {jobid}')) with open(log_file_name, 'r') as f: @@ -122,7 +127,9 @@ def _slurm_singularity_config(container_args=None, **kwargs): def _process_container_args(container_args, kwargs): volumes = kwargs['volumes'] or {} - prefix = os.getenv('GIRDER_WORKER_SLURM_MOUNT_PREFIX') # '/blue/pinaki.sarder/rc-svc-pinaki.sarder-web' + # '/blue/pinaki.sarder/rc-svc-pinaki.sarder-web' + prefix = os.getenv('GIRDER_WORKER_SLURM_MOUNT_PREFIX') + def find_matching_volume_key(path): for key, value in volumes.items(): if path.startswith(value['bind']): @@ -130,14 +137,15 @@ def find_matching_volume_key(path): suffix = path[len(value['bind']):] if value['bind'] != path else '' if 'assetstore' in key: key = prefix + key - new_key = key + suffix.replace(" ", "_") # Replace spaces in suffix with underscores + # Replace spaces in suffix with underscores + new_key = key + suffix.replace(' ', '_') return new_key return path # Replace spaces in paths that don't match any volume try: - # Replace paths in container_args with their corresponding volume keys + # Replace paths in container_args with their corresponding volume keys updated_container_args = [str(find_matching_volume_key(arg)) for arg in container_args] except Exception as e: - logger.info(f"error {e}") + logger.info(f'error {e}') return updated_container_args @@ -149,55 +157,81 @@ def _generate_singularity_command(container_args, kwargs): raise Exception(' Issue with Slicer_Cli_Plugin_Image. Plugin Not available') SIF_DIRECTORY = os.getenv('SIF_IMAGE_PATH') image_full_path = os.path.join(SIF_DIRECTORY, image) - #Code to check for allocating multiple gpus. + # Code to check for allocating multiple gpus. 
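# (container_args.index('--gpu') raises ValueError when the flag is absent; the
#  except branch then falls back to kwargs['nvidia'] to request a single default GPU)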
try: gpu_index = container_args.index('--gpu') gpus = int(container_args[gpu_index + 1]) - nvidia.set_nvidia_params(kwargs, singularity_command, gpus) + set_nvidia_params(kwargs, singularity_command, gpus) except ValueError as e: if kwargs['nvidia']: - nvidia.set_nvidia_params(kwargs, singularity_command) + set_nvidia_params(kwargs, singularity_command) try: pwd = kwargs['pwd'] if not pwd: - raise Exception("PWD cannot be empty") + raise Exception('PWD cannot be empty') singularity_command.extend(['--pwd', pwd]) singularity_command.append(image_full_path) singularity_command.extend(container_args) except Exception as e: - logger.info(f"Error occured - {e}") - raise Exception(f"Error Occured - {e}") + logger.info(f'Error occured - {e}') + raise Exception(f'Error Occured - {e}') return singularity_command def _get_slurm_config(kwargs): - #Use this function to add or modify any configuration parameters for the SLURM job + # Use this function to add or modify any configuration parameters for the SLURM job config_defaults = { - '--qos': os.getenv('SLURM_QOS'), - '--account': os.getenv('SLURM_ACCOUNT'), - '--mem':os.getenv('SLURM_MEMORY','16000'), - '--ntasks': os.getenv("SLURM_NTASKS",'1'), - '--time': os.getenv("SLURM_TIME",'72:00'), - '--partition':os.getenv('SLURM_PARTITION','hpg2-compute'), - '--gres':os.getenv('SLURM_GRES_CONFIG'), - '--cpus-per-task':os.getenv('SLURM_CPUS','4') + '--qos': Setting().get(PluginSettings.SLURM_QOS), + '--account': Setting().get(PluginSettings.SLURM_ACCOUNT), + '--mem': Setting().get(PluginSettings.SLURM_MEM), + '--ntasks': Setting().get(PluginSettings.SLURM_NTASKS), + '--time': Setting().get(PluginSettings.SLURM_TIME), + '--partition': Setting().get(PluginSettings.SLURM_PARTITION), + '--gres': Setting().get(PluginSettings.SLURM_GRES_CONFIG), + '--cpus-per-task': Setting().get(PluginSettings.SLURM_CPUS) } - config = {k:kwargs.get(k,config_defaults[k]) for k in config_defaults} + config = {k: kwargs.get(k, config_defaults[k]) for k in config_defaults} - slurm_config = ' '.join(f"{k}={v}" for k,v in config.items() if v is not None) + slurm_config = ' '.join(f'{k}={v}' for k, v in config.items() if v is not None) - logger.info(f"SLURM CONFIG = {slurm_config}") + logger.info(f'SLURM CONFIG = {slurm_config}') return slurm_config +def set_nvidia_params(kwargs: dict, singularity_command: list, gpus: int = 1): + """ + This function is used to set the gpu parameters based on the user input and plugin job. + + Parameters: + kwargs (dict, required): The keyword arguments dictionary sent to the celery task as an input, + part of the request + + singularity_command (list, required): A list that container all the arguments to construct a + singularity command that will be sent to the HPC job + + gps (int, optional): If the plugin doesn't have a --gpu parameter in contianer_args, then a + default of 1 gpu is allocated, else the user specified number of gpus is allocated. 
+ + Returns: + None + """ + kwargs['--gres'] = f'gres:gpu:a100:{gpus}' if gpus > 1 else f'gres:gpu:a100:1' + kwargs['--partition'] = Setting().get(PluginSettings.SLURM_GPU_PARTITION) # 'gpu' + kwargs['--mem'] = Setting().get(PluginSettings.SLURM_GPU_MEM) #'32000' + # Reducing CPU count for gpu-based job for resource conservation + # kwargs['--cpus-per-task'] = '8' + singularity_command.append('--nv') + + class SingularityThread(threading.Thread): - ''' + """ This is a custom Thread class in order to handle cancelling a slurm job outside of the thread since the task context object is not available inside the thread. Methods: __init__(self,target, daemon) - Initialize the thread similar to threading.Thread class, requires a jobId param to keep track of the jobId run(self) - This method is used to run the target function. This is essentially called when you do thread.start() - ''' + """ + def __init__(self, target, daemon=False): super().__init__(daemon=daemon) self.target = target @@ -210,9 +244,9 @@ def run(self): def apptainer_cancel_cmd(jobID, slurm=True): if not jobID: - raise Exception("Please provide jobID for the job that needs to be cancelled") + raise Exception('Please provide jobID for the job that needs to be cancelled') cmd = [] - #If any other type of mechanism is used to interact with HPG, use that. + # If any other type of mechanism is used to interact with HPG, use that. if slurm: cmd.append('scancel') cmd.append(jobID) From 5daf31657bcffb820d4f820f7333f78b826e2be6 Mon Sep 17 00:00:00 2001 From: willdunklin Date: Thu, 26 Sep 2024 12:04:12 -0400 Subject: [PATCH 24/28] Format code --- girder_worker/app.py | 2 +- girder_worker/docker/tasks/__init__.py | 5 --- girder_worker/entrypoint.py | 25 ++------------- .../tasks/__init__.py | 31 ++----------------- .../girder_worker_singularity/tasks/utils.py | 10 +++--- .../slurm/girder_worker_slurm/__init__.py | 21 ++++++------- 6 files changed, 22 insertions(+), 72 deletions(-) diff --git a/girder_worker/app.py b/girder_worker/app.py index 4ed5d175..ab72abd6 100644 --- a/girder_worker/app.py +++ b/girder_worker/app.py @@ -132,7 +132,7 @@ def gw_task_prerun(task=None, sender=None, task_id=None, raise try: - #task.girder_client = GirderClient(apiUrl=task.request.girder_api_url) + # task.girder_client = GirderClient(apiUrl=task.request.girder_api_url) task.girder_client = GirderClient(apiUrl='http://0.0.0.0:8101/api/v1') task.girder_client.token = task.request.girder_client_token except AttributeError: diff --git a/girder_worker/docker/tasks/__init__.py b/girder_worker/docker/tasks/__init__.py index 11ed7d59..6d9af3c3 100644 --- a/girder_worker/docker/tasks/__init__.py +++ b/girder_worker/docker/tasks/__init__.py @@ -15,15 +15,10 @@ except ImportError: # These imports will not be available on the girder side. 
pass -from girder_worker_utils import _walk_obj -# from slicer_cli_web.singularity.utils import switch_to_sif_image_folder from girder_worker import logger from girder_worker.app import Task, app from girder_worker.docker import utils -from girder_worker.docker.io import (FDReadStreamConnector, FDStreamConnector, - FDWriteStreamConnector, - FileDescriptorReader, StdStreamWriter) from girder_worker.docker.stream_adapter import DockerStreamPushAdapter from girder_worker.docker.io import ( FileDescriptorReader, diff --git a/girder_worker/entrypoint.py b/girder_worker/entrypoint.py index 1f7106c9..573c278a 100644 --- a/girder_worker/entrypoint.py +++ b/girder_worker/entrypoint.py @@ -1,15 +1,9 @@ from importlib import import_module import celery -# Delete after testing -from girder_jobs.models.job import Job from girder_worker_utils import decorators -# from girder_worker.docker.tasks import use_singularity from stevedore import extension -#Delete after testing -from girder_jobs.models.job import Job - #: Defines the namespace used for plugin entrypoints NAMESPACE = 'girder_worker_plugins' @@ -64,30 +58,15 @@ def get_module_tasks(module_name): for name, func in vars(module).items(): full_name = '%s.%s' % (module_name, name) - #Just for debugging - job = Job().updateJob( - job, - log=f"The fullname of function is {full_name} and func is {func}", - status="Error", - ) if not hasattr(func, '__call__'): # filter out objects that are not callable continue - # if name != 'singularity_run' or name != 'run': - # continue - # if (use_singularity() and name == 'docker_run') or (not use_singularity() and name == 'singularity_run'): - # continue + try: decorators.get_description_attribute(func) tasks[full_name] = func except decorators.MissingDescriptionException: - #Just for testing - job = Job().updateJob( - job, - log=f"The fullname of function is {full_name} and func is {func}", - status="Error", - ) - #pass + pass return tasks diff --git a/girder_worker/singularity/girder_worker_singularity/tasks/__init__.py b/girder_worker/singularity/girder_worker_singularity/tasks/__init__.py index d20dc970..9503e288 100644 --- a/girder_worker/singularity/girder_worker_singularity/tasks/__init__.py +++ b/girder_worker/singularity/girder_worker_singularity/tasks/__init__.py @@ -4,7 +4,7 @@ from girder_worker_utils import _walk_obj from girder_worker import logger -from girder_worker.app import Task, app +from girder_worker.app import Task from girder_worker.docker.io import (FDReadStreamConnector, FDWriteStreamConnector) from girder_worker.docker.tasks import (_handle_streaming_args, @@ -82,8 +82,8 @@ def singularity_run(task, **kwargs): image = kwargs.get('image') or '' entrypoint = None if not image: - logger.exception(f'Image name cannot be empty') - raise Exception(f'Image name cannot be empty') + logger.exception('Image name cannot be empty') + raise Exception('Image name cannot be empty') run_kwargs = { 'tty': False, @@ -120,28 +120,3 @@ def singularity_run(task, **kwargs): results = (None,) * len(task.request.girder_result_hooks) return results - -# This function is used to check whether we need to switch to singularity or not. - - -def use_singularity(): - ''' - #This needs to be uncommented. Only for testing purposes. - ''' - # runtime = os.environ.get('RUNTIME') - # if runtime == 'SINGULARITY': - # return True - # if runtime == 'DOCKER': - # return False - # try: - # #Check whether we are connected to a docker socket. 
- # with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as s: - # return s.connect_ex('/var/run/docker.sock') != 0 - # except socket.error: - # return False - return True - - -@app.task -def container_backend(**kwargs): - return use_singularity() diff --git a/girder_worker/singularity/girder_worker_singularity/tasks/utils.py b/girder_worker/singularity/girder_worker_singularity/tasks/utils.py index d08bde9e..65a27bbb 100644 --- a/girder_worker/singularity/girder_worker_singularity/tasks/utils.py +++ b/girder_worker/singularity/girder_worker_singularity/tasks/utils.py @@ -6,10 +6,11 @@ def remove_tmp_folder_apptainer(container_args=[]): - ''' - This function will run after the slurm job completes and returns. If a temp folder is created in the temp directory to - do file I/O operations before/while the job was run, we need to clean up by removing the folder. - ''' + """ + This function will run after the slurm job completes and returns. If a temp folder is created + in the temp directory to do file I/O operations before/while the job was run, we need to + clean up by removing the folder. + """ if not container_args: logger.info('Host path not found.') # Cautious checking host path before removing it from the filesystem. @@ -18,3 +19,4 @@ def remove_tmp_folder_apptainer(container_args=[]): if re.search(pattern, arg): if os.path.exists(arg): subprocess.call(['rm', '-rf', arg]) + return diff --git a/girder_worker/slurm/girder_worker_slurm/__init__.py b/girder_worker/slurm/girder_worker_slurm/__init__.py index 0f9f1538..f9be52a4 100644 --- a/girder_worker/slurm/girder_worker_slurm/__init__.py +++ b/girder_worker/slurm/girder_worker_slurm/__init__.py @@ -4,9 +4,9 @@ import time import drmaa +from girder.models.setting import Setting from girder_worker_singularity.tasks.utils import remove_tmp_folder_apptainer -from girder.models.setting import Setting from girder_worker import logger from girder_worker.docker import utils from girder_worker.girder_plugin.constants import PluginSettings @@ -19,9 +19,6 @@ def slurm_dispatch(task, container_args, run_kwargs, read_streams, write_streams task, singularity_run_command, slurm_config, log_file_name) def singularity_exit_condition(): - """ - This function is used to handle task cancellation and also enable exit condition to stop logging. - """ # Check if the cancel event is called and the jobId is set for the current # job thread we are intending to cancel. 
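# Cancellation note: apptainer_cancel_cmd(jobId), defined further down in this module,
# just builds ['scancel', '<jobId>'], so a user-initiated abort is handed to the SLURM
# scheduler rather than to the DRMAA session; roughly subprocess.call(['scancel', job_id]),
# with a non-zero return code surfaced as an exception.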
if task.canceled and monitor_thread.jobId: @@ -162,7 +159,7 @@ def _generate_singularity_command(container_args, kwargs): gpu_index = container_args.index('--gpu') gpus = int(container_args[gpu_index + 1]) set_nvidia_params(kwargs, singularity_command, gpus) - except ValueError as e: + except ValueError: if kwargs['nvidia']: set_nvidia_params(kwargs, singularity_command) try: @@ -216,9 +213,8 @@ def set_nvidia_params(kwargs: dict, singularity_command: list, gpus: int = 1): Returns: None """ - kwargs['--gres'] = f'gres:gpu:a100:{gpus}' if gpus > 1 else f'gres:gpu:a100:1' - kwargs['--partition'] = Setting().get(PluginSettings.SLURM_GPU_PARTITION) # 'gpu' - kwargs['--mem'] = Setting().get(PluginSettings.SLURM_GPU_MEM) #'32000' + kwargs['--gres'] = f'gres:gpu:a100:{gpus}' if gpus > 1 else 'gres:gpu:a100:1' + kwargs['--partition'] = Setting().get(PluginSettings.SLURM_GPU_PARTITION) # Reducing CPU count for gpu-based job for resource conservation # kwargs['--cpus-per-task'] = '8' singularity_command.append('--nv') @@ -226,10 +222,13 @@ def set_nvidia_params(kwargs: dict, singularity_command: list, gpus: int = 1): class SingularityThread(threading.Thread): """ - This is a custom Thread class in order to handle cancelling a slurm job outside of the thread since the task context object is not available inside the thread. + This is a custom Thread class in order to handle cancelling a slurm job outside of the thread + since the task context object is not available inside the thread. Methods: - __init__(self,target, daemon) - Initialize the thread similar to threading.Thread class, requires a jobId param to keep track of the jobId - run(self) - This method is used to run the target function. This is essentially called when you do thread.start() + __init__(self,target, daemon) - Initialize the thread similar to threading. Thread class, + requires a jobId param to keep track of the jobId + run(self) - This method is used to run the target function. 
This is essentially called when + you do thread.start() """ def __init__(self, target, daemon=False): From 7d3783f7840e7819c285d15c41b44cc320e6389e Mon Sep 17 00:00:00 2001 From: willdunklin Date: Thu, 26 Sep 2024 12:40:15 -0400 Subject: [PATCH 25/28] Undo test case removal --- setup.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index eb035902..1aa1ca29 100644 --- a/setup.py +++ b/setup.py @@ -117,14 +117,14 @@ def run(self, *args, **kwargs): # 'gwexample = girder_worker.examples.plugin_example.gwexample:GWExamplePlugin' ], 'girder_worker._test_plugins.valid_plugins': [ - # 'plugin1 = girder_worker._test_plugins.plugins:TestPlugin1', - # 'plugin2 = girder_worker._test_plugins.plugins:TestPlugin2' + 'plugin1 = girder_worker._test_plugins.plugins:TestPlugin1', + 'plugin2 = girder_worker._test_plugins.plugins:TestPlugin2' ], 'girder_worker._test_plugins.invalid_plugins': [ - # 'exception1 = girder_worker._test_plugins.plugins:TestPluginException1', # noqa - # 'exception2 = girder_worker._test_plugins.plugins:TestPluginException2', # noqa - # 'import = girder_worker._test_plugins.plugins:TestPluginInvalidModule', # noqa - # 'invalid = girder_worker._test_plugins.plugins:NotAValidClass' + 'exception1 = girder_worker._test_plugins.plugins:TestPluginException1', # noqa + 'exception2 = girder_worker._test_plugins.plugins:TestPluginException2', # noqa + 'import = girder_worker._test_plugins.plugins:TestPluginInvalidModule', # noqa + 'invalid = girder_worker._test_plugins.plugins:NotAValidClass' ], 'girder.plugin': [ 'worker = girder_worker.girder_plugin:WorkerPlugin' From 971a91813b5061444ce3183b23437ba3ccd4458c Mon Sep 17 00:00:00 2001 From: willdunklin Date: Wed, 2 Oct 2024 12:09:55 -0400 Subject: [PATCH 26/28] Clean up dependencies --- .../gwexample/analyses/tasks.py | 34 ------------------- requirements-dev.in | 1 - requirements.in | 1 - setup.py | 21 ++++++------ tests/integration/requirements.txt | 1 - 5 files changed, 10 insertions(+), 48 deletions(-) diff --git a/examples/plugin_example/gwexample/analyses/tasks.py b/examples/plugin_example/gwexample/analyses/tasks.py index 4d650f52..a74e708a 100644 --- a/examples/plugin_example/gwexample/analyses/tasks.py +++ b/examples/plugin_example/gwexample/analyses/tasks.py @@ -1,11 +1,6 @@ -import os -import subprocess - from girder_worker_utils import types from girder_worker_utils.decorators import argument -from girder_worker.app import app - @app.task @argument('n', types.Integer, min=1) @@ -14,32 +9,3 @@ def fibonacci(n): if n == 1 or n == 2: return 1 return fibonacci(n-1) + fibonacci(n-2) - -@app.task -# @argument('image_name', 'slide_name', 'path') -def nuclei(image_name, slide_name, path): - # "running nuclei" - print(app, '++++++++++') - if path: - print('using arg path !!') - os.chdir(path) - else: - print('using default path !!') - os.chdir('/home/rc-svc-pinaki.sarder-web/digital_slide_archive/devops/singularity-minimal') - print('Current Path => ', os.getcwd()) - path = os.getcwd() - flags = os.O_RDWR | os.O_CREAT - sif_image = os.open('sarderlab_histomicstk_latest.sif', flags) - sif_image_path = path + image_name if image_name else '/sarderlab_histomicstk_latest.sif' - slide_image = os.open(slide_name, flags) - slide_image_path = path + slide_name if slide_name else '18-142_PAS_1of6.svs' - output = os.open('Nuclei-outputNucleiAnnotationFile.anot', flags) - output_path = path + '/Nuclei-outputNucleiAnnotationFile.anot' - run_container = f'apptainer run --pwd 
/HistomicsTK/histomicstk/cli {sif_image} NucleiDetection {slide_image} {output}' - try: - res = subprocess.call(f'apptainer run --pwd /HistomicsTK/histomicstk/cli {sif_image_path} NucleiDetection {slide_image_path} {output_path}', shell=True, bufsize=0,stdin=subprocess.PIPE,stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding="UTF8") - print(res, '----1') - - except Exception as e: - print(f"Exception occured {e}") - diff --git a/requirements-dev.in b/requirements-dev.in index e4ca145e..4f51b9a3 100644 --- a/requirements-dev.in +++ b/requirements-dev.in @@ -17,4 +17,3 @@ pytest-cov Sphinx sphinx-rtd-theme tox -drmaa diff --git a/requirements.in b/requirements.in index ab518b5b..62e07cbb 100644 --- a/requirements.in +++ b/requirements.in @@ -10,5 +10,4 @@ stevedore jsonpickle girder_worker_utils>=0.8.4 docker>=2.6.0 -drmaa diff --git a/setup.py b/setup.py index 1aa1ca29..0ffc25fd 100644 --- a/setup.py +++ b/setup.py @@ -109,25 +109,24 @@ def run(self, *args, **kwargs): zip_safe=False, entry_points={ 'console_scripts': [ - 'girder-worker = girder_worker.__main__:main', - 'girder-worker-config = girder_worker.configure:main' + 'girder-worker = girder_worker.__main__:main', + 'girder-worker-config = girder_worker.configure:main' ], 'girder_worker_plugins': [ - 'docker = girder_worker.docker:DockerPlugin [docker]', - # 'gwexample = girder_worker.examples.plugin_example.gwexample:GWExamplePlugin' + 'docker = girder_worker.docker:DockerPlugin [docker]' ], 'girder_worker._test_plugins.valid_plugins': [ - 'plugin1 = girder_worker._test_plugins.plugins:TestPlugin1', - 'plugin2 = girder_worker._test_plugins.plugins:TestPlugin2' + 'plugin1 = girder_worker._test_plugins.plugins:TestPlugin1', + 'plugin2 = girder_worker._test_plugins.plugins:TestPlugin2' ], 'girder_worker._test_plugins.invalid_plugins': [ - 'exception1 = girder_worker._test_plugins.plugins:TestPluginException1', # noqa - 'exception2 = girder_worker._test_plugins.plugins:TestPluginException2', # noqa - 'import = girder_worker._test_plugins.plugins:TestPluginInvalidModule', # noqa - 'invalid = girder_worker._test_plugins.plugins:NotAValidClass' + 'exception1 = girder_worker._test_plugins.plugins:TestPluginException1', # noqa + 'exception2 = girder_worker._test_plugins.plugins:TestPluginException2', # noqa + 'import = girder_worker._test_plugins.plugins:TestPluginInvalidModule', # noqa + 'invalid = girder_worker._test_plugins.plugins:NotAValidClass' ], 'girder.plugin': [ - 'worker = girder_worker.girder_plugin:WorkerPlugin' + 'worker = girder_worker.girder_plugin:WorkerPlugin' ] }, ) diff --git a/tests/integration/requirements.txt b/tests/integration/requirements.txt index e08d59e0..ef6bb370 100644 --- a/tests/integration/requirements.txt +++ b/tests/integration/requirements.txt @@ -5,4 +5,3 @@ requests-toolbelt girder_client==2.3.0 girder-worker-utils>=0.8.0 celery>=4.0.0 -drmaa From bc129a9209ed18adfa2e5d989a8311c37a021d54 Mon Sep 17 00:00:00 2001 From: willdunklin Date: Wed, 2 Oct 2024 12:22:27 -0400 Subject: [PATCH 27/28] Split slurm configuration setting out from worker --- girder_worker/girder_plugin/constants.py | 13 --- girder_worker/girder_plugin/event_handlers.py | 21 ----- .../web_client/templates/configView.pug | 50 ----------- .../web_client/views/ConfigView.js | 55 ------------ .../slurm/girder_worker_slurm/__init__.py | 2 +- .../girder_worker_slurm/girder_plugin.py | 38 ++++++++ .../girder_worker_slurm/web_client/main.js | 1 + .../web_client/package.json | 25 ++++++ .../web_client/templates/configView.pug | 59 
++++++++++++ .../web_client/views/ConfigView.js | 89 +++++++++++++++++++ girder_worker/slurm/setup.py | 6 +- 11 files changed, 216 insertions(+), 143 deletions(-) create mode 100644 girder_worker/slurm/girder_worker_slurm/girder_plugin.py create mode 100644 girder_worker/slurm/girder_worker_slurm/web_client/main.js create mode 100644 girder_worker/slurm/girder_worker_slurm/web_client/package.json create mode 100644 girder_worker/slurm/girder_worker_slurm/web_client/templates/configView.pug create mode 100644 girder_worker/slurm/girder_worker_slurm/web_client/views/ConfigView.js diff --git a/girder_worker/girder_plugin/constants.py b/girder_worker/girder_plugin/constants.py index 8817507e..8bc616d0 100644 --- a/girder_worker/girder_plugin/constants.py +++ b/girder_worker/girder_plugin/constants.py @@ -29,16 +29,3 @@ class PluginSettings: BACKEND = 'worker.backend' API_URL = 'worker.api_url' DIRECT_PATH = 'worker.direct_path' - # Slurm Settings - SLURM_ACCOUNT = 'worker.slurm_account' - SLURM_QOS = 'worker.slurm_qos' - SLURM_MEM = 'worker.slurm_mem' - SLURM_CPUS = 'worker.slurm_cpus' - SLURM_NTASKS = 'worker.slurm_ntasks', - SLURM_PARTITION = 'worker.slurm_partition' - SLURM_TIME = 'worker.slurm_time' - SLURM_GRES_CONFIG = 'worker.slurm_gres_config' - # GPU Settings - SLURM_GPU = 'worker.slurm_gpu' - SLURM_GPU_PARTITION = 'worker.slurm_gpu_partition' - SLURM_GPU_MEM = 'worker.slurm_gpu_mem' diff --git a/girder_worker/girder_plugin/event_handlers.py b/girder_worker/girder_plugin/event_handlers.py index 808a1e28..5cd712e5 100644 --- a/girder_worker/girder_plugin/event_handlers.py +++ b/girder_worker/girder_plugin/event_handlers.py @@ -58,27 +58,6 @@ def _validateAutoCompute(doc): raise ValidationException('The direct path setting must be true or false.') -@setting_utilities.validator({ - PluginSettings.SLURM_ACCOUNT, - PluginSettings.SLURM_QOS, - PluginSettings.SLURM_MEM, - PluginSettings.SLURM_CPUS, - PluginSettings.SLURM_NTASKS, - PluginSettings.SLURM_PARTITION, - PluginSettings.SLURM_TIME, - PluginSettings.SLURM_GRES_CONFIG, - PluginSettings.SLURM_GPU, - PluginSettings.SLURM_GPU_PARTITION, - PluginSettings.SLURM_GPU_MEM -}) -def validateSlurmSettings(doc): - """ - Validate the SLURM settings. - """ - # TODO: add validation - pass - - def validateJobStatus(event): """Allow our custom job status values.""" if CustomJobStatus.isValid(event.info): diff --git a/girder_worker/girder_plugin/web_client/templates/configView.pug b/girder_worker/girder_plugin/web_client/templates/configView.pug index 196a763f..ad9d2d03 100644 --- a/girder_worker/girder_plugin/web_client/templates/configView.pug +++ b/girder_worker/girder_plugin/web_client/templates/configView.pug @@ -21,56 +21,6 @@ form#g-worker-settings-form(role="form") input#g-worker-direct-path(type="checkbox") span When possible, send local file paths to the worker to avoid downloading file - h3. 
- Slurm - - .form-group - label.control-label(for="g-worker-slurm-account") Slurm Account - input#g-worker-slurm-account.input-sm.form-control( - type="text", placeholder="Slurm Account") - .form-group - label.control-label(for="g-worker-slurm-qos") Slurm QOS - input#g-worker-slurm-qos.input-sm.form-control( - type="text", placeholder="Slurm QOS") - .form-group - label.control-label(for="g-worker-slurm-mem") Slurm Memory Allocation (MB) - input#g-worker-slurm-mem.input-sm.form-control( - type="text", placeholder="Slurm Memory (default: 16,000)") - .form-group - label.control-label(for="g-worker-slurm-cpu") Slurm CPU Count - input#g-worker-slurm-cpu.input-sm.form-control( - type="text", placeholder="Slurm CPU Count (default: 4)") - .form-group - label.control-label(for="g-worker-slurm-ntasks") Slurm Number of Tasks - input#g-worker-slurm-ntasks.input-sm.form-control( - type="text", placeholder="Slurm Number of Tasks") - .form-group - label.control-label(for="g-worker-slurm-partition") Slurm Partition - input#g-worker-slurm-partition.input-sm.form-control( - type="text", placeholder="Slurm Partition") - .form-group - label.control-label(for="g-worker-slurm-time") Slurm Time - input#g-worker-slurm-time.input-sm.form-control( - type="text", placeholder="Slurm Time (default: 72:00)") - .form-group - label.control-label(for="g-worker-slurm-gres-config") Slurm GRES Configuration - input#g-worker-slurm-gres-config.input-sm.form-control( - type="text", placeholder="Slurm GRES Configuration") - - h4 Slurm GPU Settings - .form-group - label.control-label(for="g-worker-slurm-gpu") Slurm GPU - input#g-worker-slurm-gpu.input-sm.form-control( - type="text", placeholder="Slurm GPU") - .form-group - label.control-label(for="g-worker-slurm-gpu-partition") Slurm GPU Partition - input#g-worker-slurm-gpu-partition.input-sm.form-control( - type="text", placeholder="Slurm GPU Partition (default: gpu)") - .form-group - label.control-label(for="g-worker-slurm-gpu-mem") Slurm GPU Memory Allocation (MB) - input#g-worker-slurm-gpu-mem.input-sm.form-control( - type="text", placeholder="Slurm GPU Memory (MB)") - p#g-worker-settings-error-message.g-validation-failed-message input.btn.btn-sm.btn-primary(type="submit", value="Save") diff --git a/girder_worker/girder_plugin/web_client/views/ConfigView.js b/girder_worker/girder_plugin/web_client/views/ConfigView.js index ccedb491..23d1bead 100644 --- a/girder_worker/girder_plugin/web_client/views/ConfigView.js +++ b/girder_worker/girder_plugin/web_client/views/ConfigView.js @@ -24,39 +24,6 @@ var ConfigView = View.extend({ }, { key: 'worker.direct_path', value: this.$('#g-worker-direct-path').is(':checked') - }, { - key: 'worker.slurm_account', - value: this.$('#g-worker-slurm-account').val().trim() - }, { - key: 'worker.slurm_qos', - value: this.$('#g-worker-slurm-qos').val().trim() - }, { - key: 'worker.slurm_mem', - value: this.$('#g-worker-slurm-mem').val().trim() - }, { - key: 'worker.slurm_cpus', - value: this.$('#g-worker-slurm-cpu').val().trim() - }, { - key: 'worker.slurm_ntasks', - value: this.$('#g-worker-slurm-ntasks').val().trim() - }, { - key: 'worker.slurm_partition', - value: this.$('#g-worker-slurm-partition').val().trim() - }, { - key: 'worker.slurm_time', - value: this.$('#g-worker-slurm-time').val().trim() - }, { - key: 'worker.slurm_gres_config', - value: this.$('#g-worker-slurm-gres-config').val().trim() - }, { - key: 'worker.slurm_gpu', - value: this.$('#g-worker-slurm-gpu').val().trim() - }, { - key: 'worker.slurm_gpu_partition', - value: 
this.$('#g-worker-slurm-gpu-partition').val().trim() - }, { - key: 'worker.slurm_gpu_mem', - value: this.$('#g-worker-slurm-gpu-mem').val().trim() }]); }, @@ -75,17 +42,6 @@ var ConfigView = View.extend({ 'worker.broker', 'worker.backend', 'worker.direct_path', - 'worker.slurm_account', - 'worker.slurm_qos', - 'worker.slurm_mem', - 'worker.slurm_cpus', - 'worker.slurm_ntasks', - 'worker.slurm_partition', - 'worker.slurm_time', - 'worker.slurm_gres_config', - 'worker.slurm_gpu', - 'worker.slurm_gpu_partition', - 'worker.slurm_gpu_mem' ]) } }).done((resp) => { @@ -94,17 +50,6 @@ var ConfigView = View.extend({ this.$('#g-worker-broker').val(resp['worker.broker']); this.$('#g-worker-backend').val(resp['worker.backend']); this.$('#g-worker-direct-path').prop('checked', resp['worker.direct_path']); - this.$('#g-worker-slurm-account').val(resp['worker.slurm_account']); - this.$('#g-worker-slurm-qos').val(resp['worker.slurm_qos']); - this.$('#g-worker-slurm-mem').val(resp['worker.slurm_mem']); - this.$('#g-worker-slurm-cpu').val(resp['worker.slurm_cpus']); - this.$('#g-worker-slurm-ntasks').val(resp['worker.slurm_ntasks']); - this.$('#g-worker-slurm-partition').val(resp['worker.slurm_partition']); - this.$('#g-worker-slurm-time').val(resp['worker.slurm_time']); - this.$('#g-worker-slurm-gres-config').val(resp['worker.slurm_gres_config']); - this.$('#g-worker-slurm-gpu').val(resp['worker.slurm_gpu']); - this.$('#g-worker-slurm-gpu-partition').val(resp['worker.slurm_gpu_partition']); - this.$('#g-worker-slurm-gpu-mem').val(resp['worker.slurm_gpu_mem']); }); }, diff --git a/girder_worker/slurm/girder_worker_slurm/__init__.py b/girder_worker/slurm/girder_worker_slurm/__init__.py index f9be52a4..a162273f 100644 --- a/girder_worker/slurm/girder_worker_slurm/__init__.py +++ b/girder_worker/slurm/girder_worker_slurm/__init__.py @@ -9,7 +9,7 @@ from girder_worker import logger from girder_worker.docker import utils -from girder_worker.girder_plugin.constants import PluginSettings +from .girder_plugin import PluginSettings def slurm_dispatch(task, container_args, run_kwargs, read_streams, write_streams, log_file_name): diff --git a/girder_worker/slurm/girder_worker_slurm/girder_plugin.py b/girder_worker/slurm/girder_worker_slurm/girder_plugin.py new file mode 100644 index 00000000..a418437c --- /dev/null +++ b/girder_worker/slurm/girder_worker_slurm/girder_plugin.py @@ -0,0 +1,38 @@ +from girder.plugin import GirderPlugin +from girder.utility import setting_utilities + + +class PluginSettings: + SLURM_ACCOUNT = 'worker_slurm.account' + SLURM_QOS = 'worker_slurm.qos' + SLURM_MEM = 'worker_slurm.mem' + SLURM_CPUS = 'worker_slurm.cpus' + SLURM_NTASKS = 'worker_slurm.ntasks' + SLURM_PARTITION = 'worker_slurm.partition' + SLURM_TIME = 'worker_slurm.time' + SLURM_GRES_CONFIG = 'worker_slurm.gres_config' + # GPU Settings + SLURM_GPU = 'worker_slurm.gpu' + SLURM_GPU_PARTITION = 'worker_slurm.gpu_partition' + + +class WorkerSlurmPlugin(GirderPlugin): + DISPLAY_NAME = 'Worker Slurm' + CLIENT_SOURCE_PATH = 'web_client' + + def load(self, info): + @setting_utilities.validator({ + PluginSettings.SLURM_ACCOUNT, + PluginSettings.SLURM_QOS, + PluginSettings.SLURM_MEM, + PluginSettings.SLURM_CPUS, + PluginSettings.SLURM_NTASKS, + PluginSettings.SLURM_PARTITION, + PluginSettings.SLURM_TIME, + PluginSettings.SLURM_GRES_CONFIG, + PluginSettings.SLURM_GPU, + PluginSettings.SLURM_GPU_PARTITION, + }) + def validateSlurmSettings(doc): + # TODO: add validation + pass diff --git 
a/girder_worker/slurm/girder_worker_slurm/web_client/main.js b/girder_worker/slurm/girder_worker_slurm/web_client/main.js new file mode 100644 index 00000000..bb8d4050 --- /dev/null +++ b/girder_worker/slurm/girder_worker_slurm/web_client/main.js @@ -0,0 +1 @@ +import './views/ConfigView'; diff --git a/girder_worker/slurm/girder_worker_slurm/web_client/package.json b/girder_worker/slurm/girder_worker_slurm/web_client/package.json new file mode 100644 index 00000000..d33a4832 --- /dev/null +++ b/girder_worker/slurm/girder_worker_slurm/web_client/package.json @@ -0,0 +1,25 @@ +{ + "name": "@girder/worker-slurm", + "version": "0.0.0", + "description": "Extension to girder_worker enabling Slurm support.", + "homepage": "http://girder.readthedocs.io/en/latest/plugins.html#remote-worker", + "bugs": { + "url": "https://github.com/girder/girder_worker/issues" + }, + "license": "Apache-2.0", + "repository": { + "type": "git", + "url": "https://github.com/girder/girder_worker.git" + }, + "dependencies": { + }, + "peerDependencies": { + "@girder/core": "*", + "@girder/worker": "*" + }, + "girderPlugin": { + "name": "worker_slurm", + "main": "./main.js", + "dependencies": ["worker"] + } +} diff --git a/girder_worker/slurm/girder_worker_slurm/web_client/templates/configView.pug b/girder_worker/slurm/girder_worker_slurm/web_client/templates/configView.pug new file mode 100644 index 00000000..bd4dfef5 --- /dev/null +++ b/girder_worker/slurm/girder_worker_slurm/web_client/templates/configView.pug @@ -0,0 +1,59 @@ +#g-configuration-accordion.panel-group + .panel.panel-default + .panel-heading(data-toggle="collapse", + data-parent="#g-configuration-accordion", + data-target="#g-advanced-settings-tab") + .panel-title + a + b Advanced Settings + #g-advanced-settings-tab.panel-collapse.collapse + .panel-body + h3. 
+ Slurm + + .form-group + label.control-label(for="g-worker-slurm-account") Slurm Account + input#g-worker-slurm-account.input-sm.form-control( + type="text", placeholder="Slurm Account") + .form-group + label.control-label(for="g-worker-slurm-qos") Slurm QOS + input#g-worker-slurm-qos.input-sm.form-control( + type="text", placeholder="Slurm QOS") + .form-group + label.control-label(for="g-worker-slurm-mem") Slurm Memory Allocation (MB) + input#g-worker-slurm-mem.input-sm.form-control( + type="text", placeholder="Slurm Memory (default: 16,000)") + .form-group + label.control-label(for="g-worker-slurm-cpu") Slurm CPU Count + input#g-worker-slurm-cpu.input-sm.form-control( + type="text", placeholder="Slurm CPU Count (default: 4)") + .form-group + label.control-label(for="g-worker-slurm-ntasks") Slurm Number of Tasks + input#g-worker-slurm-ntasks.input-sm.form-control( + type="text", placeholder="Slurm Number of Tasks") + .form-group + label.control-label(for="g-worker-slurm-partition") Slurm Partition + input#g-worker-slurm-partition.input-sm.form-control( + type="text", placeholder="Slurm Partition") + .form-group + label.control-label(for="g-worker-slurm-time") Slurm Time + input#g-worker-slurm-time.input-sm.form-control( + type="text", placeholder="Slurm Time (default: 72:00)") + .form-group + label.control-label(for="g-worker-slurm-gres-config") Slurm GRES Configuration + input#g-worker-slurm-gres-config.input-sm.form-control( + type="text", placeholder="Slurm GRES Configuration") + + h4 Slurm GPU Settings + .form-group + label.control-label(for="g-worker-slurm-gpu") Slurm GPU + input#g-worker-slurm-gpu.input-sm.form-control( + type="text", placeholder="Slurm GPU") + .form-group + label.control-label(for="g-worker-slurm-gpu-partition") Slurm GPU Partition + input#g-worker-slurm-gpu-partition.input-sm.form-control( + type="text", placeholder="Slurm GPU Partition (default: gpu)") + .form-group + label.control-label(for="g-worker-slurm-gpu-mem") Slurm GPU Memory Allocation (MB) + input#g-worker-slurm-gpu-mem.input-sm.form-control( + type="text", placeholder="Slurm GPU Memory (MB)") diff --git a/girder_worker/slurm/girder_worker_slurm/web_client/views/ConfigView.js b/girder_worker/slurm/girder_worker_slurm/web_client/views/ConfigView.js new file mode 100644 index 00000000..de429176 --- /dev/null +++ b/girder_worker/slurm/girder_worker_slurm/web_client/views/ConfigView.js @@ -0,0 +1,89 @@ +import _ from 'underscore'; + +import { wrap } from '@girder/core/utilities/PluginUtils'; +import { restRequest } from '@girder/core/rest'; + +import ConfigTemplate from '../templates/configView.pug'; + +import ConfigView from '@girder/worker/views/ConfigView'; + +wrap(ConfigView, 'render', function (render) { + render.call(this); + + this.$("p#g-worker-settings-error-message", "#g-worker-settings-form").before( + ConfigTemplate() + ); + + return this; +}); + +wrap(ConfigView, 'initialize', function (initialize) { + initialize.call(this); + + restRequest({ + method: 'GET', + url: 'system/setting', + data: { + list: JSON.stringify([ + 'worker_slurm.account', + 'worker_slurm.qos', + 'worker_slurm.mem', + 'worker_slurm.cpus', + 'worker_slurm.ntasks', + 'worker_slurm.partition', + 'worker_slurm.time', + 'worker_slurm.gres_config', + 'worker_slurm.gpu', + 'worker_slurm.gpu_partition' + ]) + } + }).done((resp) => { + this.$('#g-worker-slurm-account').val(resp['worker_slurm.account']); + this.$('#g-worker-slurm-qos').val(resp['worker_slurm.qos']); + this.$('#g-worker-slurm-mem').val(resp['worker_slurm.mem']); + 
this.$('#g-worker-slurm-cpu').val(resp['worker_slurm.cpus']); + this.$('#g-worker-slurm-ntasks').val(resp['worker_slurm.ntasks']); + this.$('#g-worker-slurm-partition').val(resp['worker_slurm.partition']); + this.$('#g-worker-slurm-time').val(resp['worker_slurm.time']); + this.$('#g-worker-slurm-gres-config').val(resp['worker_slurm.gres_config']); + this.$('#g-worker-slurm-gpu').val(resp['worker_slurm.gpu']); + this.$('#g-worker-slurm-gpu-partition').val(resp['worker_slurm.gpu_partition']); + }); +}); + +const workerConfigSubmitEvent = ConfigView.prototype.events['submit #g-worker-settings-form']; +ConfigView.prototype.events['submit #g-worker-settings-form'] = function (event) { + workerConfigSubmitEvent.call(this, event); + + this._saveSettings([{ + key: 'worker_slurm.account', + value: this.$('#g-worker-slurm-account').val().trim() + }, { + key: 'worker_slurm.qos', + value: this.$('#g-worker-slurm-qos').val().trim() + }, { + key: 'worker_slurm.mem', + value: this.$('#g-worker-slurm-mem').val().trim() + }, { + key: 'worker_slurm.cpus', + value: this.$('#g-worker-slurm-cpu').val().trim() + }, { + key: 'worker_slurm.ntasks', + value: this.$('#g-worker-slurm-ntasks').val().trim() + }, { + key: 'worker_slurm.partition', + value: this.$('#g-worker-slurm-partition').val().trim() + }, { + key: 'worker_slurm.time', + value: this.$('#g-worker-slurm-time').val().trim() + }, { + key: 'worker_slurm.gres_config', + value: this.$('#g-worker-slurm-gres-config').val().trim() + }, { + key: 'worker_slurm.gpu', + value: this.$('#g-worker-slurm-gpu').val().trim() + }, { + key: 'worker_slurm.gpu_partition', + value: this.$('#g-worker-slurm-gpu-partition').val().trim() + }]); +}; diff --git a/girder_worker/slurm/setup.py b/girder_worker/slurm/setup.py index 78f32799..8630c2c3 100644 --- a/girder_worker/slurm/setup.py +++ b/girder_worker/slurm/setup.py @@ -24,9 +24,9 @@ ], install_requires=['girder-worker', 'girder-worker-singularity', *install_reqs], entry_points={ - 'girder_worker_plugins': [ - 'singularity = girder_worker_singularity:SingularityPlugin', - ] + 'girder.plugin': [ + 'worker_slurm = girder_worker_slurm.girder_plugin:WorkerSlurmPlugin', + ], }, packages=find_packages(), zip_safe=False From 2058ecc60728dd7648d2ff6385a27d4e687ff539 Mon Sep 17 00:00:00 2001 From: willdunklin Date: Wed, 2 Oct 2024 12:46:30 -0400 Subject: [PATCH 28/28] Update slurm/singularity extensions to use scm versioning --- .../girder_plugin.py | 8 +++++ girder_worker/singularity/setup.py | 31 +++++++++++++++++-- girder_worker/slurm/setup.py | 26 ++++++++++++++-- setup.py | 1 - 4 files changed, 60 insertions(+), 6 deletions(-) create mode 100644 girder_worker/singularity/girder_worker_singularity/girder_plugin.py diff --git a/girder_worker/singularity/girder_worker_singularity/girder_plugin.py b/girder_worker/singularity/girder_worker_singularity/girder_plugin.py new file mode 100644 index 00000000..b86a5c95 --- /dev/null +++ b/girder_worker/singularity/girder_worker_singularity/girder_plugin.py @@ -0,0 +1,8 @@ +from girder.plugin import GirderPlugin + + +class WorkerSingularityPlugin(GirderPlugin): + DISPLAY_NAME = 'Worker Singularity' + + def load(self, info): + pass diff --git a/girder_worker/singularity/setup.py b/girder_worker/singularity/setup.py index 33cb6cdb..1f76d069 100644 --- a/girder_worker/singularity/setup.py +++ b/girder_worker/singularity/setup.py @@ -1,9 +1,31 @@ +import os + from setuptools import find_packages, setup + +def prerelease_local_scheme(version): + """Return local scheme version unless building on 
master in CircleCI. + + This function returns the local scheme version number + (e.g. 0.0.0.dev+g) unless building on CircleCI for a + pre-release in which case it ignores the hash and produces a + PEP440 compliant pre-release version number (e.g. 0.0.0.dev). + + """ + + from setuptools_scm.version import get_local_node_and_date + + if os.getenv('CIRCLE_BRANCH') in ('master', ): + return '' + else: + return get_local_node_and_date(version) + + setup( name='girder-worker-singularity', - version='0.0.0', - description='An example girder worker extension', + use_scm_version={'root': '../..', 'local_scheme': prerelease_local_scheme, + 'fallback_version': '0.0.0'}, + description='Enables Singularity support for Worker.', author='Kitware, Inc.', author_email='kitware@kitware.com', license='Apache Software License 2.0', @@ -23,7 +45,10 @@ entry_points={ 'girder_worker_plugins': [ 'singularity = girder_worker_singularity:SingularityPlugin', - ] + ], + 'girder.plugin': [ + 'worker_singularity = girder_worker_singularity.girder_plugin:WorkerSingularityPlugin', + ], }, packages=find_packages(), zip_safe=False diff --git a/girder_worker/slurm/setup.py b/girder_worker/slurm/setup.py index 8630c2c3..42b167f9 100644 --- a/girder_worker/slurm/setup.py +++ b/girder_worker/slurm/setup.py @@ -1,12 +1,34 @@ +import os + from setuptools import find_packages, setup + +def prerelease_local_scheme(version): + """Return local scheme version unless building on master in CircleCI. + + This function returns the local scheme version number + (e.g. 0.0.0.dev+g) unless building on CircleCI for a + pre-release in which case it ignores the hash and produces a + PEP440 compliant pre-release version number (e.g. 0.0.0.dev). + + """ + + from setuptools_scm.version import get_local_node_and_date + + if os.getenv('CIRCLE_BRANCH') in ('master', ): + return '' + else: + return get_local_node_and_date(version) + + with open('requirements.txt') as f: install_reqs = f.readlines() setup( name='girder-worker-slurm', - version='0.0.0', - description='An example girder worker extension', + use_scm_version={'root': '../..', 'local_scheme': prerelease_local_scheme, + 'fallback_version': '0.0.0'}, + description='Enables Slurm support for Worker Singularity.', author='Kitware, Inc.', author_email='kitware@kitware.com', license='Apache Software License 2.0', diff --git a/setup.py b/setup.py index 0ffc25fd..dc938d08 100644 --- a/setup.py +++ b/setup.py @@ -76,7 +76,6 @@ def run(self, *args, **kwargs): setuptools.setup( name='girder-worker', use_scm_version={'local_scheme': prerelease_local_scheme}, - version='0.12.1', setup_requires=['setuptools_scm'], description='Batch execution engine built on celery.', long_description=readme,