From 41f7dc26690fd8679a62630a93cab3859798ec47 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Thu, 24 Oct 2024 10:16:57 -0400 Subject: [PATCH 1/4] Investigate issue with hap failure Signed-off-by: Maroun Touma --- transforms/universal/hap/kfp_ray/Makefile | 59 +++++ transforms/universal/hap/kfp_ray/hap_wf.py | 239 +++++++++++++++++++++ 2 files changed, 298 insertions(+) create mode 100644 transforms/universal/hap/kfp_ray/Makefile create mode 100644 transforms/universal/hap/kfp_ray/hap_wf.py diff --git a/transforms/universal/hap/kfp_ray/Makefile b/transforms/universal/hap/kfp_ray/Makefile new file mode 100644 index 000000000..4074b8713 --- /dev/null +++ b/transforms/universal/hap/kfp_ray/Makefile @@ -0,0 +1,59 @@ +REPOROOT=${CURDIR}/../../../../ + +WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate +include $(REPOROOT)/transforms/.make.workflows + +# Include the common configuration for this transform +include ../transform.config + +SRC_DIR=${CURDIR}/../ray/ + +PYTHON_WF := $(shell find ./ -name '*_wf.py') +YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF}) + +workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} + +.PHONY: clean +clean: + @# Help: Clean up the virtual environment. + rm -rf ${REPOROOT}/transforms/venv + +venv:: + +build:: + +setup:: + +test:: + +test-src:: + +test-image:: + +publish:: + +image:: + +kind-load-image:: + +docker-load-image:: + +docker-save-image:: + +.PHONY: workflow-build +workflow-build: workflow-venv + $(MAKE) $(YAML_WF) + +.PHONY: workflow-test +workflow-test: workflow-build + $(MAKE) .workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=hap_wf.yaml + +.PHONY: workflow-upload +workflow-upload: workflow-build + @for file in $(YAML_WF); do \ + $(MAKE) .workflows.upload-pipeline PIPELINE_FILE=$$file; \ + done + +.PHONY: workflow-generate +workflow-generate: workflow-venv + . ${WORKFLOW_VENV_ACTIVATE} && ../../../../kfp/pipeline_generator/single-pipeline/run.sh -c `pwd`/pipeline_definitions.yaml -od . diff --git a/transforms/universal/hap/kfp_ray/hap_wf.py b/transforms/universal/hap/kfp_ray/hap_wf.py new file mode 100644 index 000000000..786011d4d --- /dev/null +++ b/transforms/universal/hap/kfp_ray/hap_wf.py @@ -0,0 +1,239 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ +import os + +import kfp.compiler as compiler +import kfp.components as comp +import kfp.dsl as dsl +from workflow_support.compile_utils import ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils + + +task_image = "quay.io/dataprep1/data-prep-kit/hap-ray:latest" + +# the name of the job script +EXEC_SCRIPT_NAME: str = "hap_transform_ray.py" + +# components +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" + +# path to kfp component specifications files +component_spec_path = "../../../../kfp/kfp_ray_components/" + +# compute execution parameters. Here different transforms might need different implementations. 
As +# a result, instead of creating a component we are creating it in place here. +def compute_exec_params_func( + worker_options: dict, + actor_options: dict, + data_s3_config: str, + data_max_files: int, + data_num_samples: int, + data_checkpointing: bool, + runtime_pipeline_id: str, + runtime_job_id: str, + runtime_code_location: dict, + model_name_or_path: str, + annotation_column: str, + doc_text_column: str, + inference_engine: str, + max_length: int, + batch_size: int, +) -> dict: + from runtime_utils import KFPUtils + + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "data_checkpointing": data_checkpointing, + "runtime_num_workers": KFPUtils.default_compute_execution_params(str(worker_options), str(actor_options)), + "runtime_worker_options": str(actor_options), + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": str(runtime_code_location), + "model_name_or_path": model_name_or_path, + "annotation_column": annotation_column, + "doc_text_column": doc_text_column, + "inference_engine": inference_engine, + "max_length": max_length, + "batch_size": batch_size, + } + + +# KFPv1 and KFP2 uses different methods to create a component from a function. KFPv1 uses the +# `create_component_from_func` function, but it is deprecated by KFPv2 and so has a different import path. +# KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use +# this if/else statement and explicitly call the decorator. +if os.getenv("KFPv2", "0") == "1": + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at + # compilation time. + import uuid + + compute_exec_params_op = dsl.component_decorator.component( + func=compute_exec_params_func, base_image=base_kfp_image + ) + print( + "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + + "same version of the same pipeline !!!" + ) + run_id = uuid.uuid4().hex +else: + compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) + run_id = dsl.RUN_ID_PLACEHOLDER + +# create Ray cluster +create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") +# execute job +execute_ray_jobs_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml") +# clean up Ray +cleanup_ray_op = comp.load_component_from_file(component_spec_path + "deleteRayClusterComponent.yaml") + +# Task name is part of the pipeline name, the ray cluster name and the job name in DMF. 
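# Note: which branch of the if/else above is taken is decided purely by the KFPv2 environment
# variable at compilation time, so the same file can target either SDK. A hedged usage sketch
# (the commands below are illustrative, not part of the generated pipeline):
#
#   KFPv2=1 python hap_wf.py   # KFP v2 SDK: dsl.component decorator + uuid-generated run_id
#   python hap_wf.py           # KFP v1 SDK: create_component_from_func + dsl.RUN_ID_PLACEHOLDER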
+TASK_NAME: str = "hap" + + +@dsl.pipeline( + name=TASK_NAME + "-ray-pipeline", + description="Pipeline for hap task", +) +def hap( + # Ray cluster + ray_name: str = "hap-kfp-ray", # name of Ray cluster + # Add image_pull_secret and image_pull_policy to ray workers if needed + ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, + ray_worker_options: dict = { + "replicas": 2, + "max_replicas": 2, + "min_replicas": 2, + "cpu": 2, + "memory": 4, + "image": task_image, + }, + server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", + # data access + data_s3_config: str = "{'input_folder': 'test/hap/input/', 'output_folder': 'test/hap/output/'}", + data_s3_access_secret: str = "s3-secret", + data_max_files: int = -1, + data_num_samples: int = -1, + data_checkpointing: bool = False, + # orchestrator + runtime_actor_options: dict = {"num_cpus": 0.8}, + runtime_pipeline_id: str = "pipeline_id", + runtime_code_location: dict = {"github": "github", "commit_hash": "12345", "path": "path"}, + # hap parameters + model_name_or_path: str = "ibm-granite/granite-guardian-hap-38m", + annotation_column: str = "hap_score", + doc_text_column: str = "contents", + inference_engine: str = "CPU", + max_length: int = 512, + batch_size: int = 128, + # additional parameters + additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5, "delete_cluster_delay_minutes": 0}', +): + """ + Pipeline to execute hap transform + :param ray_name: name of the Ray cluster + :param ray_head_options: head node options, containing the following: + cpu - number of cpus + memory - memory + image - image to use + image_pull_secret - image pull secret + tolerations - (optional) tolerations for the ray pods + :param ray_worker_options: worker node options (we here are using only 1 worker pool), containing the following: + replicas - number of replicas to create + max_replicas - max number of replicas + min_replicas - min number of replicas + cpu - number of cpus + memory - memory + image - image to use + image_pull_secret - image pull secret + tolerations - (optional) tolerations for the ray pods + :param server_url - server url + :param additional_params: additional (support) parameters, containing the following: + wait_interval - wait interval for API server, sec + wait_cluster_ready_tmout - time to wait for cluster ready, sec + wait_cluster_up_tmout - time to wait for cluster up, sec + wait_job_ready_tmout - time to wait for job ready, sec + wait_print_tmout - time between prints, sec + http_retries - http retries for API server calls + :param data_s3_access_secret - s3 access secret + :param data_s3_config - s3 configuration + :param data_max_files - max files to process + :param data_num_samples - num samples to process + :param runtime_actor_options - actor options + :param runtime_pipeline_id - pipeline id + :param runtime_code_location - code location + :param model_name_or_path - # HAP model path + :param annotation_column - # hap score for each document + :param doc_text_column - # The column name that contains the document text + :param inference_engine - # inference engine used + :param max_length - # inference engine used + :param batch_size - # batch size + :return: None + """ + # create clean_up task + clean_up_task = cleanup_ray_op( + ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params + ) + 
ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) + # pipeline definition + with dsl.ExitHandler(clean_up_task): + # compute execution params + compute_exec_params = compute_exec_params_op( + worker_options=ray_worker_options, + actor_options=runtime_actor_options, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + data_checkpointing=data_checkpointing, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + model_name_or_path=model_name_or_path, + annotation_column=annotation_column, + doc_text_column=doc_text_column, + inference_engine=inference_engine, + max_length=max_length, + batch_size=batch_size, + ) + + ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) + # start Ray cluster + ray_cluster = create_ray_op( + ray_name=ray_name, + run_id=run_id, + ray_head_options=ray_head_options, + ray_worker_options=ray_worker_options, + server_url=server_url, + additional_params=additional_params, + ) + ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) + ray_cluster.after(compute_exec_params) + + # Execute job + execute_job = execute_ray_jobs_op( + ray_name=ray_name, + run_id=run_id, + additional_params=additional_params, + exec_params=compute_exec_params.output, + exec_script_name=EXEC_SCRIPT_NAME, + server_url=server_url, + ) + ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC) + ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) + execute_job.after(ray_cluster) + + +if __name__ == "__main__": + # Compiling the pipeline + compiler.Compiler().compile(hap, __file__.replace(".py", ".yaml")) From 7f8060490e6562040ffb1924ceb981a9f0725434 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Thu, 24 Oct 2024 10:57:52 -0400 Subject: [PATCH 2/4] try to reduce the size of the dockerfile Signed-off-by: Maroun Touma --- .../hap/kfp_ray.disable-due-to-error/Makefile | 59 ----- .../kfp_ray.disable-due-to-error/hap_wf.py | 239 ------------------ .../pipeline_definitions.yaml | 44 ---- transforms/universal/hap/ray/.dockerignore | 2 + transforms/universal/hap/ray/Dockerfile | 6 +- 5 files changed, 5 insertions(+), 345 deletions(-) delete mode 100644 transforms/universal/hap/kfp_ray.disable-due-to-error/Makefile delete mode 100644 transforms/universal/hap/kfp_ray.disable-due-to-error/hap_wf.py delete mode 100644 transforms/universal/hap/kfp_ray.disable-due-to-error/pipeline_definitions.yaml create mode 100644 transforms/universal/hap/ray/.dockerignore diff --git a/transforms/universal/hap/kfp_ray.disable-due-to-error/Makefile b/transforms/universal/hap/kfp_ray.disable-due-to-error/Makefile deleted file mode 100644 index 4074b8713..000000000 --- a/transforms/universal/hap/kfp_ray.disable-due-to-error/Makefile +++ /dev/null @@ -1,59 +0,0 @@ -REPOROOT=${CURDIR}/../../../../ - -WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate -include $(REPOROOT)/transforms/.make.workflows - -# Include the common configuration for this transform -include ../transform.config - -SRC_DIR=${CURDIR}/../ray/ - -PYTHON_WF := $(shell find ./ -name '*_wf.py') -YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF}) - -workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} - -.PHONY: clean -clean: - @# Help: Clean up the virtual environment. 
- rm -rf ${REPOROOT}/transforms/venv - -venv:: - -build:: - -setup:: - -test:: - -test-src:: - -test-image:: - -publish:: - -image:: - -kind-load-image:: - -docker-load-image:: - -docker-save-image:: - -.PHONY: workflow-build -workflow-build: workflow-venv - $(MAKE) $(YAML_WF) - -.PHONY: workflow-test -workflow-test: workflow-build - $(MAKE) .workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=hap_wf.yaml - -.PHONY: workflow-upload -workflow-upload: workflow-build - @for file in $(YAML_WF); do \ - $(MAKE) .workflows.upload-pipeline PIPELINE_FILE=$$file; \ - done - -.PHONY: workflow-generate -workflow-generate: workflow-venv - . ${WORKFLOW_VENV_ACTIVATE} && ../../../../kfp/pipeline_generator/single-pipeline/run.sh -c `pwd`/pipeline_definitions.yaml -od . diff --git a/transforms/universal/hap/kfp_ray.disable-due-to-error/hap_wf.py b/transforms/universal/hap/kfp_ray.disable-due-to-error/hap_wf.py deleted file mode 100644 index 786011d4d..000000000 --- a/transforms/universal/hap/kfp_ray.disable-due-to-error/hap_wf.py +++ /dev/null @@ -1,239 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ -import os - -import kfp.compiler as compiler -import kfp.components as comp -import kfp.dsl as dsl -from workflow_support.compile_utils import ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils - - -task_image = "quay.io/dataprep1/data-prep-kit/hap-ray:latest" - -# the name of the job script -EXEC_SCRIPT_NAME: str = "hap_transform_ray.py" - -# components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" - -# path to kfp component specifications files -component_spec_path = "../../../../kfp/kfp_ray_components/" - -# compute execution parameters. Here different transforms might need different implementations. As -# a result, instead of creating a component we are creating it in place here. 
-def compute_exec_params_func( - worker_options: dict, - actor_options: dict, - data_s3_config: str, - data_max_files: int, - data_num_samples: int, - data_checkpointing: bool, - runtime_pipeline_id: str, - runtime_job_id: str, - runtime_code_location: dict, - model_name_or_path: str, - annotation_column: str, - doc_text_column: str, - inference_engine: str, - max_length: int, - batch_size: int, -) -> dict: - from runtime_utils import KFPUtils - - return { - "data_s3_config": data_s3_config, - "data_max_files": data_max_files, - "data_num_samples": data_num_samples, - "data_checkpointing": data_checkpointing, - "runtime_num_workers": KFPUtils.default_compute_execution_params(str(worker_options), str(actor_options)), - "runtime_worker_options": str(actor_options), - "runtime_pipeline_id": runtime_pipeline_id, - "runtime_job_id": runtime_job_id, - "runtime_code_location": str(runtime_code_location), - "model_name_or_path": model_name_or_path, - "annotation_column": annotation_column, - "doc_text_column": doc_text_column, - "inference_engine": inference_engine, - "max_length": max_length, - "batch_size": batch_size, - } - - -# KFPv1 and KFP2 uses different methods to create a component from a function. KFPv1 uses the -# `create_component_from_func` function, but it is deprecated by KFPv2 and so has a different import path. -# KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use -# this if/else statement and explicitly call the decorator. -if os.getenv("KFPv2", "0") == "1": - # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create - # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to - # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at - # compilation time. - import uuid - - compute_exec_params_op = dsl.component_decorator.component( - func=compute_exec_params_func, base_image=base_kfp_image - ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" - ) - run_id = uuid.uuid4().hex -else: - compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) - run_id = dsl.RUN_ID_PLACEHOLDER - -# create Ray cluster -create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") -# execute job -execute_ray_jobs_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml") -# clean up Ray -cleanup_ray_op = comp.load_component_from_file(component_spec_path + "deleteRayClusterComponent.yaml") - -# Task name is part of the pipeline name, the ray cluster name and the job name in DMF. 
-TASK_NAME: str = "hap" - - -@dsl.pipeline( - name=TASK_NAME + "-ray-pipeline", - description="Pipeline for hap task", -) -def hap( - # Ray cluster - ray_name: str = "hap-kfp-ray", # name of Ray cluster - # Add image_pull_secret and image_pull_policy to ray workers if needed - ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, - ray_worker_options: dict = { - "replicas": 2, - "max_replicas": 2, - "min_replicas": 2, - "cpu": 2, - "memory": 4, - "image": task_image, - }, - server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", - # data access - data_s3_config: str = "{'input_folder': 'test/hap/input/', 'output_folder': 'test/hap/output/'}", - data_s3_access_secret: str = "s3-secret", - data_max_files: int = -1, - data_num_samples: int = -1, - data_checkpointing: bool = False, - # orchestrator - runtime_actor_options: dict = {"num_cpus": 0.8}, - runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: dict = {"github": "github", "commit_hash": "12345", "path": "path"}, - # hap parameters - model_name_or_path: str = "ibm-granite/granite-guardian-hap-38m", - annotation_column: str = "hap_score", - doc_text_column: str = "contents", - inference_engine: str = "CPU", - max_length: int = 512, - batch_size: int = 128, - # additional parameters - additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5, "delete_cluster_delay_minutes": 0}', -): - """ - Pipeline to execute hap transform - :param ray_name: name of the Ray cluster - :param ray_head_options: head node options, containing the following: - cpu - number of cpus - memory - memory - image - image to use - image_pull_secret - image pull secret - tolerations - (optional) tolerations for the ray pods - :param ray_worker_options: worker node options (we here are using only 1 worker pool), containing the following: - replicas - number of replicas to create - max_replicas - max number of replicas - min_replicas - min number of replicas - cpu - number of cpus - memory - memory - image - image to use - image_pull_secret - image pull secret - tolerations - (optional) tolerations for the ray pods - :param server_url - server url - :param additional_params: additional (support) parameters, containing the following: - wait_interval - wait interval for API server, sec - wait_cluster_ready_tmout - time to wait for cluster ready, sec - wait_cluster_up_tmout - time to wait for cluster up, sec - wait_job_ready_tmout - time to wait for job ready, sec - wait_print_tmout - time between prints, sec - http_retries - http retries for API server calls - :param data_s3_access_secret - s3 access secret - :param data_s3_config - s3 configuration - :param data_max_files - max files to process - :param data_num_samples - num samples to process - :param runtime_actor_options - actor options - :param runtime_pipeline_id - pipeline id - :param runtime_code_location - code location - :param model_name_or_path - # HAP model path - :param annotation_column - # hap score for each document - :param doc_text_column - # The column name that contains the document text - :param inference_engine - # inference engine used - :param max_length - # inference engine used - :param batch_size - # batch size - :return: None - """ - # create clean_up task - clean_up_task = cleanup_ray_op( - ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params - ) - 
ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) - # pipeline definition - with dsl.ExitHandler(clean_up_task): - # compute execution params - compute_exec_params = compute_exec_params_op( - worker_options=ray_worker_options, - actor_options=runtime_actor_options, - data_s3_config=data_s3_config, - data_max_files=data_max_files, - data_num_samples=data_num_samples, - data_checkpointing=data_checkpointing, - runtime_pipeline_id=runtime_pipeline_id, - runtime_job_id=run_id, - runtime_code_location=runtime_code_location, - model_name_or_path=model_name_or_path, - annotation_column=annotation_column, - doc_text_column=doc_text_column, - inference_engine=inference_engine, - max_length=max_length, - batch_size=batch_size, - ) - - ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) - # start Ray cluster - ray_cluster = create_ray_op( - ray_name=ray_name, - run_id=run_id, - ray_head_options=ray_head_options, - ray_worker_options=ray_worker_options, - server_url=server_url, - additional_params=additional_params, - ) - ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) - ray_cluster.after(compute_exec_params) - - # Execute job - execute_job = execute_ray_jobs_op( - ray_name=ray_name, - run_id=run_id, - additional_params=additional_params, - exec_params=compute_exec_params.output, - exec_script_name=EXEC_SCRIPT_NAME, - server_url=server_url, - ) - ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC) - ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) - execute_job.after(ray_cluster) - - -if __name__ == "__main__": - # Compiling the pipeline - compiler.Compiler().compile(hap, __file__.replace(".py", ".yaml")) diff --git a/transforms/universal/hap/kfp_ray.disable-due-to-error/pipeline_definitions.yaml b/transforms/universal/hap/kfp_ray.disable-due-to-error/pipeline_definitions.yaml deleted file mode 100644 index 9716bb349..000000000 --- a/transforms/universal/hap/kfp_ray.disable-due-to-error/pipeline_definitions.yaml +++ /dev/null @@ -1,44 +0,0 @@ -pipeline_parameters: - name: "hap" - description: "Pipeline for hap task" - script_name: "hap_transform_ray.py" - prefix: "" - multi_s3: False - compute_func_name: "" - compute_func_import: "" - component_spec_path: "" - -pipeline_common_input_parameters_values: - kfp_base_image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" - transform_image: "quay.io/dataprep1/data-prep-kit/hap-ray:latest" - s3_access_secret: "s3-secret" - image_pull_secret: "" - input_folder: "test/hap/input/" - output_folder: "test/hap/output/" - -pipeline_transform_input_parameters: - pipeline_arguments: - - name: "model_name_or_path" - type: "str" - value: "ibm-granite/granite-guardian-hap-38m" - description: "# HAP model path" - - name: "annotation_column" - type: "str" - value: "hap_score" - description: "# hap score for each document" - - name: "doc_text_column" - type: "str" - value: "contents" - description: "# The column name that contains the document text" - - name: "inference_engine" - type: "str" - value: "CPU" - description: "# inference engine used" - - name: max_length - type: "int" - value: 512 - description: "# inference engine used" - - name: "batch_size" - type: "int" - value: 128 - description: "# batch size" diff --git a/transforms/universal/hap/ray/.dockerignore b/transforms/universal/hap/ray/.dockerignore new file mode 100644 index 000000000..d74206b84 --- /dev/null +++ b/transforms/universal/hap/ray/.dockerignore @@ -0,0 +1,2 @@ 
+venv/
+.pytest_cache
diff --git a/transforms/universal/hap/ray/Dockerfile b/transforms/universal/hap/ray/Dockerfile
index 42005e9ba..0d21d7358 100644
--- a/transforms/universal/hap/ray/Dockerfile
+++ b/transforms/universal/hap/ray/Dockerfile
@@ -4,7 +4,7 @@ FROM ${BASE_IMAGE}
 RUN pip install --upgrade --no-cache-dir pip
 # install pytest
-RUN pip install --no-cache-dir pytest
+#RUN pip install --no-cache-dir pytest
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
@@ -29,8 +29,8 @@ COPY ./src/hap_transform_ray.py .
 COPY ./src/hap_local_ray.py local/
 # copy test
-COPY test/ test/
-COPY test-data/ test-data/
+#COPY test/ test/
+#COPY test-data/ test-data/
 # Set environment
 ENV PYTHONPATH /home/ray

From 6ed553d02362a970e15c4690a17b285141b6197c Mon Sep 17 00:00:00 2001
From: Maroun Touma
Date: Thu, 24 Oct 2024 15:51:56 -0400
Subject: [PATCH 3/4] switch to 2.36 that has support for python 3.12

Signed-off-by: Maroun Touma
---
 transforms/universal/hap/ray/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transforms/universal/hap/ray/Dockerfile b/transforms/universal/hap/ray/Dockerfile
index 0d21d7358..48dc41849 100644
--- a/transforms/universal/hap/ray/Dockerfile
+++ b/transforms/universal/hap/ray/Dockerfile
@@ -1,4 +1,4 @@
-ARG BASE_IMAGE=docker.io/rayproject/ray:2.24.0-py310
+ARG BASE_IMAGE=docker.io/rayproject/ray:2.36.1-py312
 FROM ${BASE_IMAGE}
 RUN pip install --upgrade --no-cache-dir pip

From 284ac3e39b30d25c5cd558a9b165177e46aced58 Mon Sep 17 00:00:00 2001
From: Maroun Touma
Date: Thu, 24 Oct 2024 15:59:02 -0400
Subject: [PATCH 4/4] Was missing requirements.txt

Signed-off-by: Maroun Touma
---
 transforms/universal/hap/ray/Dockerfile | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/transforms/universal/hap/ray/Dockerfile b/transforms/universal/hap/ray/Dockerfile
index 48dc41849..351253789 100644
--- a/transforms/universal/hap/ray/Dockerfile
+++ b/transforms/universal/hap/ray/Dockerfile
@@ -4,7 +4,7 @@ FROM ${BASE_IMAGE}
 RUN pip install --upgrade --no-cache-dir pip
 # install pytest
-#RUN pip install --no-cache-dir pytest
+RUN pip install --no-cache-dir pytest
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
@@ -20,6 +20,7 @@ RUN cd python-transform && pip install --no-cache-dir -e .
 COPY --chown=ray:users src/ src/
 COPY --chown=ray:users pyproject.toml pyproject.toml
+COPY --chown=ray:users requirements.txt requirements.txt
 RUN pip install --no-cache-dir -e .
 # copy the main() entry point to the image
@@ -29,8 +30,8 @@ COPY ./src/hap_transform_ray.py .
 COPY ./src/hap_local_ray.py local/
 # copy test
-#COPY test/ test/
-#COPY test-data/ test-data/
+COPY test/ test/
+COPY test-data/ test-data/
 # Set environment
 ENV PYTHONPATH /home/ray
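A quick local sanity check for these Dockerfile changes is to rebuild the image and re-run the packaged tests. The commands below are a hedged sketch: they assume the transform's ray/ directory provides the repo's usual image and test-image make targets (make image is also what stages the data-processing libraries that the Dockerfile expects in the docker context, per the comment above).

    cd transforms/universal/hap/ray
    make image        # rebuild hap-ray on the ray:2.36.1-py312 base, now copying requirements.txt
    make test-image   # run pytest inside the image against the restored test/ and test-data/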