From 41f7dc26690fd8679a62630a93cab3859798ec47 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Thu, 24 Oct 2024 10:16:57 -0400 Subject: [PATCH 1/4] Investigate issue with hap failure Signed-off-by: Maroun Touma --- transforms/universal/hap/kfp_ray/Makefile | 59 +++++ transforms/universal/hap/kfp_ray/hap_wf.py | 239 +++++++++++++++++++++ 2 files changed, 298 insertions(+) create mode 100644 transforms/universal/hap/kfp_ray/Makefile create mode 100644 transforms/universal/hap/kfp_ray/hap_wf.py diff --git a/transforms/universal/hap/kfp_ray/Makefile b/transforms/universal/hap/kfp_ray/Makefile new file mode 100644 index 000000000..4074b8713 --- /dev/null +++ b/transforms/universal/hap/kfp_ray/Makefile @@ -0,0 +1,59 @@ +REPOROOT=${CURDIR}/../../../../ + +WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate +include $(REPOROOT)/transforms/.make.workflows + +# Include the common configuration for this transform +include ../transform.config + +SRC_DIR=${CURDIR}/../ray/ + +PYTHON_WF := $(shell find ./ -name '*_wf.py') +YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF}) + +workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} + +.PHONY: clean +clean: + @# Help: Clean up the virtual environment. + rm -rf ${REPOROOT}/transforms/venv + +venv:: + +build:: + +setup:: + +test:: + +test-src:: + +test-image:: + +publish:: + +image:: + +kind-load-image:: + +docker-load-image:: + +docker-save-image:: + +.PHONY: workflow-build +workflow-build: workflow-venv + $(MAKE) $(YAML_WF) + +.PHONY: workflow-test +workflow-test: workflow-build + $(MAKE) .workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=hap_wf.yaml + +.PHONY: workflow-upload +workflow-upload: workflow-build + @for file in $(YAML_WF); do \ + $(MAKE) .workflows.upload-pipeline PIPELINE_FILE=$$file; \ + done + +.PHONY: workflow-generate +workflow-generate: workflow-venv + . ${WORKFLOW_VENV_ACTIVATE} && ../../../../kfp/pipeline_generator/single-pipeline/run.sh -c `pwd`/pipeline_definitions.yaml -od . diff --git a/transforms/universal/hap/kfp_ray/hap_wf.py b/transforms/universal/hap/kfp_ray/hap_wf.py new file mode 100644 index 000000000..786011d4d --- /dev/null +++ b/transforms/universal/hap/kfp_ray/hap_wf.py @@ -0,0 +1,239 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ +import os + +import kfp.compiler as compiler +import kfp.components as comp +import kfp.dsl as dsl +from workflow_support.compile_utils import ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils + + +task_image = "quay.io/dataprep1/data-prep-kit/hap-ray:latest" + +# the name of the job script +EXEC_SCRIPT_NAME: str = "hap_transform_ray.py" + +# components +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" + +# path to kfp component specifications files +component_spec_path = "../../../../kfp/kfp_ray_components/" + +# compute execution parameters. Here different transforms might need different implementations. 
As +# a result, instead of creating a component we are creating it in place here. +def compute_exec_params_func( + worker_options: dict, + actor_options: dict, + data_s3_config: str, + data_max_files: int, + data_num_samples: int, + data_checkpointing: bool, + runtime_pipeline_id: str, + runtime_job_id: str, + runtime_code_location: dict, + model_name_or_path: str, + annotation_column: str, + doc_text_column: str, + inference_engine: str, + max_length: int, + batch_size: int, +) -> dict: + from runtime_utils import KFPUtils + + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "data_checkpointing": data_checkpointing, + "runtime_num_workers": KFPUtils.default_compute_execution_params(str(worker_options), str(actor_options)), + "runtime_worker_options": str(actor_options), + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": str(runtime_code_location), + "model_name_or_path": model_name_or_path, + "annotation_column": annotation_column, + "doc_text_column": doc_text_column, + "inference_engine": inference_engine, + "max_length": max_length, + "batch_size": batch_size, + } + + +# KFPv1 and KFP2 uses different methods to create a component from a function. KFPv1 uses the +# `create_component_from_func` function, but it is deprecated by KFPv2 and so has a different import path. +# KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use +# this if/else statement and explicitly call the decorator. +if os.getenv("KFPv2", "0") == "1": + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at + # compilation time. + import uuid + + compute_exec_params_op = dsl.component_decorator.component( + func=compute_exec_params_func, base_image=base_kfp_image + ) + print( + "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + + "same version of the same pipeline !!!" + ) + run_id = uuid.uuid4().hex +else: + compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) + run_id = dsl.RUN_ID_PLACEHOLDER + +# create Ray cluster +create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") +# execute job +execute_ray_jobs_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml") +# clean up Ray +cleanup_ray_op = comp.load_component_from_file(component_spec_path + "deleteRayClusterComponent.yaml") + +# Task name is part of the pipeline name, the ray cluster name and the job name in DMF. 
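# Note: which branch of the if/else above is taken is decided purely by the KFPv2 environment
# variable at compilation time, so the same file can target either SDK. A hedged usage sketch
# (the commands below are illustrative, not part of the generated pipeline):
#
#   KFPv2=1 python hap_wf.py   # KFP v2 SDK: dsl.component decorator + uuid-generated run_id
#   python hap_wf.py           # KFP v1 SDK: create_component_from_func + dsl.RUN_ID_PLACEHOLDER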
+TASK_NAME: str = "hap" + + +@dsl.pipeline( + name=TASK_NAME + "-ray-pipeline", + description="Pipeline for hap task", +) +def hap( + # Ray cluster + ray_name: str = "hap-kfp-ray", # name of Ray cluster + # Add image_pull_secret and image_pull_policy to ray workers if needed + ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, + ray_worker_options: dict = { + "replicas": 2, + "max_replicas": 2, + "min_replicas": 2, + "cpu": 2, + "memory": 4, + "image": task_image, + }, + server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", + # data access + data_s3_config: str = "{'input_folder': 'test/hap/input/', 'output_folder': 'test/hap/output/'}", + data_s3_access_secret: str = "s3-secret", + data_max_files: int = -1, + data_num_samples: int = -1, + data_checkpointing: bool = False, + # orchestrator + runtime_actor_options: dict = {"num_cpus": 0.8}, + runtime_pipeline_id: str = "pipeline_id", + runtime_code_location: dict = {"github": "github", "commit_hash": "12345", "path": "path"}, + # hap parameters + model_name_or_path: str = "ibm-granite/granite-guardian-hap-38m", + annotation_column: str = "hap_score", + doc_text_column: str = "contents", + inference_engine: str = "CPU", + max_length: int = 512, + batch_size: int = 128, + # additional parameters + additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5, "delete_cluster_delay_minutes": 0}', +): + """ + Pipeline to execute hap transform + :param ray_name: name of the Ray cluster + :param ray_head_options: head node options, containing the following: + cpu - number of cpus + memory - memory + image - image to use + image_pull_secret - image pull secret + tolerations - (optional) tolerations for the ray pods + :param ray_worker_options: worker node options (we here are using only 1 worker pool), containing the following: + replicas - number of replicas to create + max_replicas - max number of replicas + min_replicas - min number of replicas + cpu - number of cpus + memory - memory + image - image to use + image_pull_secret - image pull secret + tolerations - (optional) tolerations for the ray pods + :param server_url - server url + :param additional_params: additional (support) parameters, containing the following: + wait_interval - wait interval for API server, sec + wait_cluster_ready_tmout - time to wait for cluster ready, sec + wait_cluster_up_tmout - time to wait for cluster up, sec + wait_job_ready_tmout - time to wait for job ready, sec + wait_print_tmout - time between prints, sec + http_retries - http retries for API server calls + :param data_s3_access_secret - s3 access secret + :param data_s3_config - s3 configuration + :param data_max_files - max files to process + :param data_num_samples - num samples to process + :param runtime_actor_options - actor options + :param runtime_pipeline_id - pipeline id + :param runtime_code_location - code location + :param model_name_or_path - # HAP model path + :param annotation_column - # hap score for each document + :param doc_text_column - # The column name that contains the document text + :param inference_engine - # inference engine used + :param max_length - # inference engine used + :param batch_size - # batch size + :return: None + """ + # create clean_up task + clean_up_task = cleanup_ray_op( + ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params + ) + 
ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) + # pipeline definition + with dsl.ExitHandler(clean_up_task): + # compute execution params + compute_exec_params = compute_exec_params_op( + worker_options=ray_worker_options, + actor_options=runtime_actor_options, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + data_checkpointing=data_checkpointing, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + model_name_or_path=model_name_or_path, + annotation_column=annotation_column, + doc_text_column=doc_text_column, + inference_engine=inference_engine, + max_length=max_length, + batch_size=batch_size, + ) + + ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) + # start Ray cluster + ray_cluster = create_ray_op( + ray_name=ray_name, + run_id=run_id, + ray_head_options=ray_head_options, + ray_worker_options=ray_worker_options, + server_url=server_url, + additional_params=additional_params, + ) + ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) + ray_cluster.after(compute_exec_params) + + # Execute job + execute_job = execute_ray_jobs_op( + ray_name=ray_name, + run_id=run_id, + additional_params=additional_params, + exec_params=compute_exec_params.output, + exec_script_name=EXEC_SCRIPT_NAME, + server_url=server_url, + ) + ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC) + ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) + execute_job.after(ray_cluster) + + +if __name__ == "__main__": + # Compiling the pipeline + compiler.Compiler().compile(hap, __file__.replace(".py", ".yaml")) From 7f8060490e6562040ffb1924ceb981a9f0725434 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Thu, 24 Oct 2024 10:57:52 -0400 Subject: [PATCH 2/4] try to reduce the size of the dockerfile Signed-off-by: Maroun Touma --- .../hap/kfp_ray.disable-due-to-error/Makefile | 59 ----- .../kfp_ray.disable-due-to-error/hap_wf.py | 239 ------------------ .../pipeline_definitions.yaml | 44 ---- transforms/universal/hap/ray/.dockerignore | 2 + transforms/universal/hap/ray/Dockerfile | 6 +- 5 files changed, 5 insertions(+), 345 deletions(-) delete mode 100644 transforms/universal/hap/kfp_ray.disable-due-to-error/Makefile delete mode 100644 transforms/universal/hap/kfp_ray.disable-due-to-error/hap_wf.py delete mode 100644 transforms/universal/hap/kfp_ray.disable-due-to-error/pipeline_definitions.yaml create mode 100644 transforms/universal/hap/ray/.dockerignore diff --git a/transforms/universal/hap/kfp_ray.disable-due-to-error/Makefile b/transforms/universal/hap/kfp_ray.disable-due-to-error/Makefile deleted file mode 100644 index 4074b8713..000000000 --- a/transforms/universal/hap/kfp_ray.disable-due-to-error/Makefile +++ /dev/null @@ -1,59 +0,0 @@ -REPOROOT=${CURDIR}/../../../../ - -WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate -include $(REPOROOT)/transforms/.make.workflows - -# Include the common configuration for this transform -include ../transform.config - -SRC_DIR=${CURDIR}/../ray/ - -PYTHON_WF := $(shell find ./ -name '*_wf.py') -YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF}) - -workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE} - -.PHONY: clean -clean: - @# Help: Clean up the virtual environment. 
- rm -rf ${REPOROOT}/transforms/venv - -venv:: - -build:: - -setup:: - -test:: - -test-src:: - -test-image:: - -publish:: - -image:: - -kind-load-image:: - -docker-load-image:: - -docker-save-image:: - -.PHONY: workflow-build -workflow-build: workflow-venv - $(MAKE) $(YAML_WF) - -.PHONY: workflow-test -workflow-test: workflow-build - $(MAKE) .workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=hap_wf.yaml - -.PHONY: workflow-upload -workflow-upload: workflow-build - @for file in $(YAML_WF); do \ - $(MAKE) .workflows.upload-pipeline PIPELINE_FILE=$$file; \ - done - -.PHONY: workflow-generate -workflow-generate: workflow-venv - . ${WORKFLOW_VENV_ACTIVATE} && ../../../../kfp/pipeline_generator/single-pipeline/run.sh -c `pwd`/pipeline_definitions.yaml -od . diff --git a/transforms/universal/hap/kfp_ray.disable-due-to-error/hap_wf.py b/transforms/universal/hap/kfp_ray.disable-due-to-error/hap_wf.py deleted file mode 100644 index 786011d4d..000000000 --- a/transforms/universal/hap/kfp_ray.disable-due-to-error/hap_wf.py +++ /dev/null @@ -1,239 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ -import os - -import kfp.compiler as compiler -import kfp.components as comp -import kfp.dsl as dsl -from workflow_support.compile_utils import ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils - - -task_image = "quay.io/dataprep1/data-prep-kit/hap-ray:latest" - -# the name of the job script -EXEC_SCRIPT_NAME: str = "hap_transform_ray.py" - -# components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" - -# path to kfp component specifications files -component_spec_path = "../../../../kfp/kfp_ray_components/" - -# compute execution parameters. Here different transforms might need different implementations. As -# a result, instead of creating a component we are creating it in place here. 
-def compute_exec_params_func( - worker_options: dict, - actor_options: dict, - data_s3_config: str, - data_max_files: int, - data_num_samples: int, - data_checkpointing: bool, - runtime_pipeline_id: str, - runtime_job_id: str, - runtime_code_location: dict, - model_name_or_path: str, - annotation_column: str, - doc_text_column: str, - inference_engine: str, - max_length: int, - batch_size: int, -) -> dict: - from runtime_utils import KFPUtils - - return { - "data_s3_config": data_s3_config, - "data_max_files": data_max_files, - "data_num_samples": data_num_samples, - "data_checkpointing": data_checkpointing, - "runtime_num_workers": KFPUtils.default_compute_execution_params(str(worker_options), str(actor_options)), - "runtime_worker_options": str(actor_options), - "runtime_pipeline_id": runtime_pipeline_id, - "runtime_job_id": runtime_job_id, - "runtime_code_location": str(runtime_code_location), - "model_name_or_path": model_name_or_path, - "annotation_column": annotation_column, - "doc_text_column": doc_text_column, - "inference_engine": inference_engine, - "max_length": max_length, - "batch_size": batch_size, - } - - -# KFPv1 and KFP2 uses different methods to create a component from a function. KFPv1 uses the -# `create_component_from_func` function, but it is deprecated by KFPv2 and so has a different import path. -# KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use -# this if/else statement and explicitly call the decorator. -if os.getenv("KFPv2", "0") == "1": - # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create - # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to - # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at - # compilation time. - import uuid - - compute_exec_params_op = dsl.component_decorator.component( - func=compute_exec_params_func, base_image=base_kfp_image - ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" - ) - run_id = uuid.uuid4().hex -else: - compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) - run_id = dsl.RUN_ID_PLACEHOLDER - -# create Ray cluster -create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") -# execute job -execute_ray_jobs_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml") -# clean up Ray -cleanup_ray_op = comp.load_component_from_file(component_spec_path + "deleteRayClusterComponent.yaml") - -# Task name is part of the pipeline name, the ray cluster name and the job name in DMF. 
-TASK_NAME: str = "hap" - - -@dsl.pipeline( - name=TASK_NAME + "-ray-pipeline", - description="Pipeline for hap task", -) -def hap( - # Ray cluster - ray_name: str = "hap-kfp-ray", # name of Ray cluster - # Add image_pull_secret and image_pull_policy to ray workers if needed - ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, - ray_worker_options: dict = { - "replicas": 2, - "max_replicas": 2, - "min_replicas": 2, - "cpu": 2, - "memory": 4, - "image": task_image, - }, - server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", - # data access - data_s3_config: str = "{'input_folder': 'test/hap/input/', 'output_folder': 'test/hap/output/'}", - data_s3_access_secret: str = "s3-secret", - data_max_files: int = -1, - data_num_samples: int = -1, - data_checkpointing: bool = False, - # orchestrator - runtime_actor_options: dict = {"num_cpus": 0.8}, - runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: dict = {"github": "github", "commit_hash": "12345", "path": "path"}, - # hap parameters - model_name_or_path: str = "ibm-granite/granite-guardian-hap-38m", - annotation_column: str = "hap_score", - doc_text_column: str = "contents", - inference_engine: str = "CPU", - max_length: int = 512, - batch_size: int = 128, - # additional parameters - additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5, "delete_cluster_delay_minutes": 0}', -): - """ - Pipeline to execute hap transform - :param ray_name: name of the Ray cluster - :param ray_head_options: head node options, containing the following: - cpu - number of cpus - memory - memory - image - image to use - image_pull_secret - image pull secret - tolerations - (optional) tolerations for the ray pods - :param ray_worker_options: worker node options (we here are using only 1 worker pool), containing the following: - replicas - number of replicas to create - max_replicas - max number of replicas - min_replicas - min number of replicas - cpu - number of cpus - memory - memory - image - image to use - image_pull_secret - image pull secret - tolerations - (optional) tolerations for the ray pods - :param server_url - server url - :param additional_params: additional (support) parameters, containing the following: - wait_interval - wait interval for API server, sec - wait_cluster_ready_tmout - time to wait for cluster ready, sec - wait_cluster_up_tmout - time to wait for cluster up, sec - wait_job_ready_tmout - time to wait for job ready, sec - wait_print_tmout - time between prints, sec - http_retries - http retries for API server calls - :param data_s3_access_secret - s3 access secret - :param data_s3_config - s3 configuration - :param data_max_files - max files to process - :param data_num_samples - num samples to process - :param runtime_actor_options - actor options - :param runtime_pipeline_id - pipeline id - :param runtime_code_location - code location - :param model_name_or_path - # HAP model path - :param annotation_column - # hap score for each document - :param doc_text_column - # The column name that contains the document text - :param inference_engine - # inference engine used - :param max_length - # inference engine used - :param batch_size - # batch size - :return: None - """ - # create clean_up task - clean_up_task = cleanup_ray_op( - ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params - ) - 
ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) - # pipeline definition - with dsl.ExitHandler(clean_up_task): - # compute execution params - compute_exec_params = compute_exec_params_op( - worker_options=ray_worker_options, - actor_options=runtime_actor_options, - data_s3_config=data_s3_config, - data_max_files=data_max_files, - data_num_samples=data_num_samples, - data_checkpointing=data_checkpointing, - runtime_pipeline_id=runtime_pipeline_id, - runtime_job_id=run_id, - runtime_code_location=runtime_code_location, - model_name_or_path=model_name_or_path, - annotation_column=annotation_column, - doc_text_column=doc_text_column, - inference_engine=inference_engine, - max_length=max_length, - batch_size=batch_size, - ) - - ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) - # start Ray cluster - ray_cluster = create_ray_op( - ray_name=ray_name, - run_id=run_id, - ray_head_options=ray_head_options, - ray_worker_options=ray_worker_options, - server_url=server_url, - additional_params=additional_params, - ) - ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) - ray_cluster.after(compute_exec_params) - - # Execute job - execute_job = execute_ray_jobs_op( - ray_name=ray_name, - run_id=run_id, - additional_params=additional_params, - exec_params=compute_exec_params.output, - exec_script_name=EXEC_SCRIPT_NAME, - server_url=server_url, - ) - ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC) - ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) - execute_job.after(ray_cluster) - - -if __name__ == "__main__": - # Compiling the pipeline - compiler.Compiler().compile(hap, __file__.replace(".py", ".yaml")) diff --git a/transforms/universal/hap/kfp_ray.disable-due-to-error/pipeline_definitions.yaml b/transforms/universal/hap/kfp_ray.disable-due-to-error/pipeline_definitions.yaml deleted file mode 100644 index 9716bb349..000000000 --- a/transforms/universal/hap/kfp_ray.disable-due-to-error/pipeline_definitions.yaml +++ /dev/null @@ -1,44 +0,0 @@ -pipeline_parameters: - name: "hap" - description: "Pipeline for hap task" - script_name: "hap_transform_ray.py" - prefix: "" - multi_s3: False - compute_func_name: "" - compute_func_import: "" - component_spec_path: "" - -pipeline_common_input_parameters_values: - kfp_base_image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" - transform_image: "quay.io/dataprep1/data-prep-kit/hap-ray:latest" - s3_access_secret: "s3-secret" - image_pull_secret: "" - input_folder: "test/hap/input/" - output_folder: "test/hap/output/" - -pipeline_transform_input_parameters: - pipeline_arguments: - - name: "model_name_or_path" - type: "str" - value: "ibm-granite/granite-guardian-hap-38m" - description: "# HAP model path" - - name: "annotation_column" - type: "str" - value: "hap_score" - description: "# hap score for each document" - - name: "doc_text_column" - type: "str" - value: "contents" - description: "# The column name that contains the document text" - - name: "inference_engine" - type: "str" - value: "CPU" - description: "# inference engine used" - - name: max_length - type: "int" - value: 512 - description: "# inference engine used" - - name: "batch_size" - type: "int" - value: 128 - description: "# batch size" diff --git a/transforms/universal/hap/ray/.dockerignore b/transforms/universal/hap/ray/.dockerignore new file mode 100644 index 000000000..d74206b84 --- /dev/null +++ b/transforms/universal/hap/ray/.dockerignore @@ -0,0 +1,2 @@ 
+venv/
+.pytest_cache
diff --git a/transforms/universal/hap/ray/Dockerfile b/transforms/universal/hap/ray/Dockerfile
index 42005e9ba..0d21d7358 100644
--- a/transforms/universal/hap/ray/Dockerfile
+++ b/transforms/universal/hap/ray/Dockerfile
@@ -4,7 +4,7 @@ FROM ${BASE_IMAGE}
 RUN pip install --upgrade --no-cache-dir pip
 # install pytest
-RUN pip install --no-cache-dir pytest
+#RUN pip install --no-cache-dir pytest
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
@@ -29,8 +29,8 @@ COPY ./src/hap_transform_ray.py .
 COPY ./src/hap_local_ray.py local/
 # copy test
-COPY test/ test/
-COPY test-data/ test-data/
+#COPY test/ test/
+#COPY test-data/ test-data/
 # Set environment
 ENV PYTHONPATH /home/ray

From 6ed553d02362a970e15c4690a17b285141b6197c Mon Sep 17 00:00:00 2001
From: Maroun Touma
Date: Thu, 24 Oct 2024 15:51:56 -0400
Subject: [PATCH 3/4] switch to 2.36 that has support for python 3.12

Signed-off-by: Maroun Touma
---
 transforms/universal/hap/ray/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transforms/universal/hap/ray/Dockerfile b/transforms/universal/hap/ray/Dockerfile
index 0d21d7358..48dc41849 100644
--- a/transforms/universal/hap/ray/Dockerfile
+++ b/transforms/universal/hap/ray/Dockerfile
@@ -1,4 +1,4 @@
-ARG BASE_IMAGE=docker.io/rayproject/ray:2.24.0-py310
+ARG BASE_IMAGE=docker.io/rayproject/ray:2.36.1-py312
 FROM ${BASE_IMAGE}
 RUN pip install --upgrade --no-cache-dir pip

From 284ac3e39b30d25c5cd558a9b165177e46aced58 Mon Sep 17 00:00:00 2001
From: Maroun Touma
Date: Thu, 24 Oct 2024 15:59:02 -0400
Subject: [PATCH 4/4] Was missing requirements.txt

Signed-off-by: Maroun Touma
---
 transforms/universal/hap/ray/Dockerfile | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/transforms/universal/hap/ray/Dockerfile b/transforms/universal/hap/ray/Dockerfile
index 48dc41849..351253789 100644
--- a/transforms/universal/hap/ray/Dockerfile
+++ b/transforms/universal/hap/ray/Dockerfile
@@ -4,7 +4,7 @@ FROM ${BASE_IMAGE}
 RUN pip install --upgrade --no-cache-dir pip
 # install pytest
-#RUN pip install --no-cache-dir pytest
+RUN pip install --no-cache-dir pytest
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
@@ -20,6 +20,7 @@ RUN cd python-transform && pip install --no-cache-dir -e .
 COPY --chown=ray:users src/ src/
 COPY --chown=ray:users pyproject.toml pyproject.toml
+COPY --chown=ray:users requirements.txt requirements.txt
 RUN pip install --no-cache-dir -e .
 # copy the main() entry point to the image
@@ -29,8 +30,8 @@ COPY ./src/hap_transform_ray.py .
 COPY ./src/hap_local_ray.py local/
 # copy test
-#COPY test/ test/
-#COPY test-data/ test-data/
+COPY test/ test/
+COPY test-data/ test-data/
 # Set environment
 ENV PYTHONPATH /home/ray
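A quick local sanity check for these Dockerfile changes is to rebuild the image and re-run the packaged tests. The commands below are a hedged sketch: they assume the transform's ray/ directory provides the repo's usual image and test-image make targets (make image is also what stages the data-processing libraries that the Dockerfile expects in the docker context, per the comment above).

    cd transforms/universal/hap/ray
    make image        # rebuild hap-ray on the ray:2.36.1-py312 base, now copying requirements.txt
    make test-image   # run pytest inside the image against the restored test/ and test-data/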